Source code for relstorage.adapters.batch

##############################################################################
#
# Copyright (c) 2009 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Batch table row insert/delete support.
"""
from collections import defaultdict
import itertools

from relstorage._compat import iteritems
from relstorage._compat import perf_counter
from relstorage._util import parse_byte_size

from .interfaces import AggregateOperationTimeoutError

# A cache
# {(placeholder, count): "string"}
_row_schemas = {}
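# For example, with the default '%s' placeholder, row_schema_of_length(3)
# (below) caches and returns '%s, %s, %s'.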

class RowBatcher(object): # pylint:disable=too-many-instance-attributes
    """
    Generic row batcher.

    Inserting can use whatever parameter placeholder format and
    parameter format the cursor supports.

    Deleting needs to use ordered parameters. The placeholder
    can be set in the ``delete_placeholder`` attribute.
    """

    perf_counter = perf_counter

    # How many total rows can be sent at once. Also used as
    # ``bind_limit`` if that is 0 or None.
    row_limit = 1024

    # The total number of available bind variables a single statement
    # can use.
    bind_limit = None

    # The default max_allowed_packet in MySQL is 4MB,
    # so the data, including encoding and the rest of the query structure,
    # must be less than that (unless we dynamically query to find out
    # what value we have).
    size_limit = parse_byte_size('2 MB')

    delete_placeholder = '%s'
    insert_placeholder = '%s'

    # For testing, force the delete order to be deterministic
    # when multiple columns are involved.
    sorted_deletes = False

    def __init__(self, cursor, row_limit=None,
                 delete_placeholder=None,
                 insert_placeholder=None,
                 bind_limit=None):
        self.cursor = cursor
        if delete_placeholder is not None:
            self.delete_placeholder = delete_placeholder
        if insert_placeholder is not None:
            self.insert_placeholder = insert_placeholder
        if row_limit is not None:
            self.row_limit = row_limit
        if bind_limit is not None:
            self.bind_limit = bind_limit

        # These are cumulative.
        self.total_rows_inserted = 0
        self.total_size_inserted = 0
        self.total_rows_deleted = 0

        # These all get reset at each flush().
        self.rows_added = 0
        self.size_added = 0
        self.bind_params_added = 0
        self.deletes = defaultdict(set)   # {(table, columns_tuple): set([(column_value,)])}
        self.inserts = defaultdict(dict)  # {(command, header, row_schema, suffix): {rowkey: [row]}}

    def __repr__(self):
        return "<%s at %x tins=%d tdel=%d prows=%d>" % (
            self.__class__.__name__,
            id(self),
            self.total_rows_inserted,
            self.total_rows_deleted,
            self.rows_added,
        )

    def _flush_if_needed(self):
        """
        Return the number of rows updated.
        """
        if self.rows_added >= self.row_limit:
            return self.flush()
        if self.bind_limit and self.bind_params_added >= self.bind_limit:
            return self.flush()
        if self.size_added >= self.size_limit:
            return self.flush()
        return 0

    def _flush_if_would_exceed_bind(self, addition):
        # The bind limit is a hard limit we cannot exceed.
        # If adding *addition* params would cause us to exceed,
        # flush now.
        if self.bind_limit and self.bind_params_added + addition >= self.bind_limit:
            return self.flush() or True  # This should always be at least one, right?
        return 0
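    # Hedged usage sketch (not in the original source; *cursor* is any
    # DB-API cursor whose paramstyle matches the '%s' placeholders, and
    # the limit values are illustrative):
    #
    #   batcher = RowBatcher(cursor, row_limit=500, bind_limit=999)
    #   ... queue deletes and inserts; each call may flush automatically
    #   ... once row_limit, bind_limit, or size_limit would be exceeded
    #   batcher.flush()  # push anything still buffered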
    def delete_from(self, table, **kw):
        """
        Returns the number of rows flushed as a result of this
        operation. That can include inserts.
        """
        # XXX: When deleting a lot from a single table, a bulk function
        # might be a lot faster.
        if not kw:
            raise AssertionError("Need at least one column value")
        columns = tuple(sorted(kw))
        key = (table, columns)
        row = tuple(kw[column] for column in columns)
        bind_params_added = len(row) if row not in self.deletes[key] else 0
        count = self._flush_if_would_exceed_bind(bind_params_added)
        self.deletes[key].add(row)
        self.rows_added += 1
        self.bind_params_added += bind_params_added
        count += self._flush_if_needed()
        return count
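    # For example (an illustrative sketch; the table and column names echo
    # the select_from docstring below):
    #
    #   batcher.delete_from('object_state', zoid=1)
    #   batcher.delete_from('object_state', zoid=2)
    #   batcher.flush()
    #   # -> DELETE FROM object_state WHERE zoid IN (%s,%s)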
    def insert_into(self, header, row_schema, row, rowkey, size,
                    command='INSERT', suffix=''):
        key = (command, header, row_schema, suffix)
        bind_params_added = len(row) if rowkey not in self.inserts[key] else 0
        self._flush_if_would_exceed_bind(bind_params_added)
        # If we flushed, self.inserts has started all over.
        self.inserts[key][rowkey] = row  # note that this may replace a row
        self.rows_added += 1
        self.bind_params_added += bind_params_added
        self.size_added += size
        self._flush_if_needed()

    def row_schema_of_length(self, param_count):
        # Use as the *row_schema* parameter to insert_into.
        key = (self.insert_placeholder, param_count)
        try:
            return _row_schemas[key]
        except KeyError:
            p = [self.insert_placeholder] * param_count
            p = ', '.join(p)
            _row_schemas[key] = p
            return p
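    # Hedged sketch of how row_schema_of_length pairs with insert_into
    # (the table and column names here are illustrative only):
    #
    #   schema = batcher.row_schema_of_length(2)   # '%s, %s'
    #   batcher.insert_into('my_table (zoid, state)', schema,
    #                       row=(1, data), rowkey=1, size=len(data))
    #   batcher.flush()
    #   # -> INSERT INTO my_table (zoid, state) VALUES (%s, %s)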
    def select_from(self, columns, table, suffix='', timeout=None, **kw):
        """
        Handles a query of the ``WHERE col IN (?, ?,)`` type::

            SELECT columns FROM table WHERE col IN (?, ...)

        The keyword arguments should be of length 1, containing an
        iterable of the values to check: ``col=(1, 2)`` or, in the
        dynamic case, ``**{indirect_var: [1, 2]}``::

            batcher.select_from(('zoid', 'tid',), 'object_state', zoid=oids)

        The number of batches needed is determined by the length of the
        iterator divided by this object's ``bind_limit``, or, if that's
        not set, by the ``row_limit``.

        Returns an iterator of matching rows. Matching rows are
        delivered incrementally, so some number of rows may be delivered
        and then an exception is raised.

        :keyword float timeout: If given, provides a number of seconds
            that is the approximate maximum amount of time this method
            will be allowed to take.

        :raises AggregateOperationTimeoutError: If *timeout* is given,
            and the cumulative time taken to query and process some
            subset of batches exceeds *timeout*. This is checked after
            each individual batch.
        """
        command = 'SELECT %s' % (','.join(columns),)
        for cursor, _count in self.__select_like(
                command, table, suffix, timeout, kw
        ):
            yield from cursor.fetchall()
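    # Illustrative sketch: rows arrive batch by batch, so a timeout can
    # interrupt iteration after partial results have been delivered.
    #
    #   rows = batcher.select_from(('zoid', 'tid'), 'object_state',
    #                              timeout=10, zoid=oids)
    #   for zoid, tid in rows:  # may raise AggregateOperationTimeoutError
    #       ...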
    def update_set_static(self, update_set, timeout=None,
                          batch_done_callback=lambda total_count: None,
                          **kw):
        """
        As for :meth:`select_from`, but the first parameter is the
        complete static UPDATE statement. It must be uppercase and start
        with "UPDATE".

        Rows are not expected to be returned, so the cursor is
        completely consumed between batches.
        """
        # We actually just consume the iteration itself; we can't consume
        # the cursor. Some drivers (pg8000) throw ProgrammingError if we
        # try to iterate a cursor that has no rows.
        total_count = 0
        for _cursor, total_count in self.__select_like(
                update_set,
                None,  # table not used
                '',    # suffix not used
                timeout,
                kw
        ):
            batch_done_callback(total_count)
        return total_count
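    # Hedged sketch (the SET clause is illustrative): the batcher appends
    # the WHERE ... IN (...) clause itself for each chunk of *oids*.
    #
    #   batcher.update_set_static(
    #       'UPDATE object_state SET packed = TRUE', zoid=oids)
    #   # -> UPDATE object_state SET packed = TRUE WHERE zoid IN (%s,...)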
    def __select_like(self, command, table, suffix, timeout, kw):
        assert len(kw) == 1, kw
        # filter_values may be a generic iterable or even a generator; be
        # sure not to materialize the whole thing at any point. Never
        # more than a chunk_size at a time.
        filter_column, filter_values = kw.popitem()
        filter_values = iter(filter_values)
        chunk_size = self.bind_limit or self.row_limit
        chunk_size -= 1

        begin = self.perf_counter() if timeout else None
        count = 0
        for head in filter_values:
            filter_subset = list(itertools.islice(filter_values, chunk_size))
            filter_subset.append(head)

            descriptor = [[(table, (filter_column,)), filter_subset]]
            count += self._do_batch(command, descriptor,
                                    rows_need_flattened=False, suffix=suffix)
            yield self.cursor, count

            if timeout and self.perf_counter() - begin >= timeout:
                # TODO: It'd be nice not to do this if we had no more
                # batches to do.
                raise AggregateOperationTimeoutError
    def flush(self):
        """
        Return the total number of rows deleted or inserted in this
        operation. (This is the number requested, in the case of
        deletes, not the number that actually matched.)

        This can be treated as a boolean to discover if anything was
        flushed.
        """
        count = 0
        if self.deletes:
            count += self._do_deletes()
            self.total_rows_deleted += count
            self.deletes.clear()
        if self.inserts:
            count += self._do_inserts()
            self.total_rows_inserted += count
            self.inserts.clear()
            self.total_size_inserted += self.size_added
        self.rows_added = 0
        self.size_added = 0
        self.bind_params_added = 0
        return count
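    # Since the return value is the count of rows flushed, it doubles as
    # a boolean; e.g. (illustrative, assuming a DB-API *connection*):
    #
    #   if batcher.flush():
    #       connection.commit()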
    def _do_deletes(self):
        return self._do_batch('DELETE', sorted(iteritems(self.deletes)))

    def _do_batch(self, command, descriptors, rows_need_flattened=True, suffix=''):
        count = 0
        for (table, columns), rows in descriptors:
            count += len(rows)
            # Be careful not to use % formatting on a string already
            # containing `delete_placeholder`.
            these_params_need_flattened = rows_need_flattened
            if len(columns) == 1:
                stmt, params, these_params_need_flattened = self._make_single_column_query(
                    command, table, columns[0], rows, rows_need_flattened
                )
            else:
                row_template = " AND ".join(
                    ("%s=" % (column,)) + self.delete_placeholder
                    for column in columns
                )
                row_template = "(%s)" % (row_template,)
                rows_template = [row_template] * len(rows)
                stmt = "%s FROM %s WHERE %s" % (
                    command, table, " OR ".join(rows_template))
                params = rows

            if these_params_need_flattened:
                params = self._flatten_params(params)
            stmt += suffix
            self.cursor.execute(stmt, params)
        return count

    def _flatten_params(self, params):
        params = sorted(params) if self.sorted_deletes else params
        params = list(itertools.chain.from_iterable(params))
        return params

    def _make_placeholder_list_of_length(self, count):
        return ','.join([self.delete_placeholder] * count)

    def _make_single_column_query(self, command, table, filter_column,
                                  filter_value, rows_need_flattened):
        placeholder_str = self._make_placeholder_list_of_length(len(filter_value))
        if not command.startswith('UPDATE'):
            stmt = "%s FROM %s WHERE %s IN (%s)" % (
                command, table, filter_column, placeholder_str
            )
        else:
            stmt = "%s WHERE %s IN (%s)" % (
                command, filter_column, placeholder_str
            )
        return stmt, filter_value, rows_need_flattened

    def _do_inserts(self):
        count = 0
        # In the case that we have only a single table we're inserting
        # into, and thus common key values, we don't need to sort or
        # iterate. If we have multiple tables, we never want to sort by
        # the rows too; that doesn't make any sense.
        if len(self.inserts) == 1:
            items = [self.inserts.popitem()]
        else:
            items = sorted(iteritems(self.inserts), key=lambda i: i[0])

        for (command, header, row_schema, suffix), rows in items:
            # Batched inserts
            rows = list(rows.values())
            count += len(rows)
            value_template = "(%s)" % row_schema
            values_template = [value_template] * len(rows)
            params = list(itertools.chain.from_iterable(rows))

            stmt = "%s INTO %s VALUES\n%s\n%s" % (
                command, header, ', '.join(values_template), suffix)
            # e.g.,
            # INSERT INTO table(c1, c2)
            # VALUES (%s, %s), (%s, %s), (%s, %s)
            # <suffix>
            __traceback_info__ = stmt, params
            self.cursor.execute(stmt, params)
        return count
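# Shapes of the statements this batcher generates (a summary sketch; the
# actual placeholder text depends on delete_placeholder/insert_placeholder):
#
#   SELECT cols FROM table WHERE col IN (%s,%s,...)         -- select_from
#   DELETE FROM table WHERE col IN (%s,%s,...)              -- one-column delete
#   DELETE FROM table WHERE (c1=%s AND c2=%s) OR (...)      -- multi-column delete
#   INSERT INTO table (c1, c2) VALUES (%s, %s), (%s, %s)    -- batched insert
#   UPDATE table SET ... WHERE col IN (%s,%s,...)           -- update_set_static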