Spaces:
Running
Running
| """ | |
| Fast cryptographic hash of Python objects, with a special case for fast | |
| hashing of numpy arrays. | |
| """ | |
| # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> | |
| # Copyright (c) 2009 Gael Varoquaux | |
| # License: BSD Style, 3 clauses. | |
| import decimal | |
| import hashlib | |
| import io | |
| import pickle | |
| import struct | |
| import sys | |
| import types | |
# Alias the pure-Python pickler implementation: the Hasher class in this
# file subclasses it, overrides its ``save``/``memoize``/``save_global``/
# ``_batch_setitems`` methods and copies its class-level ``dispatch`` table.
Pickler = pickle._Pickler
class _ConsistentSet(object):
    """Order-independent stand-in for a set while hashing.

    The items of the wrapped set are stored as a sorted list in
    ``_sequence`` so that two sets holding the same items serialise
    identically whatever their iteration order.
    """

    def __init__(self, set_sequence):
        try:
            # Fast path: the items share a consistent, orderable type.
            # This raises on Python 3 when the items cannot be compared
            # with each other.
            ordered = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # Slow path that works in any case: order the hashes of the
            # items instead of the items themselves. ``hash`` is looked up
            # in module scope at call time, so in this file it is the
            # module-level ``hash`` function rather than the builtin.
            ordered = sorted(map(hash, set_sequence))
        self._sequence = ordered
class _MyHash(object):
    """Picklable placeholder for objects that do not pickle on their own.

    The positional arguments given to the constructor are kept, as a
    tuple, in the ``args`` attribute.
    """

    def __init__(self, *args):
        # Stored verbatim; pickling the instance pickles these components.
        self.args = args
| class Hasher(Pickler): | |
| """A subclass of pickler, to do cryptographic hashing, rather than | |
| pickling. This is used to produce a unique hash of the given | |
| Python object that is not necessarily cryptographically secure. | |
| """ | |
| def __init__(self, hash_name="md5"): | |
| self.stream = io.BytesIO() | |
| # By default we want a pickle protocol that only changes with | |
| # the major python version and not the minor one | |
| protocol = 3 | |
| Pickler.__init__(self, self.stream, protocol=protocol) | |
| # Initialise the hash obj | |
| self._hash = hashlib.new(hash_name, usedforsecurity=False) | |
| def hash(self, obj, return_digest=True): | |
| try: | |
| self.dump(obj) | |
| except pickle.PicklingError as e: | |
| e.args += ("PicklingError while hashing %r: %r" % (obj, e),) | |
| raise | |
| dumps = self.stream.getvalue() | |
| self._hash.update(dumps) | |
| if return_digest: | |
| return self._hash.hexdigest() | |
| def save(self, obj): | |
| if isinstance(obj, (types.MethodType, type({}.pop))): | |
| # the Pickler cannot pickle instance methods; here we decompose | |
| # them into components that make them uniquely identifiable | |
| if hasattr(obj, "__func__"): | |
| func_name = obj.__func__.__name__ | |
| else: | |
| func_name = obj.__name__ | |
| inst = obj.__self__ | |
| if type(inst) is type(pickle): | |
| obj = _MyHash(func_name, inst.__name__) | |
| elif inst is None: | |
| # type(None) or type(module) do not pickle | |
| obj = _MyHash(func_name, inst) | |
| else: | |
| cls = obj.__self__.__class__ | |
| obj = _MyHash(func_name, inst, cls) | |
| Pickler.save(self, obj) | |
| def memoize(self, obj): | |
| # We want hashing to be sensitive to value instead of reference. | |
| # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] | |
| # to hash to the same value and that's why we disable memoization | |
| # for strings | |
| if isinstance(obj, (bytes, str)): | |
| return | |
| Pickler.memoize(self, obj) | |
| # The dispatch table of the pickler is not accessible in Python | |
| # 3, as these lines are only bugware for IPython, we skip them. | |
| def save_global(self, obj, name=None, pack=struct.pack): | |
| # We have to override this method in order to deal with objects | |
| # defined interactively in IPython that are not injected in | |
| # __main__ | |
| kwargs = dict(name=name, pack=pack) | |
| del kwargs["pack"] | |
| try: | |
| Pickler.save_global(self, obj, **kwargs) | |
| except pickle.PicklingError: | |
| Pickler.save_global(self, obj, **kwargs) | |
| module = getattr(obj, "__module__", None) | |
| if module == "__main__": | |
| my_name = name | |
| if my_name is None: | |
| my_name = obj.__name__ | |
| mod = sys.modules[module] | |
| if not hasattr(mod, my_name): | |
| # IPython doesn't inject the variables define | |
| # interactively in __main__ | |
| setattr(mod, my_name, obj) | |
| dispatch = Pickler.dispatch.copy() | |
| # builtin | |
| dispatch[type(len)] = save_global | |
| # type | |
| dispatch[type(object)] = save_global | |
| # classobj | |
| dispatch[type(Pickler)] = save_global | |
| # function | |
| dispatch[type(pickle.dump)] = save_global | |
| # We use *args in _batch_setitems signature because _batch_setitems has an | |
| # additional 'obj' argument in Python 3.14 | |
| def _batch_setitems(self, items, *args): | |
| # forces order of keys in dict to ensure consistent hash. | |
| try: | |
| # Trying first to compare dict assuming the type of keys is | |
| # consistent and orderable. | |
| # This fails on python 3 when keys are unorderable | |
| # but we keep it in a try as it's faster. | |
| Pickler._batch_setitems(self, iter(sorted(items)), *args) | |
| except TypeError: | |
| # If keys are unorderable, sorting them using their hash. This is | |
| # slower but works in any case. | |
| Pickler._batch_setitems( | |
| self, iter(sorted((hash(k), v) for k, v in items)), *args | |
| ) | |
| def save_set(self, set_items): | |
| # forces order of items in Set to ensure consistent hash | |
| Pickler.save(self, _ConsistentSet(set_items)) | |
| dispatch[type(set())] = save_set | |
class NumpyHasher(Hasher):
    """Hasher variant that hashes numpy arrays from their raw buffer."""

    def __init__(self, hash_name="md5", coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
            objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # numpy is imported here rather than at module level, to avoid
        # tight coupling
        import numpy as np

        self.np = np
        # Use np.getbuffer when this numpy provides it, memoryview otherwise.
        self._getbuffer = getattr(np, "getbuffer", memoryview)

    def save(self, obj):
        """Hash ndarrays (and their subclasses) from their bytes instead of
        pickling them, and hash dtypes in isolation; anything else is
        handed over to ``Hasher.save``.
        """
        np = self.np

        if isinstance(obj, np.dtype):
            # Atomic dtypes are interned (``np.dtype('f4') is
            # np.dtype('f4')``) but the interning does not survive a
            # pickle round trip. As pickling relies on memoization,
            # seemingly identical objects such as
            # ``[np.dtype('f4'), np.dtype('f4')]`` and
            # ``[np.dtype('f4'), pickle.loads(pickle.dumps(np.dtype('f4')))]``
            # could hash differently. Serialising every dtype with its own
            # ``pickle.dumps`` call, unrelated to this Hasher instance,
            # keeps memoization out of the picture.
            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
            self._hash.update(pickle.dumps(obj))
            return

        if isinstance(obj, np.ndarray) and not obj.dtype.hasobject:
            # The update function of the hash requires a c_contiguous
            # buffer.
            flags = obj.flags
            if obj.shape == () or not (flags.c_contiguous or flags.f_contiguous):
                # 0d arrays must be flattened because viewing them as
                # bytes raises a ValueError. Non-single-segment arrays are
                # flattened too: this creates a copy, which works around
                # the issue.
                # XXX: There might be a more efficient way of doing this
                data = obj.flatten()
            elif flags.c_contiguous:
                data = obj
            else:
                data = obj.T

            # memoryview is not supported for some dtypes, e.g.
            # datetime64, see https://github.com/numpy/numpy/issues/4983.
            # The workaround is to view the array as bytes before taking
            # the memoryview.
            raw_bytes = data.view(np.uint8)
            self._hash.update(self._getbuffer(raw_bytes))

            # The class is stored to tell apart objects with the same
            # binary content but different classes. With coerce_mmap,
            # memmaps are treated as plain ndarrays so that previously
            # computed results can be reloaded with memmap.
            treat_as_ndarray = self.coerce_mmap and isinstance(obj, np.memmap)
            klass = np.ndarray if treat_as_ndarray else obj.__class__

            # The dtype, shape and strides go along, to distinguish
            # different views on the same data. This tuple is what gets
            # pickled, and thus hashed, in place of the array.
            obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))

        Hasher.save(self, obj)
def hash(obj, hash_name="md5", coerce_mmap=False):
    """Quick calculation of a hash to identify uniquely Python objects
    containing numpy arrays.

    Parameters
    ----------
    hash_name: 'md5' or 'sha1'
        Hashing algorithm used. sha1 is supposedly safer, but md5 is
        faster.
    coerce_mmap: boolean
        Make no difference between np.memmap and np.ndarray

    Returns
    -------
    The value returned by the ``hash`` method of the hasher that was
    selected for ``obj``.

    Raises
    ------
    ValueError
        If ``hash_name`` is neither 'md5' nor 'sha1'.
    """
    valid_hash_names = ("md5", "sha1")
    if hash_name not in valid_hash_names:
        template = "Valid options for 'hash_name' are {}. Got hash_name={!r} instead."
        raise ValueError(template.format(valid_hash_names, hash_name))

    # The numpy-aware hasher is only used when numpy has already been
    # imported by the running program.
    numpy_loaded = "numpy" in sys.modules
    hasher = (
        NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
        if numpy_loaded
        else Hasher(hash_name=hash_name)
    )
    return hasher.hash(obj)