Spaces:

Aluode
/

PerceptionLabPortable

Running

App Files Files Community

PerceptionLabPortable / python_embed /Lib /site-packages /joblib /hashing.py

Aluode

Upload folder using huggingface_hub

3bb804c verified about 1 month ago

raw

history blame

10.7 kB

	"""
	Fast cryptographic hash of Python objects, with a special case for fast
	hashing of numpy arrays.
	"""

	# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
	# Copyright (c) 2009 Gael Varoquaux
	# License: BSD Style, 3 clauses.

	import decimal
	import hashlib
	import io
	import pickle
	import struct
	import sys
	import types

	# Alias for the pickler class that ``Hasher`` (below) subclasses: ``Hasher``
	# overrides its ``save``, ``memoize``, ``save_global`` and
	# ``_batch_setitems`` methods and copies/extends its ``dispatch`` table.
	Pickler = pickle._Pickler


	class _ConsistentSet(object):
	    """Order-independent stand-in that is pickled in place of a set.

	    The elements are kept as a sorted list in ``_sequence`` so that the
	    serialised form does not depend on the iteration order of the
	    collection it was built from.
	    """

	    def __init__(self, set_sequence):
	        try:
	            # Fast path: the elements are mutually orderable, so they can
	            # be sorted directly.
	            ordered = list(set_sequence)
	            ordered.sort()
	        except (TypeError, decimal.InvalidOperation):
	            # Slow path: the elements cannot be compared with each other,
	            # so order their hashes instead. ``hash`` is looked up when
	            # this runs: a module-level function named ``hash`` takes
	            # precedence over the builtin.
	            ordered = [hash(item) for item in set_sequence]
	            ordered.sort()
	        self._sequence = ordered


	class _MyHash(object):
	    """Picklable placeholder for objects that the pickler cannot handle.

	    It only records the positional arguments it was built with, in
	    ``args``; those components are what ends up being serialised.
	    """

	    def __init__(self, *args):
	        # ``args`` is already a tuple, so ``tuple(args)`` hands back the
	        # very same object: nothing is copied.
	        self.args = tuple(args)


	class Hasher(Pickler):
	"""A subclass of pickler, to do cryptographic hashing, rather than
	pickling. This is used to produce a unique hash of the given
	Python object that is not necessarily cryptographically secure.
	"""

	def __init__(self, hash_name="md5"):
	self.stream = io.BytesIO()
	# By default we want a pickle protocol that only changes with
	# the major python version and not the minor one
	protocol = 3
	Pickler.__init__(self, self.stream, protocol=protocol)
	# Initialise the hash obj
	self._hash = hashlib.new(hash_name, usedforsecurity=False)

	def hash(self, obj, return_digest=True):
	try:
	self.dump(obj)
	except pickle.PicklingError as e:
	e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
	raise
	dumps = self.stream.getvalue()
	self._hash.update(dumps)
	if return_digest:
	return self._hash.hexdigest()

	def save(self, obj):
	if isinstance(obj, (types.MethodType, type({}.pop))):
	# the Pickler cannot pickle instance methods; here we decompose
	# them into components that make them uniquely identifiable
	if hasattr(obj, "__func__"):
	func_name = obj.__func__.__name__
	else:
	func_name = obj.__name__
	inst = obj.__self__
	if type(inst) is type(pickle):
	obj = _MyHash(func_name, inst.__name__)
	elif inst is None:
	# type(None) or type(module) do not pickle
	obj = _MyHash(func_name, inst)
	else:
	cls = obj.__self__.__class__
	obj = _MyHash(func_name, inst, cls)
	Pickler.save(self, obj)

	def memoize(self, obj):
	# We want hashing to be sensitive to value instead of reference.
	# For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
	# to hash to the same value and that's why we disable memoization
	# for strings
	if isinstance(obj, (bytes, str)):
	return
	Pickler.memoize(self, obj)

	# The dispatch table of the pickler is not accessible in Python
	# 3, as these lines are only bugware for IPython, we skip them.
	def save_global(self, obj, name=None, pack=struct.pack):
	# We have to override this method in order to deal with objects
	# defined interactively in IPython that are not injected in
	# __main__
	kwargs = dict(name=name, pack=pack)
	del kwargs["pack"]
	try:
	Pickler.save_global(self, obj, **kwargs)
	except pickle.PicklingError:
	Pickler.save_global(self, obj, **kwargs)
	module = getattr(obj, "__module__", None)
	if module == "__main__":
	my_name = name
	if my_name is None:
	my_name = obj.__name__
	mod = sys.modules[module]
	if not hasattr(mod, my_name):
	# IPython doesn't inject the variables define
	# interactively in __main__
	setattr(mod, my_name, obj)

	dispatch = Pickler.dispatch.copy()
	# builtin
	dispatch[type(len)] = save_global
	# type
	dispatch[type(object)] = save_global
	# classobj
	dispatch[type(Pickler)] = save_global
	# function
	dispatch[type(pickle.dump)] = save_global

	# We use *args in _batch_setitems signature because _batch_setitems has an
	# additional 'obj' argument in Python 3.14
	def _batch_setitems(self, items, *args):
	# forces order of keys in dict to ensure consistent hash.
	try:
	# Trying first to compare dict assuming the type of keys is
	# consistent and orderable.
	# This fails on python 3 when keys are unorderable
	# but we keep it in a try as it's faster.
	Pickler._batch_setitems(self, iter(sorted(items)), *args)
	except TypeError:
	# If keys are unorderable, sorting them using their hash. This is
	# slower but works in any case.
	Pickler._batch_setitems(
	self, iter(sorted((hash(k), v) for k, v in items)), *args
	)

	def save_set(self, set_items):
	# forces order of items in Set to ensure consistent hash
	Pickler.save(self, _ConsistentSet(set_items))

	dispatch[type(set())] = save_set


	class NumpyHasher(Hasher):
	    """Hasher variant used once numpy is loaded.

	    Arrays without object dtype are fed to the hash as raw bytes instead
	    of being pickled, and dtypes are serialised in isolation.
	    """

	    def __init__(self, hash_name="md5", coerce_mmap=False):
	        """
	        Parameters
	        ----------
	        hash_name: string
	            The hash algorithm to be used
	        coerce_mmap: boolean
	            Make no difference between np.memmap and np.ndarray
	            objects.
	        """
	        self.coerce_mmap = coerce_mmap
	        Hasher.__init__(self, hash_name=hash_name)
	        # numpy is imported lazily so that this module does not depend on
	        # it at import time
	        import numpy as np

	        self.np = np
	        # Use ``np.getbuffer`` when numpy provides it, else ``memoryview``
	        self._getbuffer = getattr(np, "getbuffer", memoryview)

	    def save(self, obj):
	        """Hash ndarrays (and subclasses) and dtypes directly instead of
	        pickling them. Of course, this is a total abuse of the Pickler
	        class.
	        """
	        np = self.np
	        if isinstance(obj, np.ndarray) and not obj.dtype.hasobject:
	            # The raw data goes straight into the hash; what remains to
	            # be pickled is only a small tuple describing the array.
	            obj = self._digest_array(obj)
	        elif isinstance(obj, np.dtype):
	            # Atomic dtypes are interned (``np.dtype('f4') is
	            # np.dtype('f4')``) but a pickle round-trip yields a distinct
	            # object. As pickle memoizes by identity, two seemingly
	            # identical containers of dtypes could produce different
	            # streams. Each dtype is therefore serialised with its own
	            # ``pickle.dumps`` call, unrelated to this instance's memo.
	            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
	            self._hash.update(pickle.dumps(obj))
	            return
	        Hasher.save(self, obj)

	    def _digest_array(self, array):
	        """Feed the bytes of ``array`` to the hash and return the tuple
	        that is pickled in its place.
	        """
	        # The update function of the hash requires a c_contiguous buffer.
	        if array.shape == () or not (
	            array.flags.c_contiguous or array.flags.f_contiguous
	        ):
	            # 0d arrays cannot be viewed as bytes (ValueError), and
	            # non-single-segment arrays need a copy: flatten handles both.
	            # XXX: There might be a more efficient way of doing this
	            contiguous = array.flatten()
	        elif array.flags.c_contiguous:
	            contiguous = array
	        else:
	            contiguous = array.T

	        # memoryview is not supported for some dtypes, e.g. datetime64, see
	        # https://github.com/numpy/numpy/issues/4983. The workaround is to
	        # view the array as bytes before taking the memoryview.
	        as_bytes = contiguous.view(self.np.uint8)
	        self._hash.update(self._getbuffer(as_bytes))

	        # The class is recorded to tell apart objects with the same binary
	        # content but different classes. With ``coerce_mmap``, a memmap is
	        # recorded as a plain ndarray so that results computed on one can
	        # be matched with the other.
	        coerce = self.coerce_mmap and isinstance(array, self.np.memmap)
	        klass = self.np.ndarray if coerce else array.__class__

	        # dtype, shape and strides distinguish different views on the same
	        # data.
	        return (klass, ("HASHED", array.dtype, array.shape, array.strides))


	def hash(obj, hash_name="md5", coerce_mmap=False):
	    """Return a hexadecimal digest identifying ``obj`` by value, with
	    dedicated handling of numpy arrays when numpy is already imported.

	    Parameters
	    ----------
	    obj: object
	        The object to hash.
	    hash_name: 'md5' or 'sha1'
	        Hashing algorithm used. sha1 is supposedly safer, but md5 is
	        faster.
	    coerce_mmap: boolean
	        Make no difference between np.memmap and np.ndarray

	    Raises
	    ------
	    ValueError
	        If ``hash_name`` is not one of the supported names.
	    """
	    valid_hash_names = ("md5", "sha1")
	    if hash_name not in valid_hash_names:
	        message = (
	            "Valid options for 'hash_name' are {}. Got hash_name={!r} instead."
	        ).format(valid_hash_names, hash_name)
	        raise ValueError(message)

	    # Only take the numpy-aware path when numpy has already been imported
	    # by the running program.
	    if "numpy" not in sys.modules:
	        return Hasher(hash_name=hash_name).hash(obj)
	    return NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap).hash(obj)