Spaces:
Running
Running
| """ | |
| Test the hashing module. | |
| """ | |
| # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> | |
| # Copyright (c) 2009 Gael Varoquaux | |
| # License: BSD Style, 3 clauses. | |
| import collections | |
| import gc | |
| import hashlib | |
| import io | |
| import itertools | |
| import pickle | |
| import random | |
| import sys | |
| import time | |
| from concurrent.futures import ProcessPoolExecutor | |
| from decimal import Decimal | |
| from joblib.func_inspect import filter_args | |
| from joblib.hashing import hash | |
| from joblib.memory import Memory | |
| from joblib.test.common import np, with_numpy | |
| from joblib.testing import fixture, parametrize, raises, skipif | |
def unicode(s):
    """Return ``s`` unchanged.

    Identity helper: the value passed in is handed back as-is, with no
    conversion applied.
    """
    return s
| ############################################################################### | |
| # Helper functions for the tests | |
def time_func(func, *args):
    """Return the best duration of ``func(*args)`` over three runs.

    Parameters
    ----------
    func : callable
        Function to time; its return value is discarded.
    *args
        Positional arguments forwarded to ``func`` on every run.

    Returns
    -------
    float
        Smallest elapsed time, in seconds, among the three calls.
    """
    durations = []
    for _ in range(3):
        # perf_counter is monotonic and uses the highest-resolution clock
        # available, whereas time.time() can be coarse and may jump when
        # the system clock is adjusted.
        start = time.perf_counter()
        func(*args)
        durations.append(time.perf_counter() - start)
    return min(durations)
def relative_time(func1, func2, *args):
    """Return the relative time difference between func1 and func2
    applied on *args.

    Both callables are timed with ``time_func``. The result is half of the
    absolute timing difference divided by the sum of the two timings, so
    it lies between 0 (same speed) and 0.5.

    Returns
    -------
    float
        The relative difference, or 0.0 when both timings are zero.
    """
    time_func1 = time_func(func1, *args)
    time_func2 = time_func(func2, *args)
    total = time_func1 + time_func2
    if total == 0:
        # Both runs were faster than the clock resolution: there is no
        # measurable difference, and dividing would raise
        # ZeroDivisionError.
        return 0.0
    return 0.5 * (abs(time_func1 - time_func2) / total)
class Klass:
    """Minimal class exposing a plain method ``f``."""

    def f(self, x):
        """Return ``x`` unchanged."""
        return x
class KlassWithCachedMethod:
    """Class whose method ``f`` is wrapped with ``Memory.cache``."""

    def __init__(self, cachedir):
        # Rebind ``f`` on the instance to the version wrapped by a Memory
        # object located at ``cachedir``.
        self.f = Memory(location=cachedir).cache(self.f)

    def f(self, x):
        """Return ``x`` unchanged."""
        return x
| ############################################################################### | |
| # Tests | |
# Assorted builtin values of many different types.
input_list = [
    1,
    2,
    1.0,
    2.0,
    1 + 1j,
    2.0 + 1j,
    "a",
    "b",
    (1,),
    (1, 1),
    [1],
    [1, 1],
    {1: 1},
    {1: 2},
    {2: 1},
    None,
    gc.collect,
    [1].append,
    # The next 2 sets have unorderable elements in Python 3.
    {"a", 1},
    {"a", 1, ("a", 1)},
    # The next 2 dicts have unorderable key types in Python 3.
    {"a": 1, 1: 2},
    {"a": 1, 1: 2, "d": {"a": 1}},
]
@parametrize("obj1", input_list)
@parametrize("obj2", input_list)
def test_trivial_hash(obj1, obj2):
    """Smoke test hash on various types.

    ``obj1`` and ``obj2`` are each drawn from ``input_list``, so every
    ordered pair of entries is compared. Without this parametrization
    pytest has no way to supply the two arguments.
    """
    # Check that 2 objects have the same hash only if they are the same.
    are_hashes_equal = hash(obj1) == hash(obj2)
    are_objs_identical = obj1 is obj2
    assert are_hashes_equal == are_objs_identical
def test_hash_methods():
    """Check that hashing instance methods works."""
    # The same bound method of the same object hashes identically.
    stream = io.StringIO(unicode("a"))
    assert hash(stream.flush) == hash(stream.flush)

    # The same method bound to deques with different content must differ.
    ten_items = collections.deque(range(10))
    nine_items = collections.deque(range(9))
    assert hash(ten_items.extend) != hash(nine_items.extend)
@fixture(scope="function")
def three_np_arrays():
    """Provide three (10, 10) random arrays for the hashing tests.

    Declared as a fixture because other tests in this file request
    ``three_np_arrays`` as an argument.

    Returns
    -------
    tuple
        ``(arr1, arr2, arr3)`` where ``arr2`` is a copy of ``arr1`` and
        ``arr3`` is a copy whose first row has been incremented by 1.
    """
    rnd = np.random.RandomState(0)
    arr1 = rnd.random_sample((10, 10))
    arr2 = arr1.copy()
    arr3 = arr2.copy()
    arr3[0] += 1
    return arr1, arr2, arr3
def test_hash_numpy_arrays(three_np_arrays):
    """Check that two arrays share a hash exactly when they are equal."""
    arr1, _, _ = three_np_arrays

    # Compare every ordered pair, including each array with itself.
    for left, right in itertools.product(three_np_arrays, repeat=2):
        same_hash = hash(left) == hash(right)
        same_content = np.all(left == right)
        assert same_hash == same_content

    # An array and its transpose must not collide.
    assert hash(arr1) != hash(arr1.T)
def test_hash_numpy_dict_of_arrays(three_np_arrays):
    """Check hashing of dictionaries whose values are numpy arrays."""
    arr1, arr2, arr3 = three_np_arrays

    reference = {1: arr1, 2: arr2}
    # Same content as ``reference``: arr1 and arr2 are equal arrays.
    equal_content = {1: arr2, 2: arr1}
    # arr3 differs from arr1/arr2, so this dict has different content.
    other_content = {1: arr2, 2: arr3}

    assert hash(reference) == hash(equal_content)
    assert hash(reference) != hash(other_content)
@parametrize("dtype", ["datetime64[s]", "timedelta64[D]"])
def test_numpy_datetime_array(dtype):
    """Check that arrays with date/time dtypes can be hashed.

    ``dtype`` is supplied by the parametrization; without it pytest has no
    way to provide the argument.
    """
    # memoryview is not supported for some dtypes e.g. datetime64
    # see https://github.com/joblib/joblib/issues/188 for more details
    a_hash = hash(np.arange(10))
    array = np.arange(0, 10, dtype=dtype)
    assert hash(array) != a_hash
def test_hash_numpy_noncontiguous():
    """Check that a sliced array hashes differently from its C-ordered
    and Fortran-ordered conversions.
    """
    fortran_base = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order="F")
    sliced = fortran_base[:, :1, :]

    c_ordered = np.ascontiguousarray(sliced)
    assert hash(sliced) != hash(c_ordered)

    f_ordered = np.asfortranarray(sliced)
    assert hash(sliced) != hash(f_ordered)
@parametrize("coerce_mmap", [True, False])
def test_hash_memmap(tmpdir, coerce_mmap):
    """Check that memmap and arrays hash identically if coerce_mmap is True.

    The test runs once per value of ``coerce_mmap``; the hashes are
    expected to match only when it is True.
    """
    filename = tmpdir.join("memmap_temp").strpath
    try:
        m = np.memmap(filename, shape=(10, 10), mode="w+")
        a = np.asarray(m)
        are_hashes_equal = hash(a, coerce_mmap=coerce_mmap) == hash(
            m, coerce_mmap=coerce_mmap
        )
        assert are_hashes_equal == coerce_mmap
    finally:
        # ``m`` is unbound if np.memmap itself raised.
        if "m" in locals():
            del m
        # Force a garbage-collection cycle, to be certain that the
        # object is deleted, and we don't run into a problem under
        # Windows with a file handle still open.
        gc.collect()
def test_hash_numpy_performance():
    """Check the performance of hashing numpy arrays:

    In [22]: a = np.random.random(1000000)
    In [23]: %timeit hashlib.md5(a).hexdigest()
    100 loops, best of 3: 20.7 ms per loop
    In [24]: %timeit hashlib.md5(pickle.dumps(a, protocol=2)).hexdigest()
    1 loops, best of 3: 73.1 ms per loop
    In [25]: %timeit hashlib.md5(cPickle.dumps(a, protocol=2)).hexdigest()
    10 loops, best of 3: 53.9 ms per loop
    In [26]: %timeit hash(a)
    100 loops, best of 3: 20.8 ms per loop
    """
    rng = np.random.RandomState(0)
    big_array = rng.random_sample(1000000)

    def md5_hash(x):
        return hashlib.md5(memoryview(x)).hexdigest()

    # Hashing one array should take about as long as a raw md5 of its
    # buffer.
    single_array_diff = relative_time(md5_hash, hash, big_array)
    assert single_array_diff < 0.3

    # Check that hashing a tuple of 3 arrays takes approximately
    # 3 times as much as hashing one array
    expected_time = 3 * time_func(md5_hash, big_array)
    measured_time = time_func(hash, (big_array, big_array, big_array))
    gap = abs(measured_time - expected_time)
    tuple_diff = 0.5 * (gap / (measured_time + expected_time))
    assert tuple_diff < 0.3
def test_bound_methods_hash():
    """Make sure that calling the same method on two different instances
    of the same class does resolve to the same hashes.
    """
    first, second = Klass(), Klass()
    first_args = filter_args(first.f, [], (1,))
    second_args = filter_args(second.f, [], (1,))
    assert hash(first_args) == hash(second_args)
def test_bound_cached_methods_hash(tmpdir):
    """Make sure that calling the same _cached_ method on two different
    instances of the same class does resolve to the same hashes.
    """
    first = KlassWithCachedMethod(tmpdir.strpath)
    second = KlassWithCachedMethod(tmpdir.strpath)
    # ``.func`` is the function object held by the cached wrapper.
    first_args = filter_args(first.f.func, [], (1,))
    second_args = filter_args(second.f.func, [], (1,))
    assert hash(first_args) == hash(second_args)
def test_hash_object_dtype():
    """Make sure that ndarrays with dtype `object' hash correctly."""

    def make_object_array():
        # Six sub-arrays of lengths 0 to 5, stored as Python objects.
        return np.array([np.arange(i) for i in range(6)], dtype=object)

    first, second = make_object_array(), make_object_array()
    assert hash(first) == hash(second)
def test_numpy_scalar():
    """Check that two different numpy scalars get different hashes."""
    # Numpy scalars are built from compiled functions, and lead to
    # strange pickling paths explored, that can give hash collisions
    two, three = np.float64(2.0), np.float64(3.0)
    assert hash(two) != hash(three)
def test_dict_hash(tmpdir):
    """Check that dictionaries hash consistently, even though the
    ordering of the keys is not guaranteed.
    """
    filenames = (
        "#s12069__c_maps.nii.gz",
        "#s12158__c_maps.nii.gz",
        "#s12258__c_maps.nii.gz",
        "#s12277__c_maps.nii.gz",
        "#s12300__c_maps.nii.gz",
        "#s12401__c_maps.nii.gz",
        "#s12430__c_maps.nii.gz",
        "#s13817__c_maps.nii.gz",
        "#s13903__c_maps.nii.gz",
        "#s13916__c_maps.nii.gz",
        "#s13981__c_maps.nii.gz",
        "#s13982__c_maps.nii.gz",
        "#s13983__c_maps.nii.gz",
    )
    # Each key gets its own fresh ``[33]`` list.
    mapping = {name: [33] for name in filenames}

    cached = KlassWithCachedMethod(tmpdir.strpath)
    # Pass the dict through the cached method twice and compare hashes.
    first_pass = cached.f(mapping)
    second_pass = cached.f(first_pass)
    assert hash(first_pass) == hash(second_pass)
def test_set_hash(tmpdir):
    """Check that sets hash consistently, even though their ordering
    is not guaranteed.
    """
    names = {
        "#s12069__c_maps.nii.gz",
        "#s12158__c_maps.nii.gz",
        "#s12258__c_maps.nii.gz",
        "#s12277__c_maps.nii.gz",
        "#s12300__c_maps.nii.gz",
        "#s12401__c_maps.nii.gz",
        "#s12430__c_maps.nii.gz",
        "#s13817__c_maps.nii.gz",
        "#s13903__c_maps.nii.gz",
        "#s13916__c_maps.nii.gz",
        "#s13981__c_maps.nii.gz",
        "#s13982__c_maps.nii.gz",
        "#s13983__c_maps.nii.gz",
    }

    cached = KlassWithCachedMethod(tmpdir.strpath)
    # Pass the set through the cached method twice and compare hashes.
    first_pass = cached.f(names)
    second_pass = cached.f(first_pass)
    assert hash(first_pass) == hash(second_pass)
def test_set_decimal_hash():
    """Check that sets containing decimals hash consistently, even though
    ordering is not guaranteed.
    """
    zero_first = {Decimal(0), Decimal("NaN")}
    nan_first = {Decimal("NaN"), Decimal(0)}
    assert hash(zero_first) == hash(nan_first)
def test_string():
    """Check that string-owning objects hash the same whatever the past
    of those strings.
    """
    # Test that we obtain the same hash for object owning several strings,
    # whatever the past of these strings (which are immutable in Python)
    key = "foo"
    original = {key: "bar"}
    twin = {key: "bar"}
    # A pickle round-trip yields an equal dict built from new objects.
    unpickled = pickle.loads(pickle.dumps(twin))
    assert hash([original, twin]) == hash([original, unpickled])
def test_numpy_dtype_pickling():
    """Check that dtype hashes are robust to object identity and copy."""
    # numpy dtype hashing is tricky to get right: see #231, #239, #251 #1080,
    # #1082, and explanatory comments inside
    # ``joblib.hashing.NumpyHasher.save``.

    def roundtrip(obj):
        return pickle.loads(pickle.dumps(obj))

    def make_record_dtype():
        return np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,))])

    # In this test, we make sure that the pickling of numpy dtypes is robust
    # to object identity and object copy.
    simple = np.dtype("f4")
    simple_again = np.dtype("f4")

    # simple dtypes objects are interned
    assert simple is simple_again
    assert hash(simple) == hash(simple_again)

    simple_copy = roundtrip(simple)
    assert simple is not simple_copy
    assert hash(simple) == hash(simple_copy)

    assert hash([simple, simple]) == hash([simple_copy, simple_copy])
    assert hash([simple, simple]) == hash([simple, simple_copy])

    record = make_record_dtype()
    record_again = make_record_dtype()

    # complex dtypes objects are not interned
    assert hash(record) == hash(record_again)

    record_copy = roundtrip(record)
    assert record_copy is not record
    assert hash(record) == hash(record_copy)

    assert hash([record, record]) == hash([record_copy, record_copy])
    assert hash([record, record]) == hash([record_copy, record])
def test_hashes_stay_the_same(to_hash, expected):
    """Check that the hash of ``to_hash`` equals the recorded ``expected``."""
    # We want to make sure that hashes don't change with joblib
    # version. For end users, that would mean that they have to
    # regenerate their cache from scratch, which potentially means
    # lengthy recomputations.
    # Expected results have been generated with joblib 0.9.2
    # NOTE(review): nothing visible here supplies ``to_hash``/``expected``;
    # presumably a parametrization belongs above this function -- confirm.
    computed = hash(to_hash)
    assert computed == expected
def test_hashes_are_different_between_c_and_fortran_contiguous_arrays():
    """Check that memory layout is reflected in the hash."""
    # We want to be sure that the c-contiguous and f-contiguous versions of
    # the same array produce 2 different hashes.
    c_ordered = np.random.RandomState(0).random_sample((10, 10))
    f_ordered = np.asfortranarray(c_ordered)
    assert hash(c_ordered) != hash(f_ordered)
def test_0d_array():
    """Check that hashing a 0-dimensional array does not raise."""
    zero_dim = np.array(0)
    hash(zero_dim)
def test_0d_and_1d_array_hashing_is_different():
    """Check that a 0-d array and a 1-d array holding the same value
    do not share a hash.
    """
    zero_dim = np.array(0)
    one_dim = np.array([0])
    assert hash(zero_dim) != hash(one_dim)
def test_hashes_stay_the_same_with_numpy_objects():
    """Check hash consistency of numpy objects within one environment."""
    # Note: joblib used to test numpy objects hashing by comparing the produced
    # hash of an object with some hard-coded target value to guarantee that
    # hashing remains the same across joblib versions. However, since numpy
    # 1.20 and joblib 1.0, joblib relies on potentially unstable implementation
    # details of numpy to hash np.dtype objects, which makes the stability of
    # hash values across different environments hard to guarantee and to test.
    # As a result, hashing stability across joblib versions becomes best-effort
    # only, and we only test the consistency within a single environment by
    # making sure:
    # - the hash of two copies of the same objects is the same
    # - hashing some object in two different python processes produces the same
    #   value. This should be viewed as a proxy for testing hash consistency
    #   through time between Python sessions (provided no change in the
    #   environment was done between sessions).

    def create_objects_to_hash():
        rng = np.random.RandomState(42)
        # Being explicit about dtypes in order to avoid
        # architecture-related differences. Also using 'f4' rather than
        # 'f8' for float arrays because 'f8' arrays generated by
        # rng.randn don't seem to be bit-identical on 32bit and
        # 64bit machines.
        to_hash_list = [
            rng.randint(-1000, high=1000, size=50).astype("<i8"),
            tuple(rng.randn(3).astype("<f4") for _ in range(5)),
            [rng.randn(3).astype("<f4") for _ in range(5)],
            {
                -3333: rng.randn(3, 5).astype("<f4"),
                0: [
                    rng.randint(10, size=20).astype("<i8"),
                    rng.randn(10).astype("<f4"),
                ],
            },
            # Non regression cases for
            # https://github.com/joblib/joblib/issues/308
            np.arange(100, dtype="<i8").reshape((10, 10)),
            # Fortran contiguous array
            np.asfortranarray(np.arange(100, dtype="<i8").reshape((10, 10))),
            # Non contiguous array
            np.arange(100, dtype="<i8").reshape((10, 10))[:, :2],
        ]
        return to_hash_list

    # Create two lists containing copies of the same objects. joblib.hash
    # should return the same hash for to_hash_list_one[i] and
    # to_hash_list_two[i]
    to_hash_list_one = create_objects_to_hash()
    to_hash_list_two = create_objects_to_hash()

    # The context managers shut both executors down on exit, including
    # when creating the second executor or an assertion fails.
    with ProcessPoolExecutor(max_workers=1) as e1, ProcessPoolExecutor(
        max_workers=1
    ) as e2:
        for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
            # testing consistency of hashes across python processes
            hash_1 = e1.submit(hash, obj_1).result()
            hash_2 = e2.submit(hash, obj_1).result()
            assert hash_1 == hash_2

            # testing consistency when hashing two copies of the same objects.
            hash_3 = e1.submit(hash, obj_2).result()
            assert hash_1 == hash_3
def test_hashing_pickling_error():
    """Check that hashing a locally defined function raises a
    ``PicklingError`` carrying the expected message.
    """

    def non_picklable():
        return 42

    with raises(pickle.PicklingError, match="PicklingError while hashing"):
        hash(non_picklable)
def test_wrong_hash_name():
    """Check that an unknown ``hash_name`` is rejected with ValueError."""
    payload = {"foo": "bar"}
    expected_message = "Valid options for 'hash_name' are"
    with raises(ValueError, match=expected_message):
        hash(payload, hash_name="invalid")