#!/usr/bin/env python3
"""Remove duplicate rows from an embeddings pickle using file-content md5s.

Usage:
    dedupe.py INPUT_PICKLE MD5SUMS_FILE OUTPUT_PICKLE

INPUT_PICKLE holds a dict with (at least) "filenames" (list of str) and
"embeddings" (an array indexable by a list of row indices, e.g. numpy).
MD5SUMS_FILE is ``md5sum`` output covering those filenames.  The first
occurrence of each content hash is kept; later duplicates are dropped.
"""
import sys
import pickle

import numpy as np


def read_md5_file(md5_filename):
    """Parse ``md5sum`` output into a {filename: md5_hex} dict.

    Each line looks like ``<32 hex digits>  <filename>`` with a TWO-space
    separator (md5sum text mode) -- ``line[34:]`` relies on that width.

    Raises ValueError on a malformed line (the original used ``assert``,
    which silently disappears under ``python -O``).
    """
    hashes = {}
    with open(md5_filename, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            md5 = line.split()[0]
            # BUGFIX: the old check compared the 2-char slice line[32:34]
            # to a single space, which could never match a valid line.
            if len(md5) != 32 or line[32:34] != "  ":
                raise ValueError("malformed md5sum line: %r" % line)
            hashes[line[34:]] = md5
    return hashes


def unique_hash_indices(filenames, hashes):
    """Return indices of the first filename carrying each content hash.

    ``hashes`` maps filename -> md5 of the file CONTENT (not a hash of
    the name itself).  Raises KeyError if a filename has no entry.
    """
    seen_md5s = set()
    indices = []
    for i, filename in enumerate(filenames):
        md5 = hashes[filename]
        if md5 not in seen_md5s:
            seen_md5s.add(md5)
            indices.append(i)
    return indices


def main(pickle_filename, md5_filename, output_pickle_filename):
    """Load the pickle, drop content-hash-duplicate rows, write the result."""
    hashes = read_md5_file(md5_filename)
    print(len(hashes), "hashes read")

    with open(pickle_filename, "rb") as f:
        data = pickle.load(f)
    print(len(data["embeddings"]), "embeddings read")

    filenames = data["filenames"]
    collected_indices = unique_hash_indices(filenames, hashes)
    print(len(collected_indices), "unique hashes")

    # Fancy-index both parallel fields with the same kept rows.
    data["filenames"] = np.array(filenames)[collected_indices].tolist()
    data["embeddings"] = data["embeddings"][collected_indices]

    # "thumbs" would need the same row filtering; refuse rather than emit
    # an inconsistent pickle.
    if "thumbs" in data:
        raise ValueError("pickle contains 'thumbs'; filtering not implemented")

    with open(output_pickle_filename, "wb") as f:
        pickle.dump(data, f)


if __name__ == "__main__":
    # Star-unpacking keeps the original strictness: wrong arg count fails.
    main(*sys.argv[1:])