Spaces:
Sleeping
Sleeping
# takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
import sys
import pickle
import numpy as np


def read_md5sums(md5_filename):
    """Parse md5sum-style output into a {filename: md5} dict.

    Each line is "<32 hex chars><sep><filename>" where <sep> is the standard
    md5sum two-character separator: "  " (text mode) or " *" (binary mode).
    Raises ValueError on malformed lines instead of assert, so validation
    survives ``python -O``.
    """
    hashes = {}
    with open(md5_filename, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank/trailing lines instead of crashing on split().
                continue
            md5 = line.split()[0]
            if len(md5) != 32:
                raise ValueError("malformed md5sum line: %r" % line)
            # md5sum emits a two-character separator after the hash.
            if line[32:34] not in ("  ", " *"):
                raise ValueError("unexpected separator in md5sum line: %r" % line)
            filename = line[34:]
            hashes[filename] = md5
    return hashes


def dedupe_by_hash(data, hashes):
    """Drop all but the first entry for each content hash, in place.

    ``data`` must have parallel "filenames" (list) and "embeddings"
    (numpy-indexable) entries; ``hashes`` maps filename -> md5 of the file
    *content* (not the hash of the filename). Returns (data, n_unique).
    """
    collected_indices = []
    collected_md5s = set()
    for i, filename in enumerate(data["filenames"]):
        md5 = hashes[filename]
        if md5 not in collected_md5s:
            collected_indices.append(i)
            collected_md5s.add(md5)
    # Fancy-index both parallel arrays with the same kept indices.
    data["filenames"] = np.array(data["filenames"])[collected_indices].tolist()
    data["embeddings"] = data["embeddings"][collected_indices]
    return data, len(collected_indices)


def main():
    pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]

    hashes = read_md5sums(md5_filename)
    print(len(hashes), "hashes read")

    with open(pickle_filename, "rb") as f:
        data = pickle.load(f)
    print(len(data["embeddings"]), "embeddings read")

    data, n_unique = dedupe_by_hash(data, hashes)
    print(n_unique, "unique hashes")

    # This tool predates "thumbs"; refuse to silently drop a parallel array
    # we don't know how to dedupe. Explicit raise so -O can't strip it.
    if "thumbs" in data:
        raise ValueError("input pickle contains 'thumbs'; not supported")

    with open(output_pickle_filename, "wb") as f:
        pickle.dump(data, f)


if __name__ == "__main__":
    main()