# se/dedupe.py
# Author: Daniel Varga
# Commit 8424a77: create embedding, deduplication, docs
# takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
import sys
import pickle
import numpy as np
# Usage: dedupe.py <input.pkl> <md5sums.txt> <output.pkl>
pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]

# Map each filename to the md5 of its file CONTENT, parsed from `md5sum`
# output lines of the form "<32 hex digits>  <filename>" (two-space separator).
hashes = {}
with open(md5_filename, "r") as md5_file:
    for line in md5_file:
        line = line.strip("\n")
        md5 = line.split()[0]
        assert len(md5) == 32, f"malformed md5 in line: {line!r}"
        # md5sum separates hash and filename with two spaces; the original
        # compared the 2-char slice to a single space, which always failed.
        assert line[32:34] == "  ", f"unexpected separator in line: {line!r}"
        filename = line[34:]
        hashes[filename] = md5
print(len(hashes), "hashes read")
# Load the embeddings pickle. Expected keys: "filenames" (list of str) and
# "embeddings" (numpy array whose first axis is parallel to filenames).
# Context manager fixes the original's leaked file handle.
with open(pickle_filename, "rb") as pickle_file:
    data = pickle.load(pickle_file)
print(len(data["embeddings"]), "embeddings read")
filenames = data["filenames"]

# Keep only the FIRST occurrence of each content hash.
collected_indices = []
seen_md5s = set()
for i, filename in enumerate(filenames):
    md5 = hashes[filename]  # hash of the file content, not of the filename.
    if md5 not in seen_md5s:
        collected_indices.append(i)
        seen_md5s.add(md5)
print(len(collected_indices), "unique hashes")

# Filter both parallel fields with the surviving indices (numpy fancy
# indexing on embeddings; round-trip through an array for the filenames).
filenames = np.array(filenames)
data["filenames"] = filenames[collected_indices].tolist()
data["embeddings"] = data["embeddings"][collected_indices]
# A "thumbs" field would also need the same filtering; fail loudly rather
# than silently write it out desynchronized.
assert "thumbs" not in data

with open(output_pickle_filename, "wb") as f:
    pickle.dump(data, f)