File size: 1,172 Bytes
8424a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.

import sys
import pickle

import numpy as np


def read_hashes(md5_filename):
    """Parse an md5sum-format file: each line is "<32-hex-md5>  <filename>".

    Returns a dict mapping filename -> md5 of that file's *content*
    (not the hash of the filename). A later duplicate filename overwrites
    an earlier one, same as the original dict-assignment behavior.

    Raises ValueError on a malformed line (the original used `assert`,
    which is silently stripped under ``python -O``).
    """
    hashes = {}
    # ``with`` guarantees the handle is closed; the original leaked it.
    with open(md5_filename, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            md5 = line.split()[0]
            if len(md5) != 32:
                raise ValueError(f"bad md5 field in line: {line!r}")
            if line[32:34] != "  ":
                raise ValueError(f"expected two-space separator in line: {line!r}")
            hashes[line[34:]] = md5
    return hashes


def dedupe(data, hashes):
    """Keep only the first entry for each content hash (first occurrence wins).

    Mutates and returns ``data``: "filenames" stays a list, "embeddings"
    is sliced with the surviving row indices (numpy fancy indexing).
    """
    keep = []
    seen = set()
    for i, filename in enumerate(data["filenames"]):
        md5 = hashes[filename]  # hash of the file content, keyed by filename
        if md5 not in seen:
            seen.add(md5)
            keep.append(i)
    print(len(keep), "unique hashes")

    data["filenames"] = np.array(data["filenames"])[keep].tolist()
    data["embeddings"] = data["embeddings"][keep]
    return data


def main(argv):
    """Entry point: argv = [input_pickle, md5sum_file, output_pickle]."""
    pickle_filename, md5_filename, output_pickle_filename = argv

    hashes = read_hashes(md5_filename)
    print(len(hashes), "hashes read")

    with open(pickle_filename, "rb") as f:
        data = pickle.load(f)
    print(len(data["embeddings"]), "embeddings read")

    # A "thumbs" array would also need slicing; fail loudly if present
    # (real exception instead of an -O-strippable assert).
    if "thumbs" in data:
        raise ValueError("'thumbs' key present; this script does not slice it")

    data = dedupe(data, hashes)

    with open(output_pickle_filename, "wb") as f:
        pickle.dump(data, f)


if __name__ == "__main__":
    main(sys.argv[1:])