# se/dedupe.py
# Author: Daniel Varga
# Commit 8424a77: create embedding, deduplication, docs
# takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
import sys
import pickle
import numpy as np
# Usage: dedupe.py <input.pkl> <md5sums.txt> <output.pkl>
pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]

# Map each filename to the md5 of its file CONTENT, parsed from `md5sum`
# output lines of the form "<32 hex digits>  <filename>" (two-space separator).
hashes = {}
with open(md5_filename, "r") as md5_file:
    for line in md5_file:
        line = line.strip("\n")
        md5 = line.split()[0]
        assert len(md5) == 32, f"malformed md5 in line: {line!r}"
        # md5sum separates hash and filename with two spaces; the original
        # compared the 2-char slice to a single space, which always failed.
        assert line[32:34] == "  ", f"unexpected separator in line: {line!r}"
        filename = line[34:]
        hashes[filename] = md5
print(len(hashes), "hashes read")
# Load the embeddings pickle. Expected keys: "filenames" (list of str) and
# "embeddings" (numpy array whose first axis is parallel to filenames).
# Context manager fixes the original's leaked file handle.
with open(pickle_filename, "rb") as pickle_file:
    data = pickle.load(pickle_file)
print(len(data["embeddings"]), "embeddings read")
filenames = data["filenames"]

# Keep only the FIRST occurrence of each content hash.
collected_indices = []
seen_md5s = set()
for i, filename in enumerate(filenames):
    md5 = hashes[filename]  # hash of the file content, not of the filename.
    if md5 not in seen_md5s:
        collected_indices.append(i)
        seen_md5s.add(md5)
print(len(collected_indices), "unique hashes")

# Filter both parallel fields with the surviving indices (numpy fancy
# indexing on embeddings; round-trip through an array for the filenames).
filenames = np.array(filenames)
data["filenames"] = filenames[collected_indices].tolist()
data["embeddings"] = data["embeddings"][collected_indices]
# A "thumbs" field would also need the same filtering; fail loudly rather
# than silently write it out desynchronized.
assert "thumbs" not in data

with open(output_pickle_filename, "wb") as f:
    pickle.dump(data, f)