"""Find and remove near-duplicate images in a directory using perceptual hashing."""

import os
from collections import deque

import imagehash
from PIL import Image
from tqdm import tqdm
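
# Third-party dependencies: the ImageHash, Pillow, and tqdm packages
# (e.g. `pip install ImageHash Pillow tqdm`).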


def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan `base_dir` for near-duplicate images.

    A file is flagged as a duplicate if its perceptual hash exactly matches
    a previously seen hash, or lies within `threshold` bits (Hamming
    distance) of one of the `queue_len` most recently accepted hashes.

    Returns a dict mapping each distinct hash to the first file that
    produced it, and the list of files flagged as duplicates.
    """
    snapshot_files = sorted(os.listdir(base_dir))

    hash_dict = {}
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []
    num_duplicates = 0

    print("---" * 5, "Finding similar files", "---" * 5)

    with tqdm(snapshot_files) as t:
        for file in t:
            # Compute the hash inside the `with` block so the file handle
            # is closed promptly after the pixel data is read.
            with Image.open(os.path.join(base_dir, file)) as image:
                comp_hash = hashfunc(image, hash_size=hash_size)
            duplicate = False

            if comp_hash not in hash_dict:
                hash_dict[comp_hash] = file

                # Near-duplicate check: subtracting two ImageHash objects
                # yields the Hamming distance between their bit strings.
                for img_hash in hash_queue:
                    if img_hash - comp_hash <= threshold:
                        duplicate = True
                        break

                if not duplicate:
                    hash_queue.append(comp_hash)
            else:
                # Exact hash match with a previously seen file.
                duplicate = True

            if duplicate:
                duplicates.append(file)
                num_duplicates += 1
                t.set_postfix_str(f"Duplicate files: {num_duplicates}")

    return hash_dict, duplicates
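

# A minimal sketch of the distance test used above, assuming two hypothetical
# image files: ImageHash subtraction returns the Hamming distance (number of
# differing bits) between two hashes, so `img_hash - comp_hash <= threshold`
# flags frames that differ in at most `threshold` bits. This helper is purely
# illustrative and is never called.
def _hash_distance_demo():
    hash_a = imagehash.dhash(Image.open("frame_001.png"))  # hypothetical path
    hash_b = imagehash.dhash(Image.open("frame_002.png"))  # hypothetical path
    return hash_a - hash_b  # 0 = identical hashes; small values = near-duplicates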


def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Find near-duplicate images in `base_dir` and delete them from disk."""
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")

        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)

            if os.path.exists(file_path):
                os.remove(file_path)
            else:
                print("Filepath:", file_path, "does not exist.")

        print("All duplicates removed!")

    print("***" * 10, "\n")


if __name__ == "__main__":
    remove_duplicates("sample_1")
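    # The hash function and sensitivity are configurable; for example
    # (parameter values below are illustrative, not tuned):
    # remove_duplicates("sample_1", hashfunc=imagehash.phash, threshold=6)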