"""Find and remove near-duplicate images in a directory using perceptual hashing."""

import os
from collections import deque

import imagehash
from PIL import Image
from tqdm import tqdm
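
# Third-party dependencies: the ImageHash, Pillow, and tqdm packages
# (e.g. `pip install ImageHash Pillow tqdm`).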


def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan `base_dir` for near-duplicate images.

    A file is flagged as a duplicate if its perceptual hash exactly matches
    a previously seen hash, or lies within `threshold` bits (Hamming
    distance) of one of the `queue_len` most recently accepted hashes.

    Returns a dict mapping each distinct hash to the first file that
    produced it, and the list of files flagged as duplicates.
    """
    snapshot_files = sorted(os.listdir(base_dir))

    hash_dict = {}
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []
    num_duplicates = 0

    print("---" * 5, "Finding similar files", "---" * 5)

    with tqdm(snapshot_files) as t:
        for file in t:
            # Compute the hash inside the `with` block so the file handle
            # is closed promptly after the pixel data is read.
            with Image.open(os.path.join(base_dir, file)) as image:
                comp_hash = hashfunc(image, hash_size=hash_size)
            duplicate = False

            if comp_hash not in hash_dict:
                hash_dict[comp_hash] = file

                # Near-duplicate check: subtracting two ImageHash objects
                # yields the Hamming distance between their bit strings.
                for img_hash in hash_queue:
                    if img_hash - comp_hash <= threshold:
                        duplicate = True
                        break

                if not duplicate:
                    hash_queue.append(comp_hash)
            else:
                # Exact hash match with a previously seen file.
                duplicate = True

            if duplicate:
                duplicates.append(file)
                num_duplicates += 1
                t.set_postfix_str(f"Duplicate files: {num_duplicates}")

    return hash_dict, duplicates
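

# A minimal sketch of the distance test used above, assuming two hypothetical
# image files: ImageHash subtraction returns the Hamming distance (number of
# differing bits) between two hashes, so `img_hash - comp_hash <= threshold`
# flags frames that differ in at most `threshold` bits. This helper is purely
# illustrative and is never called.
def _hash_distance_demo():
    hash_a = imagehash.dhash(Image.open("frame_001.png"))  # hypothetical path
    hash_b = imagehash.dhash(Image.open("frame_002.png"))  # hypothetical path
    return hash_a - hash_b  # 0 = identical hashes; small values = near-duplicates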


def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Find near-duplicate images in `base_dir` and delete them from disk."""
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")

        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)

            if os.path.exists(file_path):
                os.remove(file_path)
            else:
                print("Filepath:", file_path, "does not exist.")

        print("All duplicates removed!")

    print("***" * 10, "\n")


if __name__ == "__main__":
    remove_duplicates("sample_1")
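    # The hash function and sensitivity are configurable; for example
    # (parameter values below are illustrative, not tuned):
    # remove_duplicates("sample_1", hashfunc=imagehash.phash, threshold=6)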