"""Gradio app that reports exact and near-duplicate images among uploads.

Exact duplicates are detected by comparing MD5 digests of the file bytes;
near-duplicates are detected by comparing difference hashes (dHash) of the
decoded images.
"""

import hashlib
import itertools
import os

import gradio as gr
import imagehash
import pandas as pd
from PIL import Image

# Column layout shared by the empty and the populated results table.
RESULT_COLUMNS = ["Image 1", "Image 2", "Type", "Similarity (%)"]

# Folder (relative to the working directory) where the CSV report is written.
TEMP_FOLDER = "temp_uploads"

# Two images count as "Similar" when their dHash distance is strictly below
# this number of differing bits.
SIMILAR_DISTANCE_LIMIT = 10


# -------------------------
# MD5 HASH FUNCTION
# -------------------------
def get_md5(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is read in blocks of *chunk_size* bytes so that large uploads
    are never held in memory in full.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# -------------------------
# SIMILARITY SCORE FUNCTION
# -------------------------
def get_similarity_score(hash1, hash2, hash_bits=64):
    """Return how alike two perceptual hashes are, as a percentage.

    ``hash1 - hash2`` must yield the number of differing bits. *hash_bits*
    is the total number of bits per hash; the default of 64 is the value the
    score was originally hard-coded for. The result is rounded to two
    decimal places.
    """
    diff = hash1 - hash2
    similarity = (1 - diff / hash_bits) * 100
    return round(similarity, 2)


def _upload_path(file):
    """Return the on-disk path of one uploaded file.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    the path as ``.name``.
    NOTE(review): which of the two Gradio passes depends on the installed
    Gradio version and the File component's ``type`` setting -- confirm.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


# -------------------------
# MAIN FUNCTION
# -------------------------
def find_duplicates(files, filter_type):
    """Find exact and similar image pairs among *files*.

    Args:
        files: Uploaded files (paths or objects with a ``.name`` path), or
            an empty/``None`` value when nothing was uploaded.
        filter_type: ``"Exact"`` keeps only exact duplicates, ``"Similar"``
            keeps only similar pairs; any other value keeps every row.

    Returns:
        A tuple ``(images_preview, df, csv_path)``: the decoded RGB images
        in upload order, a DataFrame with ``RESULT_COLUMNS`` describing the
        matching pairs, and the path of the CSV file the table was written
        to. With no uploads the result is ``([], <empty DataFrame>, None)``
        and nothing is written.
    """
    if not files:
        return [], pd.DataFrame(columns=RESULT_COLUMNS), None

    os.makedirs(TEMP_FOLDER, exist_ok=True)

    images_preview = []
    first_with_md5 = {}  # MD5 digest -> label of the first upload with it
    records = []  # one (label, md5, dhash) tuple per upload, in upload order
    results = []

    for file in files:
        # Read the upload where it already lives instead of copying it to a
        # basename-derived path: copies of two uploads that share a basename
        # would overwrite each other.
        path = _upload_path(file)
        label = os.path.basename(path)

        # convert() returns an independent image, so the file handle opened
        # by Image.open can be released immediately.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        images_preview.append(img)

        # MD5 check
        md5 = get_md5(path)
        if md5 in first_with_md5:
            results.append([label, first_with_md5[md5], "Exact Duplicate", 100])
        else:
            first_with_md5[md5] = label

        # dHash
        records.append((label, md5, imagehash.dhash(img)))

    # Compare dHash for every unordered pair of uploads.
    for first, second in itertools.combinations(records, 2):
        name_a, md5_a, hash_a = first
        name_b, md5_b, hash_b = second
        if md5_a == md5_b:
            # Byte-identical files are exact duplicates; do not also list
            # them as a 100 % "Similar" pair.
            continue
        if hash_a - hash_b < SIMILAR_DISTANCE_LIMIT:
            score = get_similarity_score(hash_a, hash_b)
            results.append([name_a, name_b, "Similar", score])

    # Create DataFrame
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)

    # Apply filter
    wanted_type = {"Exact": "Exact Duplicate", "Similar": "Similar"}.get(filter_type)
    if wanted_type is not None:
        df = df[df["Type"] == wanted_type]

    # Save CSV
    csv_path = os.path.join(TEMP_FOLDER, "duplicate_results.csv")
    df.to_csv(csv_path, index=False)

    return images_preview, df, csv_path


# -------------------------
# GRADIO UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🖼️ Image Duplicate Finder (Advanced)")

    with gr.Row():
        file_input = gr.File(
            file_types=["image"],
            file_count="multiple",
            label="Upload Images",
        )
        filter_option = gr.Radio(
            ["All", "Exact", "Similar"],
            value="All",
            label="Filter Results",
        )

    run_btn = gr.Button("Find Duplicates 🔍")

    gr.Markdown("### 📸 Uploaded Images")
    gallery = gr.Gallery()

    gr.Markdown("### 📊 Results Table")
    table = gr.Dataframe()

    download_btn = gr.File(label="Download CSV")

    # Action
    run_btn.click(
        fn=find_duplicates,
        inputs=[file_input, filter_option],
        outputs=[gallery, table, download_btn],
    )


# -------------------------
# LAUNCH
# -------------------------
if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from outside the host.
    demo.launch(server_name="0.0.0.0", server_port=7860)