mkshari committed on
Commit
9429840
·
verified ·
1 Parent(s): d198e2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -69
app.py CHANGED
@@ -2,47 +2,13 @@ import gradio as gr
2
  from PIL import Image
3
  import imagehash
4
  import hashlib
5
- import torch
6
- import torchvision.transforms as transforms
7
- from torchvision import models
8
- import numpy as np
9
 
10
  # -------------------------
11
  # MD5 HASH FUNCTION
12
  # -------------------------
13
- def get_md5(file):
14
- md5 = hashlib.md5()
15
- md5.update(file)
16
- return md5.hexdigest()
17
-
18
- # -------------------------
19
- # dHash FUNCTION
20
- # -------------------------
21
- def get_dhash(image):
22
- return imagehash.dhash(image)
23
-
24
- # -------------------------
25
- # LOAD SIMPLE MODEL (OPTIONAL - lightweight instead of full ViT)
26
- # -------------------------
27
- model = models.resnet18(pretrained=True)
28
- model.eval()
29
-
30
- transform = transforms.Compose([
31
- transforms.Resize((224, 224)),
32
- transforms.ToTensor()
33
- ])
34
-
35
- def get_features(image):
36
- img = transform(image).unsqueeze(0)
37
- with torch.no_grad():
38
- features = model(img)
39
- return features.numpy()
40
-
41
- # -------------------------
42
- # SIMILARITY FUNCTION
43
- # -------------------------
44
- def cosine_similarity(a, b):
45
- return np.dot(a, b.T) / (np.linalg.norm(a) * np.linalg.norm(b))
46
 
47
  # -------------------------
48
  # MAIN FUNCTION
@@ -53,45 +19,32 @@ def find_duplicates(files):
53
 
54
  md5_map = {}
55
  dhash_map = {}
56
- features_list = []
57
  results = []
58
 
59
- images = []
60
-
61
  for file in files:
62
- img = Image.open(file.name).convert("RGB")
63
- images.append((file.name, img))
64
 
65
- # MD5
66
- file_bytes = open(file.name, "rb").read()
67
- md5 = get_md5(file_bytes)
68
 
 
 
69
  if md5 in md5_map:
70
- results.append(f"Exact Duplicate: {file.name} == {md5_map[md5]}")
71
  else:
72
- md5_map[md5] = file.name
73
 
74
- # dHash
75
- dh = get_dhash(img)
76
- dhash_map[file.name] = dh
77
 
78
- # Features
79
- features = get_features(img)
80
- features_list.append((file.name, features))
81
-
82
- # dHash comparison
83
- names = list(dhash_map.keys())
84
- for i in range(len(names)):
85
- for j in range(i+1, len(names)):
86
- if dhash_map[names[i]] - dhash_map[names[j]] < 5:
87
- results.append(f"Similar (dHash): {names[i]} ~ {names[j]}")
88
-
89
- # Feature similarity
90
- for i in range(len(features_list)):
91
- for j in range(i+1, len(features_list)):
92
- sim = cosine_similarity(features_list[i][1], features_list[j][1])
93
- if sim > 0.8:
94
- results.append(f"Near Duplicate (AI): {features_list[i][0]} ~ {features_list[j][0]}")
95
 
96
  if not results:
97
  return "No duplicates found"
@@ -106,7 +59,10 @@ interface = gr.Interface(
106
  inputs=gr.File(file_count="multiple", label="Upload Images"),
107
  outputs="text",
108
  title="Image Duplicate Finder",
109
- description="Upload images to find exact and near duplicates using MD5, dHash, and AI"
110
  )
111
 
112
- interface.launch()
 
 
 
 
2
  from PIL import Image
3
  import imagehash
4
  import hashlib
 
 
 
 
5
 
6
  # -------------------------
7
  # MD5 HASH FUNCTION
8
  # -------------------------
9
+ def get_md5(file_path):
10
+ with open(file_path, "rb") as f:
11
+ return hashlib.md5(f.read()).hexdigest()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # -------------------------
14
  # MAIN FUNCTION
 
19
 
20
  md5_map = {}
21
  dhash_map = {}
 
22
  results = []
23
 
24
+ # Process images
 
25
  for file in files:
26
+ file_path = file.name
 
27
 
28
+ # Load image
29
+ img = Image.open(file_path).convert("RGB")
 
30
 
31
+ # MD5 (Exact duplicates)
32
+ md5 = get_md5(file_path)
33
  if md5 in md5_map:
34
+ results.append(f"Exact Duplicate: {file_path} == {md5_map[md5]}")
35
  else:
36
+ md5_map[md5] = file_path
37
 
38
+ # dHash (Similar images)
39
+ dhash_map[file_path] = imagehash.dhash(img)
 
40
 
41
+ # Compare dHash
42
+ file_names = list(dhash_map.keys())
43
+ for i in range(len(file_names)):
44
+ for j in range(i + 1, len(file_names)):
45
+ diff = dhash_map[file_names[i]] - dhash_map[file_names[j]]
46
+ if diff < 5:
47
+ results.append(f"Similar Image: {file_names[i]} ~ {file_names[j]}")
 
 
 
 
 
 
 
 
 
 
48
 
49
  if not results:
50
  return "No duplicates found"
 
59
  inputs=gr.File(file_count="multiple", label="Upload Images"),
60
  outputs="text",
61
  title="Image Duplicate Finder",
62
+ description="Upload images to detect exact and similar duplicates using MD5 and dHash"
63
  )
64
 
65
+ # -------------------------
66
+ # LAUNCH (IMPORTANT FOR HF)
67
+ # -------------------------
68
+ interface.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)