SetuG committed on
Commit
d11e909
·
verified ·
1 Parent(s): 1d80794

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -45
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
  import sqlite3
3
  import hashlib
4
- import numpy as np
5
  import cv2
 
6
  from PIL import Image
7
  import pytesseract
8
  from pdf2image import convert_from_bytes
@@ -11,6 +11,10 @@ from datetime import datetime
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  import gradio as gr
 
 
 
 
14
 
15
  class InvoiceDuplicateDetector:
16
  def __init__(self, db_path="invoices.db"):
@@ -81,7 +85,8 @@ class InvoiceDuplicateDetector:
81
 
82
  def calculate_text_similarity(self, text1, text2):
83
  try:
84
- if not text1.strip() or not text2.strip(): return 0
 
85
  tfidf = self.vectorizer.fit_transform([text1, text2])
86
  return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
87
  except:
@@ -90,7 +95,11 @@ class InvoiceDuplicateDetector:
90
  def hamming_distance(self, h1, h2):
91
  return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float('inf')
92
 
93
- def store_invoice(self, file_bytes, filename):
 
 
 
 
94
  file_hash = self.calculate_file_hash(file_bytes)
95
  conn = sqlite3.connect(self.db_path)
96
  cursor = conn.cursor()
@@ -106,6 +115,7 @@ class InvoiceDuplicateDetector:
106
  else:
107
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
108
  except Exception as e:
 
109
  return False, f"Error processing file: {str(e)}"
110
 
111
  image_hash = self.calculate_image_hash(image)
@@ -120,15 +130,21 @@ class InvoiceDuplicateDetector:
120
  conn.close()
121
  return True, "Stored successfully."
122
 
123
- def find_duplicates(self, file_bytes, filename, threshold=0.8):
 
 
 
 
 
124
  ext = filename.lower().split('.')[-1]
 
125
  try:
126
  if ext == 'pdf':
127
  image = self.pdf_to_image(file_bytes)
128
  else:
129
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
130
  except Exception as e:
131
- return False, f"Failed to process file: {str(e)}"
132
 
133
  image_hash = self.calculate_image_hash(image)
134
  extracted_text = self.extract_text_from_image(image)
@@ -140,6 +156,8 @@ class InvoiceDuplicateDetector:
140
  conn.close()
141
 
142
  results = []
 
 
143
  for inv in invoices:
144
  iid, fname, stored_hash, stored_text, blob = inv
145
  stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
@@ -147,57 +165,146 @@ class InvoiceDuplicateDetector:
147
  text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
148
  img_similarity = self.calculate_image_similarity(image, stored_image)
149
  combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
 
150
  if combined >= threshold:
151
  results.append((fname, combined))
152
- results.sort(key=lambda x: x[1], reverse=True)
153
- return True, results
 
 
 
 
 
 
 
154
 
 
155
 
 
156
  detector = InvoiceDuplicateDetector()
157
 
158
- def upload_files(files):
 
159
  if not files:
160
  return "No files uploaded."
 
161
  results = []
162
  for file in files:
163
- try:
164
- file_bytes = file.read()
165
- filename = getattr(file, "name", "uploaded")
166
- success, message = detector.store_invoice(file_bytes, filename)
167
- results.append(f"{filename}: {message}")
168
- except Exception as e:
169
- results.append(f"{getattr(file, 'name', 'unknown')}: File read error: {str(e)}")
170
  return "\n".join(results)
171
 
172
- def check_duplicates(file):
173
- try:
174
- file_bytes = file.read()
175
- filename = getattr(file, "name", "uploaded")
176
- ok, result = detector.find_duplicates(file_bytes, filename)
177
- if not ok:
178
- return result
179
- if not result:
180
- return "✅ No duplicates found!"
181
- return "\n".join([f"🔁 {fname} — Similarity: {score:.2f}" for fname, score in result])
182
- except Exception as e:
183
- return f"File read error: {str(e)}"
184
-
185
- with gr.Blocks(theme=gr.themes.Base()) as demo:
186
- gr.Markdown("## 📄 Invoice Duplicate Detector")
187
-
188
- with gr.Row():
189
- with gr.Column():
190
- upload_input = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"], file_count="multiple", label="Upload Invoices")
191
- upload_btn = gr.Button("Upload")
192
- upload_output = gr.Textbox(label="Upload Result")
193
- with gr.Column():
194
- check_input = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"], label="Check for Duplicate")
195
- check_btn = gr.Button("Check")
196
- check_output = gr.Textbox(label="Check Result")
197
-
198
- upload_btn.click(upload_files, inputs=upload_input, outputs=upload_output)
199
- check_btn.click(check_duplicates, inputs=check_input, outputs=check_output)
200
-
201
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
 
 
1
  import os
2
  import sqlite3
3
  import hashlib
 
4
  import cv2
5
+ import numpy as np
6
  from PIL import Image
7
  import pytesseract
8
  from pdf2image import convert_from_bytes
 
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  import gradio as gr
14
+ import tempfile
15
+ import shutil
16
+
17
+ # --------- CLASS DEFINITION ---------
18
 
19
  class InvoiceDuplicateDetector:
20
  def __init__(self, db_path="invoices.db"):
 
85
 
86
  def calculate_text_similarity(self, text1, text2):
87
  try:
88
+ if not text1.strip() or not text2.strip():
89
+ return 0
90
  tfidf = self.vectorizer.fit_transform([text1, text2])
91
  return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
92
  except:
 
95
  def hamming_distance(self, h1, h2):
96
  return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float('inf')
97
 
98
+ def store_invoice_from_path(self, file_path, filename):
99
+ """Store invoice from file path (for Gradio compatibility)"""
100
+ with open(file_path, 'rb') as f:
101
+ file_bytes = f.read()
102
+
103
  file_hash = self.calculate_file_hash(file_bytes)
104
  conn = sqlite3.connect(self.db_path)
105
  cursor = conn.cursor()
 
115
  else:
116
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
117
  except Exception as e:
118
+ conn.close()
119
  return False, f"Error processing file: {str(e)}"
120
 
121
  image_hash = self.calculate_image_hash(image)
 
130
  conn.close()
131
  return True, "Stored successfully."
132
 
133
+ def find_duplicates_from_path(self, file_path, threshold=0.8):
134
+ """Find duplicates from file path (for Gradio compatibility)"""
135
+ with open(file_path, 'rb') as f:
136
+ file_bytes = f.read()
137
+
138
+ filename = os.path.basename(file_path)
139
  ext = filename.lower().split('.')[-1]
140
+
141
  try:
142
  if ext == 'pdf':
143
  image = self.pdf_to_image(file_bytes)
144
  else:
145
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
146
  except Exception as e:
147
+ return False, f"Failed to process file: {str(e)}", None, []
148
 
149
  image_hash = self.calculate_image_hash(image)
150
  extracted_text = self.extract_text_from_image(image)
 
156
  conn.close()
157
 
158
  results = []
159
+ matched_images = []
160
+
161
  for inv in invoices:
162
  iid, fname, stored_hash, stored_text, blob = inv
163
  stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
 
165
  text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
166
  img_similarity = self.calculate_image_similarity(image, stored_image)
167
  combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
168
+
169
  if combined >= threshold:
170
  results.append((fname, combined))
171
+ matched_images.append(stored_image)
172
+
173
+ # Sort by similarity score
174
+ if results:
175
+ sorted_results = sorted(zip(results, matched_images), key=lambda x: x[0][1], reverse=True)
176
+ results = [r[0] for r in sorted_results]
177
+ matched_images = [r[1] for r in sorted_results]
178
+
179
+ return True, "Search completed", image, results, matched_images
180
 
181
+ # --------- GRADIO INTERFACE FUNCTIONS ---------
182
 
183
+ # Initialize detector
184
  detector = InvoiceDuplicateDetector()
185
 
186
def upload_and_store_invoices(files):
    """Store every uploaded invoice and report a per-file outcome.

    Args:
        files: Uploaded file objects from the Gradio ``File`` component; each
            one exposes its on-disk path as ``.name``. May be ``None`` or
            empty when the user clicks the button without selecting files.

    Returns:
        One line per file in the form "<marker> <filename>: <message>",
        joined with newlines, or "No files uploaded." when nothing was given.
    """
    if not files:
        return "No files uploaded."

    report = []
    for file in files:
        filename = os.path.basename(file.name)
        try:
            success, msg = detector.store_invoice_from_path(file.name, filename)
        except Exception as exc:
            # UI boundary: one unreadable or corrupt file must not abort the
            # rest of the batch, so report it and carry on.
            success, msg = False, f"Unexpected error: {exc}"
        # Distinct markers so success and failure are visually separable.
        status = "✅" if success else "❌"
        report.append(f"{status} {filename}: {msg}")

    return "\n".join(report)
199
 
200
def check_for_duplicates(file, threshold):
    """Check one uploaded file against the stored invoices.

    Args:
        file: Uploaded file object from the Gradio ``File`` component (its
            path is read from ``.name``), or ``None`` if nothing was uploaded.
        threshold: Minimum similarity as a percentage; it is divided by 100
            before being passed to the detector.

    Returns:
        A 3-tuple ``(summary_text, input_image, match_info)``, one value per
        output component bound to the "Check for Duplicates" button.
    """
    if not file:
        return "No file uploaded.", None, "No duplicates found."

    # The detector's failure path returns fewer items than its success path,
    # so capture the variable tail separately instead of assuming five values
    # (a fixed five-way unpack raises ValueError on every processing error).
    success, msg, *payload = detector.find_duplicates_from_path(
        file.name, threshold=threshold / 100.0
    )
    if not success:
        return f"Error: {msg}", None, "Error occurred during processing."

    input_image, results = payload[0], payload[1]
    if not results:
        return (
            "✅ No duplicates found!",
            input_image,
            "No similar invoices found in the database.",
        )

    # Results arrive sorted by descending score, so index 0 is the best match.
    header = f"⚠️ Found {len(results)} potential duplicate(s):\n\n"
    listing = "".join(
        f"{rank}. **{fname}** - Similarity: {score:.2%}\n"
        for rank, (fname, score) in enumerate(results, start=1)
    )
    best_name, best_score = results[0]
    match_info = f"Showing match: {best_name} (Similarity: {best_score:.2%})"

    return header + listing, input_image, match_info
225
+
226
def get_database_stats():
    """Report how many invoices are currently stored.

    Returns:
        A one-line summary string containing the row count of the
        ``invoices`` table in the detector's database.
    """
    conn = sqlite3.connect(detector.db_path)
    try:
        count = conn.execute("SELECT COUNT(*) FROM invoices").fetchone()[0]
    finally:
        # Close even when the query raises (e.g. the table is missing) so the
        # connection handle is never leaked.
        conn.close()
    return f"📊 Database contains {count} stored invoices"
234
+
235
+ # --------- GRADIO INTERFACE ---------
236
+
237
# Three-tab Gradio UI: store invoices, check a file for duplicates, and show
# database statistics. Event handlers are the module-level functions above.
with gr.Blocks(title="Invoice Duplicate Detector", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 📄 Invoice Duplicate Detector")
    gr.Markdown("Upload invoices to store them in the database, then check new invoices for potential duplicates.")

    with gr.Tab("📤 Upload & Store Invoices"):
        gr.Markdown("### Upload invoice files to store in the database")
        upload_files = gr.File(
            label="Select invoice files (PDF, PNG, JPG, JPEG)",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
        )
        upload_btn = gr.Button("Store Invoices", variant="primary")
        upload_output = gr.Textbox(label="Upload Results", lines=5)

        upload_btn.click(
            fn=upload_and_store_invoices,
            inputs=upload_files,
            outputs=upload_output,
        )

    with gr.Tab("🔍 Check for Duplicates"):
        gr.Markdown("### Upload a file to check for duplicates")

        with gr.Row():
            with gr.Column():
                check_file = gr.File(
                    label="Upload file to check",
                    file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                )
                # The slider works in percent; the handler divides by 100.
                threshold_slider = gr.Slider(
                    minimum=50,
                    maximum=100,
                    value=80,
                    step=5,
                    label="Similarity Threshold (%)",
                    info="Higher values = stricter matching",
                )
                check_btn = gr.Button("Check for Duplicates", variant="primary")

        duplicate_results = gr.Textbox(label="Duplicate Check Results", lines=5)

        with gr.Row():
            with gr.Column():
                gr.Markdown("#### Input Invoice")
                input_image = gr.Image(label="Uploaded Invoice")
            with gr.Column():
                gr.Markdown("#### Best Match")
                # NOTE(review): match_image is not listed in the click outputs
                # below, so nothing ever populates it. Wiring it up requires
                # the handler to return a fourth value as well.
                match_image = gr.Image(label="Matched Invoice")

        match_info = gr.Textbox(label="Match Information")

        check_btn.click(
            fn=check_for_duplicates,
            inputs=[check_file, threshold_slider],
            outputs=[duplicate_results, input_image, match_info],
        )

    with gr.Tab("📊 Database Info"):
        gr.Markdown("### Database Statistics")
        stats_btn = gr.Button("Refresh Stats")
        stats_output = gr.Textbox(label="Database Statistics")

        stats_btn.click(
            fn=get_database_stats,
            outputs=stats_output,
        )

    # Load stats on page load
    app.load(fn=get_database_stats, outputs=stats_output)

if __name__ == "__main__":
    app.launch()
309
 
310