SetuG committed on
Commit
530cc2d
·
verified ·
1 Parent(s): 442f6d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -151
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
  import sqlite3
3
  import hashlib
4
- import cv2
5
  import numpy as np
 
6
  from PIL import Image
7
  import pytesseract
8
  from pdf2image import convert_from_bytes
@@ -11,10 +11,6 @@ from datetime import datetime
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  import gradio as gr
14
- import tempfile
15
- import shutil
16
-
17
- # --------- CLASS DEFINITION ---------
18
 
19
  class InvoiceDuplicateDetector:
20
  def __init__(self, db_path="invoices.db"):
@@ -85,8 +81,7 @@ class InvoiceDuplicateDetector:
85
 
86
  def calculate_text_similarity(self, text1, text2):
87
  try:
88
- if not text1.strip() or not text2.strip():
89
- return 0
90
  tfidf = self.vectorizer.fit_transform([text1, text2])
91
  return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
92
  except:
@@ -95,11 +90,7 @@ class InvoiceDuplicateDetector:
95
  def hamming_distance(self, h1, h2):
96
  return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float('inf')
97
 
98
- def store_invoice_from_path(self, file_path, filename):
99
- """Store invoice from file path (for Gradio compatibility)"""
100
- with open(file_path, 'rb') as f:
101
- file_bytes = f.read()
102
-
103
  file_hash = self.calculate_file_hash(file_bytes)
104
  conn = sqlite3.connect(self.db_path)
105
  cursor = conn.cursor()
@@ -115,7 +106,6 @@ class InvoiceDuplicateDetector:
115
  else:
116
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
117
  except Exception as e:
118
- conn.close()
119
  return False, f"Error processing file: {str(e)}"
120
 
121
  image_hash = self.calculate_image_hash(image)
@@ -130,21 +120,15 @@ class InvoiceDuplicateDetector:
130
  conn.close()
131
  return True, "Stored successfully."
132
 
133
- def find_duplicates_from_path(self, file_path, threshold=0.8):
134
- """Find duplicates from file path (for Gradio compatibility)"""
135
- with open(file_path, 'rb') as f:
136
- file_bytes = f.read()
137
-
138
- filename = os.path.basename(file_path)
139
  ext = filename.lower().split('.')[-1]
140
-
141
  try:
142
  if ext == 'pdf':
143
  image = self.pdf_to_image(file_bytes)
144
  else:
145
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
146
  except Exception as e:
147
- return False, f"Failed to process file: {str(e)}", None, []
148
 
149
  image_hash = self.calculate_image_hash(image)
150
  extracted_text = self.extract_text_from_image(image)
@@ -156,8 +140,6 @@ class InvoiceDuplicateDetector:
156
  conn.close()
157
 
158
  results = []
159
- matched_images = []
160
-
161
  for inv in invoices:
162
  iid, fname, stored_hash, stored_text, blob = inv
163
  stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
@@ -165,146 +147,60 @@ class InvoiceDuplicateDetector:
165
  text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
166
  img_similarity = self.calculate_image_similarity(image, stored_image)
167
  combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
168
-
169
  if combined >= threshold:
170
  results.append((fname, combined))
171
- matched_images.append(stored_image)
172
-
173
- # Sort by similarity score
174
- if results:
175
- sorted_results = sorted(zip(results, matched_images), key=lambda x: x[0][1], reverse=True)
176
- results = [r[0] for r in sorted_results]
177
- matched_images = [r[1] for r in sorted_results]
178
-
179
- return True, "Search completed", image, results, matched_images
180
 
181
- # --------- GRADIO INTERFACE FUNCTIONS ---------
182
 
183
- # Initialize detector
184
  detector = InvoiceDuplicateDetector()
185
 
186
- def upload_and_store_invoices(files):
187
- """Handle multiple file uploads and store them"""
188
  if not files:
189
  return "No files uploaded."
190
-
191
  results = []
192
  for file in files:
193
- filename = os.path.basename(file.name)
194
- success, msg = detector.store_invoice_from_path(file.name, filename)
195
- status = "✅" if success else "❌"
196
- results.append(f"{status} {filename}: {msg}")
197
-
 
 
 
198
  return "\n".join(results)
199
 
200
- def check_for_duplicates(file, threshold):
201
- """Check uploaded file for duplicates"""
202
- if not file:
203
- return "No file uploaded.", None, "No duplicates found."
204
-
205
- filename = os.path.basename(file.name)
206
- success, msg, input_image, results, matched_images = detector.find_duplicates_from_path(
207
- file.name, threshold=threshold/100.0
208
- )
209
-
210
- if not success:
211
- return f"Error: {msg}", None, "Error occurred during processing."
212
-
213
- if not results:
214
- return "✅ No duplicates found!", input_image, "No similar invoices found in the database."
215
-
216
- # Format results
217
- duplicate_info = f"⚠️ Found {len(results)} potential duplicate(s):\n\n"
218
- for i, (fname, score) in enumerate(results):
219
- duplicate_info += f"{i+1}. **{fname}** - Similarity: {score:.2%}\n"
220
-
221
- # Return the first matched image for display
222
- first_match = matched_images[0] if matched_images else None
223
-
224
- return duplicate_info, input_image, f"Showing match: {results[0][0]} (Similarity: {results[0][1]:.2%})" if results else "No matches"
225
-
226
- def get_database_stats():
227
- """Get statistics about stored invoices"""
228
- conn = sqlite3.connect(detector.db_path)
229
- cursor = conn.cursor()
230
- cursor.execute("SELECT COUNT(*) FROM invoices")
231
- count = cursor.fetchone()[0]
232
- conn.close()
233
- return f"📊 Database contains {count} stored invoices"
234
-
235
- # --------- GRADIO INTERFACE ---------
236
-
237
- with gr.Blocks(title="Invoice Duplicate Detector", theme=gr.themes.Soft()) as app:
238
- gr.Markdown("# 📄 Invoice Duplicate Detector")
239
- gr.Markdown("Upload invoices to store them in the database, then check new invoices for potential duplicates.")
240
-
241
- with gr.Tab("📤 Upload & Store Invoices"):
242
- gr.Markdown("### Upload invoice files to store in the database")
243
- upload_files = gr.File(
244
- label="Select invoice files (PDF, PNG, JPG, JPEG)",
245
- file_count="multiple",
246
- file_types=[".pdf", ".png", ".jpg", ".jpeg"]
247
- )
248
- upload_btn = gr.Button("Store Invoices", variant="primary")
249
- upload_output = gr.Textbox(label="Upload Results", lines=5)
250
-
251
- upload_btn.click(
252
- fn=upload_and_store_invoices,
253
- inputs=upload_files,
254
- outputs=upload_output
255
- )
256
-
257
- with gr.Tab("🔍 Check for Duplicates"):
258
- gr.Markdown("### Upload a file to check for duplicates")
259
-
260
- with gr.Row():
261
- with gr.Column():
262
- check_file = gr.File(
263
- label="Upload file to check",
264
- file_types=[".pdf", ".png", ".jpg", ".jpeg"]
265
- )
266
- threshold_slider = gr.Slider(
267
- minimum=50,
268
- maximum=100,
269
- value=80,
270
- step=5,
271
- label="Similarity Threshold (%)",
272
- info="Higher values = stricter matching"
273
- )
274
- check_btn = gr.Button("Check for Duplicates", variant="primary")
275
-
276
- duplicate_results = gr.Textbox(label="Duplicate Check Results", lines=5)
277
-
278
- with gr.Row():
279
- with gr.Column():
280
- gr.Markdown("#### Input Invoice")
281
- input_image = gr.Image(label="Uploaded Invoice")
282
- with gr.Column():
283
- gr.Markdown("#### Best Match")
284
- match_image = gr.Image(label="Matched Invoice")
285
-
286
- match_info = gr.Textbox(label="Match Information")
287
-
288
- check_btn.click(
289
- fn=check_for_duplicates,
290
- inputs=[check_file, threshold_slider],
291
- outputs=[duplicate_results, input_image, match_info]
292
- )
293
-
294
- with gr.Tab("📊 Database Info"):
295
- gr.Markdown("### Database Statistics")
296
- stats_btn = gr.Button("Refresh Stats")
297
- stats_output = gr.Textbox(label="Database Statistics")
298
-
299
- stats_btn.click(
300
- fn=get_database_stats,
301
- outputs=stats_output
302
- )
303
-
304
- # Load stats on page load
305
- app.load(fn=get_database_stats, outputs=stats_output)
306
 
307
- if __name__ == "__main__":
308
- app.launch()
309
 
310
 
 
1
  import os
2
  import sqlite3
3
  import hashlib
 
4
  import numpy as np
5
+ import cv2
6
  from PIL import Image
7
  import pytesseract
8
  from pdf2image import convert_from_bytes
 
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  import gradio as gr
 
 
 
 
14
 
15
  class InvoiceDuplicateDetector:
16
  def __init__(self, db_path="invoices.db"):
 
81
 
82
  def calculate_text_similarity(self, text1, text2):
83
  try:
84
+ if not text1.strip() or not text2.strip(): return 0
 
85
  tfidf = self.vectorizer.fit_transform([text1, text2])
86
  return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
87
  except:
 
90
  def hamming_distance(self, h1, h2):
91
  return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float('inf')
92
 
93
+ def store_invoice(self, file_bytes, filename):
 
 
 
 
94
  file_hash = self.calculate_file_hash(file_bytes)
95
  conn = sqlite3.connect(self.db_path)
96
  cursor = conn.cursor()
 
106
  else:
107
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
108
  except Exception as e:
 
109
  return False, f"Error processing file: {str(e)}"
110
 
111
  image_hash = self.calculate_image_hash(image)
 
120
  conn.close()
121
  return True, "Stored successfully."
122
 
123
+ def find_duplicates(self, file_bytes, filename, threshold=0.8):
 
 
 
 
 
124
  ext = filename.lower().split('.')[-1]
 
125
  try:
126
  if ext == 'pdf':
127
  image = self.pdf_to_image(file_bytes)
128
  else:
129
  image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
130
  except Exception as e:
131
+ return False, f"Failed to process file: {str(e)}"
132
 
133
  image_hash = self.calculate_image_hash(image)
134
  extracted_text = self.extract_text_from_image(image)
 
140
  conn.close()
141
 
142
  results = []
 
 
143
  for inv in invoices:
144
  iid, fname, stored_hash, stored_text, blob = inv
145
  stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
 
147
  text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
148
  img_similarity = self.calculate_image_similarity(image, stored_image)
149
  combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
 
150
  if combined >= threshold:
151
  results.append((fname, combined))
152
+ results.sort(key=lambda x: x[1], reverse=True)
153
+ return True, results
 
 
 
 
 
 
 
154
 
 
155
 
 
156
  detector = InvoiceDuplicateDetector()
157
 
158
+ def upload_files(files):
 
159
  if not files:
160
  return "No files uploaded."
 
161
  results = []
162
  for file in files:
163
+ try:
164
+ with open(file.path, "rb") as f:
165
+ file_bytes = f.read()
166
+ filename = file.name
167
+ success, message = detector.store_invoice(file_bytes, filename)
168
+ results.append(f"{filename}: {message}")
169
+ except Exception as e:
170
+ results.append(f"{getattr(file, 'name', 'unknown')}: File read error: {str(e)}")
171
  return "\n".join(results)
172
 
173
+ def check_duplicates(file):
174
+ try:
175
+ with open(file.path, "rb") as f:
176
+ file_bytes = f.read()
177
+ filename = file.name
178
+ ok, result = detector.find_duplicates(file_bytes, filename)
179
+ if not ok:
180
+ return result
181
+ if not result:
182
+ return "✅ No duplicates found!"
183
+ return "\n".join([f"🔁 {fname} — Similarity: {score:.2f}" for fname, score in result])
184
+ except Exception as e:
185
+ return f"File read error: {str(e)}"
186
+
187
+ with gr.Blocks(theme=gr.themes.Base()) as demo:
188
+ gr.Markdown("## 📄 Invoice Duplicate Detector")
189
+
190
+ with gr.Row():
191
+ with gr.Column():
192
+ upload_input = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"], file_count="multiple", label="Upload Invoices")
193
+ upload_btn = gr.Button("Upload")
194
+ upload_output = gr.Textbox(label="Upload Result")
195
+ with gr.Column():
196
+ check_input = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"], label="Check for Duplicate")
197
+ check_btn = gr.Button("Check")
198
+ check_output = gr.Textbox(label="Check Result")
199
+
200
+ upload_btn.click(upload_files, inputs=upload_input, outputs=upload_output)
201
+ check_btn.click(check_duplicates, inputs=check_input, outputs=check_output)
202
+
203
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
 
 
205
 
206