SetuG commited on
Commit
3433c14
·
verified ·
1 Parent(s): b7a5e71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -174
app.py CHANGED
@@ -1,174 +1,190 @@
1
- import os
2
- import sqlite3
3
- import hashlib
4
- import cv2
5
- import numpy as np
6
- from PIL import Image
7
- import pytesseract
8
- import gradio as gr
9
- from io import BytesIO
10
- from datetime import datetime
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
13
-
14
- DB_PATH = "/tmp/invoices.db"
15
-
16
- class InvoiceDuplicateDetector:
17
- def __init__(self, db_path=DB_PATH):
18
- self.db_path = db_path
19
- self.init_database()
20
- self.vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
21
-
22
- def init_database(self):
23
- conn = sqlite3.connect(self.db_path)
24
- cursor = conn.cursor()
25
- cursor.execute('''
26
- CREATE TABLE IF NOT EXISTS invoices (
27
- id INTEGER PRIMARY KEY AUTOINCREMENT,
28
- filename TEXT NOT NULL,
29
- file_hash TEXT UNIQUE,
30
- image_hash TEXT,
31
- extracted_text TEXT,
32
- upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
33
- image_data BLOB
34
- )
35
- ''')
36
- conn.commit()
37
- conn.close()
38
-
39
- def calculate_file_hash(self, file_bytes):
40
- return hashlib.md5(file_bytes).hexdigest()
41
-
42
- def calculate_image_hash(self, image):
43
- resized = cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA)
44
- gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
45
- avg = gray.mean()
46
- binary = (gray > avg).astype(int)
47
- return ''.join(str(b) for b in binary.flatten())
48
-
49
- def extract_text_from_image(self, image):
50
- return pytesseract.image_to_string(Image.fromarray(image)).strip()
51
-
52
- def image_to_blob(self, image):
53
- buffer = BytesIO()
54
- Image.fromarray(image).save(buffer, format="PNG")
55
- return buffer.getvalue()
56
-
57
- def blob_to_image(self, blob):
58
- return Image.open(BytesIO(blob))
59
-
60
- def preprocess_image(self, image):
61
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
62
- blurred = cv2.GaussianBlur(gray, (5, 5), 0)
63
- return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
64
- cv2.THRESH_BINARY, 11, 2)
65
-
66
- def calculate_image_similarity(self, img1, img2):
67
- try:
68
- proc_img1 = self.preprocess_image(img1)
69
- proc_img2 = self.preprocess_image(img2)
70
- h, w = min(proc_img1.shape[0], proc_img2.shape[0]), min(proc_img1.shape[1], proc_img2.shape[1])
71
- proc_img1 = cv2.resize(proc_img1, (w, h))
72
- proc_img2 = cv2.resize(proc_img2, (w, h))
73
- hist1 = cv2.calcHist([proc_img1], [0], None, [256], [0, 256])
74
- hist2 = cv2.calcHist([proc_img2], [0], None, [256], [0, 256])
75
- return cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
76
- except:
77
- return 0
78
-
79
- def calculate_text_similarity(self, text1, text2):
80
- try:
81
- if not text1.strip() or not text2.strip():
82
- return 0
83
- tfidf = self.vectorizer.fit_transform([text1, text2])
84
- return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
85
- except:
86
- return 0
87
-
88
- def hamming_distance(self, h1, h2):
89
- return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float("inf")
90
-
91
- def store_invoice(self, file_bytes, filename):
92
- file_hash = self.calculate_file_hash(file_bytes)
93
- conn = sqlite3.connect(self.db_path)
94
- cursor = conn.cursor()
95
- cursor.execute("SELECT id FROM invoices WHERE file_hash=?", (file_hash,))
96
- if cursor.fetchone():
97
- conn.close()
98
- return "⚠️ Duplicate file. Skipped."
99
-
100
- ext = filename.lower().split(".")[-1]
101
- if ext == "pdf":
102
- return "PDF not supported on Gradio version."
103
-
104
- image = np.array(Image.open(BytesIO(file_bytes)).convert("RGB"))
105
- image_hash = self.calculate_image_hash(image)
106
- text = self.extract_text_from_image(image)
107
- blob = self.image_to_blob(image)
108
-
109
- cursor.execute('''
110
- INSERT INTO invoices (filename, file_hash, image_hash, extracted_text, image_data)
111
- VALUES (?, ?, ?, ?, ?)
112
- ''', (filename, file_hash, image_hash, text, blob))
113
- conn.commit()
114
- conn.close()
115
- return "✅ Invoice stored successfully."
116
-
117
- def find_duplicates(self, file_bytes, filename, threshold=0.8):
118
- ext = filename.lower().split(".")[-1]
119
- if ext == "pdf":
120
- return "PDF not supported in this version.", None
121
-
122
- image = np.array(Image.open(BytesIO(file_bytes)).convert("RGB"))
123
- image_hash = self.calculate_image_hash(image)
124
- extracted_text = self.extract_text_from_image(image)
125
-
126
- conn = sqlite3.connect(self.db_path)
127
- cursor = conn.cursor()
128
- cursor.execute("SELECT filename, image_hash, extracted_text, image_data FROM invoices")
129
- invoices = cursor.fetchall()
130
- conn.close()
131
-
132
- results = []
133
- for fname, stored_hash, stored_text, blob in invoices:
134
- stored_image = np.array(self.blob_to_image(blob).convert("RGB"))
135
- hash_similarity = 1 - (self.hamming_distance(image_hash, stored_hash) / len(image_hash))
136
- text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
137
- img_similarity = self.calculate_image_similarity(image, stored_image)
138
- combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
139
- if combined >= threshold:
140
- results.append((fname, combined, stored_image))
141
- results.sort(key=lambda x: x[1], reverse=True)
142
-
143
- if not results:
144
- return "✅ No duplicates found.", None
145
- else:
146
- output = "⚠️ Duplicates Found:\n"
147
- for fname, score, _ in results:
148
- output += f"• {fname} — Similarity: {score:.2f}\n"
149
- return output, [Image.fromarray(img) for _, _, img in results]
150
-
151
- detector = InvoiceDuplicateDetector()
152
-
153
- def upload_invoice(file):
154
- return detector.store_invoice(file.read(), file.name)
155
-
156
- def check_duplicates(file):
157
- result, images = detector.find_duplicates(file.read(), file.name)
158
- return result, images or None
159
-
160
- upload_interface = gr.Interface(
161
- fn=upload_invoice,
162
- inputs=gr.File(type="binary", label="Upload Invoice (PNG/JPG)"),
163
- outputs="text",
164
- title="Upload & Store Invoice"
165
- )
166
-
167
- check_interface = gr.Interface(
168
- fn=check_duplicates,
169
- inputs=gr.File(type="binary", label="Check Invoice for Duplicates"),
170
- outputs=["text", gr.Gallery(label="Matching Invoices")],
171
- title="Check for Duplicates"
172
- )
173
-
174
- gr.TabbedInterface([upload_interface, check_interface], ["Upload Invoice", "Check Duplicate"]).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py (Gradio version, Hugging Face-compatible)
2
+ import os
3
+ import sqlite3
4
+ import hashlib
5
+ import numpy as np
6
+ import gradio as gr
7
+ from PIL import Image
8
+ import pytesseract
9
+ from pdf2image import convert_from_bytes
10
+ from io import BytesIO
11
+ from datetime import datetime
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import cv2
15
+
16
# --- Class for Duplicate Detection ---
class InvoiceDuplicateDetector:
    """Detect duplicate invoices stored in a local SQLite database.

    Three complementary signals are combined per comparison:
      * an 8x8 average perceptual hash of the page image,
      * TF-IDF cosine similarity of the OCR'd text,
      * histogram correlation of the preprocessed images.
    """

    def __init__(self, db_path="invoices.db"):
        self.db_path = db_path
        self.init_database()
        # Re-fit on each pair inside calculate_text_similarity; kept here so
        # the vectorizer configuration is defined in one place.
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

    def init_database(self):
        """Create the invoices table if it does not already exist."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''CREATE TABLE IF NOT EXISTS invoices (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT NOT NULL,
                file_hash TEXT UNIQUE,
                image_hash TEXT,
                extracted_text TEXT,
                upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                image_data BLOB
            )''')
            conn.commit()
        finally:
            # Close even if table creation fails (original leaked the handle).
            conn.close()

    def calculate_file_hash(self, file_bytes):
        """Return the MD5 hex digest of the raw upload (dedup key, not security)."""
        return hashlib.md5(file_bytes).hexdigest()

    def calculate_image_hash(self, image):
        """Return a 64-char average-hash bit string for an (H, W, 3) uint8 array."""
        resized = cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA)
        # NOTE(review): inputs come from PIL as RGB, so BGR2GRAY weights the
        # channels in reversed order. Harmless for matching because every
        # stored and query image goes through this same path.
        gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        avg = gray.mean()
        binary = (gray > avg).astype(int)
        return ''.join(str(b) for b in binary.flatten())

    def pdf_to_image(self, file_bytes):
        """Render page 1 of a PDF to a numpy array (requires poppler)."""
        images = convert_from_bytes(file_bytes, first_page=1, last_page=1)
        return np.array(images[0])

    def extract_text_from_image(self, image):
        """OCR the image with Tesseract and return the stripped text."""
        return pytesseract.image_to_string(Image.fromarray(image)).strip()

    def image_to_blob(self, image):
        """Encode a numpy image as PNG bytes for BLOB storage."""
        buffer = BytesIO()
        Image.fromarray(image).save(buffer, format='PNG')
        return buffer.getvalue()

    def blob_to_image(self, blob):
        """Decode PNG bytes stored in the DB back into a PIL image."""
        return Image.open(BytesIO(blob))

    def preprocess_image(self, image):
        """Grayscale + blur + adaptive threshold to normalise lighting/contrast."""
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)

    def calculate_image_similarity(self, img1, img2):
        """Histogram correlation of the two preprocessed images; 0 on any failure."""
        try:
            proc_img1 = self.preprocess_image(img1)
            proc_img2 = self.preprocess_image(img2)
            # Resize both to the common minimum size before comparing.
            h = min(proc_img1.shape[0], proc_img2.shape[0])
            w = min(proc_img1.shape[1], proc_img2.shape[1])
            proc_img1 = cv2.resize(proc_img1, (w, h))
            proc_img2 = cv2.resize(proc_img2, (w, h))
            hist1 = cv2.calcHist([proc_img1], [0], None, [256], [0, 256])
            hist2 = cv2.calcHist([proc_img2], [0], None, [256], [0, 256])
            return cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
        except Exception:
            # Narrowed from a bare except: still best-effort, but no longer
            # swallows KeyboardInterrupt/SystemExit.
            return 0

    def calculate_text_similarity(self, text1, text2):
        """TF-IDF cosine similarity of the two OCR texts; 0 if either is empty."""
        try:
            if not text1.strip() or not text2.strip():
                return 0
            tfidf = self.vectorizer.fit_transform([text1, text2])
            return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        except Exception:
            # Narrowed from a bare except; OCR noise can still break fitting.
            return 0

    def hamming_distance(self, h1, h2):
        """Bit differences between equal-length hash strings; inf on length mismatch."""
        return sum(c1 != c2 for c1, c2 in zip(h1, h2)) if len(h1) == len(h2) else float('inf')

    def store_invoice(self, file_bytes, filename):
        """Persist one invoice; return (ok, message).

        Skips exact re-uploads (same MD5) and rejects unreadable files.
        """
        file_hash = self.calculate_file_hash(file_bytes)
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM invoices WHERE file_hash=?", (file_hash,))
            if cursor.fetchone():
                return False, "Duplicate file. Skipped."

            ext = filename.lower().rsplit('.', 1)[-1]
            try:
                if ext == 'pdf':
                    image = self.pdf_to_image(file_bytes)
                else:
                    image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
            except Exception as e:
                # The original leaked the open connection on this early return;
                # the enclosing finally now closes it on every exit path.
                return False, f"Error processing file: {str(e)}"

            image_hash = self.calculate_image_hash(image)
            text = self.extract_text_from_image(image)
            blob = self.image_to_blob(image)

            cursor.execute('''INSERT INTO invoices (filename, file_hash, image_hash, extracted_text, image_data)
                              VALUES (?, ?, ?, ?, ?)''',
                           (filename, file_hash, image_hash, text, blob))
            conn.commit()
            return True, "Stored successfully."
        finally:
            conn.close()

    def find_duplicates(self, file_bytes, filename, threshold=0.8):
        """Score the upload against every stored invoice.

        Returns (True, [(filename, score), ...]) sorted best-first, or
        (False, error_message) when the file cannot be processed.
        """
        ext = filename.lower().rsplit('.', 1)[-1]
        try:
            if ext == 'pdf':
                image = self.pdf_to_image(file_bytes)
            else:
                image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
        except Exception as e:
            return False, f"Failed to process file: {str(e)}"

        image_hash = self.calculate_image_hash(image)
        extracted_text = self.extract_text_from_image(image)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT filename, image_hash, extracted_text, image_data FROM invoices")
            invoices = cursor.fetchall()
        finally:
            conn.close()

        results = []
        for fname, stored_hash, stored_text, blob in invoices:
            stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
            # Weighted blend: 40% perceptual hash, 40% OCR text, 20% histogram.
            # A length-mismatched hash yields -inf similarity and is filtered out.
            hash_similarity = 1 - (self.hamming_distance(image_hash, stored_hash) / len(image_hash))
            text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
            img_similarity = self.calculate_image_similarity(image, stored_image)
            combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
            if combined >= threshold:
                results.append((fname, combined))
        results.sort(key=lambda x: x[1], reverse=True)
        return True, results
151
+
152
+
153
# --- Gradio UI ---
# Single shared detector instance; its SQLite file ("invoices.db") persists
# across requests within one process.
detector = InvoiceDuplicateDetector()
155
+
156
def upload_files(files):
    """Store each uploaded invoice and return a per-file status report.

    Handles both delivery modes of gr.File: a filepath string (modern Gradio,
    default type="filepath") and a file-like object with .read()/.name (older
    versions). The original assumed a file object and raised AttributeError
    on a str path. Also tolerates an empty selection (files is None).
    """
    messages = []
    for file in files or []:
        if isinstance(file, (str, os.PathLike)):
            name = os.path.basename(str(file))
            with open(file, "rb") as fh:
                file_bytes = fh.read()
        else:
            name = os.path.basename(file.name)
            file_bytes = file.read()
        success, msg = detector.store_invoice(file_bytes, name)
        messages.append(f"{name}: {msg}")
    return "\n".join(messages)
163
+
164
def check_file(file):
    """Compare one uploaded invoice against the stored ones and report matches.

    Handles both delivery modes of gr.File: a filepath string (modern Gradio,
    default type="filepath") and a file-like object with .read()/.name (older
    versions). The original assumed a file object and raised AttributeError
    on a str path. Returns a human-readable result string.
    """
    if file is None:
        return "Please upload a file first."
    if isinstance(file, (str, os.PathLike)):
        name = os.path.basename(str(file))
        with open(file, "rb") as fh:
            file_bytes = fh.read()
    else:
        name = os.path.basename(file.name)
        file_bytes = file.read()
    ok, result = detector.find_duplicates(file_bytes, name)
    if not ok:
        # result is the error message from find_duplicates.
        return result
    elif not result:
        return " No duplicates found!"
    else:
        return "⚠️ Possible duplicates:\n" + "\n".join([f"{fname} (score: {score:.2f})" for fname, score in result])
173
+
174
# Two-section single-page UI: batch upload on top, duplicate check below.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Invoice Duplicate Detector")
    gr.Markdown("### Upload Invoices")
    # Multiple files allowed; upload_files stores each and reports per-file status.
    upload = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"], file_count="multiple")
    out1 = gr.Textbox(label="Upload Result")
    btn1 = gr.Button("Upload")
    btn1.click(upload_files, inputs=upload, outputs=out1)

    gr.Markdown("### Check for Duplicates")
    # Single file; check_file compares it against everything already stored.
    check = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg"])
    out2 = gr.Textbox(label="Duplicate Check Result")
    btn2 = gr.Button("Check")
    btn2.click(check_file, inputs=check, outputs=out2)

# Launch only when run as a script (Hugging Face Spaces executes app.py directly).
if __name__ == '__main__':
    demo.launch()