File size: 7,939 Bytes
3433c14
 
 
d11e909
530cc2d
3433c14
 
 
 
 
 
 
7ac479f
3433c14
 
 
 
 
 
 
 
 
 
7ac479f
 
 
 
 
 
 
 
 
 
 
3433c14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530cc2d
3433c14
 
 
 
 
 
 
 
530cc2d
3433c14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ac479f
 
 
 
3433c14
 
 
 
530cc2d
3433c14
 
 
 
 
 
 
530cc2d
3433c14
 
 
 
 
 
7ac479f
3433c14
 
 
 
7ac479f
 
3433c14
 
 
 
 
 
 
530cc2d
 
3433c14
 
 
530cc2d
2eaa60d
069c35c
7ac479f
3433c14
530cc2d
34820ab
 
 
530cc2d
 
 
 
7ac479f
 
530cc2d
 
34820ab
 
 
530cc2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d11e909
1d80794
3433c14
6c04feb
34820ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
import sqlite3
import hashlib
import numpy as np
import cv2
from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
from io import BytesIO
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

class InvoiceDuplicateDetector:
    """Detect duplicate invoices stored in a local SQLite database.

    Duplicate scoring combines three signals in find_duplicates():
      * an 8x8 average-hash of the invoice image (perceptual hash),
      * TF-IDF cosine similarity of the OCR-extracted text,
      * histogram correlation of the preprocessed (binarized) images.
    A byte-level MD5 of the uploaded file short-circuits storage of exactly
    identical files (MD5 is used only as a dedup key, not for security).
    """

    def __init__(self, db_path="invoices.db"):
        # Path to the SQLite file; the table is created up front if missing.
        self.db_path = db_path
        self.init_database()
        # NOTE: this vectorizer is re-fit on every comparison pair in
        # calculate_text_similarity, so no fitted state is carried between calls.
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

    def init_database(self):
        """Create the invoices table if it does not already exist."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS invoices (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT NOT NULL,
                    file_hash TEXT UNIQUE,
                    image_hash TEXT,
                    extracted_text TEXT,
                    upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    image_data BLOB
                )
            ''')
            conn.commit()
        finally:
            # Fix: close the connection even if table creation raises.
            conn.close()

    def calculate_file_hash(self, file_bytes):
        """Return the MD5 hex digest of the raw file bytes (dedup key only)."""
        return hashlib.md5(file_bytes).hexdigest()

    def calculate_image_hash(self, image):
        """Return a 64-character '0'/'1' average-hash string for the image."""
        resized = cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA)
        gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        avg = gray.mean()
        # One bit per pixel: 1 if the pixel is brighter than the mean.
        binary = (gray > avg).astype(int)
        return ''.join(str(b) for b in binary.flatten())

    def pdf_to_image(self, file_bytes):
        """Render only the first page of a PDF into a numpy array."""
        images = convert_from_bytes(file_bytes, first_page=1, last_page=1)
        return np.array(images[0])

    def extract_text_from_image(self, image):
        """OCR the image with Tesseract and return the stripped text."""
        return pytesseract.image_to_string(Image.fromarray(image)).strip()

    def image_to_blob(self, image):
        """Encode a numpy image as PNG bytes for BLOB storage."""
        buffer = BytesIO()
        Image.fromarray(image).save(buffer, format='PNG')
        return buffer.getvalue()

    def blob_to_image(self, blob):
        """Decode PNG bytes from the database back into a PIL image."""
        return Image.open(BytesIO(blob))

    def preprocess_image(self, image):
        """Grayscale, blur, and adaptively threshold an image for comparison."""
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)

    def calculate_image_similarity(self, img1, img2):
        """Histogram correlation of the two preprocessed images; 0 on failure."""
        try:
            proc_img1 = self.preprocess_image(img1)
            proc_img2 = self.preprocess_image(img2)
            # Resize both to the common minimum so histograms are comparable.
            h = min(proc_img1.shape[0], proc_img2.shape[0])
            w = min(proc_img1.shape[1], proc_img2.shape[1])
            proc_img1 = cv2.resize(proc_img1, (w, h))
            proc_img2 = cv2.resize(proc_img2, (w, h))
            hist1 = cv2.calcHist([proc_img1], [0], None, [256], [0, 256])
            hist2 = cv2.calcHist([proc_img2], [0], None, [256], [0, 256])
            return cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
        except Exception:
            # Fix: narrowed the original bare `except:` (it also swallowed
            # SystemExit/KeyboardInterrupt); comparison failures score 0.
            return 0

    def calculate_text_similarity(self, text1, text2):
        """TF-IDF cosine similarity of two OCR texts; 0 when either is empty."""
        try:
            if not text1.strip() or not text2.strip():
                return 0
            tfidf = self.vectorizer.fit_transform([text1, text2])
            return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        except Exception:
            # Fix: narrowed the original bare `except:`; e.g. texts made up
            # entirely of stop words raise inside fit_transform.
            return 0

    def hamming_distance(self, h1, h2):
        """Count of differing positions between two equal-length hash strings.

        Returns float('inf') when the lengths differ, so a malformed stored
        hash can never be mistaken for a near-match.
        """
        if len(h1) != len(h2):
            return float('inf')
        return sum(c1 != c2 for c1, c2 in zip(h1, h2))

    def store_invoice(self, file_bytes, filename):
        """Persist a new invoice; skip byte-identical files.

        Returns a (success, message) tuple. The connection is always closed —
        the original leaked it when image conversion/OCR raised.
        """
        file_hash = self.calculate_file_hash(file_bytes)
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM invoices WHERE file_hash=?", (file_hash,))
            if cursor.fetchone():
                return False, "Duplicate file. Skipped."

            # Decode by extension: PDFs are rasterized, everything else is
            # treated as an image file.
            ext = filename.lower().split('.')[-1]
            try:
                if ext == 'pdf':
                    image = self.pdf_to_image(file_bytes)
                else:
                    image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
            except Exception as e:
                return False, f"Error processing file: {str(e)}"

            image_hash = self.calculate_image_hash(image)
            text = self.extract_text_from_image(image)
            blob = self.image_to_blob(image)

            cursor.execute('''
                INSERT INTO invoices (filename, file_hash, image_hash, extracted_text, image_data)
                VALUES (?, ?, ?, ?, ?)
            ''', (filename, file_hash, image_hash, text, blob))
            conn.commit()
            return True, "Stored successfully."
        finally:
            conn.close()

    def find_duplicates(self, file_bytes, filename, threshold=0.8):
        """Score the uploaded file against every stored invoice.

        Returns (True, [(filename, score), ...]) sorted by score descending,
        or (False, error message) if the upload cannot be decoded.
        """
        ext = filename.lower().split('.')[-1]
        try:
            if ext == 'pdf':
                image = self.pdf_to_image(file_bytes)
            else:
                image = np.array(Image.open(BytesIO(file_bytes)).convert('RGB'))
        except Exception as e:
            return False, f"Failed to process file: {str(e)}"

        image_hash = self.calculate_image_hash(image)
        extracted_text = self.extract_text_from_image(image)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id, filename, image_hash, extracted_text, image_data FROM invoices")
            invoices = cursor.fetchall()
        finally:
            # Fix: close the connection even if the SELECT raises.
            conn.close()

        results = []
        for iid, fname, stored_hash, stored_text, blob in invoices:
            stored_image = np.array(self.blob_to_image(blob).convert('RGB'))
            # Hash similarity is 1.0 for identical hashes; a length mismatch
            # yields -inf, which can never cross the threshold.
            hash_similarity = 1 - (self.hamming_distance(image_hash, stored_hash) / len(image_hash))
            text_similarity = self.calculate_text_similarity(extracted_text, stored_text)
            img_similarity = self.calculate_image_similarity(image, stored_image)
            # Weighted blend: perceptual hash and OCR text dominate.
            combined = 0.4 * hash_similarity + 0.4 * text_similarity + 0.2 * img_similarity
            if combined >= threshold:
                results.append((fname, combined))
        results.sort(key=lambda x: x[1], reverse=True)
        return True, results

detector = InvoiceDuplicateDetector()

def upload_files(files):
    """Store each uploaded file in the invoice database.

    Args:
        files: list of Gradio file objects (each with a `.name` path), or a
            falsy value when nothing was uploaded.

    Returns:
        One status line per file, joined by newlines, each prefixed with the
        file's base name.
    """
    if not files:
        return "No files uploaded."
    results = []
    for file in files:
        try:
            with open(file.name, "rb") as f:
                file_bytes = f.read()
            filename = os.path.basename(file.name)
            success, message = detector.store_invoice(file_bytes, filename)
            # Bug fix: the original emitted the literal placeholder
            # "(unknown)" here instead of the actual filename.
            results.append(f"{filename}: {message}")
        except Exception as e:
            results.append(f"{getattr(file, 'name', 'unknown')}: File read error: {str(e)}")
    return "\n".join(results)

def check_duplicates(file):
    """Compare one uploaded file against stored invoices and report matches.

    Returns the backend's error message when processing fails, a success
    banner when nothing matches, or one line per match otherwise.
    """
    try:
        with open(file.name, "rb") as fh:
            payload = fh.read()
        base_name = os.path.basename(file.name)
        ok, result = detector.find_duplicates(payload, base_name)
        if not ok:
            # `result` carries the backend's error message in this case.
            return result
        if not result:
            return "✅ No duplicates found!"
        lines = []
        for fname, score in result:
            lines.append(f"🔁 {fname} — Similarity: {score:.2f}")
        return "\n".join(lines)
    except Exception as e:
        return f"File read error: {str(e)}"

# --- Gradio front-end -------------------------------------------------------
# Two side-by-side panels: bulk upload on the left, single-file duplicate
# check on the right.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("## 📄 Invoice Duplicate Detector")

    with gr.Row():
        with gr.Column():
            files_to_store = gr.File(
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                file_count="multiple",
                label="Upload Invoices",
            )
            store_button = gr.Button("Upload")
            store_result = gr.Textbox(label="Upload Result")
        with gr.Column():
            file_to_check = gr.File(
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                label="Check for Duplicate",
            )
            check_button = gr.Button("Check")
            check_result = gr.Textbox(label="Check Result")

    # Wire each button to its callback.
    store_button.click(upload_files, inputs=files_to_store, outputs=store_result)
    check_button.click(check_duplicates, inputs=file_to_check, outputs=check_result)

demo.launch()