Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,140 +1,158 @@
|
|
| 1 |
-
from fastapi import FastAPI, HTTPException, File, UploadFile
|
| 2 |
-
from paddleocr import PaddleOCR
|
| 3 |
-
import fitz
|
| 4 |
-
import tempfile
|
| 5 |
-
import logging
|
| 6 |
import os
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
-
import
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
# Initialize PaddleOCR lazily to reduce startup memory usage
|
| 17 |
-
ocr = None
|
| 18 |
-
def get_ocr():
|
| 19 |
-
global ocr
|
| 20 |
-
if ocr is None:
|
| 21 |
-
logger.info("Initializing PaddleOCR")
|
| 22 |
-
ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=False) # Disable angle classification and GPU
|
| 23 |
-
return ocr
|
| 24 |
-
|
| 25 |
-
# Hugging Face API configuration
|
| 26 |
-
HF_API_URL = "https://api-inference.huggingface.co/models/Abhisesh7/Invoice-Fraud-Detection"
|
| 27 |
-
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 28 |
-
if not HF_API_KEY:
|
| 29 |
-
logger.error("Hugging Face API key not set in environment variable HF_API_KEY")
|
| 30 |
-
raise RuntimeError("Hugging Face API key not set")
|
| 31 |
-
HEADERS = {"Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json"}
|
| 32 |
-
|
| 33 |
-
# Initialize SQLite database
|
| 34 |
-
conn = sqlite3.connect("invoices.db")
|
| 35 |
-
cursor = conn.cursor()
|
| 36 |
-
cursor.execute("""
|
| 37 |
-
CREATE TABLE IF NOT EXISTS invoices (
|
| 38 |
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 39 |
-
vendor TEXT,
|
| 40 |
-
amount REAL,
|
| 41 |
-
date TEXT,
|
| 42 |
-
timestamp TEXT
|
| 43 |
-
)
|
| 44 |
-
""")
|
| 45 |
-
conn.commit()
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
try:
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
|
| 56 |
-
content = await file.read()
|
| 57 |
-
temp_file.write(content)
|
| 58 |
-
temp_file_path = temp_file.name
|
| 59 |
-
|
| 60 |
-
# Extract text using PaddleOCR
|
| 61 |
-
ocr_instance = get_ocr()
|
| 62 |
-
extracted_text = ""
|
| 63 |
-
if temp_file_path.lower().endswith('.pdf'):
|
| 64 |
-
pdf_document = fitz.open(temp_file_path)
|
| 65 |
-
for page_num in range(pdf_document.page_count):
|
| 66 |
-
page = pdf_document.load_page(page_num)
|
| 67 |
-
pix = page.get_pixmap()
|
| 68 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_file:
|
| 69 |
-
pix.save(img_file.name)
|
| 70 |
-
result = ocr_instance.ocr(img_file.name, cls=False)
|
| 71 |
-
extracted_text += "\n".join([line[1][0] for line in result[0]]) + "\n"
|
| 72 |
-
pdf_document.close()
|
| 73 |
-
else:
|
| 74 |
-
result = ocr_instance.ocr(temp_file_path, cls=False)
|
| 75 |
-
extracted_text = "\n".join([line[1][0] for line in result[0]])
|
| 76 |
-
|
| 77 |
-
os.unlink(temp_file_path)
|
| 78 |
-
|
| 79 |
-
if not extracted_text.strip():
|
| 80 |
-
raise HTTPException(status_code=400, detail="No text extracted from file.")
|
| 81 |
-
|
| 82 |
-
# Call Hugging Face API
|
| 83 |
-
payload = {"text": extracted_text}
|
| 84 |
-
response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
|
| 85 |
-
|
| 86 |
-
if response.status_code != 200:
|
| 87 |
-
raise HTTPException(status_code=response.status_code, detail=f"Hugging Face API error: {response.text}")
|
| 88 |
-
|
| 89 |
-
result = response.json()
|
| 90 |
-
entities = result.get("entities", {})
|
| 91 |
-
fraud_score = result.get("fraud_score", 0.0) * 100
|
| 92 |
-
fraud_reasoning = result.get("fraud_reasoning", "")
|
| 93 |
-
flagged = result.get("flagged", False)
|
| 94 |
-
|
| 95 |
-
# Extract invoice metadata
|
| 96 |
-
vendor = entities.get("vendor", "Unknown")
|
| 97 |
-
amount = float(entities.get("amount", 0))
|
| 98 |
-
date_str = entities.get("date", "")
|
| 99 |
-
invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date().isoformat() if date_str else ""
|
| 100 |
-
|
| 101 |
-
# Check for duplicates
|
| 102 |
-
cursor.execute("""
|
| 103 |
-
SELECT id, timestamp FROM invoices
|
| 104 |
-
WHERE vendor = ? AND amount = ? AND date = ?
|
| 105 |
-
""", (vendor, amount, invoice_date))
|
| 106 |
-
duplicate = cursor.fetchone()
|
| 107 |
-
duplicate_info = ""
|
| 108 |
-
if duplicate:
|
| 109 |
-
duplicate_info = f"Possible duplicate of invoice processed at {duplicate[1]}"
|
| 110 |
-
fraud_reasoning += f" | {duplicate_info}"
|
| 111 |
-
flagged = True
|
| 112 |
-
|
| 113 |
-
# Store invoice metadata
|
| 114 |
-
timestamp = datetime.now().isoformat()
|
| 115 |
-
cursor.execute("""
|
| 116 |
-
INSERT INTO invoices (vendor, amount, date, timestamp)
|
| 117 |
-
VALUES (?, ?, ?, ?)
|
| 118 |
-
""", (vendor, amount, invoice_date, timestamp))
|
| 119 |
-
conn.commit()
|
| 120 |
-
|
| 121 |
-
return {
|
| 122 |
-
"extracted_text": extracted_text,
|
| 123 |
-
"vendor": vendor,
|
| 124 |
-
"amount": amount,
|
| 125 |
-
"date": invoice_date,
|
| 126 |
-
"fraud_score": fraud_score,
|
| 127 |
-
"fraud_reasoning": fraud_reasoning,
|
| 128 |
-
"flagged": flagged,
|
| 129 |
-
"duplicate_info": duplicate_info
|
| 130 |
-
}
|
| 131 |
except Exception as e:
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
if __name__ == "__main__":
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
|
| 3 |
+
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
logging.getLogger("transformers").setLevel(logging.ERROR)
|
| 7 |
+
|
| 8 |
+
import pdfplumber
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
from transformers import pipeline
|
| 12 |
+
from sklearn.ensemble import IsolationForest
|
| 13 |
+
from sklearn.preprocessing import StandardScaler
|
| 14 |
+
import json
|
| 15 |
+
import uuid
|
| 16 |
from datetime import datetime
|
| 17 |
+
import re
|
| 18 |
+
import gradio as gr
|
| 19 |
+
|
| 20 |
+
# Initialize Hugging Face NER pipeline (force CPU)
# device=-1 pins inference to CPU so the Space runs without a GPU.
# NOTE(review): this runs at import time, so startup blocks while the
# dslim/bert-base-NER weights load (and presumably download on first run).
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
def extract_text_from_pdf(pdf_file):
    """Pull the raw text out of every page of a PDF invoice.

    Returns the concatenated page text on success, or an in-band
    ``"Error extracting text: ..."`` string when the PDF cannot be
    opened or read (callers check for that prefix).
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Pages with no extractable text yield None; treat those as empty.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Report failure as a string rather than raising, matching callers.
        return f"Error extracting text: {str(e)}"
|
| 33 |
+
|
| 34 |
+
def extract_entities(text):
    """Extract the vendor name and invoice amount from invoice text via NER.

    The vendor is assembled from B-ORG/I-ORG tokens; the amount is taken
    from the first dollar-like figure in *text* whenever a token looks
    money-related. Returns ``("Unknown", 0.0)`` when nothing is found.
    """
    vendor_name, amount = "Unknown", 0.0
    org_tokens = ""

    for token in ner_pipeline(text):
        label = token["entity"]
        word = token["word"]
        if label.startswith("B-ORG"):
            org_tokens = word
        elif label.startswith("I-ORG") and org_tokens:
            org_tokens = f"{org_tokens} {word}"
        elif label in ("B-PER", "I-PER"):
            # Person-name tokens are skipped entirely (no amount check either).
            continue

        if "amount" in word.lower() or "$" in word:
            # NOTE(review): this searches the WHOLE text, so the first numeric
            # run (possibly a date or invoice number) wins — confirm this is
            # acceptable for the invoices actually seen.
            match = re.search(r"\$?[\d,]+\.?\d*", text)
            if match:
                amount = float(match.group().replace(",", "").replace("$", ""))

    if org_tokens:
        vendor_name = org_tokens
    return vendor_name, amount
|
| 58 |
+
|
| 59 |
+
def detect_anomalies(df):
    """Flag anomalous invoices in *df* with an Isolation Forest.

    Adds an ``is_anomaly`` column (-1 = anomaly, 1 = normal) computed from
    the standardized ``amount`` feature and returns the (mutated) frame.
    """
    feature_cols = ["amount"]
    standardized = StandardScaler().fit_transform(df[feature_cols])
    # contamination=0.05 expects roughly 5% outliers; the fixed seed keeps
    # predictions reproducible across runs.
    forest = IsolationForest(contamination=0.05, random_state=42)
    df["is_anomaly"] = forest.fit_predict(standardized)
    return df
|
| 67 |
+
|
| 68 |
+
def calculate_fraud_score(amount, is_anomaly, items_listed):
    """Score fraud risk from the amount, anomaly flag, and item text.

    Returns a ``(score, reasoning)`` pair: a 0-100 score and a '; '-joined
    string of the heuristic rules that fired.
    """
    # Each rule: (fired?, points, human-readable reason).
    # amount < 10 implies amount <= 5000, so the two amount rules are
    # mutually exclusive (mirrors the original if/elif).
    rules = (
        (amount > 5000, 40, "High invoice amount detected."),
        (amount < 10, 20, "Unusually low invoice amount."),
        (is_anomaly == -1, 30, "Invoice flagged as an anomaly."),  # IsolationForest marks anomalies -1
        (len(items_listed.split()) > 100, 10, "Excessive number of items listed."),
    )

    total = 0.0
    reasons = []
    for fired, points, message in rules:
        if fired:
            total += points
            reasons.append(message)

    return min(total, 100), "; ".join(reasons)
|
| 89 |
+
|
| 90 |
+
def process_invoice(pdf_file):
    """Process a single invoice PDF and return pretty-printed JSON results.

    Pipeline: text extraction -> NER entity extraction -> anomaly detection
    -> rule-based fraud scoring. As a side effect the result is written to
    ``fraud_detection_results.json`` (overwriting any previous run's file).

    Returns a JSON string on success, or a ``{"error": ...}`` dict when
    text extraction fails.
    """
    text = extract_text_from_pdf(pdf_file)
    # BUG FIX: extract_text_from_pdf signals failure with exactly this prefix;
    # the previous substring test (`"Error" in text`) misclassified any
    # legitimate invoice whose text happened to contain the word "Error".
    if text.startswith("Error extracting text:"):
        return {"error": text}

    vendor_name, amount = extract_entities(text)
    # NOTE(review): this is the processing date, not a date parsed from the
    # document — confirm that is intended.
    invoice_date = datetime.now().date()
    items_listed = text[:500]  # first 500 chars stand in for the line items

    data = {
        "invoice_id": str(uuid.uuid4()),
        "vendor_name": vendor_name,
        "amount": amount,
        "invoice_date": invoice_date,
        "items_listed": items_listed,
    }
    df = pd.DataFrame([data])

    # Adds the is_anomaly column (-1 = anomaly, 1 = normal).
    df = detect_anomalies(df)

    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0], df["is_anomaly"].iloc[0], items_listed
    )

    flagged = fraud_score > 50  # single threshold drives both flag and status
    output = {
        "Invoice_Record__c": {
            "Vendor_Name__c": vendor_name,
            "Invoice_Amount__c": amount,
            "Invoice_Date__c": str(invoice_date),
            "Items_Listed__c": items_listed,
            "Fraud_Score__c": fraud_score,
            "Fraud_Reasoning__c": fraud_reasoning,
            "Flagged__c": flagged,
            "Reviewed_By__c": None,
            "Status__c": "Flagged" if flagged else "Cleared",
        },
        "Entities": {
            "Vendor": vendor_name,
            "Amount": amount,
        },
        "Anomalies": "Anomaly detected" if df["is_anomaly"].iloc[0] == -1 else "No anomalies",
    }

    # Save to JSON file (list-wrapped to keep the on-disk format stable).
    output_file = "fraud_detection_results.json"
    with open(output_file, "w") as f:
        json.dump([output], f, indent=2)

    return json.dumps(output, indent=2)
|
| 140 |
+
|
| 141 |
+
def gradio_interface(pdf_file):
    """Gradio handler: validate the upload, then run fraud detection on it."""
    if pdf_file is None:
        # Nothing uploaded yet — show a prompt instead of failing downstream.
        return "Please upload a PDF file."
    return process_invoice(pdf_file)
|
| 147 |
+
|
| 148 |
+
# Create Gradio interface
# Single-file input, JSON output; gradio_interface handles the no-upload case.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload Invoice PDF"),
    outputs=gr.JSON(label="Fraud Detection Results"),
    title="Invoice Fraud Detection",
    description="Upload a PDF invoice to detect potential fraud."
)

if __name__ == "__main__":
    # Launch the web UI only when run as a script (not when imported).
    iface.launch()
|