Abhisesh7 committed on
Commit
4386fe9
·
verified ·
1 Parent(s): 8826169

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -132
app.py CHANGED
@@ -1,140 +1,158 @@
1
- from fastapi import FastAPI, HTTPException, File, UploadFile
2
- from paddleocr import PaddleOCR
3
- import fitz
4
- import tempfile
5
- import logging
6
  import os
7
- import requests
8
- import sqlite3
 
 
 
 
 
 
 
 
 
 
 
 
9
  from datetime import datetime
10
- import uvicorn
11
-
12
- app = FastAPI()
13
- logging.basicConfig(level=logging.INFO)
14
- logger = logging.getLogger(__name__)
15
-
16
- # Initialize PaddleOCR lazily to reduce startup memory usage
17
- ocr = None
18
- def get_ocr():
19
- global ocr
20
- if ocr is None:
21
- logger.info("Initializing PaddleOCR")
22
- ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=False) # Disable angle classification and GPU
23
- return ocr
24
-
25
- # Hugging Face API configuration
26
- HF_API_URL = "https://api-inference.huggingface.co/models/Abhisesh7/Invoice-Fraud-Detection"
27
- HF_API_KEY = os.getenv("HF_API_KEY")
28
- if not HF_API_KEY:
29
- logger.error("Hugging Face API key not set in environment variable HF_API_KEY")
30
- raise RuntimeError("Hugging Face API key not set")
31
- HEADERS = {"Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json"}
32
-
33
- # Initialize SQLite database
34
- conn = sqlite3.connect("invoices.db")
35
- cursor = conn.cursor()
36
- cursor.execute("""
37
- CREATE TABLE IF NOT EXISTS invoices (
38
- id INTEGER PRIMARY KEY AUTOINCREMENT,
39
- vendor TEXT,
40
- amount REAL,
41
- date TEXT,
42
- timestamp TEXT
43
- )
44
- """)
45
- conn.commit()
46
 
47
- @app.post("/process_invoice/")
48
- async def process_invoice(file: UploadFile = File(...)):
49
  try:
50
- # Validate file type
51
- if not file.filename.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg')):
52
- raise HTTPException(status_code=400, detail="Invalid file type. Use PDF or image (PNG, JPG, JPEG).")
53
-
54
- # Save uploaded file
55
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
56
- content = await file.read()
57
- temp_file.write(content)
58
- temp_file_path = temp_file.name
59
-
60
- # Extract text using PaddleOCR
61
- ocr_instance = get_ocr()
62
- extracted_text = ""
63
- if temp_file_path.lower().endswith('.pdf'):
64
- pdf_document = fitz.open(temp_file_path)
65
- for page_num in range(pdf_document.page_count):
66
- page = pdf_document.load_page(page_num)
67
- pix = page.get_pixmap()
68
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_file:
69
- pix.save(img_file.name)
70
- result = ocr_instance.ocr(img_file.name, cls=False)
71
- extracted_text += "\n".join([line[1][0] for line in result[0]]) + "\n"
72
- pdf_document.close()
73
- else:
74
- result = ocr_instance.ocr(temp_file_path, cls=False)
75
- extracted_text = "\n".join([line[1][0] for line in result[0]])
76
-
77
- os.unlink(temp_file_path)
78
-
79
- if not extracted_text.strip():
80
- raise HTTPException(status_code=400, detail="No text extracted from file.")
81
-
82
- # Call Hugging Face API
83
- payload = {"text": extracted_text}
84
- response = requests.post(HF_API_URL, headers=HEADERS, json=payload)
85
-
86
- if response.status_code != 200:
87
- raise HTTPException(status_code=response.status_code, detail=f"Hugging Face API error: {response.text}")
88
-
89
- result = response.json()
90
- entities = result.get("entities", {})
91
- fraud_score = result.get("fraud_score", 0.0) * 100
92
- fraud_reasoning = result.get("fraud_reasoning", "")
93
- flagged = result.get("flagged", False)
94
-
95
- # Extract invoice metadata
96
- vendor = entities.get("vendor", "Unknown")
97
- amount = float(entities.get("amount", 0))
98
- date_str = entities.get("date", "")
99
- invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date().isoformat() if date_str else ""
100
-
101
- # Check for duplicates
102
- cursor.execute("""
103
- SELECT id, timestamp FROM invoices
104
- WHERE vendor = ? AND amount = ? AND date = ?
105
- """, (vendor, amount, invoice_date))
106
- duplicate = cursor.fetchone()
107
- duplicate_info = ""
108
- if duplicate:
109
- duplicate_info = f"Possible duplicate of invoice processed at {duplicate[1]}"
110
- fraud_reasoning += f" | {duplicate_info}"
111
- flagged = True
112
-
113
- # Store invoice metadata
114
- timestamp = datetime.now().isoformat()
115
- cursor.execute("""
116
- INSERT INTO invoices (vendor, amount, date, timestamp)
117
- VALUES (?, ?, ?, ?)
118
- """, (vendor, amount, invoice_date, timestamp))
119
- conn.commit()
120
-
121
- return {
122
- "extracted_text": extracted_text,
123
- "vendor": vendor,
124
- "amount": amount,
125
- "date": invoice_date,
126
- "fraud_score": fraud_score,
127
- "fraud_reasoning": fraud_reasoning,
128
- "flagged": flagged,
129
- "duplicate_info": duplicate_info
130
- }
131
  except Exception as e:
132
- logger.error(f"Error: {e}")
133
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- @app.on_event("shutdown")
136
- def shutdown_event():
137
- conn.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  if __name__ == "__main__":
140
- uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
 
 
 
 
 
 
1
  import os
2
+ os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
3
+ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
4
+
5
+ import logging
6
+ logging.getLogger("transformers").setLevel(logging.ERROR)
7
+
8
+ import pdfplumber
9
+ import pandas as pd
10
+ import numpy as np
11
+ from transformers import pipeline
12
+ from sklearn.ensemble import IsolationForest
13
+ from sklearn.preprocessing import StandardScaler
14
+ import json
15
+ import uuid
16
  from datetime import datetime
17
+ import re
18
+ import gradio as gr
19
+
20
+ # Initialize Hugging Face NER pipeline (force CPU)
21
+ ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF invoice.

    Args:
        pdf_file: Passed straight through to ``pdfplumber.open``.

    Returns:
        The text of all pages joined with newlines. On any failure the
        exception is not raised; instead a message starting with
        ``"Error extracting text: "`` is returned, which callers must check for.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Join with a newline so the last word of one page does not fuse
            # with the first word of the next. The `or ""` covers pages for
            # which extract_text() yields nothing.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Deliberate best-effort: the failure is reported in-band as a string.
        return f"Error extracting text: {str(e)}"
33
+
34
def _parse_amount(text):
    """Return the first monetary figure found in ``text``, or 0.0 if none."""
    patterns = (
        # A dollar-prefixed figure such as "$1,234.56" is the strongest signal.
        r"\$\s*(\d[\d,]*(?:\.\d+)?)",
        # Otherwise accept a figure shortly after the word "amount".
        r"amount\D{0,20}?(\d[\d,]*(?:\.\d+)?)",
    )
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            # Every pattern requires a leading digit, so float() cannot be
            # handed an empty string or a lone comma.
            return float(match.group(1).replace(",", ""))
    return 0.0


def extract_entities(text):
    """Extract the vendor name and invoice amount from invoice text.

    The vendor is assembled from the ORG-tagged tokens returned by the
    module-level ``ner_pipeline``; when several organisations are found the
    last one wins. The amount is parsed from the raw text with regular
    expressions, independently of the NER output.

    Args:
        text: Plain text of the invoice.

    Returns:
        A ``(vendor_name, amount)`` tuple. ``vendor_name`` is ``"Unknown"``
        when no organisation is found and ``amount`` is ``0.0`` when no
        monetary figure is found.
    """
    vendor_name = "Unknown"
    if not text or not text.strip():
        # Nothing to analyse; skip the model call entirely.
        return vendor_name, 0.0

    current_entity = ""
    for entity in ner_pipeline(text):
        label = entity["entity"]
        word = entity["word"]
        if label.startswith("B-ORG"):
            current_entity = word
        elif label.startswith("I-ORG") and current_entity:
            if word.startswith("##"):
                # "##" marks a sub-word continuation: glue it onto the
                # previous piece instead of inserting a space.
                current_entity += word[2:]
            else:
                current_entity += " " + word

    if current_entity:
        vendor_name = current_entity

    return vendor_name, _parse_amount(text)
58
+
59
def detect_anomalies(df):
    """Flag anomalous invoices with an Isolation Forest on the amount column.

    The ``amount`` column is standardised, an ``IsolationForest``
    (contamination 0.05, fixed seed 42) is fit on it, and its predictions
    are written to a new ``is_anomaly`` column.

    Args:
        df: DataFrame with an ``amount`` column. It is modified in place.

    Returns:
        The same DataFrame, now carrying the ``is_anomaly`` column.

    NOTE(review): the model is fit on the very rows it scores, so a frame
    with very few rows gives it little to compare against — confirm this
    is intended.
    """
    feature_columns = ["amount"]
    scaled_features = StandardScaler().fit_transform(df[feature_columns])
    forest = IsolationForest(contamination=0.05, random_state=42)
    df["is_anomaly"] = forest.fit_predict(scaled_features)
    return df
67
+
68
def calculate_fraud_score(
    amount,
    is_anomaly,
    items_listed,
    *,
    high_amount=5000,
    low_amount=10,
    max_item_words=100,
):
    """Calculate a rule-based fraud score for one invoice.

    Args:
        amount: Invoice amount.
        is_anomaly: Anomaly label; ``-1`` means the invoice was flagged.
        items_listed: Free text describing the invoice items. ``None`` is
            treated as empty text.
        high_amount: Amounts strictly above this add 40 points.
        low_amount: Amounts strictly below this add 20 points.
        max_item_words: More whitespace-separated words than this in
            ``items_listed`` adds 10 points.

    Returns:
        A ``(score, reasoning)`` tuple: the score capped at 100 and the
        triggered reasons joined with ``"; "`` (empty string if none).
    """
    score = 0.0
    reasoning = []

    if amount > high_amount:
        score += 40
        reasoning.append("High invoice amount detected.")
    elif amount < low_amount:
        score += 20
        reasoning.append("Unusually low invoice amount.")

    if is_anomaly == -1:
        score += 30
        reasoning.append("Invoice flagged as an anomaly.")

    # `or ""` keeps a missing item description from raising on .split().
    if len((items_listed or "").split()) > max_item_words:
        score += 10
        reasoning.append("Excessive number of items listed.")

    return min(score, 100), "; ".join(reasoning)
89
+
90
def process_invoice(pdf_file):
    """Process a single invoice PDF and return the fraud-detection result.

    Extracts the text, derives vendor and amount, runs anomaly detection
    and rule-based scoring, writes the result to
    ``fraud_detection_results.json`` (overwritten on every call), and
    returns it.

    Args:
        pdf_file: The uploaded PDF, passed to ``extract_text_from_pdf``.

    Returns:
        On success, a JSON string (indent 2) with the keys
        ``Invoice_Record__c``, ``Entities`` and ``Anomalies``. If text
        extraction failed, a dict ``{"error": <message>}`` instead.
    """
    text = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure with this exact prefix. Matching
    # the prefix, not the bare word "Error" anywhere in the text, keeps
    # invoices that merely mention "Error" from being rejected.
    if text.startswith("Error extracting text:"):
        return {"error": text}

    vendor_name, amount = extract_entities(text)
    # The processing date is used; no date is parsed from the invoice.
    invoice_date = datetime.now().date()
    # Only the first 500 characters of the text are kept as the items summary.
    items_listed = text[:500]

    data = {
        "invoice_id": str(uuid.uuid4()),
        "vendor_name": vendor_name,
        "amount": amount,
        "invoice_date": invoice_date,
        "items_listed": items_listed,
    }
    df = detect_anomalies(pd.DataFrame([data]))

    is_anomaly = df["is_anomaly"].iloc[0]
    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0], is_anomaly, items_listed
    )
    flagged = fraud_score > 50

    output = {
        "Invoice_Record__c": {
            "Vendor_Name__c": vendor_name,
            "Invoice_Amount__c": amount,
            "Invoice_Date__c": str(invoice_date),
            "Items_Listed__c": items_listed,
            "Fraud_Score__c": fraud_score,
            "Fraud_Reasoning__c": fraud_reasoning,
            "Flagged__c": flagged,
            "Reviewed_By__c": None,
            "Status__c": "Flagged" if flagged else "Cleared",
        },
        "Entities": {
            "Vendor": vendor_name,
            "Amount": amount,
        },
        "Anomalies": "Anomaly detected" if is_anomaly == -1 else "No anomalies",
    }

    # Save to JSON file (a one-element list, replacing any previous result).
    output_file = "fraud_detection_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump([output], f, indent=2)

    return json.dumps(output, indent=2)
140
+
141
def gradio_interface(pdf_file):
    """Gradio callback: process the uploaded PDF and return displayable results.

    Args:
        pdf_file: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        Whatever ``process_invoice`` returns for the file, or
        ``{"error": "Please upload a PDF file."}`` when no file was given.
    """
    if pdf_file is None:
        # The result feeds a gr.JSON output, so return a JSON-compatible dict
        # (same {"error": ...} shape process_invoice uses), not a bare
        # sentence, which is not valid JSON.
        return {"error": "Please upload a PDF file."}
    return process_invoice(pdf_file)
147
+
148
+ # Create Gradio interface
149
+ iface = gr.Interface(
150
+ fn=gradio_interface,
151
+ inputs=gr.File(label="Upload Invoice PDF"),
152
+ outputs=gr.JSON(label="Fraud Detection Results"),
153
+ title="Invoice Fraud Detection",
154
+ description="Upload a PDF invoice to detect potential fraud."
155
+ )
156
 
157
  if __name__ == "__main__":
158
+ iface.launch()