Abhisesh7 committed on
Commit
fa19356
·
verified ·
1 Parent(s): fc14cba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +256 -306
app.py CHANGED
@@ -1,336 +1,286 @@
1
- import os
2
- from dotenv import load_dotenv
3
- import logging
4
- import pdfplumber
5
  import pandas as pd
6
  import numpy as np
7
- from transformers import pipeline
8
- from sklearn.ensemble import IsolationForest
9
- from sklearn.preprocessing import StandardScaler
10
- import uuid
11
  from datetime import datetime, timedelta
12
- import re
13
  import gradio as gr
14
- from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
15
-
16
- # Load environment variables from .env file
17
- load_dotenv()
18
-
19
- # Configure environment for CPU usage
20
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
21
- os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
22
-
23
- # Set up logging to suppress transformers warnings
24
- logging.getLogger("transformers").setLevel(logging.ERROR)
25
 
26
- # Read Salesforce credentials from environment variables
27
- SF_USERNAME = os.getenv("SF_USERNAME")
28
- SF_PASSWORD = os.getenv("SF_PASSWORD")
29
- SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
30
-
31
- print(f"Salesforce login info: username={SF_USERNAME}")
32
-
33
- # Salesforce connection with error handling
34
- try:
35
- sf = Salesforce(
36
- username=SF_USERNAME,
37
- password=SF_PASSWORD,
38
- security_token=SF_SECURITY_TOKEN
39
- )
40
- print("Salesforce login successful.")
41
- except SalesforceAuthenticationFailed as e:
42
- print(f"Salesforce authentication failed: {e}")
43
- sf = None
44
-
45
- # Initialize Hugging Face NER pipeline (force CPU)
46
- ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
47
-
48
- def extract_text_from_pdf(pdf_file):
49
- """Extract text from a PDF invoice."""
50
- try:
51
- with pdfplumber.open(pdf_file) as pdf:
52
- text = ""
53
- for page in pdf.pages:
54
- page_text = page.extract_text() or ""
55
- text += page_text + "\n"
56
- print("Extracted text:\n", text) # Debug: Print extracted text
57
- return text
58
- except Exception as e:
59
- return f"Error extracting text: {str(e)}"
60
 
 
61
  def extract_entities(text):
62
- """Extract structured invoice details using flexible regex patterns."""
63
- invoice_number = "Unknown"
64
- vendor_name = "Unknown"
65
- invoice_date = datetime.now().date()
66
- total_amount = 0.0
67
-
68
- # Flexible regex patterns to handle variations
69
- invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)\s*[:\-\s]*)([\w-]+)"
70
- vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From)\s*[:\-\s]*([A-Za-z\s&\.]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
71
- invoice_date_pattern = r"(?:Invoice\s*Date\s*[:\-\s]*|Date\s*[:\-\s]*)(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
72
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?\s*[:\-\s]*\$?)([\d,]+\.?\d*)"
73
-
74
- # Invoice Number
75
- invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
76
- if invoice_num_match:
77
- invoice_number = invoice_num_match.group(1)
78
- print(f"Matched Invoice Number: {invoice_number}") # Debug
79
-
80
- # Vendor Name
81
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
82
  if vendor_match:
83
  vendor_name = vendor_match.group(1).strip()
84
- print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
85
- else:
86
- # Enhanced NER fallback for multi-word organization names
87
- ner_results = ner_pipeline(text)
88
- org_name_parts = []
89
- for i, entity in enumerate(ner_results):
90
- if entity['entity'].startswith('B-ORG'):
91
- org_name_parts = [entity['word']]
92
- elif entity['entity'].startswith('I-ORG') and org_name_parts:
93
- org_name_parts.append(entity['word'])
94
- if org_name_parts:
95
- vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
96
- print(f"NER Matched Vendor Name: {vendor_name}") # Debug
97
-
98
- # Invoice Date
99
- invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
100
- if invoice_date_match:
101
- date_str = invoice_date_match.group(1)
102
- try:
103
- if "/" in date_str:
104
- invoice_date = datetime.strptime(date_str, "%m/%d/%Y").date()
105
- elif "," in date_str:
106
- invoice_date = datetime.strptime(date_str, "%B %d, %Y").date()
107
- elif "-" in date_str:
108
- try:
109
- invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
110
- except ValueError:
111
- invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
112
- print(f"Matched Invoice Date: {invoice_date}") # Debug
113
- except ValueError as e:
114
- print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
115
-
116
- # Total Amount
117
- total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
118
- if total_amount_match:
119
- total_amount = float(total_amount_match.group(1).replace(",", ""))
120
- print(f"Matched Total Amount: {total_amount}") # Debug
121
-
122
- return invoice_number, vendor_name, invoice_date, total_amount
123
-
124
- def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
125
- """Fetch historical invoices for the vendor from Salesforce."""
126
- if sf is None:
127
- return pd.DataFrame()
128
-
129
- try:
130
- end_date = datetime.now().date()
131
- start_date = end_date - timedelta(days=time_window_days)
132
-
133
- query = f"""
134
- SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
135
- FROM Invoice_Record__c
136
- WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
137
- AND Vendor_Name__c = '{vendor_name}'
138
- LIMIT 100
139
- """
140
- result = sf.query(query)
141
- records = result['records']
142
-
143
- history_df = pd.DataFrame(records)
144
- if not history_df.empty:
145
- history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
146
- return history_df
147
- except Exception as e:
148
- print(f"Failed to fetch vendor history: {str(e)}")
149
- return pd.DataFrame()
150
-
151
- def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
152
- """Check for data consistency issues like duplicates."""
153
- consistency_issues = []
154
-
155
- if not history_df.empty:
156
- duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
157
- if not duplicate_invoices.empty:
158
- consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
159
-
160
- return consistency_issues
161
-
162
- def detect_anomalies(df, history_df):
163
- """Detect anomalies in amount, frequency, and vendor patterns."""
164
- df["is_amount_anomaly"] = 0
165
- df["is_frequency_anomaly"] = 0
166
- df["is_vendor_pattern_anomaly"] = 0
167
-
168
- if not df.empty:
169
- scaler = StandardScaler()
170
- X_scaled = scaler.fit_transform(df[["amount"]])
171
- model = IsolationForest(contamination=0.05, random_state=42)
172
- df["is_amount_anomaly"] = model.fit_predict(X_scaled)
173
-
174
- if not history_df.empty:
175
- history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c'])
176
- date_range = (history_df['Invoice_Date__c'].max() - history_df['Invoice_Date__c'].min()).days + 1
177
- frequency = len(history_df) / max(date_range, 1)
178
-
179
- date_diffs = [(d - history_df['Invoice_Date__c'].min()).days for d in history_df['Invoice_Date__c']]
180
- date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
181
-
182
- frequency_df = pd.DataFrame({
183
- "frequency": [frequency],
184
- "date_clustering": [date_clustering]
185
- })
186
- scaler = StandardScaler()
187
- X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
188
- model = IsolationForest(contamination=0.05, random_state=42)
189
- df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
190
  else:
191
- df["is_frequency_anomaly"] = 1
192
-
193
- if not history_df.empty and len(history_df) > 1:
194
- historical_amounts = history_df["Invoice_Amount__c"].astype(float)
195
- mean_amount = historical_amounts.mean()
196
- std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
197
- amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0
198
-
199
- current_amount = df["amount"].iloc[0]
200
- deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
201
- invoice_count = len(history_df)
202
-
203
- vendor_pattern_df = pd.DataFrame({
204
- "amount_deviation": [deviation],
205
- "invoice_count": [invoice_count],
206
- "amount_variance": [amount_variance]
207
- })
208
- scaler = StandardScaler()
209
- X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
210
- model = IsolationForest(contamination=0.05, random_state=42)
211
- df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
212
  else:
213
- df["is_vendor_pattern_anomaly"] = 1
214
 
215
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues):
218
- """Calculate fraud score based on amount, anomalies, text length, and consistency issues."""
219
- score = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  reasoning = []
221
 
222
- if amount > 5000:
223
- score += 40
 
224
  reasoning.append("High invoice amount detected.")
225
- elif amount < 10:
226
- score += 20
 
 
227
  reasoning.append("Unusually low invoice amount.")
228
 
229
- if is_amount_anomaly == -1:
230
- score += 30
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  reasoning.append("Amount flagged as an anomaly.")
232
- if is_frequency_anomaly == -1:
233
- score += 25
234
  reasoning.append("Unusual invoice submission frequency or clustering detected.")
235
- if is_vendor_pattern_anomaly == -1:
236
- score += 25
237
  reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")
238
 
239
- if text_length > 500:
240
- score += 10
241
- reasoning.append("Excessive text length in invoice.")
242
-
243
- if consistency_issues:
244
- score += 15 * len(consistency_issues)
245
- reasoning.extend(consistency_issues)
246
-
247
- return min(score, 100), reasoning
248
 
249
- def process_invoice(pdf_file):
250
- """Process a single invoice PDF and return structured markdown output."""
251
- text = extract_text_from_pdf(pdf_file)
252
- if "Error" in text:
253
- return f"**Error**: {text}"
254
-
255
- invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
256
- text_length = len(text)
257
 
258
- history_df = fetch_vendor_history(vendor_name, invoice_number)
259
- consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
260
 
261
- data = {
262
- "invoice_id": str(uuid.uuid4()),
263
- "invoice_number": invoice_number,
264
- "vendor_name": vendor_name,
265
- "amount": total_amount,
266
- "invoice_date": invoice_date,
267
- "text_length": text_length
 
268
  }
269
- df = pd.DataFrame([data])
270
 
271
- df = detect_anomalies(df, history_df)
272
-
273
- fraud_score, fraud_reasoning = calculate_fraud_score(
274
- df["amount"].iloc[0],
275
- df["is_amount_anomaly"].iloc[0],
276
- df["is_frequency_anomaly"].iloc[0],
277
- df["is_vendor_pattern_anomaly"].iloc[0],
278
- text_length,
279
- consistency_issues
280
- )
281
-
282
- output = [
283
- "## Fraud Detection Summary",
284
- f"- **Invoice Number**: {invoice_number}",
285
- f"- **Vendor Name**: {vendor_name}",
286
- f"- **Invoice Date**: {invoice_date}",
287
- f"- **Invoice Amount**: ${total_amount:,.2f}",
288
- f"- **Fraud Score**: {fraud_score}",
289
- f"- **Status**: {'Flagged' if fraud_score > 50 else 'Cleared'}",
290
- f"- **Flagged**: {fraud_score > 50}",
291
- f"- **Amount Anomaly**: {'Anomaly detected' if df['is_amount_anomaly'].iloc[0] == -1 else 'No anomalies'}",
292
- f"- **Frequency Anomaly**: {'Anomaly detected' if df['is_frequency_anomaly'].iloc[0] == -1 else 'No anomalies'}",
293
- f"- **Vendor Pattern Anomaly**: {'Anomaly detected' if df['is_vendor_pattern_anomaly'].iloc[0] == -1 else 'No anomalies'}",
294
- "",
295
- "## Fraud Reasoning"
296
- ]
297
-
298
- if fraud_reasoning:
299
- output.extend([f"- {reason}" for reason in fraud_reasoning])
300
- else:
301
- output.append("- No specific fraud indicators detected")
302
-
303
- if sf is not None:
304
- try:
305
- sf.Invoice_Record__c.create({
306
- "Invoice_Number__c": invoice_number,
307
- "Vendor_Name__c": vendor_name,
308
- "Invoice_Amount__c": total_amount,
309
- "Invoice_Date__c": str(invoice_date),
310
- "Fraud_Score__c": fraud_score,
311
- "Fraud_Reason__c": "; ".join(fraud_reasoning),
312
- "Flagged__c": fraud_score > 50,
313
- "Status__c": "Flagged" if fraud_score > 50 else "Cleared"
314
- })
315
- except Exception as e:
316
- print(f"Failed to create Salesforce record: {str(e)}")
317
- pass
318
-
319
- return "\n".join(output)
320
-
321
- def gradio_interface(pdf_file):
322
- """Gradio interface to process uploaded PDF and display structured results."""
323
- if pdf_file is None:
324
- return "Please upload a PDF file."
325
- result = process_invoice(pdf_file)
326
- return result
327
-
328
- with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
329
- gr.Markdown("# Invoice Fraud Detection")
330
- with gr.Row():
331
- file_input = gr.File(label="Upload Invoice PDF")
332
- result_output = gr.Markdown(label="Fraud Detection Results")
333
- file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
 
334
 
335
  if __name__ == "__main__":
336
- iface.launch()
 
1
+ import re
 
 
 
2
  import pandas as pd
3
  import numpy as np
 
 
 
 
4
  from datetime import datetime, timedelta
 
5
  import gradio as gr
6
+ from simple_salesforce import Salesforce
7
+ import warnings
8
+ warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
9
 
10
+ # Salesforce connection (mocked for this example)
11
def connect_to_salesforce() -> None:
    """Return the Salesforce client handle used by the invoice pipeline.

    This is a placeholder: no connection is opened and ``None`` is always
    returned. Replace the body with a real ``Salesforce(...)`` login when
    credentials are available.
    """
    return None  # Replace with actual Salesforce connection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # Extract entities from OCR text
15
# A money figure must start with a digit, so a lone "," can never be captured
# (the looser ``[\d,]+`` form let ``float("")`` blow up on "Total, ...").
_AMOUNT_RE = r"(\d[\d,]*\.?\d*)"


def _parse_amount(raw):
    """Convert a captured figure such as ``"$1,234.50"`` to a float."""
    return float(raw.replace("$", "").replace(",", ""))


def _explicit_total(text):
    """Return the total stated in the text, or None when none is stated."""
    pattern = (
        r"(?:Total\s*(?:Amount|before\s*tax)?\s*[:\-\s]*\$?"
        r"|Total\s*\|\s*\$"
        r"|Summary:\s*Total\s*before\s*tax:\s*\$)" + _AMOUNT_RE
    )
    matches = list(re.finditer(pattern, text, re.IGNORECASE))
    if not matches:
        return None
    # Prefer a stand-alone "Total" over the tail of a word such as
    # "Subtotal"; fall back to the first match when nothing else exists.
    standalone = [
        m for m in matches
        if m.start() == 0 or not text[m.start() - 1].isalpha()
    ]
    return _parse_amount((standalone or matches)[0].group(1))


def _line_item_total(text):
    """Sum the last cell of every pipe-delimited table row that parses as money."""
    total = 0.0
    for row in re.findall(r"\|.*?\|.*?\|.*?\|.*?\|", text, re.DOTALL):
        for line in row.split("\n"):
            if "Total Price" in line or line.startswith("| Item Description"):
                continue  # header rows carry no amount
            cells = [cell.strip() for cell in line.split("|") if cell.strip()]
            if len(cells) < 4:
                continue
            try:
                total += _parse_amount(cells[-1])
            except ValueError:
                continue  # separator rows / non-numeric last cell
    return total


def _labelled_charge(label_pattern, text):
    """Return the dollar figure following *label_pattern*, or 0.0 if absent."""
    match = re.search(
        label_pattern + r"\s*[:\-\s]*\$" + _AMOUNT_RE, text, re.IGNORECASE
    )
    return _parse_amount(match.group(1)) if match else 0.0


def extract_entities(text):
    """Pull the invoice header fields out of OCR-style invoice text.

    Args:
        text: Raw invoice text.

    Returns:
        A dict with the keys ``invoice_number`` (str or None),
        ``vendor_name`` (str, ``"Unknown Vendor"`` when not found),
        ``invoice_date`` (naive ``datetime``; the current time when no valid
        ``YYYY-MM-DD`` date follows a "Date" label), ``total_amount`` (float)
        and ``text_length`` (``len(text)``).
    """
    # Invoice number; an optional "." lets "Invoice No. 123" match as well.
    invoice_number = None
    invoice_match = re.search(
        r"(?:Invoice\s*(?:No|#|Number|Advice\s*No)\.?\s*[:\-\s]*)([A-Za-z0-9\-]+)",
        text,
        re.IGNORECASE,
    )
    if invoice_match:
        invoice_number = invoice_match.group(1).strip()

    # Vendor name: the remainder of the line after a "Vendor" label.
    vendor_name = "Unknown Vendor"
    vendor_match = re.search(r"(?:Vendor\s*[:\-\s]*)([^\n]+)", text, re.IGNORECASE)
    if vendor_match:
        vendor_name = vendor_match.group(1).strip() or "Unknown Vendor"

    # Invoice date: the regex only checks the digit layout, so an impossible
    # date such as 2024-13-45 still has to be rejected by strptime.
    invoice_date = None
    date_match = re.search(
        r"(?:Date\s*[:\-\s]*)([0-9]{4}-[0-9]{2}-[0-9]{2})", text, re.IGNORECASE
    )
    if date_match:
        try:
            invoice_date = datetime.strptime(date_match.group(1), "%Y-%m-%d")
        except ValueError:
            invoice_date = None
    if invoice_date is None:
        invoice_date = datetime.now()

    # Total amount: use the stated total, otherwise rebuild it from the table.
    total_amount = _explicit_total(text)
    if total_amount is None:
        total_amount = _line_item_total(text)
        # NOTE(review): the listing this was recovered from had lost its
        # indentation; the shipping/discount/tax adjustments are assumed to
        # apply only to a total rebuilt from line items - confirm.
        total_amount += _labelled_charge(r"Shipping\s*Cost", text)
        total_amount -= _labelled_charge(r"Discount\s*\(\d+%\)", text)
        total_amount += _labelled_charge(r"Tax\s*\(\d+%\)", text)

    return {
        "invoice_number": invoice_number,
        "vendor_name": vendor_name,
        "invoice_date": invoice_date,
        "total_amount": total_amount,
        "text_length": len(text),
    }
90
 
91
+ # Fetch vendor history from Salesforce
92
def fetch_vendor_history(sf, vendor_name, invoice_date):
    """Return the vendor's earlier invoices from the 30 days before *invoice_date*.

    The Salesforce query is simulated: records are read from the module-level
    ``invoice_history`` list and *sf* is not used.

    Args:
        sf: Salesforce client handle (currently ignored).
        vendor_name: Vendor whose invoices are wanted (exact match).
        invoice_date: ``datetime`` of the invoice being checked.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Invoice_Number__c``,
        ``Invoice_Amount__c`` and ``Invoice_Date__c`` (``datetime``). The
        columns are present even when no invoice qualifies.
    """
    columns = ["Invoice_Number__c", "Invoice_Amount__c", "Invoice_Date__c"]
    history = []
    for invoice in invoice_history:  # global list standing in for Salesforce
        if invoice["Vendor_Name__c"] != vendor_name:
            continue
        inv_date = datetime.strptime(invoice["Invoice_Date__c"], "%Y-%m-%d")
        # Keep strictly earlier invoices that are at most 30 days old.
        if inv_date < invoice_date and (invoice_date - inv_date).days <= 30:
            history.append({
                "Invoice_Number__c": invoice["Invoice_Number__c"],
                "Invoice_Amount__c": invoice["Invoice_Amount__c"],
                "Invoice_Date__c": inv_date,
            })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(history, columns=columns)
107
+
108
+ # Check for duplicate invoices
109
def check_data_consistency(history_df, invoice_number, vendor_name):
    """Report duplicate invoice numbers in the supplied history.

    Args:
        history_df: DataFrame of earlier invoices; only the
            ``Invoice_Number__c`` column is read.
        invoice_number: Number of the invoice being checked, or None when
            none could be extracted.
        vendor_name: Vendor name, used only in the message text.

    Returns:
        A list with one message when *invoice_number* already appears in
        *history_df*, otherwise an empty list.
    """
    # Nothing to compare when there is no number, no history, or no column.
    if (
        invoice_number is None
        or history_df.empty
        or "Invoice_Number__c" not in history_df.columns
    ):
        return []

    if (history_df["Invoice_Number__c"] == invoice_number).any():
        return [f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."]
    return []
119
+
120
+ # Detect anomalies
121
def detect_anomalies(history_df, current_amount, current_date):
    """Compare the current invoice with the vendor's history using fixed rules.

    Args:
        history_df: DataFrame with ``Invoice_Amount__c`` and
            ``Invoice_Date__c`` columns.
        current_amount: Amount of the invoice being scored.
        current_date: Date of the invoice being scored (currently unused).

    Returns:
        A 6-tuple ``(amount_anomaly, frequency_anomaly,
        vendor_pattern_anomaly, amount_score, frequency_score,
        vendor_pattern_score)``. The first three are ``"No anomalies"`` or
        ``"Anomaly detected"``; the scores are 30, 25 and 25 when the matching
        rule fires and 0 otherwise. With fewer than three history rows nothing
        is flagged.
    """
    normal, flagged = "No anomalies", "Anomaly detected"

    # Too little history to judge anything.
    if len(history_df) < 3:
        return normal, normal, normal, 0, 0, 0

    # Amount rule: more than two (population) standard deviations from the mean.
    amounts = history_df["Invoice_Amount__c"].to_numpy(dtype=float)
    mean_amount = np.mean(amounts)
    std_amount = np.std(amounts)
    amount_is_outlier = bool(
        std_amount > 0
        and (
            current_amount > mean_amount + 2 * std_amount
            or current_amount < mean_amount - 2 * std_amount
        )
    )

    # Frequency rule: more than one invoice per day over the covered span, or
    # invoice dates clustered within (0, 1) day of spread.
    dates = pd.to_datetime(history_df["Invoice_Date__c"])
    first_day = dates.min()  # computed once instead of once per row
    span_days = (dates.max() - first_day).days + 1  # always >= 1
    frequency = len(dates) / span_days
    date_clustering = np.std((dates - first_day).dt.days.to_numpy())
    frequency_is_odd = bool(frequency > 1 or 0 < date_clustering < 1)

    # NOTE(review): the vendor-pattern rule uses exactly the same condition as
    # the amount rule, so one outlying amount scores 30 + 25. The original
    # comment describes "amount deviation is high and invoice count pattern is
    # unusual" - confirm which behaviour is intended.
    vendor_pattern_is_odd = amount_is_outlier

    return (
        flagged if amount_is_outlier else normal,
        flagged if frequency_is_odd else normal,
        flagged if vendor_pattern_is_odd else normal,
        30 if amount_is_outlier else 0,
        25 if frequency_is_odd else 0,
        25 if vendor_pattern_is_odd else 0,
    )
156
+
157
+ # Calculate fraud score
158
def calculate_fraud_score(extracted_data, history_df, consistency_issues):
    """Combine the fixed rules and the anomaly checks into a fraud verdict.

    Args:
        extracted_data: Dict with at least ``total_amount`` and
            ``invoice_date``.
        history_df: Vendor history passed on to ``detect_anomalies``.
        consistency_issues: List of issue messages; each adds 15 points and is
            copied into the reasoning.

    Returns:
        A dict with ``fraud_score`` (capped at 100), ``status``
        (``"Flagged"`` at 50 or more, else ``"Cleared"``), ``flagged`` (bool),
        the three anomaly labels and ``reasoning`` (list of str, never empty).
    """
    invoice_amount = extracted_data["total_amount"]
    invoice_date = extracted_data["invoice_date"]

    fraud_score = 0
    reasoning = []

    # Rule 1: high invoice amount.
    if invoice_amount > 5000:
        fraud_score += 40
        reasoning.append("High invoice amount detected.")

    # Rule 2: low invoice amount.
    if invoice_amount < 10:
        fraud_score += 20
        reasoning.append("Unusually low invoice amount.")

    # The text-length rule contributed 0 points, so it is not evaluated.

    # Consistency issues: 15 points each.
    fraud_score += len(consistency_issues) * 15
    reasoning.extend(consistency_issues)

    # History-based anomaly checks.
    (
        amount_anomaly,
        frequency_anomaly,
        vendor_pattern_anomaly,
        amount_score,
        frequency_score,
        vendor_pattern_score,
    ) = detect_anomalies(history_df, invoice_amount, invoice_date)
    fraud_score += amount_score + frequency_score + vendor_pattern_score
    if amount_score > 0:
        reasoning.append("Amount flagged as an anomaly.")
    if frequency_score > 0:
        reasoning.append("Unusual invoice submission frequency or clustering detected.")
    if vendor_pattern_score > 0:
        reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")

    # Cap the score and derive the verdict.
    fraud_score = min(fraud_score, 100)
    flagged = fraud_score >= 50

    if not reasoning:
        reasoning.append("No specific fraud indicators detected")

    return {
        "fraud_score": fraud_score,
        "status": "Flagged" if flagged else "Cleared",
        "flagged": flagged,
        "amount_anomaly": amount_anomaly,
        "frequency_anomaly": frequency_anomaly,
        "vendor_pattern_anomaly": vendor_pattern_anomaly,
        "reasoning": reasoning,
    }
 
219
 
220
+ # Save to Salesforce
221
def save_to_salesforce(sf, extracted_data, fraud_results):
    """Record the scored invoice.

    Saving is simulated: the record is appended to the module-level
    ``invoice_history`` list and *sf* is not used.

    Args:
        sf: Salesforce client handle (currently ignored).
        extracted_data: Dict with ``invoice_number``, ``vendor_name``,
            ``invoice_date`` (``datetime``) and ``total_amount``.
        fraud_results: Dict with ``fraud_score`` and ``status``; ``flagged``
            and ``reasoning`` are stored too when present.
    """
    invoice_history.append({
        "Invoice_Number__c": extracted_data["invoice_number"],
        "Vendor_Name__c": extracted_data["vendor_name"],
        # Stored as text; fetch_vendor_history parses it back with %Y-%m-%d.
        "Invoice_Date__c": extracted_data["invoice_date"].strftime("%Y-%m-%d"),
        "Invoice_Amount__c": extracted_data["total_amount"],
        "Fraud_Score__c": fraud_results["fraud_score"],
        "Status__c": fraud_results["status"],
        # Keep the verdict details alongside the score.
        "Flagged__c": fraud_results.get("flagged", False),
        "Fraud_Reason__c": "; ".join(fraud_results.get("reasoning", [])),
    })
231
+
232
+ # Main processing function
233
def process_invoice(pdf_file):
    """Run the full fraud check on one invoice and return a Markdown report.

    Args:
        pdf_file: The invoice text itself (OCR is simulated; no file is read).

    Returns:
        A Markdown string with the extracted fields, the fraud verdict and one
        bullet per reason. Blank input returns a short prompt instead and is
        not scored or saved.
    """
    # Blank input would otherwise be scored and saved as a $0.00 invoice.
    if pdf_file is None or not str(pdf_file).strip():
        return "Please paste the invoice text to analyze."

    text = pdf_file  # the text is already provided; no OCR step

    sf = connect_to_salesforce()
    extracted_data = extract_entities(text)
    history_df = fetch_vendor_history(
        sf, extracted_data["vendor_name"], extracted_data["invoice_date"]
    )
    consistency_issues = check_data_consistency(
        history_df, extracted_data["invoice_number"], extracted_data["vendor_name"]
    )
    fraud_results = calculate_fraud_score(extracted_data, history_df, consistency_issues)
    save_to_salesforce(sf, extracted_data, fraud_results)

    invoice_day = extracted_data["invoice_date"].strftime("%Y-%m-%d")
    amount = extracted_data["total_amount"]

    # Built line by line so no source indentation leaks into the Markdown.
    report = [
        "## Fraud Detection Summary",
        f"- **Invoice Number**: {extracted_data['invoice_number']}",
        f"- **Vendor Name**: {extracted_data['vendor_name']}",
        f"- **Invoice Date**: {invoice_day}",
        f"- **Invoice Amount**: ${amount:.2f}",
        f"- **Fraud Score**: {fraud_results['fraud_score']}",
        f"- **Status**: {fraud_results['status']}",
        f"- **Flagged**: {fraud_results['flagged']}",
        f"- **Amount Anomaly**: {fraud_results['amount_anomaly']}",
        f"- **Frequency Anomaly**: {fraud_results['frequency_anomaly']}",
        f"- **Vendor Pattern Anomaly**: {fraud_results['vendor_pattern_anomaly']}",
        "",
        "## Fraud Reasoning",
    ]
    # One bullet per reason: the messages already end with a period, so
    # joining them with ". " produced doubled periods in a single bullet.
    report.extend(f"- {reason}" for reason in fraud_results["reasoning"])
    return "\n".join(report)
273
+
274
+ # Global list to simulate Salesforce database
275
# In-memory stand-in for the Salesforce invoice table; it is read by
# fetch_vendor_history and appended to by save_to_salesforce.
invoice_history = []

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Invoice Fraud Detection App")
    # Multi-line box: the extraction patterns rely on the line breaks of the
    # pasted invoice text.
    pdf_input = gr.Textbox(
        label="Upload Invoice Text (Simulated OCR Output)",
        placeholder="Paste the OCR-extracted text here...",
        lines=10,
    )
    output = gr.Markdown()
    submit_btn = gr.Button("Process Invoice")
    submit_btn.click(fn=process_invoice, inputs=pdf_input, outputs=output)

if __name__ == "__main__":
    demo.launch()