Abhisesh7 commited on
Commit
f8984f3
·
verified ·
1 Parent(s): fa19356

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -256
app.py CHANGED
@@ -1,286 +1,336 @@
1
- import re
 
 
 
2
  import pandas as pd
3
  import numpy as np
 
 
 
 
4
  from datetime import datetime, timedelta
 
5
  import gradio as gr
6
- from simple_salesforce import Salesforce
7
- import warnings
8
- warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
9
 
10
- # Salesforce connection (mocked for this example)
11
- def connect_to_salesforce():
12
- return None # Replace with actual Salesforce connection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Extract entities from OCR text
15
  def extract_entities(text):
16
- invoice_number = None
17
- vendor_name = None
18
- invoice_date = None
19
- total_amount = None
20
-
21
- # Extract invoice number
22
- invoice_number_pattern = r"(?:Invoice\s*(?:No|#|Number|Advice\s*No)\s*[:\-\s]*)([A-Za-z0-9\-]+)"
23
- invoice_match = re.search(invoice_number_pattern, text, re.IGNORECASE)
24
- if invoice_match:
25
- invoice_number = invoice_match.group(1).strip()
26
-
27
- # Extract vendor name
28
- vendor_pattern = r"(?:Vendor\s*[:\-\s]*)([^\n]+)"
 
 
 
 
 
 
29
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
30
  if vendor_match:
31
  vendor_name = vendor_match.group(1).strip()
 
32
  else:
33
- # Fallback: Use NER or assume "Unknown Vendor" if not found
34
- vendor_name = "Unknown Vendor"
35
-
36
- # Extract date
37
- date_pattern = r"(?:Date\s*[:\-\s]*)([0-9]{4}-[0-9]{2}-[0-9]{2})"
38
- date_match = re.search(date_pattern, text, re.IGNORECASE)
39
- if date_match:
40
- invoice_date = datetime.strptime(date_match.group(1), "%Y-%m-%d")
41
- else:
42
- invoice_date = datetime.now()
43
-
44
- # Extract total amount
45
- total_amount_pattern = r"(?:Total\s*(?:Amount|before\s*tax)?\s*[:\-\s]*\$?|Total\s*\|\s*\$|Summary:\s*Total\s*before\s*tax:\s*\$)([\d,]+\.?\d*)"
46
- total_match = re.search(total_amount_pattern, text, re.IGNORECASE)
47
- if total_match:
48
- total_amount = float(total_match.group(1).replace(",", ""))
49
- else:
50
- # Calculate from table if total amount not explicitly stated
51
- table_pattern = r"\|.*?\|.*?\|.*?\|.*?\|"
52
- tables = re.findall(table_pattern, text, re.DOTALL)
53
- total_amount = 0.0
54
- for table in tables:
55
- lines = table.split("\n")
56
- for line in lines:
57
- if "Total Price" in line or line.startswith("| Item Description"):
58
- continue
59
- cells = [cell.strip() for cell in line.split("|") if cell.strip()]
60
- if len(cells) >= 4:
61
- try:
62
- total_price = float(cells[-1].replace("$", "").replace(",", ""))
63
- total_amount += total_price
64
- except (ValueError, IndexError):
65
- continue
66
-
67
- # Adjust for shipping, discount, or tax if present
68
- shipping_pattern = r"Shipping\s*Cost\s*[:\-\s]*\$([\d,]+\.?\d*)"
69
- shipping_match = re.search(shipping_pattern, text, re.IGNORECASE)
70
- if shipping_match:
71
- total_amount += float(shipping_match.group(1).replace(",", ""))
72
-
73
- discount_pattern = r"Discount\s*\(\d+%\)\s*[:\-\s]*\$([\d,]+\.?\d*)"
74
- discount_match = re.search(discount_pattern, text, re.IGNORECASE)
75
- if discount_match:
76
- total_amount -= float(discount_match.group(1).replace(",", ""))
77
-
78
- tax_pattern = r"Tax\s*\(\d+%\)\s*[:\-\s]*\$([\d,]+\.?\d*)"
79
- tax_match = re.search(tax_pattern, text, re.IGNORECASE)
80
- if tax_match:
81
- total_amount += float(tax_match.group(1).replace(",", ""))
82
-
83
- return {
84
- "invoice_number": invoice_number,
85
- "vendor_name": vendor_name,
86
- "invoice_date": invoice_date,
87
- "total_amount": total_amount,
88
- "text_length": len(text)
89
- }
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- # Fetch vendor history from Salesforce
92
- def fetch_vendor_history(sf, vendor_name, invoice_date):
93
- # Simulate Salesforce query
94
- # In practice, replace with actual Salesforce query
95
- # Query: Select invoices for the vendor within the last 30 days
96
- history = []
97
- for invoice in invoice_history: # invoice_history is a global list for this example
98
- if invoice["Vendor_Name__c"] == vendor_name:
99
- inv_date = datetime.strptime(invoice["Invoice_Date__c"], "%Y-%m-%d")
100
- if (invoice_date - inv_date).days <= 30 and inv_date < invoice_date:
101
- history.append({
102
- "Invoice_Number__c": invoice["Invoice_Number__c"],
103
- "Invoice_Amount__c": invoice["Invoice_Amount__c"],
104
- "Invoice_Date__c": inv_date
105
- })
106
- return pd.DataFrame(history)
107
-
108
- # Check for duplicate invoices
109
- def check_data_consistency(history_df, invoice_number, vendor_name):
110
- issues = []
111
  if not history_df.empty:
112
- # Check for duplicates across all vendors
113
- duplicate_invoices = history_df[history_df["Invoice_Number__c"] == invoice_number]
114
- # To check duplicates only within the same vendor, uncomment the following line:
115
- # duplicate_invoices = history_df[(history_df["Invoice_Number__c"] == invoice_number) & (history_df["Vendor_Name__c"] == vendor_name)]
116
  if not duplicate_invoices.empty:
117
- issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
118
- return issues
119
-
120
- # Detect anomalies
121
- def detect_anomalies(history_df, current_amount, current_date):
122
- amount_anomaly = "No anomalies"
123
- frequency_anomaly = "No anomalies"
124
- vendor_pattern_anomaly = "No anomalies"
125
-
126
- # Skip anomaly detection if fewer than 3 data points
127
- if len(history_df) < 3:
128
- return amount_anomaly, frequency_anomaly, vendor_pattern_anomaly, 0, 0, 0
129
-
130
- # Amount Anomaly: Flag if current amount deviates more than 2 std from mean
131
- amounts = history_df["Invoice_Amount__c"].values
132
- mean_amount = np.mean(amounts)
133
- std_amount = np.std(amounts)
134
- amount_score = 0
135
- if std_amount > 0 and (current_amount > mean_amount + 2 * std_amount or current_amount < mean_amount - 2 * std_amount):
136
- amount_anomaly = "Anomaly detected"
137
- amount_score = 30
138
-
139
- # Frequency Anomaly: Flag if frequency > 1 invoice/day or date clustering < 1 day
140
- dates = [d.to_pydatetime() for d in history_df["Invoice_Date__c"]]
141
- days_diff = (max(dates) - min(dates)).days + 1
142
- frequency = len(dates) / days_diff if days_diff > 0 else 0
143
- date_clustering = np.std([(d - min(dates)).days for d in dates]) if len(dates) > 1 else 0
144
- frequency_score = 0
145
- if frequency > 1 or (date_clustering < 1 and date_clustering > 0):
146
- frequency_anomaly = "Anomaly detected"
147
- frequency_score = 25
148
-
149
- # Vendor Pattern Anomaly: Flag if amount deviation is high and invoice count pattern is unusual
150
- vendor_pattern_score = 0
151
- if std_amount > 0 and (current_amount > mean_amount + 2 * std_amount or current_amount < mean_amount - 2 * std_amount):
152
- vendor_pattern_anomaly = "Anomaly detected"
153
- vendor_pattern_score = 25
154
-
155
- return amount_anomaly, frequency_anomaly, vendor_pattern_anomaly, amount_score, frequency_score, vendor_pattern_score
156
-
157
- # Calculate fraud score
158
- def calculate_fraud_score(extracted_data, history_df, consistency_issues):
159
- invoice_amount = extracted_data["total_amount"]
160
- text_length = extracted_data["text_length"]
161
- invoice_number = extracted_data["invoice_number"]
162
- vendor_name = extracted_data["vendor_name"]
163
- invoice_date = extracted_data["invoice_date"]
164
-
165
- # Base score rules
166
- fraud_score = 0
167
- reasoning = []
168
 
169
- # Rule 1: High invoice amount
170
- if invoice_amount > 5000:
171
- fraud_score += 40
172
- reasoning.append("High invoice amount detected.")
173
 
174
- # Rule 2: Low invoice amount
175
- if invoice_amount < 10:
176
- fraud_score += 20
177
- reasoning.append("Unusually low invoice amount.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- # Rule 3: Text length
180
- if text_length < 500:
181
- fraud_score += 0 # No additional score for now
182
 
183
- # Consistency issues
184
- consistency_score = len(consistency_issues) * 15
185
- fraud_score += consistency_score
186
- reasoning.extend(consistency_issues)
187
 
188
- # Anomaly detection
189
- amount_anomaly, frequency_anomaly, vendor_pattern_anomaly, amount_score, frequency_score, vendor_pattern_score = detect_anomalies(
190
- history_df, invoice_amount, invoice_date
191
- )
192
- fraud_score += amount_score + frequency_score + vendor_pattern_score
193
- if amount_score > 0:
 
 
 
194
  reasoning.append("Amount flagged as an anomaly.")
195
- if frequency_score > 0:
 
196
  reasoning.append("Unusual invoice submission frequency or clustering detected.")
197
- if vendor_pattern_score > 0:
 
198
  reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")
199
 
200
- # Cap the fraud score at 100
201
- fraud_score = min(fraud_score, 100)
 
 
 
 
 
 
 
202
 
203
- # Determine status
204
- status = "Flagged" if fraud_score >= 50 else "Cleared"
205
- flagged = fraud_score >= 50
 
 
 
 
 
206
 
207
- if not reasoning:
208
- reasoning.append("No specific fraud indicators detected")
209
 
210
- return {
211
- "fraud_score": fraud_score,
212
- "status": status,
213
- "flagged": flagged,
214
- "amount_anomaly": amount_anomaly,
215
- "frequency_anomaly": frequency_anomaly,
216
- "vendor_pattern_anomaly": vendor_pattern_anomaly,
217
- "reasoning": reasoning
218
  }
 
219
 
220
- # Save to Salesforce
221
- def save_to_salesforce(sf, extracted_data, fraud_results):
222
- # Simulate saving to Salesforce
223
- invoice_history.append({
224
- "Invoice_Number__c": extracted_data["invoice_number"],
225
- "Vendor_Name__c": extracted_data["vendor_name"],
226
- "Invoice_Date__c": extracted_data["invoice_date"].strftime("%Y-%m-%d"),
227
- "Invoice_Amount__c": extracted_data["total_amount"],
228
- "Fraud_Score__c": fraud_results["fraud_score"],
229
- "Status__c": fraud_results["status"]
230
- })
231
-
232
- # Main processing function
233
- def process_invoice(pdf_file):
234
- # Simulate OCR extraction (in practice, use an OCR library like Tesseract)
235
- text = pdf_file # For this example, the text is already provided
236
-
237
- # Connect to Salesforce
238
- sf = connect_to_salesforce()
239
-
240
- # Extract entities
241
- extracted_data = extract_entities(text)
242
-
243
- # Fetch vendor history
244
- history_df = fetch_vendor_history(sf, extracted_data["vendor_name"], extracted_data["invoice_date"])
245
-
246
- # Check data consistency
247
- consistency_issues = check_data_consistency(history_df, extracted_data["invoice_number"], extracted_data["vendor_name"])
248
-
249
- # Calculate fraud score
250
- fraud_results = calculate_fraud_score(extracted_data, history_df, consistency_issues)
251
-
252
- # Save to Salesforce
253
- save_to_salesforce(sf, extracted_data, fraud_results)
254
-
255
- # Prepare output
256
- output = f"""
257
- ## Fraud Detection Summary
258
- - **Invoice Number**: {extracted_data["invoice_number"]}
259
- - **Vendor Name**: {extracted_data["vendor_name"]}
260
- - **Invoice Date**: {extracted_data["invoice_date"].strftime("%Y-%m-%d")}
261
- - **Invoice Amount**: ${extracted_data["total_amount"]:.2f}
262
- - **Fraud Score**: {fraud_results["fraud_score"]}
263
- - **Status**: {fraud_results["status"]}
264
- - **Flagged**: {fraud_results["flagged"]}
265
- - **Amount Anomaly**: {fraud_results["amount_anomaly"]}
266
- - **Frequency Anomaly**: {fraud_results["frequency_anomaly"]}
267
- - **Vendor Pattern Anomaly**: {fraud_results["vendor_pattern_anomaly"]}
268
-
269
- ## Fraud Reasoning
270
- - {". ".join(fraud_results["reasoning"])}.
271
- """
272
- return output
273
-
274
- # Global list to simulate Salesforce database
275
- invoice_history = []
276
-
277
- # Gradio interface
278
- with gr.Blocks() as demo:
279
- gr.Markdown("# Invoice Fraud Detection App")
280
- pdf_input = gr.Textbox(label="Upload Invoice Text (Simulated OCR Output)", placeholder="Paste the OCR-extracted text here...")
281
- output = gr.Markdown()
282
- submit_btn = gr.Button("Process Invoice")
283
- submit_btn.click(fn=process_invoice, inputs=pdf_input, outputs=output)
284
 
285
  if __name__ == "__main__":
286
- demo.launch()
 
1
import os

# Configure environment for CPU-only execution BEFORE importing the ML
# libraries: CUDA_VISIBLE_DEVICES is only reliably honoured if it is set
# before torch/transformers initialise their device context.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU usage
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # Disable oneDNN optimizations

from dotenv import load_dotenv
import logging
import pdfplumber
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import uuid
from datetime import datetime, timedelta
import re
import gradio as gr
from simple_salesforce import Salesforce, SalesforceAuthenticationFailed

# Load environment variables (SF_* credentials) from a local .env file.
load_dotenv()

# Suppress noisy transformers warnings.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Salesforce credentials come from the environment; never hard-coded.
SF_USERNAME = os.getenv("SF_USERNAME")
SF_PASSWORD = os.getenv("SF_PASSWORD")
SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")

# SECURITY FIX: do not print the Salesforce username -- that leaks account
# information into stdout/log aggregation. Only report whether the
# configuration is present.
print(f"Salesforce credentials configured: {bool(SF_USERNAME and SF_PASSWORD and SF_SECURITY_TOKEN)}")

# Salesforce connection with error handling; `sf` stays None on auth failure
# so downstream code can degrade gracefully instead of crashing at import.
try:
    sf = Salesforce(
        username=SF_USERNAME,
        password=SF_PASSWORD,
        security_token=SF_SECURITY_TOKEN
    )
    print("Salesforce login successful.")
except SalesforceAuthenticationFailed as e:
    print(f"Salesforce authentication failed: {e}")
    sf = None

# Hugging Face NER pipeline used as a fallback vendor-name extractor
# in extract_entities(); device=-1 forces CPU inference.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
47
+
48
def extract_text_from_pdf(pdf_file):
    """Extract all text from a PDF invoice.

    Args:
        pdf_file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        str: Concatenated text of all pages (one trailing newline per page),
        or a string starting with ``"Error extracting text:"`` when
        extraction fails. Callers rely on that sentinel prefix.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Pages with no extractable text yield None -> substitute "".
            pages = [page.extract_text() or "" for page in pdf.pages]
        text = "".join(page_text + "\n" for page_text in pages)
        # SECURITY/NOISE FIX: log only the size, not the full invoice text,
        # which may contain sensitive vendor/financial data.
        print(f"Extracted {len(text)} characters from PDF")  # Debug
        return text
    except Exception as e:
        # Broad catch is deliberate: pdfplumber can raise many error types,
        # and the UI expects an error string rather than an exception.
        return f"Error extracting text: {str(e)}"
60
 
 
61
  def extract_entities(text):
62
+ """Extract structured invoice details using flexible regex patterns."""
63
+ invoice_number = "Unknown"
64
+ vendor_name = "Unknown"
65
+ invoice_date = datetime.now().date()
66
+ total_amount = 0.0
67
+
68
+ # Flexible regex patterns to handle variations
69
+ invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)\s*[:\-\s]*)([\w-]+)"
70
+ vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From)\s*[:\-\s]*([A-Za-z\s&\.]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
71
+ invoice_date_pattern = r"(?:Invoice\s*Date\s*[:\-\s]*|Date\s*[:\-\s]*)(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
72
+ total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?\s*[:\-\s]*\$?)([\d,]+\.?\d*)"
73
+
74
+ # Invoice Number
75
+ invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
76
+ if invoice_num_match:
77
+ invoice_number = invoice_num_match.group(1)
78
+ print(f"Matched Invoice Number: {invoice_number}") # Debug
79
+
80
+ # Vendor Name
81
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
82
  if vendor_match:
83
  vendor_name = vendor_match.group(1).strip()
84
+ print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
85
  else:
86
+ # Enhanced NER fallback for multi-word organization names
87
+ ner_results = ner_pipeline(text)
88
+ org_name_parts = []
89
+ for i, entity in enumerate(ner_results):
90
+ if entity['entity'].startswith('B-ORG'):
91
+ org_name_parts = [entity['word']]
92
+ elif entity['entity'].startswith('I-ORG') and org_name_parts:
93
+ org_name_parts.append(entity['word'])
94
+ if org_name_parts:
95
+ vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
96
+ print(f"NER Matched Vendor Name: {vendor_name}") # Debug
97
+
98
+ # Invoice Date
99
+ invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
100
+ if invoice_date_match:
101
+ date_str = invoice_date_match.group(1)
102
+ try:
103
+ if "/" in date_str:
104
+ invoice_date = datetime.strptime(date_str, "%m/%d/%Y").date()
105
+ elif "," in date_str:
106
+ invoice_date = datetime.strptime(date_str, "%B %d, %Y").date()
107
+ elif "-" in date_str:
108
+ try:
109
+ invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
110
+ except ValueError:
111
+ invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
112
+ print(f"Matched Invoice Date: {invoice_date}") # Debug
113
+ except ValueError as e:
114
+ print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
115
+
116
+ # Total Amount
117
+ total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
118
+ if total_amount_match:
119
+ total_amount = float(total_amount_match.group(1).replace(",", ""))
120
+ print(f"Matched Total Amount: {total_amount}") # Debug
121
+
122
+ return invoice_number, vendor_name, invoice_date, total_amount
123
+
124
def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
    """Fetch recent historical invoices for a vendor from Salesforce.

    Args:
        vendor_name: Vendor to look up (exact match on Vendor_Name__c).
        invoice_number: Unused here; kept for interface compatibility.
        time_window_days: How far back to search (default 30 days).

    Returns:
        pandas.DataFrame: Up to 100 matching Invoice_Record__c rows with
        Invoice_Date__c normalised to ``datetime.date``; empty when
        Salesforce is unavailable or the query fails.
    """
    if sf is None:
        return pd.DataFrame()

    try:
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=time_window_days)

        # SECURITY FIX: escape backslashes and single quotes before
        # interpolating the vendor name into SOQL, so names containing
        # quotes (e.g. "O'Brien & Co") cannot break or inject the query.
        safe_vendor = vendor_name.replace("\\", "\\\\").replace("'", "\\'")

        query = f"""
            SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
            FROM Invoice_Record__c
            WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
            AND Vendor_Name__c = '{safe_vendor}'
            LIMIT 100
        """
        result = sf.query(query)
        records = result['records']

        history_df = pd.DataFrame(records)
        if not history_df.empty:
            # Normalise Salesforce ISO date strings to datetime.date.
            history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
        return history_df
    except Exception as e:
        # Best-effort: fraud scoring still runs with no history.
        print(f"Failed to fetch vendor history: {str(e)}")
        return pd.DataFrame()
150
+
151
def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
    """Return a list of data-consistency problems for this invoice.

    Currently the only check is a duplicate invoice number within the
    vendor's fetched history.

    Args:
        invoice_number: Number extracted from the current invoice.
        vendor_name: Vendor the invoice was attributed to.
        invoice_date: Extracted invoice date (unused by the current check).
        history_df: DataFrame of historical invoices (may be empty).

    Returns:
        list[str]: Human-readable issue descriptions; empty when clean.
    """
    if history_df.empty:
        return []

    has_duplicate = (history_df['Invoice_Number__c'] == invoice_number).any()
    if not has_duplicate:
        return []

    return [f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."]
 
 
 
161
 
162
def detect_anomalies(df, history_df):
    """Detect anomalies in amount, frequency, and vendor patterns.

    Mutates and returns ``df`` (a single-row DataFrame for the current
    invoice) with three flag columns following the IsolationForest
    convention: -1 = anomaly, 1 = normal, 0 = not evaluated.

    NOTE(review): each IsolationForest below is fit on a *single* sample
    (the current invoice / one aggregate row), so its -1/1 output is not a
    statistically meaningful outlier decision -- confirm intent.

    NOTE(review): ``history_df['Invoice_Date__c']`` is converted to
    pandas timestamps in place, mutating the caller's DataFrame.
    """
    # Default: 0 = "not evaluated" (distinct from 1 = "normal").
    df["is_amount_anomaly"] = 0
    df["is_frequency_anomaly"] = 0
    df["is_vendor_pattern_anomaly"] = 0

    if not df.empty:
        # Amount check: scale and score the current amount.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df[["amount"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_amount_anomaly"] = model.fit_predict(X_scaled)

    if not history_df.empty:
        # Frequency check: invoices per day over the observed date span,
        # plus the spread (std dev) of submission days.
        history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c'])
        date_range = (history_df['Invoice_Date__c'].max() - history_df['Invoice_Date__c'].min()).days + 1
        frequency = len(history_df) / max(date_range, 1)

        date_diffs = [(d - history_df['Invoice_Date__c'].min()).days for d in history_df['Invoice_Date__c']]
        date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0

        frequency_df = pd.DataFrame({
            "frequency": [frequency],
            "date_clustering": [date_clustering]
        })
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        # fit_predict returns an array; the single row's label is taken.
        df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        # No history: treat frequency as normal (1), not anomalous.
        df["is_frequency_anomaly"] = 1

    if not history_df.empty and len(history_df) > 1:
        # Vendor-pattern check: deviation of the current amount from the
        # vendor's historical mean (in std-dev units), invoice count, and
        # amount variance, scored together.
        historical_amounts = history_df["Invoice_Amount__c"].astype(float)
        mean_amount = historical_amounts.mean()
        std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
        amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0

        current_amount = df["amount"].iloc[0]
        # Guard against division by zero when all history amounts are equal.
        deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
        invoice_count = len(history_df)

        vendor_pattern_df = pd.DataFrame({
            "amount_deviation": [deviation],
            "invoice_count": [invoice_count],
            "amount_variance": [amount_variance]
        })
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        # Insufficient history for a pattern: treat as normal (1).
        df["is_vendor_pattern_anomaly"] = 1

    return df
 
 
216
 
217
def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues):
    """Score an invoice from 0 to 100 using rule weights.

    Args:
        amount: Invoice total.
        is_amount_anomaly / is_frequency_anomaly / is_vendor_pattern_anomaly:
            IsolationForest labels (-1 = anomaly, otherwise normal).
        text_length: Character count of the extracted invoice text.
        consistency_issues: List of consistency-problem descriptions.

    Returns:
        tuple: ``(score, reasoning)`` where score is capped at 100 and
        reasoning lists the human-readable rules that fired.
    """
    reasons = []
    total = 0.0

    # Amount heuristics: large invoices weigh heaviest; suspiciously tiny
    # invoices also add risk.
    if amount > 5000:
        total += 40
        reasons.append("High invoice amount detected.")
    elif amount < 10:
        total += 20
        reasons.append("Unusually low invoice amount.")

    # Anomaly flags, each with its own weight and message.
    anomaly_rules = (
        (is_amount_anomaly, 30, "Amount flagged as an anomaly."),
        (is_frequency_anomaly, 25, "Unusual invoice submission frequency or clustering detected."),
        (is_vendor_pattern_anomaly, 25, "Unusual vendor pattern detected (amount deviation, frequency, or variance)."),
    )
    for flag, weight, message in anomaly_rules:
        if flag == -1:  # IsolationForest convention: -1 marks an outlier
            total += weight
            reasons.append(message)

    # Unusually long invoice text is a mild risk signal.
    if text_length > 500:
        total += 10
        reasons.append("Excessive text length in invoice.")

    # Each consistency issue (e.g. duplicate number) adds a fixed penalty.
    if consistency_issues:
        total += 15 * len(consistency_issues)
        reasons.extend(consistency_issues)

    return min(total, 100), reasons
248
 
249
def process_invoice(pdf_file):
    """Run the full fraud-detection pipeline on one invoice PDF.

    Steps: text extraction -> entity extraction -> vendor history lookup ->
    consistency checks -> anomaly detection -> fraud scoring -> best-effort
    persistence to Salesforce.

    Args:
        pdf_file: Uploaded PDF path/file object.

    Returns:
        str: Markdown summary of the fraud-detection results, or an error
        message when text extraction failed.
    """
    text = extract_text_from_pdf(pdf_file)
    # BUGFIX: match the extractor's exact error sentinel. The previous
    # check (`"Error" in text`) rejected any legitimate invoice whose text
    # happened to contain the word "Error".
    if text.startswith("Error extracting text"):
        return f"**Error**: {text}"

    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
    text_length = len(text)

    history_df = fetch_vendor_history(vendor_name, invoice_number)
    consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)

    # Single-row frame consumed by the anomaly detectors.
    data = {
        "invoice_id": str(uuid.uuid4()),
        "invoice_number": invoice_number,
        "vendor_name": vendor_name,
        "amount": total_amount,
        "invoice_date": invoice_date,
        "text_length": text_length
    }
    df = pd.DataFrame([data])

    df = detect_anomalies(df, history_df)

    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0],
        df["is_amount_anomaly"].iloc[0],
        df["is_frequency_anomaly"].iloc[0],
        df["is_vendor_pattern_anomaly"].iloc[0],
        text_length,
        consistency_issues
    )

    # IsolationForest convention: -1 marks an anomaly.
    output = [
        "## Fraud Detection Summary",
        f"- **Invoice Number**: {invoice_number}",
        f"- **Vendor Name**: {vendor_name}",
        f"- **Invoice Date**: {invoice_date}",
        f"- **Invoice Amount**: ${total_amount:,.2f}",
        f"- **Fraud Score**: {fraud_score}",
        f"- **Status**: {'Flagged' if fraud_score > 50 else 'Cleared'}",
        f"- **Flagged**: {fraud_score > 50}",
        f"- **Amount Anomaly**: {'Anomaly detected' if df['is_amount_anomaly'].iloc[0] == -1 else 'No anomalies'}",
        f"- **Frequency Anomaly**: {'Anomaly detected' if df['is_frequency_anomaly'].iloc[0] == -1 else 'No anomalies'}",
        f"- **Vendor Pattern Anomaly**: {'Anomaly detected' if df['is_vendor_pattern_anomaly'].iloc[0] == -1 else 'No anomalies'}",
        "",
        "## Fraud Reasoning"
    ]

    if fraud_reasoning:
        output.extend(f"- {reason}" for reason in fraud_reasoning)
    else:
        output.append("- No specific fraud indicators detected")

    # Best-effort persistence: the scored result is still returned to the
    # user when no Salesforce connection exists or the write fails.
    if sf is not None:
        try:
            sf.Invoice_Record__c.create({
                "Invoice_Number__c": invoice_number,
                "Vendor_Name__c": vendor_name,
                "Invoice_Amount__c": total_amount,
                "Invoice_Date__c": str(invoice_date),
                "Fraud_Score__c": fraud_score,
                "Fraud_Reason__c": "; ".join(fraud_reasoning),
                "Flagged__c": fraud_score > 50,
                "Status__c": "Flagged" if fraud_score > 50 else "Cleared"
            })
        except Exception as e:
            print(f"Failed to create Salesforce record: {str(e)}")

    return "\n".join(output)
320
+
321
def gradio_interface(pdf_file):
    """Gradio callback: run fraud detection on the uploaded PDF, if any.

    Args:
        pdf_file: File object from the Gradio ``File`` component, or None
            when the upload was cleared.

    Returns:
        str: Markdown results, or a prompt asking for an upload.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    return process_invoice(pdf_file)
327
+
328
# Build the Gradio UI. The CSS rule hides Gradio's built-in share
# link/icon in the rendered page.
with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
    gr.Markdown("# Invoice Fraud Detection")
    with gr.Row():
        file_input = gr.File(label="Upload Invoice PDF")
        result_output = gr.Markdown(label="Fraud Detection Results")
    # Detection runs automatically whenever the uploaded file changes;
    # there is no explicit submit button.
    file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)


if __name__ == "__main__":
    iface.launch()