Abhisesh7 committed on
Commit
087f2ac
·
verified ·
1 Parent(s): b03d28f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +295 -487
app.py CHANGED
@@ -20,16 +20,15 @@ load_dotenv()
20
  os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
21
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
22
 
23
- # Set up logging
24
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
- logger = logging.getLogger(__name__)
26
 
27
  # Read Salesforce credentials from environment variables
28
  SF_USERNAME = os.getenv("SF_USERNAME")
29
  SF_PASSWORD = os.getenv("SF_PASSWORD")
30
  SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
31
 
32
- logger.info(f"Salesforce login info: username={SF_USERNAME}")
33
 
34
  # Salesforce connection with error handling
35
  try:
@@ -38,221 +37,158 @@ try:
38
  password=SF_PASSWORD,
39
  security_token=SF_SECURITY_TOKEN
40
  )
41
- logger.info("Salesforce login successful.")
42
  except SalesforceAuthenticationFailed as e:
43
- logger.error(f"Salesforce authentication failed: {e}")
44
- sf = None
45
- except Exception as e:
46
- logger.error(f"Unexpected error during Salesforce connection: {e}")
47
  sf = None
48
 
49
  # Initialize Hugging Face NER pipeline (force CPU)
50
- try:
51
- ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
52
- logger.info("NER pipeline initialized successfully.")
53
- except Exception as e:
54
- logger.error(f"Failed to initialize NER pipeline: {e}")
55
- ner_pipeline = None
56
 
57
  def extract_text_from_pdf(pdf_file):
58
- """Extract text from a PDF invoice with error handling."""
59
  try:
60
  with pdfplumber.open(pdf_file) as pdf:
61
  text = ""
62
  for page in pdf.pages:
63
  page_text = page.extract_text() or ""
64
  text += page_text + "\n"
65
- logger.info("Extracted text:\n%s", text)
66
  return text
67
  except Exception as e:
68
- logger.error("Error extracting text: %s", str(e))
69
  return f"Error extracting text: {str(e)}"
70
 
71
  def extract_items(text):
72
- """Extract items from the invoice table step by step with enhanced robustness."""
73
  items = []
74
- try:
75
- # Replace escaped dollar signs and normalize text
76
- text = text.replace(r'\$', '$')
77
- text = re.sub(r'\s+', ' ', text) # Normalize spaces
78
- logger.info("Step 1: Splitting text into lines")
79
- lines = text.split('\n')
80
- logger.info("Text split into lines: %s", lines)
81
-
82
- # Step 2: Find the table header
83
- logger.info("Step 2: Locating table header")
84
- table_start = -1
85
- for i, line in enumerate(lines):
86
- line = line.strip()
87
- if not line:
88
- continue
89
- # Look for common table header patterns
90
- if ("Item Description" in line or "Description" in line) and "Quantity" in line and "Price" in line:
91
- table_start = i + 1
92
- logger.info("Table header found at line %d: %s", i, line)
93
- break
94
-
95
- if table_start == -1:
96
- logger.warning("Table header not found.")
97
- return items
98
-
99
- # Step 3: Find the end of the table
100
- logger.info("Step 3: Locating table end")
101
- table_end = len(lines)
102
- for i in range(table_start, len(lines)):
103
- line = lines[i].strip()
104
- if not line:
105
- continue
106
- if re.search(r"Total\s*(Amount|Due|Price|Cost)", line, re.IGNORECASE):
107
- table_end = i
108
- logger.info("Table end found at line %d: %s", i, line)
109
- break
110
-
111
- logger.info("Table section identified: lines %d to %d", table_start, table_end-1)
112
- table_lines = lines[table_start:table_end]
113
- logger.info("Table lines: %s", table_lines)
114
-
115
- # Step 4: Process each row step by step
116
- logger.info("Step 4: Processing table rows one by one")
117
- # Enhanced regex to handle variations (e.g., missing pipes, extra spaces)
118
- table_row_pattern = r"(?:\|?\s*|\b)([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|?\s*|\s+)(\d+)\s*(?:\|?\s*|\s+)([\d.]+)\s*(?:\|?\s*|\s+)([\d.]+)(?:\s*\|?\s*|\b)"
119
-
120
- for row_idx, line in enumerate(table_lines, 1):
121
- line = line.strip()
122
- if not line:
123
- logger.info("Row %d: Skipping empty row", row_idx)
124
- continue
125
-
126
- # Skip alignment rows (e.g., "|---|---|")
127
- if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
128
- logger.info("Row %d: Skipping alignment row: %s", row_idx, line)
129
- continue
130
-
131
- # Replace alignment markers in the row
132
- line = re.sub(r'\|\s*---\s*\|', '|', line)
133
- logger.info("Row %d: Processing row: %s", row_idx, line)
134
-
135
- # Step 4a: Apply regex to extract item details
136
- match = re.match(table_row_pattern, line)
137
- if not match:
138
- logger.warning("Row %d: Failed to match row: %s", row_idx, line)
139
- continue
140
-
141
- # Step 4b: Extract and validate item details
142
  description = match.group(1).strip()
143
- try:
144
- quantity = int(match.group(2))
145
- unit_price = float(match.group(3))
146
- total_price = float(match.group(4))
147
- except ValueError as e:
148
- logger.warning("Row %d: Failed to parse numbers: %s", row_idx, str(e))
149
- continue
150
-
151
- # Step 4c: Validate the extracted values
152
- if quantity <= 0 or unit_price < 0 or total_price < 0:
153
- logger.warning("Row %d: Invalid values (non-positive quantity, negative unit price, or total price): %s", row_idx, line)
154
- continue
155
-
156
- # Check if total_price ≈ quantity × unit_price
157
- expected_total = quantity * unit_price
158
- if abs(expected_total - total_price) > 0.01:
159
- logger.warning("Row %d: Total price mismatch: Expected %.2f, Got %.2f for %s", row_idx, expected_total, total_price, description)
160
- continue
161
-
162
- # Step 4d: Add the item to the list
163
- item = {
164
  "description": description,
165
  "quantity": quantity,
166
  "unit_price": unit_price,
167
  "total_price": total_price
168
- }
169
- items.append(item)
170
- logger.info("Row %d: Successfully extracted item: %s, Qty: %d, Unit Price: $%.2f, Total Price: $%.2f",
171
- row_idx, description, quantity, unit_price, total_price)
172
-
173
- # Step 5: Return the extracted items
174
- logger.info("Step 5: Extraction complete. Total items extracted: %d", len(items))
175
- return items
176
 
177
- except Exception as e:
178
- logger.error("Unexpected error in extract_items: %s", str(e))
179
- return items
180
 
181
  def extract_entities(text):
182
- """Extract structured invoice details with enhanced robustness."""
183
  invoice_number = "Unknown"
184
  vendor_name = "Unknown"
185
  invoice_date = datetime.now().date()
186
  total_amount = 0.0
187
 
188
- try:
189
- # Invoice Number
190
- invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
191
- invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
192
- if invoice_num_match:
193
- invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
194
- logger.info("Matched Invoice Number: %s", invoice_number)
195
-
196
- # Vendor Name
197
- vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
198
- vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
199
- if vendor_match:
200
- vendor_name = vendor_match.group(1).strip()
201
- logger.info("Matched Vendor Name (Regex): %s", vendor_name)
202
- elif ner_pipeline:
203
- try:
204
- ner_results = ner_pipeline(text)
205
- org_name_parts = []
206
- for i, entity in enumerate(ner_results):
207
- if entity['entity'].startswith('B-ORG'):
208
- org_name_parts = [entity['word']]
209
- elif entity['entity'].startswith('I-ORG') and org_name_parts:
210
- org_name_parts.append(entity['word'])
211
- if org_name_parts:
212
- vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
213
- logger.info("NER Matched Vendor Name: %s", vendor_name)
214
- except Exception as e:
215
- logger.warning("NER failed for vendor name: %s", str(e))
216
-
217
- # Invoice Date
218
- # Support multiple date formats
219
- invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|\d{1,2}\s*[A-Za-z]+\s*\d{4})"
220
- invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
221
- if invoice_date_match:
222
- date_str = invoice_date_match.group(1)
223
- date_formats = [
224
- "%Y-%m-%d", "%d/%m/%Y", "%d-%m-%Y", "%B %d, %Y", "%d %B %Y"
225
- ]
226
- for date_format in date_formats:
 
227
  try:
228
- invoice_date = datetime.strptime(date_str, date_format).date()
229
- logger.info("Matched Invoice Date: %s", invoice_date)
230
- break
231
  except ValueError:
232
- continue
233
- else:
234
- logger.warning("Failed to parse Invoice Date: %s", date_str)
235
-
236
- # Total Amount
237
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
238
- total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
239
- if total_amount_match:
240
- try:
241
- total_amount = float(total_amount_match.group(1).replace(",", ""))
242
- logger.info("Matched Total Amount: %.2f", total_amount)
243
- except ValueError as e:
244
- logger.warning("Failed to parse Total Amount: %s", str(e))
245
-
246
- return invoice_number, vendor_name, invoice_date, total_amount
247
 
248
- except Exception as e:
249
- logger.error("Unexpected error in extract_entities: %s", str(e))
250
- return invoice_number, vendor_name, invoice_date, total_amount
 
 
 
 
251
 
252
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
253
  """Fetch historical invoices for the vendor from Salesforce."""
254
  if sf is None:
255
- logger.warning("Salesforce connection not available.")
256
  return pd.DataFrame()
257
 
258
  try:
@@ -260,7 +196,7 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
260
  start_date = end_date - timedelta(days=time_window_days)
261
 
262
  query = f"""
263
- SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c, Items_Selected__c
264
  FROM Invoice_Record__c
265
  WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
266
  AND Vendor_Name__c = '{vendor_name}'
@@ -272,337 +208,209 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
272
  history_df = pd.DataFrame(records)
273
  if not history_df.empty:
274
  history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
275
- logger.info("Fetched %d historical records for vendor %s", len(history_df), vendor_name)
276
- else:
277
- logger.info("No historical records found for vendor %s", vendor_name)
278
  return history_df
279
-
280
  except Exception as e:
281
- logger.error("Failed to fetch vendor history: %s", str(e))
282
  return pd.DataFrame()
283
 
284
  def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
285
  """Check for data consistency issues like duplicates."""
286
  consistency_issues = []
287
 
288
- try:
289
- if not history_df.empty:
290
- duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
291
- if not duplicate_invoices.empty:
292
- issue = f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."
293
- consistency_issues.append(issue)
294
- logger.warning(issue)
295
- return consistency_issues
296
- except Exception as e:
297
- logger.error("Error in check_data_consistency: %s", str(e))
298
- return consistency_issues
299
-
300
- def parse_items_to_features(items_str):
301
- """Parse the Items_Selected__c field into features for anomaly detection."""
302
- try:
303
- if not items_str or items_str == "No items found":
304
- return 0, 0, 0
305
 
306
- max_quantity = 0
307
- total_unit_price = 0.0
308
- total_items = 0
309
 
310
- items = items_str.split("; ")
311
- for item in items:
312
- if not item:
313
- continue
314
- try:
315
- quantity_match = re.search(r"Quantity (\d+)", item)
316
- unit_price_match = re.search(r"Unit Price \$([\d.]+)", item)
317
- if quantity_match and unit_price_match:
318
- quantity = int(quantity_match.group(1))
319
- unit_price = float(unit_price_match.group(1))
320
- max_quantity = max(max_quantity, quantity)
321
- total_unit_price += unit_price
322
- total_items += 1
323
- except Exception as e:
324
- logger.warning("Error parsing item '%s': %s", item, str(e))
325
- continue
326
-
327
- avg_unit_price = total_unit_price / total_items if total_items > 0 else 0
328
- return max_quantity, avg_unit_price, total_items
329
-
330
- except Exception as e:
331
- logger.error("Error in parse_items_to_features: %s", str(e))
332
- return 0, 0, 0
333
-
334
- def detect_anomalies(df, history_df, items):
335
- """Detect anomalies with improved handling for small datasets."""
336
  df["is_amount_anomaly"] = 0
337
  df["is_frequency_anomaly"] = 0
338
  df["is_vendor_pattern_anomaly"] = 0
339
- df["is_item_anomaly"] = 0
340
 
341
- try:
342
- # Amount anomaly detection
343
- if not history_df.empty:
344
- historical_amounts = history_df["Invoice_Amount__c"].astype(float).values
345
- current_amount = df["amount"].iloc[0]
346
- amounts = np.append(historical_amounts, current_amount)
347
- if len(amounts) > 1: # Need at least 2 data points for meaningful anomaly detection
348
- amounts_df = pd.DataFrame({"amount": amounts})
349
- scaler = StandardScaler()
350
- X_scaled = scaler.fit_transform(amounts_df[["amount"]])
351
- model = IsolationForest(contamination=0.05, random_state=42)
352
- predictions = model.fit_predict(X_scaled)
353
- df["is_amount_anomaly"] = predictions[-1]
354
- logger.info("Amount anomaly detection completed: %d", df["is_amount_anomaly"].iloc[0])
355
- else:
356
- logger.info("Not enough data for amount anomaly detection.")
357
-
358
- # Frequency anomaly detection
359
- if not history_df.empty:
360
- history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c'])
361
- date_range = (history_df['Invoice_Date__c'].max() - history_df['Invoice_Date__c'].min()).days + 1
362
- frequency = len(history_df) / max(date_range, 1)
363
-
364
- date_diffs = [(d - history_df['Invoice_Date__c'].min()).days for d in history_df['Invoice_Date__c']]
365
- date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
366
-
367
- frequency_df = pd.DataFrame({
368
- "frequency": [frequency],
369
- "date_clustering": [date_clustering]
370
- })
371
- scaler = StandardScaler()
372
- X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
373
- model = IsolationForest(contamination=0.05, random_state=42)
374
- df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
375
- logger.info("Frequency anomaly detection completed: %d", df["is_frequency_anomaly"].iloc[0])
376
- else:
377
- df["is_frequency_anomaly"] = 1
378
- logger.info("No historical data for frequency anomaly detection.")
379
-
380
- # Vendor pattern anomaly detection
381
- if not history_df.empty and len(history_df) > 1:
382
- historical_amounts = history_df["Invoice_Amount__c"].astype(float)
383
- mean_amount = historical_amounts.mean()
384
- std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
385
- amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0
386
-
387
- current_amount = df["amount"].iloc[0]
388
- deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
389
- invoice_count = len(history_df)
390
-
391
- vendor_pattern_df = pd.DataFrame({
392
- "amount_deviation": [deviation],
393
- "invoice_count": [invoice_count],
394
- "amount_variance": [amount_variance]
395
- })
396
- scaler = StandardScaler()
397
- X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
398
- model = IsolationForest(contamination=0.05, random_state=42)
399
- df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
400
- logger.info("Vendor pattern anomaly detection completed: %d", df["is_vendor_pattern_anomaly"].iloc[0])
401
- else:
402
- df["is_vendor_pattern_anomaly"] = 1
403
- logger.info("Not enough data for vendor pattern anomaly detection.")
404
-
405
- # Item-level anomaly detection
406
- if not history_df.empty:
407
- historical_max_quantities = []
408
- historical_avg_unit_prices = []
409
- historical_total_items = []
410
- for items_str in history_df["Items_Selected__c"]:
411
- max_qty, avg_price, total_items = parse_items_to_features(items_str)
412
- historical_max_quantities.append(max_qty)
413
- historical_avg_unit_prices.append(avg_price)
414
- historical_total_items.append(total_items)
415
-
416
- current_max_quantity = max(item["quantity"] for item in items) if items else 0
417
- current_avg_unit_price = sum(item["unit_price"] for item in items) / len(items) if items else 0
418
- current_total_items = len(items)
419
-
420
- item_features = pd.DataFrame({
421
- "max_quantity": historical_max_quantities + [current_max_quantity],
422
- "avg_unit_price": historical_avg_unit_prices + [current_avg_unit_price],
423
- "total_items": historical_total_items + [current_total_items]
424
- })
425
-
426
- if len(item_features) > 1:
427
- scaler = StandardScaler()
428
- X_scaled = scaler.fit_transform(item_features[["max_quantity", "avg_unit_price", "total_items"]])
429
- model = IsolationForest(contamination=0.05, random_state=42)
430
- predictions = model.fit_predict(X_scaled)
431
- df["is_item_anomaly"] = predictions[-1]
432
- logger.info("Item anomaly detection completed: %d", df["is_item_anomaly"].iloc[0])
433
- else:
434
- logger.info("Not enough data for item anomaly detection.")
435
 
436
- return df
437
-
438
- except Exception as e:
439
- logger.error("Error in detect_anomalies: %s", str(e))
440
- return df
441
-
442
- def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, is_item_anomaly, text_length, consistency_issues, invoice_date, items):
443
- """Calculate fraud score with additional validation rules."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  score = 0.0
445
  reasoning = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
- try:
448
- today = datetime.now().date()
449
-
450
- if amount > 5000:
451
- score += 40
452
- reasoning.append("High invoice amount detected.")
453
- elif amount < 10:
454
- score += 20
455
- reasoning.append("Unusually low invoice amount.")
456
-
457
- if invoice_date > today:
458
- score += 10
459
- reasoning.append("Invoice date is in the future.")
460
-
461
- if is_amount_anomaly == -1:
462
- score += 30
463
- reasoning.append("Amount flagged as an anomaly.")
464
- if is_frequency_anomaly == -1:
465
- score += 25
466
- reasoning.append("Unusual invoice submission frequency or clustering detected.")
467
- if is_vendor_pattern_anomaly == -1:
468
- score += 25
469
- reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")
470
- if is_item_anomaly == -1:
471
- score += 20
472
- reasoning.append("Unusual item patterns detected (quantity, unit price, or number of items).")
473
-
474
- if text_length > 500:
475
- score += 5
476
- reasoning.append("Excessive text length in invoice.")
477
-
478
- if consistency_issues:
479
- score += 15 * len(consistency_issues)
480
- reasoning.extend(consistency_issues)
481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  for item in items:
483
- # High quantity rule
484
- if item["quantity"] > 10:
485
- score += 10
486
- reasoning.append(f"High quantity detected for item '{item['description']}' (Quantity: {item['quantity']}).")
487
- break
488
- # New rule: High unit price
489
- if item["unit_price"] > 500:
490
- score += 15
491
- reasoning.append(f"High unit price detected for item '{item['description']}' (Unit Price: ${item['unit_price']:.2f}).")
492
- break
493
-
494
- fraud_score = min(score, 100)
495
- logger.info("Fraud score calculated: %.2f with reasons: %s", fraud_score, reasoning)
496
- return fraud_score, reasoning
497
-
498
- except Exception as e:
499
- logger.error("Error in calculate_fraud_score: %s", str(e))
500
- return score, reasoning
501
-
502
- def process_invoice(pdf_file):
503
- """Process a single invoice PDF with comprehensive error handling."""
504
- try:
505
- text = extract_text_from_pdf(pdf_file)
506
- if "Error" in text:
507
- return f"**Error**: {text}"
508
-
509
- invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
510
- items = extract_items(text)
511
- text_length = len(text)
512
-
513
- history_df = fetch_vendor_history(vendor_name, invoice_number)
514
- consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
515
-
516
- data = {
517
- "invoice_id": str(uuid.uuid4()),
518
- "invoice_number": invoice_number,
519
- "vendor_name": vendor_name,
520
- "amount": total_amount,
521
- "invoice_date": invoice_date,
522
- "text_length": text_length
523
- }
524
- df = pd.DataFrame([data])
525
-
526
- df = detect_anomalies(df, history_df, items)
527
-
528
- fraud_score, fraud_reasoning = calculate_fraud_score(
529
- df["amount"].iloc[0],
530
- df["is_amount_anomaly"].iloc[0],
531
- df["is_frequency_anomaly"].iloc[0],
532
- df["is_vendor_pattern_anomaly"].iloc[0],
533
- df["is_item_anomaly"].iloc[0],
534
- text_length,
535
- consistency_issues,
536
- invoice_date,
537
- items
538
- )
539
-
540
- items_str = "; ".join(
541
- f"{item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}"
542
- for item in items
543
- ) if items else "No items found"
544
-
545
- output = [
546
- "## Fraud Detection Summary",
547
- f"- **Invoice Number**: {invoice_number}",
548
- f"- **Vendor Name**: {vendor_name}",
549
- f"- **Invoice Date**: {invoice_date}",
550
- f"- **Invoice Amount**: ${total_amount:,.2f}",
551
- "- **Items Selected**:",
552
- ]
553
-
554
- if items:
555
- for item in items:
556
- output.append(f" - {item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}")
557
- else:
558
- output.append(" - No items found")
559
-
560
- output.extend([
561
- f"- **Fraud Score**: {fraud_score}",
562
- f"- **Status**: {'Flagged' if fraud_score > 50 else 'Cleared'}",
563
- f"- **Flagged**: {fraud_score > 50}",
564
- "",
565
- "## Fraud Reasoning"
566
- ])
567
-
568
- if fraud_reasoning:
569
- output.extend([f"- {reason}" for reason in fraud_reasoning])
570
- else:
571
- output.append("- No specific fraud indicators detected")
572
-
573
- if sf is not None:
574
- try:
575
- sf.Invoice_Record__c.create({
576
- "Invoice_Number__c": invoice_number if invoice_number != "Unknown" else "",
577
- "Vendor_Name__c": vendor_name if vendor_name != "Unknown" else "",
578
- "Invoice_Amount__c": float(total_amount) if total_amount is not None else 0.0,
579
- "Invoice_Date__c": str(invoice_date) if invoice_date else "",
580
- "Fraud_Score__c": float(fraud_score) if fraud_score is not None else 0.0,
581
- "Fraud_Reason__c": "; ".join(fraud_reasoning) if fraud_reasoning else "",
582
- "Flagged__c": fraud_score > 50,
583
- "Status__c": "Flagged" if fraud_score > 50 else "Cleared",
584
- "Items_Selected__c": items_str
585
- })
586
- logger.info("Successfully created Salesforce record with Items_Selected__c: %s", items_str)
587
- except Exception as e:
588
- logger.error("Failed to create Salesforce record: %s", str(e))
589
-
590
- return "\n".join(output)
591
 
592
- except Exception as e:
593
- logger.error("Unexpected error in process_invoice: %s", str(e))
594
- return f"**Error**: An unexpected error occurred: {str(e)}"
595
 
596
  def gradio_interface(pdf_file):
597
  """Gradio interface to process uploaded PDF and display structured results."""
598
  if pdf_file is None:
599
  return "Please upload a PDF file."
600
- try:
601
- result = process_invoice(pdf_file)
602
- return result
603
- except Exception as e:
604
- logger.error("Error in gradio_interface: %s", str(e))
605
- return f"**Error**: {str(e)}"
606
 
607
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
608
  gr.Markdown("# Invoice Fraud Detection")
 
20
  os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
21
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
22
 
23
+ # Set up logging to suppress transformers warnings
24
+ logging.getLogger("transformers").setLevel(logging.ERROR)
 
25
 
26
  # Read Salesforce credentials from environment variables
27
  SF_USERNAME = os.getenv("SF_USERNAME")
28
  SF_PASSWORD = os.getenv("SF_PASSWORD")
29
  SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
30
 
31
+ print(f"Salesforce login info: username={SF_USERNAME}")
32
 
33
  # Salesforce connection with error handling
34
  try:
 
37
  password=SF_PASSWORD,
38
  security_token=SF_SECURITY_TOKEN
39
  )
40
+ print("Salesforce login successful.")
41
  except SalesforceAuthenticationFailed as e:
42
+ print(f"Salesforce authentication failed: {e}")
 
 
 
43
  sf = None
44
 
45
  # Initialize Hugging Face NER pipeline (force CPU)
46
+ ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
 
 
 
 
 
47
 
48
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF invoice.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded PDF.

    Returns:
        The text of all pages, each page followed by a newline. If anything
        goes wrong, a string of the form ``"Error extracting text: <reason>"``
        is returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # A page may yield no text; treat that as an empty string so the
            # page still contributes its trailing newline.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
            print("Extracted text:\n", text)  # Debug: Print extracted text
            return text
    except Exception as e:
        # Deliberately broad: failures are reported in-band as a string.
        return f"Error extracting text: {str(e)}"
60
 
61
def extract_items(text):
    """Extract line items from the item table of an invoice's text.

    The table starts on the line after the first line that contains all of
    "Item Description", "Quantity", "Unit Price" and "Total Price", and ends
    just before the first following line containing "Total Amount" or
    "Total Due" (or at the end of the text). Columns may be separated by
    pipes or by plain whitespace; prices may carry a leading ``$`` and
    thousands separators.

    Args:
        text: Full invoice text with newline-separated lines.

    Returns:
        A list of dicts with the keys ``"description"`` (str), ``"quantity"``
        (int), ``"unit_price"`` (float) and ``"total_price"`` (float), in
        table order. Rows that do not look like an item row, or whose numbers
        cannot be parsed, are skipped. An empty list is returned when no
        table header is found.
    """
    items = []
    # Replace escaped dollar signs
    text = text.replace(r'\$', '$')

    lines = text.split('\n')
    print("Text split into lines:", lines)  # Debug

    # Find the table header
    header_columns = ("Item Description", "Quantity", "Unit Price", "Total Price")
    table_start = -1
    for i, line in enumerate(lines):
        if all(column in line for column in header_columns):
            table_start = i + 1  # Table data starts after the header
            break

    if table_start == -1:
        print("Table header not found.")
        return items

    # Find the end of the table (before "Total Amount" or end of text)
    table_end = len(lines)
    for i in range(table_start, len(lines)):
        if "Total Amount" in lines[i] or "Total Due" in lines[i]:
            table_end = i
            break

    print(f"Table section: lines {table_start} to {table_end-1}")  # Debug
    table_lines = lines[table_start:table_end]
    print("Table lines:", table_lines)  # Debug

    # A column boundary is either a pipe (with optional padding) or at least
    # one whitespace character. Requiring a real separator, keeping the
    # description lazy and matching the WHOLE line stops the description from
    # swallowing the numeric columns of whitespace-separated rows.
    column_sep = r"(?:\s*\|\s*|\s+)"
    table_row_pattern = re.compile(
        r"\|?\s*([A-Za-z\s\d-]+?)" + column_sep
        + r"(\d+)" + column_sep
        + r"\$?([\d.,]+)" + column_sep
        + r"\$?([\d.,]+)\s*\|?"
    )
    # Only a line made up entirely of dashes/colons/pipes is an alignment row.
    alignment_row_pattern = re.compile(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?")

    for line in table_lines:
        line = line.strip()
        if not line:
            continue
        # Skip alignment rows (e.g., "|---|---|")
        if alignment_row_pattern.fullmatch(line):
            print(f"Skipping alignment row: {line}")
            continue
        # Replace alignment markers in the row (e.g., "|---|") with "|"
        line = re.sub(r'\|\s*---\s*\|', '|', line)
        print(f"Processing table row: {line}")  # Debug
        match = table_row_pattern.fullmatch(line)
        if not match:
            print(f"Failed to match row: {line}")
            continue

        description = match.group(1).strip()
        try:
            quantity = int(match.group(2))
            unit_price = float(match.group(3).replace(",", ""))
            total_price = float(match.group(4).replace(",", ""))
        except ValueError as e:
            # e.g. "1.2.3" satisfies the pattern but is not a number.
            print(f"Failed to parse numbers in row: {line} ({e})")
            continue

        items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "total_price": total_price
        })
        print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug

    return items
 
 
125
 
126
def _merge_wordpieces(pieces):
    """Join NER tokens into a name, gluing '##' continuation pieces onto the previous token."""
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece.replace("##", ""))
    return " ".join(words)


def _parse_invoice_date(date_str):
    """Parse a date string captured by the invoice-date pattern; return a date or None."""
    if "/" in date_str:
        # Month-first is tried first; day-first is the fallback for values
        # such as 25/12/2024 that cannot be month-first.
        formats = ("%m/%d/%Y", "%d/%m/%Y")
    elif "," in date_str:
        # Normalise spacing to "Month D, YYYY" so strptime's whitespace
        # handling does not reject "Jan 5,2024" or "January5, 2024".
        date_str = re.sub(
            r"^([A-Za-z]+)\s*(\d{1,2}),\s*(\d{4})$", r"\1 \2, \3", date_str
        )
        formats = ("%B %d, %Y", "%b %d, %Y")
    elif "-" in date_str:
        formats = ("%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y")
    else:
        return None

    for date_format in formats:
        try:
            return datetime.strptime(date_str, date_format).date()
        except ValueError:
            continue
    return None


def extract_entities(text):
    """Extract the invoice number, vendor name, invoice date and total amount.

    Each field is located with a case-insensitive regular expression. When
    the vendor pattern finds nothing, the module-level ``ner_pipeline`` is
    used as a best-effort fallback and the last organisation it reports is
    taken as the vendor.

    Args:
        text: Full invoice text.

    Returns:
        A tuple ``(invoice_number, vendor_name, invoice_date, total_amount)``.
        Fields that cannot be found or parsed keep their defaults:
        ``"Unknown"``, ``"Unknown"``, today's date and ``0.0``.
    """
    invoice_number = "Unknown"
    vendor_name = "Unknown"
    invoice_date = datetime.now().date()
    total_amount = 0.0

    # Flexible regex patterns to handle various invoice formats
    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
    # The name itself may only contain horizontal whitespace, so the capture
    # cannot run on into the next line of the invoice.
    vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z \t&\.\-]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
    invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
    # The word boundary and lookbehind keep "Subtotal" / "Sub-total" /
    # "Sub total" from being mistaken for the invoice total.
    total_amount_pattern = r"\b(?<!sub[\s\-])(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"

    # Invoice Number
    invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
    if invoice_num_match:
        invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
        print(f"Matched Invoice Number: {invoice_number}")  # Debug

    # Vendor Name (MULTILINE lets "$" in the lookahead match at a line end)
    vendor_match = re.search(vendor_pattern, text, re.IGNORECASE | re.MULTILINE)
    if vendor_match:
        vendor_name = vendor_match.group(1).strip()
        print(f"Matched Vendor Name (Regex): {vendor_name}")  # Debug
    else:
        # NER fallback for multi-word organization names. It is best-effort:
        # a model failure must not abort the whole extraction.
        try:
            ner_results = ner_pipeline(text)
        except Exception as e:
            print(f"NER failed for vendor name: {str(e)}")  # Debug
            ner_results = []
        org_name_parts = []
        for entity in ner_results:
            if entity['entity'].startswith('B-ORG'):
                # Every B-ORG restarts the name, so the last organisation wins.
                org_name_parts = [entity['word']]
            elif entity['entity'].startswith('I-ORG') and org_name_parts:
                org_name_parts.append(entity['word'])
        if org_name_parts:
            vendor_name = _merge_wordpieces(org_name_parts)
            print(f"NER Matched Vendor Name: {vendor_name}")  # Debug

    # Invoice Date
    invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
    if invoice_date_match:
        date_str = invoice_date_match.group(1)
        parsed_date = _parse_invoice_date(date_str)
        if parsed_date is not None:
            invoice_date = parsed_date
            print(f"Matched Invoice Date: {invoice_date}")  # Debug
        else:
            print(f"Failed to parse Invoice Date '{date_str}'")  # Debug

    # Total Amount
    total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
    if total_amount_match:
        try:
            total_amount = float(total_amount_match.group(1).replace(",", ""))
            print(f"Matched Total Amount: {total_amount}")  # Debug
        except ValueError as e:
            # The capture can consist solely of commas, leaving nothing to parse.
            print(f"Failed to parse Total Amount: {str(e)}")  # Debug

    return invoice_number, vendor_name, invoice_date, total_amount
188
 
189
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
190
  """Fetch historical invoices for the vendor from Salesforce."""
191
  if sf is None:
 
192
  return pd.DataFrame()
193
 
194
  try:
 
196
  start_date = end_date - timedelta(days=time_window_days)
197
 
198
  query = f"""
199
+ SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
200
  FROM Invoice_Record__c
201
  WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
202
  AND Vendor_Name__c = '{vendor_name}'
 
208
  history_df = pd.DataFrame(records)
209
  if not history_df.empty:
210
  history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
 
 
 
211
  return history_df
 
212
  except Exception as e:
213
+ print(f"Failed to fetch vendor history: {str(e)}")
214
  return pd.DataFrame()
215
 
216
def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
    """Report consistency problems between the invoice and the vendor's history.

    The only check performed is for a duplicate invoice number: if any row of
    ``history_df`` has an ``Invoice_Number__c`` equal to ``invoice_number``,
    a single message naming the invoice and vendor is returned.
    ``invoice_date`` is accepted but not used by this check.

    Returns a list of human-readable issue strings (empty when none found).
    """
    # No history means there is nothing to be inconsistent with.
    if history_df.empty:
        return []

    seen_before = (history_df['Invoice_Number__c'] == invoice_number).any()
    if not seen_before:
        return []

    return [f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."]
 
 
226
 
227
def _isolation_forest_labels(features):
    """Standardize ``features`` and return one IsolationForest label per row.

    Labels follow the scikit-learn convention: 1 for inliers, -1 for outliers.
    """
    scaled = StandardScaler().fit_transform(features)
    forest = IsolationForest(contamination=0.05, random_state=42)
    return forest.fit_predict(scaled)


def detect_anomalies(df, history_df):
    """Detect anomalies in amount, frequency, and vendor patterns.

    Adds three integer columns to ``df`` and returns it:

    * ``is_amount_anomaly`` - IsolationForest label over ``df["amount"]``.
    * ``is_frequency_anomaly`` - label for the vendor's invoices-per-day and
      the spread of invoice dates in ``history_df``; 1 when there is no
      history.
    * ``is_vendor_pattern_anomaly`` - label for the first invoice's deviation
      from the historical mean amount, the history size and the amount
      variance; 1 when fewer than two historical invoices exist.

    ``df`` is modified in place (the flag columns are added); ``history_df``
    is only read.

    NOTE(review): every model here is fitted on the same rows it then labels,
    and the frequency / vendor-pattern models are fitted on a single feature
    row. With no reference population the label looks unable to come out as
    -1, so these flags may never fire downstream - confirm whether the models
    should instead be fitted on historical data and applied to the invoice.
    """
    df["is_amount_anomaly"] = 0
    df["is_frequency_anomaly"] = 0
    df["is_vendor_pattern_anomaly"] = 0

    # Nothing to label. Returning here also avoids indexing row 0 of an empty
    # frame in the vendor-pattern step below.
    if df.empty:
        return df

    df["is_amount_anomaly"] = _isolation_forest_labels(df[["amount"]])

    if history_df.empty:
        df["is_frequency_anomaly"] = 1
        df["is_vendor_pattern_anomaly"] = 1
        return df

    # Convert into a local Series so the caller's history_df keeps its
    # original Invoice_Date__c values.
    invoice_dates = pd.to_datetime(history_df['Invoice_Date__c'])
    first_date = invoice_dates.min()
    span_days = (invoice_dates.max() - first_date).days + 1
    frequency = len(history_df) / max(span_days, 1)

    day_offsets = (invoice_dates - first_date).dt.days.to_numpy()
    date_clustering = np.std(day_offsets) if len(day_offsets) > 1 else 0

    frequency_features = pd.DataFrame({
        "frequency": [frequency],
        "date_clustering": [date_clustering],
    })
    df["is_frequency_anomaly"] = _isolation_forest_labels(frequency_features)[0]

    if len(history_df) > 1:
        historical_amounts = history_df["Invoice_Amount__c"].astype(float)
        std_amount = historical_amounts.std()
        current_amount = df["amount"].iloc[0]
        # Fall back to a divisor of 1 when all historical amounts are equal.
        deviation = abs(current_amount - historical_amounts.mean()) / (std_amount if std_amount > 0 else 1)

        vendor_features = pd.DataFrame({
            "amount_deviation": [deviation],
            "invoice_count": [len(history_df)],
            "amount_variance": [historical_amounts.var()],
        })
        df["is_vendor_pattern_anomaly"] = _isolation_forest_labels(vendor_features)[0]
    else:
        df["is_vendor_pattern_anomaly"] = 1

    return df
281
+
282
def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
    """Calculate fraud score based on amount, anomalies, text length, consistency issues, and invoice date.

    Points are additive and the total is capped at 100:

    * amount above 5000 -> 40; amount below 10 -> 20
    * invoice date later than today -> 10
    * amount / frequency / vendor-pattern flag equal to -1 -> 30 / 25 / 25
    * text length above 500 -> 10
    * 15 per entry in ``consistency_issues``

    A missing ``amount`` (``None``) or an ``invoice_date`` that cannot be
    compared with today's date (``None``, a string, ...) simply skips the
    corresponding rule instead of raising.

    Returns ``(score, reasoning)`` where ``reasoning`` lists one message per
    rule that fired, with the consistency issues appended verbatim.
    """
    score = 0.0
    reasoning = []
    today = datetime.now().date()

    if amount is not None:
        if amount > 5000:
            score += 40
            reasoning.append("High invoice amount detected.")
        elif amount < 10:
            score += 20
            reasoning.append("Unusually low invoice amount.")

    # datetime and date instances cannot be ordered against each other, so
    # reduce a datetime (including subclasses) to its calendar date first.
    if isinstance(invoice_date, datetime):
        invoice_date = invoice_date.date()
    try:
        dated_in_future = invoice_date > today
    except TypeError:
        # Unparsed or missing date: nothing to compare against.
        dated_in_future = False
    if dated_in_future:
        score += 10
        reasoning.append("Invoice date is in the future.")

    if is_amount_anomaly == -1:
        score += 30
        reasoning.append("Amount flagged as an anomaly.")
    if is_frequency_anomaly == -1:
        score += 25
        reasoning.append("Unusual invoice submission frequency or clustering detected.")
    if is_vendor_pattern_anomaly == -1:
        score += 25
        reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")

    if text_length > 500:
        score += 10
        reasoning.append("Excessive text length in invoice.")

    if consistency_issues:
        score += 15 * len(consistency_issues)
        reasoning.extend(consistency_issues)

    return min(score, 100), reasoning
318
 
319
def process_invoice(pdf_file):
    """Run the fraud-detection pipeline on one invoice PDF.

    Steps: extract the text, pull out the invoice fields and line items,
    fetch the vendor's history, run the consistency and anomaly checks,
    compute the fraud score, and - when a Salesforce connection exists -
    store the result as an ``Invoice_Record__c``.

    Returns a markdown report, or an ``**Error**`` line when the extracted
    text contains the word "Error".
    """
    text = extract_text_from_pdf(pdf_file)
    if "Error" in text:
        return f"**Error**: {text}"

    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
    items = extract_items(text)
    text_length = len(text)

    history_df = fetch_vendor_history(vendor_name, invoice_number)
    consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)

    invoice_row = {
        "invoice_id": str(uuid.uuid4()),
        "invoice_number": invoice_number,
        "vendor_name": vendor_name,
        "amount": total_amount,
        "invoice_date": invoice_date,
        "text_length": text_length,
    }
    df = detect_anomalies(pd.DataFrame([invoice_row]), history_df)

    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0],
        df["is_amount_anomaly"].iloc[0],
        df["is_frequency_anomaly"].iloc[0],
        df["is_vendor_pattern_anomaly"].iloc[0],
        text_length,
        consistency_issues,
        invoice_date,
    )

    # One formatted line per item, shared by the Salesforce field and the report.
    item_summaries = [
        f"{item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}"
        for item in items
    ] if items else []
    # Salesforce receives the items as one semicolon-separated string.
    items_str = "; ".join(item_summaries) if items else "No items found"

    is_flagged = fraud_score > 50
    status = "Flagged" if is_flagged else "Cleared"

    report = [
        "## Fraud Detection Summary",
        f"- **Invoice Number**: {invoice_number}",
        f"- **Vendor Name**: {vendor_name}",
        f"- **Invoice Date**: {invoice_date}",
        f"- **Invoice Amount**: ${total_amount:,.2f}",
        "- **Items Selected**:",
    ]
    report += [f" - {summary}" for summary in item_summaries] or [" - No items found"]
    report += [
        f"- **Fraud Score**: {fraud_score}",
        f"- **Status**: {status}",
        f"- **Flagged**: {is_flagged}",
        "",
        "## Fraud Reasoning",
    ]
    report += [f"- {reason}" for reason in fraud_reasoning] or ["- No specific fraud indicators detected"]

    if sf is not None:
        # Best effort: a failed write is printed but never blocks the report.
        try:
            sf.Invoice_Record__c.create({
                "Invoice_Number__c": invoice_number,
                "Vendor_Name__c": vendor_name,
                "Invoice_Amount__c": total_amount,
                "Invoice_Date__c": str(invoice_date),
                "Fraud_Score__c": fraud_score,
                "Fraud_Reason__c": "; ".join(fraud_reasoning),
                "Flagged__c": is_flagged,
                "Status__c": status,
                "Items_Selected__c": items_str,
            })
            print(f"Successfully created Salesforce record with Items_Selected__c: {items_str}")  # Debug
        except Exception as e:
            print(f"Failed to create Salesforce record: {str(e)}")

    return "\n".join(report)
 
 
407
 
408
def gradio_interface(pdf_file):
    """Gradio callback: analyse the uploaded PDF and return the markdown report.

    When no file has been uploaded, a prompt asking for one is returned
    instead.
    """
    if pdf_file is not None:
        return process_invoice(pdf_file)
    return "Please upload a PDF file."
 
 
 
 
414
 
415
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
416
  gr.Markdown("# Invoice Fraud Detection")