Abhisesh7 committed on
Commit
fbe7682
·
verified ·
1 Parent(s): db037c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -87
app.py CHANGED
@@ -12,7 +12,6 @@ from datetime import datetime, timedelta
12
  import re
13
  import gradio as gr
14
  from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
15
- from image_ocr import extract_text_from_image # Import the image OCR function
16
 
17
  # Load environment variables from .env file
18
  load_dotenv()
@@ -62,18 +61,17 @@ def extract_text_from_pdf(pdf_file):
62
  def extract_items(text):
63
  """Extract items from the invoice table with a simplified approach."""
64
  items = []
65
- # Replace escaped dollar signs and other currency symbols
66
- text = text.replace(r'\$', '$').replace('₹', '₹')
67
 
68
  # Split text into lines
69
  lines = text.split('\n')
70
  print("Text split into lines:", lines) # Debug
71
 
72
- # Find the table header (more flexible matching)
73
  table_start = -1
74
  for i, line in enumerate(lines):
75
- # Match variations of table headers like "Item Quantity Rate Amount"
76
- if re.search(r'Item.*Quantity.*(Rate|Unit\s*Price).*(Amount|Total\s*Price)', line, re.IGNORECASE):
77
  table_start = i + 1 # Table data starts after the header
78
  break
79
 
@@ -81,10 +79,10 @@ def extract_items(text):
81
  print("Table header not found.")
82
  return items
83
 
84
- # Find the end of the table (before "Subtotal", "Total", "Tax", or end of text)
85
  table_end = len(lines)
86
  for i in range(table_start, len(lines)):
87
- if any(keyword in lines[i] for keyword in ["Subtotal", "Total", "Tax", "Balance Due", "Promo Code"]):
88
  table_end = i
89
  break
90
 
@@ -92,28 +90,33 @@ def extract_items(text):
92
  table_lines = lines[table_start:table_end]
93
  print("Table lines:", table_lines) # Debug
94
 
95
- # Updated pattern to match table rows more accurately
96
- # Captures: Description (non-greedy), Quantity (digits), Rate/Unit Price (decimal with optional currency), Amount/Total Price (decimal with optional currency)
97
- table_row_pattern = r"^(.*?)\s+(\d+)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)$"
98
 
99
  for line in table_lines:
100
  line = line.strip()
101
  if not line:
102
  continue
 
 
 
 
 
 
103
  print(f"Processing table row: {line}") # Debug
104
  match = re.match(table_row_pattern, line)
105
  if match:
106
  description = match.group(1).strip()
107
  # Clean the description to remove any trailing quantity or price data
108
  description = re.sub(r'\s*\d+\s*$', '', description).strip() # Remove trailing numbers
109
- description = re.sub(r'\s*(?:₹|[$£€]?)[\d,]+\.?\d*\s*$', '', description).strip() # Remove trailing prices
110
  # Skip lines that look like promo codes
111
  if "Promo Code" in description:
112
  print(f"Skipping promo code line: {line}")
113
  continue
114
  quantity = int(match.group(2))
115
- unit_price = float(match.group(3).replace(",", ""))
116
- total_price = float(match.group(4).replace(",", ""))
117
  items.append({
118
  "description": description,
119
  "quantity": quantity,
@@ -131,7 +134,6 @@ def extract_entities(text):
131
  invoice_number = "Unknown"
132
  vendor_name = "Unknown"
133
  invoice_date = datetime.now().date()
134
- due_date = None # Default to None
135
  total_amount = 0.0
136
 
137
  # Extract items first to use as a filter for NER
@@ -139,16 +141,15 @@ def extract_entities(text):
139
  item_descriptions = [item["description"].lower() for item in items]
140
 
141
  # Flexible regex patterns to handle various invoice formats
142
- invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)|#?\s*(\d+)"
143
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
144
- invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
145
- due_date_pattern = r"(?:Due\s*Date|Payment\s*Due\s*Date|Due\s*By)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
146
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total|Balance\s*Due)\s*[:\-\s]*(?:₹|[$£€])?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
147
 
148
  # Invoice Number
149
  invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
150
  if invoice_num_match:
151
- invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else (invoice_num_match.group(2) if invoice_num_match.group(2) else invoice_num_match.group(3))
152
  print(f"Matched Invoice Number: {invoice_number}") # Debug
153
 
154
  # Vendor Name
@@ -185,39 +186,17 @@ def extract_entities(text):
185
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
186
  except ValueError:
187
  invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
188
- elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
189
- invoice_date = datetime.strptime(date_str, "%B %d %Y").date()
190
  print(f"Matched Invoice Date: {invoice_date}") # Debug
191
  except ValueError as e:
192
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
193
 
194
- # Due Date
195
- due_date_match = re.search(due_date_pattern, text, re.IGNORECASE)
196
- if due_date_match:
197
- date_str = due_date_match.group(1)
198
- try:
199
- if "/" in date_str:
200
- due_date = datetime.strptime(date_str, "%m/%d/%Y").date()
201
- elif "," in date_str:
202
- due_date = datetime.strptime(date_str, "%B %d, %Y").date()
203
- elif "-" in date_str:
204
- try:
205
- due_date = datetime.strptime(date_str, "%Y-%m-%d").date()
206
- except ValueError:
207
- invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
208
- elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
209
- due_date = datetime.strptime(date_str, "%B %d %Y").date()
210
- print(f"Matched Due Date: {due_date}") # Debug
211
- except ValueError as e:
212
- print(f"Failed to parse Due Date '{date_str}': {str(e)}") # Debug
213
-
214
  # Total Amount
215
  total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
216
  if total_amount_match:
217
  total_amount = float(total_amount_match.group(1).replace(",", ""))
218
  print(f"Matched Total Amount: {total_amount}") # Debug
219
 
220
- return invoice_number, vendor_name, invoice_date, due_date, total_amount
221
 
222
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
223
  """Fetch historical invoices for the vendor from Salesforce."""
@@ -312,8 +291,8 @@ def detect_anomalies(df, history_df):
312
 
313
  return df
314
 
315
- def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date, due_date):
316
- """Calculate fraud score based on amount, anomalies, text length, consistency issues, invoice date, and due date."""
317
  score = 0.0
318
  reasoning = []
319
  today = datetime.now().date()
@@ -329,10 +308,6 @@ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_ve
329
  score += 10
330
  reasoning.append("Invoice date is in the future.")
331
 
332
- if due_date and due_date < today:
333
- score += 10
334
- reasoning.append("Due date is in the past.")
335
-
336
  if is_amount_anomaly == -1:
337
  score += 30
338
  reasoning.append("Amount flagged as an anomaly.")
@@ -353,23 +328,13 @@ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_ve
353
 
354
  return min(score, 100), reasoning
355
 
356
- def process_invoice(file_path):
357
- """Process a single invoice (PDF or image) and return structured markdown output."""
358
- # Determine file type and extract text accordingly
359
- if file_path.lower().endswith('.pdf'):
360
- text = extract_text_from_pdf(file_path)
361
- elif file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
362
- # Ensure file_path is a string (Gradio might pass a TempFile object)
363
- if hasattr(file_path, 'name'):
364
- file_path = file_path.name # Extract the file path from Gradio's TempFile object
365
- text = extract_text_from_image(file_path)
366
- else:
367
- return "**Error**: Unsupported file type. Please upload a PDF or image (PNG/JPG/JPEG)."
368
-
369
  if "Error" in text:
370
  return f"**Error**: {text}"
371
 
372
- invoice_number, vendor_name, invoice_date, due_date, total_amount = extract_entities(text)
373
  items = extract_items(text)
374
  text_length = len(text)
375
 
@@ -382,7 +347,6 @@ def process_invoice(file_path):
382
  "vendor_name": vendor_name,
383
  "amount": total_amount,
384
  "invoice_date": invoice_date,
385
- "due_date": due_date,
386
  "text_length": text_length
387
  }
388
  df = pd.DataFrame([data])
@@ -396,8 +360,7 @@ def process_invoice(file_path):
396
  df["is_vendor_pattern_anomaly"].iloc[0],
397
  text_length,
398
  consistency_issues,
399
- invoice_date,
400
- due_date
401
  )
402
 
403
  # Format items for Salesforce (only include item descriptions)
@@ -406,8 +369,8 @@ def process_invoice(file_path):
406
  desc = item['description']
407
  # Additional cleaning to ensure no quantity or price data
408
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
409
- desc = re.sub(r'\s*(?:Rate|Unit\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
410
- desc = re.sub(r'\s*(?:Amount|Total\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
411
  cleaned_items.append(desc)
412
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
413
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
@@ -423,18 +386,9 @@ def process_invoice(file_path):
423
  f"- **Invoice Number**: {invoice_number}",
424
  f"- **Vendor Name**: {vendor_name}",
425
  f"- **Invoice Date**: {invoice_date}",
426
- ]
427
-
428
- # Only add Due Date to output if it exists
429
- if due_date:
430
- output.append(f"- **Due Date**: {due_date}")
431
- else:
432
- output.append(f"- **Due Date**: Not specified")
433
-
434
- output.extend([
435
- f"- **Invoice Amount**: ₹{total_amount:,.2f}",
436
  "- **Items Selected**:",
437
- ])
438
 
439
  if items:
440
  for item in items:
@@ -463,8 +417,6 @@ def process_invoice(file_path):
463
  "Vendor_Name__c": vendor_name,
464
  "Invoice_Amount__c": total_amount,
465
  "Invoice_Date__c": str(invoice_date),
466
- # Only include Due_Date__c if due_date exists
467
- "Due_Date__c": str(due_date) if due_date else None,
468
  "Fraud_Score__c": fraud_score,
469
  "Fraud_Reason__c": "; ".join(fraud_reasoning),
470
  "Flagged__c": fraud_score > 50,
@@ -480,17 +432,17 @@ def process_invoice(file_path):
480
 
481
  return "\n".join(output)
482
 
483
- def gradio_interface(file):
484
- """Gradio interface to process uploaded file (PDF or image) and display structured results."""
485
- if file is None:
486
- return "Please upload a PDF or image file."
487
- result = process_invoice(file)
488
  return result
489
 
490
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
491
  gr.Markdown("# Invoice Fraud Detection")
492
  with gr.Row():
493
- file_input = gr.File(label="Upload Invoice (PDF or Image)")
494
  result_output = gr.Markdown(label="Fraud Detection Results")
495
  file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
496
 
 
12
  import re
13
  import gradio as gr
14
  from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
 
15
 
16
  # Load environment variables from .env file
17
  load_dotenv()
 
61
  def extract_items(text):
62
  """Extract items from the invoice table with a simplified approach."""
63
  items = []
64
+ # Replace escaped dollar signs
65
+ text = text.replace(r'\$', '$')
66
 
67
  # Split text into lines
68
  lines = text.split('\n')
69
  print("Text split into lines:", lines) # Debug
70
 
71
+ # Find the table header
72
  table_start = -1
73
  for i, line in enumerate(lines):
74
+ if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
 
75
  table_start = i + 1 # Table data starts after the header
76
  break
77
 
 
79
  print("Table header not found.")
80
  return items
81
 
82
+ # Find the end of the table (before "Total Amount", "Total Due", "Promo Code", or end of text)
83
  table_end = len(lines)
84
  for i in range(table_start, len(lines)):
85
+ if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
86
  table_end = i
87
  break
88
 
 
90
  table_lines = lines[table_start:table_end]
91
  print("Table lines:", table_lines) # Debug
92
 
93
+ # Pattern to match table rows
94
+ table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
 
95
 
96
  for line in table_lines:
97
  line = line.strip()
98
  if not line:
99
  continue
100
+ # Skip alignment rows (e.g., "|---|---|")
101
+ if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
102
+ print(f"Skipping alignment row: {line}")
103
+ continue
104
+ # Replace alignment markers in the row (e.g., "|---|") with "|"
105
+ line = re.sub(r'\|\s*---\s*\|', '|', line)
106
  print(f"Processing table row: {line}") # Debug
107
  match = re.match(table_row_pattern, line)
108
  if match:
109
  description = match.group(1).strip()
110
  # Clean the description to remove any trailing quantity or price data
111
  description = re.sub(r'\s*\d+\s*$', '', description).strip() # Remove trailing numbers
112
+ description = re.sub(r'\s*\$?\d+\.\d+\s*$', '', description).strip() # Remove trailing prices
113
  # Skip lines that look like promo codes
114
  if "Promo Code" in description:
115
  print(f"Skipping promo code line: {line}")
116
  continue
117
  quantity = int(match.group(2))
118
+ unit_price = float(match.group(3))
119
+ total_price = float(match.group(4))
120
  items.append({
121
  "description": description,
122
  "quantity": quantity,
 
134
  invoice_number = "Unknown"
135
  vendor_name = "Unknown"
136
  invoice_date = datetime.now().date()
 
137
  total_amount = 0.0
138
 
139
  # Extract items first to use as a filter for NER
 
141
  item_descriptions = [item["description"].lower() for item in items]
142
 
143
  # Flexible regex patterns to handle various invoice formats
144
+ invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
145
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
146
+ invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
147
+ total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
 
148
 
149
  # Invoice Number
150
  invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
151
  if invoice_num_match:
152
+ invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
153
  print(f"Matched Invoice Number: {invoice_number}") # Debug
154
 
155
  # Vendor Name
 
186
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
187
  except ValueError:
188
  invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
 
 
189
  print(f"Matched Invoice Date: {invoice_date}") # Debug
190
  except ValueError as e:
191
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # Total Amount
194
  total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
195
  if total_amount_match:
196
  total_amount = float(total_amount_match.group(1).replace(",", ""))
197
  print(f"Matched Total Amount: {total_amount}") # Debug
198
 
199
+ return invoice_number, vendor_name, invoice_date, total_amount
200
 
201
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
202
  """Fetch historical invoices for the vendor from Salesforce."""
 
291
 
292
  return df
293
 
294
+ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
295
+ """Calculate fraud score based on amount, anomalies, text length, consistency issues, and invoice date."""
296
  score = 0.0
297
  reasoning = []
298
  today = datetime.now().date()
 
308
  score += 10
309
  reasoning.append("Invoice date is in the future.")
310
 
 
 
 
 
311
  if is_amount_anomaly == -1:
312
  score += 30
313
  reasoning.append("Amount flagged as an anomaly.")
 
328
 
329
  return min(score, 100), reasoning
330
 
331
+ def process_invoice(pdf_file):
332
+ """Process a single invoice PDF and return structured markdown output."""
333
+ text = extract_text_from_pdf(pdf_file)
 
 
 
 
 
 
 
 
 
 
334
  if "Error" in text:
335
  return f"**Error**: {text}"
336
 
337
+ invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
338
  items = extract_items(text)
339
  text_length = len(text)
340
 
 
347
  "vendor_name": vendor_name,
348
  "amount": total_amount,
349
  "invoice_date": invoice_date,
 
350
  "text_length": text_length
351
  }
352
  df = pd.DataFrame([data])
 
360
  df["is_vendor_pattern_anomaly"].iloc[0],
361
  text_length,
362
  consistency_issues,
363
+ invoice_date
 
364
  )
365
 
366
  # Format items for Salesforce (only include item descriptions)
 
369
  desc = item['description']
370
  # Additional cleaning to ensure no quantity or price data
371
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
372
+ desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
373
+ desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
374
  cleaned_items.append(desc)
375
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
376
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
 
386
  f"- **Invoice Number**: {invoice_number}",
387
  f"- **Vendor Name**: {vendor_name}",
388
  f"- **Invoice Date**: {invoice_date}",
389
+ f"- **Invoice Amount**: ${total_amount:,.2f}",
 
 
 
 
 
 
 
 
 
390
  "- **Items Selected**:",
391
+ ]
392
 
393
  if items:
394
  for item in items:
 
417
  "Vendor_Name__c": vendor_name,
418
  "Invoice_Amount__c": total_amount,
419
  "Invoice_Date__c": str(invoice_date),
 
 
420
  "Fraud_Score__c": fraud_score,
421
  "Fraud_Reason__c": "; ".join(fraud_reasoning),
422
  "Flagged__c": fraud_score > 50,
 
432
 
433
  return "\n".join(output)
434
 
435
+ def gradio_interface(pdf_file):
436
+ """Gradio interface to process uploaded PDF and display structured results."""
437
+ if pdf_file is None:
438
+ return "Please upload a PDF file."
439
+ result = process_invoice(pdf_file)
440
  return result
441
 
442
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
443
  gr.Markdown("# Invoice Fraud Detection")
444
  with gr.Row():
445
+ file_input = gr.File(label="Upload Invoice PDF")
446
  result_output = gr.Markdown(label="Fraud Detection Results")
447
  file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
448