Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -286,6 +286,7 @@ def extract_entities(pdf_file, text):
|
|
| 286 |
primary_invoice_number = "Unknown"
|
| 287 |
vendor_name = "Unknown"
|
| 288 |
invoice_date = datetime.now().date()
|
|
|
|
| 289 |
total_amount = 0.0
|
| 290 |
|
| 291 |
# Extract items first to use as a filter for NER
|
|
@@ -296,6 +297,7 @@ def extract_entities(pdf_file, text):
|
|
| 296 |
invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Advice\s*(?:No\.?)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
|
| 297 |
vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name|Vendor)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
|
| 298 |
invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
|
|
|
|
| 299 |
total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
|
| 300 |
|
| 301 |
# Invoice Numbers (capture all, then prioritize)
|
|
@@ -362,6 +364,24 @@ def extract_entities(pdf_file, text):
|
|
| 362 |
except ValueError as e:
|
| 363 |
print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
# Total Amount (prioritize the final total after taxes and fees)
|
| 366 |
total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
|
| 367 |
total_amounts = []
|
|
@@ -394,7 +414,7 @@ def extract_entities(pdf_file, text):
|
|
| 394 |
break
|
| 395 |
print(f"Calculated Total Amount: {total_amount}") # Debug
|
| 396 |
|
| 397 |
-
return primary_invoice_number, vendor_name, invoice_date, total_amount
|
| 398 |
|
| 399 |
def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
|
| 400 |
"""Fetch historical invoices for the vendor from Salesforce."""
|
|
@@ -506,7 +526,7 @@ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_ve
|
|
| 506 |
score += 10
|
| 507 |
reasoning.append("Invoice date is in the future.")
|
| 508 |
|
| 509 |
-
if due_date < today and invoice_date < today:
|
| 510 |
score += 15
|
| 511 |
reasoning.append("Due date has passed, indicating potential payment delay.")
|
| 512 |
|
|
@@ -536,13 +556,10 @@ def process_invoice(pdf_file):
|
|
| 536 |
if "Error" in text:
|
| 537 |
return f"**Error**: {text}"
|
| 538 |
|
| 539 |
-
invoice_number, vendor_name, invoice_date, total_amount = extract_entities(pdf_file, text)
|
| 540 |
items = extract_items(pdf_file, text)
|
| 541 |
text_length = len(text)
|
| 542 |
|
| 543 |
-
# Calculate Due Date (Invoice Date + 30 days)
|
| 544 |
-
due_date = invoice_date + timedelta(days=30)
|
| 545 |
-
|
| 546 |
history_df = fetch_vendor_history(vendor_name, invoice_number)
|
| 547 |
consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
|
| 548 |
|
|
@@ -586,19 +603,26 @@ def process_invoice(pdf_file):
|
|
| 586 |
items_str = "; ".join(item['description'] for item in items) # Fallback to raw descriptions
|
| 587 |
print(f"Fallback items_str: {items_str}")
|
| 588 |
|
| 589 |
-
# Format the invoice date
|
| 590 |
formatted_invoice_date = invoice_date.strftime("%d-%m-%Y")
|
| 591 |
-
|
|
|
|
| 592 |
|
| 593 |
output = [
|
| 594 |
"## Fraud Detection Summary",
|
| 595 |
f"- **Invoice Number**: {invoice_number}",
|
| 596 |
f"- **Vendor Name**: {vendor_name}",
|
| 597 |
f"- **Invoice Date**: {formatted_invoice_date}",
|
| 598 |
-
f"- **Due Date**: {formatted_due_date}",
|
| 599 |
-
f"- **Invoice Amount**: ${total_amount:,.2f}" if '$' in text else f"- **Invoice Amount**: ₹{total_amount:,.2f}",
|
| 600 |
]
|
| 601 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
# Add items section
|
| 603 |
output.append("- **Items Selected**:")
|
| 604 |
if items:
|
|
@@ -629,13 +653,15 @@ def process_invoice(pdf_file):
|
|
| 629 |
"Vendor_Name__c": vendor_name,
|
| 630 |
"Invoice_Amount__c": total_amount,
|
| 631 |
"Invoice_Date__c": str(invoice_date),
|
| 632 |
-
"Due_Date__c": str(due_date),
|
| 633 |
"Fraud_Score__c": fraud_score,
|
| 634 |
"Fraud_Reason__c": "; ".join(fraud_reasoning),
|
| 635 |
"Flagged__c": fraud_score > 50,
|
| 636 |
"Status__c": "Flagged" if fraud_score > 50 else "Cleared",
|
| 637 |
"Items_Selected__c": items_str
|
| 638 |
}
|
|
|
|
|
|
|
|
|
|
| 639 |
print(f"Record data being sent to Salesforce: {record_data}") # Debug
|
| 640 |
sf.Invoice_Record__c.create(record_data)
|
| 641 |
print(f"Successfully created Salesforce record with Items_Selected__c: {items_str}") # Debug
|
|
|
|
| 286 |
primary_invoice_number = "Unknown"
|
| 287 |
vendor_name = "Unknown"
|
| 288 |
invoice_date = datetime.now().date()
|
| 289 |
+
due_date = None # Due Date will be None unless explicitly found in the invoice
|
| 290 |
total_amount = 0.0
|
| 291 |
|
| 292 |
# Extract items first to use as a filter for NER
|
|
|
|
| 297 |
invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Advice\s*(?:No\.?)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
|
| 298 |
vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name|Vendor)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
|
| 299 |
invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
|
| 300 |
+
due_date_pattern = r"(?:Due\s*Date|Payment\s*Due\s*(?:Date)?)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
|
| 301 |
total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
|
| 302 |
|
| 303 |
# Invoice Numbers (capture all, then prioritize)
|
|
|
|
| 364 |
except ValueError as e:
|
| 365 |
print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
|
| 366 |
|
| 367 |
+
# Due Date (only extract if explicitly present in the invoice)
|
| 368 |
+
due_date_match = re.search(due_date_pattern, text, re.IGNORECASE)
|
| 369 |
+
if due_date_match:
|
| 370 |
+
date_str = due_date_match.group(1)
|
| 371 |
+
try:
|
| 372 |
+
if "/" in date_str:
|
| 373 |
+
due_date = datetime.strptime(date_str, "%m/%d/%Y").date()
|
| 374 |
+
elif "," in date_str:
|
| 375 |
+
due_date = datetime.strptime(date_str, "%B %d, %Y").date()
|
| 376 |
+
elif "-" in date_str:
|
| 377 |
+
try:
|
| 378 |
+
due_date = datetime.strptime(date_str, "%Y-%m-%d").date()
|
| 379 |
+
except ValueError:
|
| 380 |
+
due_date = datetime.strptime(date_str, "%d-%m-%Y").date()
|
| 381 |
+
print(f"Matched Due Date: {due_date}") # Debug
|
| 382 |
+
except ValueError as e:
|
| 383 |
+
print(f"Failed to parse Due Date '{date_str}': {str(e)}") # Debug
|
| 384 |
+
|
| 385 |
# Total Amount (prioritize the final total after taxes and fees)
|
| 386 |
total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
|
| 387 |
total_amounts = []
|
|
|
|
| 414 |
break
|
| 415 |
print(f"Calculated Total Amount: {total_amount}") # Debug
|
| 416 |
|
| 417 |
+
return primary_invoice_number, vendor_name, invoice_date, due_date, total_amount
|
| 418 |
|
| 419 |
def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
|
| 420 |
"""Fetch historical invoices for the vendor from Salesforce."""
|
|
|
|
| 526 |
score += 10
|
| 527 |
reasoning.append("Invoice date is in the future.")
|
| 528 |
|
| 529 |
+
if due_date and due_date < today and invoice_date < today:
|
| 530 |
score += 15
|
| 531 |
reasoning.append("Due date has passed, indicating potential payment delay.")
|
| 532 |
|
|
|
|
| 556 |
if "Error" in text:
|
| 557 |
return f"**Error**: {text}"
|
| 558 |
|
| 559 |
+
invoice_number, vendor_name, invoice_date, due_date, total_amount = extract_entities(pdf_file, text)
|
| 560 |
items = extract_items(pdf_file, text)
|
| 561 |
text_length = len(text)
|
| 562 |
|
|
|
|
|
|
|
|
|
|
| 563 |
history_df = fetch_vendor_history(vendor_name, invoice_number)
|
| 564 |
consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
|
| 565 |
|
|
|
|
| 603 |
items_str = "; ".join(item['description'] for item in items) # Fallback to raw descriptions
|
| 604 |
print(f"Fallback items_str: {items_str}")
|
| 605 |
|
| 606 |
+
# Format the invoice date as DD-MM-YYYY
|
| 607 |
formatted_invoice_date = invoice_date.strftime("%d-%m-%Y")
|
| 608 |
+
# Format the due date as DD-MM-YYYY only if it exists
|
| 609 |
+
formatted_due_date = due_date.strftime("%d-%m-%Y") if due_date else None
|
| 610 |
|
| 611 |
output = [
|
| 612 |
"## Fraud Detection Summary",
|
| 613 |
f"- **Invoice Number**: {invoice_number}",
|
| 614 |
f"- **Vendor Name**: {vendor_name}",
|
| 615 |
f"- **Invoice Date**: {formatted_invoice_date}",
|
|
|
|
|
|
|
| 616 |
]
|
| 617 |
|
| 618 |
+
# Only include Due Date in the output if it was extracted from the invoice
|
| 619 |
+
if formatted_due_date:
|
| 620 |
+
output.append(f"- **Due Date**: {formatted_due_date}")
|
| 621 |
+
|
| 622 |
+
output.append(
|
| 623 |
+
f"- **Invoice Amount**: ${total_amount:,.2f}" if '$' in text else f"- **Invoice Amount**: ₹{total_amount:,.2f}"
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
# Add items section
|
| 627 |
output.append("- **Items Selected**:")
|
| 628 |
if items:
|
|
|
|
| 653 |
"Vendor_Name__c": vendor_name,
|
| 654 |
"Invoice_Amount__c": total_amount,
|
| 655 |
"Invoice_Date__c": str(invoice_date),
|
|
|
|
| 656 |
"Fraud_Score__c": fraud_score,
|
| 657 |
"Fraud_Reason__c": "; ".join(fraud_reasoning),
|
| 658 |
"Flagged__c": fraud_score > 50,
|
| 659 |
"Status__c": "Flagged" if fraud_score > 50 else "Cleared",
|
| 660 |
"Items_Selected__c": items_str
|
| 661 |
}
|
| 662 |
+
# Only include Due_Date__c if a due date was extracted
|
| 663 |
+
if due_date:
|
| 664 |
+
record_data["Due_Date__c"] = str(due_date)
|
| 665 |
print(f"Record data being sent to Salesforce: {record_data}") # Debug
|
| 666 |
sf.Invoice_Record__c.create(record_data)
|
| 667 |
print(f"Successfully created Salesforce record with Items_Selected__c: {items_str}") # Debug
|