Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Files Community

Abhisesh7 committed on May 23, 2025

Commit

5b964d1

verified ·

1 Parent(s): 15ee88d

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -58

app.py CHANGED Viewed

@@ -59,30 +59,40 @@ def extract_text_from_pdf(pdf_file):
         return f"Error extracting text: {str(e)}"
 def extract_items(text):
-    """Extract items from the invoice table with a simplified approach."""
     items = []
-    # Replace escaped dollar signs
-    text = text.replace(r'\$', '$')
     # Split text into lines
     lines = text.split('\n')
     print("Text split into lines:", lines)  # Debug
-    # Find the table header
     table_start = -1
     for i, line in enumerate(lines):
-        if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
-            table_start = i + 1  # Table data starts after the header
             break
     if table_start == -1:
         print("Table header not found.")
         return items
-    # Find the end of the table (before "Total Amount", "Promo Code", or end of text)
     table_end = len(lines)
     for i in range(table_start, len(lines)):
-        if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
             table_end = i
             break
@@ -90,8 +100,13 @@ def extract_items(text):
     table_lines = lines[table_start:table_end]
     print("Table lines:", table_lines)  # Debug
-    # Pattern to match table rows
-    table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
     for line in table_lines:
         line = line.strip()
@@ -101,22 +116,19 @@ def extract_items(text):
         if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
             print(f"Skipping alignment row: {line}")
             continue
-        # Replace alignment markers in the row (e.g., "|---|") with "|"
-        line = re.sub(r'\|\s*---\s*\|', '|', line)
         print(f"Processing table row: {line}")  # Debug
         match = re.match(table_row_pattern, line)
         if match:
-            description = match.group(1).strip()
-            # Clean the description to remove any trailing quantity or price data
-            description = re.sub(r'\s*\d+\s*$', '', description).strip()  # Remove trailing numbers
-            description = re.sub(r'\s*\$?\d+\.\d+\s*$', '', description).strip()  # Remove trailing prices
-            # Skip lines that look like promo codes
-            if "Promo Code" in description:
-                print(f"Skipping promo code line: {line}")
-                continue
-            quantity = int(match.group(2))
-            unit_price = float(match.group(3))
-            total_price = float(match.group(4))
             items.append({
                 "description": description,
                 "quantity": quantity,
@@ -130,32 +142,38 @@ def extract_items(text):
     return items
 def extract_entities(text):
-    """Extract structured invoice details using flexible regex patterns."""
-    invoice_number = "Unknown"
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
     total_amount = 0.0
     # Extract items first to use as a filter for NER
     items = extract_items(text)
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
-    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
-    vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
     invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
-    total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
-    # Invoice Number
-    invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
-    if invoice_num_match:
-        invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
         print(f"Matched Invoice Number: {invoice_number}")  # Debug
     # Vendor Name
     vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
     if vendor_match:
         vendor_name = vendor_match.group(1).strip()
         print(f"Matched Vendor Name (Regex): {vendor_name}")  # Debug
     else:
         # Enhanced NER fallback for multi-word organization names
@@ -172,8 +190,16 @@ def extract_entities(text):
                 vendor_name = candidate_vendor_name
             print(f"NER Matched Vendor Name: {vendor_name}")  # Debug
-    # Invoice Date
-    invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
     if invoice_date_match:
         date_str = invoice_date_match.group(1)
         try:
@@ -190,15 +216,29 @@ def extract_entities(text):
         except ValueError as e:
             print(f"Failed to parse Invoice Date '{date_str}': {str(e)}")  # Debug
-    # Total Amount
-    total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
-    if total_amount_match:
-        total_amount = float(total_amount_match.group(1).replace(",", ""))
-        print(f"Matched Total Amount: {total_amount}")  # Debug
-    return invoice_number, vendor_name, invoice_date, total_amount
-def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
     if sf is None:
         return pd.DataFrame()
@@ -225,14 +265,15 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
         print(f"Failed to fetch vendor history: {str(e)}")
         return pd.DataFrame()
-def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
     """Check for data consistency issues like duplicates."""
     consistency_issues = []
     if not history_df.empty:
-        duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
-        if not duplicate_invoices.empty:
-            consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
     return consistency_issues
@@ -334,16 +375,16 @@ def process_invoice(pdf_file):
     if "Error" in text:
         return f"**Error**: {text}"
-    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
     items = extract_items(text)
     text_length = len(text)
-    history_df = fetch_vendor_history(vendor_name, invoice_number)
-    consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
     data = {
         "invoice_id": str(uuid.uuid4()),
-        "invoice_number": invoice_number,
         "vendor_name": vendor_name,
         "amount": total_amount,
         "invoice_date": invoice_date,
@@ -369,31 +410,33 @@ def process_invoice(pdf_file):
         desc = item['description']
         # Additional cleaning to ensure no quantity or price data
         desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
-        desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
-        desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
         cleaned_items.append(desc)
     items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
     print(f"Items string for Salesforce (after cleaning): {items_str}")  # Debug
     # Validate items_str to ensure it contains no quantity or price data
-    if re.search(r'Quantity|Unit Price|Total Price|\$\d+\.\d+', items_str, re.IGNORECASE):
         print(f"ERROR: items_str contains unexpected quantity or price data: {items_str}")
         items_str = "; ".join(item['description'] for item in items)  # Fallback to raw descriptions
         print(f"Fallback items_str: {items_str}")
     output = [
         "## Fraud Detection Summary",
-        f"- **Invoice Number**: {invoice_number}",
         f"- **Vendor Name**: {vendor_name}",
         f"- **Invoice Date**: {invoice_date}",
-        f"- **Invoice Amount**: ${total_amount:,.2f}",
-        "- **Items Selected**:",
     ]
     if items:
         for item in items:
-            clean_description = re.sub(r'\s*\d+\s*\d*$', '', item['description']).strip()
-            output.append(f"  - {clean_description}")
     else:
         output.append("  - No items found")
@@ -413,7 +456,7 @@ def process_invoice(pdf_file):
     if sf is not None:
         try:
             record_data = {
-                "Invoice_Number__c": invoice_number,
                 "Vendor_Name__c": vendor_name,
                 "Invoice_Amount__c": total_amount,
                 "Invoice_Date__c": str(invoice_date),

         return f"Error extracting text: {str(e)}"
 def extract_items(text):
+    """Extract items from the invoice table with support for multiple table formats."""
     items = []
+    # Replace escaped dollar signs and other symbols
+    text = text.replace(r'\$', '$').replace('₹', '₹')
     # Split text into lines
     lines = text.split('\n')
     print("Text split into lines:", lines)  # Debug
+    # Define possible table headers
+    table_headers = [
+        ("Item Description", "Quantity", "Unit Price", "Total Price"),  # Format 1 (e.g., invoice_4.pdf)
+        ("Particulars", "Gross value", "Discount", "Net value", "Total"),  # Format 2 (e.g., Invoice_6164752968.pdf)
+    ]
     table_start = -1
+    table_format = None
     for i, line in enumerate(lines):
+        for headers in table_headers:
+            if all(header in line for header in headers):
+                table_start = i + 1  # Table data starts after the header
+                table_format = headers
+                break
+        if table_start != -1:
             break
     if table_start == -1:
         print("Table header not found.")
         return items
+    # Find the end of the table (before "Total Amount", "Total Value", or end of text)
     table_end = len(lines)
     for i in range(table_start, len(lines)):
+        if "Total Amount" in lines[i] or "Total Value" in lines[i] or "Total Due" in lines[i] or "Item(s) Total" in lines[i]:
             table_end = i
             break
     table_lines = lines[table_start:table_end]
     print("Table lines:", table_lines)  # Debug
+    # Define patterns based on table format
+    if table_format[0] == "Item Description":
+        # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
+        table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
+    else:
+        # Pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
+        table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+(?:\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+)?\s*\|?\s*([\d.]+)\s*\|?"
     for line in table_lines:
         line = line.strip()
         if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
             print(f"Skipping alignment row: {line}")
             continue
         print(f"Processing table row: {line}")  # Debug
         match = re.match(table_row_pattern, line)
         if match:
+            if table_format[0] == "Item Description":
+                description = match.group(1).strip()
+                quantity = int(match.group(2))
+                unit_price = float(match.group(3))
+                total_price = float(match.group(4))
+            else:
+                description = match.group(1).strip()
+                quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
+                unit_price = float(match.group(2))  # Gross value
+                total_price = float(match.group(5))  # Total after taxes
             items.append({
                 "description": description,
                 "quantity": quantity,
     return items
 def extract_entities(text):
+    """Extract structured invoice details including recipient name using flexible regex patterns."""
+    invoice_numbers = []
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
     total_amount = 0.0
+    recipient_name = "Unknown"
     # Extract items first to use as a filter for NER
     items = extract_items(text)
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
+    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
+    vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
     invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
+    total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))?[^:\n]*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
+    recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z\s]+)(?=\s*(?:Address|Phone|Email|\n|$))"
+    # Invoice Numbers (capture multiple if present)
+    for match in re.finditer(invoice_num_pattern, text, re.IGNORECASE):
+        invoice_number = match.group(1) if match.group(1) else match.group(2)
+        invoice_numbers.append(invoice_number)
         print(f"Matched Invoice Number: {invoice_number}")  # Debug
+    invoice_numbers = invoice_numbers if invoice_numbers else ["Unknown"]
     # Vendor Name
     vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
     if vendor_match:
         vendor_name = vendor_match.group(1).strip()
+        # Ensure vendor name is not an item description
+        if vendor_name.lower() in item_descriptions:
+            vendor_name = "Unknown"
         print(f"Matched Vendor Name (Regex): {vendor_name}")  # Debug
     else:
         # Enhanced NER fallback for multi-word organization names
                 vendor_name = candidate_vendor_name
             print(f"NER Matched Vendor Name: {vendor_name}")  # Debug
+    # Invoice Date (prioritize "Invoice Date")
+    invoice_date_match = None
+    for line in text.split('\n'):
+        if "Invoice Date" in line:
+            match = re.search(invoice_date_pattern, line, re.IGNORECASE)
+            if match:
+                invoice_date_match = match
+                break
+    if not invoice_date_match:
+        invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
     if invoice_date_match:
         date_str = invoice_date_match.group(1)
         try:
         except ValueError as e:
             print(f"Failed to parse Invoice Date '{date_str}': {str(e)}")  # Debug
+    # Total Amount (sum all "Total Value" entries)
+    total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
+    total_amounts = []
+    for match in total_amount_matches:
+        amount_str = match.group(1).replace(",", "")
+        try:
+            amount = float(amount_str)
+            total_amounts.append(amount)
+            print(f"Matched Amount: {amount}")  # Debug
+        except ValueError:
+            continue
+    total_amount = sum(total_amounts) if total_amounts else 0.0
+    print(f"Calculated Total Amount: {total_amount}")  # Debug
+    # Recipient Name
+    recipient_match = re.search(recipient_pattern, text, re.IGNORECASE)
+    if recipient_match:
+        recipient_name = recipient_match.group(1).strip()
+        print(f"Matched Recipient Name: {recipient_name}")  # Debug
+    return invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name
+def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
     if sf is None:
         return pd.DataFrame()
         print(f"Failed to fetch vendor history: {str(e)}")
         return pd.DataFrame()
+def check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df):
     """Check for data consistency issues like duplicates."""
     consistency_issues = []
     if not history_df.empty:
+        for invoice_number in invoice_numbers:
+            duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
+            if not duplicate_invoices.empty:
+                consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
     return consistency_issues
     if "Error" in text:
         return f"**Error**: {text}"
+    invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
     items = extract_items(text)
     text_length = len(text)
+    history_df = fetch_vendor_history(vendor_name, invoice_numbers)
+    consistency_issues = check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df)
     data = {
         "invoice_id": str(uuid.uuid4()),
+        "invoice_number": "; ".join(invoice_numbers),
         "vendor_name": vendor_name,
         "amount": total_amount,
         "invoice_date": invoice_date,
         desc = item['description']
         # Additional cleaning to ensure no quantity or price data
         desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
+        desc = re.sub(r'\s*Unit\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
+        desc = re.sub(r'\s*Total\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
         cleaned_items.append(desc)
     items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
     print(f"Items string for Salesforce (after cleaning): {items_str}")  # Debug
     # Validate items_str to ensure it contains no quantity or price data
+    if re.search(r'Quantity|Unit Price|Total Price|[₹$]\d+\.\d+', items_str, re.IGNORECASE):
         print(f"ERROR: items_str contains unexpected quantity or price data: {items_str}")
         items_str = "; ".join(item['description'] for item in items)  # Fallback to raw descriptions
         print(f"Fallback items_str: {items_str}")
     output = [
         "## Fraud Detection Summary",
+        f"- **Invoice Number**: {'; '.join(invoice_numbers)}",
+        f"- **Recipient Name**: {recipient_name}",
         f"- **Vendor Name**: {vendor_name}",
         f"- **Invoice Date**: {invoice_date}",
+        f"- **Invoice Amount**: ₹{total_amount:,.2f}",  # Assuming INR for this PDF
     ]
+    # Add items section
+    output.append("- **Items Selected**:")
     if items:
         for item in items:
+            clean_description = re.sub(r'\s*\d+\s*x\s*', '', item['description']).strip()  # Remove "1 x "
+            output.append(f"  - {clean_description}: ₹{item['total_price']:.2f}")
     else:
         output.append("  - No items found")
     if sf is not None:
         try:
             record_data = {
+                "Invoice_Number__c": "; ".join(invoice_numbers),
                 "Vendor_Name__c": vendor_name,
                 "Invoice_Amount__c": total_amount,
                 "Invoice_Date__c": str(invoice_date),