Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Community

Abhisesh7 committed on May 23, 2025

Commit

fbe7682

verified ·

1 Parent(s): db037c1

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -87

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ from datetime import datetime, timedelta
 import re
 import gradio as gr
 from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
-from image_ocr import extract_text_from_image  # Import the image OCR function
 # Load environment variables from .env file
 load_dotenv()
@@ -62,18 +61,17 @@ def extract_text_from_pdf(pdf_file):
 def extract_items(text):
     """Extract items from the invoice table with a simplified approach."""
     items = []
-    # Replace escaped dollar signs and other currency symbols
-    text = text.replace(r'\$', '$').replace('₹', '₹')
     # Split text into lines
     lines = text.split('\n')
     print("Text split into lines:", lines)  # Debug
-    # Find the table header (more flexible matching)
     table_start = -1
     for i, line in enumerate(lines):
-        # Match variations of table headers like "Item Quantity Rate Amount"
-        if re.search(r'Item.*Quantity.*(Rate|Unit\s*Price).*(Amount|Total\s*Price)', line, re.IGNORECASE):
             table_start = i + 1  # Table data starts after the header
             break
@@ -81,10 +79,10 @@ def extract_items(text):
         print("Table header not found.")
         return items
-    # Find the end of the table (before "Subtotal", "Total", "Tax", or end of text)
     table_end = len(lines)
     for i in range(table_start, len(lines)):
-        if any(keyword in lines[i] for keyword in ["Subtotal", "Total", "Tax", "Balance Due", "Promo Code"]):
             table_end = i
             break
@@ -92,28 +90,33 @@ def extract_items(text):
     table_lines = lines[table_start:table_end]
     print("Table lines:", table_lines)  # Debug
-    # Updated pattern to match table rows more accurately
-    # Captures: Description (non-greedy), Quantity (digits), Rate/Unit Price (decimal with optional currency), Amount/Total Price (decimal with optional currency)
-    table_row_pattern = r"^(.*?)\s+(\d+)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)$"
     for line in table_lines:
         line = line.strip()
         if not line:
             continue
         print(f"Processing table row: {line}")  # Debug
         match = re.match(table_row_pattern, line)
         if match:
             description = match.group(1).strip()
             # Clean the description to remove any trailing quantity or price data
             description = re.sub(r'\s*\d+\s*$', '', description).strip()  # Remove trailing numbers
-            description = re.sub(r'\s*(?:₹|[$£€]?)[\d,]+\.?\d*\s*$', '', description).strip()  # Remove trailing prices
             # Skip lines that look like promo codes
             if "Promo Code" in description:
                 print(f"Skipping promo code line: {line}")
                 continue
             quantity = int(match.group(2))
-            unit_price = float(match.group(3).replace(",", ""))
-            total_price = float(match.group(4).replace(",", ""))
             items.append({
                 "description": description,
                 "quantity": quantity,
@@ -131,7 +134,6 @@ def extract_entities(text):
     invoice_number = "Unknown"
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
-    due_date = None  # Default to None
     total_amount = 0.0
     # Extract items first to use as a filter for NER
@@ -139,16 +141,15 @@ def extract_entities(text):
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
-    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)|#?\s*(\d+)"
     vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
-    invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
-    due_date_pattern = r"(?:Due\s*Date|Payment\s*Due\s*Date|Due\s*By)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
-    total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total|Balance\s*Due)\s*[:\-\s]*(?:₹|[$£€])?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
     # Invoice Number
     invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
     if invoice_num_match:
-        invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else (invoice_num_match.group(2) if invoice_num_match.group(2) else invoice_num_match.group(3))
         print(f"Matched Invoice Number: {invoice_number}")  # Debug
     # Vendor Name
@@ -185,39 +186,17 @@ def extract_entities(text):
                     invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                 except ValueError:
                     invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
-            elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
-                invoice_date = datetime.strptime(date_str, "%B %d %Y").date()
             print(f"Matched Invoice Date: {invoice_date}")  # Debug
         except ValueError as e:
             print(f"Failed to parse Invoice Date '{date_str}': {str(e)}")  # Debug
-    # Due Date
-    due_date_match = re.search(due_date_pattern, text, re.IGNORECASE)
-    if due_date_match:
-        date_str = due_date_match.group(1)
-        try:
-            if "/" in date_str:
-                due_date = datetime.strptime(date_str, "%m/%d/%Y").date()
-            elif "," in date_str:
-                due_date = datetime.strptime(date_str, "%B %d, %Y").date()
-            elif "-" in date_str:
-                try:
-                    due_date = datetime.strptime(date_str, "%Y-%m-%d").date()
-                except ValueError:
-                    invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
-            elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
-                due_date = datetime.strptime(date_str, "%B %d %Y").date()
-            print(f"Matched Due Date: {due_date}")  # Debug
-        except ValueError as e:
-            print(f"Failed to parse Due Date '{date_str}': {str(e)}")  # Debug
     # Total Amount
     total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
     if total_amount_match:
         total_amount = float(total_amount_match.group(1).replace(",", ""))
         print(f"Matched Total Amount: {total_amount}")  # Debug
-    return invoice_number, vendor_name, invoice_date, due_date, total_amount
 def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
@@ -312,8 +291,8 @@ def detect_anomalies(df, history_df):
     return df
-def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date, due_date):
-    """Calculate fraud score based on amount, anomalies, text length, consistency issues, invoice date, and due date."""
     score = 0.0
     reasoning = []
     today = datetime.now().date()
@@ -329,10 +308,6 @@ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_ve
         score += 10
         reasoning.append("Invoice date is in the future.")
-    if due_date and due_date < today:
-        score += 10
-        reasoning.append("Due date is in the past.")
     if is_amount_anomaly == -1:
         score += 30
         reasoning.append("Amount flagged as an anomaly.")
@@ -353,23 +328,13 @@ def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_ve
     return min(score, 100), reasoning
-def process_invoice(file_path):
-    """Process a single invoice (PDF or image) and return structured markdown output."""
-    # Determine file type and extract text accordingly
-    if file_path.lower().endswith('.pdf'):
-        text = extract_text_from_pdf(file_path)
-    elif file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
-        # Ensure file_path is a string (Gradio might pass a TempFile object)
-        if hasattr(file_path, 'name'):
-            file_path = file_path.name  # Extract the file path from Gradio's TempFile object
-        text = extract_text_from_image(file_path)
-    else:
-        return "**Error**: Unsupported file type. Please upload a PDF or image (PNG/JPG/JPEG)."
     if "Error" in text:
         return f"**Error**: {text}"
-    invoice_number, vendor_name, invoice_date, due_date, total_amount = extract_entities(text)
     items = extract_items(text)
     text_length = len(text)
@@ -382,7 +347,6 @@ def process_invoice(file_path):
         "vendor_name": vendor_name,
         "amount": total_amount,
         "invoice_date": invoice_date,
-        "due_date": due_date,
         "text_length": text_length
     }
     df = pd.DataFrame([data])
@@ -396,8 +360,7 @@ def process_invoice(file_path):
         df["is_vendor_pattern_anomaly"].iloc[0],
         text_length,
         consistency_issues,
-        invoice_date,
-        due_date
     )
     # Format items for Salesforce (only include item descriptions)
@@ -406,8 +369,8 @@ def process_invoice(file_path):
         desc = item['description']
         # Additional cleaning to ensure no quantity or price data
         desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
-        desc = re.sub(r'\s*(?:Rate|Unit\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
-        desc = re.sub(r'\s*(?:Amount|Total\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
         cleaned_items.append(desc)
     items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
     print(f"Items string for Salesforce (after cleaning): {items_str}")  # Debug
@@ -423,18 +386,9 @@ def process_invoice(file_path):
         f"- **Invoice Number**: {invoice_number}",
         f"- **Vendor Name**: {vendor_name}",
         f"- **Invoice Date**: {invoice_date}",
-    ]
-    # Only add Due Date to output if it exists
-    if due_date:
-        output.append(f"- **Due Date**: {due_date}")
-    else:
-        output.append(f"- **Due Date**: Not specified")
-    output.extend([
-        f"- **Invoice Amount**: ₹{total_amount:,.2f}",
         "- **Items Selected**:",
-    ])
     if items:
         for item in items:
@@ -463,8 +417,6 @@ def process_invoice(file_path):
                 "Vendor_Name__c": vendor_name,
                 "Invoice_Amount__c": total_amount,
                 "Invoice_Date__c": str(invoice_date),
-                # Only include Due_Date__c if due_date exists
-                "Due_Date__c": str(due_date) if due_date else None,
                 "Fraud_Score__c": fraud_score,
                 "Fraud_Reason__c": "; ".join(fraud_reasoning),
                 "Flagged__c": fraud_score > 50,
@@ -480,17 +432,17 @@ def process_invoice(file_path):
     return "\n".join(output)
-def gradio_interface(file):
-    """Gradio interface to process uploaded file (PDF or image) and display structured results."""
-    if file is None:
-        return "Please upload a PDF or image file."
-    result = process_invoice(file)
     return result
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
     gr.Markdown("# Invoice Fraud Detection")
     with gr.Row():
-        file_input = gr.File(label="Upload Invoice (PDF or Image)")
     result_output = gr.Markdown(label="Fraud Detection Results")
     file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)

 import re
 import gradio as gr
 from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
 # Load environment variables from .env file
 load_dotenv()
 def extract_items(text):
     """Extract items from the invoice table with a simplified approach."""
     items = []
+    # Replace escaped dollar signs
+    text = text.replace(r'\$', '$')
     # Split text into lines
     lines = text.split('\n')
     print("Text split into lines:", lines)  # Debug
+    # Find the table header
     table_start = -1
     for i, line in enumerate(lines):
+        if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
             table_start = i + 1  # Table data starts after the header
             break
         print("Table header not found.")
         return items
+    # Find the end of the table (before "Total Amount", "Promo Code", or end of text)
     table_end = len(lines)
     for i in range(table_start, len(lines)):
+        if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
             table_end = i
             break
     table_lines = lines[table_start:table_end]
     print("Table lines:", table_lines)  # Debug
+    # Pattern to match table rows
+    table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
     for line in table_lines:
         line = line.strip()
         if not line:
             continue
+        # Skip alignment rows (e.g., "|---|---|")
+        if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
+            print(f"Skipping alignment row: {line}")
+            continue
+        # Replace alignment markers in the row (e.g., "|---|") with "|"
+        line = re.sub(r'\|\s*---\s*\|', '|', line)
         print(f"Processing table row: {line}")  # Debug
         match = re.match(table_row_pattern, line)
         if match:
             description = match.group(1).strip()
             # Clean the description to remove any trailing quantity or price data
             description = re.sub(r'\s*\d+\s*$', '', description).strip()  # Remove trailing numbers
+            description = re.sub(r'\s*\$?\d+\.\d+\s*$', '', description).strip()  # Remove trailing prices
             # Skip lines that look like promo codes
             if "Promo Code" in description:
                 print(f"Skipping promo code line: {line}")
                 continue
             quantity = int(match.group(2))
+            unit_price = float(match.group(3))
+            total_price = float(match.group(4))
             items.append({
                 "description": description,
                 "quantity": quantity,
     invoice_number = "Unknown"
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
     total_amount = 0.0
     # Extract items first to use as a filter for NER
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
+    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
     vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
+    invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
+    total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
     # Invoice Number
     invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
     if invoice_num_match:
+        invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
         print(f"Matched Invoice Number: {invoice_number}")  # Debug
     # Vendor Name
                     invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                 except ValueError:
                     invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
             print(f"Matched Invoice Date: {invoice_date}")  # Debug
         except ValueError as e:
             print(f"Failed to parse Invoice Date '{date_str}': {str(e)}")  # Debug
     # Total Amount
     total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
     if total_amount_match:
         total_amount = float(total_amount_match.group(1).replace(",", ""))
         print(f"Matched Total Amount: {total_amount}")  # Debug
+    return invoice_number, vendor_name, invoice_date, total_amount
 def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
     return df
+def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
+    """Calculate fraud score based on amount, anomalies, text length, consistency issues, and invoice date."""
     score = 0.0
     reasoning = []
     today = datetime.now().date()
         score += 10
         reasoning.append("Invoice date is in the future.")
     if is_amount_anomaly == -1:
         score += 30
         reasoning.append("Amount flagged as an anomaly.")
     return min(score, 100), reasoning
+def process_invoice(pdf_file):
+    """Process a single invoice PDF and return structured markdown output."""
+    text = extract_text_from_pdf(pdf_file)
     if "Error" in text:
         return f"**Error**: {text}"
+    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
     items = extract_items(text)
     text_length = len(text)
         "vendor_name": vendor_name,
         "amount": total_amount,
         "invoice_date": invoice_date,
         "text_length": text_length
     }
     df = pd.DataFrame([data])
         df["is_vendor_pattern_anomaly"].iloc[0],
         text_length,
         consistency_issues,
+        invoice_date
     )
     # Format items for Salesforce (only include item descriptions)
         desc = item['description']
         # Additional cleaning to ensure no quantity or price data
         desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
+        desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
+        desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
         cleaned_items.append(desc)
     items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
     print(f"Items string for Salesforce (after cleaning): {items_str}")  # Debug
         f"- **Invoice Number**: {invoice_number}",
         f"- **Vendor Name**: {vendor_name}",
         f"- **Invoice Date**: {invoice_date}",
+        f"- **Invoice Amount**: ${total_amount:,.2f}",
         "- **Items Selected**:",
+    ]
     if items:
         for item in items:
                 "Vendor_Name__c": vendor_name,
                 "Invoice_Amount__c": total_amount,
                 "Invoice_Date__c": str(invoice_date),
                 "Fraud_Score__c": fraud_score,
                 "Fraud_Reason__c": "; ".join(fraud_reasoning),
                 "Flagged__c": fraud_score > 50,
     return "\n".join(output)
+def gradio_interface(pdf_file):
+    """Gradio interface to process uploaded PDF and display structured results."""
+    if pdf_file is None:
+        return "Please upload a PDF file."
+    result = process_invoice(pdf_file)
     return result
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
     gr.Markdown("# Invoice Fraud Detection")
     with gr.Row():
+        file_input = gr.File(label="Upload Invoice PDF")
     result_output = gr.Markdown(label="Fraud Detection Results")
     file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)