Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Files Community

Abhisesh7 committed on May 22, 2025

Commit

6ad9a18

verified ·

1 Parent(s): 0c25273

Update app.py

Browse files

Files changed (1) hide show

app.py +436 -22

app.py CHANGED Viewed

@@ -1,44 +1,458 @@
 import os
 import logging
 import gradio as gr
-from pdf_extraction import extract_text_from_pdf
-from image_extraction import extract_text_from_image
-# Set up logging
-logging.basicConfig(level=logging.DEBUG)  # Set to DEBUG for detailed logging
-logger = logging.getLogger(__name__)
-def process_invoice(file):
-    """Extract text from a single invoice (PDF or image) and return it as is."""
     # Determine file type and extract text accordingly
-    file_extension = os.path.splitext(file.name)[1].lower()
-    if file_extension == '.pdf':
-        text = extract_text_from_pdf(file.name)
-    elif file_extension in ['.png', '.jpg', '.jpeg']:
-        text = extract_text_from_image(file.name)
     else:
-        return f"Error: Unsupported file type '{file_extension}'. Please upload a PDF, PNG, or JPG file."
     if "Error" in text:
-        return text
-    # Return the raw extracted text without any additional formatting
-    return text
 def gradio_interface(file):
-    """Gradio interface to process uploaded file (PDF or image) and display raw text."""
     if file is None:
         return "Please upload a PDF or image file."
     result = process_invoice(file)
     return result
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
-    gr.Markdown("# Invoice Text Extraction")
     with gr.Row():
-        file_input = gr.File(label="Upload Invoice (PDF, PNG, JPG)")
-    result_output = gr.Textbox(label="Extracted Text")
     file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
 if __name__ == "__main__":
-    logger.info("Starting the application...")
-    iface.launch(share=True)

 import os
+from dotenv import load_dotenv
 import logging
+import pdfplumber
+import pandas as pd
+import numpy as np
+from transformers import pipeline
+from sklearn.ensemble import IsolationForest
+from sklearn.preprocessing import StandardScaler
+import uuid
+from datetime import datetime, timedelta
+import re
 import gradio as gr
+from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
+from image_ocr import extract_text_from_image  # Import the image OCR function
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Configure the environment for CPU-only execution.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU usage
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # Disable oneDNN optimizations

# Only surface errors from the transformers library (suppresses its warnings).
logging.getLogger("transformers").setLevel(logging.ERROR)

# Salesforce credentials are read from the environment, never hard-coded.
SF_USERNAME = os.getenv("SF_USERNAME")
SF_PASSWORD = os.getenv("SF_PASSWORD")
SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")

# Shared Salesforce client. It stays None whenever a connection cannot be
# established; every consumer in this file checks `sf is None` and degrades
# to running without Salesforce.
sf = None
if None in (SF_USERNAME, SF_PASSWORD, SF_SECURITY_TOKEN):
    # Do not attempt a login with incomplete credentials, and do not echo the
    # account name into the (potentially public) application log.
    print("Salesforce credentials are not fully configured; continuing without Salesforce.")
else:
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_SECURITY_TOKEN
        )
        print("Salesforce login successful.")
    except SalesforceAuthenticationFailed as e:
        print(f"Salesforce authentication failed: {e}")
    except Exception as e:
        # Start-up boundary: a network or configuration failure must not stop
        # the UI from launching, so report it and keep `sf` as None.
        print(f"Salesforce connection failed: {e}")

# Initialize Hugging Face NER pipeline (device=-1 forces CPU).
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF invoice, one page per line block.

    Pages without extractable text contribute an empty string. Any failure is
    reported as a string starting with "Error extracting text: " instead of
    being raised.
    """
    try:
        with pdfplumber.open(pdf_file) as document:
            # Each page's text (or "" when none) is followed by a newline.
            page_chunks = [(page.extract_text() or "") + "\n" for page in document.pages]
        text = "".join(page_chunks)
        print("Extracted text:\n", text)  # Debug: Print extracted text
        return text
    except Exception as exc:
        return f"Error extracting text: {str(exc)}"
# Cell boundary inside a table row: a pipe with optional padding, or plain whitespace.
_CELL_SEP = r"(?:\s*\|\s*|\s+)"
# Monetary cell: optional currency symbol, digits with optional thousands commas and decimals.
_MONEY = r"[$£€]?\s*(\d[\d,]*(?:\.\d+)?)"
# Full row: description, integer quantity, unit price, total price. The row is
# anchored at both ends and the description is lazy, so the last three cells
# are always taken from the end of the line and never leak into the description.
_ITEM_ROW_RE = re.compile(
    r"^\|?\s*([A-Za-z\d][A-Za-z\s\d-]*?)"
    + _CELL_SEP + r"(\d+)"
    + _CELL_SEP + _MONEY
    + _CELL_SEP + _MONEY
    + r"\s*\|?\s*$"
)
# Markdown-style alignment row such as "|---|:---:|".
_ALIGNMENT_ROW_RE = re.compile(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?")


def extract_items(text):
    """Extract line items from the invoice's item table.

    The table is located by a header line containing "Item Description",
    "Quantity", "Unit Price" and "Total Price"; it ends at the first following
    line containing "Total Amount", "Total Due" or "Promo Code" (or at the end
    of the text). Rows may be pipe-delimited or whitespace-separated, and
    prices may carry a currency symbol and thousands separators.

    Args:
        text: Raw invoice text.

    Returns:
        A list of dicts with keys "description" (str), "quantity" (int),
        "unit_price" (float) and "total_price" (float). Empty when no table
        header is found or no row can be parsed.
    """
    items = []
    if not text:
        return items
    # Replace escaped dollar signs
    text = text.replace(r'\$', '$')
    lines = text.split('\n')
    print("Text split into lines:", lines)  # Debug

    # Find the table header; data starts on the line after it.
    table_start = -1
    for i, line in enumerate(lines):
        if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
            table_start = i + 1
            break
    if table_start == -1:
        print("Table header not found.")
        return items

    # Find the end of the table (before "Total Amount", "Total Due", "Promo Code", or end of text)
    table_end = len(lines)
    for i in range(table_start, len(lines)):
        if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
            table_end = i
            break
    print(f"Table section: lines {table_start} to {table_end-1}")  # Debug
    table_lines = lines[table_start:table_end]
    print("Table lines:", table_lines)  # Debug

    for line in table_lines:
        line = line.strip()
        if not line:
            continue
        # Skip alignment rows (e.g., "|---|---|"). The whole line must be an
        # alignment row; a data row that merely starts with "-" is not skipped here.
        if _ALIGNMENT_ROW_RE.fullmatch(line):
            print(f"Skipping alignment row: {line}")
            continue
        # Replace alignment markers in the row (e.g., "|---|") with "|"
        line = re.sub(r'\|\s*---\s*\|', '|', line)
        print(f"Processing table row: {line}")  # Debug
        match = _ITEM_ROW_RE.match(line)
        if not match:
            print(f"Failed to match row: {line}")
            continue
        # The anchored pattern keeps quantity/price cells out of the
        # description, so numeric suffixes such as "USB Hub 3" are preserved.
        description = match.group(1).strip()
        # Skip lines that look like promo codes
        if "Promo Code" in description:
            print(f"Skipping promo code line: {line}")
            continue
        quantity = int(match.group(2))
        unit_price = float(match.group(3).replace(",", ""))
        total_price = float(match.group(4).replace(",", ""))
        items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "total_price": total_price
        })
        print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
    return items
def _parse_invoice_date(date_str):
    """Parse a matched invoice date string; return a ``date`` or ``None`` if no format fits."""
    if "/" in date_str:
        candidates = [(date_str, "%m/%d/%Y")]
    elif "," in date_str:
        # Normalize spacing ("January5,2025" -> "January 5, 2025") because
        # strptime requires the whitespace that the format string contains.
        normalized = re.sub(r"([A-Za-z])\s*(\d)", r"\1 \2", date_str)
        normalized = re.sub(r"\s*,\s*", ", ", normalized)
        # Accept both full ("January") and abbreviated ("Jan") month names.
        candidates = [(normalized, "%B %d, %Y"), (normalized, "%b %d, %Y")]
    elif "-" in date_str:
        candidates = [(date_str, "%Y-%m-%d"), (date_str, "%d-%m-%Y")]
    else:
        candidates = []
    for value, fmt in candidates:
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            continue
    return None


def extract_entities(text):
    """Extract the invoice number, vendor name, invoice date and total amount.

    Each field is located with a regex; the vendor name falls back to the NER
    pipeline when no labelled vendor line is found.

    Args:
        text: Raw invoice text.

    Returns:
        A tuple ``(invoice_number, vendor_name, invoice_date, total_amount)``.
        Fields that cannot be found keep their defaults: "Unknown", "Unknown",
        today's date and 0.0 respectively.
    """
    invoice_number = "Unknown"
    vendor_name = "Unknown"
    invoice_date = datetime.now().date()
    total_amount = 0.0

    # Item descriptions are used to reject NER "organizations" that are really products.
    items = extract_items(text)
    item_descriptions = {item["description"].lower() for item in items}

    # Flexible regex patterns to handle various invoice formats
    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
    vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
    date_value = r"(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
    labelled_date_pattern = r"(?:Invoice\s*Date|Issue\s*Date)\s*[:\-\s]*" + date_value
    generic_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*" + date_value
    # \b keeps "Subtotal" from matching as "total"; the amount must start with
    # a digit so a stray comma can never be captured and passed to float().
    total_amount_pattern = r"\b(?:Total\s*(?:Amount|Due)?|Amount\s*Due)\s*[:\-\s]*[$£€]?\s*(\d[\d,]*(?:\.\d+)?)\s*(?:USD|GBP|EUR)?"

    # Invoice Number
    invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
    if invoice_num_match:
        invoice_number = invoice_num_match.group(1) or invoice_num_match.group(2)
        print(f"Matched Invoice Number: {invoice_number}")  # Debug

    # Vendor Name
    vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
    if vendor_match:
        vendor_name = vendor_match.group(1).strip()
        print(f"Matched Vendor Name (Regex): {vendor_name}")  # Debug
    else:
        # NER fallback for multi-word organization names. Every B-ORG token
        # starts a new name, so the last organization span in the text wins.
        ner_results = ner_pipeline(text)
        org_name_parts = []
        for entity in ner_results:
            label = entity['entity']
            word = entity['word']
            is_begin = label.startswith('B-ORG')
            is_inside = label.startswith('I-ORG')
            if not (is_begin or is_inside):
                continue
            if word.startswith("##") and org_name_parts:
                # "##" marks a continuation of the previous token; glue it on
                # without a space so the word is not split in two.
                org_name_parts[-1] += word[2:]
            elif is_begin:
                org_name_parts = [word]
            elif org_name_parts:
                org_name_parts.append(word)
        if org_name_parts:
            candidate_vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
            if candidate_vendor_name.lower() not in item_descriptions:
                vendor_name = candidate_vendor_name
            print(f"NER Matched Vendor Name: {vendor_name}")  # Debug

    # Invoice Date: prefer an explicit "Invoice Date"/"Issue Date" label so an
    # earlier "Due Date" line is not mistaken for the invoice date.
    invoice_date_match = (
        re.search(labelled_date_pattern, text, re.IGNORECASE)
        or re.search(generic_date_pattern, text, re.IGNORECASE)
    )
    if invoice_date_match:
        date_str = invoice_date_match.group(1)
        parsed_date = _parse_invoice_date(date_str)
        if parsed_date is not None:
            invoice_date = parsed_date
            print(f"Matched Invoice Date: {invoice_date}")  # Debug
        else:
            print(f"Failed to parse Invoice Date '{date_str}'")  # Debug

    # Total Amount
    total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
    if total_amount_match:
        total_amount = float(total_amount_match.group(1).replace(",", ""))
        print(f"Matched Total Amount: {total_amount}")  # Debug

    return invoice_number, vendor_name, invoice_date, total_amount
def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
    """Fetch the vendor's recent invoices from Salesforce.

    Args:
        vendor_name: Vendor to look up. It originates from the uploaded
            invoice, so it is escaped before being placed in the SOQL query.
        invoice_number: Currently unused; kept for interface compatibility.
        time_window_days: How many days back from today to search.

    Returns:
        A DataFrame of matching ``Invoice_Record__c`` rows (at most 100) with
        ``Invoice_Date__c`` converted to ``date`` objects, or an empty
        DataFrame when Salesforce is unavailable or the query fails.
    """
    if sf is None:
        return pd.DataFrame()
    try:
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=time_window_days)
        # Escape backslashes first, then single quotes, so a name such as
        # "O'Reilly" neither breaks the string literal nor injects SOQL.
        safe_vendor_name = str(vendor_name).replace("\\", "\\\\").replace("'", "\\'")
        query = f"""
            SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
            FROM Invoice_Record__c
            WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
            AND Vendor_Name__c = '{safe_vendor_name}'
            LIMIT 100
        """
        result = sf.query(query)
        history_df = pd.DataFrame(result['records'])
        if not history_df.empty:
            history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
        return history_df
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "no history".
        print(f"Failed to fetch vendor history: {str(e)}")
        return pd.DataFrame()
def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
    """Report consistency problems between this invoice and the vendor history.

    The only check performed is whether ``invoice_number`` already appears in
    the ``Invoice_Number__c`` column of ``history_df``. Returns a list of
    human-readable issue strings (empty when nothing is wrong).
    """
    if history_df.empty:
        return []
    already_recorded = (history_df['Invoice_Number__c'] == invoice_number).any()
    if not already_recorded:
        return []
    return [f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."]
def detect_anomalies(df, history_df, min_history=5):
    """Flag amount, frequency and vendor-pattern anomalies for the current invoice.

    Flags follow the scikit-learn convention used by the caller: ``-1`` marks
    an anomaly and ``1`` marks a normal value. ``df`` is modified in place
    (three ``is_*_anomaly`` columns are added) and also returned;
    ``history_df`` is not modified.

    Args:
        df: Frame holding the invoice(s) being scored; must have an "amount" column.
        history_df: The vendor's past invoices with "Invoice_Amount__c" and
            "Invoice_Date__c" columns; may be empty.
        min_history: Minimum number of historical amounts required before the
            amount model is trained. With less history there is no baseline
            and the amount is reported as normal.

    Returns:
        ``df`` with the three anomaly columns populated.
    """
    df["is_amount_anomaly"] = 0
    df["is_frequency_anomaly"] = 0
    df["is_vendor_pattern_anomaly"] = 0

    has_history = not history_df.empty
    if has_history and "Invoice_Amount__c" in history_df.columns:
        # Coerce instead of astype(float) so a malformed amount is dropped rather than raising.
        history_amounts = pd.to_numeric(history_df["Invoice_Amount__c"], errors="coerce").dropna()
    else:
        history_amounts = pd.Series(dtype=float)

    # Amount: train on the vendor's historical amounts and score the current
    # invoice against that baseline. Fitting on the current row alone gives the
    # model nothing to compare the amount with.
    if not df.empty:
        if len(history_amounts) >= max(min_history, 1):
            # Plain arrays avoid feature-name mismatches between the two frames.
            baseline = history_amounts.to_numpy(dtype=float).reshape(-1, 1)
            current = df["amount"].to_numpy(dtype=float).reshape(-1, 1)
            scaler = StandardScaler().fit(baseline)
            model = IsolationForest(contamination=0.05, random_state=42)
            model.fit(scaler.transform(baseline))
            df["is_amount_anomaly"] = model.predict(scaler.transform(current))
        else:
            df["is_amount_anomaly"] = 1

    # Frequency: invoices per day over the observed window plus the spread of
    # submission dates.
    if has_history:
        # Work on a local Series so the caller's history_df keeps its dtypes.
        invoice_dates = pd.to_datetime(history_df['Invoice_Date__c'])
        first_date = invoice_dates.min()
        date_range = (invoice_dates.max() - first_date).days + 1
        frequency = len(history_df) / max(date_range, 1)
        date_diffs = (invoice_dates - first_date).dt.days.to_numpy()
        date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
        frequency_df = pd.DataFrame({
            "frequency": [frequency],
            "date_clustering": [date_clustering]
        })
        # NOTE(review): this model is fit on a single feature row, so it has no
        # population to compare against and presumably never reports -1.
        # A cross-vendor baseline would be needed to make it meaningful - confirm.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        df["is_frequency_anomaly"] = 1

    # Vendor pattern: how far the current amount sits from the vendor's history.
    if len(history_amounts) > 1 and not df.empty:
        mean_amount = history_amounts.mean()
        std_amount = history_amounts.std()
        amount_variance = history_amounts.var()
        current_amount = df["amount"].iloc[0]
        deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
        invoice_count = len(history_df)
        vendor_pattern_df = pd.DataFrame({
            "amount_deviation": [deviation],
            "invoice_count": [invoice_count],
            "amount_variance": [amount_variance]
        })
        # NOTE(review): single-row fit again (see the frequency model) - confirm
        # whether a threshold on `deviation` was intended instead.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        df["is_vendor_pattern_anomaly"] = 1
    return df
def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
    """Combine the individual fraud signals into a score capped at 100.

    Each triggered rule adds a fixed number of points and one explanatory
    sentence; every consistency issue adds 15 points and is listed verbatim.
    Returns ``(score, reasoning)`` where ``reasoning`` is ordered by rule.
    """
    is_high_amount = amount > 5000
    # (triggered, points, explanation) evaluated in reporting order.
    rules = [
        (is_high_amount, 40, "High invoice amount detected."),
        (not is_high_amount and amount < 10, 20, "Unusually low invoice amount."),
        (invoice_date > datetime.now().date(), 10, "Invoice date is in the future."),
        (is_amount_anomaly == -1, 30, "Amount flagged as an anomaly."),
        (is_frequency_anomaly == -1, 25, "Unusual invoice submission frequency or clustering detected."),
        (is_vendor_pattern_anomaly == -1, 25, "Unusual vendor pattern detected (amount deviation, frequency, or variance)."),
        (text_length > 500, 10, "Excessive text length in invoice."),
    ]

    score = 0.0
    reasoning = []
    for triggered, points, explanation in rules:
        if triggered:
            score += points
            reasoning.append(explanation)

    if consistency_issues:
        score += 15 * len(consistency_issues)
        reasoning.extend(consistency_issues)

    return min(score, 100), reasoning
def process_invoice(file_path):
    """Process a single invoice (PDF or image) and return a markdown report.

    Extracts the text, pulls out the invoice fields and line items, scores the
    invoice for fraud, stores the result in Salesforce when a connection is
    available, and returns the report.

    Args:
        file_path: Path to the uploaded file, either as a string / path object
            or as an upload object exposing the path through ``.name``.

    Returns:
        Markdown text: the fraud detection summary, or a line starting with
        "**Error**" when the file type is unsupported or no text was extracted.
    """
    unsupported_message = "**Error**: Unsupported file type. Please upload a PDF or image (PNG/JPG/JPEG)."

    # Accept both a plain path and an upload wrapper with a ``name`` attribute.
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = getattr(file_path, "name", None)
    if not isinstance(path, str):
        return unsupported_message

    # Determine file type and extract text accordingly
    lowered_path = path.lower()
    if lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(path)
    elif lowered_path.endswith(('.png', '.jpg', '.jpeg')):
        text = extract_text_from_image(path)
    else:
        return unsupported_message

    if not isinstance(text, str) or not text.strip():
        # Nothing to analyse; do not score or store an empty invoice.
        return "**Error**: No text could be extracted from the uploaded file."
    # extract_text_from_pdf reports failure with a message that begins with
    # "Error"; extract_text_from_image is assumed to follow the same
    # convention (TODO confirm). Checking the prefix rather than the whole
    # text keeps invoices that merely mention the word "Error" from being rejected.
    if text.lstrip().startswith("Error"):
        return f"**Error**: {text}"

    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
    items = extract_items(text)
    text_length = len(text)
    history_df = fetch_vendor_history(vendor_name, invoice_number)
    consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
    data = {
        "invoice_id": str(uuid.uuid4()),
        "invoice_number": invoice_number,
        "vendor_name": vendor_name,
        "amount": total_amount,
        "invoice_date": invoice_date,
        "text_length": text_length
    }
    df = pd.DataFrame([data])
    df = detect_anomalies(df, history_df)
    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0],
        df["is_amount_anomaly"].iloc[0],
        df["is_frequency_anomaly"].iloc[0],
        df["is_vendor_pattern_anomaly"].iloc[0],
        text_length,
        consistency_issues,
        invoice_date
    )
    # Single source of truth for the flag shown in the report and stored in Salesforce.
    flagged = fraud_score > 50
    status = "Flagged" if flagged else "Cleared"

    # Format items for Salesforce (only include item descriptions)
    cleaned_items = []
    for item in items:
        desc = item['description']
        # Additional cleaning to ensure no quantity or price data
        desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
        desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
        desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
        cleaned_items.append(desc)
    items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
    print(f"Items string for Salesforce (after cleaning): {items_str}")  # Debug
    # Validate items_str to ensure it contains no quantity or price data
    if re.search(r'Quantity|Unit Price|Total Price|\$\d+\.\d+', items_str, re.IGNORECASE):
        print(f"ERROR: items_str contains unexpected quantity or price data: {items_str}")
        items_str = "; ".join(item['description'] for item in items)  # Fallback to raw descriptions
        print(f"Fallback items_str: {items_str}")

    output = [
        "## Fraud Detection Summary",
        f"- **Invoice Number**: {invoice_number}",
        f"- **Vendor Name**: {vendor_name}",
        f"- **Invoice Date**: {invoice_date}",
        f"- **Invoice Amount**: ${total_amount:,.2f}",
        "- **Items Selected**:",
    ]
    if items:
        for item in items:
            clean_description = re.sub(r'\s*\d+\s*\d*$', '', item['description']).strip()
            output.append(f"  - {clean_description}")
    else:
        output.append("  - No items found")
    output.extend([
        f"- **Fraud Score**: {fraud_score}",
        f"- **Status**: {status}",
        f"- **Flagged**: {flagged}",
        "",
        "## Fraud Reasoning"
    ])
    if fraud_reasoning:
        output.extend([f"- {reason}" for reason in fraud_reasoning])
    else:
        output.append("- No specific fraud indicators detected")

    if sf is not None:
        # Best effort: a failed write is reported but never hides the report from the user.
        try:
            record_data = {
                "Invoice_Number__c": invoice_number,
                "Vendor_Name__c": vendor_name,
                "Invoice_Amount__c": total_amount,
                "Invoice_Date__c": str(invoice_date),
                "Fraud_Score__c": fraud_score,
                "Fraud_Reason__c": "; ".join(fraud_reasoning),
                "Flagged__c": flagged,
                "Status__c": status,
                "Items_Selected__c": items_str
            }
            print(f"Record data being sent to Salesforce: {record_data}")  # Debug
            sf.Invoice_Record__c.create(record_data)
            print(f"Successfully created Salesforce record with Items_Selected__c: {items_str}")  # Debug
        except Exception as e:
            print(f"Failed to create Salesforce record: {str(e)}")
    return "\n".join(output)
 def gradio_interface(file):
+    """Gradio interface to process uploaded file (PDF or image) and display structured results."""
     if file is None:
         return "Please upload a PDF or image file."
     result = process_invoice(file)
     return result
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
+    gr.Markdown("# Invoice Fraud Detection")
     with gr.Row():
+        file_input = gr.File(label="Upload Invoice (PDF or Image)")
+    result_output = gr.Markdown(label="Fraud Detection Results")
     file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
 if __name__ == "__main__":
+    iface.launch()