Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Files Community

Abhisesh7 committed on May 20, 2025

Commit

087f2ac

verified ·

1 Parent(s): b03d28f

Update app.py

Browse files

Files changed (1) hide show

app.py +295 -487

app.py CHANGED Viewed

@@ -20,16 +20,15 @@ load_dotenv()
 os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU usage
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # Disable oneDNN optimizations
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 # Read Salesforce credentials from environment variables
 SF_USERNAME = os.getenv("SF_USERNAME")
 SF_PASSWORD = os.getenv("SF_PASSWORD")
 SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
-logger.info(f"Salesforce login info: username={SF_USERNAME}")
 # Salesforce connection with error handling
 try:
@@ -38,221 +37,158 @@ try:
         password=SF_PASSWORD,
         security_token=SF_SECURITY_TOKEN
     )
-    logger.info("Salesforce login successful.")
 except SalesforceAuthenticationFailed as e:
-    logger.error(f"Salesforce authentication failed: {e}")
-    sf = None
-except Exception as e:
-    logger.error(f"Unexpected error during Salesforce connection: {e}")
     sf = None
 # Initialize Hugging Face NER pipeline (force CPU)
-try:
-    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
-    logger.info("NER pipeline initialized successfully.")
-except Exception as e:
-    logger.error(f"Failed to initialize NER pipeline: {e}")
-    ner_pipeline = None
 def extract_text_from_pdf(pdf_file):
-    """Extract text from a PDF invoice with error handling."""
     try:
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
             for page in pdf.pages:
                 page_text = page.extract_text() or ""
                 text += page_text + "\n"
-        logger.info("Extracted text:\n%s", text)
         return text
     except Exception as e:
-        logger.error("Error extracting text: %s", str(e))
         return f"Error extracting text: {str(e)}"
 def extract_items(text):
-    """Extract items from the invoice table step by step with enhanced robustness."""
     items = []
-    try:
-        # Replace escaped dollar signs and normalize text
-        text = text.replace(r'\$', '$')
-        text = re.sub(r'\s+', ' ', text)  # Normalize spaces
-        logger.info("Step 1: Splitting text into lines")
-        lines = text.split('\n')
-        logger.info("Text split into lines: %s", lines)
-        # Step 2: Find the table header
-        logger.info("Step 2: Locating table header")
-        table_start = -1
-        for i, line in enumerate(lines):
-            line = line.strip()
-            if not line:
-                continue
-            # Look for common table header patterns
-            if ("Item Description" in line or "Description" in line) and "Quantity" in line and "Price" in line:
-                table_start = i + 1
-                logger.info("Table header found at line %d: %s", i, line)
-                break
-        if table_start == -1:
-            logger.warning("Table header not found.")
-            return items
-        # Step 3: Find the end of the table
-        logger.info("Step 3: Locating table end")
-        table_end = len(lines)
-        for i in range(table_start, len(lines)):
-            line = lines[i].strip()
-            if not line:
-                continue
-            if re.search(r"Total\s*(Amount|Due|Price|Cost)", line, re.IGNORECASE):
-                table_end = i
-                logger.info("Table end found at line %d: %s", i, line)
-                break
-        logger.info("Table section identified: lines %d to %d", table_start, table_end-1)
-        table_lines = lines[table_start:table_end]
-        logger.info("Table lines: %s", table_lines)
-        # Step 4: Process each row step by step
-        logger.info("Step 4: Processing table rows one by one")
-        # Enhanced regex to handle variations (e.g., missing pipes, extra spaces)
-        table_row_pattern = r"(?:\|?\s*|\b)([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|?\s*|\s+)(\d+)\s*(?:\|?\s*|\s+)([\d.]+)\s*(?:\|?\s*|\s+)([\d.]+)(?:\s*\|?\s*|\b)"
-        for row_idx, line in enumerate(table_lines, 1):
-            line = line.strip()
-            if not line:
-                logger.info("Row %d: Skipping empty row", row_idx)
-                continue
-            # Skip alignment rows (e.g., "|---|---|")
-            if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
-                logger.info("Row %d: Skipping alignment row: %s", row_idx, line)
-                continue
-            # Replace alignment markers in the row
-            line = re.sub(r'\|\s*---\s*\|', '|', line)
-            logger.info("Row %d: Processing row: %s", row_idx, line)
-            # Step 4a: Apply regex to extract item details
-            match = re.match(table_row_pattern, line)
-            if not match:
-                logger.warning("Row %d: Failed to match row: %s", row_idx, line)
-                continue
-            # Step 4b: Extract and validate item details
             description = match.group(1).strip()
-            try:
-                quantity = int(match.group(2))
-                unit_price = float(match.group(3))
-                total_price = float(match.group(4))
-            except ValueError as e:
-                logger.warning("Row %d: Failed to parse numbers: %s", row_idx, str(e))
-                continue
-            # Step 4c: Validate the extracted values
-            if quantity <= 0 or unit_price < 0 or total_price < 0:
-                logger.warning("Row %d: Invalid values (non-positive quantity, negative unit price, or total price): %s", row_idx, line)
-                continue
-            # Check if total_price ≈ quantity × unit_price
-            expected_total = quantity * unit_price
-            if abs(expected_total - total_price) > 0.01:
-                logger.warning("Row %d: Total price mismatch: Expected %.2f, Got %.2f for %s", row_idx, expected_total, total_price, description)
-                continue
-            # Step 4d: Add the item to the list
-            item = {
                 "description": description,
                 "quantity": quantity,
                 "unit_price": unit_price,
                 "total_price": total_price
-            }
-            items.append(item)
-            logger.info("Row %d: Successfully extracted item: %s, Qty: %d, Unit Price: $%.2f, Total Price: $%.2f",
-                        row_idx, description, quantity, unit_price, total_price)
-        # Step 5: Return the extracted items
-        logger.info("Step 5: Extraction complete. Total items extracted: %d", len(items))
-        return items
-    except Exception as e:
-        logger.error("Unexpected error in extract_items: %s", str(e))
-        return items
 def extract_entities(text):
-    """Extract structured invoice details with enhanced robustness."""
     invoice_number = "Unknown"
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
     total_amount = 0.0
-    try:
-        # Invoice Number
-        invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
-        invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
-        if invoice_num_match:
-            invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
-            logger.info("Matched Invoice Number: %s", invoice_number)
-        # Vendor Name
-        vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
-        vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
-        if vendor_match:
-            vendor_name = vendor_match.group(1).strip()
-            logger.info("Matched Vendor Name (Regex): %s", vendor_name)
-        elif ner_pipeline:
-            try:
-                ner_results = ner_pipeline(text)
-                org_name_parts = []
-                for i, entity in enumerate(ner_results):
-                    if entity['entity'].startswith('B-ORG'):
-                        org_name_parts = [entity['word']]
-                    elif entity['entity'].startswith('I-ORG') and org_name_parts:
-                        org_name_parts.append(entity['word'])
-                if org_name_parts:
-                    vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
-                    logger.info("NER Matched Vendor Name: %s", vendor_name)
-            except Exception as e:
-                logger.warning("NER failed for vendor name: %s", str(e))
-        # Invoice Date
-        # Support multiple date formats
-        invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|\d{1,2}\s*[A-Za-z]+\s*\d{4})"
-        invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
-        if invoice_date_match:
-            date_str = invoice_date_match.group(1)
-            date_formats = [
-                "%Y-%m-%d", "%d/%m/%Y", "%d-%m-%Y", "%B %d, %Y", "%d %B %Y"
-            ]
-            for date_format in date_formats:
                 try:
-                    invoice_date = datetime.strptime(date_str, date_format).date()
-                    logger.info("Matched Invoice Date: %s", invoice_date)
-                    break
                 except ValueError:
-                    continue
-            else:
-                logger.warning("Failed to parse Invoice Date: %s", date_str)
-        # Total Amount
-        total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
-        total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
-        if total_amount_match:
-            try:
-                total_amount = float(total_amount_match.group(1).replace(",", ""))
-                logger.info("Matched Total Amount: %.2f", total_amount)
-            except ValueError as e:
-                logger.warning("Failed to parse Total Amount: %s", str(e))
-        return invoice_number, vendor_name, invoice_date, total_amount
-    except Exception as e:
-        logger.error("Unexpected error in extract_entities: %s", str(e))
-        return invoice_number, vendor_name, invoice_date, total_amount
 def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
     if sf is None:
-        logger.warning("Salesforce connection not available.")
         return pd.DataFrame()
     try:
@@ -260,7 +196,7 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
         start_date = end_date - timedelta(days=time_window_days)
         query = f"""
-            SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c, Items_Selected__c
             FROM Invoice_Record__c
             WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
             AND Vendor_Name__c = '{vendor_name}'
@@ -272,337 +208,209 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
         history_df = pd.DataFrame(records)
         if not history_df.empty:
             history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
-            logger.info("Fetched %d historical records for vendor %s", len(history_df), vendor_name)
-        else:
-            logger.info("No historical records found for vendor %s", vendor_name)
         return history_df
     except Exception as e:
-        logger.error("Failed to fetch vendor history: %s", str(e))
         return pd.DataFrame()
 def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
     """Check for data consistency issues like duplicates."""
     consistency_issues = []
-    try:
-        if not history_df.empty:
-            duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
-            if not duplicate_invoices.empty:
-                issue = f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'."
-                consistency_issues.append(issue)
-                logger.warning(issue)
-        return consistency_issues
-    except Exception as e:
-        logger.error("Error in check_data_consistency: %s", str(e))
-        return consistency_issues
-def parse_items_to_features(items_str):
-    """Parse the Items_Selected__c field into features for anomaly detection."""
-    try:
-        if not items_str or items_str == "No items found":
-            return 0, 0, 0
-        max_quantity = 0
-        total_unit_price = 0.0
-        total_items = 0
-        items = items_str.split("; ")
-        for item in items:
-            if not item:
-                continue
-            try:
-                quantity_match = re.search(r"Quantity (\d+)", item)
-                unit_price_match = re.search(r"Unit Price \$([\d.]+)", item)
-                if quantity_match and unit_price_match:
-                    quantity = int(quantity_match.group(1))
-                    unit_price = float(unit_price_match.group(1))
-                    max_quantity = max(max_quantity, quantity)
-                    total_unit_price += unit_price
-                    total_items += 1
-            except Exception as e:
-                logger.warning("Error parsing item '%s': %s", item, str(e))
-                continue
-        avg_unit_price = total_unit_price / total_items if total_items > 0 else 0
-        return max_quantity, avg_unit_price, total_items
-    except Exception as e:
-        logger.error("Error in parse_items_to_features: %s", str(e))
-        return 0, 0, 0
-def detect_anomalies(df, history_df, items):
-    """Detect anomalies with improved handling for small datasets."""
     df["is_amount_anomaly"] = 0
     df["is_frequency_anomaly"] = 0
     df["is_vendor_pattern_anomaly"] = 0
-    df["is_item_anomaly"] = 0
-    try:
-        # Amount anomaly detection
-        if not history_df.empty:
-            historical_amounts = history_df["Invoice_Amount__c"].astype(float).values
-            current_amount = df["amount"].iloc[0]
-            amounts = np.append(historical_amounts, current_amount)
-            if len(amounts) > 1:  # Need at least 2 data points for meaningful anomaly detection
-                amounts_df = pd.DataFrame({"amount": amounts})
-                scaler = StandardScaler()
-                X_scaled = scaler.fit_transform(amounts_df[["amount"]])
-                model = IsolationForest(contamination=0.05, random_state=42)
-                predictions = model.fit_predict(X_scaled)
-                df["is_amount_anomaly"] = predictions[-1]
-                logger.info("Amount anomaly detection completed: %d", df["is_amount_anomaly"].iloc[0])
-            else:
-                logger.info("Not enough data for amount anomaly detection.")
-        # Frequency anomaly detection
-        if not history_df.empty:
-            history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c'])
-            date_range = (history_df['Invoice_Date__c'].max() - history_df['Invoice_Date__c'].min()).days + 1
-            frequency = len(history_df) / max(date_range, 1)
-            date_diffs = [(d - history_df['Invoice_Date__c'].min()).days for d in history_df['Invoice_Date__c']]
-            date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
-            frequency_df = pd.DataFrame({
-                "frequency": [frequency],
-                "date_clustering": [date_clustering]
-            })
-            scaler = StandardScaler()
-            X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
-            model = IsolationForest(contamination=0.05, random_state=42)
-            df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
-            logger.info("Frequency anomaly detection completed: %d", df["is_frequency_anomaly"].iloc[0])
-        else:
-            df["is_frequency_anomaly"] = 1
-            logger.info("No historical data for frequency anomaly detection.")
-        # Vendor pattern anomaly detection
-        if not history_df.empty and len(history_df) > 1:
-            historical_amounts = history_df["Invoice_Amount__c"].astype(float)
-            mean_amount = historical_amounts.mean()
-            std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
-            amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0
-            current_amount = df["amount"].iloc[0]
-            deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
-            invoice_count = len(history_df)
-            vendor_pattern_df = pd.DataFrame({
-                "amount_deviation": [deviation],
-                "invoice_count": [invoice_count],
-                "amount_variance": [amount_variance]
-            })
-            scaler = StandardScaler()
-            X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
-            model = IsolationForest(contamination=0.05, random_state=42)
-            df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
-            logger.info("Vendor pattern anomaly detection completed: %d", df["is_vendor_pattern_anomaly"].iloc[0])
-        else:
-            df["is_vendor_pattern_anomaly"] = 1
-            logger.info("Not enough data for vendor pattern anomaly detection.")
-        # Item-level anomaly detection
-        if not history_df.empty:
-            historical_max_quantities = []
-            historical_avg_unit_prices = []
-            historical_total_items = []
-            for items_str in history_df["Items_Selected__c"]:
-                max_qty, avg_price, total_items = parse_items_to_features(items_str)
-                historical_max_quantities.append(max_qty)
-                historical_avg_unit_prices.append(avg_price)
-                historical_total_items.append(total_items)
-            current_max_quantity = max(item["quantity"] for item in items) if items else 0
-            current_avg_unit_price = sum(item["unit_price"] for item in items) / len(items) if items else 0
-            current_total_items = len(items)
-            item_features = pd.DataFrame({
-                "max_quantity": historical_max_quantities + [current_max_quantity],
-                "avg_unit_price": historical_avg_unit_prices + [current_avg_unit_price],
-                "total_items": historical_total_items + [current_total_items]
-            })
-            if len(item_features) > 1:
-                scaler = StandardScaler()
-                X_scaled = scaler.fit_transform(item_features[["max_quantity", "avg_unit_price", "total_items"]])
-                model = IsolationForest(contamination=0.05, random_state=42)
-                predictions = model.fit_predict(X_scaled)
-                df["is_item_anomaly"] = predictions[-1]
-                logger.info("Item anomaly detection completed: %d", df["is_item_anomaly"].iloc[0])
-            else:
-                logger.info("Not enough data for item anomaly detection.")
-        return df
-    except Exception as e:
-        logger.error("Error in detect_anomalies: %s", str(e))
-        return df
-def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, is_item_anomaly, text_length, consistency_issues, invoice_date, items):
-    """Calculate fraud score with additional validation rules."""
     score = 0.0
     reasoning = []
-    try:
-        today = datetime.now().date()
-        if amount > 5000:
-            score += 40
-            reasoning.append("High invoice amount detected.")
-        elif amount < 10:
-            score += 20
-            reasoning.append("Unusually low invoice amount.")
-        if invoice_date > today:
-            score += 10
-            reasoning.append("Invoice date is in the future.")
-        if is_amount_anomaly == -1:
-            score += 30
-            reasoning.append("Amount flagged as an anomaly.")
-        if is_frequency_anomaly == -1:
-            score += 25
-            reasoning.append("Unusual invoice submission frequency or clustering detected.")
-        if is_vendor_pattern_anomaly == -1:
-            score += 25
-            reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")
-        if is_item_anomaly == -1:
-            score += 20
-            reasoning.append("Unusual item patterns detected (quantity, unit price, or number of items).")
-        if text_length > 500:
-            score += 5
-            reasoning.append("Excessive text length in invoice.")
-        if consistency_issues:
-            score += 15 * len(consistency_issues)
-            reasoning.extend(consistency_issues)
         for item in items:
-            # High quantity rule
-            if item["quantity"] > 10:
-                score += 10
-                reasoning.append(f"High quantity detected for item '{item['description']}' (Quantity: {item['quantity']}).")
-                break
-            # New rule: High unit price
-            if item["unit_price"] > 500:
-                score += 15
-                reasoning.append(f"High unit price detected for item '{item['description']}' (Unit Price: ${item['unit_price']:.2f}).")
-                break
-        fraud_score = min(score, 100)
-        logger.info("Fraud score calculated: %.2f with reasons: %s", fraud_score, reasoning)
-        return fraud_score, reasoning
-    except Exception as e:
-        logger.error("Error in calculate_fraud_score: %s", str(e))
-        return score, reasoning
-def process_invoice(pdf_file):
-    """Process a single invoice PDF with comprehensive error handling."""
-    try:
-        text = extract_text_from_pdf(pdf_file)
-        if "Error" in text:
-            return f"**Error**: {text}"
-        invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
-        items = extract_items(text)
-        text_length = len(text)
-        history_df = fetch_vendor_history(vendor_name, invoice_number)
-        consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
-        data = {
-            "invoice_id": str(uuid.uuid4()),
-            "invoice_number": invoice_number,
-            "vendor_name": vendor_name,
-            "amount": total_amount,
-            "invoice_date": invoice_date,
-            "text_length": text_length
-        }
-        df = pd.DataFrame([data])
-        df = detect_anomalies(df, history_df, items)
-        fraud_score, fraud_reasoning = calculate_fraud_score(
-            df["amount"].iloc[0],
-            df["is_amount_anomaly"].iloc[0],
-            df["is_frequency_anomaly"].iloc[0],
-            df["is_vendor_pattern_anomaly"].iloc[0],
-            df["is_item_anomaly"].iloc[0],
-            text_length,
-            consistency_issues,
-            invoice_date,
-            items
-        )
-        items_str = "; ".join(
-            f"{item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}"
-            for item in items
-        ) if items else "No items found"
-        output = [
-            "## Fraud Detection Summary",
-            f"- **Invoice Number**: {invoice_number}",
-            f"- **Vendor Name**: {vendor_name}",
-            f"- **Invoice Date**: {invoice_date}",
-            f"- **Invoice Amount**: ${total_amount:,.2f}",
-            "- **Items Selected**:",
-        ]
-        if items:
-            for item in items:
-                output.append(f"  - {item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}")
-        else:
-            output.append("  - No items found")
-        output.extend([
-            f"- **Fraud Score**: {fraud_score}",
-            f"- **Status**: {'Flagged' if fraud_score > 50 else 'Cleared'}",
-            f"- **Flagged**: {fraud_score > 50}",
-            "",
-            "## Fraud Reasoning"
-        ])
-        if fraud_reasoning:
-            output.extend([f"- {reason}" for reason in fraud_reasoning])
-        else:
-            output.append("- No specific fraud indicators detected")
-        if sf is not None:
-            try:
-                sf.Invoice_Record__c.create({
-                    "Invoice_Number__c": invoice_number if invoice_number != "Unknown" else "",
-                    "Vendor_Name__c": vendor_name if vendor_name != "Unknown" else "",
-                    "Invoice_Amount__c": float(total_amount) if total_amount is not None else 0.0,
-                    "Invoice_Date__c": str(invoice_date) if invoice_date else "",
-                    "Fraud_Score__c": float(fraud_score) if fraud_score is not None else 0.0,
-                    "Fraud_Reason__c": "; ".join(fraud_reasoning) if fraud_reasoning else "",
-                    "Flagged__c": fraud_score > 50,
-                    "Status__c": "Flagged" if fraud_score > 50 else "Cleared",
-                    "Items_Selected__c": items_str
-                })
-                logger.info("Successfully created Salesforce record with Items_Selected__c: %s", items_str)
-            except Exception as e:
-                logger.error("Failed to create Salesforce record: %s", str(e))
-        return "\n".join(output)
-    except Exception as e:
-        logger.error("Unexpected error in process_invoice: %s", str(e))
-        return f"**Error**: An unexpected error occurred: {str(e)}"
 def gradio_interface(pdf_file):
     """Gradio interface to process uploaded PDF and display structured results."""
     if pdf_file is None:
         return "Please upload a PDF file."
-    try:
-        result = process_invoice(pdf_file)
-        return result
-    except Exception as e:
-        logger.error("Error in gradio_interface: %s", str(e))
-        return f"**Error**: {str(e)}"
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
     gr.Markdown("# Invoice Fraud Detection")

 os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU usage
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # Disable oneDNN optimizations
+# Set up logging to suppress transformers warnings
+logging.getLogger("transformers").setLevel(logging.ERROR)
 # Read Salesforce credentials from environment variables
 SF_USERNAME = os.getenv("SF_USERNAME")
 SF_PASSWORD = os.getenv("SF_PASSWORD")
 SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
+print(f"Salesforce login info: username={SF_USERNAME}")
 # Salesforce connection with error handling
 try:
         password=SF_PASSWORD,
         security_token=SF_SECURITY_TOKEN
     )
+    print("Salesforce login successful.")
 except SalesforceAuthenticationFailed as e:
+    print(f"Salesforce authentication failed: {e}")
     sf = None
 # Initialize Hugging Face NER pipeline (force CPU)
+ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=-1)
 def extract_text_from_pdf(pdf_file):
+    """Extract text from a PDF invoice."""
     try:
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
             for page in pdf.pages:
                 page_text = page.extract_text() or ""
                 text += page_text + "\n"
+        print("Extracted text:\n", text)  # Debug: Print extracted text
         return text
     except Exception as e:
         return f"Error extracting text: {str(e)}"
 def extract_items(text):
+    """Extract items from the invoice table with a simplified approach."""
     items = []
+    # Replace escaped dollar signs
+    text = text.replace(r'\$', '$')
+    # Split text into lines
+    lines = text.split('\n')
+    print("Text split into lines:", lines)  # Debug
+    # Find the table header
+    table_start = -1
+    for i, line in enumerate(lines):
+        if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
+            table_start = i + 1  # Table data starts after the header
+            break
+    if table_start == -1:
+        print("Table header not found.")
+        return items
+    # Find the end of the table (before "Total Amount" or end of text)
+    table_end = len(lines)
+    for i in range(table_start, len(lines)):
+        if "Total Amount" in lines[i] or "Total Due" in lines[i]:
+            table_end = i
+            break
+    print(f"Table section: lines {table_start} to {table_end-1}")  # Debug
+    table_lines = lines[table_start:table_end]
+    print("Table lines:", table_lines)  # Debug
+    # Pattern to match table rows
+    # Simplified to handle multi-word descriptions and flexible spacing
+    table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
+    for line in table_lines:
+        line = line.strip()
+        if not line:
+            continue
+        # Skip alignment rows (e.g., "|---|---|")
+        if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
+            print(f"Skipping alignment row: {line}")
+            continue
+        # Replace alignment markers in the row (e.g., "|---|") with "|"
+        line = re.sub(r'\|\s*---\s*\|', '|', line)
+        print(f"Processing table row: {line}")  # Debug
+        match = re.match(table_row_pattern, line)
+        if match:
             description = match.group(1).strip()
+            quantity = int(match.group(2))
+            unit_price = float(match.group(3))
+            total_price = float(match.group(4))
+            items.append({
                 "description": description,
                 "quantity": quantity,
                 "unit_price": unit_price,
                 "total_price": total_price
+            })
+            print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
+        else:
+            print(f"Failed to match row: {line}")
+    return items
 def extract_entities(text):
+    """Extract structured invoice details using flexible regex patterns."""
     invoice_number = "Unknown"
     vendor_name = "Unknown"
     invoice_date = datetime.now().date()
     total_amount = 0.0
+    # Flexible regex patterns to handle various invoice formats
+    invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
+    vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Invoice|No\.?|Date|$|\d))"
+    invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
+    total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
+    # Invoice Number
+    invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
+    if invoice_num_match:
+        invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
+        print(f"Matched Invoice Number: {invoice_number}")  # Debug
+    # Vendor Name
+    vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
+    if vendor_match:
+        vendor_name = vendor_match.group(1).strip()
+        print(f"Matched Vendor Name (Regex): {vendor_name}")  # Debug
+    else:
+        # Enhanced NER fallback for multi-word organization names
+        ner_results = ner_pipeline(text)
+        org_name_parts = []
+        for i, entity in enumerate(ner_results):
+            if entity['entity'].startswith('B-ORG'):
+                org_name_parts = [entity['word']]
+            elif entity['entity'].startswith('I-ORG') and org_name_parts:
+                org_name_parts.append(entity['word'])
+        if org_name_parts:
+            vendor_name = " ".join(part.replace("##", "") for part in org_name_parts)
+            print(f"NER Matched Vendor Name: {vendor_name}")  # Debug
+    # Invoice Date
+    invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
+    if invoice_date_match:
+        date_str = invoice_date_match.group(1)
+        try:
+            if "/" in date_str:
+                invoice_date = datetime.strptime(date_str, "%m/%d/%Y").date()
+            elif "," in date_str:
+                invoice_date = datetime.strptime(date_str, "%B %d, %Y").date()
+            elif "-" in date_str:
                 try:
+                    invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                 except ValueError:
+                    invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
+            print(f"Matched Invoice Date: {invoice_date}")  # Debug
+        except ValueError as e:
+            print(f"Failed to parse Invoice Date '{date_str}': {str(e)}")  # Debug
+    # Total Amount
+    total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
+    if total_amount_match:
+        total_amount = float(total_amount_match.group(1).replace(",", ""))
+        print(f"Matched Total Amount: {total_amount}")  # Debug
+    return invoice_number, vendor_name, invoice_date, total_amount
 def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
     """Fetch historical invoices for the vendor from Salesforce."""
     if sf is None:
         return pd.DataFrame()
     try:
         start_date = end_date - timedelta(days=time_window_days)
         query = f"""
+            SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
             FROM Invoice_Record__c
             WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
             AND Vendor_Name__c = '{vendor_name}'
         history_df = pd.DataFrame(records)
         if not history_df.empty:
             history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
         return history_df
     except Exception as e:
+        print(f"Failed to fetch vendor history: {str(e)}")
         return pd.DataFrame()
 def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
     """Check for data consistency issues like duplicates."""
     consistency_issues = []
+    if not history_df.empty:
+        duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
+        if not duplicate_invoices.empty:
+            consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
+    return consistency_issues
def detect_anomalies(df, history_df):
    """Detect anomalies in amount, frequency, and vendor patterns.

    Three flag columns are added to ``df``: ``is_amount_anomaly``,
    ``is_frequency_anomaly`` and ``is_vendor_pattern_anomaly``. Each is filled
    from ``IsolationForest.fit_predict`` when the required data is present.
    The frequency flag is set to 1 when there is no history, and the vendor
    pattern flag is set to 1 when there are fewer than two historical rows.

    Args:
        df: DataFrame holding the current invoice, with an ``amount`` column.
            It is modified in place and also returned.
        history_df: DataFrame of earlier invoices with ``Invoice_Date__c`` and
            ``Invoice_Amount__c`` columns (may be empty). It is only read;
            date parsing works on a local copy of the date column so the
            caller's frame keeps its original values.

    Returns:
        ``df`` with the three flag columns populated.
    """
    # NOTE(review): every IsolationForest below is fit on a single-row frame
    # (the current invoice, or one row of aggregate features). A model trained
    # on one sample presumably cannot label that sample -1, so these flags may
    # never fire - confirm, and consider fitting on the historical rows.
    df["is_amount_anomaly"] = 0
    df["is_frequency_anomaly"] = 0
    df["is_vendor_pattern_anomaly"] = 0

    # Amount anomaly, based on the current invoice amount(s) only.
    if not df.empty:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df[["amount"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_amount_anomaly"] = model.fit_predict(X_scaled)

    # Frequency anomaly: invoices per day and spread of submission dates.
    if not history_df.empty:
        # Parse into a local series rather than writing back into history_df.
        invoice_dates = pd.to_datetime(history_df['Invoice_Date__c'])
        first_date = invoice_dates.min()
        date_range = (invoice_dates.max() - first_date).days + 1
        frequency = len(history_df) / max(date_range, 1)
        date_diffs = [(d - first_date).days for d in invoice_dates]
        date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
        frequency_df = pd.DataFrame({
            "frequency": [frequency],
            "date_clustering": [date_clustering]
        })
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        df["is_frequency_anomaly"] = 1

    # Vendor pattern anomaly: how far the current amount sits from history.
    if not history_df.empty and len(history_df) > 1:
        historical_amounts = history_df["Invoice_Amount__c"].astype(float)
        mean_amount = historical_amounts.mean()
        std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
        amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0
        current_amount = df["amount"].iloc[0]
        # Fall back to a divisor of 1 when all historical amounts are equal.
        deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
        invoice_count = len(history_df)
        vendor_pattern_df = pd.DataFrame({
            "amount_deviation": [deviation],
            "invoice_count": [invoice_count],
            "amount_variance": [amount_variance]
        })
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
        model = IsolationForest(contamination=0.05, random_state=42)
        df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
    else:
        df["is_vendor_pattern_anomaly"] = 1

    return df
def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
    """Calculate fraud score based on amount, anomalies, text length, consistency issues, and invoice date.

    Args:
        amount: Invoice total.
        is_amount_anomaly: Amount flag; -1 means anomalous.
        is_frequency_anomaly: Frequency flag; -1 means anomalous.
        is_vendor_pattern_anomaly: Vendor pattern flag; -1 means anomalous.
        text_length: Length of the invoice text.
        consistency_issues: List of issue messages; each one adds 15 points
            and is copied into the reasoning.
        invoice_date: Date of the invoice, compared with today's date.

    Returns:
        A tuple ``(score, reasoning)`` where ``score`` is capped at 100 and
        ``reasoning`` lists the message of every rule that fired, in order.
    """
    today = datetime.now().date()
    is_high_amount = amount > 5000

    # (rule fired?, points, explanation) - evaluated in reporting order.
    # The two amount rules are mutually exclusive, like an if/elif pair.
    rules = [
        (is_high_amount, 40, "High invoice amount detected."),
        (not is_high_amount and amount < 10, 20, "Unusually low invoice amount."),
        (invoice_date > today, 10, "Invoice date is in the future."),
        (is_amount_anomaly == -1, 30, "Amount flagged as an anomaly."),
        (is_frequency_anomaly == -1, 25, "Unusual invoice submission frequency or clustering detected."),
        (is_vendor_pattern_anomaly == -1, 25, "Unusual vendor pattern detected (amount deviation, frequency, or variance)."),
        (text_length > 500, 10, "Excessive text length in invoice."),
    ]

    score = 0.0
    reasoning = []
    for fired, points, message in rules:
        if fired:
            score += points
            reasoning.append(message)

    if consistency_issues:
        score += 15 * len(consistency_issues)
        reasoning.extend(consistency_issues)

    return min(score, 100), reasoning
def process_invoice(pdf_file):
    """Process a single invoice PDF and return structured markdown output.

    The PDF text is extracted, invoice fields and line items are parsed, the
    vendor's history is fetched, anomalies are scored, and a markdown report
    is assembled. When the module-level Salesforce client ``sf`` is available
    an ``Invoice_Record__c`` is also created; a failure there is printed and
    does not affect the returned report.

    Args:
        pdf_file: The uploaded PDF, passed straight to ``extract_text_from_pdf``.

    Returns:
        A markdown string: either the fraud report or an ``**Error**`` line.
    """
    text = extract_text_from_pdf(pdf_file)
    # NOTE(review): this is a substring test on the whole extracted text, so
    # an invoice that merely contains the word "Error" is reported as a failure.
    if "Error" in text:
        return f"**Error**: {text}"

    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
    items = extract_items(text)
    text_length = len(text)

    history_df = fetch_vendor_history(vendor_name, invoice_number)
    consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)

    invoice_row = {
        "invoice_id": str(uuid.uuid4()),
        "invoice_number": invoice_number,
        "vendor_name": vendor_name,
        "amount": total_amount,
        "invoice_date": invoice_date,
        "text_length": text_length
    }
    df = detect_anomalies(pd.DataFrame([invoice_row]), history_df)

    fraud_score, fraud_reasoning = calculate_fraud_score(
        df["amount"].iloc[0],
        df["is_amount_anomaly"].iloc[0],
        df["is_frequency_anomaly"].iloc[0],
        df["is_vendor_pattern_anomaly"].iloc[0],
        text_length,
        consistency_issues,
        invoice_date
    )
    is_flagged = fraud_score > 50
    status = "Flagged" if is_flagged else "Cleared"

    # One formatted line per item, shared by the report and the Salesforce field.
    item_lines = [
        f"{item['description']}: Quantity {item['quantity']}, Unit Price ${item['unit_price']:.2f}, Total Price ${item['total_price']:.2f}"
        for item in items
    ]
    # Format items for Salesforce (semicolon-separated string)
    items_str = "; ".join(item_lines) if items else "No items found"

    output = [
        "## Fraud Detection Summary",
        f"- **Invoice Number**: {invoice_number}",
        f"- **Vendor Name**: {vendor_name}",
        f"- **Invoice Date**: {invoice_date}",
        f"- **Invoice Amount**: ${total_amount:,.2f}",
        "- **Items Selected**:",
    ]
    output += [f"  - {line}" for line in item_lines] or ["  - No items found"]
    output += [
        f"- **Fraud Score**: {fraud_score}",
        f"- **Status**: {status}",
        f"- **Flagged**: {is_flagged}",
        "",
        "## Fraud Reasoning"
    ]
    output += [f"- {reason}" for reason in fraud_reasoning] or ["- No specific fraud indicators detected"]

    if sf is not None:
        record = {
            "Invoice_Number__c": invoice_number,
            "Vendor_Name__c": vendor_name,
            "Invoice_Amount__c": total_amount,
            "Invoice_Date__c": str(invoice_date),
            "Fraud_Score__c": fraud_score,
            "Fraud_Reason__c": "; ".join(fraud_reasoning),
            "Flagged__c": is_flagged,
            "Status__c": status,
            "Items_Selected__c": items_str
        }
        # Best effort: the report is still returned if the write fails.
        try:
            sf.Invoice_Record__c.create(record)
            print(f"Successfully created Salesforce record with Items_Selected__c: {items_str}")  # Debug
        except Exception as e:
            print(f"Failed to create Salesforce record: {str(e)}")

    return "\n".join(output)
 def gradio_interface(pdf_file):
     """Gradio interface to process uploaded PDF and display structured results."""
     if pdf_file is None:
         return "Please upload a PDF file."
+    result = process_invoice(pdf_file)
+    return result
 with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
     gr.Markdown("# Invoice Fraud Detection")