Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Files Community

Abhisesh7 committed on May 23, 2025

Commit

49cc078

verified ·

1 Parent(s): 855d3bc

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -124

app.py CHANGED Viewed

@@ -58,137 +58,180 @@ def extract_text_from_pdf(pdf_file):
     except Exception as e:
         return f"Error extracting text: {str(e)}"
-def extract_items(text):
-    """Extract items from the invoice table with support for multiple table formats."""
     items = []
-    # Replace escaped dollar signs and other symbols
-    text = text.replace(r'\$', '$').replace('₹', '₹')
-    # Split text into lines
-    lines = text.split('\n')
-    print("Text split into lines:", lines)  # Debug
-    # Define possible table headers
-    table_headers = [
-        ("Item Description", "Quantity", "Unit Price", "Total Price"),  # Format 1 (e.g., invoice_4.pdf)
-        ("Particulars", "Gross value", "Discount", "Net value", "Total"),  # Format 2 (e.g., Invoice_6164752968.pdf)
-    ]
-    # Extract main table (e.g., Particulars | Gross value | Discount | Net value | Total)
-    table_start = -1
-    table_format = None
-    for i, line in enumerate(lines):
-        for headers in table_headers:
-            if all(header in line for header in headers):
-                table_start = i + 1  # Table data starts after the header
-                table_format = headers
-                break
-        if table_start != -1:
-            break
-    if table_start != -1:
-        # Find the end of the main table
-        table_end = len(lines)
-        for i in range(table_start, len(lines)):
-            if "Item(s) Total" in lines[i] or "Total Value" in lines[i] or "Sr.No Particulars" in lines[i]:
-                table_end = i
                 break
-        print(f"Main table section: lines {table_start} to {table_end-1}")  # Debug
-        table_lines = lines[table_start:table_end]
-        print("Main table lines:", table_lines)  # Debug
-        # Define patterns based on table format
-        if table_format[0] == "Item Description":
-            # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
-            table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
-        else:
-            # Simplified pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
             table_row_pattern = r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"
-        for line in table_lines:
-            line = line.strip()
-            if not line or "HSN Code" in line or "Total" in line:
-                print(f"Skipping irrelevant line: {line}")
-                continue
-            # Skip alignment rows (e.g., "|---|---|")
-            if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
-                print(f"Skipping alignment row: {line}")
-                continue
-            print(f"Processing main table row: {line}")  # Debug
-            match = re.match(table_row_pattern, line)
-            if match:
-                description = match.group(1).strip()
-                quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
-                unit_price = float(match.group(2))  # Gross value
-                total_price = float(match.group(5))  # Total after taxes
-                items.append({
-                    "description": description,
-                    "quantity": quantity,
-                    "unit_price": unit_price,
-                    "total_price": total_price
-                })
-                print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
-            else:
-                # Fallback: Split by | and validate fields manually
-                fields = [f.strip() for f in line.split('|')]
-                print(f"Fallback processing: {fields}")  # Debug
-                if len(fields) >= 9:  # Expecting at least 9 fields (description, gross value, discount, net value, CGST %, CGST amount, SGST %, SGST amount, total)
-                    try:
-                        description = fields[0].strip()
-                        if not description.startswith('1 x'):
-                            continue  # Skip if not an item row
-                        quantity = int(description.split(' x ')[0].strip())
-                        unit_price = float(fields[1].strip())  # Gross value
-                        total_price = float(fields[-1].strip())  # Total after taxes
-                        items.append({
-                            "description": description,
-                            "quantity": quantity,
-                            "unit_price": unit_price,
-                            "total_price": total_price
-                        })
-                        print(f"Fallback Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
-                    except (ValueError, IndexError) as e:
-                        print(f"Failed fallback parsing for line '{line}': {str(e)}")
-                        continue
-    # Extract platform fee table (e.g., Sr.No Particulars)
-    platform_fee_start = -1
-    for i, line in enumerate(lines):
-        if "Sr.No Particulars" in line:
-            platform_fee_start = i + 1
-            break
-    if platform_fee_start != -1:
-        platform_fee_end = len(lines)
-        for i in range(platform_fee_start, len(lines)):
-            if "Total" in lines[i] and "Sr.No" not in lines[i]:
-                platform_fee_end = i + 1
                 break
-        platform_fee_lines = lines[platform_fee_start:platform_fee_end]
-        print("Platform fee lines:", platform_fee_lines)  # Debug
-        platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
-        for line in platform_fee_lines:
-            line = line.strip()
-            if not line or "Total" in line:
-                continue
-            match = re.match(platform_fee_pattern, line)
-            if match:
-                description = match.group(1).strip()
-                total_price = float(match.group(5))
-                items.append({
-                    "description": description,
-                    "quantity": 1,  # Platform fee is a single item
-                    "unit_price": float(match.group(2)),  # Taxable amount
-                    "total_price": total_price
-                })
-                print(f"Extracted Platform Fee: {description}, Total Price: {total_price}")  # Debug
-            else:
-                print(f"Failed to match platform fee row: {line}")
     return items
-def extract_entities(text):
     """Extract structured invoice details using flexible regex patterns."""
     invoice_numbers = []
     primary_invoice_number = "Unknown"
@@ -197,7 +240,7 @@ def extract_entities(text):
     total_amount = 0.0
     # Extract items first to use as a filter for NER
-    items = extract_items(text)
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
@@ -426,8 +469,8 @@ def process_invoice(pdf_file):
     if "Error" in text:
         return f"**Error**: {text}"
-    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
-    items = extract_items(text)
     text_length = len(text)
     history_df = fetch_vendor_history(vendor_name, invoice_number)

     except Exception as e:
         return f"Error extracting text: {str(e)}"
+def extract_items(pdf_file, text):
+    """Extract items from the invoice using table extraction and text fallback."""
     items = []
+    # First, try to extract tables using pdfplumber
+    try:
+        with pdfplumber.open(pdf_file) as pdf:
+            for page in pdf.pages:
+                tables = page.extract_tables()
+                print(f"Found {len(tables)} tables on page")  # Debug
+                for table_idx, table in enumerate(tables):
+                    print(f"Table {table_idx}:\n{table}")  # Debug
+                    # Identify main table (Particulars | Gross value | Discount | Net value | Total)
+                    if table and len(table) > 0 and any("Particulars" in str(cell) for cell in table[0]):
+                        # Skip the header row
+                        for row in table[1:]:
+                            if not row or len(row) < 9:  # Expecting at least 9 columns
+                                continue
+                            # Check if row contains item data (starts with "1 x")
+                            description = str(row[0]).strip()
+                            if not description or "Total" in description or "HSN Code" in description:
+                                continue
+                            if description.startswith('1 x'):
+                                try:
+                                    quantity = int(description.split(' x ')[0].strip())
+                                    unit_price = float(str(row[1]).strip())  # Gross value
+                                    total_price = float(str(row[-1]).strip())  # Total after taxes
+                                    items.append({
+                                        "description": description,
+                                        "quantity": quantity,
+                                        "unit_price": unit_price,
+                                        "total_price": total_price
+                                    })
+                                    print(f"Table Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
+                                except (ValueError, IndexError) as e:
+                                    print(f"Failed to parse table row {row}: {str(e)}")
+                                    continue
+                    # Identify platform fee table (Sr.No Particulars)
+                    if table and len(table) > 0 and any("Sr.No Particulars" in str(cell) for cell in table[0]):
+                        for row in table[1:]:
+                            if not row or len(row) < 5 or "Total" in str(row[1]):
+                                continue
+                            description = str(row[1]).strip()
+                            try:
+                                total_price = float(str(row[-1]).strip())
+                                items.append({
+                                    "description": description,
+                                    "quantity": 1,
+                                    "unit_price": float(str(row[2]).strip()),  # Taxable amount
+                                    "total_price": total_price
+                                })
+                                print(f"Table Extracted Platform Fee: {description}, Total Price: {total_price}")  # Debug
+                            except (ValueError, IndexError) as e:
+                                print(f"Failed to parse platform fee row {row}: {str(e)}")
+                                continue
+    except Exception as e:
+        print(f"Table extraction failed: {str(e)}. Falling back to text-based extraction.")
+    # Fallback to text-based extraction if no items were extracted
+    if not items or len(items) < 3:  # Expecting at least 3 items (2 main items + platform fee)
+        print("Falling back to text-based item extraction.")
+        text = text.replace(r'\$', '$').replace('₹', '₹')
+        lines = text.split('\n')
+        print("Text split into lines:", lines)  # Debug
+        # Define possible table headers
+        table_headers = [
+            ("Particulars", "Gross value", "Discount", "Net value", "Total"),
+        ]
+        # Extract main table
+        table_start = -1
+        for i, line in enumerate(lines):
+            for headers in table_headers:
+                if all(header in line for header in headers):
+                    table_start = i + 1
+                    break
+            if table_start != -1:
                 break
+        if table_start != -1:
+            table_end = len(lines)
+            for i in range(table_start, len(lines)):
+                if "Item(s) Total" in lines[i] or "Total Value" in lines[i] or "Sr.No Particulars" in lines[i]:
+                    table_end = i
+                    break
+            print(f"Main table section: lines {table_start} to {table_end-1}")  # Debug
+            table_lines = lines[table_start:table_end]
+            print("Main table lines:", table_lines)  # Debug
             table_row_pattern = r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"
+            for line in table_lines:
+                line = line.strip()
+                if not line or "HSN Code" in line or "Total" in line:
+                    print(f"Skipping irrelevant line: {line}")
+                    continue
+                if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
+                    print(f"Skipping alignment row: {line}")
+                    continue
+                print(f"Processing main table row: {line}")  # Debug
+                match = re.match(table_row_pattern, line)
+                if match:
+                    description = match.group(1).strip()
+                    quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
+                    unit_price = float(match.group(2))
+                    total_price = float(match.group(5))
+                    items.append({
+                        "description": description,
+                        "quantity": quantity,
+                        "unit_price": unit_price,
+                        "total_price": total_price
+                    })
+                    print(f"Fallback Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
+                else:
+                    fields = [f.strip() for f in line.split('|')]
+                    print(f"Fallback splitting: {fields}")  # Debug
+                    if len(fields) >= 9:
+                        try:
+                            description = fields[0].strip()
+                            if not description.startswith('1 x'):
+                                continue
+                            quantity = int(description.split(' x ')[0].strip())
+                            unit_price = float(fields[1].strip())
+                            total_price = float(fields[-1].strip())
+                            items.append({
+                                "description": description,
+                                "quantity": quantity,
+                                "unit_price": unit_price,
+                                "total_price": total_price
+                            })
+                            print(f"Fallback Split Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}")  # Debug
+                        except (ValueError, IndexError) as e:
+                            print(f"Failed fallback parsing for line '{line}': {str(e)}")
+                            continue
+        # Extract platform fee table
+        platform_fee_start = -1
+        for i, line in enumerate(lines):
+            if "Sr.No Particulars" in line:
+                platform_fee_start = i + 1
                 break
+        if platform_fee_start != -1:
+            platform_fee_end = len(lines)
+            for i in range(platform_fee_start, len(lines)):
+                if "Total" in lines[i] and "Sr.No" not in lines[i]:
+                    platform_fee_end = i + 1
+                    break
+            platform_fee_lines = lines[platform_fee_start:platform_fee_end]
+            print("Platform fee lines:", platform_fee_lines)  # Debug
+            platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
+            for line in platform_fee_lines:
+                line = line.strip()
+                if not line or "Total" in line:
+                    continue
+                match = re.match(platform_fee_pattern, line)
+                if match:
+                    description = match.group(1).strip()
+                    total_price = float(match.group(5))
+                    items.append({
+                        "description": description,
+                        "quantity": 1,
+                        "unit_price": float(match.group(2)),
+                        "total_price": total_price
+                    })
+                    print(f"Fallback Extracted Platform Fee: {description}, Total Price: {total_price}")  # Debug
+                else:
+                    print(f"Failed to match platform fee row: {line}")
     return items
+def extract_entities(pdf_file, text):
     """Extract structured invoice details using flexible regex patterns."""
     invoice_numbers = []
     primary_invoice_number = "Unknown"
     total_amount = 0.0
     # Extract items first to use as a filter for NER
+    items = extract_items(pdf_file, text)
     item_descriptions = [item["description"].lower() for item in items]
     # Flexible regex patterns to handle various invoice formats
     if "Error" in text:
         return f"**Error**: {text}"
+    invoice_number, vendor_name, invoice_date, total_amount = extract_entities(pdf_file, text)
+    items = extract_items(pdf_file, text)
     text_length = len(text)
     history_df = fetch_vendor_history(vendor_name, invoice_number)