Spaces:

GSoumyajit2005
/

invoice-processor-ml

Sleeping

App Files Files Community

GSoumyajit2005 committed on Jan 16

Commit

097a95c

1 Parent(s): e81f779

refactor: remove obsolete OCR test file and enhance address extraction logic

Browse files

Files changed (5) hide show

README.md +0 -2
src/extraction.py +103 -1
src/ml_extraction.py +36 -1
src/schema.py +8 -3
tests/test_ocr.py +0 -101

README.md CHANGED Viewed

@@ -156,7 +156,6 @@ _UI shows simple format hints and confidence._
 ### Prerequisites
 - Python 3.10+
-- Tesseract OCR
 - (Optional) CUDA-capable GPU for training/inference speed
 ### Installation
@@ -342,7 +341,6 @@ invoice-processor-ml/
 ├── tests/
 │   ├── test_extraction.py      # Tests for regex extraction module
 │   ├── test_full_pipeline.py   # Full end-to-end integration tests
-│   ├── test_ocr.py             # Tests for the OCR module
 │   ├── test_pipeline.py        # Pipeline process tests
 │   └── test_preprocessing.py   # Tests for the preprocessing module
 │

 ### Prerequisites
 - Python 3.10+
 - (Optional) CUDA-capable GPU for training/inference speed
 ### Installation
 ├── tests/
 │   ├── test_extraction.py      # Tests for regex extraction module
 │   ├── test_full_pipeline.py   # Full end-to-end integration tests
 │   ├── test_pipeline.py        # Pipeline process tests
 │   └── test_preprocessing.py   # Tests for the preprocessing module
 │

src/extraction.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import re
 from typing import List, Dict, Optional, Any
 from datetime import datetime
 def extract_dates(text: str) -> List[str]:
     """
@@ -132,7 +133,7 @@ def extract_invoice_number(text: str) -> Optional[str]:
     for line in lines[:25]: # Scan top 25 lines
         line_upper = line.upper()
-        # ⚠️ CRITICAL FIX: Skip lines that look like Tax IDs (GST/REG)
         # But allow if the line explicitly says "INVOICE" (e.g. "Tax Invoice / GST Reg No")
         if any(bad in line_upper for bad in TOXIC_LINE_INDICATORS) and "INVOICE" not in line_upper:
             continue
@@ -165,6 +166,107 @@ def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
         return {"name": name, "email": None}
     return None
 def extract_line_items(text: str) -> List[Dict[str, Any]]:
     return []

 import re
 from typing import List, Dict, Optional, Any
 from datetime import datetime
+from difflib import SequenceMatcher
 def extract_dates(text: str) -> List[str]:
     """
     for line in lines[:25]: # Scan top 25 lines
         line_upper = line.upper()
+        # CRITICAL FIX: Skip lines that look like Tax IDs (GST/REG)
         # But allow if the line explicitly says "INVOICE" (e.g. "Tax Invoice / GST Reg No")
         if any(bad in line_upper for bad in TOXIC_LINE_INDICATORS) and "INVOICE" not in line_upper:
             continue
         return {"name": name, "email": None}
     return None
def extract_address(text: str, vendor_name: Optional[str] = None) -> Optional[str]:
    """
    Extract an address from OCR text using positional heuristics.

    Strategy:
    1. If a vendor name is given, find the first of the top 15 lines that
       contains it (case-insensitively, or as a close fuzzy match) and take up
       to three lines immediately FOLLOWING it, stopping at the first line
       that looks like a phone/fax/email/URL, a short date, or the vendor name.
    2. If that yields nothing, scan the top 10 lines for a contiguous run of
       "address-like" lines: the first must contain a digit, and every line
       must be longer than 10 characters and contain at least two spaces.

    Args:
        text: Raw text with one physical line per newline-separated row.
        vendor_name: Optional vendor name used to anchor the search.
            Blank or whitespace-only values are treated as "unknown".

    Returns:
        The selected lines joined with ", ", or None when nothing qualifies.
    """
    if not text:
        return None

    # Blank lines are dropped here, so "next line" below always means the next
    # non-empty line.
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Normalise once so every vendor comparison is case-insensitive.
    vendor_lower = vendor_name.strip().lower() if vendor_name else ""

    # TEL/FAX must start a word: a bare substring test would reject address
    # lines such as "25 Hotel Road" or "100 Fairfax Avenue".
    contact_word = re.compile(r'\b(?:TEL|FAX)')
    contact_markers = ('PHONE', 'EMAIL', '@', 'WWW.', '.COM', 'HTTP')

    # --- FILTERS (Generalized) ---
    def is_invalid_line(line):
        """Return True for lines that are clearly NOT part of an address."""
        line_upper = line.upper()
        # 1. Phone / fax / email / URL
        if contact_word.search(line_upper):
            return True
        if any(marker in line_upper for marker in contact_markers):
            return True
        # 2. Short line with digits and a date separator
        if len(line) < 15 and any(c.isdigit() for c in line) and ('/' in line or '-' in line):
            return True
        # 3. The vendor name itself (if provided)
        if vendor_lower and vendor_lower in line.lower():
            return True
        return False

    # --- STRATEGY 1: Contextual Search (Below Vendor) ---
    candidate_lines = []
    if vendor_lower:
        for i, line in enumerate(lines[:15]):  # Check top 15 lines only
            line_lower = line.lower()
            # Compare lowercased strings in BOTH checks; OCR casing often
            # differs from the casing of the extracted vendor name.
            is_vendor_line = vendor_lower in line_lower or (
                len(vendor_lower) > 5
                and SequenceMatcher(None, vendor_lower, line_lower).ratio() > 0.8
            )
            if not is_vendor_line:
                continue
            # Take the next 1-3 lines as the address block; the first invalid
            # line (e.g. a phone number) marks the end of the block.
            for next_line in lines[i + 1:i + 4]:
                if is_invalid_line(next_line):
                    break
                candidate_lines.append(next_line)
            break  # Only the first vendor occurrence is used as an anchor

    if candidate_lines:
        return ", ".join(candidate_lines)

    # --- STRATEGY 2: Header Scan (Density Heuristic) ---
    # CONTIGUITY RULE: once collection has started, the first invalid line
    # (phone/fax/etc.) ends the block, so non-adjacent lines such as tax
    # numbers printed after the phone number are not picked up.
    fallback_candidates = []
    started_collecting = False
    for line in lines[:10]:
        if is_invalid_line(line):
            if started_collecting:
                break
            continue

        has_digits = any(c.isdigit() for c in line)
        is_long_enough = len(line) > 10          # avoid short noise
        is_multi_word = line.count(' ') >= 2     # two spaces => three words

        # The FIRST line must contain a digit (building/street number);
        # continuation lines only need length + multiple words because
        # city/state lines often have no digits.
        is_valid_first_line = has_digits and is_long_enough and is_multi_word
        is_valid_continuation = started_collecting and is_long_enough and is_multi_word

        if is_valid_first_line or is_valid_continuation:
            fallback_candidates.append(line)
            started_collecting = True
            # Three lines is treated as a complete address block.
            if len(fallback_candidates) >= 3:
                break

    if fallback_candidates:
        return ", ".join(fallback_candidates)
    return None
 def extract_line_items(text: str) -> List[Dict[str, Any]]:
     return []

src/ml_extraction.py CHANGED Viewed

@@ -8,7 +8,7 @@ from PIL import Image
 from typing import List, Dict, Any, Tuple
 import re
 import numpy as np
-from extraction import extract_invoice_number, extract_total
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
@@ -254,6 +254,41 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
             largest_idx = max(top_words_indices, key=lambda i: unnormalized_boxes[i][3])
             final_output["vendor"] = words[largest_idx]
     # Fallbacks
     ml_total = extracted_entities.get("TOTAL", {}).get("text")
     if ml_total:

 from typing import List, Dict, Any, Tuple
 import re
 import numpy as np
+from extraction import extract_invoice_number, extract_total, extract_address
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
             largest_idx = max(top_words_indices, key=lambda i: unnormalized_boxes[i][3])
             final_output["vendor"] = words[largest_idx]
+    # --- ADDRESS FALLBACK ---
+    if not final_output["address"]:
+        # We pass the extracted (or fallback) Vendor Name to help anchor the search
+        # Use the raw text and the known vendor to find the address spatially
+        fallback_address = extract_address(raw_text, vendor_name=final_output["vendor"])
+        if fallback_address:
+            final_output["address"] = fallback_address
+    # Backfill Bounding Boxes for Address Fallback
+    # If Regex found the address but ML didn't, find its boxes in the OCR data
+    if final_output["address"] and "ADDRESS" not in final_output["raw_predictions"]:
+        address_text = final_output["address"]
+        address_boxes = []
+        # The address may span multiple words, so we search for each word
+        # Split by comma first (since extract_address joins lines with ", ")
+        address_parts = [part.strip() for part in address_text.split(",")]
+        for part in address_parts:
+            part_words = part.split()
+            for target_word in part_words:
+                for i, word in enumerate(words):
+                    # Case-insensitive match
+                    if target_word.lower() == word.lower() or target_word.lower() in word.lower():
+                        address_boxes.append(unnormalized_boxes[i])
+                        break  # Only match once per target word
+        # If we found any boxes, inject into raw_predictions
+        if address_boxes:
+            final_output["raw_predictions"]["ADDRESS"] = {
+                "text": address_text,
+                "bbox": address_boxes
+            }
     # Fallbacks
     ml_total = extracted_entities.get("TOTAL", {}).get("text")
     if ml_total:

src/schema.py CHANGED Viewed

@@ -64,9 +64,14 @@ class InvoiceData(BaseModel):
         if isinstance(v, str):
             try:
                 # Try common formats
-                for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d.%m.%Y"):
                     try:
                         parsed_date = datetime.strptime(v, fmt).date()
                         break
                     except ValueError:
                         continue
@@ -78,8 +83,8 @@ class InvoiceData(BaseModel):
             if parsed_date > today:
                 return None
-            # ⚠️ FIX: Use 'DateType' constructor
-            min_date = DateType(today.year - 10, 1, 1)
             if parsed_date < min_date:
                 return None

         if isinstance(v, str):
             try:
                 # Try common formats
+                for fmt in (
+                    "%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d.%m.%Y",
+                    "%m/%d/%Y", "%m-%d-%Y"
+                ):
                     try:
                         parsed_date = datetime.strptime(v, fmt).date()
+                        # NOTE: formats are tried in order, so an ambiguous date such as
+                        # 05/01/2020 is always read day-first (5 Jan 2020); the month-first
+                        # formats only match when every day-first format has failed.
                         break
                     except ValueError:
                         continue
             if parsed_date > today:
                 return None
+            # FIX: Use 'DateType' constructor
+            min_date = DateType(today.year - 30, 1, 1)
             if parsed_date < min_date:
                 return None

tests/test_ocr.py DELETED Viewed

@@ -1,101 +0,0 @@
-import sys
-sys.path.append('src')
-from preprocessing import load_image, convert_to_grayscale, remove_noise
-from ocr import extract_text
-import matplotlib.pyplot as plt
-import numpy as np
-print("=" * 60)
-print("🎯 OPTIMIZING GRAYSCALE OCR")
-print("=" * 60)
-# Load and convert to grayscale
-image = load_image('data/raw/receipt3.jpg')
-gray = convert_to_grayscale(image)
-# Test 1: Different PSM modes
-print("\n📊 Testing different Tesseract PSM modes...\n")
-psm_configs = [
-    ('', 'Default'),
-    ('--psm 3', 'Automatic page segmentation'),
-    ('--psm 4', 'Single column of text'),
-    ('--psm 6', 'Uniform block of text'),
-    ('--psm 11', 'Sparse text, find as much as possible'),
-    ('--psm 12', 'Sparse text with OSD (Orientation and Script Detection)'),
-]
-results = {}
-for config, desc in psm_configs:
-    text = extract_text(gray, config=config)
-    results[desc] = text
-    print(f"{desc:50s} → {len(text):4d} chars")
-# Find best result
-best_desc = max(results, key=lambda k: len(results[k]))
-best_text = results[best_desc]
-print(f"\n✅ WINNER: {best_desc} ({len(best_text)} chars)")
-# Test 2: With slight denoising
-print("\n📊 Testing with light denoising...\n")
-denoised = remove_noise(gray, kernel_size=3)
-text_denoised = extract_text(denoised, config='--psm 6')
-print(f"Grayscale + Denoise (psm 6): {len(text_denoised)} chars")
-# Display best result
-print("\n" + "=" * 60)
-print("📄 BEST EXTRACTED TEXT:")
-print("=" * 60)
-print(best_text)
-print("=" * 60)
-# Visualize
-fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-axes[0].imshow(image)
-axes[0].set_title("Original")
-axes[0].axis('off')
-axes[1].imshow(gray, cmap='gray')
-axes[1].set_title(f"Grayscale\n({len(best_text)} chars - {best_desc})")
-axes[1].axis('off')
-axes[2].imshow(denoised, cmap='gray')
-axes[2].set_title(f"Denoised\n({len(text_denoised)} chars)")
-axes[2].axis('off')
-plt.tight_layout()
-plt.show()
-print(f"\n💡 Recommended pipeline: Grayscale + {best_desc}")
-# Test the combination we missed!
-print("\n📊 Testing BEST combination...\n")
-denoised = remove_noise(gray, kernel_size=3)
-# Test PSM 11 on denoised
-text_denoised_psm11 = extract_text(denoised, config='--psm 11')
-text_denoised_psm6 = extract_text(denoised, config='--psm 6')
-print(f"Denoised + PSM 6:  {len(text_denoised_psm6)} chars")
-print(f"Denoised + PSM 11: {len(text_denoised_psm11)} chars")
-if len(text_denoised_psm11) > len(text_denoised_psm6):
-    print(f"\n✅ PSM 11 wins! ({len(text_denoised_psm11)} chars)")
-    best_config = '--psm 11'
-    best_text_final = text_denoised_psm11
-else:
-    print(f"\n✅ PSM 6 wins! ({len(text_denoised_psm6)} chars)")
-    best_config = '--psm 6'
-    best_text_final = text_denoised_psm6
-print(f"\n🏆 FINAL WINNER: Denoised + {best_config}")
-print("\nFull text:")
-print("=" * 60)
-print(best_text_final)
-print("=" * 60)