GSoumyajit2005 committed on
Commit
ec0b507
·
1 Parent(s): 343b0c3

Refactor: Replace Tesseract with DocTR and integrate LayoutLMv3-DocTR model

Browse files

Major overhaul of OCR/Inference pipeline. Swapped Tesseract for DocTR, retrained LayoutLMv3 (~83% F1), and fixed address extraction using Fuzzy Matching.

.gitignore CHANGED
@@ -23,6 +23,7 @@ credentials.json
23
  *.log
24
  logs/
25
  .cache/
 
26
 
27
  # OS
28
  .DS_Store
 
23
  *.log
24
  logs/
25
  .cache/
26
+ *.pkl
27
 
28
  # OS
29
  .DS_Store
Dockerfile CHANGED
@@ -1,10 +1,13 @@
1
  # Use an official Python runtime
2
  FROM python:3.10-slim
3
 
4
- # 1. Install system dependencies (Tesseract + OpenCV + POPPLER)
5
- # Added poppler-utils because src/pdf_utils.py uses pdf2image
6
  RUN apt-get update && apt-get install -y \
7
- tesseract-ocr \
 
 
 
8
  poppler-utils \
9
  ffmpeg libsm6 libxext6 \
10
  && rm -rf /var/lib/apt/lists/*
 
1
  # Use an official Python runtime
2
  FROM python:3.10-slim
3
 
4
+ # 1. Install system dependencies (DocTR + OpenCV + POPPLER)
5
+ # DocTR requires OpenGL and GStreamer libraries for image processing
6
  RUN apt-get update && apt-get install -y \
7
+ libgl1-mesa-dev \
8
+ libglib2.0-0 \
9
+ libgstreamer1.0-0 \
10
+ libgstreamer-plugins-base1.0-0 \
11
  poppler-utils \
12
  ffmpeg libsm6 libxext6 \
13
  && rm -rf /var/lib/apt/lists/*
README.md CHANGED
@@ -374,7 +374,7 @@ invoice-processor-ml/
374
 
375
  ## ⚠️ Known Limitations
376
 
377
- 1. **Layout Sensitivity**: The ML model was fine‑tuned only on SROIE (retail receipts). Professional multi-column invoices may underperform until you fine‑tune on more diverse datasets.
378
  2. **Invoice Number**: SROIE dataset lacks invoice number labels. The system solves this by using the Hybrid Fallback Engine, which successfully extracts invoice numbers using Regex whenever the ML model output is empty.
379
  3. **Line Items/Tables**: Not trained for table extraction yet. Rule-based supports simple totals; table extraction comes later.
380
  4. **OCR Variability**: Tesseract outputs can vary; preprocessing and thresholds can impact ML results.
 
374
 
375
  ## ⚠️ Known Limitations
376
 
377
+ 1. **Layout Sensitivity**: The ML model was fine‑tuned on SROIE (retail receipts) and mychen76/invoices-and-receipts_ocr_v1 (English). Professional multi-column invoices may underperform until you fine‑tune on more diverse datasets.
378
  2. **Invoice Number**: SROIE dataset lacks invoice number labels. The system solves this by using the Hybrid Fallback Engine, which successfully extracts invoice numbers using Regex whenever the ML model output is empty.
379
  3. **Line Items/Tables**: Not trained for table extraction yet. Rule-based supports simple totals; table extraction comes later.
380
  4. **OCR Variability**: Tesseract outputs can vary; preprocessing and thresholds can impact ML results.
requirements.txt CHANGED
@@ -2,7 +2,7 @@
2
  streamlit>=1.28.0
3
 
4
  # ----- OCR -----
5
- pytesseract>=0.3.10
6
  opencv-python>=4.8.0
7
  Pillow>=10.0.0
8
 
 
2
  streamlit>=1.28.0
3
 
4
  # ----- OCR -----
5
+ python-doctr[torch]>=0.8.0
6
  opencv-python>=4.8.0
7
  Pillow>=10.0.0
8
 
scripts/prepare_doctr_data.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scripts/prepare_doctr_data.py
2
+
3
+ """
4
+ Prepare training data using DocTR OCR output.
5
+
6
+ This script:
7
+ 1. Iterates through SROIE training/test images
8
+ 2. Runs DocTR OCR to get words and boxes
9
+ 3. Aligns DocTR output with ground truth labels using fuzzy matching
10
+ 4. Saves the aligned dataset to a pickle file for training
11
+
12
+ This ensures the model learns from DocTR's actual output (with its specific errors)
13
+ rather than from perfect ground truth which it will never see in production.
14
+ """
15
+
16
+ import torch
17
+ import sys
18
+ import os
19
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
20
+
21
+ import json
22
+ import pickle
23
+ from pathlib import Path
24
+ from PIL import Image
25
+ from tqdm import tqdm
26
+ from difflib import SequenceMatcher
27
+ from typing import List, Dict, Any, Tuple, Optional
28
+
29
+ from doctr.io import DocumentFile
30
+ from doctr.models import ocr_predictor
31
+
32
# --- CONFIGURATION ---
# Root of the local SROIE dataset copy (expects train/ and test/ subfolders).
SROIE_DATA_PATH = "data/sroie"
# Destination pickle for the DocTR-aligned training examples.
OUTPUT_CACHE_PATH = "data/doctr_trained_cache.pkl"

# Ground truth field names and their corresponding BIO labels.
# Keys match the lower-cased entity names produced by load_ground_truth();
# values are the label stems used to build "B-.../I-..." tags.
GT_FIELD_MAPPING = {
    "company": "COMPANY",
    "date": "DATE",
    "address": "ADDRESS",
    "total": "TOTAL",
}
43
+
44
+
45
def load_doctr_predictor():
    """Build the pretrained DocTR OCR predictor, preferring GPU execution.

    Returns:
        A DocTR ``ocr_predictor`` (DB-ResNet50 detector + CRNN-VGG16-BN
        recognizer), moved to CUDA when a GPU is available.
    """
    print("Loading DocTR OCR predictor...")

    # Detector/recognizer pair used throughout this project.
    ocr_engine = ocr_predictor(
        det_arch='db_resnet50',
        reco_arch='crnn_vgg16_bn',
        pretrained=True,
    )

    # Prefer CUDA when present; otherwise warn about CPU speed.
    if torch.cuda.is_available():
        print("🚀 Moving DocTR to GPU (CUDA)...")
        ocr_engine.cuda()
    else:
        print("⚠️ GPU not found. Running on CPU (this will be slow).")

    print("DocTR OCR predictor ready.")
    return ocr_engine
65
+
66
+
67
def parse_doctr_output(doctr_result, img_width: int, img_height: int) -> Tuple[List[str], List[List[int]]]:
    """Flatten DocTR's Page -> Block -> Line -> Word hierarchy into flat lists.

    Args:
        doctr_result: DocTR predictor output exposing ``.pages``.
        img_width: Source image width in pixels (kept for interface parity;
            DocTR geometry is already relative, so it is unused here).
        img_height: Source image height in pixels (unused, see above).

    Returns:
        words: list of non-empty word strings.
        normalized_boxes: matching [x0, y0, x1, y1] boxes on LayoutLMv3's
            0-1000 scale, clamped into range.
    """
    def _scale(coord: float) -> int:
        # DocTR coordinates are relative (0-1); the clamp guards float drift.
        return max(0, min(1000, int(coord * 1000)))

    words: List[str] = []
    normalized_boxes: List[List[int]] = []

    for page in doctr_result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    # Drop whitespace-only detections.
                    if not word.value.strip():
                        continue

                    words.append(word.value)

                    # DocTR bbox format: ((x_min, y_min), (x_max, y_max)) in 0-1 scale.
                    (x0, y0), (x1, y1) = word.geometry
                    normalized_boxes.append(
                        [_scale(x0), _scale(y0), _scale(x1), _scale(y1)]
                    )

    return words, normalized_boxes
99
+
100
+
101
def fuzzy_match_score(s1: str, s2: str) -> float:
    """Return a case-insensitive similarity ratio in [0, 1] for two strings."""
    lowered = (s1.lower(), s2.lower())
    return SequenceMatcher(None, *lowered).ratio()
104
+
105
+
106
def find_entity_in_words(
    entity_text: str,
    words: List[str],
    start_idx: int = 0,
    threshold: float = 0.7
) -> Optional[Tuple[int, int]]:
    """
    Find a ground truth entity in the DocTR words using fuzzy matching.
    Includes expansion search to handle OCR word splitting.

    Args:
        entity_text: Ground-truth value to locate (may span several words).
        words: OCR word list to search.
        start_idx: First index in ``words`` to consider.
        threshold: Minimum fuzzy-match ratio (0-1) for a hit.

    Returns:
        Inclusive (start, end) word-index pair for the best match, or None.

    NOTE(review): when the single-word pass finds nothing, control falls
    through to the window search below, so a one-word entity can still match
    a multi-word window (handles OCR splitting one token into several).
    """
    entity_words = entity_text.split()
    n_target = len(entity_words)

    # 1. Single word match: best-scoring individual word above the threshold.
    if n_target == 1:
        best_score = 0
        best_idx = -1
        for i in range(start_idx, len(words)):
            score = fuzzy_match_score(entity_text, words[i])
            if score > best_score and score >= threshold:
                best_score = score
                best_idx = i
        if best_idx >= 0:
            return (best_idx, best_idx)

    # 2. Multi-word entity: Flexible Window Search
    # We search windows of size N, N+1, N+2... up to N+5 (to catch OCR splits)
    # AND N-1, N-2... (to catch OCR merges)

    best_match_score = 0.0
    best_match_indices = None

    # Define search range: from (Length - 3) to (Length + 5)
    min_len = max(1, n_target - 3)
    max_len = min(len(words) - start_idx, n_target + 5)

    combined_entity_text = " ".join(entity_words)

    # Iterate through window sizes, sliding each window over the word list.
    for window_size in range(min_len, max_len + 1):
        for i in range(start_idx, len(words) - window_size + 1):

            # Construct window text
            window_tokens = words[i : i + window_size]
            window_text = " ".join(window_tokens)

            score = fuzzy_match_score(combined_entity_text, window_text)

            # Optimization: If perfect match, return immediately
            if score > 0.95:
                return (i, i + window_size - 1)

            if score > best_match_score and score >= threshold:
                best_match_score = score
                best_match_indices = (i, i + window_size - 1)

    return best_match_indices
163
+
164
+
165
def load_ground_truth(json_path: Path) -> Dict[str, str]:
    """
    Reconstruct entity values from a tagged SROIE JSON file.

    The file holds parallel lists {"words": [...], "labels": [...]} with BIO
    tags; contiguous B-/I- runs of one type are joined back into a single
    string per entity, keyed by the lower-cased entity type. When a type
    occurs more than once, the last occurrence wins.
    """
    with open(json_path, encoding="utf-8") as fh:
        payload = json.load(fh)

    tokens = payload.get("words", [])
    tags = payload.get("labels", [])

    entities: Dict[str, str] = {}
    active_type = None
    active_tokens = []

    def _flush():
        # Commit the in-progress entity run, if any.
        if active_type and active_tokens:
            entities[active_type.lower()] = " ".join(active_tokens)

    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            # A new run begins: save the previous one first.
            _flush()
            active_type = tag[2:]
            active_tokens = [token]
        elif tag.startswith("I-") and active_type:
            if tag[2:] == active_type:
                active_tokens.append(token)
            else:
                # Type switched without a B- tag: close the previous run.
                if active_tokens:
                    entities[active_type.lower()] = " ".join(active_tokens)
                active_type = None
                active_tokens = []
        else:
            # "O" tag (or stray I-): terminate any open run.
            _flush()
            active_type = None
            active_tokens = []

    # Don't forget a run that extends to the end of the document.
    _flush()
    return entities
215
+
216
+
217
def align_labels(
    doctr_words: List[str],
    ground_truth: Dict[str, str]
) -> List[str]:
    """Project ground-truth entities onto DocTR's word sequence as BIO tags.

    For every field in GT_FIELD_MAPPING present in ``ground_truth``, fuzzy-find
    its word span via find_entity_in_words() and tag it B-/I-. Words not
    covered by any entity stay "O". Spans that would overlap an already
    assigned entity are skipped.
    """
    labels = ["O"] * len(doctr_words)
    used_indices = set()

    for gt_field, bio_label in GT_FIELD_MAPPING.items():
        if gt_field not in ground_truth:
            continue

        entity_text = ground_truth[gt_field]
        if not entity_text or not entity_text.strip():
            continue

        # DYNAMIC THRESHOLD: Be lenient with Addresses, strict with Dates/Totals
        current_threshold = 0.6
        if bio_label == "ADDRESS":
            current_threshold = 0.45  # Lower threshold for messy addresses
        elif bio_label in ["DATE", "TOTAL"]:
            current_threshold = 0.7  # Keep strict for precision fields

        match = find_entity_in_words(entity_text, doctr_words, start_idx=0, threshold=current_threshold)

        if match:
            start_idx, end_idx = match

            # Overlap check: never overwrite a previously tagged span.
            if any(i in used_indices for i in range(start_idx, end_idx + 1)):
                continue

            labels[start_idx] = f"B-{bio_label}"
            for i in range(start_idx + 1, end_idx + 1):
                labels[i] = f"I-{bio_label}"

            used_indices.update(range(start_idx, end_idx + 1))

    return labels
255
+
256
+
257
def process_split(
    split_path: Path,
    predictor,
    split_name: str
) -> List[Dict[str, Any]]:
    """Process all images in a split directory.

    For each image with a matching JSON annotation: run DocTR OCR, align the
    ground-truth entities onto the OCR words, and collect one training example
    per image. Per-image failures are logged and skipped.

    Args:
        split_path: Directory containing the split's image/annotation folders.
        predictor: Callable OCR predictor taking a DocTR document
            (assumed: the object returned by load_doctr_predictor()).
        split_name: Human-readable split name used in progress output.

    Returns:
        List of dicts with keys: image_path, words, bboxes, ner_tags,
        ground_truth.
    """

    # Find image and annotation directories (two known layouts supported).
    if (split_path / "images").exists():
        img_dir = split_path / "images"
    elif (split_path / "img").exists():
        img_dir = split_path / "img"
    else:
        print(f" ⚠️ No image directory found in {split_path}")
        return []

    if (split_path / "tagged").exists():
        ann_dir = split_path / "tagged"
    elif (split_path / "box").exists():
        ann_dir = split_path / "box"
    else:
        print(f" ⚠️ No annotation directory found in {split_path}")
        return []

    examples = []
    image_files = sorted([f for f in img_dir.iterdir() if f.suffix.lower() in [".jpg", ".png"]])

    print(f" Processing {len(image_files)} images in {split_name}...")

    for img_file in tqdm(image_files, desc=f" {split_name}"):
        try:
            # Check for corresponding annotation; unannotated images are skipped.
            json_path = ann_dir / f"{img_file.stem}.json"
            if not json_path.exists():
                continue

            # Load image dimensions (file handle released immediately).
            with Image.open(img_file) as img:
                width, height = img.size

            # Run DocTR OCR
            doc = DocumentFile.from_images(str(img_file))
            doctr_result = predictor(doc)

            # Parse DocTR output into flat word/box lists.
            words, boxes = parse_doctr_output(doctr_result, width, height)

            if not words:
                continue

            # Load ground truth and align labels onto the OCR words.
            ground_truth = load_ground_truth(json_path)
            aligned_labels = align_labels(words, ground_truth)

            # Create example
            examples.append({
                "image_path": str(img_file),
                "words": words,
                "bboxes": boxes,
                "ner_tags": aligned_labels,
                "ground_truth": ground_truth  # Keep for debugging
            })

        except Exception as e:
            # Best-effort: a bad image must not abort the whole split.
            print(f"\n ❌ Error processing {img_file.name}: {e}")
            continue

    return examples
325
+
326
+
327
def main():
    """Run OCR over the SROIE splits and pickle the aligned dataset.

    Pipeline: load DocTR -> process 'train' and 'test' splits -> write the
    combined dict {"train": [...], "test": [...]} to OUTPUT_CACHE_PATH.
    """
    print("=" * 60)
    print("📦 DocTR Training Data Preparation")
    print("=" * 60)

    sroie_path = Path(SROIE_DATA_PATH)

    if not sroie_path.exists():
        print(f"❌ SROIE path not found: {sroie_path}")
        return

    # Load DocTR predictor (GPU if available).
    predictor = load_doctr_predictor()

    dataset = {"train": [], "test": []}

    # Process each split independently; a missing split is non-fatal.
    for split in ["train", "test"]:
        split_path = sroie_path / split
        if not split_path.exists():
            print(f" ⚠️ Split not found: {split}")
            continue

        print(f"\n📂 Processing {split} split...")
        examples = process_split(split_path, predictor, split)
        dataset[split] = examples

        # Stats: count aligned entities (one per B- tag).
        total_entities = sum(
            sum(1 for label in ex["ner_tags"] if label.startswith("B-"))
            for ex in examples
        )
        print(f" ✅ {len(examples)} images processed")
        print(f" 📊 {total_entities} entities aligned")

    # Save cache (parent dir created if missing).
    print(f"\n💾 Saving cache to {OUTPUT_CACHE_PATH}...")
    output_path = Path(OUTPUT_CACHE_PATH)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "wb") as f:
        pickle.dump(dataset, f)

    print(f"✅ Cache saved!")
    print(f" - Train examples: {len(dataset['train'])}")
    print(f" - Test examples: {len(dataset['test'])}")
    print("=" * 60)


if __name__ == "__main__":
    main()
scripts/train_combined.py CHANGED
@@ -13,6 +13,7 @@ from pathlib import Path
13
  import numpy as np
14
  import random
15
  import os
 
16
 
17
  # --- IMPORTS ---
18
  from src.sroie_loader import load_sroie
@@ -21,8 +22,9 @@ from src.data_loader import load_unified_dataset
21
  # --- CONFIGURATION ---
22
  # Points to your local SROIE copy
23
  SROIE_DATA_PATH = "data/sroie"
 
24
  MODEL_CHECKPOINT = "microsoft/layoutlmv3-base"
25
- OUTPUT_DIR = "models/layoutlmv3-generalized"
26
 
27
  # Standard Label Set
28
  LABEL_LIST = ['O', 'B-COMPANY', 'I-COMPANY', 'B-DATE', 'I-DATE',
@@ -86,18 +88,34 @@ class UnifiedDataset(Dataset):
86
 
87
  return {k: v.squeeze(0) for k, v in encoding.items()}
88
 
 
 
 
 
 
 
 
 
 
 
89
  def train():
90
  print(f"{'='*40}\n🚀 STARTING HYBRID TRAINING\n{'='*40}")
91
 
92
- # Check SROIE path
93
- if not os.path.exists(SROIE_DATA_PATH):
94
- print(f" Error: SROIE path not found at {SROIE_DATA_PATH}")
95
- print("Please make sure you copied the 'sroie' folder into 'data/'.")
96
- return
97
-
98
- # 1. Load SROIE
99
- print("📦 Loading SROIE dataset...")
100
- sroie_data = load_sroie(SROIE_DATA_PATH)
 
 
 
 
 
 
101
  print(f" - SROIE Train: {len(sroie_data['train'])}")
102
  print(f" - SROIE Test: {len(sroie_data['test'])}")
103
 
@@ -141,7 +159,7 @@ def train():
141
  # 6. Optimize & Train
142
  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
143
  best_f1 = 0.0
144
- NUM_EPOCHS = 5
145
 
146
  print("\n🔥 Beginning Fine-Tuning...")
147
  for epoch in range(NUM_EPOCHS):
 
13
  import numpy as np
14
  import random
15
  import os
16
+ import pickle
17
 
18
  # --- IMPORTS ---
19
  from src.sroie_loader import load_sroie
 
22
  # --- CONFIGURATION ---
23
  # Points to your local SROIE copy
24
  SROIE_DATA_PATH = "data/sroie"
25
+ DOCTR_CACHE_PATH = "data/doctr_trained_cache.pkl" # DocTR pre-processed cache
26
  MODEL_CHECKPOINT = "microsoft/layoutlmv3-base"
27
+ OUTPUT_DIR = "models/layoutlmv3-doctr-trained"
28
 
29
  # Standard Label Set
30
  LABEL_LIST = ['O', 'B-COMPANY', 'I-COMPANY', 'B-DATE', 'I-DATE',
 
88
 
89
  return {k: v.squeeze(0) for k, v in encoding.items()}
90
 
91
+
92
def load_doctr_cache(cache_path: str) -> dict:
    """Deserialize the pickled DocTR-aligned dataset from *cache_path*.

    NOTE(review): ``pickle.load`` executes arbitrary code from untrusted
    files — only load caches produced by prepare_doctr_data.py.
    """
    print(f"📦 Loading DocTR cache from {cache_path}...")
    with open(cache_path, "rb") as cache_file:
        dataset = pickle.load(cache_file)
    n_train = len(dataset.get('train', []))
    n_test = len(dataset.get('test', []))
    print(f" ✅ Loaded {n_train} train, {n_test} test examples")
    return dataset
99
+
100
+
101
  def train():
102
  print(f"{'='*40}\n🚀 STARTING HYBRID TRAINING\n{'='*40}")
103
 
104
+ # 1. Load SROIE data (prefer DocTR cache if available)
105
+ if os.path.exists(DOCTR_CACHE_PATH):
106
+ print("🔄 Using DocTR-aligned training data (recommended)")
107
+ sroie_data = load_doctr_cache(DOCTR_CACHE_PATH)
108
+ else:
109
+ print("⚠️ DocTR cache not found. Using original SROIE loader.")
110
+ print(" Run 'python scripts/prepare_doctr_data.py' to generate the cache.")
111
+
112
+ if not os.path.exists(SROIE_DATA_PATH):
113
+ print(f"❌ Error: SROIE path not found at {SROIE_DATA_PATH}")
114
+ print("Please make sure you copied the 'sroie' folder into 'data/'.")
115
+ return
116
+
117
+ sroie_data = load_sroie(SROIE_DATA_PATH)
118
+
119
  print(f" - SROIE Train: {len(sroie_data['train'])}")
120
  print(f" - SROIE Test: {len(sroie_data['test'])}")
121
 
 
159
  # 6. Optimize & Train
160
  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
161
  best_f1 = 0.0
162
+ NUM_EPOCHS = 10
163
 
164
  print("\n🔥 Beginning Fine-Tuning...")
165
  for epoch in range(NUM_EPOCHS):
src/extraction.py CHANGED
@@ -102,29 +102,57 @@ def extract_vendor(text: str) -> Optional[str]:
102
  return None
103
 
104
  def extract_invoice_number(text: str) -> Optional[str]:
105
- """
106
- Improved regex that handles alphanumeric AND numeric IDs, plus variations like "Tax Inv".
107
- """
108
  if not text: return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- # Strategy 1: Look for "Invoice No: XXXXX" pattern
111
- # UPDATED: Handles "Tax Invoice", "Inv No", and standard variations
112
- keyword_pattern = r'(?:TAX\s*)?(?:INVOICE|INV|BILL|RECEIPT)\s*(?:NO|NUMBER|#|NUM)?[\s\.:-]*([A-Z0-9\-/]{3,})'
113
- match = re.search(keyword_pattern, text, re.IGNORECASE)
114
- if match:
115
- return match.group(1)
116
 
117
- # Strategy 2: Look for standalone labeled patterns (Existing Logic)
118
- # Only if Strategy 1 fails
119
  lines = text.split('\n')
120
- for line in lines[:20]:
121
- if any(k in line.lower() for k in ['invoice', 'no', '#']):
122
- # Allow pure digits now if they are long enough (e.g. 40378170)
123
- # Match 4+ digits OR alphanumeric
124
- token_match = re.search(r'\b([A-Z0-9-]{4,})\b', line)
125
- if token_match:
126
- return token_match.group(1)
 
 
 
 
 
 
 
 
 
 
 
127
 
 
 
 
 
 
128
  return None
129
 
130
  def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
 
102
  return None
103
 
104
def extract_invoice_number(text: str) -> Optional[str]:
    """Pull an invoice/receipt identifier out of raw OCR text.

    Two passes: an explicit-label regex ("Invoice No: X"), then a contextual
    line-by-line scan over the document head that skips lines resembling tax
    registrations or phone numbers. Returns None when nothing plausible is
    found.
    """
    if not text:
        return None

    # Tokens that must never be returned as the ID itself.
    blocked = {
        'INVOICE', 'TAX', 'RECEIPT', 'BILL', 'NUMBER', 'NO', 'DATE',
        'ORIGINAL', 'COPY', 'GST', 'REG', 'MEMBER', 'SLIP', 'TEL', 'FAX'
    }
    # Lines carrying these markers are probably tax/registration/phone data,
    # unless the word INVOICE also appears on the same line.
    tax_like = ['GST', 'REG', 'SSM', 'TIN', 'PHONE', 'TEL', 'FAX', 'UBL', 'UEN']

    # Pass 1: high-confidence labelled IDs, e.g. "Invoice No: INV-001".
    labelled = re.findall(
        r'(?i)(?:TAX\s*)?(?:INVOICE|INV|BILL|RECEIPT|SLIP)\s*(?:NO|NUMBER|#|NUM)\s*[:\.]?\s*([A-Z0-9\-/]+)',
        text,
    )
    for candidate in labelled:
        candidate = candidate.strip()
        # Require a minimum length and reject label words captured by mistake.
        if len(candidate) >= 3 and candidate.upper() not in blocked:
            return candidate

    # Pass 2: scan the first 25 lines for loosely labelled candidates.
    for raw_line in text.split('\n')[:25]:
        upper = raw_line.upper()

        # Skip probable tax-ID / phone lines unless INVOICE appears too.
        looks_toxic = any(marker in upper for marker in tax_like)
        if looks_toxic and "INVOICE" not in upper:
            continue

        # Only consider lines carrying an invoice-like keyword.
        has_keyword = any(k in upper for k in ['INVOICE', ' NO', ' #', 'INV', 'SLIP', 'BILL'])
        if not has_keyword:
            continue

        # Candidate tokens: 3+ alphanumeric chars (with - and /).
        for token in re.findall(r'\b[A-Z0-9\-/]{3,}\b', upper):
            if token in blocked:
                continue
            # Heuristic: real invoice numbers virtually always contain a
            # digit, which filters words like CASH or CREDIT.
            if any(ch.isdigit() for ch in token):
                return token

    return None
157
 
158
  def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
src/ml_extraction.py CHANGED
@@ -5,17 +5,18 @@ import torch
5
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
6
  from huggingface_hub import snapshot_download
7
  from PIL import Image
8
- import pytesseract
9
- from typing import List, Dict, Any
10
  import re
11
  import numpy as np
12
  from extraction import extract_invoice_number, extract_total
 
 
13
 
14
  # --- CONFIGURATION ---
15
- LOCAL_MODEL_PATH = "./models/layoutlmv3-generalized"
16
- HUB_MODEL_ID = "GSoumyajit2005/layoutlmv3-sroie-invoice-extraction"
17
 
18
- # --- Load Model ---
19
  def load_model_and_processor(model_path, hub_id):
20
  print("Loading processor from microsoft/layoutlmv3-base...")
21
  processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
@@ -32,7 +33,26 @@ def load_model_and_processor(model_path, hub_id):
32
 
33
  return model, processor
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  MODEL, PROCESSOR = load_model_and_processor(LOCAL_MODEL_PATH, HUB_MODEL_ID)
 
36
 
37
  if MODEL and PROCESSOR:
38
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -43,6 +63,71 @@ else:
43
  DEVICE = None
44
  print("❌ Could not load ML model.")
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def _process_predictions(words, unnormalized_boxes, encoding, predictions, id2label):
47
  word_ids = encoding.word_ids(batch_index=0)
48
  word_level_preds = {}
@@ -70,6 +155,7 @@ def _process_predictions(words, unnormalized_boxes, encoding, predictions, id2la
70
 
71
  return entities
72
 
 
73
  def extract_ml_based(image_path: str) -> Dict[str, Any]:
74
  if not MODEL or not PROCESSOR:
75
  raise RuntimeError("ML model is not loaded.")
@@ -77,35 +163,59 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
77
  # 1. Load Image
78
  image = Image.open(image_path).convert("RGB")
79
  width, height = image.size
80
- ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
81
 
82
- words = []
83
- unnormalized_boxes = []
84
- for i in range(len(ocr_data['level'])):
85
- if int(ocr_data['conf'][i]) > 30 and ocr_data['text'][i].strip() != '':
86
- words.append(ocr_data['text'][i])
87
- unnormalized_boxes.append([
88
- ocr_data['left'][i], ocr_data['top'][i],
89
- ocr_data['width'][i], ocr_data['height'][i]
90
- ])
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- raw_text = " ".join(words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- # 2. Normalize Boxes (WITH SAFETY CLAMP)
95
- normalized_boxes = []
96
- for box in unnormalized_boxes:
97
- x, y, w, h = box
98
- x0, y0, x1, y1 = x, y, x + w, y + h
99
-
100
- # ⚠️ The Fix: Ensure values never exceed 1000 or drop below 0
101
- normalized_boxes.append([
102
- max(0, min(1000, int(1000 * (x0 / width)))),
103
- max(0, min(1000, int(1000 * (y0 / height)))),
104
- max(0, min(1000, int(1000 * (x1 / width)))),
105
- max(0, min(1000, int(1000 * (y1 / height)))),
106
- ])
107
-
108
- # 3. Inference
109
  encoding = PROCESSOR(
110
  image, text=words, boxes=normalized_boxes,
111
  truncation=True, max_length=512, return_tensors="pt"
@@ -117,7 +227,7 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
117
  predictions = outputs.logits.argmax(-1).squeeze().tolist()
118
  extracted_entities = _process_predictions(words, unnormalized_boxes, encoding, predictions, MODEL.config.id2label)
119
 
120
- # 4. Construct Output
121
  final_output = {
122
  "vendor": extracted_entities.get("COMPANY", {}).get("text"),
123
  "date": extracted_entities.get("DATE", {}).get("text"),
@@ -130,6 +240,20 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
130
  "raw_predictions": extracted_entities # Contains text and bbox data for each entity
131
  }
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  # Fallbacks
134
  ml_total = extracted_entities.get("TOTAL", {}).get("text")
135
  if ml_total:
@@ -144,5 +268,29 @@ def extract_ml_based(image_path: str) -> Dict[str, Any]:
144
 
145
  if not final_output["receipt_number"]:
146
  final_output["receipt_number"] = extract_invoice_number(raw_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  return final_output
 
5
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
6
  from huggingface_hub import snapshot_download
7
  from PIL import Image
8
+ from typing import List, Dict, Any, Tuple
 
9
  import re
10
  import numpy as np
11
  from extraction import extract_invoice_number, extract_total
12
+ from doctr.io import DocumentFile
13
+ from doctr.models import ocr_predictor
14
 
15
  # --- CONFIGURATION ---
16
+ LOCAL_MODEL_PATH = "./models/layoutlmv3-doctr-trained"
17
+ HUB_MODEL_ID = "GSoumyajit2005/layoutlmv3-doctr-invoice-processor"
18
 
19
+ # --- Load LayoutLMv3 Model ---
20
  def load_model_and_processor(model_path, hub_id):
21
  print("Loading processor from microsoft/layoutlmv3-base...")
22
  processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
 
33
 
34
  return model, processor
35
 
36
+ # --- Load DocTR OCR Predictor ---
37
def load_doctr_predictor():
    """Initialize DocTR predictor and move to GPU for speed.

    Builds the pretrained DB-ResNet50 + CRNN-VGG16-BN pipeline (same pairing
    used by scripts/prepare_doctr_data.py, so inference OCR matches the OCR
    the model was trained against).

    Returns:
        The DocTR ``ocr_predictor``, moved to CUDA when available.
    """
    print("Loading DocTR OCR predictor...")
    predictor = ocr_predictor(
        det_arch='db_resnet50',
        reco_arch='crnn_vgg16_bn',
        pretrained=True
    )
    # Prefer CUDA when present; otherwise warn about CPU speed.
    if torch.cuda.is_available():
        print("🚀 Moving DocTR to GPU (CUDA)...")
        predictor.cuda()
    else:
        print("⚠️ GPU not found. Running on CPU (slow).")

    print("DocTR OCR predictor is ready.")
    return predictor
53
+
54
  MODEL, PROCESSOR = load_model_and_processor(LOCAL_MODEL_PATH, HUB_MODEL_ID)
55
+ DOCTR_PREDICTOR = load_doctr_predictor()
56
 
57
  if MODEL and PROCESSOR:
58
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
63
  DEVICE = None
64
  print("❌ Could not load ML model.")
65
 
66
+
67
def parse_doctr_output(doctr_result, img_width: int, img_height: int) -> Tuple[List[str], List[List[int]], List[List[int]]]:
    """Flatten DocTR output (Page -> Block -> Line -> Word) for LayoutLMv3.

    DocTR reports word geometry as ((x_min, y_min), (x_max, y_max)) relative
    to the page (0-1 scale). Whitespace-only words are dropped.

    Args:
        doctr_result: Output from the DocTR predictor (exposes ``.pages``).
        img_width: Original image width in pixels.
        img_height: Original image height in pixels.

    Returns:
        words: detected word strings.
        unnormalized_boxes: [x, y, width, height] pixel boxes for the
            visualization overlay.
        normalized_boxes: [x0, y0, x1, y1] boxes on the clamped 0-1000 scale
            that LayoutLMv3 expects.
    """
    words: List[str] = []
    unnormalized_boxes: List[List[int]] = []
    normalized_boxes: List[List[int]] = []

    # Walk the full hierarchy, emitting one entry per non-empty word.
    detected = (
        word
        for page in doctr_result.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
        if word.value.strip()
    )

    for word in detected:
        (rel_x0, rel_y0), (rel_x1, rel_y1) = word.geometry

        # Pixel-space corners for visualization.
        px0 = int(rel_x0 * img_width)
        py0 = int(rel_y0 * img_height)
        px1 = int(rel_x1 * img_width)
        py1 = int(rel_y1 * img_height)

        words.append(word.value)
        # [x, y, width, height] in pixels.
        unnormalized_boxes.append([px0, py0, px1 - px0, py1 - py0])
        # [x0, y0, x1, y1] on the 0-1000 scale, clamped against float drift.
        normalized_boxes.append([
            max(0, min(1000, int(rel_x0 * 1000))),
            max(0, min(1000, int(rel_y0 * 1000))),
            max(0, min(1000, int(rel_x1 * 1000))),
            max(0, min(1000, int(rel_y1 * 1000))),
        ])

    return words, unnormalized_boxes, normalized_boxes
129
+
130
+
131
  def _process_predictions(words, unnormalized_boxes, encoding, predictions, id2label):
132
  word_ids = encoding.word_ids(batch_index=0)
133
  word_level_preds = {}
 
155
 
156
  return entities
157
 
158
+
159
  def extract_ml_based(image_path: str) -> Dict[str, Any]:
160
  if not MODEL or not PROCESSOR:
161
  raise RuntimeError("ML model is not loaded.")
 
163
  # 1. Load Image
164
  image = Image.open(image_path).convert("RGB")
165
  width, height = image.size
 
166
 
167
+ # 2. Run DocTR OCR
168
+ doc = DocumentFile.from_images(image_path)
169
+ doctr_result = DOCTR_PREDICTOR(doc)
170
+
171
+ # 3. Parse DocTR output to get words and boxes
172
+ words, unnormalized_boxes, normalized_boxes = parse_doctr_output(
173
+ doctr_result, width, height
174
+ )
175
+
176
+ # Reconstructs lines so regex can work line-by-line
177
+ lines = []
178
+ current_line = []
179
+
180
+ if len(unnormalized_boxes) > 0:
181
+ # Initialize with first word's Y and Height
182
+ current_y = unnormalized_boxes[0][1]
183
+ current_h = unnormalized_boxes[0][3]
184
+
185
+ for i, word in enumerate(words):
186
+ y = unnormalized_boxes[i][1]
187
+ h = unnormalized_boxes[i][3]
188
 
189
+ # If vertical gap > 50% of line height, it's a new line
190
+ if abs(y - current_y) > max(current_h, h) / 2:
191
+ lines.append(" ".join(current_line))
192
+ current_line = []
193
+ current_y = y
194
+ current_h = h
195
+
196
+ current_line.append(word)
197
+
198
+ # Append the last line
199
+ if current_line:
200
+ lines.append(" ".join(current_line))
201
+
202
+ raw_text = "\n".join(lines)
203
+
204
+ # Handle empty OCR result
205
+ if not words:
206
+ return {
207
+ "vendor": None,
208
+ "date": None,
209
+ "address": None,
210
+ "receipt_number": None,
211
+ "bill_to": None,
212
+ "total_amount": None,
213
+ "items": [],
214
+ "raw_text": "",
215
+ "raw_predictions": {}
216
+ }
217
 
218
+ # 4. Inference with LayoutLMv3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  encoding = PROCESSOR(
220
  image, text=words, boxes=normalized_boxes,
221
  truncation=True, max_length=512, return_tensors="pt"
 
227
  predictions = outputs.logits.argmax(-1).squeeze().tolist()
228
  extracted_entities = _process_predictions(words, unnormalized_boxes, encoding, predictions, MODEL.config.id2label)
229
 
230
+ # 5. Construct Output
231
  final_output = {
232
  "vendor": extracted_entities.get("COMPANY", {}).get("text"),
233
  "date": extracted_entities.get("DATE", {}).get("text"),
 
240
  "raw_predictions": extracted_entities # Contains text and bbox data for each entity
241
  }
242
 
243
+ # 6. Vendor Fallback (Spatial Heuristic)
244
+ # If ML failed to find a vendor, assume the largest text at the top is the vendor
245
+ if not final_output["vendor"] and unnormalized_boxes:
246
+ # Filter for words in the top 20% of the image
247
+ top_words_indices = [
248
+ i for i, box in enumerate(unnormalized_boxes)
249
+ if box[1] < height * 0.2
250
+ ]
251
+
252
+ if top_words_indices:
253
+ # Find the word with the largest height (font size)
254
+ largest_idx = max(top_words_indices, key=lambda i: unnormalized_boxes[i][3])
255
+ final_output["vendor"] = words[largest_idx]
256
+
257
  # Fallbacks
258
  ml_total = extracted_entities.get("TOTAL", {}).get("text")
259
  if ml_total:
 
268
 
269
  if not final_output["receipt_number"]:
270
  final_output["receipt_number"] = extract_invoice_number(raw_text)
271
+
272
+ # Backfill Bounding Boxes for Regex Results
273
+ # If Regex found the number but ML didn't, we must find its box
274
+ # in the OCR data so the UI can draw it.
275
+
276
+ if final_output["receipt_number"] and "INVOICE_NO" not in final_output["raw_predictions"]:
277
+ target_val = final_output["receipt_number"].strip()
278
+ found_box = None
279
+
280
+ # 1. Try finding the exact word in the OCR list
281
+ # 'words' and 'unnormalized_boxes' are available from step 3
282
+ for i, word in enumerate(words):
283
+ # Check for exact match or if the word contains the target (e.g. "Inv#123")
284
+ if target_val == word or (len(target_val) > 3 and target_val in word):
285
+ found_box = unnormalized_boxes[i]
286
+ break
287
+
288
+ # 2. If found, inject it into raw_predictions
289
+ if found_box:
290
+ # The UI expects a list of boxes
291
+ final_output["raw_predictions"]["INVOICE_NO"] = {
292
+ "text": target_val,
293
+ "bbox": [found_box]
294
+ }
295
 
296
  return final_output
src/ocr.py DELETED
@@ -1,42 +0,0 @@
1
- # src/ocr.py
2
-
3
- import pytesseract
4
- import numpy as np
5
- import os
6
- import shutil
7
- import sys
8
-
9
- # --- Dynamic Tesseract Configuration ---
10
- # This block ensures the code runs on both Windows (Local) and Linux (Production)
11
- if os.name == 'nt': # Windows
12
- # Common default installation paths for Windows
13
- possible_paths = [
14
- r'C:\Program Files\Tesseract-OCR\tesseract.exe',
15
- r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
16
- r'C:\Users\{}\AppData\Local\Tesseract-OCR\tesseract.exe'.format(os.getlogin())
17
- ]
18
-
19
- # Search for the executable
20
- found = False
21
- for path in possible_paths:
22
- if os.path.exists(path):
23
- pytesseract.pytesseract.tesseract_cmd = path
24
- found = True
25
- print(f"✅ Found Tesseract at: {path}")
26
- break
27
-
28
- if not found:
29
- print("⚠️ Warning: Tesseract exe not found in standard paths. Assuming it's in system PATH.")
30
- else:
31
- # Linux/Mac (Docker/Production)
32
- if not shutil.which('tesseract'):
33
- print("⚠️ Warning: 'tesseract' binary not found in PATH. Please install tesseract-ocr.")
34
-
35
- def extract_text(image: np.ndarray, lang: str='eng', config: str='--psm 11') -> str:
36
- if image is None:
37
- raise ValueError("Input image is None")
38
- # Pytesseract will now use the path found above (or default to PATH)
39
- return pytesseract.image_to_string(image, lang=lang, config=config).strip()
40
-
41
- def extract_text_with_boxes(image):
42
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pipeline.py CHANGED
@@ -13,7 +13,6 @@ import cv2
13
 
14
  # --- IMPORTS ---
15
  from preprocessing import load_image, convert_to_grayscale, remove_noise
16
- from ocr import extract_text
17
  from extraction import structure_output
18
  from ml_extraction import extract_ml_based
19
  from schema import InvoiceData
@@ -90,13 +89,10 @@ def process_invoice(image_path: str,
90
 
91
  elif method == 'rules':
92
  try:
93
- image = load_image(image_path)
94
- gray_image = convert_to_grayscale(image)
95
- preprocessed_image = remove_noise(gray_image, kernel_size=3)
96
- text = extract_text(preprocessed_image, config='--psm 6')
97
- raw_result = structure_output(text)
98
  except Exception as e:
99
- raise ValueError(f"Error during rule-based extraction: {e}")
100
 
101
  # Clean up temp file if we created one
102
  if image_path.endswith('.jpg') and 'sample_pdf' in image_path: # Safety check
 
13
 
14
  # --- IMPORTS ---
15
  from preprocessing import load_image, convert_to_grayscale, remove_noise
 
16
  from extraction import structure_output
17
  from ml_extraction import extract_ml_based
18
  from schema import InvoiceData
 
89
 
90
  elif method == 'rules':
91
  try:
92
+ print("⚠️ Rule-based mode is deprecated. Redirecting to ML-based extraction.")
93
+ raw_result = extract_ml_based(image_path)
 
 
 
94
  except Exception as e:
95
+ raise ValueError(f"Error during ML-based extraction: {e}")
96
 
97
  # Clean up temp file if we created one
98
  if image_path.endswith('.jpg') and 'sample_pdf' in image_path: # Safety check