Spaces:

GSoumyajit2005
/

invoice-processor-ml

Sleeping

App Files Community

GSoumyajit2005 committed on Jan 12

Commit

faa3050

1 Parent(s): c19ef4d

feat: Update Dockerfile and requirements for PDF processing, add new dependencies, and refactor API structure

Browse files

Files changed (9) hide show

Dockerfile +7 -6
requirements.txt +6 -1
src/api.py +0 -3
src/pdf_utils.py +2 -0
src/pipeline.py +2 -0
src/preprocessing.py +2 -0
src/sroie_loader.py +2 -0
src/utils.py +2 -0
tests/test_pipeline.py +40 -85

Dockerfile CHANGED Viewed

@@ -1,9 +1,11 @@
 # Use an official Python runtime
 FROM python:3.10-slim
-# Install system dependencies (Tesseract + OpenCV)
 RUN apt-get update && apt-get install -y \
     tesseract-ocr \
     ffmpeg libsm6 libxext6 \
     && rm -rf /var/lib/apt/lists/*
@@ -13,13 +15,12 @@ WORKDIR /app
 # Install Python dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install fastapi uvicorn python-multipart
 # Copy application code
 COPY . .
-# Expose API port
-EXPOSE 8000
-# Run FastAPI
-CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]

 # Use an official Python runtime
 FROM python:3.10-slim
+# 1. Install system dependencies (Tesseract + OpenCV + POPPLER)
+# Added poppler-utils because src/pdf_utils.py uses pdf2image
 RUN apt-get update && apt-get install -y \
     tesseract-ocr \
+    poppler-utils \
     ffmpeg libsm6 libxext6 \
     && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application code
 COPY . .
+# 2. Change Port to 7860 (Hugging Face default)
+EXPOSE 7860
+# 3. Run Streamlit
+CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]

requirements.txt CHANGED Viewed

@@ -28,4 +28,9 @@ pdf2image>=1.16.0
 # ----- API Framework -----
 fastapi>=0.126.0
 uvicorn[standard]>=0.38.0
-python-multipart>=0.0.21

 # ----- API Framework -----
 fastapi>=0.126.0
 uvicorn[standard]>=0.38.0
+python-multipart>=0.0.21
+# ----- Database -----
+sqlmodel>=0.0.14
+psycopg2-binary>=2.9.0
+alembic>=1.13.0

src/api.py CHANGED Viewed

@@ -1,6 +1,3 @@
 # src/api.py
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks





1	# src/api.py
2
3	from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

src/pdf_utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import pdfplumber
 from pdf2image import convert_from_path
 from pathlib import Path

+# src/pdf_utils.py
 import pdfplumber
 from pdf2image import convert_from_path
 from pathlib import Path

src/pipeline.py CHANGED Viewed

@@ -1,3 +1,5 @@
 """
 Main invoice processing pipeline
 Orchestrates preprocessing, OCR, and extraction

+# src/pipeline.py
 """
 Main invoice processing pipeline
 Orchestrates preprocessing, OCR, and extraction

src/preprocessing.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import cv2
 import numpy as np
 from pathlib import Path

+# src/preprocessing.py
 import cv2
 import numpy as np
 from pathlib import Path

src/sroie_loader.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import json
 from pathlib import Path
 from PIL import Image

+# src/sroie_loader.py
 import json
 from pathlib import Path
 from PIL import Image

src/utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import hashlib
 from typing import Dict, Any
 from decimal import Decimal

+# src/utils.py
 import hashlib
 from typing import Dict, Any
 from decimal import Decimal

tests/test_pipeline.py CHANGED Viewed

@@ -1,96 +1,51 @@
-import sys
-import json
 from pathlib import Path
-# Add the 'src' directory to the Python path
-sys.path.append('src')
 from pipeline import process_invoice
-def test_full_pipeline():
     """
-    Tests the full invoice processing pipeline on a sample receipt
-    and prints the advanced JSON structure.
     """
-    print("=" * 60)
-    print("🎯 ADVANCED INVOICE PROCESSING PIPELINE TEST")
-    print("=" * 60)
-    # --- Configuration ---
-    image_path = 'data/raw/receipt1.jpg'
-    save_output = True
-    output_dir = 'outputs'
-    # Check if the image exists
-    if not Path(image_path).exists():
-        print(f"❌ ERROR: Test image not found at '{image_path}'")
-        return
-    # --- Processing ---
-    print(f"\n🔄 Processing invoice: {image_path}...")
-    try:
-        # Call the main processing function
-        result = process_invoice(image_path, save_results=save_output, output_dir=output_dir)
-        print("✅ Invoice processed successfully!")
-    except Exception as e:
-        print(f"❌ An error occurred during processing: {e}")
-        # Print traceback for detailed debugging
-        import traceback
-        traceback.print_exc()
-        return
-    # --- Display Results ---
-    print("\n" + "=" * 60)
-    print("📊 EXTRACTED INVOICE DATA (Advanced JSON)")
-    print("=" * 60)
-    # Pretty-print the JSON to the console
-    print(json.dumps(result, indent=2, ensure_ascii=False))
-    print("\n" + "=" * 60)
-    print("📋 SUMMARY OF KEY EXTRACTED FIELDS")
-    print("=" * 60)
-    # --- Print a clean summary ---
-    print(f"📄 Receipt Number: {result.get('receipt_number', 'N/A')}")
-    print(f"📅 Date: {result.get('date', 'N/A')}")
-    # Print Bill To info safely
-    bill_to = result.get('bill_to')
-    if bill_to and isinstance(bill_to, dict):
-        print(f"👤 Bill To: {bill_to.get('name', 'N/A')}")
-    else:
-        print("👤 Bill To: N/A")
-    # Print line items
-    print("\n🛒 Line Items:")
-    items = result.get('items', [])
-    if items:
-        for i, item in enumerate(items, 1):
-            desc = item.get('description', 'No Description')
-            qty = item.get('quantity', 1)
-            total = item.get('total', 0.0)
-            print(f"  - Item {i}: {desc[:40]:<40} | Qty: {qty} | Total: {total:.2f}")
-    else:
-        print("  - No line items extracted.")
-    # Print total and validation status
-    print(f"\n💵 Total Amount: ${result.get('total_amount', 0.0)}")
-    confidence = result.get('extraction_confidence', 0)
-    print(f"📈 Confidence: {confidence}%")
-    validation = "✅ Passed" if result.get('validation_passed', False) else "❌ Failed"
-    print(f"✔️ Validation: {validation}")
-    print("\n" + "=" * 60)
-    if save_output:
-        json_path = Path(output_dir) / (Path(image_path).stem + '.json')
-        print(f"\n💾 Full JSON output saved to: {json_path}")
-    print("\n🎉 PIPELINE TEST COMPLETE!")
-if __name__ == '__main__':
-    test_full_pipeline()

+import pytest
+from unittest.mock import patch
 from pathlib import Path
+import sys
+import os
+# Add src to path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
 from pipeline import process_invoice
+# --- MOCK DATA ---
+# This is what we pretend the ML model returned
+MOCK_ML_RESPONSE = {
+    "vendor": "MOCKED VENDOR INC",
+    "date": "2023-01-01",
+    "total_amount": "100.00",
+    "receipt_number": "MOCK-123",
+    "address": "123 Mock Street",
+    "bill_to": "Mock Customer",
+    "items": [],
+    "raw_text": "Mocked raw text content"
+}
+@patch('pipeline.extract_rule_based')
+def test_pipeline_rule_based(mock_extract):
+    mock_extract.return_value = MOCK_ML_RESPONSE
+    with patch('pathlib.Path.exists', return_value=True):
+        result = process_invoice("fake_invoice.jpg", method="rules")
+    assert result['vendor'] == "MOCKED VENDOR INC"
+    assert result['validation_status'] == "passed"
+    mock_extract.assert_called_once()
+@patch('pipeline.extract_ml_based')
+def test_pipeline_ml_mocked(mock_extract):
     """
+    Tests the ML pipeline WITHOUT loading the heavy model.
     """
+    mock_extract.return_value = MOCK_ML_RESPONSE
+    with patch('pathlib.Path.exists', return_value=True):
+        result = process_invoice("fake_invoice.jpg", method="ml")
+    assert result['vendor'] == "MOCKED VENDOR INC"
+    assert result['receipt_number'] == "MOCK-123"
+    assert result['validation_status'] == "passed"
+    mock_extract.assert_called_once()