GSoumyajit2005 committed on
Commit
faa3050
·
1 Parent(s): c19ef4d

feat: Update Dockerfile and requirements for PDF processing, add new dependencies, and refactor API structure

Browse files
Dockerfile CHANGED
@@ -1,9 +1,11 @@
1
  # Use an official Python runtime
2
  FROM python:3.10-slim
3
 
4
- # Install system dependencies (Tesseract + OpenCV)
 
5
  RUN apt-get update && apt-get install -y \
6
  tesseract-ocr \
 
7
  ffmpeg libsm6 libxext6 \
8
  && rm -rf /var/lib/apt/lists/*
9
 
@@ -13,13 +15,12 @@ WORKDIR /app
13
  # Install Python dependencies
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
- RUN pip install fastapi uvicorn python-multipart
17
 
18
  # Copy application code
19
  COPY . .
20
 
21
- # Expose API port
22
- EXPOSE 8000
23
 
24
- # Run FastAPI
25
- CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
  # Use an official Python runtime
2
  FROM python:3.10-slim
3
 
4
+ # 1. Install system dependencies (Tesseract + OpenCV + POPPLER)
5
+ # Added poppler-utils because src/pdf_utils.py uses pdf2image
6
  RUN apt-get update && apt-get install -y \
7
  tesseract-ocr \
8
+ poppler-utils \
9
  ffmpeg libsm6 libxext6 \
10
  && rm -rf /var/lib/apt/lists/*
11
 
 
15
  # Install Python dependencies
16
  COPY requirements.txt .
17
  RUN pip install --no-cache-dir -r requirements.txt
 
18
 
19
  # Copy application code
20
  COPY . .
21
 
22
+ # 2. Change Port to 7860 (Hugging Face default)
23
+ EXPOSE 7860
24
 
25
+ # 3. Run Streamlit
26
+ CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
requirements.txt CHANGED
@@ -28,4 +28,9 @@ pdf2image>=1.16.0
28
  # ----- API Framework -----
29
  fastapi>=0.126.0
30
  uvicorn[standard]>=0.38.0
31
- python-multipart>=0.0.21
 
 
 
 
 
 
28
  # ----- API Framework -----
29
  fastapi>=0.126.0
30
  uvicorn[standard]>=0.38.0
31
+ python-multipart>=0.0.21
32
+
33
+ # ----- Database -----
34
+ sqlmodel>=0.0.14
35
+ psycopg2-binary>=2.9.0
36
+ alembic>=1.13.0
src/api.py CHANGED
@@ -1,6 +1,3 @@
1
-
2
-
3
-
4
  # src/api.py
5
 
6
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 
 
 
 
1
  # src/api.py
2
 
3
  from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
src/pdf_utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import pdfplumber
2
  from pdf2image import convert_from_path
3
  from pathlib import Path
 
1
+ # src/pdf_utils.py
2
+
3
  import pdfplumber
4
  from pdf2image import convert_from_path
5
  from pathlib import Path
src/pipeline.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  """
2
  Main invoice processing pipeline
3
  Orchestrates preprocessing, OCR, and extraction
 
1
+ # src/pipeline.py
2
+
3
  """
4
  Main invoice processing pipeline
5
  Orchestrates preprocessing, OCR, and extraction
src/preprocessing.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import cv2
2
  import numpy as np
3
  from pathlib import Path
 
1
+ # src/preprocessing.py
2
+
3
  import cv2
4
  import numpy as np
5
  from pathlib import Path
src/sroie_loader.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import json
2
  from pathlib import Path
3
  from PIL import Image
 
1
+ # src/sroie_loader.py
2
+
3
  import json
4
  from pathlib import Path
5
  from PIL import Image
src/utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import hashlib
2
  from typing import Dict, Any
3
  from decimal import Decimal
 
1
+ # src/utils.py
2
+
3
  import hashlib
4
  from typing import Dict, Any
5
  from decimal import Decimal
tests/test_pipeline.py CHANGED
@@ -1,96 +1,51 @@
1
- import sys
2
- import json
3
  from pathlib import Path
 
 
4
 
5
- # Add the 'src' directory to the Python path
6
- sys.path.append('src')
7
 
8
  from pipeline import process_invoice
9
 
10
- def test_full_pipeline():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
- Tests the full invoice processing pipeline on a sample receipt
13
- and prints the advanced JSON structure.
14
  """
15
- print("=" * 60)
16
- print("🎯 ADVANCED INVOICE PROCESSING PIPELINE TEST")
17
- print("=" * 60)
18
-
19
- # --- Configuration ---
20
- image_path = 'data/raw/receipt1.jpg'
21
- save_output = True
22
- output_dir = 'outputs'
23
-
24
- # Check if the image exists
25
- if not Path(image_path).exists():
26
- print(f"❌ ERROR: Test image not found at '{image_path}'")
27
- return
28
 
29
- # --- Processing ---
30
- print(f"\n🔄 Processing invoice: {image_path}...")
31
- try:
32
- # Call the main processing function
33
- result = process_invoice(image_path, save_results=save_output, output_dir=output_dir)
34
- print("✅ Invoice processed successfully!")
35
- except Exception as e:
36
- print(f"❌ An error occurred during processing: {e}")
37
- # Print traceback for detailed debugging
38
- import traceback
39
- traceback.print_exc()
40
- return
41
 
42
- # --- Display Results ---
43
- print("\n" + "=" * 60)
44
- print("📊 EXTRACTED INVOICE DATA (Advanced JSON)")
45
- print("=" * 60)
46
 
47
- # Pretty-print the JSON to the console
48
- print(json.dumps(result, indent=2, ensure_ascii=False))
49
-
50
- print("\n" + "=" * 60)
51
- print("📋 SUMMARY OF KEY EXTRACTED FIELDS")
52
- print("=" * 60)
53
-
54
- # --- Print a clean summary ---
55
- print(f"📄 Receipt Number: {result.get('receipt_number', 'N/A')}")
56
- print(f"📅 Date: {result.get('date', 'N/A')}")
57
 
58
- # Print Bill To info safely
59
- bill_to = result.get('bill_to')
60
- if bill_to and isinstance(bill_to, dict):
61
- print(f"👤 Bill To: {bill_to.get('name', 'N/A')}")
62
- else:
63
- print("👤 Bill To: N/A")
64
-
65
- # Print line items
66
- print("\n🛒 Line Items:")
67
- items = result.get('items', [])
68
- if items:
69
- for i, item in enumerate(items, 1):
70
- desc = item.get('description', 'No Description')
71
- qty = item.get('quantity', 1)
72
- total = item.get('total', 0.0)
73
- print(f" - Item {i}: {desc[:40]:<40} | Qty: {qty} | Total: {total:.2f}")
74
- else:
75
- print(" - No line items extracted.")
76
-
77
- # Print total and validation status
78
- print(f"\n💵 Total Amount: ${result.get('total_amount', 0.0)}")
79
-
80
- confidence = result.get('extraction_confidence', 0)
81
- print(f"📈 Confidence: {confidence}%")
82
-
83
- validation = "✅ Passed" if result.get('validation_passed', False) else "❌ Failed"
84
- print(f"✔️ Validation: {validation}")
85
-
86
- print("\n" + "=" * 60)
87
-
88
- if save_output:
89
- json_path = Path(output_dir) / (Path(image_path).stem + '.json')
90
- print(f"\n💾 Full JSON output saved to: {json_path}")
91
-
92
- print("\n🎉 PIPELINE TEST COMPLETE!")
93
-
94
-
95
- if __name__ == '__main__':
96
- test_full_pipeline()
 
1
+ import pytest
2
+ from unittest.mock import patch
3
  from pathlib import Path
4
+ import sys
5
+ import os
6
 
7
+ # Add src to path
8
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
9
 
10
  from pipeline import process_invoice
11
 
12
+ # --- MOCK DATA ---
13
+ # This is what we pretend the ML model returned
14
+ MOCK_ML_RESPONSE = {
15
+ "vendor": "MOCKED VENDOR INC",
16
+ "date": "2023-01-01",
17
+ "total_amount": "100.00",
18
+ "receipt_number": "MOCK-123",
19
+ "address": "123 Mock Street",
20
+ "bill_to": "Mock Customer",
21
+ "items": [],
22
+ "raw_text": "Mocked raw text content"
23
+ }
24
+
25
+ @patch('pipeline.extract_rule_based')
26
+ def test_pipeline_rule_based(mock_extract):
27
+ mock_extract.return_value = MOCK_ML_RESPONSE
28
+
29
+ with patch('pathlib.Path.exists', return_value=True):
30
+ result = process_invoice("fake_invoice.jpg", method="rules")
31
+
32
+ assert result['vendor'] == "MOCKED VENDOR INC"
33
+ assert result['validation_status'] == "passed"
34
+ mock_extract.assert_called_once()
35
+
36
+ @patch('pipeline.extract_ml_based')
37
+ def test_pipeline_ml_mocked(mock_extract):
38
  """
39
+ Tests the ML pipeline WITHOUT loading the heavy model.
 
40
  """
41
+ mock_extract.return_value = MOCK_ML_RESPONSE
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ with patch('pathlib.Path.exists', return_value=True):
44
+ result = process_invoice("fake_invoice.jpg", method="ml")
 
 
 
 
 
 
 
 
 
 
45
 
46
+ assert result['vendor'] == "MOCKED VENDOR INC"
47
+ assert result['receipt_number'] == "MOCK-123"
48
+ assert result['validation_status'] == "passed"
 
49
 
50
+ mock_extract.assert_called_once()
 
 
 
 
 
 
 
 
 
51