GSoumyajit2005 committed on
Commit
f74e17e
·
1 Parent(s): 1144bea

feat: Enhance pipeline with smart PDF handling, Pydantic validation, and semantic hashing, and refactor API to src.

Browse files
Files changed (10) hide show
  1. api.py +0 -35
  2. app.py +63 -75
  3. requirements.txt +19 -2
  4. src/api.py +67 -0
  5. src/pdf_utils.py +50 -0
  6. src/pipeline.py +106 -25
  7. src/schema.py +112 -0
  8. src/utils.py +35 -0
  9. tests/test_full_pipeline.py +2 -2
  10. tests/test_pipeline.py +1 -1
api.py DELETED
@@ -1,35 +0,0 @@
1
- from fastapi import FastAPI, UploadFile, File, HTTPException
2
- from src.pipeline import process_invoice
3
- import shutil
4
- import os
5
- import uvicorn
6
-
7
- app = FastAPI(title="Invoice Extraction API", version="1.0")
8
-
9
- @app.post("/extract")
10
- async def extract_invoice(file: UploadFile = File(...), method: str = 'ml'):
11
- """
12
- Endpoint to process an uploaded invoice file.
13
- """
14
- temp_file_path = f"temp_{file.filename}"
15
-
16
- try:
17
- # Save uploaded file temporarily
18
- with open(temp_file_path, "wb") as buffer:
19
- shutil.copyfileobj(file.file, buffer)
20
-
21
- # Run pipeline
22
- result = process_invoice(temp_file_path, method=method, save_results=False)
23
-
24
- return {"status": "success", "data": result}
25
-
26
- except Exception as e:
27
- raise HTTPException(status_code=500, detail=str(e))
28
-
29
- finally:
30
- # Cleanup temp file
31
- if os.path.exists(temp_file_path):
32
- os.remove(temp_file_path)
33
-
34
- if __name__ == "__main__":
35
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -12,16 +12,12 @@ import sys
12
  sys.path.append('src')
13
  from pipeline import process_invoice
14
 
15
- # --- Mock Functions to support the UI without errors ---
16
- # These functions simulate the ones from your example README.
17
- # They allow the UI to render without needing to build a complex format detector today.
18
-
19
  def detect_invoice_format(ocr_text: str):
20
  """
21
  A mock function to simulate format detection.
22
  In a real system, this would analyze the text layout.
23
  """
24
- # Simple heuristic: if it contains "SDN BHD", it's our known format.
25
  if "SDN BHD" in ocr_text:
26
  return {
27
  'name': 'Template A (Retail)',
@@ -44,9 +40,8 @@ def get_format_recommendations(format_info):
44
  else:
45
  return ["• Results may be incomplete.", "• Consider adding patterns for this format."]
46
 
47
- # --- Streamlit App ---
48
 
49
- # Page configuration
50
  st.set_page_config(
51
  page_title="Invoice Processor",
52
  page_icon="📄",
@@ -54,7 +49,7 @@ st.set_page_config(
54
  initial_sidebar_state="expanded"
55
  )
56
 
57
- # Custom CSS for styling
58
  st.markdown("""
59
  <style>
60
  .main-header {
@@ -87,11 +82,10 @@ st.markdown("""
87
  </style>
88
  """, unsafe_allow_html=True)
89
 
90
- # Title
91
  st.markdown('<h1 class="main-header">📄 Smart Invoice Processor</h1>', unsafe_allow_html=True)
92
  st.markdown("### Extract structured data from invoices using your custom-built OCR pipeline")
93
 
94
- # Sidebar
95
  with st.sidebar:
96
  st.header("ℹ️ About")
97
  st.info("""
@@ -118,7 +112,7 @@ with st.sidebar:
118
  extraction_method = st.selectbox(
119
  "Choose Extraction Method:",
120
  ('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
121
- help="ML-Based is more robust but may miss fields not in its training data. Rule-Based is faster but more fragile."
122
  )
123
 
124
  # Main content
@@ -128,18 +122,22 @@ with tab1:
128
  st.header("Upload an Invoice")
129
 
130
  uploaded_file = st.file_uploader(
131
- "Choose an invoice image (JPG, PNG)",
132
- type=['jpg', 'jpeg', 'png'],
133
- help="Upload a clear image of an invoice or receipt"
134
  )
135
 
136
  if uploaded_file is not None:
137
  col1, col2 = st.columns([1, 1])
138
 
139
  with col1:
140
- st.subheader("📸 Original Image")
141
- image = Image.open(uploaded_file)
142
- st.image(image, use_container_width=True)
 
 
 
 
143
  st.caption(f"Filename: {uploaded_file.name}")
144
 
145
  with col2:
@@ -148,27 +146,24 @@ with tab1:
148
  if st.button("🚀 Extract Data", type="primary"):
149
  with st.spinner("Executing your custom pipeline..."):
150
  try:
151
- # Save the uploaded file to a temporary path to be used by our pipeline
152
  temp_dir = "temp"
153
  os.makedirs(temp_dir, exist_ok=True)
154
  temp_path = os.path.join(temp_dir, uploaded_file.name)
155
  with open(temp_path, "wb") as f:
156
  f.write(uploaded_file.getbuffer())
157
 
158
- # Step 1: Call YOUR full pipeline function
159
  st.write("✅ Calling `process_invoice`...")
160
- # Map the user-friendly name from the dropdown to the actual method parameter
161
  method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
162
  st.write(f"⚙️ Using **{method.upper()}** extraction method...")
163
 
164
- # Call the pipeline with the selected method
165
- extracted_data = process_invoice(temp_path, method=method)
166
 
167
- # Step 2: Simulate format detection using the extracted data
168
  st.write("✅ Simulating format detection...")
169
  format_info = detect_invoice_format(extracted_data.get("raw_text", ""))
170
 
171
- # Store results in session state to display them
172
  st.session_state.extracted_data = extracted_data
173
  st.session_state.format_info = format_info
174
  st.session_state.processed_count += 1
@@ -178,12 +173,12 @@ with tab1:
178
  except Exception as e:
179
  st.error(f"❌ An error occurred in the pipeline: {str(e)}")
180
 
181
- # Display results if they exist in the session state
182
  if 'extracted_data' in st.session_state:
183
  st.markdown("---")
184
  st.header("📊 Extraction Results")
185
 
186
- # --- Format Detection Section ---
187
  format_info = st.session_state.format_info
188
  st.subheader("📋 Detected Format (Simulated)")
189
  col1_fmt, col2_fmt = st.columns([2, 3])
@@ -199,39 +194,34 @@ with tab1:
199
  for rec in get_format_recommendations(format_info): st.write(rec)
200
  st.markdown("---")
201
 
202
- # --- Main Results Section ---
203
  data = st.session_state.extracted_data
204
 
205
- # Confidence display
206
- confidence = data.get('extraction_confidence', 0)
207
- if confidence >= 80:
208
- st.markdown(f'<div class="success-box">✅ <strong>High Confidence: {confidence}%</strong> - Most key fields were found.</div>', unsafe_allow_html=True)
209
- elif confidence >= 50:
210
- st.markdown(f'<div class="warning-box">⚠️ <strong>Medium Confidence: {confidence}%</strong> - Some fields may be missing.</div>', unsafe_allow_html=True)
 
211
  else:
212
- st.markdown(f'<div class="error-box">❌ <strong>Low Confidence: {confidence}%</strong> - Format likely unsupported.</div>', unsafe_allow_html=True)
213
 
214
- # Validation display
215
- if data.get('validation_passed', False):
216
- st.success("✔️ Validation Passed: Total amount appears consistent with other extracted amounts.")
217
- else:
218
- st.warning("⚠️ Validation Failed: Total amount could not be verified against other numbers.")
219
-
220
- # Key metrics display
221
- # Key metrics display
222
- st.metric("🏢 Vendor", data.get('vendor') or "N/A") # <-- ADD THIS
223
 
224
  res_col1, res_col2, res_col3 = st.columns(3)
225
  res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
226
  res_col2.metric("📅 Date", data.get('date') or "N/A")
227
- res_col3.metric("💵 Total Amount", f"${data.get('total_amount'):.2f}" if data.get('total_amount') is not None else "N/A")
 
 
228
 
229
- # Use an expander for longer text fields like address
230
  with st.expander("Show More Details"):
231
- # Handle receipt_number
232
  st.markdown(f"**🧾 Receipt Number:** {data.get('receipt_number') or 'N/A'}")
233
 
234
- # Handle bill_to (can be string from ML or dict from rules)
235
  bill_to = data.get('bill_to')
236
  if isinstance(bill_to, dict):
237
  bill_to_display = bill_to.get('name') or 'N/A'
@@ -242,16 +232,18 @@ with tab1:
242
  st.markdown(f"**👤 Bill To:** {bill_to_display}")
243
 
244
  st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
 
 
 
245
 
246
- # Line items table
247
  if data.get('items'):
248
  st.subheader("🛒 Line Items")
249
- # Ensure data is in the right format for DataFrame
250
  items_df_data = [{
251
  "Description": item.get("description", "N/A"),
252
  "Qty": item.get("quantity", "N/A"),
253
- "Unit Price": f"${item.get('unit_price', 0.0):.2f}",
254
- "Total": f"${item.get('total', 0.0):.2f}"
255
  } for item in data['items']]
256
  df = pd.DataFrame(items_df_data)
257
  st.dataframe(df, use_container_width=True)
@@ -281,15 +273,17 @@ with tab2:
281
  st.header("📚 Sample Invoices")
282
  st.write("Try the sample invoice below to see how the system performs:")
283
 
284
- sample_dir = "data/samples" # ✅ Points to the correct folder
285
  if os.path.exists(sample_dir):
286
- sample_files = [f for f in os.listdir(sample_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]
287
 
288
  if sample_files:
289
- # Display the first sample found
290
- img_path = os.path.join(sample_dir, sample_files[0])
291
- st.image(Image.open(img_path), caption=sample_files[0], use_container_width=True)
292
- st.info("You can download this image and upload it in the 'Upload & Process' tab to test the pipeline.")
 
 
293
  else:
294
  st.warning("No sample invoices found in `data/samples/`.")
295
  else:
@@ -300,26 +294,20 @@ with tab3:
300
  st.markdown("""
301
  This app follows the exact pipeline you built:
302
  ```
303
- 1. 📸 Image Upload
304
-
305
- 2. 🔄 Preprocessing (OpenCV)
306
- Grayscale conversion and noise removal.
307
 
308
- 3. 🔍 OCR (Tesseract)
309
- Optimized with PSM 6 for receipt layouts.
 
310
 
311
- 4. 🎯 Rule-Based Extraction (Regex)
312
- Your custom patterns find specific fields.
313
 
314
- 5. Confidence & Validation
315
- Heuristics to check the quality of the extraction.
316
 
317
- 6. 📊 Output JSON
318
- Presents all extracted data in a structured format.
319
  ```
320
- """)
321
- st.info("This rule-based system is a great foundation. The next step is to replace the extraction logic with an ML model like LayoutLM to handle more diverse formats!")
322
-
323
- # Footer
324
- st.markdown("---")
325
- st.markdown("<div style='text-align: center; color: #666;'>Built with your custom Python pipeline | UI by Streamlit</div>", unsafe_allow_html=True)
 
12
  sys.path.append('src')
13
  from pipeline import process_invoice
14
 
15
+ # --- Mock Functions (KEPT AS IS) ---
 
 
 
16
  def detect_invoice_format(ocr_text: str):
17
  """
18
  A mock function to simulate format detection.
19
  In a real system, this would analyze the text layout.
20
  """
 
21
  if "SDN BHD" in ocr_text:
22
  return {
23
  'name': 'Template A (Retail)',
 
40
  else:
41
  return ["• Results may be incomplete.", "• Consider adding patterns for this format."]
42
 
43
+ # --- Streamlit App (KEPT AS IS) ---
44
 
 
45
  st.set_page_config(
46
  page_title="Invoice Processor",
47
  page_icon="📄",
 
49
  initial_sidebar_state="expanded"
50
  )
51
 
52
+ # Custom CSS (KEPT AS IS)
53
  st.markdown("""
54
  <style>
55
  .main-header {
 
82
  </style>
83
  """, unsafe_allow_html=True)
84
 
85
+ # Title & Sidebar (KEPT AS IS)
86
  st.markdown('<h1 class="main-header">📄 Smart Invoice Processor</h1>', unsafe_allow_html=True)
87
  st.markdown("### Extract structured data from invoices using your custom-built OCR pipeline")
88
 
 
89
  with st.sidebar:
90
  st.header("ℹ️ About")
91
  st.info("""
 
112
  extraction_method = st.selectbox(
113
  "Choose Extraction Method:",
114
  ('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
115
+ help="ML-Based is more robust. Rule-Based is faster."
116
  )
117
 
118
  # Main content
 
122
  st.header("Upload an Invoice")
123
 
124
  uploaded_file = st.file_uploader(
125
+ "Choose an invoice image (JPG, PNG) or PDF",
126
+ type=['jpg', 'jpeg', 'png', 'pdf'], # Added PDF support
127
+ help="Upload a clear image or PDF of an invoice"
128
  )
129
 
130
  if uploaded_file is not None:
131
  col1, col2 = st.columns([1, 1])
132
 
133
  with col1:
134
+ st.subheader("📸 Original Document")
135
+ # Preview Logic updated for PDF support
136
+ if uploaded_file.type == "application/pdf":
137
+ st.info("📄 PDF Uploaded (Preview not supported directly)")
138
+ else:
139
+ image = Image.open(uploaded_file)
140
+ st.image(image, use_container_width=True)
141
  st.caption(f"Filename: {uploaded_file.name}")
142
 
143
  with col2:
 
146
  if st.button("🚀 Extract Data", type="primary"):
147
  with st.spinner("Executing your custom pipeline..."):
148
  try:
149
+ # Save temp file
150
  temp_dir = "temp"
151
  os.makedirs(temp_dir, exist_ok=True)
152
  temp_path = os.path.join(temp_dir, uploaded_file.name)
153
  with open(temp_path, "wb") as f:
154
  f.write(uploaded_file.getbuffer())
155
 
156
+ # Call Pipeline
157
  st.write("✅ Calling `process_invoice`...")
 
158
  method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
159
  st.write(f"⚙️ Using **{method.upper()}** extraction method...")
160
 
161
+ # ⚠️ UPDATE: Pass string path
162
+ extracted_data = process_invoice(str(temp_path), method=method)
163
 
 
164
  st.write("✅ Simulating format detection...")
165
  format_info = detect_invoice_format(extracted_data.get("raw_text", ""))
166
 
 
167
  st.session_state.extracted_data = extracted_data
168
  st.session_state.format_info = format_info
169
  st.session_state.processed_count += 1
 
173
  except Exception as e:
174
  st.error(f"❌ An error occurred in the pipeline: {str(e)}")
175
 
176
+ # Display results
177
  if 'extracted_data' in st.session_state:
178
  st.markdown("---")
179
  st.header("📊 Extraction Results")
180
 
181
+ # --- Format Detection Section (KEPT AS IS) ---
182
  format_info = st.session_state.format_info
183
  st.subheader("📋 Detected Format (Simulated)")
184
  col1_fmt, col2_fmt = st.columns([2, 3])
 
194
  for rec in get_format_recommendations(format_info): st.write(rec)
195
  st.markdown("---")
196
 
197
+ # --- Main Results Section (UPDATED) ---
198
  data = st.session_state.extracted_data
199
 
200
+ # 1. New Validation Display (Replaces old Confidence box)
201
+ status = data.get('validation_status', 'unknown')
202
+ if status == 'passed':
203
+ st.markdown(f'<div class="success-box">✅ <strong>Validation Passed</strong>: Data meets strict quality rules (Pydantic).</div>', unsafe_allow_html=True)
204
+ elif status == 'failed':
205
+ err_count = len(data.get('validation_errors', []))
206
+ st.markdown(f'<div class="error-box">❌ <strong>Validation Failed</strong>: Found {err_count} issues. Check JSON for details.</div>', unsafe_allow_html=True)
207
  else:
208
+ st.markdown(f'<div class="warning-box">⚠️ <strong>Status Unknown</strong>: Validation logic was skipped.</div>', unsafe_allow_html=True)
209
 
210
+ # 2. Key Metrics (Mapped to NEW keys)
211
+ st.metric("🏢 Vendor", data.get('vendor') or "N/A")
 
 
 
 
 
 
 
212
 
213
  res_col1, res_col2, res_col3 = st.columns(3)
214
  res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
215
  res_col2.metric("📅 Date", data.get('date') or "N/A")
216
+ # Handle total (it's now a string from the pipeline, but metric handles strings fine)
217
+ total = data.get('total_amount')
218
+ res_col3.metric("💵 Total Amount", f"${total}" if total else "N/A")
219
 
220
+ # 3. Expanded Details
221
  with st.expander("Show More Details"):
 
222
  st.markdown(f"**🧾 Receipt Number:** {data.get('receipt_number') or 'N/A'}")
223
 
224
+ # Handle bill_to
225
  bill_to = data.get('bill_to')
226
  if isinstance(bill_to, dict):
227
  bill_to_display = bill_to.get('name') or 'N/A'
 
232
  st.markdown(f"**👤 Bill To:** {bill_to_display}")
233
 
234
  st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
235
+
236
+ # New: Show Duplicate Hash
237
+ st.markdown(f"**🔑 Semantic Hash (Duplicate ID):** `{data.get('semantic_hash') or 'N/A'}`")
238
 
239
+ # 4. Line items table
240
  if data.get('items'):
241
  st.subheader("🛒 Line Items")
 
242
  items_df_data = [{
243
  "Description": item.get("description", "N/A"),
244
  "Qty": item.get("quantity", "N/A"),
245
+ "Unit Price": f"${item.get('unit_price', 0.0) if item.get('unit_price') is not None else 0}",
246
+ "Total": f"${item.get('total', 0.0) if item.get('total') is not None else 0}"
247
  } for item in data['items']]
248
  df = pd.DataFrame(items_df_data)
249
  st.dataframe(df, use_container_width=True)
 
273
  st.header("📚 Sample Invoices")
274
  st.write("Try the sample invoice below to see how the system performs:")
275
 
276
+ sample_dir = "data/samples"
277
  if os.path.exists(sample_dir):
278
+ sample_files = [f for f in os.listdir(sample_dir) if f.endswith(('.jpg', '.png', '.jpeg', '.pdf'))]
279
 
280
  if sample_files:
281
+ file_path = os.path.join(sample_dir, sample_files[0])
282
+ st.write(f"**Sample File:** {sample_files[0]}")
283
+ if file_path.endswith('.pdf'):
284
+ st.info("📄 PDF Sample available. Download and upload it to test.")
285
+ else:
286
+ st.image(Image.open(file_path), caption=sample_files[0], use_container_width=True)
287
  else:
288
  st.warning("No sample invoices found in `data/samples/`.")
289
  else:
 
294
  st.markdown("""
295
  This app follows the exact pipeline you built:
296
  ```
297
+ 1. 📸 Input Handling
298
+ Detects JPG vs PDF. Smart Loader extracts text from PDFs instantly.
 
 
299
 
300
+ 2. 🧠 Hybrid Engine
301
+ - Digital PDFs: Direct Text Extraction (Fast)
302
+ - Images/Scans: LayoutLMv3 (ML) + Tesseract (OCR)
303
 
304
+ 3. 🛡️ Validation Gate
305
+ Pydantic Schema ensures data integrity (Decimal precision, Date formats).
306
 
307
+ 4. 🔑 Duplicate Detection
308
+ Generates a unique semantic hash based on content.
309
 
310
+ 5. 📊 Output JSON
311
+ Standardized, validated output ready for API response.
312
  ```
313
+ """)
 
 
 
 
 
requirements.txt CHANGED
@@ -1,14 +1,31 @@
 
1
  streamlit>=1.28.0
 
 
2
  pytesseract>=0.3.10
3
  opencv-python>=4.8.0
4
  Pillow>=10.0.0
 
 
5
  numpy>=1.24.0
6
  pandas>=2.0.0
7
 
8
- # Machine Learning
9
  torch>=2.0.0
10
  torchvision>=0.15.0
11
  transformers>=4.30.0
12
  datasets>=2.14.0
13
  huggingface-hub>=0.17.0
14
- seqeval>=1.2.2
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ----- Streamlit -----
2
  streamlit>=1.28.0
3
+
4
+ # ----- OCR -----
5
  pytesseract>=0.3.10
6
  opencv-python>=4.8.0
7
  Pillow>=10.0.0
8
+
9
+ # ----- Data -----
10
  numpy>=1.24.0
11
  pandas>=2.0.0
12
 
13
+ # ----- Machine Learning -----
14
  torch>=2.0.0
15
  torchvision>=0.15.0
16
  transformers>=4.30.0
17
  datasets>=2.14.0
18
  huggingface-hub>=0.17.0
19
+ seqeval>=1.2.2
20
+
21
+ # ----- Data Validation -----
22
+ pydantic>=2.12.0
23
+
24
+ # ----- PDF Processing -----
25
+ pdfplumber>=0.11.0
26
+ pdf2image>=1.16.0
27
+
28
+ # ----- API Framework -----
29
+ fastapi>=0.126.0
30
+ uvicorn[standard]>=0.38.0
31
+ python-multipart>=0.0.21
src/api.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+ # src/api.py
5
+
6
+ from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
7
+ from fastapi.responses import JSONResponse
8
+ import shutil
9
+ import os
10
+ from pathlib import Path
11
+ import uuid
12
+ import sys
13
+
14
+ # Import modules
15
+ sys.path.append(str(Path(__file__).resolve().parent))
16
+ from pipeline import process_invoice
17
+ from schema import InvoiceData
18
+
19
+ app = FastAPI(
20
+ title="Invoice Extraction API",
21
+ description="Hybrid ML + Rule-Based Pipeline with LayoutLMv3",
22
+ version="2.0"
23
+ )
24
+
25
+ # Create temp folder if not exists
26
+ UPLOAD_DIR = Path("temp_uploads")
27
+ UPLOAD_DIR.mkdir(exist_ok=True)
28
+
29
+ def cleanup_file(path: str):
30
+ """Background task to remove temp file after processing"""
31
+ try:
32
+ if os.path.exists(path):
33
+ os.remove(path)
34
+ except Exception as e:
35
+ print(f"Error cleaning up {path}: {e}")
36
+
37
+ @app.post("/extract", response_model=InvoiceData) # <--- CONTRACT ENFORCED
38
+ async def extract_invoice(
39
+ background_tasks: BackgroundTasks,
40
+ file: UploadFile = File(...)
41
+ ):
42
+ """
43
+ Upload an invoice (PDF/JPG/PNG) and get structured data.
44
+ """
45
+ # 1. Generate unique filename to prevent collisions
46
+ file_ext = Path(file.filename).suffix
47
+ unique_name = f"{uuid.uuid4()}{file_ext}"
48
+ temp_path = UPLOAD_DIR / unique_name
49
+
50
+ try:
51
+ # 2. Save Uploaded File
52
+ with open(temp_path, "wb") as buffer:
53
+ shutil.copyfileobj(file.file, buffer)
54
+
55
+ # 3. Process Logic
56
+ result = process_invoice(str(temp_path), method='ml')
57
+
58
+ # 4. Cleanup
59
+ # We use background_tasks to delete the file AFTER the response is sent
60
+ background_tasks.add_task(cleanup_file, str(temp_path))
61
+
62
+ return result
63
+
64
+ except Exception as e:
65
+ # Cleanup even on error
66
+ cleanup_file(str(temp_path))
67
+ raise HTTPException(status_code=500, detail=str(e))
src/pdf_utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ from pdf2image import convert_from_path
3
+ from pathlib import Path
4
+ from typing import List, Union
5
+ import numpy as np
6
+ import cv2
7
+
8
+ def extract_text_from_pdf(pdf_path: str) -> str:
9
+ """Extracts raw text from a digital PDF"""
10
+
11
+ path = Path(pdf_path)
12
+
13
+ if not path.exists():
14
+ raise FileNotFoundError(f"PDF not found: {pdf_path}")
15
+
16
+ try:
17
+ with pdfplumber.open(pdf_path) as pdf:
18
+ full_text = ""
19
+ for page in pdf.pages:
20
+ page_text = page.extract_text() or ""
21
+ full_text += page_text + "\n"
22
+ return full_text.strip()
23
+
24
+ except Exception as e:
25
+ raise ValueError(f"Failed to read PDF {pdf_path}: {str(e)}")
26
+
27
+
28
+ def convert_pdf_to_images(pdf_path: str) -> List[np.ndarray]:
29
+ """
30
+ Converts a PDF into a list of OpenCV images (numpy arrays).
31
+ Required for the ML pipeline (LayoutLM) or Scanned PDFs.
32
+
33
+ Logic:
34
+ 1. Use 'convert_from_path' to get PIL images.
35
+ 2. Convert PIL images to numpy arrays (OpenCV format).
36
+ 3. Return list of arrays.
37
+ """
38
+ # 1. Convert to PIL images
39
+ try:
40
+ pil_images = convert_from_path(pdf_path)
41
+ except Exception as e:
42
+ raise ValueError(f"Error converting PDF to image: {e}")
43
+
44
+ cv_images = []
45
+ for pil_img in pil_images:
46
+
47
+ array = np.array(pil_img)
48
+ cv_images.append(cv2.cvtColor(array, cv2.COLOR_RGB2BGR))
49
+
50
+ return cv_images
src/pipeline.py CHANGED
@@ -6,15 +6,20 @@ Orchestrates preprocessing, OCR, and extraction
6
  from typing import Dict, Any, Optional
7
  from pathlib import Path
8
  import json
 
 
9
 
10
- # Make sure all your modules are imported
11
  from preprocessing import load_image, convert_to_grayscale, remove_noise
12
  from ocr import extract_text
13
  from extraction import structure_output
14
  from ml_extraction import extract_ml_based
 
 
 
15
 
16
  def process_invoice(image_path: str,
17
- method: str = 'ml', # <-- New parameter: 'ml' or 'rules'
18
  save_results: bool = False,
19
  output_dir: str = 'outputs') -> Dict[str, Any]:
20
  """
@@ -29,45 +34,121 @@ def process_invoice(image_path: str,
29
  Returns:
30
  A dictionary with the extracted invoice data.
31
  """
 
32
  if not Path(image_path).exists():
33
- raise FileNotFoundError(f"Image not found at path: {image_path}")
 
 
34
 
35
- print(f"Processing with '{method}' method...")
 
36
 
37
- if method == 'ml':
38
- # --- ML-Based Extraction ---
 
39
  try:
40
- # The ml_extraction function handles everything internally
41
- structured_data = extract_ml_based(image_path)
42
- except Exception as e:
43
- raise ValueError(f"Error during ML-based extraction: {e}")
44
 
45
- elif method == 'rules':
46
- # --- Rule-Based Extraction (Your original logic) ---
47
- try:
48
- image = load_image(image_path)
49
- gray_image = convert_to_grayscale(image)
50
- preprocessed_image = remove_noise(gray_image, kernel_size=3)
51
- text = extract_text(preprocessed_image, config='--psm 6')
52
- structured_data = structure_output(text) # Calls your old extraction.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  except Exception as e:
54
- raise ValueError(f"Error during rule-based extraction: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- else:
57
- raise ValueError(f"Unknown extraction method: '{method}'. Choose 'ml' or 'rules'.")
 
 
 
 
 
58
 
59
- # --- Saving Logic (remains the same) ---
 
 
 
 
 
 
 
60
  if save_results:
61
  output_path = Path(output_dir)
62
  output_path.mkdir(parents=True, exist_ok=True)
63
- json_path = output_path / (Path(image_path).stem + f"_{method}.json") # Add method to filename
 
 
 
64
  try:
65
  with open(json_path, 'w', encoding='utf-8') as f:
66
- json.dump(structured_data, f, indent=2, ensure_ascii=False)
 
67
  except Exception as e:
68
  raise IOError(f"Error saving results to {json_path}: {e}")
69
 
70
- return structured_data
71
 
72
 
73
  def process_batch(image_folder: str, output_dir: str = 'outputs') -> list:
 
6
  from typing import Dict, Any, Optional
7
  from pathlib import Path
8
  import json
9
+ from pydantic import ValidationError
10
+ import cv2
11
 
12
+ # --- IMPORTS ---
13
  from preprocessing import load_image, convert_to_grayscale, remove_noise
14
  from ocr import extract_text
15
  from extraction import structure_output
16
  from ml_extraction import extract_ml_based
17
+ from schema import InvoiceData
18
+ from pdf_utils import extract_text_from_pdf, convert_pdf_to_images
19
+ from utils import generate_semantic_hash
20
 
21
  def process_invoice(image_path: str,
22
+ method: str = 'ml',
23
  save_results: bool = False,
24
  output_dir: str = 'outputs') -> Dict[str, Any]:
25
  """
 
34
  Returns:
35
  A dictionary with the extracted invoice data.
36
  """
37
+
38
  if not Path(image_path).exists():
39
+ raise FileNotFoundError(f"Image/PDF not found at path: {image_path}")
40
+
41
+ print(f"Processing: {image_path}")
42
 
43
+ raw_result = {}
44
+ is_digital_pdf = False
45
 
46
+ # --- 1. SMART PDF HANDLING ---
47
+ if image_path.lower().endswith('.pdf'):
48
+ print("📄 PDF detected. Checking type...")
49
  try:
50
+ # Attempt to extract text directly (Fast Path)
51
+ digital_text = extract_text_from_pdf(image_path)
 
 
52
 
53
+ # Heuristic: If we found >50 chars, it's likely a native Digital PDF
54
+ if len(digital_text.strip()) > 50:
55
+ print(" ✅ Digital Text found. Using Rule-Based Engine (Fast Mode).")
56
+ # We bypass the ML model because we have perfect text
57
+ raw_result = structure_output(digital_text)
58
+ is_digital_pdf = True
59
+ method = 'rules (digital)' # Override method for logging
60
+ else:
61
+ print(" ⚠️ Sparse text detected. Treating as Scanned PDF.")
62
+ # Convert first page to image for the ML pipeline
63
+ print(" 🔄 Converting Page 1 to Image...")
64
+ images = convert_pdf_to_images(image_path)
65
+
66
+ # Save as temp jpg so our existing pipeline can read it
67
+ # (In production, you might pass the array directly, but this is safer for now)
68
+ temp_jpg = image_path.replace('.pdf', '.jpg')
69
+ cv2.imwrite(temp_jpg, images[0])
70
+
71
+ # SWAP THE PATH: The rest of the pipeline will now see a JPG!
72
+ image_path = temp_jpg
73
+ print(f" ➡️ Continuing with converted image: {image_path}")
74
+
75
  except Exception as e:
76
+ print(f" PDF Error: {e}. Falling back to standard processing.")
77
+
78
+ # --- 2. STANDARD EXTRACTION (ML / RULES) ---
79
+ # Only run this if we didn't already extract from Digital PDF
80
+ if not is_digital_pdf:
81
+ print(f"⚙️ Using '{method}' method on image...")
82
+
83
+ if method == 'ml':
84
+ try:
85
+ raw_result = extract_ml_based(image_path)
86
+ except Exception as e:
87
+ raise ValueError(f"Error during ML-based extraction: {e}")
88
+
89
+ elif method == 'rules':
90
+ try:
91
+ image = load_image(image_path)
92
+ gray_image = convert_to_grayscale(image)
93
+ preprocessed_image = remove_noise(gray_image, kernel_size=3)
94
+ text = extract_text(preprocessed_image, config='--psm 6')
95
+ raw_result = structure_output(text)
96
+ except Exception as e:
97
+ raise ValueError(f"Error during rule-based extraction: {e}")
98
+
99
+ # Clean up temp file if we created one
100
+ if image_path.endswith('.jpg') and 'sample_pdf' in image_path: # Safety check
101
+ # Optional: os.remove(image_path)
102
+ pass
103
+
104
+ # --- VALIDATION STEP ---
105
+ final_data = raw_result # Default to raw if validation crashes hard
106
+
107
+ if method == 'ml':
108
+ try:
109
+ invoice = InvoiceData(**raw_result)
110
+ final_data = invoice.model_dump(mode='json')
111
+ final_data['validation_status'] = 'passed'
112
+ print("✅ Data Validation Passed")
113
+ except ValidationError as e:
114
+ print(f"❌ Data Validation Failed: {len(e.errors())} errors")
115
+
116
+ # We keep the 'raw_result' data so the user isn't left with nothing,
117
+ # but we attach the error report so they know what to fix.
118
+ final_data = raw_result.copy()
119
+ final_data['validation_status'] = 'failed'
120
 
121
+ # Format errors nicely
122
+ error_list = []
123
+ for err in e.errors():
124
+ field = " -> ".join(str(loc) for loc in err['loc'])
125
+ msg = err['msg']
126
+ print(f" - {field}: {msg}")
127
+ error_list.append(f"{field}: {msg}")
128
 
129
+ final_data['validation_errors'] = error_list
130
+
131
+ # --- DUPLICATE DETECTION ---
132
+ # We calculate the hash based on the final (or raw) data.
133
+ # This gives us a unique fingerprint for this specific business transaction.
134
+ final_data['semantic_hash'] = generate_semantic_hash(final_data)
135
+
136
+ # --- SAVING STEP ---
137
  if save_results:
138
  output_path = Path(output_dir)
139
  output_path.mkdir(parents=True, exist_ok=True)
140
+
141
+ # Helper to serialize Decimals/Dates for JSON (standard json.dump fails on them)
142
+ # You can use 'default=str' in json.dump or convert before saving
143
+ json_path = output_path / (Path(image_path).stem + f"_{method}.json")
144
  try:
145
  with open(json_path, 'w', encoding='utf-8') as f:
146
+ # Use default=str to handle Decimal and Date objects automatically
147
+ json.dump(final_data, f, indent=2, ensure_ascii=False, default=str)
148
  except Exception as e:
149
  raise IOError(f"Error saving results to {json_path}: {e}")
150
 
151
+ return final_data
152
 
153
 
154
  def process_batch(image_folder: str, output_dir: str = 'outputs') -> list:
src/schema.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/schema.py
2
+
3
+ from pydantic import BaseModel, Field, field_validator, model_validator
4
+ from typing import List, Optional, Union, Dict
5
+ from decimal import Decimal, InvalidOperation
6
+ from datetime import date as DateType, datetime
7
+
8
+ # --- 1. Line Item Schema ---
9
+ class LineItem(BaseModel):
10
+ description: str
11
+ quantity: int = Field(default=1, ge=1)
12
+ unit_price: Optional[Decimal] = Field(default=None, ge=0)
13
+ total: Decimal = Field(default=0, ge=0)
14
+
15
+ @field_validator('unit_price', 'total', mode='before')
16
+ @classmethod
17
+ def validate_precision(cls, v):
18
+ """Ensure exactly 2 decimal places for currency."""
19
+ if v is None:
20
+ return None
21
+ try:
22
+ d = Decimal(str(v))
23
+ return d.quantize(Decimal('0.01'))
24
+ except (InvalidOperation, ValueError, TypeError):
25
+ return Decimal('0.00')
26
+
27
+ # --- 2. Invoice Schema ---
28
+ class InvoiceData(BaseModel):
29
+ """
30
+ Strict Data Contract for Invoice Extraction.
31
+ """
32
+ # Core Fields
33
+ receipt_number: Optional[str] = Field(default=None, description="Unique ID")
34
+
35
+ date: Optional[DateType] = Field(default=None, description="Invoice Date")
36
+
37
+ # Financials
38
+ total_amount: Optional[Decimal] = Field(default=None, ge=0)
39
+
40
+ # Entities
41
+ vendor: Optional[str] = None
42
+ address: Optional[str] = None
43
+ bill_to: Optional[Union[str, Dict]] = None
44
+
45
+ # Nested Items
46
+ items: List[LineItem] = Field(default_factory=list)
47
+
48
+ # --- METADATA ---
49
+ validation_status: str = Field(default="unknown", description="passed/failed")
50
+ validation_errors: List[str] = Field(default_factory=list, description="List of validation failure messages")
51
+ semantic_hash: Optional[str] = Field(default=None, description="Unique fingerprint of the invoice content")
52
+
53
+ # --- VALIDATORS ---
54
+
55
+ @field_validator('date', mode='before')
56
+ @classmethod
57
+ def clean_date(cls, v):
58
+ """Logic: Handle None, parse formats, then validate range."""
59
+ if not v:
60
+ return None
61
+
62
+ parsed_date = v
63
+
64
+ if isinstance(v, str):
65
+ try:
66
+ # Try common formats
67
+ for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d.%m.%Y"):
68
+ try:
69
+ parsed_date = datetime.strptime(v, fmt).date()
70
+ break
71
+ except ValueError:
72
+ continue
73
+ except Exception:
74
+ return None
75
+
76
+ if isinstance(parsed_date, DateType):
77
+ today = datetime.now().date()
78
+ if parsed_date > today:
79
+ return None
80
+
81
+ # ⚠️ FIX: Use 'DateType' constructor
82
+ min_date = DateType(today.year - 10, 1, 1)
83
+ if parsed_date < min_date:
84
+ return None
85
+
86
+ return parsed_date
87
+
88
+ return None
89
+
90
+ @field_validator('total_amount', mode='before')
91
+ @classmethod
92
+ def validate_money(cls, v):
93
+ if v is None:
94
+ return None
95
+ try:
96
+ d = Decimal(str(v))
97
+ return d.quantize(Decimal('0.01'))
98
+ except (InvalidOperation, ValueError):
99
+ return None
100
+
101
+ @model_validator(mode='after')
102
+ def validate_math(self):
103
+ if not self.items or self.total_amount is None:
104
+ return self
105
+
106
+ line_items_sum = sum(item.total for item in self.items)
107
+ diff = abs(self.total_amount - line_items_sum)
108
+
109
+ if diff > Decimal('0.05'):
110
+ print(f"⚠️ Validation Warning: Total {self.total_amount} != Sum of items {line_items_sum}")
111
+
112
+ return self
src/utils.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from typing import Dict, Any
3
+ from decimal import Decimal
4
+ from datetime import date
5
+
6
+ def generate_semantic_hash(invoice_data: Dict[str, Any]) -> str:
7
+ """
8
+ Generates a unique fingerprint using a Composite Key strategy.
9
+
10
+ Composite Key = Vendor + Date + Total + Receipt Number
11
def generate_semantic_hash(invoice_data: Dict[str, Any]) -> str:
    """
    Generates a unique fingerprint using a Composite Key strategy.

    Composite Key = Vendor + Date + Total + Receipt Number

    Args:
        invoice_data: Extracted invoice fields. Missing keys are treated
            the same as explicit ``None`` values, so partially extracted
            invoices still hash instead of raising ``KeyError``.

    Returns:
        Hex-encoded SHA256 digest of the normalized composite key.
    """
    # The specific fields that determine uniqueness.
    keys_to_hash = ['vendor', 'date', 'total_amount', 'receipt_number']
    normalized_values = []

    for key in keys_to_hash:
        # .get() keeps the hash stable for partial extraction results
        # (direct indexing raised KeyError when a field was absent).
        value = invoice_data.get(key)

        # Normalize without modifying the original object.
        if value is None:
            norm_val = ""
        elif isinstance(value, (date, Decimal, int, float)):
            # Dates/numbers stringify deterministically as-is.
            norm_val = str(value)
        else:
            # String normalization: case- and surrounding-space-insensitive.
            norm_val = str(value).lower().strip()

        normalized_values.append(norm_val)

    # Create the fingerprint string; '|' keeps field boundaries distinct.
    composite_string = "|".join(normalized_values)

    # Return the SHA256 hash of the string.
    return hashlib.sha256(composite_string.encode()).hexdigest()
35
+
tests/test_full_pipeline.py CHANGED
@@ -37,6 +37,6 @@ print("=" * 60)
37
  print("\n🎉 PIPELINE COMPLETE!")
38
  print("\n📋 Summary:")
39
  print(f" Vendor: {result['vendor']}")
40
- print(f" Invoice #: {result['invoice_number']}")
41
  print(f" Date: {result['date']}")
42
- print(f" Total: ${result['total']}")
 
37
  print("\n🎉 PIPELINE COMPLETE!")
38
  print("\n📋 Summary:")
39
  print(f" Vendor: {result['vendor']}")
40
+ print(f" Invoice #: {result['receipt_number']}")
41
  print(f" Date: {result['date']}")
42
+ print(f" Total: ${result.get('total_amount', '0.00')}")
tests/test_pipeline.py CHANGED
@@ -75,7 +75,7 @@ def test_full_pipeline():
75
  print(" - No line items extracted.")
76
 
77
  # Print total and validation status
78
- print(f"\n💵 Total Amount: ${result.get('total_amount', 0.0):.2f}")
79
 
80
  confidence = result.get('extraction_confidence', 0)
81
  print(f"📈 Confidence: {confidence}%")
 
75
  print(" - No line items extracted.")
76
 
77
  # Print total and validation status
78
+ print(f"\n💵 Total Amount: ${result.get('total_amount', 0.0)}")
79
 
80
  confidence = result.get('extraction_confidence', 0)
81
  print(f"📈 Confidence: {confidence}%")