Dipan04 committed
Commit 8a859a8 · 1 Parent(s): ec7fdf7

Deploy Invoice Digitization Agent
.dockerignore ADDED
@@ -0,0 +1,18 @@
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ venv/
+ env/
+ .venv
+ .git
+ .gitignore
+ .vscode
+ .idea
+ *.log
+ *.db-journal
+ .env
+ Dockerfile
+ docker-compose.yml
+ README*.md
+ *.md
.gitignore ADDED
@@ -0,0 +1,3 @@
+
+ __pycache__/
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     sqlite3 \
+     ca-certificates \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python packages
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Upgrade Gemini SDK for v1 API
+ RUN pip install --no-cache-dir --upgrade google-generativeai google-ai-generativelanguage
+
+ # Copy application code
+ COPY . .
+
+ # Create necessary directories
+ RUN mkdir -p /app/data/logs /app/data/docs && chmod -R 777 /app/data
+
+ # Create __init__.py files (including the agent directory)
+ RUN touch backend/__init__.py \
+     && touch backend/feature_builder/__init__.py \
+     && touch backend/app/__init__.py \
+     && touch backend/app/api/__init__.py \
+     && touch backend/app/agent/__init__.py \
+     && touch backend/app/wrappers/__init__.py \
+     && touch backend/ingest/__init__.py
+
+ # Verify agent files exist (will fail the build if missing)
+ RUN test -f backend/app/agent/agent_orchestrator.py || \
+     (echo "ERROR: agent_orchestrator.py not found! Add it before building." && exit 1)
+
+ # Initialize database if it doesn't exist
+ RUN if [ ! -f /app/data/invoices.db ]; then \
+     sqlite3 /app/data/invoices.db < backend/database/init_schema_sqlite.sql; \
+     fi
+
+ # Expose port
+ EXPOSE 7860
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
app.py ADDED
@@ -0,0 +1,415 @@
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import Optional, Dict
+ import sqlite3
+ import joblib
+ import pandas as pd
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from filelock import FileLock
+ from fastapi.responses import JSONResponse
+ import json
+ import sys
+
+ import logging
+
+ # Setup logging for entire app
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True
+ )
+
+ # Setup paths
+ BASE_DIR = Path(__file__).parent
+ DB_PATH = BASE_DIR / "data" / "invoices.db"  # Inside container: /app/data/invoices.db
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
+ MODEL_PATH = BASE_DIR / "ml" / "models" / "payment_predictor_model_20251124_194847.pkl"
+ LOG_DIR = BASE_DIR / "data" / "logs"
+ PREDICTIONS_LOG = LOG_DIR / "predictions.csv"
+
+ # Ensure directories exist
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
+
+ # Add backend to path
+ sys.path.append(str(BASE_DIR / "backend"))
+
+ # Import feature builder
+ from backend.feature_builder.feature_builder import build_features, features_to_dataframe
+ from backend.ingest.ingest_invoice_sqlite import ingest_invoice as ingest_func
+
+ # ============================================
+ # IMPORT INGEST ROUTER (NEW)
+ # ============================================
+ from backend.app.api.ingest import router as ingest_router
+
+ # Load ML model
+ print("🤖 Loading ML model...")
+ try:
+     model_artifacts = joblib.load(MODEL_PATH)
+     model = model_artifacts['model']
+     print(f"✅ Model loaded: {MODEL_PATH.name}")
+ except Exception as e:
+     print(f"❌ Failed to load model: {e}")
+     model = None
+
+ # FastAPI app
+ app = FastAPI(
+     title="Invoice Payment Predictor",
+     description="Predicts payment clearing time for invoices",
+     version="1.0.0"
+ )
+
+ # ============================================
+ # REGISTER INGEST ROUTER (NEW)
+ # ============================================
+ app.include_router(ingest_router)
+
+
+ # ============================================
+ # Pydantic Models
+ # ============================================
+
+ class InvoiceIngest(BaseModel):
+     invoice_id: int
+     business_code: str
+     cust_number: str
+     name_customer: Optional[str] = None
+     posting_date: str
+     document_create_date: Optional[str] = None
+     document_create_date_alt: Optional[str] = None
+     due_in_date: Optional[str] = None
+     baseline_create_date: Optional[str] = None
+     clear_date: Optional[str] = None
+     total_open_amount: float
+     invoice_currency: str = "USD"
+     document_type: Optional[str] = "RV"
+     cust_payment_terms: Optional[str] = None
+     posting_id: Optional[float] = None
+     business_year: Optional[int] = None
+
+
+ class PredictionRequest(BaseModel):
+     invoice_id: Optional[int] = None
+     cust_number: str
+     posting_date: str
+     total_open_amount: float
+     business_code: str = "U001"
+     cust_payment_terms: str = "NAH4"
+     invoice_currency: str = "USD"
+     document_type: str = "RV"
+     due_in_date: Optional[str] = None
+     business_year: Optional[int] = None
+
+
+ # ============================================
+ # Helper Functions
+ # ============================================
+
+ def get_customer_aggregates(cust_number: str) -> Optional[Dict]:
+     """Fetch customer aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM customer_aggregates WHERE cust_number = ?
+             """, (cust_number,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching customer aggregates: {e}")
+
+     return None
+
+
+ def get_payment_terms_aggregates(payment_terms: str) -> Optional[Dict]:
+     """Fetch payment terms aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM payment_terms_aggregates WHERE cust_payment_terms = ?
+             """, (payment_terms,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching payment terms: {e}")
+
+     return None
+
+
+ def get_business_code_aggregates(business_code: str) -> Optional[Dict]:
+     """Fetch business code aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM business_code_aggregates WHERE business_code = ?
+             """, (business_code,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching business code: {e}")
+
+     return None
+
+
+ def log_prediction_to_csv(prediction_data: Dict):
+     """Append prediction to CSV log."""
+     df = pd.DataFrame([prediction_data])
+
+     if not PREDICTIONS_LOG.exists():
+         df.to_csv(PREDICTIONS_LOG, index=False)
+     else:
+         df.to_csv(PREDICTIONS_LOG, mode='a', header=False, index=False)
+
+
+ def log_prediction_to_db(prediction_data: Dict):
+     """Insert prediction into SQLite predictions_log."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 INSERT INTO predictions_log (
+                     invoice_id, cust_number, posting_date, total_open_amount,
+                     business_code, cust_payment_terms, predicted_days_to_clear,
+                     predicted_clear_date, model_version, features_json
+                 ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, (
+                 prediction_data.get('invoice_id'),
+                 prediction_data['cust_number'],
+                 prediction_data['posting_date'],
+                 prediction_data['total_open_amount'],
+                 prediction_data.get('business_code'),
+                 prediction_data.get('cust_payment_terms'),
+                 prediction_data['predicted_days_to_clear'],
+                 prediction_data['predicted_clear_date'],
+                 prediction_data.get('model_version', 'v1.0'),
+                 json.dumps(prediction_data.get('features', {}))
+             ))
+
+             prediction_id = cursor.lastrowid
+             conn.commit()
+             conn.close()
+
+             return prediction_id
+     except Exception as e:
+         print(f"Error logging to DB: {e}")
+         return None
+
+
+ # ============================================
+ # API Endpoints
+ # ============================================
+
+ @app.get("/")
+ def root():
+     """Root endpoint."""
+     return {
+         "service": "Invoice Payment Predictor",
+         "version": "1.0.0",
+         "status": "operational",
+         "model_loaded": model is not None
+     }
+
+
+ @app.get("/health")
+ def health():
+     return JSONResponse(
+         content={
+             "status": "ok",
+             "model_loaded": model is not None,
+             "db_exists": DB_PATH.exists()
+         },
+         media_type="application/json"
+     )
+
+
+ @app.post("/ingest")
+ def ingest_invoice(invoice: InvoiceIngest):
+     """
+     Ingest invoice into SQLite database.
+     Computes derived fields and stores data.
+     """
+     try:
+         result = ingest_func(invoice.dict())
+
+         return {
+             "status": "success",
+             "message": "Invoice ingested successfully",
+             "data": result
+         }
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}")
+
+
+ @app.get("/features/{cust_number}")
+ def get_features(cust_number: str):
+     """
+     Get customer aggregate features.
+     Returns cached aggregates or defaults for new customers.
+     """
+     customer_agg = get_customer_aggregates(cust_number)
+
+     if not customer_agg:
+         return {
+             "cust_number": cust_number,
+             "status": "new_customer",
+             "message": "No historical data found, using defaults",
+             "features": {
+                 "cust_avg_days": 18.0,
+                 "cust_median_days": 15.0,
+                 "cust_invoice_count": 0
+             }
+         }
+
+     return {
+         "cust_number": cust_number,
+         "status": "existing_customer",
+         "features": customer_agg
+     }
+
+
+ @app.post("/predict")
+ def predict(request: PredictionRequest):
+     """
+     Predict payment clearing time for an invoice.
+
+     Returns:
+     - predicted_days_to_clear
+     - predicted_clear_date
+     - confidence info
+     """
+     if model is None:
+         raise HTTPException(status_code=503, detail="ML model not loaded")
+
+     try:
+         # Fetch aggregates
+         customer_agg = get_customer_aggregates(request.cust_number)
+         payment_agg = get_payment_terms_aggregates(request.cust_payment_terms)
+         business_agg = get_business_code_aggregates(request.business_code)
+
+         # Build invoice data dict
+         invoice_data = request.dict()
+
+         # Compute days_posting_to_due if due_in_date provided
+         if request.due_in_date:
+             posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
+             due_dt = datetime.strptime(request.due_in_date, "%Y-%m-%d")
+             invoice_data['days_posting_to_due'] = (due_dt - posting_dt).days
+         else:
+             invoice_data['days_posting_to_due'] = 15  # Default
+
+         # Build features
+         features = build_features(invoice_data, customer_agg, payment_agg, business_agg)
+         features_df = features_to_dataframe(features)
+
+         # Predict
+         predicted_days = float(model.predict(features_df)[0])
+
+         # Calculate predicted clear date
+         posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
+         predicted_clear_dt = posting_dt + timedelta(days=predicted_days)
+
+         # Prepare response
+         response = {
+             "invoice_id": request.invoice_id,
+             "cust_number": request.cust_number,
+             "posting_date": request.posting_date,
+             "total_open_amount": request.total_open_amount,
+             "predicted_days_to_clear": round(predicted_days, 2),
+             "predicted_clear_date": predicted_clear_dt.strftime("%Y-%m-%d"),
+             "customer_history": "available" if customer_agg else "new_customer",
+             "model_version": "v1.0"
+         }
+
+         # Log prediction
+         log_prediction_to_csv(response)
+         prediction_id = log_prediction_to_db({
+             **response,
+             'business_code': request.business_code,
+             'cust_payment_terms': request.cust_payment_terms,
+             'features': features
+         })
+
+         response['prediction_id'] = prediction_id
+
+         return response
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
+
+
+ @app.get("/predictions/recent")
+ def get_recent_predictions(limit: int = 10):
+     """Get recent predictions from log."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT
+                     prediction_id,
+                     cust_number,
+                     posting_date,
+                     predicted_days_to_clear,
+                     predicted_clear_date,
+                     predicted_at
+                 FROM predictions_log
+                 ORDER BY predicted_at DESC
+                 LIMIT ?
+             """, (limit,))
+
+             rows = cursor.fetchall()
+             conn.close()
+
+             return {
+                 "count": len(rows),
+                 "predictions": [dict(row) for row in rows]
+             }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to fetch predictions: {str(e)}")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(
+         app,
+         host="0.0.0.0",
+         port=7860,
+         timeout_keep_alive=75,
+         timeout_graceful_shutdown=10
+     )
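
For reference, a minimal client sketch for the POST /predict endpoint above, assuming the service is reachable on localhost:7860 (the port the Dockerfile exposes). Field names follow the PredictionRequest model; the values are illustrative.

import httpx

payload = {
    "cust_number": "CUST-001",      # required; illustrative customer id
    "posting_date": "2024-03-01",   # required; YYYY-MM-DD, as the endpoint parses it
    "total_open_amount": 1250.50,   # required
    "due_in_date": "2024-03-31",    # optional; drives days_posting_to_due
}

resp = httpx.post("http://localhost:7860/predict", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print(result["predicted_days_to_clear"], result["predicted_clear_date"])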
backend/__init__.py ADDED
File without changes
backend/app/__init__.py ADDED
File without changes
backend/app/agent/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Agent module for autonomous invoice processing.
+ """
+
+ from .agent_orchestrator import (
+     InvoiceAgent,
+     AgentState,
+     AgentDecision,
+     create_agent,
+     run_agent_pipeline
+ )
+
+ __all__ = [
+     'InvoiceAgent',
+     'AgentState',
+     'AgentDecision',
+     'create_agent',
+     'run_agent_pipeline'
+ ]
backend/app/agent/agent_orchestrator.py ADDED
@@ -0,0 +1,476 @@
+ """
+ True End-to-End Agent Orchestrator
+ ===================================
+ Autonomous agent that:
+ 1. Decides which tools to use based on document analysis
+ 2. Validates its own output
+ 3. Self-corrects when confidence is low
+ 4. Learns from patterns
+ """
+
+ import json
+ import sys
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+ from enum import Enum
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True
+ )
+ logger = logging.getLogger(__name__)
+
+
+ class AgentDecision(Enum):
+     """Agent's possible decisions"""
+     EXTRACT_TEXT = "extract_text"
+     EXTRACT_TABLES = "extract_tables"
+     RUN_NER = "run_ner"
+     USE_GEMINI = "use_gemini"
+     USE_REGEX = "use_regex"
+     VALIDATE = "validate"
+     RETRY = "retry"
+     COMPLETE = "complete"
+     HUMAN_REVIEW = "human_review"
+
+
+ @dataclass
+ class AgentState:
+     """Agent's internal state"""
+     doc_id: str
+     file_path: Path
+
+     # Extracted data
+     raw_text: Optional[str] = None
+     tables: Optional[List] = None
+     entities: Optional[List] = None
+     entity_map: Optional[Dict] = None
+
+     # Mapped fields
+     fields: Optional[Dict] = None
+     confidence_map: Optional[Dict] = None
+
+     # Decision tracking
+     attempts: int = 0
+     max_attempts: int = 3
+     history: List[str] = None
+     errors: List[str] = None
+
+     def __post_init__(self):
+         if self.history is None:
+             self.history = []
+         if self.errors is None:
+             self.errors = []
+
+
+ class InvoiceAgent:
+     """
+     Autonomous agent that processes invoices with self-correction.
+     """
+
+     def __init__(self, text_extractor, table_extractor, ner_extractor, gemini_mapper):
+         """
+         Args:
+             text_extractor: Function(file_path) -> (success, text, error)
+             table_extractor: Function(file_path) -> (success, tables, error)
+             ner_extractor: Function(text) -> (success, entities, entity_map, error)
+             gemini_mapper: Function(text, entities, entity_map, tables) -> (success, fields, error)
+         """
+         self.text_extractor = text_extractor
+         self.table_extractor = table_extractor
+         self.ner_extractor = ner_extractor
+         self.gemini_mapper = gemini_mapper
+
+         # Minimum confidence thresholds
+         self.MIN_CONFIDENCE = {
+             'cust_number': 0.6,
+             'posting_date': 0.7,
+             'total_open_amount': 0.7,
+             'cust_payment_terms': 0.5
+         }
+
+     def process(self, state: AgentState) -> AgentState:
+         """
+         Main agent loop - autonomous decision-making and execution.
+         """
+         logger.info("=" * 70)
+         logger.info(f"**** AGENT STARTING: {state.file_path.name}")
+         logger.info("=" * 70)
+
+         while state.attempts < state.max_attempts:
+             state.attempts += 1
+             logger.info(f"\n**** ATTEMPT {state.attempts}/{state.max_attempts}")
+
+             # Step 1: Decide next action
+             decision = self._decide_next_action(state)
+             logger.info(f"**** DECISION: {decision.value}")
+             state.history.append(decision.value)
+
+             # Step 2: Execute action
+             success = self._execute_action(decision, state)
+
+             if not success:
+                 logger.warning(f"**** Action {decision.value} failed")
+                 continue
+
+             # Step 3: Check if we're done
+             if decision == AgentDecision.COMPLETE:
+                 logger.info("**** AGENT COMPLETE")
+                 break
+
+             if decision == AgentDecision.HUMAN_REVIEW:
+                 logger.info("**** AGENT REQUESTING HUMAN REVIEW")
+                 break
+
+         logger.info("=" * 70)
+         logger.info(f"**** Final confidence: {self._calculate_overall_confidence(state):.2f}")
+         logger.info(f"**** Actions taken: {' → '.join(state.history)}")
+         logger.info("=" * 70)
+
+         return state
+
+     def _decide_next_action(self, state: AgentState) -> AgentDecision:
+         """
+         Agent's brain - decides what to do next based on current state.
+         """
+         # 1. If no text, extract it
+         if state.raw_text is None:
+             return AgentDecision.EXTRACT_TEXT
+
+         # 2. If text exists but no entities, run NER
+         if state.entities is None:
+             return AgentDecision.RUN_NER
+
+         # 3. If no fields mapped yet, try Gemini first
+         if state.fields is None:
+             return AgentDecision.USE_GEMINI
+
+         # 4. If fields exist, validate them
+         if not self._is_validated(state):
+             return AgentDecision.VALIDATE
+
+         # 5. Check confidence - retry if low
+         overall_confidence = self._calculate_overall_confidence(state)
+
+         if overall_confidence < 0.6 and state.attempts < state.max_attempts:
+             # Try alternative approach
+             if 'use_gemini' in state.history and 'use_regex' not in state.history:
+                 return AgentDecision.USE_REGEX
+             elif 'extract_tables' not in state.history:
+                 return AgentDecision.EXTRACT_TABLES
+             else:
+                 return AgentDecision.RETRY
+
+         # 6. If still low confidence, request human review
+         if overall_confidence < 0.5:
+             return AgentDecision.HUMAN_REVIEW
+
+         # 7. Otherwise, we're done!
+         return AgentDecision.COMPLETE
+
+     def _execute_action(self, decision: AgentDecision, state: AgentState) -> bool:
+         """Execute the decided action."""
+         try:
+             if decision == AgentDecision.EXTRACT_TEXT:
+                 return self._extract_text(state)
+
+             elif decision == AgentDecision.EXTRACT_TABLES:
+                 return self._extract_tables(state)
+
+             elif decision == AgentDecision.RUN_NER:
+                 return self._run_ner(state)
+
+             elif decision == AgentDecision.USE_GEMINI:
+                 return self._use_gemini(state)
+
+             elif decision == AgentDecision.USE_REGEX:
+                 return self._use_regex(state)
+
+             elif decision == AgentDecision.VALIDATE:
+                 return self._validate_fields(state)
+
+             elif decision == AgentDecision.RETRY:
+                 # Clear fields and try again with different approach
+                 state.fields = None
+                 state.confidence_map = None
+                 return True
+
+             elif decision in [AgentDecision.COMPLETE, AgentDecision.HUMAN_REVIEW]:
+                 return True
+
+             return False
+
+         except Exception as e:
+             logger.error(f"**** Action failed: {e}")
+             state.errors.append(str(e))
+             return False
+
+     def _extract_text(self, state: AgentState) -> bool:
+         """Extract text from document."""
+         logger.info("**** Extracting text...")
+         success, text, error = self.text_extractor(state.file_path)
+
+         if success and text and len(text.strip()) > 10:
+             state.raw_text = text
+             logger.info(f"**** Extracted {len(text)} characters")
+             return True
+
+         state.errors.append(f"Text extraction failed: {error}")
+         return False
+
+     def _extract_tables(self, state: AgentState) -> bool:
+         """Extract tables from document."""
+         logger.info("**** Extracting tables...")
+         success, tables, error = self.table_extractor(state.file_path)
+
+         if success:
+             state.tables = tables
+             logger.info(f"**** Extracted {len(tables)} tables")
+             return True
+
+         logger.warning(f"**** Table extraction failed: {error}")
+         state.tables = []
+         return True  # Non-critical, continue
+
+     def _run_ner(self, state: AgentState) -> bool:
+         """Run Named Entity Recognition."""
+         logger.info("**** Running NER...")
+         success, entities, entity_map, error = self.ner_extractor(state.raw_text)
+
+         if success:
+             state.entities = entities
+             state.entity_map = entity_map
+             logger.info(f"**** Found {len(entities)} entities")
+             return True
+
+         logger.warning(f"**** NER failed: {error}")
+         state.entities = []
+         state.entity_map = {}
+         return True  # Non-critical, continue
+
+     def _use_gemini(self, state: AgentState) -> bool:
+         """Use Gemini for intelligent mapping."""
+         logger.info("**** Using Gemini mapping...")
+
+         success, result, error = self.gemini_mapper(
+             state.raw_text,
+             state.entities or [],
+             state.entity_map or {},
+             state.tables or []
+         )
+
+         if success and result:
+             state.fields = {
+                 'cust_number': result.get('customer_name', 'UNKNOWN')[:20],
+                 'posting_date': result.get('date', '2024-01-01'),
+                 'total_open_amount': float(result.get('total_amount', 0.0)),
+                 'business_code': 'U001',
+                 'cust_payment_terms': result.get('payment_terms', 'NAH4')[:10]
+             }
+
+             # High confidence from Gemini
+             state.confidence_map = {
+                 'cust_number': 0.9,
+                 'posting_date': 0.9,
+                 'total_open_amount': 0.9,
+                 'business_code': 0.3,
+                 'cust_payment_terms': 0.8
+             }
+
+             logger.info(f"**** Gemini mapped: {state.fields}")
+             return True
+
+         logger.warning(f"**** Gemini failed: {error}")
+         state.errors.append(f"Gemini mapping failed: {error}")
+         return False
+
+     def _use_regex(self, state: AgentState) -> bool:
+         """Fallback regex-based extraction."""
+         logger.info("**** Using regex fallback...")
+
+         from backend.app.api.ingest import map_with_regex
+
+         fields, confidence = map_with_regex(state.raw_text, state.entities or [])
+         state.fields = fields
+         state.confidence_map = confidence
+
+         logger.info(f"**** Regex mapped: {fields}")
+         return True
+
+     def _validate_fields(self, state: AgentState) -> bool:
+         """
+         Validate extracted fields using business rules.
+         Agent learns if data makes sense.
+         """
+         logger.info("✓ Validating fields...")
+
+         if not state.fields:
+             return False
+
+         validation_results = {}
+
+         # 1. Customer number shouldn't be empty or generic
+         cust = state.fields.get('cust_number', '')
+         if cust and cust != 'UNKNOWN' and len(cust) > 2:
+             validation_results['cust_number'] = True
+         else:
+             validation_results['cust_number'] = False
+             logger.warning("**** Customer number looks invalid")
+
+         # 2. Date should be reasonable (not default)
+         date = state.fields.get('posting_date', '')
+         if date and date != '2024-01-01':
+             validation_results['posting_date'] = True
+         else:
+             validation_results['posting_date'] = False
+             logger.warning("**** Date looks like default value")
+
+         # 3. Amount should be > 0
+         amount = state.fields.get('total_open_amount', 0.0)
+         if amount > 0:
+             validation_results['total_open_amount'] = True
+         else:
+             validation_results['total_open_amount'] = False
+             logger.warning("**** Amount is zero or missing")
+
+         # Adjust confidence based on validation
+         for field, is_valid in validation_results.items():
+             if not is_valid and state.confidence_map:
+                 state.confidence_map[field] *= 0.5  # Reduce confidence
+
+         # Mark as validated
+         state.history.append('validated')
+
+         success_count = sum(validation_results.values())
+         logger.info(f"✓ Validation: {success_count}/{len(validation_results)} checks passed")
+
+         return success_count >= 2  # At least 2 fields should be valid
+
+     def _is_validated(self, state: AgentState) -> bool:
+         """Check if validation has been performed."""
+         return 'validated' in state.history
+
+     def _calculate_overall_confidence(self, state: AgentState) -> float:
+         """Calculate overall confidence score."""
+         if not state.confidence_map:
+             return 0.0
+
+         # Weighted average (important fields have more weight)
+         weights = {
+             'cust_number': 0.3,
+             'posting_date': 0.2,
+             'total_open_amount': 0.3,
+             'cust_payment_terms': 0.1,
+             'business_code': 0.1
+         }
+
+         total_confidence = 0.0
+         total_weight = 0.0
+
+         for field, weight in weights.items():
+             if field in state.confidence_map:
+                 total_confidence += state.confidence_map[field] * weight
+                 total_weight += weight
+
+         return total_confidence / total_weight if total_weight > 0 else 0.0
+
+
+ # ==============================================
+ # Integration with existing code
+ # ==============================================
+
+ def create_agent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn):
+     """
+     Factory function to create agent with your existing functions.
+
+     Usage:
+         from backend.app.api.ingest import (
+             call_text_extractor, call_table_extractor,
+             call_ner, map_with_gemini
+         )
+
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         state = AgentState(doc_id="doc123", file_path=Path("invoice.pdf"))
+         result_state = agent.process(state)
+     """
+     return InvoiceAgent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn)
+
+
+ def run_agent_pipeline(job_id: str, doc_id: str, file_path: Path):
+     """
+     Replace your existing process_document() with this agentic version.
+     """
+     from backend.app.api.ingest import (
+         call_text_extractor, call_table_extractor,
+         call_ner, map_with_gemini,
+         save_extraction, save_invoice_fields,
+         update_job_status
+     )
+
+     try:
+         update_job_status(job_id, 'processing')
+
+         # Create agent
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         # Initialize state
+         state = AgentState(doc_id=doc_id, file_path=file_path)
+
+         # Let agent decide and execute autonomously
+         result_state = agent.process(state)
+
+         # Save results
+         if result_state.fields:
+             save_extraction(
+                 doc_id,
+                 result_state.raw_text,
+                 result_state.tables or [],
+                 result_state.entities or [],
+                 {
+                     'method': 'autonomous_agent',
+                     'attempts': result_state.attempts,
+                     'actions': result_state.history,
+                     'confidence': agent._calculate_overall_confidence(result_state)
+                 },
+                 None
+             )
+
+             save_invoice_fields(
+                 doc_id,
+                 result_state.fields,
+                 result_state.confidence_map or {}
+             )
+
+             # Check if needs human review
+             if AgentDecision.HUMAN_REVIEW.value in result_state.history:
+                 update_job_status(job_id, 'needs_review')
+             else:
+                 update_job_status(job_id, 'completed')
+
+             logger.info(f"**** Agent completed with {len(result_state.history)} actions")
+         else:
+             update_job_status(job_id, 'failed', 'Agent could not extract fields')
+
+     except Exception as e:
+         logger.error(f"**** Agent failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
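
A minimal sketch of driving the orchestrator with stub tools; the stub names and return values are illustrative, and the real callables live in backend.app.api.ingest. Note that each iteration of process() executes exactly one action, so the default max_attempts=3 stops after Gemini mapping; raising it lets validation and COMPLETE run.

from pathlib import Path
from backend.app.agent import AgentState, create_agent

def stub_text(path):            # matches text_extractor: (success, text, error)
    return True, "INVOICE\nBill To: Acme Corp\nTotal: $1,250.50", None

def stub_tables(path):          # matches table_extractor: (success, tables, error)
    return True, [], None

def stub_ner(text):             # matches ner_extractor: (success, entities, entity_map, error)
    return True, [{"entity_type": "ORG", "text": "Acme Corp"}], {"ORG": ["Acme Corp"]}, None

def stub_gemini(text, entities, entity_map, tables):  # (success, fields, error)
    return True, {"customer_name": "Acme Corp", "date": "2024-03-01",
                  "total_amount": 1250.50, "payment_terms": "NET30"}, None

agent = create_agent(stub_text, stub_tables, stub_ner, stub_gemini)
state = AgentState(doc_id="doc123", file_path=Path("invoice.pdf"), max_attempts=8)
result = agent.process(state)

# Expected decisions: extract_text -> run_ner -> use_gemini -> validate -> complete.
# With the confidence map _use_gemini assigns (0.9/0.9/0.9/0.3/0.8) and the weights in
# _calculate_overall_confidence (0.3/0.2/0.3/0.1/0.1), the overall score is
# 0.3*0.9 + 0.2*0.9 + 0.3*0.9 + 0.1*0.8 + 0.1*0.3 = 0.83, above the 0.6 retry threshold.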
backend/app/api/__init__.py ADDED
File without changes
backend/app/api/ingest.py ADDED
@@ -0,0 +1,1459 @@
+ """
+ Complete ingest pipeline with AUTONOMOUS AGENT INTEGRATION
+ ✅ Step 1: HF agents extract raw text
+ ✅ Step 2: HF NER finds entities
+ ✅ Step 3: Gemini maps to structured invoice fields
+ ✅ NEW: Autonomous agent orchestrates, validates, and self-corrects
+ ✅ UPDATED: Retry logic with progressively longer timeouts + local OCR fallback
+ """
+
+ import os
+ import uuid
+ import json
+ import sqlite3
+ import logging
+ import csv
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Optional, Dict, List, Any
+ from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
+ from pydantic import BaseModel
+ from filelock import FileLock
+ import httpx
+ import re
+
+ import sys
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True  # Override any existing config
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Setup
+ BASE_DIR = Path(__file__).parent.parent.parent.parent
+ STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
+ DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
+ PREDICT_ENDPOINT = 'http://localhost:7860/predict'
+
+ STORAGE_PATH.mkdir(parents=True, exist_ok=True)
+
+ router = APIRouter(prefix="/api", tags=["ingest"])
+
+
+ # ============================================
+ # LOCAL OCR FALLBACK (EasyOCR + Tesseract)
+ # ============================================
+
+ def extract_text_with_easyocr(file_path: Path) -> tuple:
+     """
+     EasyOCR - best free open-source OCR:
+     - Works offline
+     - 80+ languages
+     - GPU/CPU support
+     - Better accuracy than Tesseract for invoices
+     """
+     try:
+         import easyocr
+
+         logger.info("🔧 Using EasyOCR (best free OCR)...")
+
+         # Initialize reader (downloads models on first run).
+         # CPU by default; set gpu=True if CUDA is available.
+         reader = easyocr.Reader(['en'], gpu=False)
+
+         # Read image
+         result = reader.readtext(str(file_path), detail=0, paragraph=True)
+
+         # Join all text
+         text = '\n'.join(result)
+
+         if text and len(text.strip()) >= 10:
+             logger.info(f"✅ EasyOCR extracted {len(text)} characters")
+             return True, text, None
+
+         return False, None, "EasyOCR produced no usable text"
+
+     except ImportError:
+         logger.warning("⚠️ easyocr not installed. Install with: pip install easyocr")
+         return False, None, "easyocr not available"
+     except Exception as e:
+         logger.error(f"❌ EasyOCR failed: {e}")
+         return False, None, str(e)
+
+
+ def extract_text_with_tesseract(file_path: Path) -> tuple:
+     """
+     Tesseract OCR - fallback option.
+     Faster but less accurate than EasyOCR.
+     """
+     try:
+         import pytesseract
+         from PIL import Image
+
+         logger.info("🔧 Using Tesseract OCR as secondary fallback...")
+
+         image = Image.open(file_path)
+         text = pytesseract.image_to_string(image)
+
+         if text and len(text.strip()) >= 10:
+             logger.info(f"✅ Tesseract extracted {len(text)} characters")
+             return True, text, None
+
+         return False, None, "Tesseract produced no usable text"
+
+     except ImportError:
+         logger.warning("⚠️ pytesseract not installed. Install with: pip install pytesseract pillow")
+         return False, None, "pytesseract not available"
+     except Exception as e:
+         logger.error(f"❌ Tesseract failed: {e}")
+         return False, None, str(e)
+
+
+ def extract_text_with_local_ocr(file_path: Path) -> tuple:
+     """
+     Multi-tier local OCR fallback system:
+     1. Try EasyOCR (best accuracy)
+     2. Try Tesseract (faster, less accurate)
+     3. Give up
+     """
+     logger.info("=" * 70)
+     logger.info("🔄 HF extraction failed - trying local OCR fallbacks...")
+     logger.info("=" * 70)
+
+     # Priority 1: EasyOCR (best for invoices)
+     success, text, error = extract_text_with_easyocr(file_path)
+     if success:
+         logger.info("✅ EasyOCR succeeded!")
+         return True, text, None
+     else:
+         logger.warning(f"⚠️ EasyOCR failed: {error}")
+
+     # Priority 2: Tesseract (faster fallback)
+     success, text, error = extract_text_with_tesseract(file_path)
+     if success:
+         logger.info("✅ Tesseract succeeded!")
+         return True, text, None
+     else:
+         logger.warning(f"⚠️ Tesseract failed: {error}")
+
+     # All local OCR failed
+     logger.error("❌ All local OCR methods failed")
+     return False, None, "All local OCR methods failed"
+
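The chain above is one instance of a general tiered-fallback pattern: try each backend in priority order and return the first (success, text, error) triple that succeeds. A generic sketch with a hypothetical helper name, should more OCR engines be added later:

# Hypothetical helper, not part of the module; each backend is a callable
# file_path -> (success, text, error).
def run_with_fallbacks(file_path, backends):
    errors = []
    for name, backend in backends:
        ok, text, err = backend(file_path)
        if ok:
            return True, text, None
        errors.append(f"{name}: {err}")
    return False, None, "; ".join(errors)

# Equivalent to extract_text_with_local_ocr:
# run_with_fallbacks(path, [("easyocr", extract_text_with_easyocr),
#                           ("tesseract", extract_text_with_tesseract)])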
+ # ============================================
+ # STEP 1: HF Agent Text Extraction (UPDATED)
+ # ============================================
+
+ def get_agent_headers():
+     """Get headers with HF token"""
+     token = (
+         os.getenv('HF_TOKEN') or
+         os.getenv('HUGGINGFACE_API_TOKEN') or
+         os.getenv('AGENT_BEARER_TOKEN') or
+         ''
+     )
+     return {'Authorization': f'Bearer {token}'} if token else {}
+
+
+ def get_mime_type(file_path: Path) -> str:
+     """Get MIME type"""
+     ext = file_path.suffix.lower()
+     mime_map = {
+         '.pdf': 'application/pdf',
+         '.jpg': 'image/jpeg',
+         '.jpeg': 'image/jpeg',
+         '.png': 'image/png'
+     }
+     return mime_map.get(ext, 'application/octet-stream')
+
+
+ def call_text_extractor(file_path: Path, max_retries=3):
+     """
+     HF text extraction with retry logic and progressively longer timeouts.
+     Falls back to local OCR if all retries fail.
+     """
+     url = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         # Progressive timeout: 120s, 180s, 240s
+         timeout = base_timeout + (60 * attempt)
+
+         try:
+             logger.info(f"📄 Extracting text from {file_path.name} (attempt {attempt + 1}/{max_retries}, timeout={timeout}s)...")
+
+             filename = file_path.name
+             mime_type = get_mime_type(file_path)
+
+             with open(file_path, 'rb') as f:
+                 files = {'file': (filename, f, mime_type)}
+                 data = {
+                     'filename': filename,
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 headers = get_agent_headers()
+
+                 response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+                 text = result.get('result') or result.get('text') or result.get('extracted_text') or ''
+
+                 if text and len(text.strip()) >= 10:
+                     logger.info(f"✅ Extracted {len(text)} characters")
+                     return True, text, None
+
+                 logger.warning("⚠️ No text extracted from response")
+                 if attempt < max_retries - 1:
+                     continue
+                 return False, None, "No text extracted"
+
+             logger.warning(f"⚠️ HTTP {response.status_code}: {response.text[:200]}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ Timeout after {timeout}s on attempt {attempt + 1}")
+             if attempt < max_retries - 1:
+                 logger.info("🔄 Retrying with longer timeout...")
+                 continue
+         except Exception as e:
+             logger.error(f"❌ Error on attempt {attempt + 1}: {e}")
+             if attempt < max_retries - 1:
+                 logger.info("🔄 Retrying...")
+                 continue
+
+     # All retries failed - try local OCR fallback
+     logger.warning(f"⚠️ All {max_retries} HF extraction attempts failed, trying local OCR fallback...")
+     return extract_text_with_local_ocr(file_path)
+
+
+ def call_table_extractor(file_path: Path, max_retries=2):
+     """
+     HF table extraction with retry logic.
+     Non-critical, so fewer retries.
+     """
+     url = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         timeout = base_timeout + (60 * attempt)
+
+         try:
+             logger.info(f"📊 Extracting tables from {file_path.name} (attempt {attempt + 1}/{max_retries})...")
+
+             filename = file_path.name
+             mime_type = get_mime_type(file_path)
+
+             with open(file_path, 'rb') as f:
+                 files = {'file': (filename, f, mime_type)}
+                 data = {
+                     'filename': filename,
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 headers = get_agent_headers()
+
+                 response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+                 tables = result.get('result') or result.get('tables') or []
+                 logger.info(f"✅ Extracted {len(tables)} tables")
+                 return True, tables, None
+
+             logger.warning(f"⚠️ HTTP {response.status_code}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ Table extraction timeout on attempt {attempt + 1}")
+         except Exception as e:
+             logger.warning(f"⚠️ Table extraction error: {e}")
+
+     # Non-critical - return empty list
+     logger.info("ℹ️ Table extraction failed, continuing without tables")
+     return False, [], "Table extraction failed (non-critical)"
+
+
+ # ============================================
+ # STEP 2: HF NER (Named Entity Recognition)
+ # ============================================
+
+ def call_ner(text: str, file_path: Path = None, max_retries=2) -> tuple:
+     """
+     Extract named entities using HF NER agent with retry logic.
+     """
+     url = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         timeout = base_timeout + (30 * attempt)
+
+         try:
+             logger.info(f"🔍 Running NER to find entities (attempt {attempt + 1}/{max_retries})...")
+
+             headers = get_agent_headers()
+
+             # NER expects multipart/form-data with file OR text
+             if file_path and file_path.exists():
+                 # Send file
+                 filename = file_path.name
+                 mime_type = get_mime_type(file_path)
+
+                 with open(file_path, 'rb') as f:
+                     files = {'file': (filename, f, mime_type)}
+                     data = {
+                         'text': text[:5000],
+                         'filename': filename,
+                         'start_page': 1,
+                         'end_page': 1
+                     }
+                     response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+             else:
+                 # Send just text as form data
+                 data = {
+                     'text': text[:5000],
+                     'filename': 'document.txt',
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 response = httpx.post(url, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+
+                 # FIX: Handle both dict and string responses
+                 if isinstance(result, str):
+                     try:
+                         result = json.loads(result)
+                     except:
+                         logger.warning(f"⚠️ NER returned unparseable string: {result[:100]}")
+                         if attempt < max_retries - 1:
+                             continue
+                         return False, [], {}, "Invalid response format"
+
+                 # Extract entities
+                 entities = result.get('entities') or result.get('result') or []
+
+                 # Handle case where entities might also be a string
+                 if isinstance(entities, str):
+                     try:
+                         entities = json.loads(entities)
+                     except:
+                         entities = []
+
+                 logger.info(f"✅ Found {len(entities)} entities")
+
+                 # Group entities by type
+                 entity_map = {
+                     'PERSON': [],
+                     'ORG': [],
+                     'DATE': [],
+                     'MONEY': [],
+                     'CARDINAL': []
+                 }
+
+                 for entity in entities:
+                     if not isinstance(entity, dict):
+                         continue
+
+                     ent_type = entity.get('entity_type') or entity.get('label')
+                     ent_text = entity.get('text') or entity.get('word')
+
+                     if ent_type in entity_map and ent_text:
+                         entity_map[ent_type].append(ent_text)
+
+                 logger.info(f"📋 Entity summary: PERSON={len(entity_map['PERSON'])}, ORG={len(entity_map['ORG'])}, DATE={len(entity_map['DATE'])}, MONEY={len(entity_map['MONEY'])}")
+
+                 return True, entities, entity_map, None
+
+             logger.warning(f"⚠️ NER HTTP {response.status_code}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ NER timeout on attempt {attempt + 1}")
+         except Exception as e:
+             logger.error(f"❌ NER error on attempt {attempt + 1}: {e}")
+
+     # NER failed - return empty (non-critical)
+     logger.warning("⚠️ NER failed after retries, continuing without entities")
+     return False, [], {}, "NER failed (non-critical)"
+
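For reference, the entity_map grouping built above is a plain dict from entity label to a list of surface strings; a hypothetical example for a one-page invoice (all values illustrative):

entity_map = {
    'PERSON':   ['John Smith'],
    'ORG':      ['Acme Corp', 'Globex Ltd'],
    'DATE':     ['03/01/2024'],
    'MONEY':    ['$1,250.50', '$125.05'],
    'CARDINAL': ['12345'],
}
# map_with_gemini receives this grouping alongside the raw text, and map_with_regex
# below prefers the same ORG/DATE/MONEY lists over its regex patterns.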
+ # ============================================
+ # STEP 3: Gemini Intelligent Mapping
+ # ============================================
+
+ def map_with_gemini(text: str, entities: List, entity_map: Dict, tables: List):
+     """Use Gemini to intelligently map extracted data to invoice fields"""
+     try:
+         import google.generativeai as genai
+
+         api_key = os.getenv('GEMINI_API_KEY')
+         if not api_key:
+             logger.warning("⚠️ No Gemini API key configured")
+             return False, None, "No Gemini API key"
+
+         logger.info("🧠 Using Gemini for intelligent field mapping...")
+
+         genai.configure(api_key=api_key)
+         model = genai.GenerativeModel('models/gemini-2.5-flash')
+
+         # Build context for Gemini
+         context = f"""
+ EXTRACTED TEXT:
+ {text[:3000]}
+
+ NAMED ENTITIES FOUND:
+ - Organizations: {entity_map.get('ORG', [])}
+ - People: {entity_map.get('PERSON', [])}
+ - Dates: {entity_map.get('DATE', [])}
+ - Money amounts: {entity_map.get('MONEY', [])}
+ - Numbers: {entity_map.get('CARDINAL', [])}
+
+ TABLES:
+ {json.dumps(tables[:2], indent=2) if tables else 'None'}
+ """
+
+         prompt = f"""You are an expert at analyzing invoice data. Given the extracted text and entities below, map them to invoice fields.
+
+ {context}
+
+ Analyze the above data and return ONLY a valid JSON object with these exact fields:
+
+ {{
+     "customer_name": "the client/customer company name (check ORG entities first)",
+     "invoice_number": "the invoice number (check CARDINAL entities)",
+     "date": "invoice date in YYYY-MM-DD format (check DATE entities)",
+     "total_amount": numeric total amount only (check MONEY entities, no currency symbol),
+     "payment_terms": "payment terms like NET30, NET60, or NAH4 if not found",
+     "reasoning": "brief explanation of how you identified each field"
+ }}
+
+ Rules:
+ 1. Prefer entities over raw text when available
+ 2. Customer name is usually the first ORG after "Bill To" or "Client"
+ 3. Total amount is usually the largest MONEY value
+ 4. Date should be in YYYY-MM-DD format
+ 5. If uncertain, use these defaults: customer_name="UNKNOWN", date="2024-01-01", total_amount=0.0, payment_terms="NAH4"
+
+ Return ONLY the JSON object, no markdown, no explanation outside the JSON."""
+
+         response = model.generate_content(prompt)
+         text_response = response.text.strip()
+
+         # Remove markdown if present
+         text_response = text_response.replace('```json', '').replace('```', '').strip()
+
+         result = json.loads(text_response)
+
+         logger.info(f"✅ Gemini mapped: Customer={result.get('customer_name')}, Amount=${result.get('total_amount')}")
+         logger.info(f"💡 Reasoning: {result.get('reasoning', 'N/A')[:100]}")
+
+         return True, result, None
+
+     except json.JSONDecodeError as e:
+         logger.error(f"❌ Gemini returned invalid JSON: {e}")
+         logger.error(f"Response: {text_response[:500]}")
+         return False, None, f"Invalid JSON: {e}"
+     except Exception as e:
+         logger.error(f"❌ Gemini mapping failed: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         return False, None, str(e)
+
+
+ # ============================================
+ # Fallback: Regex Mapping
+ # ============================================
+
+ def map_with_regex(text: str, entities: List) -> tuple:
+     """Fallback regex-based field extraction"""
+     logger.info("🔤 Using regex fallback for field mapping...")
+
+     fields = {}
+     confidence = {}
+
+     # CUSTOMER NAME - try to use ORG entities first
+     org_entities = [e.get('text') or e.get('word') for e in entities
+                     if (e.get('entity_type') or e.get('label')) == 'ORG']
+
+     if org_entities:
+         fields['cust_number'] = org_entities[0][:20]
+         confidence['cust_number'] = 0.8
+     else:
+         # Regex fallback
+         client_patterns = [
+             r'(?:Client|Bill\s+To|Customer)[:\s]+(.*?)(?:\n|Tax|IBAN)',
+             r'(?:customer|client)[\s:]+([A-Za-z][A-Za-z\s,&-]+?)(?:\n|$)',
+         ]
+
+         for pattern in client_patterns:
+             match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+             if match:
+                 client = match.group(1).strip()
+                 words = [w.strip() for w in client.replace(',', ' ').split() if len(w.strip()) > 2]
+                 if words:
+                     fields['cust_number'] = words[0][:20]
+                     confidence['cust_number'] = 0.6
+                     break
+
+     if 'cust_number' not in fields:
+         fields['cust_number'] = 'UNKNOWN'
+         confidence['cust_number'] = 0.1
+
+     # DATE - try DATE entities first
+     date_entities = [e.get('text') or e.get('word') for e in entities
+                      if (e.get('entity_type') or e.get('label')) == 'DATE']
+
+     if date_entities:
+         date_str = date_entities[0]
+         for fmt in ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d', '%m-%d-%Y']:
+             try:
+                 dt = datetime.strptime(date_str, fmt)
+                 fields['posting_date'] = dt.strftime('%Y-%m-%d')
+                 confidence['posting_date'] = 0.8
+                 break
+             except:
+                 continue
+
+     if 'posting_date' not in fields:
+         date_patterns = [
+             r'(?:Date\s+of\s+issue|Invoice\s+Date|Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
+         ]
+
+         for pattern in date_patterns:
+             match = re.search(pattern, text, re.IGNORECASE)
+             if match:
+                 date_str = match.group(1)
+                 for fmt in ['%m/%d/%Y', '%d/%m/%Y']:
+                     try:
+                         dt = datetime.strptime(date_str, fmt)
+                         fields['posting_date'] = dt.strftime('%Y-%m-%d')
+                         confidence['posting_date'] = 0.7
+                         break
+                     except:
+                         continue
+                 if 'posting_date' in fields:
+                     break
+
+     if 'posting_date' not in fields:
+         fields['posting_date'] = datetime.now().strftime('%Y-%m-%d')
+         confidence['posting_date'] = 0.1
+
+     # AMOUNT - try MONEY entities first
+     money_entities = [e.get('text') or e.get('word') for e in entities
+                       if (e.get('entity_type') or e.get('label')) == 'MONEY']
+
+     if money_entities:
+         amounts = []
+         for money_str in money_entities:
+             try:
+                 # Remove currency symbols and parse
+                 amt_str = re.sub(r'[^\d.]', '', money_str)
+                 amt = float(amt_str)
+                 if amt > 10:
+                     amounts.append(amt)
+             except:
+                 pass
+
+         if amounts:
+             fields['total_open_amount'] = max(amounts)
+             confidence['total_open_amount'] = 0.8
+             logger.info(f"✅ Found amount from MONEY entity: ${fields['total_open_amount']}")
+
+     if 'total_open_amount' not in fields:
+         # Regex fallback
+         pattern = r'\$\s*([0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2})'
+         amounts = []
+         for match in re.finditer(pattern, text):
+             try:
+                 amt = float(match.group(1).replace(',', ''))
+                 if amt > 50:
+                     amounts.append(amt)
+             except:
+                 pass
+
+         if amounts:
+             fields['total_open_amount'] = max(amounts)
+             confidence['total_open_amount'] = 0.6
+         else:
+             fields['total_open_amount'] = 0.0
+             confidence['total_open_amount'] = 0.0
+             logger.warning("⚠️ No amount found!")
+
+     # PAYMENT TERMS
+     terms_match = re.search(r'(NET\s?\d{1,2}|N\d{2}|NAH\d)', text, re.IGNORECASE)
+     fields['cust_payment_terms'] = terms_match.group(1).upper() if terms_match else 'NAH4'
+     confidence['cust_payment_terms'] = 0.7 if terms_match else 0.2
+
+     # BUSINESS CODE
+     fields['business_code'] = 'U001'
+     confidence['business_code'] = 0.2
+
+     return fields, confidence
+
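A quick toy run of the fallback mapper above (no NER entities are passed, so the pure regex paths fire; expected values follow from the patterns shown):

sample = """INVOICE
Date of issue: 03/01/2024
Bill To: Acme Corporation
Subtotal: $1,125.45
Total: $1,250.50
Terms: NET30
"""
fields, confidence = map_with_regex(sample, entities=[])
# fields should come out roughly as:
# {'cust_number': 'Acme', 'posting_date': '2024-03-01',
#  'total_open_amount': 1250.5, 'cust_payment_terms': 'NET30',
#  'business_code': 'U001'}
# with confidence scores of 0.6 / 0.7 / 0.6 / 0.7 / 0.2 respectively.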
604
+ # ============================================
605
+ # Database Functions
606
+ # ============================================
607
+
608
+ def update_job_status(job_id: str, status: str, error_text: str = None):
609
+ """Update job status"""
610
+ with FileLock(str(LOCK_PATH), timeout=10):
611
+ conn = sqlite3.connect(str(DB_PATH))
612
+ cursor = conn.cursor()
613
+ cursor.execute("""
614
+ UPDATE ingest_jobs
615
+ SET status = ?, error_text = ?, updated_at = CURRENT_TIMESTAMP
616
+ WHERE job_id = ?
617
+ """, (status, error_text, job_id))
618
+ conn.commit()
619
+ conn.close()
620
+
621
+
622
+ def save_extraction(doc_id: str, raw_text: str, tables: list, entities: list, classification: dict, summary: str = None):
623
+ """Save extraction results"""
624
+ with FileLock(str(LOCK_PATH), timeout=10):
625
+ conn = sqlite3.connect(str(DB_PATH))
626
+ cursor = conn.cursor()
627
+ cursor.execute("""
628
+ INSERT OR REPLACE INTO extractions (
629
+ doc_id, raw_text, tables_json, entities_json,
630
+ classification_json, summary_text
631
+ ) VALUES (?, ?, ?, ?, ?, ?)
632
+ """, (
633
+ doc_id,
634
+ raw_text,
635
+ json.dumps(tables) if tables else None,
636
+ json.dumps(entities) if entities else None,
637
+ json.dumps(classification) if classification else None,
638
+ summary
639
+ ))
640
+ conn.commit()
641
+ conn.close()
642
+
643
+
644
+ def save_invoice_fields(doc_id: str, fields: Dict, confidence_map: Dict):
645
+ """Save invoice fields"""
646
+ with FileLock(str(LOCK_PATH), timeout=10):
647
+ conn = sqlite3.connect(str(DB_PATH))
648
+ cursor = conn.cursor()
649
+ cursor.execute("""
650
+ INSERT INTO invoice_fields (
651
+ doc_id, cust_number, posting_date, total_open_amount,
652
+ business_code, cust_payment_terms, confidence_map
653
+ ) VALUES (?, ?, ?, ?, ?, ?, ?)
654
+ """, (
655
+ doc_id,
656
+ fields.get('cust_number'),
657
+ fields.get('posting_date'),
658
+ fields.get('total_open_amount'),
659
+ fields.get('business_code'),
660
+ fields.get('cust_payment_terms'),
661
+ json.dumps(confidence_map)
662
+ ))
663
+ conn.commit()
664
+ conn.close()
665
+
666
+
667
+ # ============================================
668
+ # AGENT MODE FLAG (Environment Variable)
669
+ # ============================================
670
+
671
+ USE_AGENT_MODE = os.getenv('USE_AGENT_MODE', 'true').lower() == 'true'
672
+
673
+
674
+ # ============================================
675
+ # Main Processing Pipeline
676
+ # ============================================
677
+
+ def process_document_legacy(job_id: str, doc_id: str, file_path: Path):
+     """
+     LEGACY PIPELINE (Original Implementation):
+     1. HF agents extract text + tables
+     2. HF NER finds entities
+     3. Gemini maps entities to invoice fields
+     """
+     logger.info("=" * 70)
+     logger.info(f"🚀 Starting LEGACY pipeline for {file_path.name}")
+     logger.info("=" * 70)
+
+     try:
+         update_job_status(job_id, 'processing')
+
+         # STEP 1: Extract text with HF agents
+         logger.info("STEP 1: HF TEXT + TABLE EXTRACTION")
+         logger.info("-" * 70)
+
+         success, raw_text, error = call_text_extractor(file_path)
+         if not success or not raw_text:
+             update_job_status(job_id, 'failed', f"Text extraction failed: {error}")
+             return
+
+         # Extract tables (optional, won't fail the job if it doesn't work)
+         _, tables, _ = call_table_extractor(file_path)
+
+         # STEP 2: NER to find entities
+         logger.info("-" * 70)
+         logger.info("STEP 2: NER - NAMED ENTITY RECOGNITION")
+         logger.info("-" * 70)
+
+         ner_success, entities, entity_map, ner_error = call_ner(raw_text, file_path)
+
+         if not ner_success:
+             logger.warning(f"⚠️ NER failed: {ner_error}, continuing without entities")
+             entities = []
+             entity_map = {}
+
+         # STEP 3: Gemini intelligent mapping
+         logger.info("-" * 70)
+         logger.info("STEP 3: GEMINI INTELLIGENT MAPPING")
+         logger.info("-" * 70)
+
+         gemini_success, gemini_result, gemini_error = map_with_gemini(
+             raw_text, entities, entity_map, tables
+         )
+
+         if gemini_success and gemini_result:
+             # Use Gemini's mapping ('or' guards against explicit None values)
+             fields = {
+                 'cust_number': (gemini_result.get('customer_name') or 'UNKNOWN')[:20],
+                 'posting_date': gemini_result.get('date') or datetime.now().strftime('%Y-%m-%d'),
+                 'total_open_amount': float(gemini_result.get('total_amount') or 0.0),
+                 'business_code': 'U001',
+                 'cust_payment_terms': (gemini_result.get('payment_terms') or 'NAH4')[:10]
+             }
+
+             confidence_map = {
+                 'cust_number': 0.95,
+                 'posting_date': 0.95,
+                 'total_open_amount': 0.95,
+                 'business_code': 0.2,
+                 'cust_payment_terms': 0.8
+             }
+
+             method = 'hf_ner_gemini'
+
+         else:
+             # Fallback to regex mapping
+             logger.warning(f"⚠️ Gemini mapping failed: {gemini_error}")
+             logger.info("-" * 70)
+             logger.info("FALLBACK: REGEX MAPPING")
+             logger.info("-" * 70)
+
+             fields, confidence_map = map_with_regex(raw_text, entities)
+             method = 'hf_ner_regex'
+
+         # Save results
+         save_extraction(
+             doc_id, raw_text, tables, entities,
+             {'method': method, 'entity_count': len(entities)},
+             None
+         )
+         save_invoice_fields(doc_id, fields, confidence_map)
+
+         logger.info("=" * 70)
+         logger.info(f"✅ EXTRACTION COMPLETE - Method: {method}")
+         logger.info(f"📋 Fields: {fields}")
+         logger.info("=" * 70)
+
+         # Prediction call is disabled in the legacy pipeline
+         # logger.info("🔮 Calling payment prediction...")
+         # try:
+         #     pred_response = httpx.post(PREDICT_ENDPOINT, json=fields, timeout=30)
+         #     if pred_response.status_code == 200:
+         #         pred_result = pred_response.json()
+         #         logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
+         # except Exception as e:
+         #     logger.error(f"⚠️ Prediction failed: {e}")
+
+         update_job_status(job_id, 'completed')
+         logger.info(f"🎉 Job {job_id} completed successfully")
+
+     except Exception as e:
+         logger.error(f"❌ Job {job_id} failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
+
+
+ def process_document_agent(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
+     """
+     NEW AUTONOMOUS AGENT PIPELINE with optional output-filtering wrapper
+     """
+     try:
+         # Clean up user_message (treat sentinel strings from form data as "no message")
+         if user_message in [None, 'None', '', 'null', 'undefined']:
+             user_message = None
+         else:
+             user_message = str(user_message).strip()
+             if not user_message:
+                 user_message = None
+
+         logger.info("=" * 70)
+         logger.info(f"🔍 AGENT - Processing with message: '{user_message}'")
+         logger.info(f"🔍 Type: {type(user_message)}")
+         logger.info(f"🔍 Is None: {user_message is None}")
+         logger.info("=" * 70)
+
+         from backend.app.agent.agent_orchestrator import (
+             InvoiceAgent, AgentState, create_agent
+         )
+
+         logger.info("=" * 70)
+         logger.info(f"🤖 AUTONOMOUS AGENT MODE for {file_path.name}")
+         logger.info("=" * 70)
+
+         update_job_status(job_id, 'processing')
+
+         # Create agent
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         # Initialize state
+         state = AgentState(doc_id=doc_id, file_path=file_path)
+
+         # Let the agent autonomously decide and execute
+         result_state = agent.process(state)
+
+         # ============================================
+         # WRAPPER INTEGRATION
+         # ============================================
+
+         full_extraction = result_state.fields
+         final_result = full_extraction
+         wrapper_used = False
+
+         # Apply the Gemini output filter only when a user message was provided
+         if user_message is not None and len(user_message) > 0:
+             logger.info("=" * 70)
+             logger.info(f"💬 USER MESSAGE DETECTED: '{user_message}'")
+             logger.info("🎯 Activating Gemini wrapper to filter output...")
+             logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
+             logger.info("=" * 70)
+
+             try:
+                 from backend.app.wrappers.gemini_output_filter import GeminiOutputFilter
+
+                 wrapper = GeminiOutputFilter()
+                 final_result = wrapper.filter_output(user_message, full_extraction)
+                 wrapper_used = True
+
+                 logger.info("=" * 70)
+                 logger.info("✅ WRAPPER SUCCESS!")
+                 logger.info(f"📤 Original fields: {list(full_extraction.keys())}")
+                 logger.info(f"🎯 Filtered fields: {list(final_result.keys())}")
+                 logger.info(f"📋 Filtered result: {json.dumps(final_result, indent=2)}")
+                 logger.info("=" * 70)
+
+             except Exception as wrapper_error:
+                 logger.error("=" * 70)
+                 logger.error(f"❌ WRAPPER FAILED: {wrapper_error}")
+                 logger.error("=" * 70)
+                 import traceback
+                 logger.error(traceback.format_exc())
+                 logger.warning("📦 Falling back to full extraction")
+                 final_result = full_extraction
+                 wrapper_used = False
+         else:
+             logger.info("=" * 70)
+             logger.info("ℹ️ No user message provided - returning full extraction")
+             logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
+             logger.info("=" * 70)
+
+         # ============================================
+         # Save results
+         # ============================================
+
+         if result_state.fields:
+             # Determine method
+             if 'use_gemini' in result_state.history:
+                 method = 'autonomous_agent_gemini'
+             elif 'use_regex' in result_state.history:
+                 method = 'autonomous_agent_regex'
+             else:
+                 method = 'autonomous_agent'
+
+             if wrapper_used:
+                 method += '_with_wrapper'
+
+             save_extraction(
+                 doc_id,
+                 result_state.raw_text or '',
+                 result_state.tables or [],
+                 result_state.entities or [],
+                 {
+                     'method': method,
+                     'attempts': result_state.attempts,
+                     'actions': result_state.history,
+                     'confidence': agent._calculate_overall_confidence(result_state),
+                     'errors': result_state.errors,
+                     'user_message': user_message,
+                     'wrapper_used': wrapper_used,
+                     'full_extraction_keys': list(full_extraction.keys()) if full_extraction else [],
+                     'filtered_keys': list(final_result.keys()) if wrapper_used else None
+                 },
+                 None
+             )
+
+             # Save the (possibly filtered) result
+             save_invoice_fields(
+                 doc_id,
+                 final_result,
+                 result_state.confidence_map or {}
+             )
+
+             # Call prediction
+             logger.info("🔮 Calling payment prediction...")
+             try:
+                 pred_response = httpx.post(PREDICT_ENDPOINT, json=final_result, timeout=30)
+
+                 if pred_response.status_code == 200:
+                     pred_result = pred_response.json()
+                     logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
+             except Exception as e:
+                 logger.error(f"⚠️ Prediction failed: {e}")
+
+             # Check status
+             from backend.app.agent.agent_orchestrator import AgentDecision
+             if AgentDecision.HUMAN_REVIEW.value in result_state.history:
+                 update_job_status(job_id, 'needs_review')
+                 logger.info("👤 Agent requesting human review")
+             else:
+                 update_job_status(job_id, 'completed')
+                 logger.info(f"✅ Agent completed with confidence: {agent._calculate_overall_confidence(result_state):.2f}")
+         else:
+             update_job_status(job_id, 'failed', 'Agent could not extract fields')
+             logger.error("❌ Agent failed to extract any fields")
+
+     except ImportError as e:
+         logger.error(f"❌ Agent module not found: {e}")
+         logger.info("⚠️ Falling back to legacy pipeline...")
+         process_document_legacy(job_id, doc_id, file_path)
+     except Exception as e:
+         logger.error(f"❌ Agent failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
+
+ def process_document(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
+     """
+     Main entry point - routes to the agent or legacy pipeline.
+     """
+     # Clean up user_message
+     if user_message in [None, 'None', '', 'null', 'undefined']:
+         user_message = None
+     else:
+         user_message = str(user_message).strip()
+         if not user_message:
+             user_message = None
+
+     logger.info("=" * 70)
+     logger.info(f"🔍 PROCESS_DOCUMENT - Cleaned user_message: '{user_message}'")
+     logger.info(f"🔍 Type: {type(user_message)}")
+     logger.info(f"🔍 Is None: {user_message is None}")
+     logger.info("=" * 70)
+
+     if USE_AGENT_MODE:
+         logger.info("🤖 Using AUTONOMOUS AGENT mode")
+         process_document_agent(job_id, doc_id, file_path, user_message=user_message)
+     else:
+         logger.info("📋 Using LEGACY pipeline mode")
+         process_document_legacy(job_id, doc_id, file_path)
+
+
+ # ============================================
+ # API Endpoints
+ # ============================================
+
+ from typing import Any, List  # ensure List/Any are available for the batch models below
+
+
+ class IngestResponse(BaseModel):
+     job_id: str
+     doc_id: str
+     filename: str
+     status: str
+     message: str
+
+
+ class JobStatusResponse(BaseModel):
+     job_id: str
+     doc_id: str
+     filename: str
+     status: str
+     error_text: Optional[str] = None
+     created_at: str
+     updated_at: str
+     extraction: Optional[Dict] = None
+     invoice_fields: Optional[Dict] = None
+
+
+ class BatchIngestResponse(BaseModel):
+     batch_id: str
+     total_files: int
+     jobs: List[Dict[str, str]]
+     message: str
+
+
+ class BatchStatusResponse(BaseModel):
+     batch_id: str
+     total_files: int
+     completed: int
+     processing: int
+     failed: int
+     queued: int
+     jobs: List[Dict[str, Any]]
+
+ @router.post("/ingest", response_model=IngestResponse)
+ async def ingest_document(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(...),
+     message: str = Form(None)  # Form(None) rather than Optional[str] = None so multipart forms bind correctly
+ ):
+     """Upload a single document; an optional message narrows the returned fields."""
+
+     # Clean message parameter
+     cleaned_message = None
+     if message and message not in ['None', 'null', '', 'undefined']:
+         cleaned_message = message.strip()
+         if not cleaned_message:
+             cleaned_message = None
+
+     logger.info("=" * 70)
+     logger.info(f"📨 API ENDPOINT - Raw message: '{message}'")
+     logger.info(f"✨ Cleaned message: '{cleaned_message}'")
+     logger.info(f"🔍 Message type: {type(cleaned_message)}")
+     logger.info(f"❓ Is None: {cleaned_message is None}")
+     logger.info("=" * 70)
+
+     try:
+         allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
+         if file.content_type not in allowed_types:
+             raise HTTPException(400, f"Invalid file type: {file.content_type}")
+
+         job_id = f"job_{uuid.uuid4().hex[:12]}"
+         doc_id = f"doc_{uuid.uuid4().hex[:12]}"
+         file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
+
+         stored_filename = f"{doc_id}.{file_ext}"
+         file_path = STORAGE_PATH / stored_filename
+
+         content = await file.read()
+         with open(file_path, 'wb') as f:
+             f.write(content)
+
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
+                 VALUES (?, ?, ?, 'queued')
+             """, (job_id, doc_id, file.filename))
+
+             cursor.execute("""
+                 INSERT INTO documents (doc_id, job_id, path, filename, content_type)
+                 VALUES (?, ?, ?, ?, ?)
+             """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
+
+             conn.commit()
+             conn.close()
+
+         # Start processing with the cleaned message
+         background_tasks.add_task(
+             process_document,
+             job_id,
+             doc_id,
+             file_path,
+             user_message=cleaned_message
+         )
+
+         logger.info(f"🚀 Background task started with message: '{cleaned_message}'")
+
+         mode = "autonomous agent"
+         if cleaned_message:
+             mode += " with intelligent filtering"
+             logger.info(f"🎯 User wants: '{cleaned_message}'")
+
+         return IngestResponse(
+             job_id=job_id,
+             doc_id=doc_id,
+             filename=file.filename,
+             status='queued',
+             message=f'Document uploaded. Processing with {mode}.'
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Ingest endpoint error: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/{job_id}", response_model=JobStatusResponse)
+ def get_ingest_status(job_id: str):
+     """Get job status with agent decision history (if applicable)"""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("SELECT * FROM ingest_jobs WHERE job_id = ?", (job_id,))
+             job = cursor.fetchone()
+             if not job:
+                 conn.close()
+                 raise HTTPException(404, "Job not found")
+
+             job_data = dict(job)
+             doc_id = job_data['doc_id']
+
+             if job_data['status'] in ['completed', 'needs_review']:
+                 cursor.execute("SELECT * FROM extractions WHERE doc_id = ?", (doc_id,))
+                 extraction = cursor.fetchone()
+                 if extraction:
+                     ext_dict = dict(extraction)
+                     if ext_dict.get('raw_text'):
+                         ext_dict['raw_text'] = ext_dict['raw_text'][:500] + "..."
+                     job_data['extraction'] = ext_dict
+
+                 cursor.execute("SELECT * FROM invoice_fields WHERE doc_id = ?", (doc_id,))
+                 invoice = cursor.fetchone()
+                 if invoice:
+                     inv_dict = dict(invoice)
+                     if inv_dict.get('confidence_map'):
+                         inv_dict['confidence_map'] = json.loads(inv_dict['confidence_map'])
+                     job_data['invoice_fields'] = inv_dict
+
+             conn.close()
+             return JobStatusResponse(**job_data)
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Job status error: {e}")
+         raise HTTPException(500, str(e))
+
+
+ @router.post("/ingest/batch", response_model=BatchIngestResponse)
+ async def ingest_batch_documents(
+     background_tasks: BackgroundTasks,
+     files: List[UploadFile] = File(...),
+     message: str = Form(None)
+ ):
+     """
+     Upload multiple documents (up to 50 per batch) for batch processing.
+
+     Examples:
+     1. Batch upload without filtering:
+        curl -F "files=@invoice1.jpg" -F "files=@invoice2.pdf" -F "files=@invoice3.png" \
+             http://localhost:7860/api/ingest/batch
+
+     2. Batch upload with the same extraction rule for all files:
+        curl -F "files=@invoice1.jpg" -F "files=@invoice2.jpg" \
+             -F "message=extract only total and date" \
+             http://localhost:7860/api/ingest/batch
+     """
+     # Validate batch size
+     if len(files) > 50:
+         raise HTTPException(400, "Maximum 50 files per batch")
+
+     if len(files) == 0:
+         raise HTTPException(400, "No files provided")
+
+     # Clean message
+     cleaned_message = None
+     if message and message not in ['None', 'null', '', 'undefined']:
+         cleaned_message = message.strip()
+         if not cleaned_message:
+             cleaned_message = None
+
+     batch_id = f"batch_{uuid.uuid4().hex[:12]}"
+     jobs = []
+
+     logger.info("=" * 70)
+     logger.info(f"📦 BATCH UPLOAD - {len(files)} files")
+     logger.info(f"📦 Batch ID: {batch_id}")
+     logger.info(f"📦 Message: '{cleaned_message}'")
+     logger.info("=" * 70)
+
+     try:
+         allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
+
+         for idx, file in enumerate(files):
+             # Validate each file
+             if file.content_type not in allowed_types:
+                 logger.warning(f"⚠️ Skipping {file.filename} - invalid type: {file.content_type}")
+                 continue
+
+             # Create a job for this file
+             job_id = f"job_{uuid.uuid4().hex[:12]}"
+             doc_id = f"doc_{uuid.uuid4().hex[:12]}"
+             file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
+
+             stored_filename = f"{doc_id}.{file_ext}"
+             file_path = STORAGE_PATH / stored_filename
+
+             # Save file
+             content = await file.read()
+             with open(file_path, 'wb') as f:
+                 f.write(content)
+
+             # Save to database
+             with FileLock(str(LOCK_PATH), timeout=10):
+                 conn = sqlite3.connect(str(DB_PATH))
+                 cursor = conn.cursor()
+
+                 cursor.execute("""
+                     INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
+                     VALUES (?, ?, ?, 'queued')
+                 """, (job_id, doc_id, file.filename))
+
+                 cursor.execute("""
+                     INSERT INTO documents (doc_id, job_id, path, filename, content_type)
+                     VALUES (?, ?, ?, ?, ?)
+                 """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
+
+                 conn.commit()
+                 conn.close()
+
+             # Queue processing
+             background_tasks.add_task(
+                 process_document,
+                 job_id,
+                 doc_id,
+                 file_path,
+                 user_message=cleaned_message
+             )
+
+             jobs.append({
+                 'job_id': job_id,
+                 'doc_id': doc_id,
+                 'filename': file.filename,
+                 'status': 'queued'
+             })
+
+             logger.info(f"✅ [{idx+1}/{len(files)}] Queued: {file.filename}")
+
+         if not jobs:
+             raise HTTPException(400, "No valid files to process")
+
+         # Save batch metadata
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             # Create batch_jobs table if it doesn't exist
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS batch_jobs (
+                     batch_id TEXT PRIMARY KEY,
+                     total_files INTEGER,
+                     message TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                 )
+             """)
+
+             cursor.execute("""
+                 INSERT INTO batch_jobs (batch_id, total_files, message)
+                 VALUES (?, ?, ?)
+             """, (batch_id, len(jobs), cleaned_message))
+
+             # Link jobs to batch
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS batch_job_mapping (
+                     batch_id TEXT,
+                     job_id TEXT,
+                     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+                 )
+             """)
+
+             for job in jobs:
+                 cursor.execute("""
+                     INSERT INTO batch_job_mapping (batch_id, job_id)
+                     VALUES (?, ?)
+                 """, (batch_id, job['job_id']))
+
+             conn.commit()
+             conn.close()
+
+         mode = "autonomous agent"
+         if cleaned_message:
+             mode += " with intelligent filtering"
+
+         logger.info(f"🚀 Batch {batch_id} processing started with {len(jobs)} files")
+
+         return BatchIngestResponse(
+             batch_id=batch_id,
+             total_files=len(jobs),
+             jobs=jobs,
+             message=f'Batch of {len(jobs)} documents uploaded. Processing with {mode}.'
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Batch ingest error: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/batch/{batch_id}", response_model=BatchStatusResponse)
+ def get_batch_status(batch_id: str):
+     """
+     Get the status of all jobs in a batch.
+
+     Example:
+     curl http://localhost:7860/api/ingest/batch/batch_abc123
+     """
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             # Get batch info
+             cursor.execute("SELECT * FROM batch_jobs WHERE batch_id = ?", (batch_id,))
+             batch = cursor.fetchone()
+             if not batch:
+                 conn.close()
+                 raise HTTPException(404, "Batch not found")
+
+             # Get all jobs in the batch
+             cursor.execute("""
+                 SELECT j.* FROM ingest_jobs j
+                 JOIN batch_job_mapping bm ON j.job_id = bm.job_id
+                 WHERE bm.batch_id = ?
+             """, (batch_id,))
+
+             jobs = cursor.fetchall()
+             conn.close()
+
+             # Count statuses
+             status_counts = {
+                 'completed': 0,
+                 'processing': 0,
+                 'failed': 0,
+                 'queued': 0,
+                 'needs_review': 0
+             }
+
+             jobs_list = []
+             for job in jobs:
+                 job_dict = dict(job)
+                 status = job_dict['status']
+                 status_counts[status] = status_counts.get(status, 0) + 1
+
+                 jobs_list.append({
+                     'job_id': job_dict['job_id'],
+                     'doc_id': job_dict['doc_id'],
+                     'filename': job_dict['filename'],
+                     'status': status,
+                     'error_text': job_dict.get('error_text'),
+                     'created_at': job_dict['created_at'],
+                     'updated_at': job_dict['updated_at']
+                 })
+
+             return BatchStatusResponse(
+                 batch_id=batch_id,
+                 total_files=len(jobs),
+                 completed=status_counts['completed'],
+                 processing=status_counts['processing'],
+                 failed=status_counts['failed'],
+                 queued=status_counts['queued'],
+                 jobs=jobs_list
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Batch status error: {e}")
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/batch/{batch_id}/download")
+ def download_batch_results(batch_id: str):
+     """
+     Download all extracted data from a batch as CSV.
+
+     Example:
+     curl http://localhost:7860/api/ingest/batch/batch_abc123/download -o results.csv
+     """
+     try:
+         import csv
+         from io import StringIO
+         from fastapi.responses import StreamingResponse
+
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             # Get all completed jobs in the batch
+             cursor.execute("""
+                 SELECT j.*, f.* FROM ingest_jobs j
+                 JOIN batch_job_mapping bm ON j.job_id = bm.job_id
+                 LEFT JOIN invoice_fields f ON j.doc_id = f.doc_id
+                 WHERE bm.batch_id = ? AND j.status = 'completed'
+             """, (batch_id,))
+
+             results = cursor.fetchall()
+             conn.close()
+
+         if not results:
+             raise HTTPException(404, "No completed jobs found in batch")
+
+         # Create CSV
+         output = StringIO()
+         writer = csv.writer(output)
+
+         # Header
+         writer.writerow([
+             'filename', 'doc_id', 'customer', 'date', 'amount',
+             'payment_terms', 'business_code', 'status'
+         ])
+
+         # Data rows
+         for row in results:
+             writer.writerow([
+                 row['filename'],
+                 row['doc_id'],
+                 row['cust_number'] or 'N/A',
+                 row['posting_date'] or 'N/A',
+                 row['total_open_amount'] or 0.0,
+                 row['cust_payment_terms'] or 'N/A',
+                 row['business_code'] or 'N/A',
+                 row['status']
+             ])
+
+         output.seek(0)
+
+         return StreamingResponse(
+             iter([output.getvalue()]),
+             media_type="text/csv",
+             headers={
+                 "Content-Disposition": f"attachment; filename=batch_{batch_id}_results.csv"
+             }
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(500, str(e))
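Usage sketch (illustrative, not part of the diff): a client can exercise the single-document endpoint above and poll the job until the background task settles. It assumes the service is reachable on port 7860 (the port the Dockerfile exposes) and that invoice.pdf is a placeholder file; httpx is already a backend dependency.

import time
import httpx

BASE = "http://localhost:7860/api"

# Upload one document with an optional filtering message.
with open("invoice.pdf", "rb") as f:
    resp = httpx.post(
        f"{BASE}/ingest",
        files={"file": ("invoice.pdf", f, "application/pdf")},
        data={"message": "extract only total and date"},
    )
resp.raise_for_status()
job_id = resp.json()["job_id"]

# Poll until the background task reaches a terminal status.
while True:
    status = httpx.get(f"{BASE}/ingest/{job_id}").json()
    if status["status"] in ("completed", "needs_review", "failed"):
        break
    time.sleep(2)

print(status.get("invoice_fields"))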
backend/app/utils/__init__.py ADDED
File without changes
backend/app/utils/agent_client.py ADDED
@@ -0,0 +1,153 @@
+ """
+ HF Agent client with proper environment variable support.
+ """
+
+ import httpx
+ import os
+ import time
+ import logging
+ from typing import Dict, Optional, Tuple
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+ # Load from environment
+ TEXT_EXTRACTOR_URL = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
+ TABLE_EXTRACTOR_URL = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
+ NER_URL = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
+ CLASSIFY_URL = os.getenv('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
+ SUMMARIZER_URL = os.getenv('SUMMARIZER_URL', '')  # Optional
+
+ AGENT_BEARER_TOKEN = os.getenv('AGENT_BEARER_TOKEN', '')
+ AGENT_TIMEOUT_SECONDS = int(os.getenv('AGENT_TIMEOUT_SECONDS', '30'))
+
+
+ def get_headers() -> Dict:
+     """Get headers with an optional bearer token."""
+     headers = {}
+     if AGENT_BEARER_TOKEN:
+         headers['Authorization'] = f'Bearer {AGENT_BEARER_TOKEN}'
+     return headers
+
+
+ def call_agent_with_retry(
+     url: str,
+     files: Optional[Dict] = None,
+     data: Optional[Dict] = None,
+     json: Optional[Dict] = None,
+     max_retries: int = 1
+ ) -> Tuple[bool, Optional[Dict], Optional[str]]:
+     """Call an agent endpoint with retry logic."""
+     headers = get_headers()
+
+     for attempt in range(max_retries + 1):
+         try:
+             with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
+                 if files:
+                     response = client.post(url, headers=headers, files=files, data=data)
+                 elif json:
+                     response = client.post(url, headers=headers, json=json)
+                 else:
+                     response = client.post(url, headers=headers, data=data)
+
+                 if response.status_code == 200:
+                     return True, response.json(), None
+                 elif response.status_code == 429:
+                     if attempt < max_retries:
+                         time.sleep(2)
+                         continue
+                     return False, None, "Rate limited"
+                 else:
+                     return False, None, f"HTTP {response.status_code}: {response.text[:200]}"
+
+         except httpx.TimeoutException:
+             if attempt < max_retries:
+                 time.sleep(1)
+                 continue
+             return False, None, f"Timeout after {AGENT_TIMEOUT_SECONDS}s"
+         except Exception as e:
+             if attempt < max_retries:
+                 time.sleep(1)
+                 continue
+             return False, None, str(e)
+
+     return False, None, "Max retries exceeded"
+
+
+ def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
+     """Extract text using the HF agent."""
+     try:
+         with open(file_path, 'rb') as f:
+             files = {'file': (file_path.name, f, 'application/pdf')}
+             data = {'filename': file_path.name}
+
+             success, response, error = call_agent_with_retry(TEXT_EXTRACTOR_URL, files=files, data=data)
+
+         if success and response:
+             text = response.get('text', '')
+             if not text or len(text.strip()) < 10:
+                 return False, None, "No text extracted"
+             return True, text, None
+         else:
+             return False, None, error or "Text extraction failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
+     """Extract tables using the HF agent."""
+     try:
+         with open(file_path, 'rb') as f:
+             files = {'file': (file_path.name, f, 'application/pdf')}
+             data = {'filename': file_path.name}
+
+             success, response, error = call_agent_with_retry(TABLE_EXTRACTOR_URL, files=files, data=data)
+
+         if success and response:
+             return True, response.get('tables', []), None
+         else:
+             return False, None, error or "Table extraction failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
+     """Extract entities using the NER agent."""
+     try:
+         success, response, error = call_agent_with_retry(NER_URL, json={'text': text})
+
+         if success and response:
+             return True, response.get('entities', []), None
+         else:
+             return False, None, error or "NER failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
+     """Classify a document using the classifier agent."""
+     try:
+         success, response, error = call_agent_with_retry(CLASSIFY_URL, json={'text': text[:2000]})
+
+         if success and response:
+             return True, response, None
+         else:
+             return False, None, error or "Classification failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
+     """Summarize text (optional)."""
+     if not SUMMARIZER_URL:
+         return True, None, None
+
+     try:
+         success, response, error = call_agent_with_retry(SUMMARIZER_URL, json={'text': text[:5000]})
+
+         if success and response:
+             return True, response.get('summary', ''), None
+         else:
+             return False, None, error or "Summarization failed"
+     except Exception as e:
+         return False, None, str(e)
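Each helper above returns a (success, payload, error) triple instead of raising, so callers branch on the flag. A minimal sketch chaining two of them (invoice.pdf is a placeholder path):

from pathlib import Path

from backend.app.utils.agent_client import (
    extract_entities_from_text,
    extract_text_from_file,
)

ok, text, err = extract_text_from_file(Path("invoice.pdf"))
if not ok:
    print(f"Text extraction failed: {err}")
else:
    ok, entities, err = extract_entities_from_text(text)
    print(entities if ok else f"NER failed: {err}")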
backend/app/wrappers/__init__.py ADDED
File without changes
backend/app/wrappers/gemini_output_filter.py ADDED
@@ -0,0 +1,349 @@
+ import json
+ import os
+ import re
+ import logging
+ import time
+ from typing import Dict, Optional
+
+ try:
+     import google.generativeai as genai
+ except ImportError:
+     raise ImportError("Install google-generativeai: pip install google-generativeai")
+
+ logger = logging.getLogger(__name__)
+
+
+ class GeminiOutputFilter:
+     """
+     Context-aware output filter that adapts to any invoice format.
+     No hardcoded field glossary - Gemini discovers fields dynamically.
+     WITH RATE LIMIT HANDLING
+     """
+
+     def __init__(self):
+         """Initialize the Gemini model"""
+         api_key = os.getenv('GEMINI_API_KEY')
+         if not api_key:
+             raise ValueError("GEMINI_API_KEY environment variable not set")
+
+         genai.configure(api_key=api_key)
+         self.model = genai.GenerativeModel('gemini-2.5-flash')
+
+         logger.info("✅ GeminiOutputFilter initialized")
+
+     def filter_output(self, user_message: str, full_extraction: Dict, max_retries: int = 3) -> Dict:
+         """
+         Filter the extraction based on the user message, with retry logic for rate limits.
+
+         Args:
+             user_message: What the user wants (e.g., "I need total and date")
+             full_extraction: Complete extraction from the agent (any format)
+             max_retries: Maximum number of retry attempts for rate limits
+
+         Returns:
+             Filtered result with only the requested fields
+         """
+         logger.info(f"🔍 Filtering request: '{user_message}'")
+         logger.info(f"📊 Available fields: {list(full_extraction.keys())}")
+
+         # Build the context-aware prompt
+         prompt = self._build_prompt(user_message, full_extraction)
+
+         for attempt in range(max_retries):
+             try:
+                 logger.info(f"🤖 Calling Gemini (attempt {attempt + 1}/{max_retries})...")
+
+                 # Call Gemini
+                 response = self.model.generate_content(prompt)
+                 response_text = response.text.strip()
+
+                 # Clean markdown fences if present
+                 response_text = response_text.replace('```json', '').replace('```', '').strip()
+
+                 # Parse JSON
+                 filtered_result = json.loads(response_text)
+
+                 logger.info(f"✅ Filtered result: {list(filtered_result.keys())}")
+                 return filtered_result
+
+             except json.JSONDecodeError as e:
+                 logger.error(f"❌ JSON parse error: {e}")
+                 logger.error(f"Response was: {response_text[:300]}")
+                 return {
+                     "_error": "Failed to parse AI response",
+                     "_debug": response_text[:300],
+                     "_fallback": full_extraction
+                 }
+
+             except Exception as e:
+                 error_msg = str(e)
+
+                 # Check if it's a rate limit error (429)
+                 is_rate_limit = (
+                     "429" in error_msg or
+                     "quota" in error_msg.lower() or
+                     "rate limit" in error_msg.lower() or
+                     "exceeded" in error_msg.lower()
+                 )
+
+                 if is_rate_limit:
+                     # Extract the wait time from the error message
+                     wait_time = self._extract_retry_delay(error_msg)
+
+                     if attempt < max_retries - 1:
+                         logger.warning(f"⚠️ Rate limit hit (attempt {attempt + 1}/{max_retries})")
+                         logger.info(f"⏳ Waiting {wait_time:.1f}s before retry...")
+                         time.sleep(wait_time)
+                         continue
+                     else:
+                         # Max retries exhausted
+                         logger.error(f"❌ Rate limit exceeded after {max_retries} attempts")
+                         logger.error(f"Full error: {error_msg}")
+                         return {
+                             "_error": f"Filtering failed: {error_msg}",
+                             "_fallback": full_extraction
+                         }
+                 else:
+                     # Non-rate-limit error - fail immediately
+                     logger.error(f"❌ Filtering failed: {e}")
+                     return {
+                         "_error": f"Filtering failed: {str(e)}",
+                         "_fallback": full_extraction
+                     }
+
+         # Should not be reached, but kept as a safety net
+         return {
+             "_error": "Max retries exceeded",
+             "_fallback": full_extraction
+         }
+
+     def _extract_retry_delay(self, error_message: str) -> float:
+         """
+         Extract the retry delay from a Gemini error message.
+
+         Gemini rate-limit errors include text like: "Please retry in 50.923950003s"
+         """
+         # Look for the pattern: "retry in X.Xs" or "retry in Xs"
+         match = re.search(r'retry in ([\d.]+)s', error_message, re.IGNORECASE)
+         if match:
+             retry_seconds = float(match.group(1))
+             # Add a small buffer (2 seconds) to be safe
+             wait_time = retry_seconds + 2
+             logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
+             return wait_time
+
+         # Look for alternative patterns in the error
+         match = re.search(r'(\d+)\s*(?:second|sec)', error_message, re.IGNORECASE)
+         if match:
+             retry_seconds = float(match.group(1))
+             wait_time = retry_seconds + 2
+             logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
+             return wait_time
+
+         # Default: fixed fallback wait (the attempt number is not available here)
+         default_wait = 10.0
+         logger.warning(f"⚠️ Could not extract retry delay, using default: {default_wait}s")
+         return default_wait
+
+     def _build_prompt(self, user_message: str, full_extraction: Dict) -> str:
+         """Build the context-aware Gemini prompt"""
+         return f"""You are an intelligent output filter for an invoice extraction system that handles invoices from MANY different companies with DIFFERENT formats and field names.
+
+ YOUR TASK:
+ Our agent has extracted data from an invoice. The fields extracted depend on the invoice format - different companies use different field names and structures. You need to understand what the user wants and map it to whatever fields are available in THIS specific extraction.
+
+ ====================
+ USER'S REQUEST:
+ ====================
+ "{user_message}"
+
+ ====================
+ EXTRACTED DATA (from this specific invoice):
+ ====================
+ {json.dumps(full_extraction, indent=2)}
+
+ ====================
+ YOUR JOB:
+ ====================
+ 1. ANALYZE the extracted fields to understand what data is available
+ 2. UNDERSTAND what the user is asking for
+ 3. MAP the user's request to the actual field names in this extraction
+ 4. RETURN only the fields the user requested
+
+ ====================
+ IMPORTANT CONTEXT AWARENESS:
+ ====================
+ Different invoices have different field names. You must be flexible and understand INTENT:
+
+ USER ASKS FOR "total" or "amount":
+ - Could be: total_open_amount, total, amount, grand_total, net_amount, invoice_total, final_amount, etc.
+ - Look for fields that contain: "total", "amount", "price", "sum", or numeric values that seem like totals
+
+ USER ASKS FOR "date":
+ - Could be: posting_date, invoice_date, date, issue_date, date_of_issue, created_date, created, etc.
+ - Look for fields with: "date", "created", "issue" or date-like values (YYYY-MM-DD format)
+
+ USER ASKS FOR "customer" or "client":
+ - Could be: cust_number, customer, client, customer_name, client_name, bill_to, buyer, purchaser, etc.
+ - Look for fields with: "cust", "client", "customer", "buyer", "bill", "purchaser"
+
+ USER ASKS FOR "invoice number":
+ - Could be: invoice_id, invoice_no, invoice_number, doc_no, document_number, reference, ref_no, doc_reference, etc.
+ - Look for fields with: "invoice", "doc", "number", "id", "ref", "reference"
+
+ USER ASKS FOR "payment terms":
+ - Could be: payment_terms, terms, due_terms, payment_conditions, net_terms, etc.
+ - Look for fields with: "payment", "terms", "due", "net"
+
+ ====================
+ STRATEGY:
+ ====================
+ 1. First, list out all available fields from the extraction
+ 2. For each field, infer what type of data it contains based on:
+    - Field name (does it contain keywords like "total", "date", "customer"?)
+    - Value type (is it a number? date? string?)
+    - Value content (does it look like money? a date? a name?)
+ 3. Match the user's request to the best-fitting available fields
+ 4. If multiple fields could match, pick the most likely one (e.g., "grand_total" over "subtotal")
+ 5. If NO fields match, explain what's available
+
+ ====================
+ FLEXIBILITY EXAMPLES:
+ ====================
+
+ Example 1 - Simple mapping:
+ Extraction: {{"total_amount": 500, "customer_name": "ABC Corp"}}
+ User: "show me total"
+ Response: {{"total_amount": 500}}
+
+ Example 2 - Different field name:
+ Extraction: {{"grand_total": 500, "bill_to": "ABC Corp"}}
+ User: "show me total"
+ Response: {{"grand_total": 500}}
+
+ Example 3 - User-friendly name:
+ Extraction: {{"invoice_amt": 500, "client_id": "ABC Corp"}}
+ User: "what's the amount?"
+ Response: {{"amount": 500}}
+
+ Example 4 - Multiple fields requested:
+ Extraction: {{"total": 500, "date": "2024-01-01", "customer": "ABC"}}
+ User: "I need total and date"
+ Response: {{"total": 500, "date": "2024-01-01"}}
+
+ Example 5 - Extract all:
+ User: "extract all information" OR "give me everything" OR "show full data"
+ Response: {{entire extraction unchanged}}
+
+ Example 6 - Field not found:
+ Extraction: {{"total": 500, "date": "2024-01-01"}}
+ User: "show me shipping address"
+ Response: {{
+   "_error": "No shipping address found in this invoice",
+   "_available_fields": {{
+     "total": "appears to be invoice total amount",
+     "date": "appears to be invoice date"
+   }},
+   "_suggestion": "Available data: total, date"
+ }}
+
+ ====================
+ RESPONSE FORMAT:
+ ====================
+ Return ONLY valid JSON, no markdown, no extra text.
+
+ If successful:
+ {{
+   "field_name": value
+ }}
+
+ You CAN rename fields to be user-friendly:
+ {{
+   "total": 500  // even if the original field was "invoice_amt"
+ }}
+
+ If the field is not found:
+ {{
+   "_error": "...",
+   "_available_fields": {{
+     "field1": "what this field appears to contain",
+     "field2": "what this field appears to contain"
+   }}
+ }}
+
+ ====================
+ CRITICAL RULES:
+ ====================
+ 1. DO NOT assume what fields exist. ONLY work with the fields present in the extraction JSON above.
+ 2. Be intelligent about inferring what each field means based on its name and value.
+ 3. If the user asks for "all" or "everything", return the ENTIRE extraction unchanged.
+ 4. Always return valid JSON only - no explanations outside the JSON.
+
+ Now process the user's request."""
+
+     def analyze_extraction(self, extraction: Dict) -> Dict:
+         """
+         Optional utility: get Gemini's analysis of what the fields mean.
+         Useful for debugging or showing users what's available.
+         """
+         prompt = f"""Analyze this invoice extraction and explain what each field likely contains:
+
+ {json.dumps(extraction, indent=2)}
+
+ For each field, provide:
+ - Field name
+ - Likely meaning (what data it contains)
+ - Data type
+ - User-friendly name suggestion
+
+ Return as JSON:
+ {{
+   "field_name": {{
+     "meaning": "...",
+     "type": "...",
+     "user_friendly_name": "..."
+   }}
+ }}"""
+
+         try:
+             response = self.model.generate_content(prompt)
+             response_text = response.text.strip().replace('```json', '').replace('```', '')
+             analysis = json.loads(response_text)
+             return analysis
+         except Exception as e:
+             logger.error(f"Analysis failed: {e}")
+             return {"error": f"Could not analyze extraction: {str(e)}"}
+
+
+ # ============================================
+ # Usage Example (for testing)
+ # ============================================
+
+ if __name__ == "__main__":
+     # Test the wrapper with retry logic
+     extraction = {
+         "cust_number": "Martinez Rosales, An",
+         "posting_date": "2015-07-21",
+         "total_open_amount": 442.93,
+         "business_code": "U001",
+         "cust_payment_terms": "NAH4"
+     }
+
+     wrapper = GeminiOutputFilter()
+
+     print("\n" + "=" * 60)
+     print("TEST: User asks 'show me who the customer is'")
+     print("=" * 60)
+     result = wrapper.filter_output("show me who the customer is", extraction, max_retries=3)
+     print(f"Result: {json.dumps(result, indent=2)}")
backend/database/__init__.py ADDED
File without changes
backend/database/migration_ingest_v1.sql ADDED
@@ -0,0 +1,67 @@
+ -- ============================================
+ -- Minimal Ingest Pipeline Tables
+ -- Version: 1.0 (Idempotent)
+ -- ============================================
+
+ -- Table 1: ingest_jobs (job tracking)
+ CREATE TABLE IF NOT EXISTS ingest_jobs (
+     job_id TEXT PRIMARY KEY,
+     doc_id INTEGER,
+     filename TEXT NOT NULL,
+     status TEXT NOT NULL DEFAULT 'queued',
+     error_text TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Drop indexes if they exist, then recreate
+ DROP INDEX IF EXISTS idx_ingest_jobs_status;
+ DROP INDEX IF EXISTS idx_ingest_jobs_created;
+ CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
+ CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);
+
+ -- Table 2: documents (file metadata)
+ CREATE TABLE IF NOT EXISTS documents (
+     doc_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     job_id TEXT NOT NULL,
+     path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     content_type TEXT NOT NULL,
+     uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+ );
+
+ DROP INDEX IF EXISTS idx_documents_job_id;
+ CREATE INDEX idx_documents_job_id ON documents(job_id);
+
+ -- Table 3: extractions (agent artifacts)
+ CREATE TABLE IF NOT EXISTS extractions (
+     doc_id INTEGER PRIMARY KEY,
+     raw_text TEXT,
+     tables_json TEXT,
+     entities_json TEXT,
+     classification_json TEXT,
+     summary_text TEXT,
+     extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Table 4: invoice_fields (mapped fields for prediction)
+ CREATE TABLE IF NOT EXISTS invoice_fields (
+     invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     doc_id INTEGER NOT NULL,
+     cust_number TEXT,
+     posting_date TEXT,
+     total_open_amount REAL,
+     business_code TEXT,
+     cust_payment_terms TEXT,
+     invoice_currency TEXT DEFAULT 'USD',
+     due_in_date TEXT,
+     confidence_map TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
+ CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);
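Note: this v1 schema declares doc_id as INTEGER (with documents.doc_id as an AUTOINCREMENT key), while app.py generates string ids of the form doc_<12 hex chars>. SQLite's type affinity would still store those strings, but the v2 migration below switches doc_id to TEXT so the declared types match the data.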
backend/database/migration_ingest_v2.sql ADDED
@@ -0,0 +1,63 @@
+ -- ============================================
+ -- Invoice Ingest Pipeline - Complete Schema
+ -- Version: 2.0
+ -- ============================================
+
+ -- Table 1: ingest_jobs
+ CREATE TABLE IF NOT EXISTS ingest_jobs (
+     job_id TEXT PRIMARY KEY,
+     doc_id TEXT,
+     filename TEXT NOT NULL,
+     status TEXT NOT NULL DEFAULT 'queued',
+     error_text TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+ );
+
+ DROP INDEX IF EXISTS idx_ingest_jobs_status;
+ DROP INDEX IF EXISTS idx_ingest_jobs_created;
+ CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
+ CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);
+
+ -- Table 2: documents
+ CREATE TABLE IF NOT EXISTS documents (
+     doc_id TEXT PRIMARY KEY,
+     job_id TEXT NOT NULL,
+     path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     content_type TEXT NOT NULL,
+     uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+ );
+
+ DROP INDEX IF EXISTS idx_documents_job_id;
+ CREATE INDEX idx_documents_job_id ON documents(job_id);
+
+ -- Table 3: extractions
+ CREATE TABLE IF NOT EXISTS extractions (
+     doc_id TEXT PRIMARY KEY,
+     raw_text TEXT,
+     tables_json TEXT,
+     entities_json TEXT,
+     classification_json TEXT,
+     summary_text TEXT,
+     extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Table 4: invoice_fields
+ CREATE TABLE IF NOT EXISTS invoice_fields (
+     invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     doc_id TEXT NOT NULL,
+     cust_number TEXT,
+     posting_date TEXT,
+     total_open_amount REAL,
+     business_code TEXT,
+     cust_payment_terms TEXT,
+     confidence_map TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
+ CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);
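Sketch of applying this migration from Python, mirroring the Dockerfile step that seeds the schema with the sqlite3 CLI (the paths are the container defaults and an assumption for local runs):

import sqlite3

conn = sqlite3.connect("/app/data/invoices.db")
with open("backend/database/migration_ingest_v2.sql") as f:
    # executescript runs the whole multi-statement migration in one call.
    conn.executescript(f.read())
conn.commit()
conn.close()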
backend/database/queries.sql ADDED
@@ -0,0 +1,354 @@
1
+ -- Fix for the overdue percentage calculation
2
+ -- Original line 32 had syntax error
3
+
4
+ -- CORRECTED Query 1:
5
+ WITH customer_stats AS (
6
+ SELECT
7
+ cust_number,
8
+ COUNT(*) as total_invoices,
9
+ COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,
10
+
11
+ AVG(days_to_clear) as avg_days,
12
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
13
+ STDDEV(days_to_clear) as std_days,
14
+ MIN(days_to_clear) as min_days,
15
+ MAX(days_to_clear) as max_days,
16
+
17
+ AVG(total_open_amount) as avg_amount,
18
+ SUM(total_open_amount) as total_amount,
19
+
20
+ -- FIXED: Overdue percentage calculation
21
+ CASE
22
+ WHEN COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
23
+ THEN (CAST(COUNT(CASE WHEN is_overdue = TRUE THEN 1 END) AS NUMERIC) /
24
+ CAST(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) AS NUMERIC) * 100)
25
+ ELSE 0.0
26
+ END as pct_overdue,
27
+
28
+ (SELECT cust_payment_terms FROM invoices_history WHERE cust_number = $1 GROUP BY cust_payment_terms ORDER BY COUNT(*) DESC LIMIT 1) as most_common_payment_term,
29
+ (SELECT business_code FROM invoices_history WHERE cust_number = $1 GROUP BY business_code ORDER BY COUNT(*) DESC LIMIT 1) as most_common_business_code,
30
+ (SELECT invoice_currency FROM invoices_history WHERE cust_number = $1 GROUP BY invoice_currency ORDER BY COUNT(*) DESC LIMIT 1) as most_common_currency
31
+
32
+ FROM invoices_history
33
+ WHERE cust_number = $1
34
+ GROUP BY cust_number
35
+ )
36
+ SELECT
37
+ cust_number,
38
+ total_invoices as cust_invoice_count,
39
+ cleared_count as cust_cleared_count,
40
+ ROUND(avg_days, 2) as cust_avg_days,
41
+ ROUND(median_days, 2) as cust_median_days,
42
+ ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
43
+ min_days as cust_min_days,
44
+ max_days as cust_max_days,
45
+ ROUND(avg_amount, 2) as cust_avg_amount,
46
+ ROUND(total_amount, 2) as cust_total_amount,
47
+ ROUND(pct_overdue, 2) as cust_pct_overdue,
48
+ most_common_payment_term,
49
+ most_common_business_code,
50
+ most_common_currency
51
+ FROM customer_stats;
52
+ -- ============================================
53
+ -- QUERY 2: Batch Compute All Customer Aggregates
54
+ -- Usage: Nightly ETL job
55
+ -- ============================================
56
+
57
+ -- Name: compute_all_customer_aggregates
58
+ -- Description: Computes aggregates for ALL customers with cleared invoices
59
+ WITH customer_stats AS (
60
+ SELECT
61
+ cust_number,
62
+ COUNT(*) as total_invoices,
63
+ COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,
64
+
65
+ AVG(days_to_clear) as avg_days,
66
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
67
+ STDDEV(days_to_clear) as std_days,
68
+ MIN(days_to_clear) as min_days,
69
+ MAX(days_to_clear) as max_days,
70
+
71
+ AVG(total_open_amount) as avg_amount,
72
+ SUM(total_open_amount) as total_amount,
73
+
74
+ COUNT(CASE WHEN is_overdue = TRUE THEN 1 END)::NUMERIC /
75
+ NULLIF(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END), 0) * 100 as pct_overdue,
76
+
77
+ MODE() WITHIN GROUP (ORDER BY cust_payment_terms) as most_common_payment_term,
78
+ MODE() WITHIN GROUP (ORDER BY business_code) as most_common_business_code,
79
+ MODE() WITHIN GROUP (ORDER BY invoice_currency) as most_common_currency
80
+
81
+ FROM invoices_history
82
+ WHERE clear_date IS NOT NULL -- Only customers with history
83
+ GROUP BY cust_number
84
+ HAVING COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
85
+ )
86
+ SELECT
87
+ cust_number,
88
+ total_invoices as cust_invoice_count,
89
+ cleared_count as cust_cleared_count,
90
+ ROUND(avg_days, 2) as cust_avg_days,
91
+ ROUND(median_days, 2) as cust_median_days,
92
+ ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
93
+ min_days as cust_min_days,
94
+ max_days as cust_max_days,
95
+ ROUND(avg_amount, 2) as cust_avg_amount,
96
+ ROUND(total_amount, 2) as cust_total_amount,
97
+ ROUND(COALESCE(pct_overdue, 0), 2) as cust_pct_overdue,
98
+ most_common_payment_term,
99
+ most_common_business_code,
100
+ most_common_currency,
101
+ NOW() as last_computed_at
102
+ FROM customer_stats;
103
+
104
+ -- ============================================
105
+ -- QUERY 3: Upsert Customer Aggregates
106
+ -- Usage: Insert or update customer_aggregates table
107
+ -- ============================================
108
+
109
+ -- Name: upsert_customer_aggregates
110
+ -- Description: Insert/update aggregates with conflict handling
111
+ -- Parameters: All customer aggregate fields
112
+ INSERT INTO customer_aggregates (
113
+ cust_number,
114
+ cust_invoice_count,
115
+ cust_cleared_count,
116
+ cust_avg_days,
117
+ cust_median_days,
118
+ cust_std_days,
119
+ cust_min_days,
120
+ cust_max_days,
121
+ cust_avg_amount,
122
+ cust_total_amount,
123
+ cust_pct_overdue,
124
+ most_common_payment_term,
125
+ most_common_business_code,
126
+ most_common_currency,
127
+ last_computed_at
128
+ ) VALUES (
129
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW()
130
+ )
131
+ ON CONFLICT (cust_number)
132
+ DO UPDATE SET
133
+ cust_invoice_count = EXCLUDED.cust_invoice_count,
134
+ cust_cleared_count = EXCLUDED.cust_cleared_count,
135
+ cust_avg_days = EXCLUDED.cust_avg_days,
136
+ cust_median_days = EXCLUDED.cust_median_days,
137
+ cust_std_days = EXCLUDED.cust_std_days,
138
+ cust_min_days = EXCLUDED.cust_min_days,
139
+ cust_max_days = EXCLUDED.cust_max_days,
140
+ cust_avg_amount = EXCLUDED.cust_avg_amount,
141
+ cust_total_amount = EXCLUDED.cust_total_amount,
142
+ cust_pct_overdue = EXCLUDED.cust_pct_overdue,
143
+ most_common_payment_term = EXCLUDED.most_common_payment_term,
144
+ most_common_business_code = EXCLUDED.most_common_business_code,
145
+ most_common_currency = EXCLUDED.most_common_currency,
146
+ last_computed_at = NOW();
147
+
148
+ -- ============================================
149
+ -- QUERY 4: Compute Payment Terms Aggregates
150
+ -- Usage: Pre-compute payment term statistics
151
+ -- ============================================
152
+
153
+ -- Name: compute_payment_terms_aggregates
154
+ WITH payment_stats AS (
155
+ SELECT
156
+ cust_payment_terms,
157
+ AVG(days_to_clear) as avg_days,
158
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
159
+ COUNT(*) as invoice_count
160
+ FROM invoices_history
161
+ WHERE clear_date IS NOT NULL
162
+ AND cust_payment_terms IS NOT NULL
163
+ GROUP BY cust_payment_terms
164
+ )
165
+ SELECT
166
+ cust_payment_terms,
167
+ ROUND(avg_days, 2) as payment_terms_avg_days,
168
+ ROUND(median_days::NUMERIC, 2) as payment_terms_median_days,
169
+ invoice_count as payment_terms_count,
170
+ NOW() as last_computed_at
171
+ FROM payment_stats;
172
+
173
+ -- ============================================
174
+ -- QUERY 5: Compute Business Code Aggregates
175
+ -- Usage: Pre-compute business code statistics
176
+ -- ============================================
177
+
178
+ -- Name: compute_business_code_aggregates
179
+ WITH business_stats AS (
180
+ SELECT
181
+ business_code,
182
+ AVG(days_to_clear) as avg_days,
183
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
184
+ COUNT(*) as invoice_count
185
+ FROM invoices_history
186
+ WHERE clear_date IS NOT NULL
187
+ AND business_code IS NOT NULL
188
+ GROUP BY business_code
189
+ )
190
+ SELECT
191
+ business_code,
192
+ ROUND(avg_days, 2) as business_avg_days,
193
+ ROUND(median_days::NUMERIC, 2) as business_median_days,
194
+ invoice_count as business_count,
195
+ NOW() as last_computed_at
196
+ FROM business_stats;
197
+
198
+ -- ============================================
199
+ -- QUERY 6: Get Customer Features (for inference)
200
+ -- Usage: Retrieve all features for a customer
201
+ -- ============================================
202
+
203
+ -- Name: get_customer_features
204
+ -- Description: Get customer aggregates for prediction
205
+ -- Parameters: $1 = cust_number
206
+ SELECT
207
+ cust_number,
208
+ cust_invoice_count,
209
+ cust_cleared_count,
210
+ cust_avg_days,
211
+ cust_median_days,
212
+ cust_std_days,
213
+ cust_min_days,
214
+ cust_max_days,
215
+ cust_avg_amount,
216
+ cust_total_amount,
217
+ cust_pct_overdue,
218
+ most_common_payment_term,
219
+ most_common_business_code,
220
+ most_common_currency,
221
+ last_computed_at
222
+ FROM customer_aggregates
223
+ WHERE cust_number = $1;
224
+
225
+ -- ============================================
226
+ -- QUERY 7: Get Payment Terms Features
227
+ -- Usage: Retrieve payment term stats
228
+ -- ============================================
229
+
230
+ -- Name: get_payment_terms_features
231
+ -- Parameters: $1 = cust_payment_terms
232
+ SELECT
233
+ cust_payment_terms,
234
+ payment_terms_avg_days,
235
+ payment_terms_median_days,
236
+ payment_terms_count
237
+ FROM payment_terms_aggregates
238
+ WHERE cust_payment_terms = $1;
239
+
240
+ -- ============================================
241
+ -- QUERY 8: Get Business Code Features
242
+ -- Usage: Retrieve business code stats
243
+ -- ============================================
244
+
245
+ -- Name: get_business_code_features
246
+ -- Parameters: $1 = business_code
247
+ SELECT
248
+ business_code,
249
+ business_avg_days,
250
+ business_median_days,
251
+ business_count
252
+ FROM business_code_aggregates
253
+ WHERE business_code = $1;
254
+
255
+ -- ============================================
256
+ -- QUERY 9: Insert Invoice (with upsert)
257
+ -- Usage: Ingest new invoice data
258
+ -- ============================================
259
+
260
+ -- Name: upsert_invoice
261
+ -- Parameters: All invoice fields
262
+ INSERT INTO invoices_history (
263
+ invoice_id,
264
+ business_code,
265
+ cust_number,
266
+ name_customer,
267
+ posting_date,
268
+ document_create_date,
269
+ document_create_date_alt,
270
+ due_in_date,
271
+ baseline_create_date,
272
+ clear_date,
273
+ total_open_amount,
274
+ invoice_currency,
275
+ document_type,
276
+ cust_payment_terms,
277
+ posting_id,
278
+ business_year
279
+ ) VALUES (
280
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16
281
+ )
282
+ ON CONFLICT (invoice_id)
283
+ DO UPDATE SET
284
+ clear_date = EXCLUDED.clear_date,
285
+ is_open = CASE WHEN EXCLUDED.clear_date IS NULL THEN 1 ELSE 0 END,  -- is_open is absent from the insert column list, so EXCLUDED.is_open would only yield the column default; derive it from clear_date instead
286
+ updated_at = NOW();
287
+
288
+ -- ============================================
289
+ -- QUERY 10: Insert Prediction Log
290
+ -- Usage: Record prediction for monitoring
291
+ -- ============================================
292
+
293
+ -- Name: insert_prediction_log
294
+ -- Parameters: prediction fields
295
+ INSERT INTO predictions_log (
296
+ invoice_id,
297
+ cust_number,
298
+ posting_date,
299
+ total_open_amount,
300
+ business_code,
301
+ cust_payment_terms,
302
+ features_json,
303
+ predicted_days_to_clear,
304
+ predicted_clear_date,
305
+ model_version,
306
+ model_path
307
+ ) VALUES (
308
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11
309
+ ) RETURNING prediction_id;
310
+
311
+ -- ============================================
312
+ -- QUERY 11: Update Prediction with Actual Outcome
313
+ -- Usage: Record actual outcome for model monitoring
314
+ -- ============================================
315
+
316
+ -- Name: update_prediction_outcome
317
+ -- Parameters: $1 = prediction_id, $2 = actual_clear_date
318
+ UPDATE predictions_log
319
+ SET
320
+ actual_clear_date = $2,
321
+ actual_days_to_clear = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER,
322
+ prediction_error = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear,
323
+ absolute_error = ABS(EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear),
324
+ outcome_recorded_at = NOW()
325
+ WHERE prediction_id = $1;
326
+
327
+ -- ============================================
328
+ -- QUERY 12: Get Recent Predictions Performance
329
+ -- Usage: Monitor model accuracy
330
+ -- ============================================
331
+
332
+ -- Name: get_prediction_metrics
333
+ -- Description: Calculate model performance over last N days
334
+ -- Parameters: $1 = days_back (e.g., 30)
335
+ SELECT
336
+ COUNT(*) as total_predictions,
337
+ COUNT(actual_days_to_clear) as predictions_with_outcome,
338
+ ROUND(AVG(ABS(prediction_error))::NUMERIC, 2) as mae,
339
+ ROUND(SQRT(AVG(prediction_error * prediction_error))::NUMERIC, 2) as rmse,  -- cast to NUMERIC: two-argument ROUND is not defined for double precision in PostgreSQL
340
+ ROUND(AVG(CASE
341
+ WHEN ABS(prediction_error) <= 3 THEN 1.0
342
+ ELSE 0.0
343
+ END) * 100, 2) as pct_within_3_days,
344
+ ROUND(AVG(CASE
345
+ WHEN ABS(prediction_error) <= 7 THEN 1.0
346
+ ELSE 0.0
347
+ END) * 100, 2) as pct_within_7_days
348
+ FROM predictions_log
349
+ WHERE predicted_at >= NOW() - make_interval(days => $1)  -- a bind parameter cannot appear inside an INTERVAL string literal
350
+ AND actual_days_to_clear IS NOT NULL;
351
+
352
+ -- ============================================
353
+ -- End of Query Templates
354
+ -- ============================================
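These templates use libpq-style $1 placeholders, so they can be passed to asyncpg as-is; with the psycopg2 pin in requirements.txt they first have to be rewritten to %s. A minimal sketch of a template loader, assuming the templates live in a queries.sql file (the filename is an assumption) and a DATABASE_URL environment variable:

import os
import re
import psycopg2

def load_template(path: str, name: str) -> str:
    """Extract the SQL that follows a '-- Name: <name>' marker, up to the first ';'."""
    text = open(path).read()
    match = re.search(rf"-- Name: {name}\b(.*?;)", text, re.S)
    if not match:
        raise KeyError(f"template not found: {name}")
    # psycopg2 expects %s placeholders; this naive rewrite is only safe for
    # templates whose parameters appear in ascending order exactly once
    # (e.g. get_customer_features); update_prediction_outcome reuses $2 and
    # would need named placeholders instead
    return re.sub(r"\$\d+", "%s", match.group(1))

conn = psycopg2.connect(os.environ["DATABASE_URL"])
with conn, conn.cursor() as cur:
    cur.execute(load_template("queries.sql", "get_customer_features"), ("0200769623",))
    row = cur.fetchone()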
backend/database/schema_sqlite.sql ADDED
@@ -0,0 +1,132 @@
1
+ -- ============================================
2
+ -- Invoice Payment Prediction System - SQLite Schema
3
+ -- Version: 1.0 (SQLite)
4
+ -- ============================================
5
+
6
+ -- Drop existing tables
7
+ DROP TABLE IF EXISTS predictions_log;
8
+ DROP TABLE IF EXISTS business_code_aggregates;
9
+ DROP TABLE IF EXISTS payment_terms_aggregates;
10
+ DROP TABLE IF EXISTS customer_aggregates;
11
+ DROP TABLE IF EXISTS invoices_history;
12
+
13
+ -- ============================================
14
+ -- Table 1: invoices_history
15
+ -- ============================================
16
+ CREATE TABLE invoices_history (
17
+ invoice_id INTEGER PRIMARY KEY,
18
+ business_code TEXT NOT NULL,
19
+ cust_number TEXT NOT NULL,
20
+ name_customer TEXT,
21
+
22
+ -- Dates (stored as TEXT in ISO format: YYYY-MM-DD HH:MM:SS)
23
+ posting_date TEXT NOT NULL,
24
+ document_create_date TEXT,
25
+ document_create_date_alt TEXT,
26
+ due_in_date TEXT,
27
+ baseline_create_date TEXT,
28
+ clear_date TEXT,
29
+
30
+ -- Financial
31
+ total_open_amount REAL NOT NULL,
32
+ invoice_currency TEXT DEFAULT 'USD',
33
+
34
+ -- Metadata
35
+ document_type TEXT,
36
+ cust_payment_terms TEXT,
37
+ posting_id REAL,
38
+ is_open INTEGER DEFAULT 1,
39
+ business_year INTEGER,
40
+
41
+ -- Computed fields
42
+ days_to_clear INTEGER,
43
+ days_posting_to_due INTEGER,
44
+ days_create_to_posting INTEGER,
45
+ days_baseline_to_posting INTEGER,
46
+ is_overdue INTEGER DEFAULT 0,
47
+
48
+ -- Audit
49
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
50
+ updated_at TEXT DEFAULT CURRENT_TIMESTAMP
51
+ );
52
+
53
+ CREATE INDEX idx_invoices_cust ON invoices_history(cust_number);
54
+ CREATE INDEX idx_invoices_posting ON invoices_history(posting_date);
55
+ CREATE INDEX idx_invoices_cleared ON invoices_history(cust_number, posting_date) WHERE clear_date IS NOT NULL;
56
+
57
+ -- ============================================
58
+ -- Table 2: customer_aggregates
59
+ -- ============================================
60
+ CREATE TABLE customer_aggregates (
61
+ cust_number TEXT PRIMARY KEY,
62
+ cust_invoice_count INTEGER DEFAULT 0,
63
+ cust_cleared_count INTEGER DEFAULT 0,
64
+
65
+ cust_avg_days REAL,
66
+ cust_median_days REAL,
67
+ cust_std_days REAL,
68
+ cust_min_days INTEGER,
69
+ cust_max_days INTEGER,
70
+
71
+ cust_avg_amount REAL,
72
+ cust_total_amount REAL,
73
+ cust_pct_overdue REAL DEFAULT 0.0,
74
+
75
+ most_common_payment_term TEXT,
76
+ most_common_business_code TEXT,
77
+ most_common_currency TEXT,
78
+
79
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP,
80
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP
81
+ );
82
+
83
+ -- ============================================
84
+ -- Table 3: payment_terms_aggregates
85
+ -- ============================================
86
+ CREATE TABLE payment_terms_aggregates (
87
+ cust_payment_terms TEXT PRIMARY KEY,
88
+ payment_terms_avg_days REAL,
89
+ payment_terms_median_days REAL,
90
+ payment_terms_count INTEGER DEFAULT 0,
91
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
92
+ );
93
+
94
+ -- ============================================
95
+ -- Table 4: business_code_aggregates
96
+ -- ============================================
97
+ CREATE TABLE business_code_aggregates (
98
+ business_code TEXT PRIMARY KEY,
99
+ business_avg_days REAL,
100
+ business_median_days REAL,
101
+ business_count INTEGER DEFAULT 0,
102
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
103
+ );
104
+
105
+ -- ============================================
106
+ -- Table 5: predictions_log
107
+ -- ============================================
108
+ CREATE TABLE predictions_log (
109
+ prediction_id INTEGER PRIMARY KEY AUTOINCREMENT,
110
+ invoice_id INTEGER,
111
+ cust_number TEXT NOT NULL,
112
+ posting_date TEXT NOT NULL,
113
+ total_open_amount REAL NOT NULL,
114
+ business_code TEXT,
115
+ cust_payment_terms TEXT,
116
+
117
+ predicted_days_to_clear REAL NOT NULL,
118
+ predicted_clear_date TEXT NOT NULL,
119
+
120
+ model_version TEXT,
121
+ features_json TEXT,
122
+
123
+ actual_clear_date TEXT,
124
+ actual_days_to_clear INTEGER,
125
+ prediction_error REAL,
126
+ absolute_error REAL,
127
+
128
+ predicted_at TEXT DEFAULT CURRENT_TIMESTAMP
129
+ );
130
+
131
+ CREATE INDEX idx_predictions_cust ON predictions_log(cust_number);
132
+ CREATE INDEX idx_predictions_date ON predictions_log(predicted_at);
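A minimal sketch of applying this schema with the stdlib sqlite3 module, mirroring the Dockerfile's sqlite3-based bootstrap; the paths are assumptions based on the repo layout:

import sqlite3
from pathlib import Path

db_path = Path("data") / "invoices.db"
schema_path = Path("backend") / "database" / "schema_sqlite.sql"

conn = sqlite3.connect(str(db_path))
try:
    # executescript runs the whole DROP TABLE / CREATE TABLE / CREATE INDEX batch
    conn.executescript(schema_path.read_text())
    conn.commit()
finally:
    conn.close()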
backend/etl/__init__.py ADDED
File without changes
backend/etl/update_customer_aggregates_sqlite.py ADDED
@@ -0,0 +1,189 @@
1
+
2
+
3
+ import sqlite3
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ from filelock import FileLock
7
+ from datetime import datetime
8
+
9
+ DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
10
+ LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"
11
+
12
+
13
+ def get_most_common(series):
14
+ """Get mode (most common value)."""
15
+ if series.empty:
16
+ return None
17
+ return series.mode()[0] if not series.mode().empty else None
18
+
19
+
20
+ def update_customer_aggregates():
21
+ """Compute and update customer aggregates."""
22
+
23
+ print("🔄 Starting customer aggregates computation...")
24
+
25
+ with FileLock(str(LOCK_PATH)):
26
+ conn = sqlite3.connect(str(DB_PATH))
27
+
28
+ # Load cleared invoices
29
+ df = pd.read_sql_query("""
30
+ SELECT
31
+ cust_number,
32
+ days_to_clear,
33
+ total_open_amount,
34
+ is_overdue,
35
+ cust_payment_terms,
36
+ business_code,
37
+ invoice_currency
38
+ FROM invoices_history
39
+ WHERE clear_date IS NOT NULL
40
+ """, conn)
41
+
42
+ if df.empty:
43
+ print("⚠️ No cleared invoices found")
44
+ conn.close()
45
+ return
46
+
47
+ print(f"📊 Processing {len(df)} cleared invoices...")
48
+
49
+ # Compute aggregates per customer
50
+ agg_results = []
51
+
52
+ for cust_number, group in df.groupby('cust_number'):
53
+ agg = {
54
+ 'cust_number': cust_number,
55
+ 'cust_invoice_count': len(group),
56
+ 'cust_cleared_count': len(group),
57
+ 'cust_avg_days': round(group['days_to_clear'].mean(), 2),
58
+ 'cust_median_days': round(group['days_to_clear'].median(), 2),
59
+ 'cust_std_days': round(group['days_to_clear'].std(), 2) if len(group) > 1 else 0.0,
60
+ 'cust_min_days': int(group['days_to_clear'].min()),
61
+ 'cust_max_days': int(group['days_to_clear'].max()),
62
+ 'cust_avg_amount': round(group['total_open_amount'].mean(), 2),
63
+ 'cust_total_amount': round(group['total_open_amount'].sum(), 2),
64
+ 'cust_pct_overdue': round((group['is_overdue'].sum() / len(group)) * 100, 2),
65
+ 'most_common_payment_term': get_most_common(group['cust_payment_terms']),
66
+ 'most_common_business_code': get_most_common(group['business_code']),
67
+ 'most_common_currency': get_most_common(group['invoice_currency'])
68
+ }
69
+ agg_results.append(agg)
70
+
71
+ # Upsert into customer_aggregates
72
+ cursor = conn.cursor()
73
+ for agg in agg_results:
74
+ cursor.execute("""
75
+ INSERT OR REPLACE INTO customer_aggregates (
76
+ cust_number, cust_invoice_count, cust_cleared_count,
77
+ cust_avg_days, cust_median_days, cust_std_days,
78
+ cust_min_days, cust_max_days,
79
+ cust_avg_amount, cust_total_amount, cust_pct_overdue,
80
+ most_common_payment_term, most_common_business_code,
81
+ most_common_currency, last_computed_at
82
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
83
+ """, (
84
+ agg['cust_number'], agg['cust_invoice_count'], agg['cust_cleared_count'],
85
+ agg['cust_avg_days'], agg['cust_median_days'], agg['cust_std_days'],
86
+ agg['cust_min_days'], agg['cust_max_days'],
87
+ agg['cust_avg_amount'], agg['cust_total_amount'], agg['cust_pct_overdue'],
88
+ agg['most_common_payment_term'], agg['most_common_business_code'],
89
+ agg['most_common_currency']
90
+ ))
91
+
92
+ conn.commit()
93
+ print(f"✅ Updated {len(agg_results)} customer aggregates")
94
+
95
+ conn.close()
96
+
97
+
98
+ def update_payment_terms_aggregates():
99
+ """Compute and update payment terms aggregates."""
100
+
101
+ print("🔄 Computing payment terms aggregates...")
102
+
103
+ with FileLock(str(LOCK_PATH)):
104
+ conn = sqlite3.connect(str(DB_PATH))
105
+
106
+ df = pd.read_sql_query("""
107
+ SELECT cust_payment_terms, days_to_clear
108
+ FROM invoices_history
109
+ WHERE clear_date IS NOT NULL AND cust_payment_terms IS NOT NULL
110
+ """, conn)
111
+
112
+ if df.empty:
113
+ print("⚠️ No data for payment terms")
114
+ conn.close()
115
+ return
116
+
117
+ agg = df.groupby('cust_payment_terms')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
118
+ agg.columns = ['cust_payment_terms', 'payment_terms_avg_days', 'payment_terms_median_days', 'payment_terms_count']
119
+
120
+ cursor = conn.cursor()
121
+ for _, row in agg.iterrows():
122
+ cursor.execute("""
123
+ INSERT OR REPLACE INTO payment_terms_aggregates (
124
+ cust_payment_terms, payment_terms_avg_days, payment_terms_median_days,
125
+ payment_terms_count, last_computed_at
126
+ ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
127
+ """, (
128
+ row['cust_payment_terms'],
129
+ round(row['payment_terms_avg_days'], 2),
130
+ round(row['payment_terms_median_days'], 2),
131
+ int(row['payment_terms_count'])
132
+ ))
133
+
134
+ conn.commit()
135
+ print(f"✅ Updated {len(agg)} payment terms aggregates")
136
+ conn.close()
137
+
138
+
139
+ def update_business_code_aggregates():
140
+ """Compute and update business code aggregates."""
141
+
142
+ print("🔄 Computing business code aggregates...")
143
+
144
+ with FileLock(str(LOCK_PATH)):
145
+ conn = sqlite3.connect(str(DB_PATH))
146
+
147
+ df = pd.read_sql_query("""
148
+ SELECT business_code, days_to_clear
149
+ FROM invoices_history
150
+ WHERE clear_date IS NOT NULL AND business_code IS NOT NULL
151
+ """, conn)
152
+
153
+ if df.empty:
154
+ print("⚠️ No data for business codes")
155
+ conn.close()
156
+ return
157
+
158
+ agg = df.groupby('business_code')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
159
+ agg.columns = ['business_code', 'business_avg_days', 'business_median_days', 'business_count']
160
+
161
+ cursor = conn.cursor()
162
+ for _, row in agg.iterrows():
163
+ cursor.execute("""
164
+ INSERT OR REPLACE INTO business_code_aggregates (
165
+ business_code, business_avg_days, business_median_days,
166
+ business_count, last_computed_at
167
+ ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
168
+ """, (
169
+ row['business_code'],
170
+ round(row['business_avg_days'], 2),
171
+ round(row['business_median_days'], 2),
172
+ int(row['business_count'])
173
+ ))
174
+
175
+ conn.commit()
176
+ print(f"✅ Updated {len(agg)} business code aggregates")
177
+ conn.close()
178
+
179
+
180
+ if __name__ == "__main__":
181
+ print("="*60)
182
+ print("🚀 ETL: Updating Aggregates")
183
+ print("="*60)
184
+
185
+ update_customer_aggregates()
186
+ update_payment_terms_aggregates()
187
+ update_business_code_aggregates()
188
+
189
+ print("\n✅ All aggregates updated successfully!")
backend/feature_builder/__init__.py ADDED
File without changes
backend/feature_builder/feature_builder.py ADDED
@@ -0,0 +1,312 @@
1
+ """
2
+ Feature builder that matches ML training pipeline exactly.
3
+ Generates features for inference from invoice data + aggregates.
4
+ FIXED: Handles None values properly with robust defaults.
5
+ """
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+ from datetime import datetime
10
+ from typing import Dict, Optional
11
+
12
+
13
+ # Default values for new customers (from training)
14
+ DEFAULTS = {
15
+ 'cust_avg_days': 18.0,
16
+ 'cust_median_days': 15.0,
17
+ 'cust_std_days': 0.0,
18
+ 'cust_min_days': 12,
19
+ 'cust_max_days': 25,
20
+ 'cust_invoice_count': 1,
21
+ 'cust_avg_amount': 30000.0,
22
+ 'cust_total_amount': 30000.0,
23
+ 'cust_pct_overdue': 0.0,
24
+ 'payment_terms_avg_days': 15.0,
25
+ 'payment_terms_median_days': 15.0,
26
+ 'payment_terms_count': 100,
27
+ 'business_avg_days': 17.0,
28
+ 'business_median_days': 15.0,
29
+ 'business_count': 1000
30
+ }
31
+
32
+
33
+ def safe_float(value, default=0.0):
34
+ """Safely convert to float with default."""
35
+ if value is None:
36
+ return float(default)
37
+ try:
38
+ return float(value)
39
+ except (ValueError, TypeError):
40
+ return float(default)
41
+
42
+
43
+ def safe_int(value, default=0):
44
+ """Safely convert to int with default."""
45
+ if value is None:
46
+ return int(default)
47
+ try:
48
+ return int(value)
49
+ except (ValueError, TypeError):
50
+ return int(default)
51
+
52
+
53
+ def parse_date(date_str: str) -> datetime:
54
+ """Parse date string to datetime."""
55
+ if isinstance(date_str, datetime):
56
+ return date_str
57
+
58
+ for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
59
+ try:
60
+ return datetime.strptime(str(date_str), fmt)
61
+ except ValueError:
62
+ continue
63
+
64
+ raise ValueError(f"Cannot parse date: {date_str}")
65
+
66
+
67
+ def build_features(
68
+ invoice_data: Dict,
69
+ customer_agg: Optional[Dict] = None,
70
+ payment_terms_agg: Optional[Dict] = None,
71
+ business_code_agg: Optional[Dict] = None
72
+ ) -> Dict:
73
+ """
74
+ Build feature vector matching ML training pipeline.
75
+
76
+ Args:
77
+ invoice_data: Invoice details (posting_date, amount, etc.)
78
+ customer_agg: Customer aggregates from DB (or None for defaults)
79
+ payment_terms_agg: Payment terms aggregates from DB
80
+ business_code_agg: Business code aggregates from DB
81
+
82
+ Returns:
83
+ Dict of features ready for model.predict()
84
+ """
85
+
86
+ # Parse dates
87
+ posting_date = parse_date(invoice_data['posting_date'])
88
+
89
+ # Use provided aggregates or empty dicts (will use defaults)
90
+ cust_agg = customer_agg or {}
91
+ pmt_agg = payment_terms_agg or {}
92
+ biz_agg = business_code_agg or {}
93
+
94
+ # Build feature dictionary
95
+ features = {}
96
+
97
+ # ============================================
98
+ # Categorical Features (encoded as integers)
99
+ # ============================================
100
+
101
+ # Business code mapping
102
+ business_code = invoice_data.get('business_code', 'U001')
103
+ business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
104
+ features['business_code'] = business_code_map.get(business_code, 0)
105
+
106
+ # Payment terms (simplified hash encoding; note built-in hash() is salted per process unless PYTHONHASHSEED is fixed, so codes can differ across restarts)
107
+ payment_terms = invoice_data.get('cust_payment_terms', 'NAH4')
108
+ features['cust_payment_terms'] = abs(hash(payment_terms)) % 74
109
+
110
+ # Currency
111
+ currency_map = {'USD': 0, 'CAD': 1}
112
+ features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0)
113
+
114
+ # Document type
115
+ doc_type_map = {'RV': 0, 'AB': 1}
116
+ features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0)
117
+
118
+ # Amount category
119
+ amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
120
+ if amount < 5000:
121
+ amount_cat = 0 # small
122
+ elif amount < 20000:
123
+ amount_cat = 1 # medium
124
+ elif amount < 50000:
125
+ amount_cat = 2 # large
126
+ else:
127
+ amount_cat = 3 # very_large
128
+ features['amount_category'] = amount_cat
129
+
130
+ # ============================================
131
+ # Numerical Features
132
+ # ============================================
133
+
134
+ features['buisness_year'] = safe_float(invoice_data.get('business_year', posting_date.year))  # 'buisness_year' spelling is intentional: it matches the training column name
135
+ features['total_open_amount'] = amount
136
+ features['amount_log'] = float(np.log1p(amount))
137
+
138
+ # Temporal features
139
+ features['posting_year'] = posting_date.year
140
+ features['posting_month'] = posting_date.month
141
+ features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
142
+ features['posting_day'] = posting_date.day
143
+ features['posting_dayofweek'] = posting_date.weekday()
144
+ features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0
145
+ features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
146
+ features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0
147
+
148
+ # Days between dates
149
+ features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
150
+ features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
151
+ features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)
152
+
153
+ # Document create date alt (as integer YYYYMMDD)
154
+ doc_create_alt = invoice_data.get('document_create_date_alt')
155
+ if doc_create_alt:
156
+ try:
157
+ cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
158
+ features['document_create_date.1'] = int(cleaned)
159
+ except (TypeError, ValueError):
160
+ features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))
161
+ else:
162
+ features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))
163
+
164
+ # ============================================
165
+ # Customer Aggregates (with robust defaults)
166
+ # ============================================
167
+
168
+ features['cust_avg_days'] = safe_float(
169
+ cust_agg.get('cust_avg_days'),
170
+ DEFAULTS['cust_avg_days']
171
+ )
172
+ features['cust_median_days'] = safe_float(
173
+ cust_agg.get('cust_median_days'),
174
+ DEFAULTS['cust_median_days']
175
+ )
176
+ features['cust_std_days'] = safe_float(
177
+ cust_agg.get('cust_std_days'),
178
+ DEFAULTS['cust_std_days']
179
+ )
180
+ features['cust_min_days'] = safe_int(
181
+ cust_agg.get('cust_min_days'),
182
+ DEFAULTS['cust_min_days']
183
+ )
184
+ features['cust_max_days'] = safe_int(
185
+ cust_agg.get('cust_max_days'),
186
+ DEFAULTS['cust_max_days']
187
+ )
188
+ features['cust_invoice_count'] = safe_int(
189
+ cust_agg.get('cust_invoice_count'),
190
+ DEFAULTS['cust_invoice_count']
191
+ )
192
+ features['cust_avg_amount'] = safe_float(
193
+ cust_agg.get('cust_avg_amount'),
194
+ DEFAULTS['cust_avg_amount']
195
+ )
196
+ features['cust_total_amount'] = safe_float(
197
+ cust_agg.get('cust_total_amount'),
198
+ DEFAULTS['cust_total_amount']
199
+ )
200
+
201
+ # ============================================
202
+ # Payment Terms Aggregates
203
+ # ============================================
204
+
205
+ features['payment_terms_avg_days'] = safe_float(
206
+ pmt_agg.get('payment_terms_avg_days'),
207
+ DEFAULTS['payment_terms_avg_days']
208
+ )
209
+ features['payment_terms_median_days'] = safe_float(
210
+ pmt_agg.get('payment_terms_median_days'),
211
+ DEFAULTS['payment_terms_median_days']
212
+ )
213
+ features['payment_terms_count'] = safe_int(
214
+ pmt_agg.get('payment_terms_count'),
215
+ DEFAULTS['payment_terms_count']
216
+ )
217
+
218
+ # ============================================
219
+ # Business Code Aggregates
220
+ # ============================================
221
+
222
+ features['business_avg_days'] = safe_float(
223
+ biz_agg.get('business_avg_days'),
224
+ DEFAULTS['business_avg_days']
225
+ )
226
+ features['business_median_days'] = safe_float(
227
+ biz_agg.get('business_median_days'),
228
+ DEFAULTS['business_median_days']
229
+ )
230
+ features['business_count'] = safe_int(
231
+ biz_agg.get('business_count'),
232
+ DEFAULTS['business_count']
233
+ )
234
+
235
+ # ============================================
236
+ # Interaction Features
237
+ # ============================================
238
+
239
+ cust_avg_amt = features['cust_avg_amount']
240
+ if cust_avg_amt > 0:
241
+ features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
242
+ else:
243
+ features['amount_vs_cust_avg'] = 1.0
244
+
245
+ features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0
246
+
247
+ # ============================================
248
+ # Other required fields
249
+ # ============================================
250
+
251
+ features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
252
+ features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)
253
+
254
+ return features
255
+
256
+
257
+ def features_to_dataframe(features: Dict) -> pd.DataFrame:
258
+ """
259
+ Convert feature dict to DataFrame with correct column order.
260
+ Must match training feature order exactly.
261
+ """
262
+
263
+ # Expected column order from training
264
+ COLUMN_ORDER = [
265
+ 'business_code', 'buisness_year', 'document_create_date.1',
266
+ 'invoice_currency', 'document_type', 'total_open_amount',
267
+ 'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
268
+ 'posting_quarter', 'posting_day', 'posting_dayofweek',
269
+ 'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
270
+ 'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
271
+ 'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
272
+ 'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
273
+ 'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
274
+ 'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
275
+ 'business_median_days', 'business_count', 'amount_vs_cust_avg',
276
+ 'is_large_for_customer'
277
+ ]
278
+
279
+ # Ensure all columns present with safe defaults
280
+ for col in COLUMN_ORDER:
281
+ if col not in features:
282
+ features[col] = 0.0 # Fallback
283
+
284
+ # Create DataFrame with correct order
285
+ df = pd.DataFrame([features])[COLUMN_ORDER]
286
+
287
+ return df
288
+
289
+
290
+ if __name__ == "__main__":
291
+ # Test with minimal data
292
+ test_invoice = {
293
+ 'posting_date': '2024-01-15',
294
+ 'total_open_amount': 50000.0,
295
+ 'business_code': 'U001',
296
+ 'cust_payment_terms': 'NAH4',
297
+ 'invoice_currency': 'USD',
298
+ 'document_type': 'RV',
299
+ 'business_year': 2024,
300
+ 'days_posting_to_due': 15,
301
+ 'is_open': 1
302
+ }
303
+
304
+ # Test with no aggregates (should use defaults)
305
+ features = build_features(test_invoice, None, None, None)
306
+ df = features_to_dataframe(features)
307
+
308
+ print("✅ Features built successfully:")
309
+ print(f"Shape: {df.shape}")
310
+ print(f"Columns: {len(df.columns)}")
311
+ print(f"\nSample features:")
312
+ print(df[['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']].T)
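A hedged sketch of the inference path this feature builder feeds; the model filename is hypothetical, and the aggregate dicts would normally come from the SQLite lookup queries rather than literals:

import joblib

from backend.feature_builder.feature_builder import build_features, features_to_dataframe

model = joblib.load("models/lgbm_days_to_clear.joblib")  # hypothetical path

invoice = {
    "posting_date": "2024-01-15",
    "total_open_amount": 50000.0,
    "business_code": "U001",
    "cust_payment_terms": "NAH4",
}
customer_agg = {"cust_avg_days": 21.5, "cust_invoice_count": 12}  # from customer_aggregates

X = features_to_dataframe(build_features(invoice, customer_agg, None, None))
predicted_days = float(model.predict(X)[0])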
backend/ingest/__init__.py ADDED
File without changes
backend/ingest/ingest_invoice_sqlite.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Invoice ingestion helper for SQLite.
3
+ Handles insert/update with computed fields.
4
+ """
5
+
6
+ import sqlite3
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Dict, Optional
10
+ from filelock import FileLock
11
+
12
+ DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
13
+ LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"
14
+
15
+ def parse_date(date_input) -> Optional[str]:
16
+ """Convert various date formats to ISO string."""
17
+ if not date_input:
18
+ return None
19
+
20
+ if isinstance(date_input, str):
21
+ # Try parsing common formats
22
+ for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
23
+ try:
24
+ dt = datetime.strptime(date_input, fmt)
25
+ return dt.strftime("%Y-%m-%d %H:%M:%S")
26
+ except ValueError:
27
+ continue
28
+ return date_input # Return as-is if parsing fails
29
+
30
+ if isinstance(date_input, datetime):
31
+ return date_input.strftime("%Y-%m-%d %H:%M:%S")
32
+
33
+ return str(date_input)
34
+
35
+
36
+ def compute_days_diff(date1_str: Optional[str], date2_str: Optional[str]) -> Optional[int]:
37
+ """Compute day difference between two ISO date strings."""
38
+ if not date1_str or not date2_str:
39
+ return None
40
+
41
+ try:
42
+ d1 = datetime.strptime(date1_str, "%Y-%m-%d %H:%M:%S")
43
+ d2 = datetime.strptime(date2_str, "%Y-%m-%d %H:%M:%S")
44
+ return (d1 - d2).days
45
+ except:
46
+ return None
47
+
48
+
49
+ def ingest_invoice(invoice_data: Dict) -> Dict:
50
+ """
51
+ Insert or update invoice in SQLite with computed fields.
52
+
53
+ Args:
54
+ invoice_data: Dict with invoice fields
55
+
56
+ Returns:
57
+ Dict with status and invoice_id
58
+ """
59
+
60
+ # Parse dates
61
+ posting_date = parse_date(invoice_data.get("posting_date"))
62
+ clear_date = parse_date(invoice_data.get("clear_date"))
63
+ due_in_date = parse_date(invoice_data.get("due_in_date"))
64
+ document_create_date = parse_date(invoice_data.get("document_create_date"))
65
+ baseline_create_date = parse_date(invoice_data.get("baseline_create_date"))
66
+
67
+ # Compute derived fields
68
+ days_to_clear = compute_days_diff(clear_date, posting_date) if clear_date else None
69
+ days_posting_to_due = compute_days_diff(due_in_date, posting_date)
70
+ days_create_to_posting = compute_days_diff(posting_date, document_create_date)
71
+ days_baseline_to_posting = compute_days_diff(posting_date, baseline_create_date)
72
+
73
+ is_open = 0 if clear_date else 1
74
+ is_overdue = 0
75
+ if clear_date and due_in_date:
76
+ try:
77
+ cd = datetime.strptime(clear_date, "%Y-%m-%d %H:%M:%S")
78
+ dd = datetime.strptime(due_in_date, "%Y-%m-%d %H:%M:%S")
79
+ is_overdue = 1 if cd > dd else 0
80
+ except:
81
+ pass
82
+
83
+ # Prepare data
84
+ invoice_id = invoice_data.get("invoice_id")
85
+ if not invoice_id:
86
+ raise ValueError("invoice_id is required")
87
+
88
+ # SQLite write with lock
89
+ with FileLock(str(LOCK_PATH)):
90
+ conn = sqlite3.connect(str(DB_PATH))
91
+ cursor = conn.cursor()
92
+
93
+ cursor.execute("""
94
+ INSERT OR REPLACE INTO invoices_history (
95
+ invoice_id, business_code, cust_number, name_customer,
96
+ posting_date, document_create_date, document_create_date_alt,
97
+ due_in_date, baseline_create_date, clear_date,
98
+ total_open_amount, invoice_currency, document_type,
99
+ cust_payment_terms, posting_id, business_year,
100
+ days_to_clear, days_posting_to_due, days_create_to_posting,
101
+ days_baseline_to_posting, is_overdue, is_open,
102
+ updated_at
103
+ ) VALUES (
104
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
105
+ ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP
106
+ )
107
+ """, (
108
+ invoice_id,
109
+ invoice_data.get("business_code"),
110
+ invoice_data.get("cust_number"),
111
+ invoice_data.get("name_customer"),
112
+ posting_date,
113
+ document_create_date,
114
+ invoice_data.get("document_create_date_alt"),
115
+ due_in_date,
116
+ baseline_create_date,
117
+ clear_date,
118
+ invoice_data.get("total_open_amount"),
119
+ invoice_data.get("invoice_currency", "USD"),
120
+ invoice_data.get("document_type"),
121
+ invoice_data.get("cust_payment_terms"),
122
+ invoice_data.get("posting_id"),
123
+ invoice_data.get("business_year"),
124
+ days_to_clear,
125
+ days_posting_to_due,
126
+ days_create_to_posting,
127
+ days_baseline_to_posting,
128
+ is_overdue,
129
+ is_open
130
+ ))
131
+
132
+ conn.commit()
133
+ conn.close()
134
+
135
+ return {
136
+ "status": "success",
137
+ "invoice_id": invoice_id,
138
+ "is_open": bool(is_open),
139
+ "days_to_clear": days_to_clear
140
+ }
141
+
142
+
143
+ if __name__ == "__main__":
144
+ # Test
145
+ test_invoice = {
146
+ "invoice_id": 12345,
147
+ "business_code": "U001",
148
+ "cust_number": "0200769623",
149
+ "name_customer": "Test Customer",
150
+ "posting_date": "2024-01-15",
151
+ "clear_date": "2024-02-01",
152
+ "due_in_date": "2024-01-30",
153
+ "total_open_amount": 50000.0,
154
+ "cust_payment_terms": "NAH4"
155
+ }
156
+
157
+ result = ingest_invoice(test_invoice)
158
+ print(result)
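Because aggregates are refreshed only by the ETL job, a hedged sketch of chaining the two so a newly cleared invoice is reflected immediately:

from backend.etl.update_customer_aggregates_sqlite import update_customer_aggregates
from backend.ingest.ingest_invoice_sqlite import ingest_invoice

result = ingest_invoice({
    "invoice_id": 67890,
    "business_code": "U001",
    "cust_number": "0200769623",
    "posting_date": "2024-03-01",
    "clear_date": "2024-03-20",
    "due_in_date": "2024-03-16",
    "total_open_amount": 12500.0,
})
if result["status"] == "success":
    update_customer_aggregates()  # fold the new cleared invoice into the stats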
backend/worker/job_processor.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Background worker for processing ingest jobs.
3
+ Consumes jobs from Redis queue and processes them.
4
+ """
5
+
6
+ import sqlite3
7
+ import logging
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from filelock import FileLock
11
+ from typing import Dict, Optional
12
+ import traceback
13
+
14
+ from .text_extractor import extract_text
15
+
16
+ # Setup logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
20
+ )
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Paths
24
+ BASE_DIR = Path(__file__).parent.parent.parent
25
+ DB_PATH = BASE_DIR / "data" / "invoices.db"
26
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
27
+
28
+
29
+ def update_job_status(job_id: str, status: str, error_message: Optional[str] = None):
30
+ """Update job status in database."""
31
+ with FileLock(str(LOCK_PATH), timeout=10):
32
+ conn = sqlite3.connect(str(DB_PATH))
33
+ cursor = conn.cursor()
34
+
35
+ if status == "processing":
36
+ cursor.execute("""
37
+ UPDATE ingest_jobs
38
+ SET status = ?, started_at = CURRENT_TIMESTAMP
39
+ WHERE job_id = ?
40
+ """, (status, job_id))
41
+ elif status == "completed":
42
+ cursor.execute("""
43
+ UPDATE ingest_jobs
44
+ SET status = ?, completed_at = CURRENT_TIMESTAMP
45
+ WHERE job_id = ?
46
+ """, (status, job_id))
47
+ elif status == "failed":
48
+ cursor.execute("""
49
+ UPDATE ingest_jobs
50
+ SET status = ?, error_message = ?, completed_at = CURRENT_TIMESTAMP
51
+ WHERE job_id = ?
52
+ """, (status, error_message, job_id))
53
+
54
+ conn.commit()
55
+ conn.close()
56
+
57
+
58
+ def save_extraction(document_id: int, raw_text: str, metadata: Dict):
59
+ """Save extracted text to database."""
60
+ with FileLock(str(LOCK_PATH), timeout=10):
61
+ conn = sqlite3.connect(str(DB_PATH))
62
+ cursor = conn.cursor()
63
+
64
+ cursor.execute("""
65
+ INSERT INTO extractions (
66
+ document_id,
67
+ raw_text,
68
+ page_count,
69
+ extraction_method,
70
+ confidence_score
71
+ ) VALUES (?, ?, ?, ?, ?)
72
+ """, (
73
+ document_id,
74
+ raw_text,
75
+ metadata.get('page_count'),
76
+ metadata.get('extraction_method'),
77
+ metadata.get('confidence_score')
78
+ ))
79
+
80
+ conn.commit()
81
+ conn.close()
82
+
83
+
84
+ def process_job(job_data: Dict):
85
+ """
86
+ Process a single ingest job.
87
+
88
+ Args:
89
+ job_data: Dict with job_id, document_id, file_path, mime_type
90
+ """
91
+ job_id = job_data['job_id']
92
+ document_id = job_data['document_id']
93
+ file_path = Path(job_data['file_path'])
94
+ mime_type = job_data['mime_type']
95
+
96
+ logger.info(f"Processing job {job_id} for document {document_id}")
97
+
98
+ try:
99
+ # Update status to processing
100
+ update_job_status(job_id, "processing")
101
+
102
+ # Extract text
103
+ logger.info(f"Extracting text from {file_path}")
104
+ raw_text, metadata = extract_text(file_path, mime_type)
105
+
106
+ if not raw_text or len(raw_text.strip()) < 10:
107
+ raise ValueError("No text extracted or text too short")
108
+
109
+ logger.info(f"Extracted {len(raw_text)} characters, {metadata['page_count']} pages")
110
+
111
+ # Save to database
112
+ save_extraction(document_id, raw_text, metadata)
113
+
114
+ # Update status to completed
115
+ update_job_status(job_id, "completed")
116
+
117
+ logger.info(f"Job {job_id} completed successfully")
118
+
119
+ except Exception as e:
120
+ error_msg = f"{type(e).__name__}: {str(e)}"
121
+ logger.error(f"Job {job_id} failed: {error_msg}")
122
+ logger.error(traceback.format_exc())
123
+
124
+ # Update status to failed
125
+ update_job_status(job_id, "failed", error_msg)
126
+ raise
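For reference, a hedged sketch of the producer side that feeds this worker; the job_id and file_path values are hypothetical, and the Redis settings mirror backend/worker/worker.py:

import os

from redis import Redis
from rq import Queue

from backend.worker.job_processor import process_job

redis_conn = Redis(
    host=os.getenv("REDIS_HOST", "redis"),
    port=int(os.getenv("REDIS_PORT", 6379)),
)
queue = Queue(os.getenv("REDIS_QUEUE_NAME", "invoice_ingest"), connection=redis_conn)

queue.enqueue(process_job, {
    "job_id": "hypothetical-uuid",   # generated by the upload API
    "document_id": 1,
    "file_path": "/app/data/docs/invoice.pdf",
    "mime_type": "application/pdf",
})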
backend/worker/text_extractor.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Text extraction utilities for PDF and images.
3
+ Supports both digital PDFs and scanned documents (OCR).
4
+ """
5
+
6
+ import pdfplumber
7
+ import fitz # PyMuPDF
8
+ import pytesseract
9
+ from PIL import Image
10
+ from pathlib import Path
11
+ from typing import Dict, Tuple
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
18
+ """
19
+ Extract text from PDF using pdfplumber (for digital PDFs).
20
+
21
+ Returns:
22
+ (raw_text, metadata)
23
+ """
24
+ try:
25
+ text_pages = []
26
+ page_count = 0
27
+
28
+ with pdfplumber.open(str(file_path)) as pdf:
29
+ page_count = len(pdf.pages)
30
+
31
+ for page in pdf.pages:
32
+ text = page.extract_text()
33
+ if text:
34
+ text_pages.append(text)
35
+
36
+ raw_text = "\n\n".join(text_pages)
37
+
38
+ metadata = {
39
+ "page_count": page_count,
40
+ "extraction_method": "pdfplumber",
41
+ "confidence_score": 1.0 if len(raw_text) > 50 else 0.5
42
+ }
43
+
44
+ # If no text extracted, it might be a scanned PDF
45
+ if not raw_text.strip():
46
+ logger.info("No text found with pdfplumber, trying OCR...")
47
+ return extract_text_from_pdf_ocr(file_path)
48
+
49
+ return raw_text, metadata
50
+
51
+ except Exception as e:
52
+ logger.error(f"PDF extraction failed: {e}")
53
+ raise
54
+
55
+
56
+ def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
57
+ """
58
+ Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).
59
+ """
60
+ try:
61
+ text_pages = []
62
+ doc = fitz.open(str(file_path))
63
+ page_count = len(doc)
64
+
65
+ for page_num in range(page_count):
66
+ page = doc[page_num]
67
+ # Convert page to image
68
+ pix = page.get_pixmap(dpi=300)
69
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
70
+
71
+ # OCR
72
+ text = pytesseract.image_to_string(img)
73
+ text_pages.append(text)
74
+
75
+ doc.close()
76
+ raw_text = "\n\n".join(text_pages)
77
+
78
+ metadata = {
79
+ "page_count": page_count,
80
+ "extraction_method": "tesseract_ocr",
81
+ "confidence_score": 0.7 # OCR typically less confident
82
+ }
83
+
84
+ return raw_text, metadata
85
+
86
+ except Exception as e:
87
+ logger.error(f"OCR extraction failed: {e}")
88
+ raise
89
+
90
+
91
+ def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
92
+ """
93
+ Extract text from image using OCR (Tesseract).
94
+ """
95
+ try:
96
+ img = Image.open(str(file_path))
97
+ raw_text = pytesseract.image_to_string(img)
98
+
99
+ metadata = {
100
+ "page_count": 1,
101
+ "extraction_method": "tesseract_ocr",
102
+ "confidence_score": 0.7
103
+ }
104
+
105
+ return raw_text, metadata
106
+
107
+ except Exception as e:
108
+ logger.error(f"Image OCR failed: {e}")
109
+ raise
110
+
111
+
112
+ def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
113
+ """
114
+ Main entry point for text extraction.
115
+ Routes to appropriate extractor based on file type.
116
+
117
+ Args:
118
+ file_path: Path to document
119
+ mime_type: MIME type of document
120
+
121
+ Returns:
122
+ (raw_text, metadata_dict)
123
+ """
124
+ if mime_type == "application/pdf":
125
+ return extract_text_from_pdf(file_path)
126
+ elif mime_type in ["image/png", "image/jpeg", "image/jpg"]:
127
+ return extract_text_from_image(file_path)
128
+ else:
129
+ raise ValueError(f"Unsupported file type: {mime_type}")
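A short usage sketch, assuming python-magic (already pinned in requirements.txt) supplies the MIME type:

from pathlib import Path

import magic

from backend.worker.text_extractor import extract_text

path = Path("/app/data/docs/invoice.pdf")  # hypothetical document
mime_type = magic.from_file(str(path), mime=True)
raw_text, meta = extract_text(path, mime_type)
print(meta["extraction_method"], meta["page_count"], len(raw_text))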
backend/worker/worker.py ADDED
@@ -0,0 +1,30 @@
1
+
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from redis import Redis
7
+ from rq import Worker, Connection  # Queue is unused here; enqueueing happens on the API side
8
+
9
+ # Add parent directory to path
10
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
11
+
12
+ # Import job processor
13
+ from backend.worker.job_processor import process_job
14
+
15
+ # Redis connection
16
+ REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
17
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
18
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
19
+ QUEUE_NAME = os.getenv('REDIS_QUEUE_NAME', 'invoice_ingest')
20
+
21
+ redis_conn = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
22
+
23
+
24
+ if __name__ == '__main__':
25
+ print(f"🚀 Starting worker for queue: {QUEUE_NAME}")
26
+ print(f"📡 Redis: {REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}")
27
+
28
+ with Connection(redis_conn):
29
+ worker = Worker([QUEUE_NAME])
30
+ worker.work()
requirements.txt ADDED
@@ -0,0 +1,47 @@
1
+ # Web Framework
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ pydantic==2.5.0
5
+
6
+ # ML & Data
7
+ pandas==2.1.3
8
+ numpy==1.26.2
9
+ scikit-learn==1.6.1
10
+
11
+ lightgbm==4.1.0
12
+ joblib==1.3.2
13
+
14
+ # Utilities
15
+ python-dateutil==2.8.2
16
+ filelock==3.13.1
17
+ python-multipart==0.0.6
18
+
19
+ # Testing (optional)
20
+ httpx==0.25.2
21
+ pytest==7.4.3
22
+
23
+ redis==5.0.1
24
+ rq==1.15.1
25
+
26
+ # NEW: Text extraction (Item 1)
27
+ pdfplumber==0.10.3
28
+ PyMuPDF==1.23.8
29
+ pytesseract==0.3.10
30
+ Pillow==10.1.0
31
+
32
+ # NEW: Utilities
33
+ python-magic==0.4.27 # File type detection
34
+ # uuid removed: it is part of the Python standard library, and the PyPI "uuid" package is an obsolete Python 2 shim
35
+ requests==2.31.0
36
+ python-dotenv==1.0.0
37
+ # NEW: Database
38
+ psycopg2-binary==2.9.7
39
+ SQLAlchemy==2.0.20
40
+ alembic==1.11.1
41
+
42
+
43
+ google-generativeai>=0.8.0
44
+
45
+
46
+
47
+