Deploy Invoice Digitization Agent
- .dockerignore +18 -0
- .gitignore +3 -0
- Dockerfile +48 -0
- app.py +415 -0
- backend/__init__.py +0 -0
- backend/app/__init__.py +0 -0
- backend/app/agent/__init__.py +19 -0
- backend/app/agent/agent_orchestrator.py +476 -0
- backend/app/api/__init__.py +0 -0
- backend/app/api/ingest.py +1459 -0
- backend/app/utils/__init__.py +0 -0
- backend/app/utils/agent_client.py +153 -0
- backend/app/wrappers/__init__.py +0 -0
- backend/app/wrappers/gemini_output_filter.py +349 -0
- backend/database/__init__.py +0 -0
- backend/database/migration_ingest_v1.sql +67 -0
- backend/database/migration_ingest_v2.sql +63 -0
- backend/database/queries.sql +354 -0
- backend/database/schema_sqlite.sql +132 -0
- backend/etl/__init__.py +0 -0
- backend/etl/update_customer_aggregates_sqlite.py +189 -0
- backend/feature_builder/__init__.py +0 -0
- backend/feature_builder/feature_builder.py +312 -0
- backend/ingest/__init__.py +0 -0
- backend/ingest/ingest_invoice_sqlite.py +158 -0
- backend/worker/job_processor.py +126 -0
- backend/worker/text_extractor.py +129 -0
- backend/worker/worker.py +30 -0
- requirements.txt +47 -0
.dockerignore
ADDED
@@ -0,0 +1,18 @@
*.pyc
*.pyo
*.pyd
.Python
venv/
env/
.venv
.git
.gitignore
.vscode
.idea
*.log
*.db-journal
.env
Dockerfile
docker-compose.yml
README*.md
*.md
.gitignore
ADDED
@@ -0,0 +1,3 @@

__pycache__/
*.pyc
Dockerfile
ADDED
@@ -0,0 +1,48 @@
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    sqlite3 \
    ca-certificates \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python packages
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Upgrade Gemini SDK for v1 API
RUN pip install --no-cache-dir --upgrade google-generativeai google-ai-generativelanguage

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p /app/data/logs /app/data/docs && chmod -R 777 /app/data

# Create __init__.py files (INCLUDING AGENT DIRECTORY)
RUN touch backend/__init__.py \
    && touch backend/feature_builder/__init__.py \
    && touch backend/app/__init__.py \
    && touch backend/app/api/__init__.py \
    && touch backend/app/agent/__init__.py \
    && touch backend/app/wrappers/__init__.py \
    && touch backend/ingest/__init__.py

# Verify agent files exist (will fail build if missing)
RUN test -f backend/app/agent/agent_orchestrator.py || \
    (echo "ERROR: agent_orchestrator.py not found! Add it before building." && exit 1)

# Initialize database if it doesn't exist
RUN if [ ! -f /app/data/invoices.db ]; then \
        sqlite3 /app/data/invoices.db < backend/database/schema_sqlite.sql; \
    fi

# Expose port
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
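A quick way to sanity-check the built image is to poll the /health endpoint that app.py (next file) exposes. This is a minimal sketch, not part of the commit; it assumes the container was started locally with port 7860 published (e.g. `-p 7860:7860`):

```python
import httpx

# Hit the /health endpoint defined in app.py; the Dockerfile EXPOSEs 7860.
resp = httpx.get("http://localhost:7860/health", timeout=10)
resp.raise_for_status()
status = resp.json()
print(status)  # e.g. {"status": "ok", "model_loaded": true, "db_exists": true}
assert status["status"] == "ok"
```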
app.py
ADDED
@@ -0,0 +1,415 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
import sqlite3
import joblib
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
from filelock import FileLock
from fastapi.responses import JSONResponse
import json
import sys

import logging

# Setup logging for entire app
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True
)
# Setup paths
BASE_DIR = Path(__file__).parent
DB_PATH = BASE_DIR / "data" / "invoices.db"  # Inside container: /app/data/invoices.db
LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
MODEL_PATH = BASE_DIR / "ml" / "models" / "payment_predictor_model_20251124_194847.pkl"
LOG_DIR = BASE_DIR / "data" / "logs"
PREDICTIONS_LOG = LOG_DIR / "predictions.csv"

# Ensure directories exist
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Add backend to path
sys.path.append(str(BASE_DIR / "backend"))

# Import feature builder
from backend.feature_builder.feature_builder import build_features, features_to_dataframe
from backend.ingest.ingest_invoice_sqlite import ingest_invoice as ingest_func

# ============================================
# IMPORT INGEST ROUTER (NEW)
# ============================================
from backend.app.api.ingest import router as ingest_router

# Load ML model
print("🤖 Loading ML model...")
try:
    model_artifacts = joblib.load(MODEL_PATH)
    model = model_artifacts['model']
    print(f"✅ Model loaded: {MODEL_PATH.name}")
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    model = None

# FastAPI app
app = FastAPI(
    title="Invoice Payment Predictor",
    description="Predicts payment clearing time for invoices",
    version="1.0.0"
)

# ============================================
# REGISTER INGEST ROUTER (NEW)
# ============================================
app.include_router(ingest_router)


# ============================================
# Pydantic Models
# ============================================

class InvoiceIngest(BaseModel):
    invoice_id: int
    business_code: str
    cust_number: str
    name_customer: Optional[str] = None
    posting_date: str
    document_create_date: Optional[str] = None
    document_create_date_alt: Optional[str] = None
    due_in_date: Optional[str] = None
    baseline_create_date: Optional[str] = None
    clear_date: Optional[str] = None
    total_open_amount: float
    invoice_currency: str = "USD"
    document_type: Optional[str] = "RV"
    cust_payment_terms: Optional[str] = None
    posting_id: Optional[float] = None
    business_year: Optional[int] = None


class PredictionRequest(BaseModel):
    invoice_id: Optional[int] = None
    cust_number: str
    posting_date: str
    total_open_amount: float
    business_code: str = "U001"
    cust_payment_terms: str = "NAH4"
    invoice_currency: str = "USD"
    document_type: str = "RV"
    due_in_date: Optional[str] = None
    business_year: Optional[int] = None


# ============================================
# Helper Functions
# ============================================

def get_customer_aggregates(cust_number: str) -> Optional[Dict]:
    """Fetch customer aggregates from SQLite."""
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            cursor.execute("""
                SELECT * FROM customer_aggregates WHERE cust_number = ?
            """, (cust_number,))

            row = cursor.fetchone()
            conn.close()

            if row:
                return dict(row)
    except Exception as e:
        print(f"Error fetching customer aggregates: {e}")

    return None


def get_payment_terms_aggregates(payment_terms: str) -> Optional[Dict]:
    """Fetch payment terms aggregates from SQLite."""
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            cursor.execute("""
                SELECT * FROM payment_terms_aggregates WHERE cust_payment_terms = ?
            """, (payment_terms,))

            row = cursor.fetchone()
            conn.close()

            if row:
                return dict(row)
    except Exception as e:
        print(f"Error fetching payment terms: {e}")

    return None


def get_business_code_aggregates(business_code: str) -> Optional[Dict]:
    """Fetch business code aggregates from SQLite."""
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            cursor.execute("""
                SELECT * FROM business_code_aggregates WHERE business_code = ?
            """, (business_code,))

            row = cursor.fetchone()
            conn.close()

            if row:
                return dict(row)
    except Exception as e:
        print(f"Error fetching business code: {e}")

    return None


def log_prediction_to_csv(prediction_data: Dict):
    """Append prediction to CSV log."""
    df = pd.DataFrame([prediction_data])

    if not PREDICTIONS_LOG.exists():
        df.to_csv(PREDICTIONS_LOG, index=False)
    else:
        df.to_csv(PREDICTIONS_LOG, mode='a', header=False, index=False)


def log_prediction_to_db(prediction_data: Dict):
    """Insert prediction into SQLite predictions_log."""
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            cursor = conn.cursor()

            cursor.execute("""
                INSERT INTO predictions_log (
                    invoice_id, cust_number, posting_date, total_open_amount,
                    business_code, cust_payment_terms, predicted_days_to_clear,
                    predicted_clear_date, model_version, features_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                prediction_data.get('invoice_id'),
                prediction_data['cust_number'],
                prediction_data['posting_date'],
                prediction_data['total_open_amount'],
                prediction_data.get('business_code'),
                prediction_data.get('cust_payment_terms'),
                prediction_data['predicted_days_to_clear'],
                prediction_data['predicted_clear_date'],
                prediction_data.get('model_version', 'v1.0'),
                json.dumps(prediction_data.get('features', {}))
            ))

            prediction_id = cursor.lastrowid
            conn.commit()
            conn.close()

            return prediction_id
    except Exception as e:
        print(f"Error logging to DB: {e}")
        return None


# ============================================
# API Endpoints
# ============================================

@app.get("/")
def root():
    """Root endpoint."""
    return {
        "service": "Invoice Payment Predictor",
        "version": "1.0.0",
        "status": "operational",
        "model_loaded": model is not None
    }


@app.get("/health")
def health():
    return JSONResponse(
        content={
            "status": "ok",
            "model_loaded": model is not None,
            "db_exists": DB_PATH.exists()
        },
        media_type="application/json"
    )


@app.post("/ingest")
def ingest_invoice(invoice: InvoiceIngest):
    """
    Ingest invoice into SQLite database.
    Computes derived fields and stores data.
    """
    try:
        result = ingest_func(invoice.dict())

        return {
            "status": "success",
            "message": "Invoice ingested successfully",
            "data": result
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}")


@app.get("/features/{cust_number}")
def get_features(cust_number: str):
    """
    Get customer aggregate features.
    Returns cached aggregates or defaults for new customers.
    """

    customer_agg = get_customer_aggregates(cust_number)

    if not customer_agg:
        return {
            "cust_number": cust_number,
            "status": "new_customer",
            "message": "No historical data found, using defaults",
            "features": {
                "cust_avg_days": 18.0,
                "cust_median_days": 15.0,
                "cust_invoice_count": 0
            }
        }

    return {
        "cust_number": cust_number,
        "status": "existing_customer",
        "features": customer_agg
    }


@app.post("/predict")
def predict(request: PredictionRequest):
    """
    Predict payment clearing time for an invoice.

    Returns:
    - predicted_days_to_clear
    - predicted_clear_date
    - confidence info
    """

    if model is None:
        raise HTTPException(status_code=503, detail="ML model not loaded")

    try:
        # Fetch aggregates
        customer_agg = get_customer_aggregates(request.cust_number)
        payment_agg = get_payment_terms_aggregates(request.cust_payment_terms)
        business_agg = get_business_code_aggregates(request.business_code)

        # Build invoice data dict
        invoice_data = request.dict()

        # Compute days_posting_to_due if due_in_date provided
        if request.due_in_date:
            posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
            due_dt = datetime.strptime(request.due_in_date, "%Y-%m-%d")
            invoice_data['days_posting_to_due'] = (due_dt - posting_dt).days
        else:
            invoice_data['days_posting_to_due'] = 15  # Default

        # Build features
        features = build_features(invoice_data, customer_agg, payment_agg, business_agg)
        features_df = features_to_dataframe(features)

        # Predict
        predicted_days = float(model.predict(features_df)[0])

        # Calculate predicted clear date
        posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
        predicted_clear_dt = posting_dt + timedelta(days=predicted_days)

        # Prepare response
        response = {
            "invoice_id": request.invoice_id,
            "cust_number": request.cust_number,
            "posting_date": request.posting_date,
            "total_open_amount": request.total_open_amount,
            "predicted_days_to_clear": round(predicted_days, 2),
            "predicted_clear_date": predicted_clear_dt.strftime("%Y-%m-%d"),
            "customer_history": "available" if customer_agg else "new_customer",
            "model_version": "v1.0"
        }

        # Log prediction
        log_prediction_to_csv(response)
        prediction_id = log_prediction_to_db({
            **response,
            'business_code': request.business_code,
            'cust_payment_terms': request.cust_payment_terms,
            'features': features
        })

        response['prediction_id'] = prediction_id

        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")


@app.get("/predictions/recent")
def get_recent_predictions(limit: int = 10):
    """Get recent predictions from log."""

    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            cursor.execute("""
                SELECT
                    prediction_id,
                    cust_number,
                    posting_date,
                    predicted_days_to_clear,
                    predicted_clear_date,
                    predicted_at
                FROM predictions_log
                ORDER BY predicted_at DESC
                LIMIT ?
            """, (limit,))

            rows = cursor.fetchall()
            conn.close()

            return {
                "count": len(rows),
                "predictions": [dict(row) for row in rows]
            }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to fetch predictions: {str(e)}")


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        timeout_keep_alive=75,
        timeout_graceful_shutdown=10
    )
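The /predict endpoint accepts the PredictionRequest schema above, where only cust_number, posting_date and total_open_amount are required. A minimal client sketch (hypothetical payload values, service assumed on localhost:7860):

```python
import httpx

# Hypothetical invoice; omitted fields fall back to their PredictionRequest
# defaults (business_code="U001", cust_payment_terms="NAH4", ...).
payload = {
    "cust_number": "CUST-0042",    # hypothetical customer id
    "posting_date": "2024-06-01",  # must be YYYY-MM-DD (strptime format above)
    "total_open_amount": 1250.50,
    "due_in_date": "2024-06-16",   # optional; drives days_posting_to_due
}

resp = httpx.post("http://localhost:7860/predict", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print(result["predicted_days_to_clear"], result["predicted_clear_date"])
```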
backend/__init__.py
ADDED
File without changes
backend/app/__init__.py
ADDED
File without changes
backend/app/agent/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""
Agent module for autonomous invoice processing.
"""

from .agent_orchestrator import (
    InvoiceAgent,
    AgentState,
    AgentDecision,
    create_agent,
    run_agent_pipeline
)

__all__ = [
    'InvoiceAgent',
    'AgentState',
    'AgentDecision',
    'create_agent',
    'run_agent_pipeline'
]
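Because the package `__init__` re-exports the orchestrator symbols, callers can import from the package root rather than the module; a one-line usage sketch:

```python
# Equivalent to importing from backend.app.agent.agent_orchestrator directly.
from backend.app.agent import create_agent, AgentState, run_agent_pipeline
```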
backend/app/agent/agent_orchestrator.py
ADDED
@@ -0,0 +1,476 @@
"""
True End-to-End Agent Orchestrator
===================================
Autonomous agent that:
1. Decides which tools to use based on document analysis
2. Validates its own output
3. Self-corrects when confidence is low
4. Learns from patterns
"""

import json
import sys
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True
)
logger = logging.getLogger(__name__)


class AgentDecision(Enum):
    """Agent's possible decisions"""
    EXTRACT_TEXT = "extract_text"
    EXTRACT_TABLES = "extract_tables"
    RUN_NER = "run_ner"
    USE_GEMINI = "use_gemini"
    USE_REGEX = "use_regex"
    VALIDATE = "validate"
    RETRY = "retry"
    COMPLETE = "complete"
    HUMAN_REVIEW = "human_review"


@dataclass
class AgentState:
    """Agent's internal state"""
    doc_id: str
    file_path: Path

    # Extracted data
    raw_text: Optional[str] = None
    tables: Optional[List] = None
    entities: Optional[List] = None
    entity_map: Optional[Dict] = None

    # Mapped fields
    fields: Optional[Dict] = None
    confidence_map: Optional[Dict] = None

    # Decision tracking
    attempts: int = 0
    max_attempts: int = 3
    history: List[str] = None
    errors: List[str] = None

    def __post_init__(self):
        if self.history is None:
            self.history = []
        if self.errors is None:
            self.errors = []


class InvoiceAgent:
    """
    Autonomous agent that processes invoices with self-correction.
    """

    def __init__(self, text_extractor, table_extractor, ner_extractor, gemini_mapper):
        """
        Args:
            text_extractor: Function(file_path) -> (success, text, error)
            table_extractor: Function(file_path) -> (success, tables, error)
            ner_extractor: Function(text) -> (success, entities, entity_map, error)
            gemini_mapper: Function(text, entities, entity_map, tables) -> (success, fields, error)
        """
        self.text_extractor = text_extractor
        self.table_extractor = table_extractor
        self.ner_extractor = ner_extractor
        self.gemini_mapper = gemini_mapper

        # Minimum confidence thresholds
        self.MIN_CONFIDENCE = {
            'cust_number': 0.6,
            'posting_date': 0.7,
            'total_open_amount': 0.7,
            'cust_payment_terms': 0.5
        }

    def process(self, state: AgentState) -> AgentState:
        """
        Main agent loop - autonomous decision-making and execution.
        """
        logger.info("=" * 70)
        logger.info(f"**** AGENT STARTING: {state.file_path.name}")
        logger.info("=" * 70)

        while state.attempts < state.max_attempts:
            state.attempts += 1
            logger.info(f"\n**** ATTEMPT {state.attempts}/{state.max_attempts}")

            # Step 1: Decide next action
            decision = self._decide_next_action(state)
            logger.info(f"**** DECISION: {decision.value}")
            state.history.append(decision.value)

            # Step 2: Execute action
            success = self._execute_action(decision, state)

            if not success:
                logger.warning(f"**** Action {decision.value} failed")
                continue

            # Step 3: Check if we're done
            if decision == AgentDecision.COMPLETE:
                logger.info("**** AGENT COMPLETE")
                break

            if decision == AgentDecision.HUMAN_REVIEW:
                logger.info("**** AGENT REQUESTING HUMAN REVIEW")
                break

        logger.info("=" * 70)
        logger.info(f"**** Final confidence: {self._calculate_overall_confidence(state):.2f}")
        logger.info(f"**** Actions taken: {' → '.join(state.history)}")
        logger.info("=" * 70)

        return state

    def _decide_next_action(self, state: AgentState) -> AgentDecision:
        """
        Agent's brain - decides what to do next based on current state.
        """

        # 1. If no text, extract it
        if state.raw_text is None:
            return AgentDecision.EXTRACT_TEXT

        # 2. If text exists but no entities, run NER
        if state.entities is None:
            return AgentDecision.RUN_NER

        # 3. If no fields mapped yet, try Gemini first
        if state.fields is None:
            return AgentDecision.USE_GEMINI

        # 4. If fields exist, validate them
        if not self._is_validated(state):
            return AgentDecision.VALIDATE

        # 5. Check confidence - retry if low
        overall_confidence = self._calculate_overall_confidence(state)

        if overall_confidence < 0.6 and state.attempts < state.max_attempts:
            # Try alternative approach
            if 'use_gemini' in state.history and 'use_regex' not in state.history:
                return AgentDecision.USE_REGEX
            elif 'extract_tables' not in state.history:
                return AgentDecision.EXTRACT_TABLES
            else:
                return AgentDecision.RETRY

        # 6. If still low confidence, request human review
        if overall_confidence < 0.5:
            return AgentDecision.HUMAN_REVIEW

        # 7. Otherwise, we're done!
        return AgentDecision.COMPLETE

    def _execute_action(self, decision: AgentDecision, state: AgentState) -> bool:
        """Execute the decided action."""

        try:
            if decision == AgentDecision.EXTRACT_TEXT:
                return self._extract_text(state)

            elif decision == AgentDecision.EXTRACT_TABLES:
                return self._extract_tables(state)

            elif decision == AgentDecision.RUN_NER:
                return self._run_ner(state)

            elif decision == AgentDecision.USE_GEMINI:
                return self._use_gemini(state)

            elif decision == AgentDecision.USE_REGEX:
                return self._use_regex(state)

            elif decision == AgentDecision.VALIDATE:
                return self._validate_fields(state)

            elif decision == AgentDecision.RETRY:
                # Clear fields and try again with different approach
                state.fields = None
                state.confidence_map = None
                return True

            elif decision in [AgentDecision.COMPLETE, AgentDecision.HUMAN_REVIEW]:
                return True

            return False

        except Exception as e:
            logger.error(f"**** Action failed: {e}")
            state.errors.append(str(e))
            return False

    def _extract_text(self, state: AgentState) -> bool:
        """Extract text from document."""
        logger.info("**** Extracting text...")
        success, text, error = self.text_extractor(state.file_path)

        if success and text and len(text.strip()) > 10:
            state.raw_text = text
            logger.info(f"**** Extracted {len(text)} characters")
            return True

        state.errors.append(f"Text extraction failed: {error}")
        return False

    def _extract_tables(self, state: AgentState) -> bool:
        """Extract tables from document."""
        logger.info("**** Extracting tables...")
        success, tables, error = self.table_extractor(state.file_path)

        if success:
            state.tables = tables
            logger.info(f"**** Extracted {len(tables)} tables")
            return True

        logger.warning(f"**** Table extraction failed: {error}")
        state.tables = []
        return True  # Non-critical, continue

    def _run_ner(self, state: AgentState) -> bool:
        """Run Named Entity Recognition."""
        logger.info("**** Running NER...")
        success, entities, entity_map, error = self.ner_extractor(state.raw_text)

        if success:
            state.entities = entities
            state.entity_map = entity_map
            logger.info(f"**** Found {len(entities)} entities")
            return True

        logger.warning(f"**** NER failed: {error}")
        state.entities = []
        state.entity_map = {}
        return True  # Non-critical, continue

    def _use_gemini(self, state: AgentState) -> bool:
        """Use Gemini for intelligent mapping."""
        logger.info("**** Using Gemini mapping...")

        success, result, error = self.gemini_mapper(
            state.raw_text,
            state.entities or [],
            state.entity_map or {},
            state.tables or []
        )

        if success and result:
            state.fields = {
                'cust_number': result.get('customer_name', 'UNKNOWN')[:20],
                'posting_date': result.get('date', '2024-01-01'),
                'total_open_amount': float(result.get('total_amount', 0.0)),
                'business_code': 'U001',
                'cust_payment_terms': result.get('payment_terms', 'NAH4')[:10]
            }

            # High confidence from Gemini
            state.confidence_map = {
                'cust_number': 0.9,
                'posting_date': 0.9,
                'total_open_amount': 0.9,
                'business_code': 0.3,
                'cust_payment_terms': 0.8
            }

            logger.info(f"**** Gemini mapped: {state.fields}")
            return True

        logger.warning(f"**** Gemini failed: {error}")
        state.errors.append(f"Gemini mapping failed: {error}")
        return False

    def _use_regex(self, state: AgentState) -> bool:
        """Fallback regex-based extraction."""
        logger.info("**** Using regex fallback...")

        from backend.app.api.ingest import map_with_regex

        fields, confidence = map_with_regex(state.raw_text, state.entities or [])
        state.fields = fields
        state.confidence_map = confidence

        logger.info(f"**** Regex mapped: {fields}")
        return True

    def _validate_fields(self, state: AgentState) -> bool:
        """
        Validate extracted fields using business rules.
        Agent learns if data makes sense.
        """
        logger.info("✓ Validating fields...")

        if not state.fields:
            return False

        validation_results = {}

        # 1. Customer number shouldn't be empty or generic
        cust = state.fields.get('cust_number', '')
        if cust and cust != 'UNKNOWN' and len(cust) > 2:
            validation_results['cust_number'] = True
        else:
            validation_results['cust_number'] = False
            logger.warning("**** Customer number looks invalid")

        # 2. Date should be reasonable (not default)
        date = state.fields.get('posting_date', '')
        if date and date != '2024-01-01':
            validation_results['posting_date'] = True
        else:
            validation_results['posting_date'] = False
            logger.warning("**** Date looks like default value")

        # 3. Amount should be > 0
        amount = state.fields.get('total_open_amount', 0.0)
        if amount > 0:
            validation_results['total_open_amount'] = True
        else:
            validation_results['total_open_amount'] = False
            logger.warning("**** Amount is zero or missing")

        # Adjust confidence based on validation
        for field, is_valid in validation_results.items():
            if not is_valid and state.confidence_map:
                state.confidence_map[field] *= 0.5  # Reduce confidence

        # Mark as validated
        state.history.append('validated')

        success_count = sum(validation_results.values())
        logger.info(f"✓ Validation: {success_count}/{len(validation_results)} checks passed")

        return success_count >= 2  # At least 2 fields should be valid

    def _is_validated(self, state: AgentState) -> bool:
        """Check if validation has been performed."""
        return 'validated' in state.history

    def _calculate_overall_confidence(self, state: AgentState) -> float:
        """Calculate overall confidence score."""
        if not state.confidence_map:
            return 0.0

        # Weighted average (important fields have more weight)
        weights = {
            'cust_number': 0.3,
            'posting_date': 0.2,
            'total_open_amount': 0.3,
            'cust_payment_terms': 0.1,
            'business_code': 0.1
        }

        total_confidence = 0.0
        total_weight = 0.0

        for field, weight in weights.items():
            if field in state.confidence_map:
                total_confidence += state.confidence_map[field] * weight
                total_weight += weight

        return total_confidence / total_weight if total_weight > 0 else 0.0


# ==============================================
# Integration with existing code
# ==============================================

def create_agent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn):
    """
    Factory function to create agent with your existing functions.

    Usage:
        from backend.app.api.ingest import (
            call_text_extractor, call_table_extractor,
            call_ner, map_with_gemini
        )

        agent = create_agent(
            call_text_extractor,
            call_table_extractor,
            call_ner,
            map_with_gemini
        )

        state = AgentState(doc_id="doc123", file_path=Path("invoice.pdf"))
        result_state = agent.process(state)
    """
    return InvoiceAgent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn)


def run_agent_pipeline(job_id: str, doc_id: str, file_path: Path):
    """
    Replace your existing process_document() with this agentic version.
    """
    from backend.app.api.ingest import (
        call_text_extractor, call_table_extractor,
        call_ner, map_with_gemini,
        save_extraction, save_invoice_fields,
        update_job_status
    )

    try:
        update_job_status(job_id, 'processing')

        # Create agent
        agent = create_agent(
            call_text_extractor,
            call_table_extractor,
            call_ner,
            map_with_gemini
        )

        # Initialize state
        state = AgentState(doc_id=doc_id, file_path=file_path)

        # Let agent decide and execute autonomously
        result_state = agent.process(state)

        # Save results
        if result_state.fields:
            save_extraction(
                doc_id,
                result_state.raw_text,
                result_state.tables or [],
                result_state.entities or [],
                {
                    'method': 'autonomous_agent',
                    'attempts': result_state.attempts,
                    'actions': result_state.history,
                    'confidence': agent._calculate_overall_confidence(result_state)
                },
                None
            )

            save_invoice_fields(
                doc_id,
                result_state.fields,
                result_state.confidence_map or {}
            )

            # Check if needs human review
            if AgentDecision.HUMAN_REVIEW.value in result_state.history:
                update_job_status(job_id, 'needs_review')
            else:
                update_job_status(job_id, 'completed')

            logger.info(f"**** Agent completed with {len(result_state.history)} actions")
        else:
            update_job_status(job_id, 'failed', 'Agent could not extract fields')

    except Exception as e:
        logger.error(f"**** Agent failed: {e}")
        import traceback
        traceback.print_exc()
        update_job_status(job_id, 'failed', str(e))
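The decide/execute loop above can be exercised in isolation with stub tools that follow the documented signatures. A minimal sketch (the fake_* functions and values are invented stand-ins for call_text_extractor / call_table_extractor / call_ner / map_with_gemini); note that each pipeline stage consumes one attempt, so max_attempts is raised here so the loop can reach COMPLETE:

```python
from pathlib import Path
from backend.app.agent.agent_orchestrator import AgentState, create_agent

# Stubs matching the signatures documented in InvoiceAgent.__init__.
def fake_text_extractor(file_path):
    return True, "INVOICE\nAcme Corp\nTotal: 1250.50\nDate: 2024-06-01", None

def fake_table_extractor(file_path):
    return True, [], None

def fake_ner(text):
    return True, [{"entity": "ORG", "word": "Acme Corp"}], {"ORG": ["Acme Corp"]}, None

def fake_gemini(text, entities, entity_map, tables):
    return True, {"customer_name": "Acme Corp", "date": "2024-06-01",
                  "total_amount": 1250.50, "payment_terms": "NET15"}, None

agent = create_agent(fake_text_extractor, fake_table_extractor, fake_ner, fake_gemini)
state = AgentState(doc_id="demo-1", file_path=Path("invoice.pdf"), max_attempts=6)
result = agent.process(state)
# Expected history, roughly: extract_text, run_ner, use_gemini, validate,
# complete (plus a 'validated' marker appended by _validate_fields).
print(result.fields, result.history)
```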
backend/app/api/__init__.py
ADDED
File without changes
backend/app/api/ingest.py
ADDED
|
@@ -0,0 +1,1459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Complete ingest pipeline with AUTONOMOUS AGENT INTEGRATION
|
| 3 |
+
✅ Step 1: HF agents extract raw text
|
| 4 |
+
✅ Step 2: HF NER finds entities
|
| 5 |
+
✅ Step 3: Gemini maps to structured invoice fields
|
| 6 |
+
✅ NEW: Autonomous agent orchestrates, validates, and self-corrects
|
| 7 |
+
✅ UPDATED: Retry logic with exponential backoff + Local OCR fallback
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import uuid
|
| 12 |
+
import json
|
| 13 |
+
import sqlite3
|
| 14 |
+
import logging
|
| 15 |
+
import csv
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from typing import Optional, Dict, List, Any
|
| 19 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
|
| 20 |
+
from pydantic import BaseModel
|
| 21 |
+
from filelock import FileLock
|
| 22 |
+
import httpx
|
| 23 |
+
import re
|
| 24 |
+
|
| 25 |
+
import sys
|
| 26 |
+
logging.basicConfig(
|
| 27 |
+
level=logging.INFO,
|
| 28 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 29 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
| 30 |
+
force=True # Override any existing config
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
|
| 35 |
+
# Setup
|
| 36 |
+
BASE_DIR = Path(__file__).parent.parent.parent.parent
|
| 37 |
+
STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
|
| 38 |
+
DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
|
| 39 |
+
LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
|
| 40 |
+
PREDICT_ENDPOINT = 'http://localhost:7860/predict'
|
| 41 |
+
|
| 42 |
+
STORAGE_PATH.mkdir(parents=True, exist_ok=True)
|
| 43 |
+
|
| 44 |
+
logger = logging.getLogger(__name__)
|
| 45 |
+
router = APIRouter(prefix="/api", tags=["ingest"])
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ============================================
|
| 49 |
+
# LOCAL OCR FALLBACK (NEW)
|
| 50 |
+
# ============================================
|
| 51 |
+
|
| 52 |
+
# ============================================
|
| 53 |
+
# LOCAL OCR FALLBACK (UPDATED - EasyOCR + Tesseract)
|
| 54 |
+
# ============================================
|
| 55 |
+
|
| 56 |
+
def extract_text_with_easyocr(file_path: Path) -> tuple:
|
| 57 |
+
"""
|
| 58 |
+
EasyOCR - Best free open-source OCR
|
| 59 |
+
- Works offline
|
| 60 |
+
- 80+ languages
|
| 61 |
+
- GPU/CPU support
|
| 62 |
+
- Better accuracy than Tesseract for invoices
|
| 63 |
+
"""
|
| 64 |
+
try:
|
| 65 |
+
import easyocr
|
| 66 |
+
|
| 67 |
+
logger.info("🔧 Using EasyOCR (best free OCR)...")
|
| 68 |
+
|
| 69 |
+
# Initialize reader (downloads models on first run)
|
| 70 |
+
# Use GPU if available, fallback to CPU
|
| 71 |
+
reader = easyocr.Reader(['en'], gpu=False) # Set gpu=True if you have CUDA
|
| 72 |
+
|
| 73 |
+
# Read image
|
| 74 |
+
result = reader.readtext(str(file_path), detail=0, paragraph=True)
|
| 75 |
+
|
| 76 |
+
# Join all text
|
| 77 |
+
text = '\n'.join(result)
|
| 78 |
+
|
| 79 |
+
if text and len(text.strip()) >= 10:
|
| 80 |
+
logger.info(f"✅ EasyOCR extracted {len(text)} characters")
|
| 81 |
+
return True, text, None
|
| 82 |
+
|
| 83 |
+
return False, None, "EasyOCR produced no usable text"
|
| 84 |
+
|
| 85 |
+
except ImportError:
|
| 86 |
+
logger.warning("⚠️ easyocr not installed. Install with: pip install easyocr")
|
| 87 |
+
return False, None, "easyocr not available"
|
| 88 |
+
except Exception as e:
|
| 89 |
+
logger.error(f"❌ EasyOCR failed: {e}")
|
| 90 |
+
return False, None, str(e)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def extract_text_with_tesseract(file_path: Path) -> tuple:
|
| 94 |
+
"""
|
| 95 |
+
Tesseract OCR - Fallback option
|
| 96 |
+
Faster but less accurate than EasyOCR
|
| 97 |
+
"""
|
| 98 |
+
try:
|
| 99 |
+
import pytesseract
|
| 100 |
+
from PIL import Image
|
| 101 |
+
|
| 102 |
+
logger.info("🔧 Using Tesseract OCR as secondary fallback...")
|
| 103 |
+
|
| 104 |
+
image = Image.open(file_path)
|
| 105 |
+
text = pytesseract.image_to_string(image)
|
| 106 |
+
|
| 107 |
+
if text and len(text.strip()) >= 10:
|
| 108 |
+
logger.info(f"✅ Tesseract extracted {len(text)} characters")
|
| 109 |
+
return True, text, None
|
| 110 |
+
|
| 111 |
+
return False, None, "Tesseract produced no usable text"
|
| 112 |
+
|
| 113 |
+
except ImportError:
|
| 114 |
+
logger.warning("⚠️ pytesseract not installed. Install with: pip install pytesseract pillow")
|
| 115 |
+
return False, None, "pytesseract not available"
|
| 116 |
+
except Exception as e:
|
| 117 |
+
logger.error(f"❌ Tesseract failed: {e}")
|
| 118 |
+
return False, None, str(e)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def extract_text_with_local_ocr(file_path: Path) -> tuple:
    """
    Multi-tier local OCR fallback system:
    1. Try EasyOCR (best accuracy)
    2. Try Tesseract (faster, less accurate)
    3. Give up
    """
    logger.info("=" * 70)
    logger.info("🔄 HF extraction failed - trying local OCR fallbacks...")
    logger.info("=" * 70)

    # Priority 1: EasyOCR (best for invoices)
    success, text, error = extract_text_with_easyocr(file_path)
    if success:
        logger.info("✅ EasyOCR succeeded!")
        return True, text, None
    else:
        logger.warning(f"⚠️ EasyOCR failed: {error}")

    # Priority 2: Tesseract (faster fallback)
    success, text, error = extract_text_with_tesseract(file_path)
    if success:
        logger.info("✅ Tesseract succeeded!")
        return True, text, None
    else:
        logger.warning(f"⚠️ Tesseract failed: {error}")

    # All local OCR failed
    logger.error("❌ All local OCR methods failed")
    return False, None, "All local OCR methods failed"


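# --- Illustrative sketch: exercising the tiered OCR chain above directly
# from a script. The image path is hypothetical; any PNG/JPEG works. ---
def _demo_local_ocr_fallback(image_path: str = "sample_invoice.png") -> None:
    """Run the tiered OCR chain on one file and print a short preview."""
    ok, text, err = extract_text_with_local_ocr(Path(image_path))
    if ok:
        print(f"Extracted {len(text)} chars: {text[:200]!r}")
    else:
        print(f"All OCR tiers failed: {err}")

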
# ============================================
# STEP 1: HF Agent Text Extraction
# ============================================

def get_agent_headers():
    """Get headers with HF token"""
    token = (
        os.getenv('HF_TOKEN') or
        os.getenv('HUGGINGFACE_API_TOKEN') or
        os.getenv('AGENT_BEARER_TOKEN') or
        ''
    )
    return {'Authorization': f'Bearer {token}'} if token else {}


def get_mime_type(file_path: Path) -> str:
    """Map a file extension to its MIME type."""
    ext = file_path.suffix.lower()
    mime_map = {
        '.pdf': 'application/pdf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png'
    }
    return mime_map.get(ext, 'application/octet-stream')


def call_text_extractor(file_path: Path, max_retries=3):
    """
    HF text extraction with retry logic and progressively longer timeouts.
    Falls back to local OCR if all retries fail.
    """
    url = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))

    for attempt in range(max_retries):
        # Progressive timeout: 120s, 180s, 240s
        timeout = base_timeout + (60 * attempt)

        try:
            logger.info(f"📄 Extracting text from {file_path.name} (attempt {attempt + 1}/{max_retries}, timeout={timeout}s)...")

            filename = file_path.name
            mime_type = get_mime_type(file_path)

            with open(file_path, 'rb') as f:
                files = {'file': (filename, f, mime_type)}
                data = {
                    'filename': filename,
                    'start_page': 1,
                    'end_page': 1
                }
                headers = get_agent_headers()

                response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)

            if response.status_code == 200:
                result = response.json()
                text = result.get('result') or result.get('text') or result.get('extracted_text') or ''

                if text and len(text.strip()) >= 10:
                    logger.info(f"✅ Extracted {len(text)} characters")
                    return True, text, None

                logger.warning("⚠️ No text extracted from response")
                if attempt < max_retries - 1:
                    continue
                return False, None, "No text extracted"

            logger.warning(f"⚠️ HTTP {response.status_code}: {response.text[:200]}")

        except httpx.TimeoutException:
            logger.warning(f"⚠️ Timeout after {timeout}s on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                logger.info("🔄 Retrying with longer timeout...")
                continue
        except Exception as e:
            logger.error(f"❌ Error on attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                logger.info("🔄 Retrying...")
                continue

    # All retries failed - try local OCR fallback
    logger.warning(f"⚠️ All {max_retries} HF extraction attempts failed, trying local OCR fallback...")
    return extract_text_with_local_ocr(file_path)


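# --- Illustrative sketch: the progressive-timeout pattern used above,
# factored into a reusable helper. `do_request` is a hypothetical callable
# taking a timeout in seconds and returning (success, payload); it is not
# part of this module. ---
def _retry_with_progressive_timeout(do_request, base_timeout: int = 120,
                                    step: int = 60, max_retries: int = 3):
    """Retry do_request with timeouts base, base+step, base+2*step, ..."""
    last_error = None
    for attempt in range(max_retries):
        timeout = base_timeout + step * attempt
        try:
            ok, payload = do_request(timeout)
            if ok:
                return True, payload, None
            last_error = "request returned no usable payload"
        except httpx.TimeoutException:
            last_error = f"timeout after {timeout}s"
    return False, None, last_error

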
def call_table_extractor(file_path: Path, max_retries=2):
    """
    HF table extraction with retry logic.
    Non-critical, so fewer retries.
    """
    url = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))

    for attempt in range(max_retries):
        timeout = base_timeout + (60 * attempt)

        try:
            logger.info(f"📊 Extracting tables from {file_path.name} (attempt {attempt + 1}/{max_retries})...")

            filename = file_path.name
            mime_type = get_mime_type(file_path)

            with open(file_path, 'rb') as f:
                files = {'file': (filename, f, mime_type)}
                data = {
                    'filename': filename,
                    'start_page': 1,
                    'end_page': 1
                }
                headers = get_agent_headers()

                response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)

            if response.status_code == 200:
                result = response.json()
                tables = result.get('result') or result.get('tables') or []
                logger.info(f"✅ Extracted {len(tables)} tables")
                return True, tables, None

            logger.warning(f"⚠️ HTTP {response.status_code}")

        except httpx.TimeoutException:
            logger.warning(f"⚠️ Table extraction timeout on attempt {attempt + 1}")
        except Exception as e:
            logger.warning(f"⚠️ Table extraction error: {e}")

    # Non-critical - return empty list
    logger.info("ℹ️ Table extraction failed, continuing without tables")
    return False, [], "Table extraction failed (non-critical)"


# ============================================
# STEP 2: HF NER (Named Entity Recognition)
# ============================================

def call_ner(text: str, file_path: Optional[Path] = None, max_retries=2) -> tuple:
    """
    Extract named entities using the HF NER agent, with retry logic.
    Returns (success, entities, entity_map, error).
    """
    url = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))

    for attempt in range(max_retries):
        timeout = base_timeout + (30 * attempt)

        try:
            logger.info(f"🔍 Running NER to find entities (attempt {attempt + 1}/{max_retries})...")

            headers = get_agent_headers()

            # NER expects multipart/form-data with file OR text
            if file_path and file_path.exists():
                # Send file
                filename = file_path.name
                mime_type = get_mime_type(file_path)

                with open(file_path, 'rb') as f:
                    files = {'file': (filename, f, mime_type)}
                    data = {
                        'text': text[:5000],
                        'filename': filename,
                        'start_page': 1,
                        'end_page': 1
                    }
                    response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
            else:
                # Send just text as form data
                data = {
                    'text': text[:5000],
                    'filename': 'document.txt',
                    'start_page': 1,
                    'end_page': 1
                }
                response = httpx.post(url, data=data, headers=headers, timeout=timeout)

            if response.status_code == 200:
                result = response.json()

                # Handle both dict and string responses
                if isinstance(result, str):
                    try:
                        result = json.loads(result)
                    except json.JSONDecodeError:
                        logger.warning(f"⚠️ NER returned unparseable string: {result[:100]}")
                        if attempt < max_retries - 1:
                            continue
                        return False, [], {}, "Invalid response format"

                # Extract entities
                entities = result.get('entities') or result.get('result') or []

                # Handle case where entities might also be a string
                if isinstance(entities, str):
                    try:
                        entities = json.loads(entities)
                    except json.JSONDecodeError:
                        entities = []

                logger.info(f"✅ Found {len(entities)} entities")

                # Group entities by type
                entity_map = {
                    'PERSON': [],
                    'ORG': [],
                    'DATE': [],
                    'MONEY': [],
                    'CARDINAL': []
                }

                for entity in entities:
                    if not isinstance(entity, dict):
                        continue

                    ent_type = entity.get('entity_type') or entity.get('label')
                    ent_text = entity.get('text') or entity.get('word')

                    if ent_type in entity_map and ent_text:
                        entity_map[ent_type].append(ent_text)

                logger.info(f"📋 Entity summary: PERSON={len(entity_map['PERSON'])}, ORG={len(entity_map['ORG'])}, DATE={len(entity_map['DATE'])}, MONEY={len(entity_map['MONEY'])}")

                return True, entities, entity_map, None

            logger.warning(f"⚠️ NER HTTP {response.status_code}")

        except httpx.TimeoutException:
            logger.warning(f"⚠️ NER timeout on attempt {attempt + 1}")
        except Exception as e:
            logger.error(f"❌ NER error on attempt {attempt + 1}: {e}")

    # NER failed - return empty (non-critical)
    logger.warning("⚠️ NER failed after retries, continuing without entities")
    return False, [], {}, "NER failed (non-critical)"


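# --- Illustrative sketch: how the entity-grouping step above behaves on a
# small hand-written sample payload (the entity values are fabricated). ---
def _demo_entity_grouping() -> dict:
    """Group a sample NER payload by entity type, as call_ner does."""
    sample_entities = [
        {'entity_type': 'ORG', 'text': 'Acme Corp'},
        {'label': 'MONEY', 'word': '$1,250.00'},  # alternate key names also accepted
        {'entity_type': 'DATE', 'text': '01/15/2024'},
    ]
    entity_map = {'PERSON': [], 'ORG': [], 'DATE': [], 'MONEY': [], 'CARDINAL': []}
    for entity in sample_entities:
        ent_type = entity.get('entity_type') or entity.get('label')
        ent_text = entity.get('text') or entity.get('word')
        if ent_type in entity_map and ent_text:
            entity_map[ent_type].append(ent_text)
    # -> ORG=['Acme Corp'], MONEY=['$1,250.00'], DATE=['01/15/2024'], rest empty
    return entity_map

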
# ============================================
# STEP 3: Gemini Intelligent Mapping
# ============================================

def map_with_gemini(text: str, entities: List, entity_map: Dict, tables: List):
    """Use Gemini to intelligently map extracted data to invoice fields."""
    try:
        import google.generativeai as genai

        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            logger.warning("⚠️ No Gemini API key configured")
            return False, None, "No Gemini API key"

        logger.info("🧠 Using Gemini for intelligent field mapping...")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('models/gemini-2.5-flash')

        # Build context for Gemini
        context = f"""
EXTRACTED TEXT:
{text[:3000]}

NAMED ENTITIES FOUND:
- Organizations: {entity_map.get('ORG', [])}
- People: {entity_map.get('PERSON', [])}
- Dates: {entity_map.get('DATE', [])}
- Money amounts: {entity_map.get('MONEY', [])}
- Numbers: {entity_map.get('CARDINAL', [])}

TABLES:
{json.dumps(tables[:2], indent=2) if tables else 'None'}
"""

        prompt = f"""You are an expert at analyzing invoice data. Given the extracted text and entities below, map them to invoice fields.

{context}

Analyze the above data and return ONLY a valid JSON object with these exact fields:

{{
    "customer_name": "the client/customer company name (check ORG entities first)",
    "invoice_number": "the invoice number (check CARDINAL entities)",
    "date": "invoice date in YYYY-MM-DD format (check DATE entities)",
    "total_amount": numeric total amount only (check MONEY entities, no currency symbol),
    "payment_terms": "payment terms like NET30, NET60, or NAH4 if not found",
    "reasoning": "brief explanation of how you identified each field"
}}

Rules:
1. Prefer entities over raw text when available
2. Customer name is usually the first ORG after "Bill To" or "Client"
3. Total amount is usually the largest MONEY value
4. Date should be in YYYY-MM-DD format
5. If uncertain, use these defaults: customer_name="UNKNOWN", date="2024-01-01", total_amount=0.0, payment_terms="NAH4"

Return ONLY the JSON object, no markdown, no explanation outside the JSON."""

        response = model.generate_content(prompt)
        text_response = response.text.strip()

        # Strip markdown code fences if present
        text_response = text_response.replace('```json', '').replace('```', '').strip()

        result = json.loads(text_response)

        logger.info(f"✅ Gemini mapped: Customer={result.get('customer_name')}, Amount=${result.get('total_amount')}")
        logger.info(f"💡 Reasoning: {result.get('reasoning', 'N/A')[:100]}")

        return True, result, None

    except json.JSONDecodeError as e:
        logger.error(f"❌ Gemini returned invalid JSON: {e}")
        logger.error(f"Response: {text_response[:500]}")
        return False, None, f"Invalid JSON: {e}"
    except Exception as e:
        logger.error(f"❌ Gemini mapping failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False, None, str(e)


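# --- Illustrative sketch: the fence-stripping plus JSON-parsing step above,
# run on a fabricated Gemini-style reply. ---
def _demo_parse_gemini_reply() -> dict:
    """Parse a sample fenced JSON reply the way map_with_gemini does."""
    raw = '```json\n{"customer_name": "Acme Corp", "total_amount": 1250.0}\n```'
    cleaned = raw.replace('```json', '').replace('```', '').strip()
    return json.loads(cleaned)  # {'customer_name': 'Acme Corp', 'total_amount': 1250.0}

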
# ============================================
# Fallback: Regex Mapping
# ============================================

def map_with_regex(text: str, entities: List) -> tuple:
    """Fallback regex-based field extraction. Returns (fields, confidence) dicts."""
    logger.info("🔤 Using regex fallback for field mapping...")

    fields = {}
    confidence = {}

    # CUSTOMER NAME - try to use ORG entities first
    org_entities = [e.get('text') or e.get('word') for e in entities
                    if (e.get('entity_type') or e.get('label')) == 'ORG']

    if org_entities:
        fields['cust_number'] = org_entities[0][:20]
        confidence['cust_number'] = 0.8
    else:
        # Regex fallback
        client_patterns = [
            r'(?:Client|Bill\s+To|Customer)[:\s]+(.*?)(?:\n|Tax|IBAN)',
            r'(?:customer|client)[\s:]+([A-Za-z][A-Za-z\s,&-]+?)(?:\n|$)',
        ]

        for pattern in client_patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                client = match.group(1).strip()
                words = [w.strip() for w in client.replace(',', ' ').split() if len(w.strip()) > 2]
                if words:
                    fields['cust_number'] = words[0][:20]
                    confidence['cust_number'] = 0.6
                    break

    if 'cust_number' not in fields:
        fields['cust_number'] = 'UNKNOWN'
        confidence['cust_number'] = 0.1

    # DATE - try DATE entities first
    date_entities = [e.get('text') or e.get('word') for e in entities
                     if (e.get('entity_type') or e.get('label')) == 'DATE']

    if date_entities:
        date_str = date_entities[0]
        for fmt in ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d', '%m-%d-%Y']:
            try:
                dt = datetime.strptime(date_str, fmt)
                fields['posting_date'] = dt.strftime('%Y-%m-%d')
                confidence['posting_date'] = 0.8
                break
            except ValueError:
                continue

    if 'posting_date' not in fields:
        date_patterns = [
            r'(?:Date\s+of\s+issue|Invoice\s+Date|Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
        ]

        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                date_str = match.group(1)
                for fmt in ['%m/%d/%Y', '%d/%m/%Y']:
                    try:
                        dt = datetime.strptime(date_str, fmt)
                        fields['posting_date'] = dt.strftime('%Y-%m-%d')
                        confidence['posting_date'] = 0.7
                        break
                    except ValueError:
                        continue
            if 'posting_date' in fields:
                break

    if 'posting_date' not in fields:
        fields['posting_date'] = datetime.now().strftime('%Y-%m-%d')
        confidence['posting_date'] = 0.1

    # AMOUNT - try MONEY entities first
    money_entities = [e.get('text') or e.get('word') for e in entities
                      if (e.get('entity_type') or e.get('label')) == 'MONEY']

    if money_entities:
        amounts = []
        for money_str in money_entities:
            try:
                # Remove currency symbols and parse
                amt_str = re.sub(r'[^\d.]', '', money_str)
                amt = float(amt_str)
                if amt > 10:
                    amounts.append(amt)
            except ValueError:
                pass

        if amounts:
            fields['total_open_amount'] = max(amounts)
            confidence['total_open_amount'] = 0.8
            logger.info(f"✅ Found amount from MONEY entity: ${fields['total_open_amount']}")

    if 'total_open_amount' not in fields:
        # Regex fallback
        pattern = r'\$\s*([0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2})'
        amounts = []
        for match in re.finditer(pattern, text):
            try:
                amt = float(match.group(1).replace(',', ''))
                if amt > 50:
                    amounts.append(amt)
            except ValueError:
                pass

        if amounts:
            fields['total_open_amount'] = max(amounts)
            confidence['total_open_amount'] = 0.6
        else:
            fields['total_open_amount'] = 0.0
            confidence['total_open_amount'] = 0.0
            logger.warning("⚠️ No amount found!")

    # PAYMENT TERMS
    terms_match = re.search(r'(NET\s?\d{1,2}|N\d{2}|NAH\d)', text, re.IGNORECASE)
    fields['cust_payment_terms'] = terms_match.group(1).upper() if terms_match else 'NAH4'
    confidence['cust_payment_terms'] = 0.7 if terms_match else 0.2

    # BUSINESS CODE
    fields['business_code'] = 'U001'
    confidence['business_code'] = 0.2

    return fields, confidence


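# --- Illustrative sketch: map_with_regex run on a tiny fabricated invoice
# snippet with no NER entities, so every regex branch is exercised. ---
def _demo_regex_mapping() -> tuple:
    """Exercise the regex fallback on a synthetic invoice text."""
    sample_text = (
        "Bill To: Acme Corporation\n"
        "Invoice Date: 01/15/2024\n"
        "Payment terms: NET30\n"
        "Total due: $1,250.00\n"
    )
    fields, confidence = map_with_regex(sample_text, entities=[])
    # Expect cust_number='Acme', posting_date='2024-01-15',
    # total_open_amount=1250.0, cust_payment_terms='NET30'
    return fields, confidence

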
# ============================================
# Database Functions
# ============================================

def update_job_status(job_id: str, status: str, error_text: str = None):
    """Update a job's status (and optional error text) in ingest_jobs."""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()
        cursor.execute("""
            UPDATE ingest_jobs
            SET status = ?, error_text = ?, updated_at = CURRENT_TIMESTAMP
            WHERE job_id = ?
        """, (status, error_text, job_id))
        conn.commit()
        conn.close()


def save_extraction(doc_id: str, raw_text: str, tables: list, entities: list, classification: dict, summary: str = None):
    """Save extraction results"""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()
        cursor.execute("""
            INSERT OR REPLACE INTO extractions (
                doc_id, raw_text, tables_json, entities_json,
                classification_json, summary_text
            ) VALUES (?, ?, ?, ?, ?, ?)
        """, (
            doc_id,
            raw_text,
            json.dumps(tables) if tables else None,
            json.dumps(entities) if entities else None,
            json.dumps(classification) if classification else None,
            summary
        ))
        conn.commit()
        conn.close()


def save_invoice_fields(doc_id: str, fields: Dict, confidence_map: Dict):
    """Save invoice fields"""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()
        cursor.execute("""
            INSERT INTO invoice_fields (
                doc_id, cust_number, posting_date, total_open_amount,
                business_code, cust_payment_terms, confidence_map
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (
            doc_id,
            fields.get('cust_number'),
            fields.get('posting_date'),
            fields.get('total_open_amount'),
            fields.get('business_code'),
            fields.get('cust_payment_terms'),
            json.dumps(confidence_map)
        ))
        conn.commit()
        conn.close()


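# --- Illustrative sketch: reading back a saved row under the same file
# lock, using the columns the writers above populate. ---
def _demo_read_invoice_fields(doc_id: str) -> dict:
    """Fetch one invoice_fields row as a dict (empty dict if missing)."""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT * FROM invoice_fields WHERE doc_id = ?", (doc_id,)
        ).fetchone()
        conn.close()
    return dict(row) if row else {}

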
# ============================================
# AGENT MODE FLAG (Environment Variable)
# ============================================

USE_AGENT_MODE = os.getenv('USE_AGENT_MODE', 'true').lower() == 'true'


# ============================================
# Main Processing Pipeline
# ============================================

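# Usage note: USE_AGENT_MODE is read once at import time, so it must be set
# before the server starts; a hypothetical launch command:
#
#   USE_AGENT_MODE=false python app.py
#
# Setting it after startup has no effect without a restart.
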
def process_document_legacy(job_id: str, doc_id: str, file_path: Path):
    """
    LEGACY PIPELINE (original implementation):
    1. HF extracts text + tables
    2. HF NER finds entities
    3. Gemini maps entities to invoice fields
    """
    logger.info("=" * 70)
    logger.info(f"🚀 Starting LEGACY pipeline for {file_path.name}")
    logger.info("=" * 70)

    try:
        update_job_status(job_id, 'processing')

        # STEP 1: Extract text with HF agents
        logger.info("STEP 1: HF TEXT + TABLE EXTRACTION")
        logger.info("-" * 70)

        success, raw_text, error = call_text_extractor(file_path)
        if not success or not raw_text:
            update_job_status(job_id, 'failed', f"Text extraction failed: {error}")
            return

        # Extract tables (optional, won't fail the job if it doesn't work)
        _, tables, _ = call_table_extractor(file_path)

        # STEP 2: NER to find entities
        logger.info("-" * 70)
        logger.info("STEP 2: NER - NAMED ENTITY RECOGNITION")
        logger.info("-" * 70)

        ner_success, entities, entity_map, ner_error = call_ner(raw_text, file_path)

        if not ner_success:
            logger.warning(f"⚠️ NER failed: {ner_error}, continuing without entities")
            entities = []
            entity_map = {}

        # STEP 3: Gemini intelligent mapping
        logger.info("-" * 70)
        logger.info("STEP 3: GEMINI INTELLIGENT MAPPING")
        logger.info("-" * 70)

        gemini_success, gemini_result, gemini_error = map_with_gemini(
            raw_text, entities, entity_map, tables
        )

        if gemini_success and gemini_result:
            # Use Gemini's mapping
            fields = {
                'cust_number': gemini_result.get('customer_name', 'UNKNOWN')[:20],
                'posting_date': gemini_result.get('date', datetime.now().strftime('%Y-%m-%d')),
                'total_open_amount': float(gemini_result.get('total_amount', 0.0)),
                'business_code': 'U001',
                'cust_payment_terms': gemini_result.get('payment_terms', 'NAH4')[:10]
            }

            confidence_map = {
                'cust_number': 0.95,
                'posting_date': 0.95,
                'total_open_amount': 0.95,
                'business_code': 0.2,
                'cust_payment_terms': 0.8
            }

            method = 'hf_ner_gemini'

        else:
            # Fallback to regex mapping
            logger.warning(f"⚠️ Gemini mapping failed: {gemini_error}")
            logger.info("-" * 70)
            logger.info("FALLBACK: REGEX MAPPING")
            logger.info("-" * 70)

            fields, confidence_map = map_with_regex(raw_text, entities)
            method = 'hf_ner_regex'

        # Save results
        save_extraction(
            doc_id, raw_text, tables, entities,
            {'method': method, 'entity_count': len(entities)},
            None
        )
        save_invoice_fields(doc_id, fields, confidence_map)

        logger.info("=" * 70)
        logger.info(f"✅ EXTRACTION COMPLETE - Method: {method}")
        logger.info(f"📋 Fields: {fields}")
        logger.info("=" * 70)

        # Prediction call intentionally disabled in the legacy pipeline
        # (the agent pipeline calls PREDICT_ENDPOINT instead):
        # logger.info("🔮 Calling payment prediction...")
        # try:
        #     pred_response = httpx.post(PREDICT_ENDPOINT, json=fields, timeout=30)
        #     if pred_response.status_code == 200:
        #         pred_result = pred_response.json()
        #         logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
        # except Exception as e:
        #     logger.error(f"⚠️ Prediction failed: {e}")

        update_job_status(job_id, 'completed')
        logger.info(f"🎉 Job {job_id} completed successfully")

    except Exception as e:
        logger.error(f"❌ Job {job_id} failed: {e}")
        import traceback
        traceback.print_exc()
        update_job_status(job_id, 'failed', str(e))


def process_document_agent(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
    """
    NEW autonomous agent pipeline, with an optional Gemini wrapper that
    filters the extraction down to what the user asked for.
    """
    try:
        # Normalize user_message (form data may deliver literal 'None'/'null')
        if user_message in [None, 'None', '', 'null', 'undefined']:
            user_message = None
        else:
            user_message = str(user_message).strip()
            if not user_message:
                user_message = None

        logger.info("=" * 70)
        logger.info(f"🔍 AGENT - Processing with message: '{user_message}'")
        logger.info(f"🔍 Type: {type(user_message)}")
        logger.info(f"🔍 Is None: {user_message is None}")
        logger.info("=" * 70)

        from backend.app.agent.agent_orchestrator import (
            InvoiceAgent, AgentState, create_agent
        )

        logger.info("=" * 70)
        logger.info(f"🤖 AUTONOMOUS AGENT MODE for {file_path.name}")
        logger.info("=" * 70)

        update_job_status(job_id, 'processing')

        # Create agent
        agent = create_agent(
            call_text_extractor,
            call_table_extractor,
            call_ner,
            map_with_gemini
        )

        # Initialize state
        state = AgentState(doc_id=doc_id, file_path=file_path)

        # Let the agent autonomously decide and execute
        result_state = agent.process(state)

        # ============================================
        # WRAPPER INTEGRATION
        # ============================================

        full_extraction = result_state.fields
        final_result = full_extraction
        wrapper_used = False

        # Only activate the wrapper when a user message was actually provided
        if user_message is not None and len(user_message) > 0:
            logger.info("=" * 70)
            logger.info(f"💬 USER MESSAGE DETECTED: '{user_message}'")
            logger.info("🎯 Activating Gemini wrapper to filter output...")
            logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
            logger.info("=" * 70)

            try:
                from backend.app.wrappers.gemini_output_filter import GeminiOutputFilter

                wrapper = GeminiOutputFilter()
                final_result = wrapper.filter_output(user_message, full_extraction)
                wrapper_used = True

                logger.info("=" * 70)
                logger.info("✅ WRAPPER SUCCESS!")
                logger.info(f"📤 Original fields: {list(full_extraction.keys())}")
                logger.info(f"🎯 Filtered fields: {list(final_result.keys())}")
                logger.info(f"📋 Filtered result: {json.dumps(final_result, indent=2)}")
                logger.info("=" * 70)

            except Exception as wrapper_error:
                logger.error("=" * 70)
                logger.error(f"❌ WRAPPER FAILED: {wrapper_error}")
                logger.error("=" * 70)
                import traceback
                logger.error(traceback.format_exc())
                logger.warning("📦 Falling back to full extraction")
                final_result = full_extraction
                wrapper_used = False
        else:
            logger.info("=" * 70)
            logger.info("ℹ️ No user message provided - returning full extraction")
            logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
            logger.info("=" * 70)

        # ============================================
        # Save results
        # ============================================

        if result_state.fields:
            # Determine method
            if 'use_gemini' in result_state.history:
                method = 'autonomous_agent_gemini'
            elif 'use_regex' in result_state.history:
                method = 'autonomous_agent_regex'
            else:
                method = 'autonomous_agent'

            if wrapper_used:
                method += '_with_wrapper'

            save_extraction(
                doc_id,
                result_state.raw_text or '',
                result_state.tables or [],
                result_state.entities or [],
                {
                    'method': method,
                    'attempts': result_state.attempts,
                    'actions': result_state.history,
                    'confidence': agent._calculate_overall_confidence(result_state),
                    'errors': result_state.errors,
                    'user_message': user_message,
                    'wrapper_used': wrapper_used,
                    'full_extraction_keys': list(full_extraction.keys()) if full_extraction else [],
                    'filtered_keys': list(final_result.keys()) if wrapper_used else None
                },
                None
            )

            # Save the (possibly filtered) result
            save_invoice_fields(
                doc_id,
                final_result,
                result_state.confidence_map or {}
            )

            # Call prediction
            logger.info("🔮 Calling payment prediction...")
            try:
                pred_response = httpx.post(PREDICT_ENDPOINT, json=final_result, timeout=30)

                if pred_response.status_code == 200:
                    pred_result = pred_response.json()
                    logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
            except Exception as e:
                logger.error(f"⚠️ Prediction failed: {e}")

            # Check status
            from backend.app.agent.agent_orchestrator import AgentDecision
            if AgentDecision.HUMAN_REVIEW.value in result_state.history:
                update_job_status(job_id, 'needs_review')
                logger.info("👤 Agent requesting human review")
            else:
                update_job_status(job_id, 'completed')
                logger.info(f"✅ Agent completed with confidence: {agent._calculate_overall_confidence(result_state):.2f}")
        else:
            update_job_status(job_id, 'failed', 'Agent could not extract fields')
            logger.error("❌ Agent failed to extract any fields")

    except ImportError as e:
        logger.error(f"❌ Agent module not found: {e}")
        logger.info("⚠️ Falling back to legacy pipeline...")
        process_document_legacy(job_id, doc_id, file_path)
    except Exception as e:
        logger.error(f"❌ Agent failed: {e}")
        import traceback
        traceback.print_exc()
        update_job_status(job_id, 'failed', str(e))


def process_document(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
    """
    Main entry point - routes to the agent or the legacy pipeline.
    """
    # Normalize user_message
    if user_message in [None, 'None', '', 'null', 'undefined']:
        user_message = None
    else:
        user_message = str(user_message).strip()
        if not user_message:
            user_message = None

    logger.info("=" * 70)
    logger.info(f"🔍 PROCESS_DOCUMENT - Cleaned user_message: '{user_message}'")
    logger.info(f"🔍 Type: {type(user_message)}")
    logger.info(f"🔍 Is None: {user_message is None}")
    logger.info("=" * 70)

    if USE_AGENT_MODE:
        logger.info("🤖 Using AUTONOMOUS AGENT mode")
        process_document_agent(job_id, doc_id, file_path, user_message=user_message)
    else:
        logger.info("📋 Using LEGACY pipeline mode")
        process_document_legacy(job_id, doc_id, file_path)


# ============================================
# API Endpoints
# ============================================

class IngestResponse(BaseModel):
    job_id: str
    doc_id: str
    filename: str
    status: str
    message: str


class JobStatusResponse(BaseModel):
    job_id: str
    doc_id: str
    filename: str
    status: str
    error_text: Optional[str] = None
    created_at: str
    updated_at: str
    extraction: Optional[Dict] = None
    invoice_fields: Optional[Dict] = None


class BatchIngestResponse(BaseModel):
    batch_id: str
    total_files: int
    jobs: List[Dict[str, str]]
    message: str


class BatchStatusResponse(BaseModel):
    batch_id: str
    total_files: int
    completed: int
    processing: int
    failed: int
    queued: int
    needs_review: int = 0  # counted by get_batch_status; included so counts can sum to total_files
    jobs: List[Dict[str, Any]]


@router.post("/ingest", response_model=IngestResponse)
|
| 1021 |
+
async def ingest_document(
|
| 1022 |
+
background_tasks: BackgroundTasks,
|
| 1023 |
+
file: UploadFile = File(...),
|
| 1024 |
+
message: str = Form(None) # CHANGED: Use Form(None) instead of Optional[str] = None
|
| 1025 |
+
):
|
| 1026 |
+
|
| 1027 |
+
# Clean message parameter
|
| 1028 |
+
cleaned_message = None
|
| 1029 |
+
if message and message not in ['None', 'null', '', 'undefined']:
|
| 1030 |
+
cleaned_message = message.strip()
|
| 1031 |
+
if not cleaned_message:
|
| 1032 |
+
cleaned_message = None
|
| 1033 |
+
|
| 1034 |
+
logger.info("=" * 70)
|
| 1035 |
+
logger.info(f"📨 API ENDPOINT - Raw message: '{message}'")
|
| 1036 |
+
logger.info(f"✨ Cleaned message: '{cleaned_message}'")
|
| 1037 |
+
logger.info(f"🔍 Message type: {type(cleaned_message)}")
|
| 1038 |
+
logger.info(f"❓ Is None: {cleaned_message is None}")
|
| 1039 |
+
logger.info("=" * 70)
|
| 1040 |
+
|
| 1041 |
+
try:
|
| 1042 |
+
allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
|
| 1043 |
+
if file.content_type not in allowed_types:
|
| 1044 |
+
raise HTTPException(400, f"Invalid file type: {file.content_type}")
|
| 1045 |
+
|
| 1046 |
+
job_id = f"job_{uuid.uuid4().hex[:12]}"
|
| 1047 |
+
doc_id = f"doc_{uuid.uuid4().hex[:12]}"
|
| 1048 |
+
file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
|
| 1049 |
+
|
| 1050 |
+
stored_filename = f"{doc_id}.{file_ext}"
|
| 1051 |
+
file_path = STORAGE_PATH / stored_filename
|
| 1052 |
+
|
| 1053 |
+
content = await file.read()
|
| 1054 |
+
with open(file_path, 'wb') as f:
|
| 1055 |
+
f.write(content)
|
| 1056 |
+
|
| 1057 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1058 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1059 |
+
cursor = conn.cursor()
|
| 1060 |
+
|
| 1061 |
+
cursor.execute("""
|
| 1062 |
+
INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
|
| 1063 |
+
VALUES (?, ?, ?, 'queued')
|
| 1064 |
+
""", (job_id, doc_id, file.filename))
|
| 1065 |
+
|
| 1066 |
+
cursor.execute("""
|
| 1067 |
+
INSERT INTO documents (doc_id, job_id, path, filename, content_type)
|
| 1068 |
+
VALUES (?, ?, ?, ?, ?)
|
| 1069 |
+
""", (doc_id, job_id, str(file_path), file.filename, file.content_type))
|
| 1070 |
+
|
| 1071 |
+
conn.commit()
|
| 1072 |
+
conn.close()
|
| 1073 |
+
|
| 1074 |
+
# Start processing with cleaned message
|
| 1075 |
+
background_tasks.add_task(
|
| 1076 |
+
process_document,
|
| 1077 |
+
job_id,
|
| 1078 |
+
doc_id,
|
| 1079 |
+
file_path,
|
| 1080 |
+
user_message=cleaned_message # Pass cleaned message
|
| 1081 |
+
)
|
| 1082 |
+
|
| 1083 |
+
logger.info(f"🚀 Background task started with message: '{cleaned_message}'")
|
| 1084 |
+
|
| 1085 |
+
mode = "autonomous agent"
|
| 1086 |
+
if cleaned_message:
|
| 1087 |
+
mode += f" with intelligent filtering"
|
| 1088 |
+
logger.info(f"🎯 User wants: '{cleaned_message}'")
|
| 1089 |
+
|
| 1090 |
+
return IngestResponse(
|
| 1091 |
+
job_id=job_id,
|
| 1092 |
+
doc_id=doc_id,
|
| 1093 |
+
filename=file.filename,
|
| 1094 |
+
status='queued',
|
| 1095 |
+
message=f'Document uploaded. Processing with {mode}.'
|
| 1096 |
+
)
|
| 1097 |
+
|
| 1098 |
+
except HTTPException:
|
| 1099 |
+
raise
|
| 1100 |
+
except Exception as e:
|
| 1101 |
+
logger.error(f"❌ Ingest endpoint error: {e}")
|
| 1102 |
+
import traceback
|
| 1103 |
+
logger.error(traceback.format_exc())
|
| 1104 |
+
raise HTTPException(500, str(e))
|
| 1105 |
+
|
| 1106 |
+
|
| 1107 |
+
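# --- Illustrative client call: uploading one file to POST /api/ingest with
# httpx; the file path and the message text are hypothetical. ---
def _demo_single_ingest(path: str = "sample_invoice.pdf") -> dict:
    """Upload one document with an optional filtering message."""
    p = Path(path)
    with open(p, 'rb') as fh:
        resp = httpx.post(
            'http://localhost:7860/api/ingest',
            files={'file': (p.name, fh, get_mime_type(p))},
            data={'message': 'extract only total and date'},
            timeout=60,
        )
    resp.raise_for_status()
    return resp.json()  # contains job_id / doc_id to poll via GET /api/ingest/{job_id}

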
@router.get("/ingest/{job_id}", response_model=JobStatusResponse)
|
| 1108 |
+
def get_ingest_status(job_id: str):
|
| 1109 |
+
"""Get job status with agent decision history (if applicable)"""
|
| 1110 |
+
try:
|
| 1111 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1112 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1113 |
+
conn.row_factory = sqlite3.Row
|
| 1114 |
+
cursor = conn.cursor()
|
| 1115 |
+
|
| 1116 |
+
cursor.execute("SELECT * FROM ingest_jobs WHERE job_id = ?", (job_id,))
|
| 1117 |
+
job = cursor.fetchone()
|
| 1118 |
+
if not job:
|
| 1119 |
+
conn.close()
|
| 1120 |
+
raise HTTPException(404, "Job not found")
|
| 1121 |
+
|
| 1122 |
+
job_data = dict(job)
|
| 1123 |
+
doc_id = job_data['doc_id']
|
| 1124 |
+
|
| 1125 |
+
if job_data['status'] in ['completed', 'needs_review']:
|
| 1126 |
+
cursor.execute("SELECT * FROM extractions WHERE doc_id = ?", (doc_id,))
|
| 1127 |
+
extraction = cursor.fetchone()
|
| 1128 |
+
if extraction:
|
| 1129 |
+
ext_dict = dict(extraction)
|
| 1130 |
+
if ext_dict.get('raw_text'):
|
| 1131 |
+
ext_dict['raw_text'] = ext_dict['raw_text'][:500] + "..."
|
| 1132 |
+
job_data['extraction'] = ext_dict
|
| 1133 |
+
|
| 1134 |
+
cursor.execute("SELECT * FROM invoice_fields WHERE doc_id = ?", (doc_id,))
|
| 1135 |
+
invoice = cursor.fetchone()
|
| 1136 |
+
if invoice:
|
| 1137 |
+
inv_dict = dict(invoice)
|
| 1138 |
+
if inv_dict.get('confidence_map'):
|
| 1139 |
+
inv_dict['confidence_map'] = json.loads(inv_dict['confidence_map'])
|
| 1140 |
+
job_data['invoice_fields'] = inv_dict
|
| 1141 |
+
|
| 1142 |
+
conn.close()
|
| 1143 |
+
return JobStatusResponse(**job_data)
|
| 1144 |
+
|
| 1145 |
+
except HTTPException:
|
| 1146 |
+
raise
|
| 1147 |
+
except Exception as e:
|
| 1148 |
+
logger.error(f"❌ Job status error: {e}")
|
| 1149 |
+
raise HTTPException(500, str(e))
|
| 1150 |
+
|
| 1151 |
+
|
| 1152 |
+
@router.post("/ingest/batch", response_model=BatchIngestResponse)
|
| 1153 |
+
async def ingest_batch_documents(
|
| 1154 |
+
background_tasks: BackgroundTasks,
|
| 1155 |
+
files: List[UploadFile] = File(...),
|
| 1156 |
+
message: str = Form(None)
|
| 1157 |
+
):
|
| 1158 |
+
"""
|
| 1159 |
+
Upload multiple documents for batch processing.
|
| 1160 |
+
|
| 1161 |
+
Examples:
|
| 1162 |
+
1. Batch upload without filtering:
|
| 1163 |
+
curl -F "files=@invoice1.jpg" -F "files=@invoice2.pdf" -F "files=@invoice3.png" \
|
| 1164 |
+
http://localhost:7860/api/ingest/batch
|
| 1165 |
+
|
| 1166 |
+
2. Batch upload with same extraction rule for all:
|
| 1167 |
+
curl -F "files=@invoice1.jpg" -F "files=@invoice2.jpg" \
|
| 1168 |
+
-F "message=extract only total and date" \
|
| 1169 |
+
http://localhost:7860/api/ingest/batch
|
| 1170 |
+
|
| 1171 |
+
3. Maximum 50 files per batch
|
| 1172 |
+
"""
|
| 1173 |
+
# Validate batch size
|
| 1174 |
+
if len(files) > 50:
|
| 1175 |
+
raise HTTPException(400, "Maximum 50 files per batch")
|
| 1176 |
+
|
| 1177 |
+
if len(files) == 0:
|
| 1178 |
+
raise HTTPException(400, "No files provided")
|
| 1179 |
+
|
| 1180 |
+
# Clean message
|
| 1181 |
+
cleaned_message = None
|
| 1182 |
+
if message and message not in ['None', 'null', '', 'undefined']:
|
| 1183 |
+
cleaned_message = message.strip()
|
| 1184 |
+
if not cleaned_message:
|
| 1185 |
+
cleaned_message = None
|
| 1186 |
+
|
| 1187 |
+
batch_id = f"batch_{uuid.uuid4().hex[:12]}"
|
| 1188 |
+
jobs = []
|
| 1189 |
+
|
| 1190 |
+
logger.info("=" * 70)
|
| 1191 |
+
logger.info(f"📦 BATCH UPLOAD - {len(files)} files")
|
| 1192 |
+
logger.info(f"📦 Batch ID: {batch_id}")
|
| 1193 |
+
logger.info(f"📦 Message: '{cleaned_message}'")
|
| 1194 |
+
logger.info("=" * 70)
|
| 1195 |
+
|
| 1196 |
+
try:
|
| 1197 |
+
allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
|
| 1198 |
+
|
| 1199 |
+
for idx, file in enumerate(files):
|
| 1200 |
+
# Validate each file
|
| 1201 |
+
if file.content_type not in allowed_types:
|
| 1202 |
+
logger.warning(f"⚠️ Skipping {file.filename} - invalid type: {file.content_type}")
|
| 1203 |
+
continue
|
| 1204 |
+
|
| 1205 |
+
# Create job for this file
|
| 1206 |
+
job_id = f"job_{uuid.uuid4().hex[:12]}"
|
| 1207 |
+
doc_id = f"doc_{uuid.uuid4().hex[:12]}"
|
| 1208 |
+
file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
|
| 1209 |
+
|
| 1210 |
+
stored_filename = f"{doc_id}.{file_ext}"
|
| 1211 |
+
file_path = STORAGE_PATH / stored_filename
|
| 1212 |
+
|
| 1213 |
+
# Save file
|
| 1214 |
+
content = await file.read()
|
| 1215 |
+
with open(file_path, 'wb') as f:
|
| 1216 |
+
f.write(content)
|
| 1217 |
+
|
| 1218 |
+
# Save to database
|
| 1219 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1220 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1221 |
+
cursor = conn.cursor()
|
| 1222 |
+
|
| 1223 |
+
cursor.execute("""
|
| 1224 |
+
INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
|
| 1225 |
+
VALUES (?, ?, ?, 'queued')
|
| 1226 |
+
""", (job_id, doc_id, file.filename))
|
| 1227 |
+
|
| 1228 |
+
cursor.execute("""
|
| 1229 |
+
INSERT INTO documents (doc_id, job_id, path, filename, content_type)
|
| 1230 |
+
VALUES (?, ?, ?, ?, ?)
|
| 1231 |
+
""", (doc_id, job_id, str(file_path), file.filename, file.content_type))
|
| 1232 |
+
|
| 1233 |
+
conn.commit()
|
| 1234 |
+
conn.close()
|
| 1235 |
+
|
| 1236 |
+
# Queue processing
|
| 1237 |
+
background_tasks.add_task(
|
| 1238 |
+
process_document,
|
| 1239 |
+
job_id,
|
| 1240 |
+
doc_id,
|
| 1241 |
+
file_path,
|
| 1242 |
+
user_message=cleaned_message
|
| 1243 |
+
)
|
| 1244 |
+
|
| 1245 |
+
jobs.append({
|
| 1246 |
+
'job_id': job_id,
|
| 1247 |
+
'doc_id': doc_id,
|
| 1248 |
+
'filename': file.filename,
|
| 1249 |
+
'status': 'queued'
|
| 1250 |
+
})
|
| 1251 |
+
|
| 1252 |
+
logger.info(f"✅ [{idx+1}/{len(files)}] Queued: {file.filename}")
|
| 1253 |
+
|
| 1254 |
+
if not jobs:
|
| 1255 |
+
raise HTTPException(400, "No valid files to process")
|
| 1256 |
+
|
| 1257 |
+
# Save batch metadata
|
| 1258 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1259 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1260 |
+
cursor = conn.cursor()
|
| 1261 |
+
|
| 1262 |
+
# Create batch_jobs table if it doesn't exist
|
| 1263 |
+
cursor.execute("""
|
| 1264 |
+
CREATE TABLE IF NOT EXISTS batch_jobs (
|
| 1265 |
+
batch_id TEXT PRIMARY KEY,
|
| 1266 |
+
total_files INTEGER,
|
| 1267 |
+
message TEXT,
|
| 1268 |
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 1269 |
+
)
|
| 1270 |
+
""")
|
| 1271 |
+
|
| 1272 |
+
cursor.execute("""
|
| 1273 |
+
INSERT INTO batch_jobs (batch_id, total_files, message)
|
| 1274 |
+
VALUES (?, ?, ?)
|
| 1275 |
+
""", (batch_id, len(jobs), cleaned_message))
|
| 1276 |
+
|
| 1277 |
+
# Link jobs to batch
|
| 1278 |
+
cursor.execute("""
|
| 1279 |
+
CREATE TABLE IF NOT EXISTS batch_job_mapping (
|
| 1280 |
+
batch_id TEXT,
|
| 1281 |
+
job_id TEXT,
|
| 1282 |
+
FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
|
| 1283 |
+
)
|
| 1284 |
+
""")
|
| 1285 |
+
|
| 1286 |
+
for job in jobs:
|
| 1287 |
+
cursor.execute("""
|
| 1288 |
+
INSERT INTO batch_job_mapping (batch_id, job_id)
|
| 1289 |
+
VALUES (?, ?)
|
| 1290 |
+
""", (batch_id, job['job_id']))
|
| 1291 |
+
|
| 1292 |
+
conn.commit()
|
| 1293 |
+
conn.close()
|
| 1294 |
+
|
| 1295 |
+
mode = "autonomous agent"
|
| 1296 |
+
if cleaned_message:
|
| 1297 |
+
mode += " with intelligent filtering"
|
| 1298 |
+
|
| 1299 |
+
logger.info(f"🚀 Batch {batch_id} processing started with {len(jobs)} files")
|
| 1300 |
+
|
| 1301 |
+
return BatchIngestResponse(
|
| 1302 |
+
batch_id=batch_id,
|
| 1303 |
+
total_files=len(jobs),
|
| 1304 |
+
jobs=jobs,
|
| 1305 |
+
message=f'Batch of {len(jobs)} documents uploaded. Processing with {mode}.'
|
| 1306 |
+
)
|
| 1307 |
+
|
| 1308 |
+
except HTTPException:
|
| 1309 |
+
raise
|
| 1310 |
+
except Exception as e:
|
| 1311 |
+
logger.error(f"❌ Batch ingest error: {e}")
|
| 1312 |
+
import traceback
|
| 1313 |
+
logger.error(traceback.format_exc())
|
| 1314 |
+
raise HTTPException(500, str(e))
|
| 1315 |
+
|
| 1316 |
+
|
| 1317 |
+
@router.get("/ingest/batch/{batch_id}", response_model=BatchStatusResponse)
|
| 1318 |
+
def get_batch_status(batch_id: str):
|
| 1319 |
+
"""
|
| 1320 |
+
Get status of all jobs in a batch.
|
| 1321 |
+
|
| 1322 |
+
Example:
|
| 1323 |
+
curl http://localhost:7860/api/ingest/batch/batch_abc123
|
| 1324 |
+
"""
|
| 1325 |
+
try:
|
| 1326 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1327 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1328 |
+
conn.row_factory = sqlite3.Row
|
| 1329 |
+
cursor = conn.cursor()
|
| 1330 |
+
|
| 1331 |
+
# Get batch info
|
| 1332 |
+
cursor.execute("SELECT * FROM batch_jobs WHERE batch_id = ?", (batch_id,))
|
| 1333 |
+
batch = cursor.fetchone()
|
| 1334 |
+
if not batch:
|
| 1335 |
+
conn.close()
|
| 1336 |
+
raise HTTPException(404, "Batch not found")
|
| 1337 |
+
|
| 1338 |
+
# Get all jobs in batch
|
| 1339 |
+
cursor.execute("""
|
| 1340 |
+
SELECT j.* FROM ingest_jobs j
|
| 1341 |
+
JOIN batch_job_mapping bm ON j.job_id = bm.job_id
|
| 1342 |
+
WHERE bm.batch_id = ?
|
| 1343 |
+
""", (batch_id,))
|
| 1344 |
+
|
| 1345 |
+
jobs = cursor.fetchall()
|
| 1346 |
+
conn.close()
|
| 1347 |
+
|
| 1348 |
+
# Count statuses
|
| 1349 |
+
status_counts = {
|
| 1350 |
+
'completed': 0,
|
| 1351 |
+
'processing': 0,
|
| 1352 |
+
'failed': 0,
|
| 1353 |
+
'queued': 0,
|
| 1354 |
+
'needs_review': 0
|
| 1355 |
+
}
|
| 1356 |
+
|
| 1357 |
+
jobs_list = []
|
| 1358 |
+
for job in jobs:
|
| 1359 |
+
job_dict = dict(job)
|
| 1360 |
+
status = job_dict['status']
|
| 1361 |
+
status_counts[status] = status_counts.get(status, 0) + 1
|
| 1362 |
+
|
| 1363 |
+
jobs_list.append({
|
| 1364 |
+
'job_id': job_dict['job_id'],
|
| 1365 |
+
'doc_id': job_dict['doc_id'],
|
| 1366 |
+
'filename': job_dict['filename'],
|
| 1367 |
+
'status': status,
|
| 1368 |
+
'error_text': job_dict.get('error_text'),
|
| 1369 |
+
'created_at': job_dict['created_at'],
|
| 1370 |
+
'updated_at': job_dict['updated_at']
|
| 1371 |
+
})
|
| 1372 |
+
|
| 1373 |
+
return BatchStatusResponse(
|
| 1374 |
+
batch_id=batch_id,
|
| 1375 |
+
total_files=len(jobs),
|
| 1376 |
+
completed=status_counts['completed'],
|
| 1377 |
+
processing=status_counts['processing'],
|
| 1378 |
+
failed=status_counts['failed'],
|
| 1379 |
+
queued=status_counts['queued'],
|
| 1380 |
+
jobs=jobs_list
|
| 1381 |
+
)
|
| 1382 |
+
|
| 1383 |
+
except HTTPException:
|
| 1384 |
+
raise
|
| 1385 |
+
except Exception as e:
|
| 1386 |
+
logger.error(f"❌ Batch status error: {e}")
|
| 1387 |
+
raise HTTPException(500, str(e))
|
| 1388 |
+
|
| 1389 |
+
|
| 1390 |
+
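# --- Illustrative sketch: polling GET /api/ingest/batch/{batch_id} until
# every job has left the queue; the interval and base URL are assumptions. ---
def _demo_poll_batch(batch_id: str, interval_s: float = 5.0) -> dict:
    """Block until a batch reaches a terminal state, then return its status."""
    import time
    while True:
        status = httpx.get(
            f'http://localhost:7860/api/ingest/batch/{batch_id}', timeout=30
        ).json()
        # needs_review jobs are terminal for the worker but await a human,
        # so count them as done here to avoid looping forever.
        done = status['completed'] + status['failed'] + status.get('needs_review', 0)
        if done >= status['total_files']:
            return status
        time.sleep(interval_s)

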
@router.get("/ingest/batch/{batch_id}/download")
|
| 1391 |
+
def download_batch_results(batch_id: str):
|
| 1392 |
+
"""
|
| 1393 |
+
Download all extracted data from a batch as CSV.
|
| 1394 |
+
|
| 1395 |
+
Example:
|
| 1396 |
+
curl http://localhost:7860/api/ingest/batch/batch_abc123/download -o results.csv
|
| 1397 |
+
"""
|
| 1398 |
+
try:
|
| 1399 |
+
import csv
|
| 1400 |
+
from io import StringIO
|
| 1401 |
+
from fastapi.responses import StreamingResponse
|
| 1402 |
+
|
| 1403 |
+
with FileLock(str(LOCK_PATH), timeout=10):
|
| 1404 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 1405 |
+
conn.row_factory = sqlite3.Row
|
| 1406 |
+
cursor = conn.cursor()
|
| 1407 |
+
|
| 1408 |
+
# Get all completed jobs in batch
|
| 1409 |
+
cursor.execute("""
|
| 1410 |
+
SELECT j.*, f.* FROM ingest_jobs j
|
| 1411 |
+
JOIN batch_job_mapping bm ON j.job_id = bm.job_id
|
| 1412 |
+
LEFT JOIN invoice_fields f ON j.doc_id = f.doc_id
|
| 1413 |
+
WHERE bm.batch_id = ? AND j.status = 'completed'
|
| 1414 |
+
""", (batch_id,))
|
| 1415 |
+
|
| 1416 |
+
results = cursor.fetchall()
|
| 1417 |
+
conn.close()
|
| 1418 |
+
|
| 1419 |
+
if not results:
|
| 1420 |
+
raise HTTPException(404, "No completed jobs found in batch")
|
| 1421 |
+
|
| 1422 |
+
# Create CSV
|
| 1423 |
+
output = StringIO()
|
| 1424 |
+
writer = csv.writer(output)
|
| 1425 |
+
|
| 1426 |
+
# Header
|
| 1427 |
+
writer.writerow([
|
| 1428 |
+
'filename', 'doc_id', 'customer', 'date', 'amount',
|
| 1429 |
+
'payment_terms', 'business_code', 'status'
|
| 1430 |
+
])
|
| 1431 |
+
|
| 1432 |
+
# Data rows
|
| 1433 |
+
for row in results:
|
| 1434 |
+
writer.writerow([
|
| 1435 |
+
row['filename'],
|
| 1436 |
+
row['doc_id'],
|
| 1437 |
+
row['cust_number'] or 'N/A',
|
| 1438 |
+
row['posting_date'] or 'N/A',
|
| 1439 |
+
row['total_open_amount'] or 0.0,
|
| 1440 |
+
row['cust_payment_terms'] or 'N/A',
|
| 1441 |
+
row['business_code'] or 'N/A',
|
| 1442 |
+
row['status']
|
| 1443 |
+
])
|
| 1444 |
+
|
| 1445 |
+
output.seek(0)
|
| 1446 |
+
|
| 1447 |
+
return StreamingResponse(
|
| 1448 |
+
iter([output.getvalue()]),
|
| 1449 |
+
media_type="text/csv",
|
| 1450 |
+
headers={
|
| 1451 |
+
"Content-Disposition": f"attachment; filename=batch_{batch_id}_results.csv"
|
| 1452 |
+
}
|
| 1453 |
+
)
|
| 1454 |
+
|
| 1455 |
+
|
| 1456 |
+
except HTTPException:
|
| 1457 |
+
raise
|
| 1458 |
+
except Exception as e:
|
| 1459 |
+
raise HTTPException(500, str(e))
|
backend/app/utils/__init__.py
ADDED
File without changes
backend/app/utils/agent_client.py
ADDED
@@ -0,0 +1,153 @@
| 1 |
+
"""
HF Agent client with proper environment variable support.
"""

import httpx
import os
import time
import logging
from typing import Dict, Optional, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)

# Load from environment
TEXT_EXTRACTOR_URL = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
TABLE_EXTRACTOR_URL = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
NER_URL = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
CLASSIFY_URL = os.getenv('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
SUMMARIZER_URL = os.getenv('SUMMARIZER_URL', '')  # Optional

AGENT_BEARER_TOKEN = os.getenv('AGENT_BEARER_TOKEN', '')
AGENT_TIMEOUT_SECONDS = int(os.getenv('AGENT_TIMEOUT_SECONDS', '30'))


def get_headers() -> Dict:
    """Get headers with optional bearer token."""
    headers = {}
    if AGENT_BEARER_TOKEN:
        headers['Authorization'] = f'Bearer {AGENT_BEARER_TOKEN}'
    return headers


def call_agent_with_retry(
    url: str,
    files: Optional[Dict] = None,
    data: Optional[Dict] = None,
    json: Optional[Dict] = None,
    max_retries: int = 1
) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Call an agent endpoint with retry logic."""
    headers = get_headers()

    for attempt in range(max_retries + 1):
        try:
            with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
                if files:
                    response = client.post(url, headers=headers, files=files, data=data)
                elif json:
                    response = client.post(url, headers=headers, json=json)
                else:
                    response = client.post(url, headers=headers, data=data)

                if response.status_code == 200:
                    return True, response.json(), None
                elif response.status_code == 429:
                    if attempt < max_retries:
                        time.sleep(2)
                        continue
                    return False, None, "Rate limited"
                else:
                    return False, None, f"HTTP {response.status_code}: {response.text[:200]}"

        except httpx.TimeoutException:
            if attempt < max_retries:
                time.sleep(1)
                continue
            return False, None, f"Timeout after {AGENT_TIMEOUT_SECONDS}s"
        except Exception as e:
            if attempt < max_retries:
                time.sleep(1)
                continue
            return False, None, str(e)

    return False, None, "Max retries exceeded"


def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
    """Extract text using the HF agent."""
    try:
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f, 'application/pdf')}
            data = {'filename': file_path.name}

            success, response, error = call_agent_with_retry(TEXT_EXTRACTOR_URL, files=files, data=data)

        if success and response:
            text = response.get('text', '')
            if not text or len(text.strip()) < 10:
                return False, None, "No text extracted"
            return True, text, None
        else:
            return False, None, error or "Text extraction failed"
    except Exception as e:
        return False, None, str(e)


def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
    """Extract tables using the HF agent."""
    try:
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f, 'application/pdf')}
            data = {'filename': file_path.name}

            success, response, error = call_agent_with_retry(TABLE_EXTRACTOR_URL, files=files, data=data)

        if success and response:
            return True, response.get('tables', []), None
        else:
            return False, None, error or "Table extraction failed"
    except Exception as e:
        return False, None, str(e)


def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
    """Extract entities using the NER agent."""
    try:
        success, response, error = call_agent_with_retry(NER_URL, json={'text': text})

        if success and response:
            return True, response.get('entities', []), None
        else:
            return False, None, error or "NER failed"
    except Exception as e:
        return False, None, str(e)


def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Classify a document using the classifier agent."""
    try:
        success, response, error = call_agent_with_retry(CLASSIFY_URL, json={'text': text[:2000]})

        if success and response:
            return True, response, None
        else:
            return False, None, error or "Classification failed"
    except Exception as e:
        return False, None, str(e)


def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
    """Summarize text (optional)."""
    if not SUMMARIZER_URL:
        return True, None, None

    try:
        success, response, error = call_agent_with_retry(SUMMARIZER_URL, json={'text': text[:5000]})

        if success and response:
            return True, response.get('summary', ''), None
        else:
            return False, None, error or "Summarization failed"
    except Exception as e:
        return False, None, str(e)
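
As a quick orientation, here is a minimal sketch of how these helpers chain together in one pipeline step. The invoice.pdf path is a hypothetical placeholder, and real callers should branch on the success flag, since the remote HF agents may be slow or unreachable:

    from pathlib import Path

    from backend.app.utils.agent_client import classify_document, extract_text_from_file

    pdf = Path("invoice.pdf")  # hypothetical input file
    ok, text, err = extract_text_from_file(pdf)
    if not ok:
        print(f"text extraction failed: {err}")
    else:
        ok, classification, err = classify_document(text)
        print(classification if ok else f"classification failed: {err}")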

backend/app/wrappers/__init__.py
ADDED
File without changes

backend/app/wrappers/gemini_output_filter.py
ADDED
@@ -0,0 +1,349 @@
import json
import os
import logging
import time
from typing import Dict, Optional

try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("Install google-generativeai: pip install google-generativeai")

logger = logging.getLogger(__name__)


class GeminiOutputFilter:
    """
    Context-aware output filter that adapts to any invoice format.
    No hardcoded field glossary - Gemini discovers fields dynamically.
    WITH RATE LIMIT HANDLING
    """

    def __init__(self):
        """Initialize the Gemini model."""
        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable not set")

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.5-flash')

        logger.info("✅ GeminiOutputFilter initialized")

    def filter_output(self, user_message: str, full_extraction: Dict, max_retries: int = 3) -> Dict:
        """
        Filter the extraction based on the user message, with intelligent retry logic.

        Args:
            user_message: What the user wants (e.g., "I need total and date")
            full_extraction: Complete extraction from the agent (any format)
            max_retries: Maximum number of retry attempts for rate limits

        Returns:
            Filtered result with only the requested fields
        """
        logger.info(f"🔍 Filtering request: '{user_message}'")
        logger.info(f"📊 Available fields: {list(full_extraction.keys())}")

        # Build context-aware prompt
        prompt = self._build_prompt(user_message, full_extraction)

        for attempt in range(max_retries):
            try:
                logger.info(f"🤖 Calling Gemini (attempt {attempt + 1}/{max_retries})...")

                # Call Gemini
                response = self.model.generate_content(prompt)
                response_text = response.text.strip()

                # Clean markdown if present
                response_text = response_text.replace('```json', '').replace('```', '').strip()

                # Parse JSON
                filtered_result = json.loads(response_text)

                logger.info(f"✅ Filtered result: {list(filtered_result.keys())}")
                return filtered_result

            except json.JSONDecodeError as e:
                logger.error(f"❌ JSON parse error: {e}")
                logger.error(f"Response was: {response_text[:300]}")
                return {
                    "_error": "Failed to parse AI response",
                    "_debug": response_text[:300],
                    "_fallback": full_extraction
                }

            except Exception as e:
                error_msg = str(e)

                # Check if it's a rate limit error (429)
                is_rate_limit = (
                    "429" in error_msg or
                    "quota" in error_msg.lower() or
                    "rate limit" in error_msg.lower() or
                    "exceeded" in error_msg.lower()
                )

                if is_rate_limit:
                    # Extract wait time from the error message
                    wait_time = self._extract_retry_delay(error_msg)

                    if attempt < max_retries - 1:
                        logger.warning(f"⚠️ Rate limit hit (attempt {attempt + 1}/{max_retries})")
                        logger.info(f"⏳ Waiting {wait_time:.1f}s before retry...")
                        time.sleep(wait_time)
                        continue
                    else:
                        # Max retries exhausted
                        logger.error(f"❌ Rate limit exceeded after {max_retries} attempts")
                        logger.error(f"Full error: {error_msg}")
                        return {
                            "_error": f"Filtering failed: {error_msg}",
                            "_fallback": full_extraction
                        }
                else:
                    # Non-rate-limit error - fail immediately
                    logger.error(f"❌ Filtering failed: {e}")
                    return {
                        "_error": f"Filtering failed: {str(e)}",
                        "_fallback": full_extraction
                    }

        # Should not reach here, but just in case
        return {
            "_error": "Max retries exceeded",
            "_fallback": full_extraction
        }

    def _extract_retry_delay(self, error_message: str) -> float:
        """
        Extract the retry delay from a Gemini error message.

        Gemini errors include: "Please retry in 50.923950003s"
        """
        import re

        # Look for pattern: "retry in X.Xs" or "retry in Xs"
        match = re.search(r'retry in ([\d.]+)s', error_message, re.IGNORECASE)
        if match:
            retry_seconds = float(match.group(1))
            # Add a small buffer (2 seconds) to be safe
            wait_time = retry_seconds + 2
            logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
            return wait_time

        # Look for alternative patterns in the error
        match = re.search(r'(\d+)\s*(?:second|sec)', error_message, re.IGNORECASE)
        if match:
            retry_seconds = float(match.group(1))
            wait_time = retry_seconds + 2
            logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
            return wait_time

        # Default: fixed 10s wait (could be scaled by attempt number for exponential backoff)
        default_wait = 10.0
        logger.warning(f"⚠️ Could not extract retry delay, using default: {default_wait}s")
        return default_wait

    def _build_prompt(self, user_message: str, full_extraction: Dict) -> str:
        """Build the context-aware Gemini prompt."""
        return f"""You are an intelligent output filter for an invoice extraction system that handles invoices from MANY different companies with DIFFERENT formats and field names.

YOUR TASK:
Our agent has extracted data from an invoice. The fields extracted depend on the invoice format - different companies use different field names and structures. You need to understand what the user wants and map it to whatever fields are available in THIS specific extraction.

====================
USER'S REQUEST:
====================
"{user_message}"

====================
EXTRACTED DATA (from this specific invoice):
====================
{json.dumps(full_extraction, indent=2)}

====================
YOUR JOB:
====================
1. ANALYZE the extracted fields to understand what data is available
2. UNDERSTAND what the user is asking for
3. MAP the user's request to the actual field names in this extraction
4. RETURN only the fields the user requested

====================
IMPORTANT CONTEXT AWARENESS:
====================
Different invoices have different field names. You must be flexible and understand INTENT:

USER ASKS FOR "total" or "amount":
- Could be: total_open_amount, total, amount, grand_total, net_amount, invoice_total, final_amount, etc.
- Look for fields that contain: "total", "amount", "price", "sum", or numeric values that seem like totals

USER ASKS FOR "date":
- Could be: posting_date, invoice_date, date, issue_date, date_of_issue, created_date, created, etc.
- Look for fields with: "date", "created", "issue" or date-like values (YYYY-MM-DD format)

USER ASKS FOR "customer" or "client":
- Could be: cust_number, customer, client, customer_name, client_name, bill_to, buyer, purchaser, etc.
- Look for fields with: "cust", "client", "customer", "buyer", "bill", "purchaser"

USER ASKS FOR "invoice number":
- Could be: invoice_id, invoice_no, invoice_number, doc_no, document_number, reference, ref_no, doc_reference, etc.
- Look for fields with: "invoice", "doc", "number", "id", "ref", "reference"

USER ASKS FOR "payment terms":
- Could be: payment_terms, terms, due_terms, payment_conditions, net_terms, etc.
- Look for fields with: "payment", "terms", "due", "net"

====================
STRATEGY:
====================
1. First, list out all available fields from the extraction
2. For each field, infer what type of data it contains based on:
   - Field name (does it contain keywords like "total", "date", "customer"?)
   - Value type (is it a number? date? string?)
   - Value content (does it look like money? a date? a name?)
3. Match the user's request to the best-fitting available fields
4. If multiple fields could match, pick the most likely one (e.g., "grand_total" over "subtotal")
5. If NO fields match, explain what's available

====================
FLEXIBILITY EXAMPLES:
====================

Example 1 - Simple mapping:
Extraction: {{"total_amount": 500, "customer_name": "ABC Corp"}}
User: "show me total"
Response: {{"total_amount": 500}}

Example 2 - Different field name:
Extraction: {{"grand_total": 500, "bill_to": "ABC Corp"}}
User: "show me total"
Response: {{"grand_total": 500}}

Example 3 - User-friendly name:
Extraction: {{"invoice_amt": 500, "client_id": "ABC Corp"}}
User: "what's the amount?"
Response: {{"amount": 500}}

Example 4 - Multiple fields requested:
Extraction: {{"total": 500, "date": "2024-01-01", "customer": "ABC"}}
User: "I need total and date"
Response: {{"total": 500, "date": "2024-01-01"}}

Example 5 - Extract all:
User: "extract all information" OR "give me everything" OR "show full data"
Response: {{entire extraction unchanged}}

Example 6 - Field not found:
Extraction: {{"total": 500, "date": "2024-01-01"}}
User: "show me shipping address"
Response: {{
  "_error": "No shipping address found in this invoice",
  "_available_fields": {{
    "total": "appears to be invoice total amount",
    "date": "appears to be invoice date"
  }},
  "_suggestion": "Available data: total, date"
}}

====================
RESPONSE FORMAT:
====================
Return ONLY valid JSON, no markdown, no extra text.

If successful:
{{
  "field_name": value
}}

You CAN rename fields to be user-friendly:
{{
  "total": 500  // even if the original field was "invoice_amt"
}}

If a field is not found:
{{
  "_error": "...",
  "_available_fields": {{
    "field1": "what this field appears to contain",
    "field2": "what this field appears to contain"
  }}
}}

====================
CRITICAL RULES:
====================
1. DO NOT assume what fields exist. ONLY work with the fields present in the extraction JSON above.
2. Be intelligent about inferring what each field means based on its name and value.
3. If the user asks for "all" or "everything", return the ENTIRE extraction unchanged.
4. Always return valid JSON only - no explanations outside the JSON.

Now process the user's request."""

    def analyze_extraction(self, extraction: Dict) -> Dict:
        """
        Optional utility: get Gemini's analysis of what the fields mean.
        Useful for debugging or showing users what's available.
        """
        prompt = f"""Analyze this invoice extraction and explain what each field likely contains:

{json.dumps(extraction, indent=2)}

For each field, provide:
- Field name
- Likely meaning (what data it contains)
- Data type
- User-friendly name suggestion

Return as JSON:
{{
  "field_name": {{
    "meaning": "...",
    "type": "...",
    "user_friendly_name": "..."
  }}
}}"""

        try:
            response = self.model.generate_content(prompt)
            response_text = response.text.strip().replace('```json', '').replace('```', '')
            analysis = json.loads(response_text)
            return analysis
        except Exception as e:
            logger.error(f"Analysis failed: {e}")
            return {"error": f"Could not analyze extraction: {str(e)}"}


# ============================================
# Usage Example (for testing)
# ============================================

if __name__ == "__main__":
    # Test the wrapper with retry logic
    extraction = {
        "cust_number": "Martinez Rosales, An",
        "posting_date": "2015-07-21",
        "total_open_amount": 442.93,
        "business_code": "U001",
        "cust_payment_terms": "NAH4"
    }

    wrapper = GeminiOutputFilter()

    print("\n" + "="*60)
    print("TEST: User asks 'show me who the customer is'")
    print("="*60)
    result = wrapper.filter_output("show me who the customer is", extraction, max_retries=3)
    print(f"Result: {json.dumps(result, indent=2)}")
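
Because filter_output reports failures through "_error"/"_fallback" keys instead of raising, a caller can degrade gracefully to the unfiltered extraction. A minimal sketch of that calling pattern (variable names are illustrative):

    result = wrapper.filter_output("show me total", extraction)
    if "_error" in result:
        # Gemini filtering failed; fall back to the full, unfiltered extraction
        result = result.get("_fallback", extraction)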

backend/database/__init__.py
ADDED
File without changes

backend/database/migration_ingest_v1.sql
ADDED
@@ -0,0 +1,67 @@
-- ============================================
-- Minimal Ingest Pipeline Tables
-- Version: 1.0 (Idempotent)
-- ============================================

-- Table 1: ingest_jobs (job tracking)
CREATE TABLE IF NOT EXISTS ingest_jobs (
    job_id TEXT PRIMARY KEY,
    doc_id INTEGER,
    filename TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'queued',
    error_text TEXT,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
);

-- Drop indexes if they exist, then recreate
DROP INDEX IF EXISTS idx_ingest_jobs_status;
DROP INDEX IF EXISTS idx_ingest_jobs_created;
CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);

-- Table 2: documents (file metadata)
CREATE TABLE IF NOT EXISTS documents (
    doc_id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id TEXT NOT NULL,
    path TEXT NOT NULL,
    filename TEXT NOT NULL,
    content_type TEXT NOT NULL,
    uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
);

DROP INDEX IF EXISTS idx_documents_job_id;
CREATE INDEX idx_documents_job_id ON documents(job_id);

-- Table 3: extractions (agent artifacts)
CREATE TABLE IF NOT EXISTS extractions (
    doc_id INTEGER PRIMARY KEY,
    raw_text TEXT,
    tables_json TEXT,
    entities_json TEXT,
    classification_json TEXT,
    summary_text TEXT,
    extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
);

-- Table 4: invoice_fields (mapped fields for prediction)
CREATE TABLE IF NOT EXISTS invoice_fields (
    invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
    doc_id INTEGER NOT NULL,
    cust_number TEXT,
    posting_date TEXT,
    total_open_amount REAL,
    business_code TEXT,
    cust_payment_terms TEXT,
    invoice_currency TEXT DEFAULT 'USD',
    due_in_date TEXT,
    confidence_map TEXT,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
);

DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);
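
Since the migration is idempotent (CREATE TABLE IF NOT EXISTS plus drop-and-recreate indexes), it can be re-applied safely with a plain executescript call. A sketch, assuming the database lives at data/invoices.db as in the ETL script later in this commit:

    import sqlite3

    with sqlite3.connect("data/invoices.db") as conn:
        with open("backend/database/migration_ingest_v1.sql") as f:
            conn.executescript(f.read())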

backend/database/migration_ingest_v2.sql
ADDED
@@ -0,0 +1,63 @@
-- ============================================
-- Invoice Ingest Pipeline - Complete Schema
-- Version: 2.0
-- ============================================

-- Table 1: ingest_jobs
CREATE TABLE IF NOT EXISTS ingest_jobs (
    job_id TEXT PRIMARY KEY,
    doc_id TEXT,
    filename TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'queued',
    error_text TEXT,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
);

DROP INDEX IF EXISTS idx_ingest_jobs_status;
DROP INDEX IF EXISTS idx_ingest_jobs_created;
CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);

-- Table 2: documents
CREATE TABLE IF NOT EXISTS documents (
    doc_id TEXT PRIMARY KEY,
    job_id TEXT NOT NULL,
    path TEXT NOT NULL,
    filename TEXT NOT NULL,
    content_type TEXT NOT NULL,
    uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
);

DROP INDEX IF EXISTS idx_documents_job_id;
CREATE INDEX idx_documents_job_id ON documents(job_id);

-- Table 3: extractions
CREATE TABLE IF NOT EXISTS extractions (
    doc_id TEXT PRIMARY KEY,
    raw_text TEXT,
    tables_json TEXT,
    entities_json TEXT,
    classification_json TEXT,
    summary_text TEXT,
    extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
);

-- Table 4: invoice_fields
CREATE TABLE IF NOT EXISTS invoice_fields (
    invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
    doc_id TEXT NOT NULL,
    cust_number TEXT,
    posting_date TEXT,
    total_open_amount REAL,
    business_code TEXT,
    cust_payment_terms TEXT,
    confidence_map TEXT,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
);

DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);

backend/database/queries.sql
ADDED
@@ -0,0 +1,354 @@
-- Fix for the overdue percentage calculation
-- Original line 32 had a syntax error

-- CORRECTED Query 1:
WITH customer_stats AS (
    SELECT
        cust_number,
        COUNT(*) as total_invoices,
        COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,

        AVG(days_to_clear) as avg_days,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
        STDDEV(days_to_clear) as std_days,
        MIN(days_to_clear) as min_days,
        MAX(days_to_clear) as max_days,

        AVG(total_open_amount) as avg_amount,
        SUM(total_open_amount) as total_amount,

        -- FIXED: Overdue percentage calculation
        CASE
            WHEN COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
            THEN (CAST(COUNT(CASE WHEN is_overdue = TRUE THEN 1 END) AS NUMERIC) /
                  CAST(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) AS NUMERIC) * 100)
            ELSE 0.0
        END as pct_overdue,

        (SELECT cust_payment_terms FROM invoices_history WHERE cust_number = $1 GROUP BY cust_payment_terms ORDER BY COUNT(*) DESC LIMIT 1) as most_common_payment_term,
        (SELECT business_code FROM invoices_history WHERE cust_number = $1 GROUP BY business_code ORDER BY COUNT(*) DESC LIMIT 1) as most_common_business_code,
        (SELECT invoice_currency FROM invoices_history WHERE cust_number = $1 GROUP BY invoice_currency ORDER BY COUNT(*) DESC LIMIT 1) as most_common_currency

    FROM invoices_history
    WHERE cust_number = $1
    GROUP BY cust_number
)
SELECT
    cust_number,
    total_invoices as cust_invoice_count,
    cleared_count as cust_cleared_count,
    ROUND(avg_days, 2) as cust_avg_days,
    ROUND(median_days::NUMERIC, 2) as cust_median_days,
    ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
    min_days as cust_min_days,
    max_days as cust_max_days,
    ROUND(avg_amount, 2) as cust_avg_amount,
    ROUND(total_amount, 2) as cust_total_amount,
    ROUND(pct_overdue, 2) as cust_pct_overdue,
    most_common_payment_term,
    most_common_business_code,
    most_common_currency
FROM customer_stats;

-- ============================================
-- QUERY 2: Batch Compute All Customer Aggregates
-- Usage: Nightly ETL job
-- ============================================

-- Name: compute_all_customer_aggregates
-- Description: Computes aggregates for ALL customers with cleared invoices
WITH customer_stats AS (
    SELECT
        cust_number,
        COUNT(*) as total_invoices,
        COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,

        AVG(days_to_clear) as avg_days,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
        STDDEV(days_to_clear) as std_days,
        MIN(days_to_clear) as min_days,
        MAX(days_to_clear) as max_days,

        AVG(total_open_amount) as avg_amount,
        SUM(total_open_amount) as total_amount,

        COUNT(CASE WHEN is_overdue = TRUE THEN 1 END)::NUMERIC /
            NULLIF(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END), 0) * 100 as pct_overdue,

        MODE() WITHIN GROUP (ORDER BY cust_payment_terms) as most_common_payment_term,
        MODE() WITHIN GROUP (ORDER BY business_code) as most_common_business_code,
        MODE() WITHIN GROUP (ORDER BY invoice_currency) as most_common_currency

    FROM invoices_history
    WHERE clear_date IS NOT NULL  -- Only customers with history
    GROUP BY cust_number
    HAVING COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
)
SELECT
    cust_number,
    total_invoices as cust_invoice_count,
    cleared_count as cust_cleared_count,
    ROUND(avg_days, 2) as cust_avg_days,
    ROUND(median_days::NUMERIC, 2) as cust_median_days,
    ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
    min_days as cust_min_days,
    max_days as cust_max_days,
    ROUND(avg_amount, 2) as cust_avg_amount,
    ROUND(total_amount, 2) as cust_total_amount,
    ROUND(COALESCE(pct_overdue, 0), 2) as cust_pct_overdue,
    most_common_payment_term,
    most_common_business_code,
    most_common_currency,
    NOW() as last_computed_at
FROM customer_stats;

-- ============================================
-- QUERY 3: Upsert Customer Aggregates
-- Usage: Insert or update customer_aggregates table
-- ============================================

-- Name: upsert_customer_aggregates
-- Description: Insert/update aggregates with conflict handling
-- Parameters: All customer aggregate fields
INSERT INTO customer_aggregates (
    cust_number,
    cust_invoice_count,
    cust_cleared_count,
    cust_avg_days,
    cust_median_days,
    cust_std_days,
    cust_min_days,
    cust_max_days,
    cust_avg_amount,
    cust_total_amount,
    cust_pct_overdue,
    most_common_payment_term,
    most_common_business_code,
    most_common_currency,
    last_computed_at
) VALUES (
    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW()
)
ON CONFLICT (cust_number)
DO UPDATE SET
    cust_invoice_count = EXCLUDED.cust_invoice_count,
    cust_cleared_count = EXCLUDED.cust_cleared_count,
    cust_avg_days = EXCLUDED.cust_avg_days,
    cust_median_days = EXCLUDED.cust_median_days,
    cust_std_days = EXCLUDED.cust_std_days,
    cust_min_days = EXCLUDED.cust_min_days,
    cust_max_days = EXCLUDED.cust_max_days,
    cust_avg_amount = EXCLUDED.cust_avg_amount,
    cust_total_amount = EXCLUDED.cust_total_amount,
    cust_pct_overdue = EXCLUDED.cust_pct_overdue,
    most_common_payment_term = EXCLUDED.most_common_payment_term,
    most_common_business_code = EXCLUDED.most_common_business_code,
    most_common_currency = EXCLUDED.most_common_currency,
    last_computed_at = NOW();

-- ============================================
-- QUERY 4: Compute Payment Terms Aggregates
-- Usage: Pre-compute payment term statistics
-- ============================================

-- Name: compute_payment_terms_aggregates
WITH payment_stats AS (
    SELECT
        cust_payment_terms,
        AVG(days_to_clear) as avg_days,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
        COUNT(*) as invoice_count
    FROM invoices_history
    WHERE clear_date IS NOT NULL
      AND cust_payment_terms IS NOT NULL
    GROUP BY cust_payment_terms
)
SELECT
    cust_payment_terms,
    ROUND(avg_days, 2) as payment_terms_avg_days,
    ROUND(median_days::NUMERIC, 2) as payment_terms_median_days,
    invoice_count as payment_terms_count,
    NOW() as last_computed_at
FROM payment_stats;

-- ============================================
-- QUERY 5: Compute Business Code Aggregates
-- Usage: Pre-compute business code statistics
-- ============================================

-- Name: compute_business_code_aggregates
WITH business_stats AS (
    SELECT
        business_code,
        AVG(days_to_clear) as avg_days,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
        COUNT(*) as invoice_count
    FROM invoices_history
    WHERE clear_date IS NOT NULL
      AND business_code IS NOT NULL
    GROUP BY business_code
)
SELECT
    business_code,
    ROUND(avg_days, 2) as business_avg_days,
    ROUND(median_days::NUMERIC, 2) as business_median_days,
    invoice_count as business_count,
    NOW() as last_computed_at
FROM business_stats;

-- ============================================
-- QUERY 6: Get Customer Features (for inference)
-- Usage: Retrieve all features for a customer
-- ============================================

-- Name: get_customer_features
-- Description: Get customer aggregates for prediction
-- Parameters: $1 = cust_number
SELECT
    cust_number,
    cust_invoice_count,
    cust_cleared_count,
    cust_avg_days,
    cust_median_days,
    cust_std_days,
    cust_min_days,
    cust_max_days,
    cust_avg_amount,
    cust_total_amount,
    cust_pct_overdue,
    most_common_payment_term,
    most_common_business_code,
    most_common_currency,
    last_computed_at
FROM customer_aggregates
WHERE cust_number = $1;

-- ============================================
-- QUERY 7: Get Payment Terms Features
-- Usage: Retrieve payment term stats
-- ============================================

-- Name: get_payment_terms_features
-- Parameters: $1 = cust_payment_terms
SELECT
    cust_payment_terms,
    payment_terms_avg_days,
    payment_terms_median_days,
    payment_terms_count
FROM payment_terms_aggregates
WHERE cust_payment_terms = $1;

-- ============================================
-- QUERY 8: Get Business Code Features
-- Usage: Retrieve business code stats
-- ============================================

-- Name: get_business_code_features
-- Parameters: $1 = business_code
SELECT
    business_code,
    business_avg_days,
    business_median_days,
    business_count
FROM business_code_aggregates
WHERE business_code = $1;

-- ============================================
-- QUERY 9: Insert Invoice (with upsert)
-- Usage: Ingest new invoice data
-- ============================================

-- Name: upsert_invoice
-- Parameters: All invoice fields
INSERT INTO invoices_history (
    invoice_id,
    business_code,
    cust_number,
    name_customer,
    posting_date,
    document_create_date,
    document_create_date_alt,
    due_in_date,
    baseline_create_date,
    clear_date,
    total_open_amount,
    invoice_currency,
    document_type,
    cust_payment_terms,
    posting_id,
    business_year
) VALUES (
    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16
)
ON CONFLICT (invoice_id)
DO UPDATE SET
    clear_date = EXCLUDED.clear_date,
    is_open = EXCLUDED.is_open,
    updated_at = NOW();

-- ============================================
-- QUERY 10: Insert Prediction Log
-- Usage: Record prediction for monitoring
-- ============================================

-- Name: insert_prediction_log
-- Parameters: prediction fields
INSERT INTO predictions_log (
    invoice_id,
    cust_number,
    posting_date,
    total_open_amount,
    business_code,
    cust_payment_terms,
    features_json,
    predicted_days_to_clear,
    predicted_clear_date,
    model_version,
    model_path
) VALUES (
    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11
) RETURNING prediction_id;

-- ============================================
-- QUERY 11: Update Prediction with Actual Outcome
-- Usage: Record actual outcome for model monitoring
-- ============================================

-- Name: update_prediction_outcome
-- Parameters: $1 = prediction_id, $2 = actual_clear_date
UPDATE predictions_log
SET
    actual_clear_date = $2,
    actual_days_to_clear = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER,
    prediction_error = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear,
    absolute_error = ABS(EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear),
    outcome_recorded_at = NOW()
WHERE prediction_id = $1;

-- ============================================
-- QUERY 12: Get Recent Predictions Performance
-- Usage: Monitor model accuracy
-- ============================================

-- Name: get_prediction_metrics
-- Description: Calculate model performance over the last N days
-- Parameters: $1 = days_back (e.g., 30)
SELECT
    COUNT(*) as total_predictions,
    COUNT(actual_days_to_clear) as predictions_with_outcome,
    ROUND(AVG(ABS(prediction_error))::NUMERIC, 2) as mae,
    ROUND(SQRT(AVG(prediction_error * prediction_error))::NUMERIC, 2) as rmse,
    ROUND(AVG(CASE
        WHEN ABS(prediction_error) <= 3 THEN 1.0
        ELSE 0.0
    END) * 100, 2) as pct_within_3_days,
    ROUND(AVG(CASE
        WHEN ABS(prediction_error) <= 7 THEN 1.0
        ELSE 0.0
    END) * 100, 2) as pct_within_7_days
FROM predictions_log
WHERE predicted_at >= NOW() - ($1 * INTERVAL '1 day')  -- FIXED: a parameter cannot appear inside a quoted interval literal
  AND actual_days_to_clear IS NOT NULL;

-- ============================================
-- End of Query Templates
-- ============================================
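
These templates use PostgreSQL-specific features ($1-style placeholders, PERCENTILE_CONT and MODE() WITHIN GROUP, NOW(), ON CONFLICT), so they target a Postgres deployment rather than the SQLite schema below; on SQLite the same aggregates are recomputed in pandas by update_customer_aggregates_sqlite.py. As a sketch, one way to run a named query is via asyncpg, which uses this $1 placeholder style; the DSN and customer id are placeholders, and QUERY_6 is an abbreviated copy of Query 6 above:

    import asyncio

    import asyncpg

    QUERY_6 = """
        SELECT cust_number, cust_avg_days, cust_median_days, cust_pct_overdue
        FROM customer_aggregates
        WHERE cust_number = $1
    """  # abbreviated form of Query 6 for illustration

    async def main():
        conn = await asyncpg.connect("postgresql://user:pass@localhost/invoices")  # placeholder DSN
        row = await conn.fetchrow(QUERY_6, "CUST-001")  # placeholder customer number
        print(dict(row) if row else "no aggregates for this customer")
        await conn.close()

    asyncio.run(main())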

backend/database/schema_sqlite.sql
ADDED
@@ -0,0 +1,132 @@
-- ============================================
-- Invoice Payment Prediction System - SQLite Schema
-- Version: 1.0 (SQLite)
-- ============================================

-- Drop existing tables
DROP TABLE IF EXISTS predictions_log;
DROP TABLE IF EXISTS business_code_aggregates;
DROP TABLE IF EXISTS payment_terms_aggregates;
DROP TABLE IF EXISTS customer_aggregates;
DROP TABLE IF EXISTS invoices_history;

-- ============================================
-- Table 1: invoices_history
-- ============================================
CREATE TABLE invoices_history (
    invoice_id INTEGER PRIMARY KEY,
    business_code TEXT NOT NULL,
    cust_number TEXT NOT NULL,
    name_customer TEXT,

    -- Dates (stored as TEXT in ISO format: YYYY-MM-DD HH:MM:SS)
    posting_date TEXT NOT NULL,
    document_create_date TEXT,
    document_create_date_alt TEXT,
    due_in_date TEXT,
    baseline_create_date TEXT,
    clear_date TEXT,

    -- Financial
    total_open_amount REAL NOT NULL,
    invoice_currency TEXT DEFAULT 'USD',

    -- Metadata
    document_type TEXT,
    cust_payment_terms TEXT,
    posting_id REAL,
    is_open INTEGER DEFAULT 1,
    business_year INTEGER,

    -- Computed fields
    days_to_clear INTEGER,
    days_posting_to_due INTEGER,
    days_create_to_posting INTEGER,
    days_baseline_to_posting INTEGER,
    is_overdue INTEGER DEFAULT 0,

    -- Audit
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_invoices_cust ON invoices_history(cust_number);
CREATE INDEX idx_invoices_posting ON invoices_history(posting_date);
CREATE INDEX idx_invoices_cleared ON invoices_history(cust_number, posting_date) WHERE clear_date IS NOT NULL;

-- ============================================
-- Table 2: customer_aggregates
-- ============================================
CREATE TABLE customer_aggregates (
    cust_number TEXT PRIMARY KEY,
    cust_invoice_count INTEGER DEFAULT 0,
    cust_cleared_count INTEGER DEFAULT 0,

    cust_avg_days REAL,
    cust_median_days REAL,
    cust_std_days REAL,
    cust_min_days INTEGER,
    cust_max_days INTEGER,

    cust_avg_amount REAL,
    cust_total_amount REAL,
    cust_pct_overdue REAL DEFAULT 0.0,

    most_common_payment_term TEXT,
    most_common_business_code TEXT,
    most_common_currency TEXT,

    last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Table 3: payment_terms_aggregates
-- ============================================
CREATE TABLE payment_terms_aggregates (
    cust_payment_terms TEXT PRIMARY KEY,
    payment_terms_avg_days REAL,
    payment_terms_median_days REAL,
    payment_terms_count INTEGER DEFAULT 0,
    last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Table 4: business_code_aggregates
-- ============================================
CREATE TABLE business_code_aggregates (
    business_code TEXT PRIMARY KEY,
    business_avg_days REAL,
    business_median_days REAL,
    business_count INTEGER DEFAULT 0,
    last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Table 5: predictions_log
-- ============================================
CREATE TABLE predictions_log (
    prediction_id INTEGER PRIMARY KEY AUTOINCREMENT,
    invoice_id INTEGER,
    cust_number TEXT NOT NULL,
    posting_date TEXT NOT NULL,
    total_open_amount REAL NOT NULL,
    business_code TEXT,
    cust_payment_terms TEXT,

    predicted_days_to_clear REAL NOT NULL,
    predicted_clear_date TEXT NOT NULL,

    model_version TEXT,
    features_json TEXT,

    actual_clear_date TEXT,
    actual_days_to_clear INTEGER,
    prediction_error REAL,
    absolute_error REAL,

    predicted_at TEXT DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_predictions_cust ON predictions_log(cust_number);
CREATE INDEX idx_predictions_date ON predictions_log(predicted_at);
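
Note that the DDL declares days_to_clear but nothing in this file populates it; since dates are stored as ISO-format TEXT, one way to backfill it in SQLite is via julianday(). A sketch, under the same data/invoices.db path assumption:

    import sqlite3

    with sqlite3.connect("data/invoices.db") as conn:
        # julianday() parses ISO date strings, so the difference is a day count
        conn.execute("""
            UPDATE invoices_history
            SET days_to_clear = CAST(julianday(clear_date) - julianday(posting_date) AS INTEGER)
            WHERE clear_date IS NOT NULL
        """)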

backend/etl/__init__.py
ADDED
File without changes

backend/etl/update_customer_aggregates_sqlite.py
ADDED
@@ -0,0 +1,189 @@
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import sqlite3
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from filelock import FileLock
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
|
| 10 |
+
LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_most_common(series):
|
| 14 |
+
"""Get mode (most common value)."""
|
| 15 |
+
if series.empty:
|
| 16 |
+
return None
|
| 17 |
+
return series.mode()[0] if not series.mode().empty else None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def update_customer_aggregates():
|
| 21 |
+
"""Compute and update customer aggregates."""
|
| 22 |
+
|
| 23 |
+
print("🔄 Starting customer aggregates computation...")
|
| 24 |
+
|
| 25 |
+
with FileLock(str(LOCK_PATH)):
|
| 26 |
+
conn = sqlite3.connect(str(DB_PATH))
|
| 27 |
+
|
| 28 |
+
# Load cleared invoices
|
| 29 |
+
df = pd.read_sql_query("""
|
| 30 |
+
SELECT
|
| 31 |
+
cust_number,
|
| 32 |
+
days_to_clear,
|
| 33 |
+
total_open_amount,
|
| 34 |
+
is_overdue,
|
| 35 |
+
cust_payment_terms,
|
| 36 |
+
business_code,
|
| 37 |
+
invoice_currency
|
| 38 |
+
FROM invoices_history
|
| 39 |
+
WHERE clear_date IS NOT NULL
|
| 40 |
+
""", conn)
|
| 41 |
+
|
| 42 |
+
if df.empty:
|
| 43 |
+
print("⚠️ No cleared invoices found")
|
| 44 |
+
conn.close()
|
| 45 |
+
return
|
| 46 |
+
|
| 47 |
+
print(f"📊 Processing {len(df)} cleared invoices...")
|
| 48 |
+
|
| 49 |
+
# Compute aggregates per customer
|
| 50 |
+
agg_results = []
|
| 51 |
+
|
| 52 |
+
for cust_number, group in df.groupby('cust_number'):
|
| 53 |
+
agg = {
|
| 54 |
+
'cust_number': cust_number,
|
| 55 |
+
'cust_invoice_count': len(group),
|
| 56 |
+
'cust_cleared_count': len(group),
|
| 57 |
+
'cust_avg_days': round(group['days_to_clear'].mean(), 2),
|
| 58 |
+
'cust_median_days': round(group['days_to_clear'].median(), 2),
|
| 59 |
+
'cust_std_days': round(group['days_to_clear'].std(), 2) if len(group) > 1 else 0.0,
|
| 60 |
+
'cust_min_days': int(group['days_to_clear'].min()),
|
| 61 |
+
'cust_max_days': int(group['days_to_clear'].max()),
|
| 62 |
+
'cust_avg_amount': round(group['total_open_amount'].mean(), 2),
|
| 63 |
+
'cust_total_amount': round(group['total_open_amount'].sum(), 2),
|
| 64 |
+
'cust_pct_overdue': round((group['is_overdue'].sum() / len(group)) * 100, 2),
|
| 65 |
+
'most_common_payment_term': get_most_common(group['cust_payment_terms']),
|
| 66 |
+
'most_common_business_code': get_most_common(group['business_code']),
|
| 67 |
+
'most_common_currency': get_most_common(group['invoice_currency'])
|
| 68 |
+
}
|
| 69 |
+
agg_results.append(agg)
|
| 70 |
+
|
| 71 |
+
# Upsert into customer_aggregates
|
| 72 |
+
cursor = conn.cursor()
|
| 73 |
+
for agg in agg_results:
|
| 74 |
+
cursor.execute("""
|
| 75 |
+
INSERT OR REPLACE INTO customer_aggregates (
|
| 76 |
+
cust_number, cust_invoice_count, cust_cleared_count,
|
| 77 |
+
cust_avg_days, cust_median_days, cust_std_days,
|
| 78 |
+
cust_min_days, cust_max_days,
|
| 79 |
+
cust_avg_amount, cust_total_amount, cust_pct_overdue,
|
| 80 |
+
                    most_common_payment_term, most_common_business_code,
                    most_common_currency, last_computed_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
            """, (
                agg['cust_number'], agg['cust_invoice_count'], agg['cust_cleared_count'],
                agg['cust_avg_days'], agg['cust_median_days'], agg['cust_std_days'],
                agg['cust_min_days'], agg['cust_max_days'],
                agg['cust_avg_amount'], agg['cust_total_amount'], agg['cust_pct_overdue'],
                agg['most_common_payment_term'], agg['most_common_business_code'],
                agg['most_common_currency']
            ))

        conn.commit()
        print(f"✅ Updated {len(agg_results)} customer aggregates")

        conn.close()


def update_payment_terms_aggregates():
    """Compute and update payment terms aggregates."""

    print("🔄 Computing payment terms aggregates...")

    with FileLock(str(LOCK_PATH)):
        conn = sqlite3.connect(str(DB_PATH))

        df = pd.read_sql_query("""
            SELECT cust_payment_terms, days_to_clear
            FROM invoices_history
            WHERE clear_date IS NOT NULL AND cust_payment_terms IS NOT NULL
        """, conn)

        if df.empty:
            print("⚠️ No data for payment terms")
            conn.close()
            return

        agg = df.groupby('cust_payment_terms')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
        agg.columns = ['cust_payment_terms', 'payment_terms_avg_days', 'payment_terms_median_days', 'payment_terms_count']

        cursor = conn.cursor()
        for _, row in agg.iterrows():
            cursor.execute("""
                INSERT OR REPLACE INTO payment_terms_aggregates (
                    cust_payment_terms, payment_terms_avg_days, payment_terms_median_days,
                    payment_terms_count, last_computed_at
                ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            """, (
                row['cust_payment_terms'],
                round(row['payment_terms_avg_days'], 2),
                round(row['payment_terms_median_days'], 2),
                int(row['payment_terms_count'])
            ))

        conn.commit()
        print(f"✅ Updated {len(agg)} payment terms aggregates")
        conn.close()


def update_business_code_aggregates():
    """Compute and update business code aggregates."""

    print("🔄 Computing business code aggregates...")

    with FileLock(str(LOCK_PATH)):
        conn = sqlite3.connect(str(DB_PATH))

        df = pd.read_sql_query("""
            SELECT business_code, days_to_clear
            FROM invoices_history
            WHERE clear_date IS NOT NULL AND business_code IS NOT NULL
        """, conn)

        if df.empty:
            print("⚠️ No data for business codes")
            conn.close()
            return

        agg = df.groupby('business_code')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
        agg.columns = ['business_code', 'business_avg_days', 'business_median_days', 'business_count']

        cursor = conn.cursor()
        for _, row in agg.iterrows():
            cursor.execute("""
                INSERT OR REPLACE INTO business_code_aggregates (
                    business_code, business_avg_days, business_median_days,
                    business_count, last_computed_at
                ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            """, (
                row['business_code'],
                round(row['business_avg_days'], 2),
                round(row['business_median_days'], 2),
                int(row['business_count'])
            ))

        conn.commit()
        print(f"✅ Updated {len(agg)} business code aggregates")
        conn.close()


if __name__ == "__main__":
    print("=" * 60)
    print("🚀 ETL: Updating Aggregates")
    print("=" * 60)

    update_customer_aggregates()
    update_payment_terms_aggregates()
    update_business_code_aggregates()

    print("\n✅ All aggregates updated successfully!")
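
The aggregate tables written by this ETL are read back per-invoice at inference time. A minimal sketch of that read path, assuming the customer table is named customer_aggregates (the CREATE TABLE lives in the migration SQL, not in this file) and reusing this module's DB_PATH:

import sqlite3
from typing import Dict, Optional

def fetch_customer_aggregates(cust_number: str) -> Optional[Dict]:
    """Return precomputed aggregates for one customer, or None if unseen."""
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row  # rows become dict-like
    try:
        row = conn.execute(
            "SELECT * FROM customer_aggregates WHERE cust_number = ?",  # table name assumed
            (cust_number,),
        ).fetchone()
        return dict(row) if row else None
    finally:
        conn.close()

Returning None for an unseen customer is deliberate: the feature builder below treats a missing aggregates dict as a signal to fall back to its training-time defaults.
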
backend/feature_builder/__init__.py
ADDED
File without changes
backend/feature_builder/feature_builder.py
ADDED
@@ -0,0 +1,312 @@
"""
Feature builder that matches the ML training pipeline exactly.
Generates features for inference from invoice data + aggregates.
FIXED: Handles None values properly with robust defaults.
"""

import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, Optional


# Default values for new customers (from training)
DEFAULTS = {
    'cust_avg_days': 18.0,
    'cust_median_days': 15.0,
    'cust_std_days': 0.0,
    'cust_min_days': 12,
    'cust_max_days': 25,
    'cust_invoice_count': 1,
    'cust_avg_amount': 30000.0,
    'cust_total_amount': 30000.0,
    'cust_pct_overdue': 0.0,
    'payment_terms_avg_days': 15.0,
    'payment_terms_median_days': 15.0,
    'payment_terms_count': 100,
    'business_avg_days': 17.0,
    'business_median_days': 15.0,
    'business_count': 1000
}


def safe_float(value, default=0.0):
    """Safely convert to float, falling back to a default."""
    if value is None:
        return float(default)
    try:
        return float(value)
    except (ValueError, TypeError):
        return float(default)


def safe_int(value, default=0):
    """Safely convert to int, falling back to a default."""
    if value is None:
        return int(default)
    try:
        return int(value)
    except (ValueError, TypeError):
        return int(default)


def parse_date(date_str: str) -> datetime:
    """Parse a date string to datetime."""
    if isinstance(date_str, datetime):
        return date_str

    for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
        try:
            return datetime.strptime(str(date_str), fmt)
        except ValueError:
            continue

    raise ValueError(f"Cannot parse date: {date_str}")


def build_features(
    invoice_data: Dict,
    customer_agg: Optional[Dict] = None,
    payment_terms_agg: Optional[Dict] = None,
    business_code_agg: Optional[Dict] = None
) -> Dict:
    """
    Build a feature vector matching the ML training pipeline.

    Args:
        invoice_data: Invoice details (posting_date, amount, etc.)
        customer_agg: Customer aggregates from DB (or None for defaults)
        payment_terms_agg: Payment terms aggregates from DB
        business_code_agg: Business code aggregates from DB

    Returns:
        Dict of features ready for model.predict()
    """

    # Parse dates
    posting_date = parse_date(invoice_data['posting_date'])

    # Use provided aggregates or empty dicts (lookups then fall back to defaults)
    cust_agg = customer_agg or {}
    pmt_agg = payment_terms_agg or {}
    biz_agg = business_code_agg or {}

    # Build feature dictionary
    features = {}

    # ============================================
    # Categorical Features (encoded as integers)
    # ============================================

    # Business code mapping
    business_code = invoice_data.get('business_code', 'U001')
    business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
    features['business_code'] = business_code_map.get(business_code, 0)

    # Payment terms (simplified hash encoding)
    # WARNING: Python's str hash is randomized per process; fix PYTHONHASHSEED
    # if the training pipeline relied on a stable encoding.
    payment_terms = invoice_data.get('cust_payment_terms', 'NAH4')
    features['cust_payment_terms'] = abs(hash(payment_terms)) % 74

    # Currency
    currency_map = {'USD': 0, 'CAD': 1}
    features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0)

    # Document type
    doc_type_map = {'RV': 0, 'AB': 1}
    features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0)

    # Amount category
    amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
    if amount < 5000:
        amount_cat = 0  # small
    elif amount < 20000:
        amount_cat = 1  # medium
    elif amount < 50000:
        amount_cat = 2  # large
    else:
        amount_cat = 3  # very_large
    features['amount_category'] = amount_cat

    # ============================================
    # Numerical Features
    # ============================================

    # NOTE: the 'buisness_year' misspelling is intentional; it matches the
    # column name used in the training data.
    features['buisness_year'] = safe_float(invoice_data.get('business_year', posting_date.year))
    features['total_open_amount'] = amount
    features['amount_log'] = float(np.log1p(amount))

    # Temporal features
    features['posting_year'] = posting_date.year
    features['posting_month'] = posting_date.month
    features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
    features['posting_day'] = posting_date.day
    features['posting_dayofweek'] = posting_date.weekday()
    features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0
    features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
    features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0

    # Days between dates
    features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
    features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
    features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)

    # Document create date alt (as integer YYYYMMDD)
    doc_create_alt = invoice_data.get('document_create_date_alt')
    if doc_create_alt:
        try:
            cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
            features['document_create_date.1'] = int(cleaned)
        except (ValueError, TypeError):
            features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))
    else:
        features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))

    # ============================================
    # Customer Aggregates (with robust defaults)
    # ============================================

    features['cust_avg_days'] = safe_float(cust_agg.get('cust_avg_days'), DEFAULTS['cust_avg_days'])
    features['cust_median_days'] = safe_float(cust_agg.get('cust_median_days'), DEFAULTS['cust_median_days'])
    features['cust_std_days'] = safe_float(cust_agg.get('cust_std_days'), DEFAULTS['cust_std_days'])
    features['cust_min_days'] = safe_int(cust_agg.get('cust_min_days'), DEFAULTS['cust_min_days'])
    features['cust_max_days'] = safe_int(cust_agg.get('cust_max_days'), DEFAULTS['cust_max_days'])
    features['cust_invoice_count'] = safe_int(cust_agg.get('cust_invoice_count'), DEFAULTS['cust_invoice_count'])
    features['cust_avg_amount'] = safe_float(cust_agg.get('cust_avg_amount'), DEFAULTS['cust_avg_amount'])
    features['cust_total_amount'] = safe_float(cust_agg.get('cust_total_amount'), DEFAULTS['cust_total_amount'])

    # ============================================
    # Payment Terms Aggregates
    # ============================================

    features['payment_terms_avg_days'] = safe_float(pmt_agg.get('payment_terms_avg_days'), DEFAULTS['payment_terms_avg_days'])
    features['payment_terms_median_days'] = safe_float(pmt_agg.get('payment_terms_median_days'), DEFAULTS['payment_terms_median_days'])
    features['payment_terms_count'] = safe_int(pmt_agg.get('payment_terms_count'), DEFAULTS['payment_terms_count'])

    # ============================================
    # Business Code Aggregates
    # ============================================

    features['business_avg_days'] = safe_float(biz_agg.get('business_avg_days'), DEFAULTS['business_avg_days'])
    features['business_median_days'] = safe_float(biz_agg.get('business_median_days'), DEFAULTS['business_median_days'])
    features['business_count'] = safe_int(biz_agg.get('business_count'), DEFAULTS['business_count'])

    # ============================================
    # Interaction Features
    # ============================================

    cust_avg_amt = features['cust_avg_amount']
    if cust_avg_amt > 0:
        features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
    else:
        features['amount_vs_cust_avg'] = 1.0

    features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0

    # ============================================
    # Other required fields
    # ============================================

    features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
    # posting_id is kept for reference but is not part of COLUMN_ORDER below.
    features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)

    return features


def features_to_dataframe(features: Dict) -> pd.DataFrame:
    """
    Convert a feature dict to a DataFrame with the correct column order.
    Must match the training feature order exactly.
    """

    # Expected column order from training
    COLUMN_ORDER = [
        'business_code', 'buisness_year', 'document_create_date.1',
        'invoice_currency', 'document_type', 'total_open_amount',
        'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
        'posting_quarter', 'posting_day', 'posting_dayofweek',
        'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
        'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
        'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
        'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
        'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
        'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
        'business_median_days', 'business_count', 'amount_vs_cust_avg',
        'is_large_for_customer'
    ]

    # Ensure all columns are present, with safe defaults
    for col in COLUMN_ORDER:
        if col not in features:
            features[col] = 0.0  # Fallback

    # Create DataFrame with the correct column order
    df = pd.DataFrame([features])[COLUMN_ORDER]

    return df


if __name__ == "__main__":
    # Test with minimal data
    test_invoice = {
        'posting_date': '2024-01-15',
        'total_open_amount': 50000.0,
        'business_code': 'U001',
        'cust_payment_terms': 'NAH4',
        'invoice_currency': 'USD',
        'document_type': 'RV',
        'business_year': 2024,
        'days_posting_to_due': 15,
        'is_open': 1
    }

    # Test with no aggregates (should use defaults)
    features = build_features(test_invoice, None, None, None)
    df = features_to_dataframe(features)

    print("✅ Features built successfully:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print("\nSample features:")
    print(df[['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']].T)
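
End-to-end, the feature builder sits between the aggregate lookups and the model. A minimal inference sketch, assuming a joblib-saved regressor at models/clearance_model.pkl (hypothetical path; the real model loading lives elsewhere in the app); passing None for every aggregate exercises the new-customer DEFAULTS:

import joblib

from backend.feature_builder.feature_builder import build_features, features_to_dataframe

model = joblib.load("models/clearance_model.pkl")  # hypothetical path

invoice = {
    'posting_date': '2024-03-01',
    'total_open_amount': 12500.0,
    'business_code': 'U001',
    'cust_payment_terms': 'NAH4',
}

X = features_to_dataframe(build_features(invoice, None, None, None))
predicted_days_to_clear = float(model.predict(X)[0])
print(f"Predicted days to clear: {predicted_days_to_clear:.1f}")
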
backend/ingest/__init__.py
ADDED
File without changes
backend/ingest/ingest_invoice_sqlite.py
ADDED
@@ -0,0 +1,158 @@
"""
Invoice ingestion helper for SQLite.
Handles insert/update with computed fields.
"""

import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
from filelock import FileLock

DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"


def parse_date(date_input) -> Optional[str]:
    """Convert various date formats to an ISO string."""
    if not date_input:
        return None

    if isinstance(date_input, str):
        # Try parsing common formats
        for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
            try:
                dt = datetime.strptime(date_input, fmt)
                return dt.strftime("%Y-%m-%d %H:%M:%S")
            except ValueError:
                continue
        return date_input  # Return as-is if parsing fails

    if isinstance(date_input, datetime):
        return date_input.strftime("%Y-%m-%d %H:%M:%S")

    return str(date_input)


def compute_days_diff(date1_str: Optional[str], date2_str: Optional[str]) -> Optional[int]:
    """Compute the day difference between two ISO date strings."""
    if not date1_str or not date2_str:
        return None

    try:
        d1 = datetime.strptime(date1_str, "%Y-%m-%d %H:%M:%S")
        d2 = datetime.strptime(date2_str, "%Y-%m-%d %H:%M:%S")
        return (d1 - d2).days
    except (ValueError, TypeError):
        return None


def ingest_invoice(invoice_data: Dict) -> Dict:
    """
    Insert or update an invoice in SQLite with computed fields.

    Args:
        invoice_data: Dict with invoice fields

    Returns:
        Dict with status and invoice_id
    """

    # Parse dates
    posting_date = parse_date(invoice_data.get("posting_date"))
    clear_date = parse_date(invoice_data.get("clear_date"))
    due_in_date = parse_date(invoice_data.get("due_in_date"))
    document_create_date = parse_date(invoice_data.get("document_create_date"))
    baseline_create_date = parse_date(invoice_data.get("baseline_create_date"))

    # Compute derived fields
    days_to_clear = compute_days_diff(clear_date, posting_date) if clear_date else None
    days_posting_to_due = compute_days_diff(due_in_date, posting_date)
    days_create_to_posting = compute_days_diff(posting_date, document_create_date)
    days_baseline_to_posting = compute_days_diff(posting_date, baseline_create_date)

    is_open = 0 if clear_date else 1
    is_overdue = 0
    if clear_date and due_in_date:
        try:
            cd = datetime.strptime(clear_date, "%Y-%m-%d %H:%M:%S")
            dd = datetime.strptime(due_in_date, "%Y-%m-%d %H:%M:%S")
            is_overdue = 1 if cd > dd else 0
        except ValueError:
            pass

    # Prepare data
    invoice_id = invoice_data.get("invoice_id")
    if not invoice_id:
        raise ValueError("invoice_id is required")

    # SQLite write with lock
    with FileLock(str(LOCK_PATH)):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()

        cursor.execute("""
            INSERT OR REPLACE INTO invoices_history (
                invoice_id, business_code, cust_number, name_customer,
                posting_date, document_create_date, document_create_date_alt,
                due_in_date, baseline_create_date, clear_date,
                total_open_amount, invoice_currency, document_type,
                cust_payment_terms, posting_id, business_year,
                days_to_clear, days_posting_to_due, days_create_to_posting,
                days_baseline_to_posting, is_overdue, is_open,
                updated_at
            ) VALUES (
                ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP
            )
        """, (
            invoice_id,
            invoice_data.get("business_code"),
            invoice_data.get("cust_number"),
            invoice_data.get("name_customer"),
            posting_date,
            document_create_date,
            invoice_data.get("document_create_date_alt"),
            due_in_date,
            baseline_create_date,
            clear_date,
            invoice_data.get("total_open_amount"),
            invoice_data.get("invoice_currency", "USD"),
            invoice_data.get("document_type"),
            invoice_data.get("cust_payment_terms"),
            invoice_data.get("posting_id"),
            invoice_data.get("business_year"),
            days_to_clear,
            days_posting_to_due,
            days_create_to_posting,
            days_baseline_to_posting,
            is_overdue,
            is_open
        ))

        conn.commit()
        conn.close()

    return {
        "status": "success",
        "invoice_id": invoice_id,
        "is_open": bool(is_open),
        "days_to_clear": days_to_clear
    }


if __name__ == "__main__":
    # Test
    test_invoice = {
        "invoice_id": 12345,
        "business_code": "U001",
        "cust_number": "0200769623",
        "name_customer": "Test Customer",
        "posting_date": "2024-01-15",
        "clear_date": "2024-02-01",
        "due_in_date": "2024-01-30",
        "total_open_amount": 50000.0,
        "cust_payment_terms": "NAH4"
    }

    result = ingest_invoice(test_invoice)
    print(result)
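
Because is_open and is_overdue are derived from the presence of clear_date, ingesting a still-open invoice is the same call without that field. A quick sketch (field values are illustrative):

from backend.ingest.ingest_invoice_sqlite import ingest_invoice

open_invoice = {
    "invoice_id": 12346,
    "business_code": "U001",
    "cust_number": "0200769623",
    "posting_date": "2024-03-01",
    "due_in_date": "2024-03-16",   # no clear_date -> is_open stays 1
    "total_open_amount": 12500.0,
    "cust_payment_terms": "NAH4",
}

result = ingest_invoice(open_invoice)
# Expected shape: {"status": "success", "invoice_id": 12346, "is_open": True, "days_to_clear": None}
print(result)
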
backend/worker/job_processor.py
ADDED
@@ -0,0 +1,126 @@
"""
Background worker for processing ingest jobs.
Consumes jobs from the Redis queue and processes them.
"""

import sqlite3
import logging
from pathlib import Path
from filelock import FileLock
from typing import Dict, Optional
import traceback

from .text_extractor import extract_text

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
BASE_DIR = Path(__file__).parent.parent.parent
DB_PATH = BASE_DIR / "data" / "invoices.db"
LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"


def update_job_status(job_id: str, status: str, error_message: Optional[str] = None):
    """Update job status in the database."""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()

        if status == "processing":
            cursor.execute("""
                UPDATE ingest_jobs
                SET status = ?, started_at = CURRENT_TIMESTAMP
                WHERE job_id = ?
            """, (status, job_id))
        elif status == "completed":
            cursor.execute("""
                UPDATE ingest_jobs
                SET status = ?, completed_at = CURRENT_TIMESTAMP
                WHERE job_id = ?
            """, (status, job_id))
        elif status == "failed":
            cursor.execute("""
                UPDATE ingest_jobs
                SET status = ?, error_message = ?, completed_at = CURRENT_TIMESTAMP
                WHERE job_id = ?
            """, (status, error_message, job_id))

        conn.commit()
        conn.close()


def save_extraction(document_id: int, raw_text: str, metadata: Dict):
    """Save extracted text to the database."""
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        cursor = conn.cursor()

        cursor.execute("""
            INSERT INTO extractions (
                document_id,
                raw_text,
                page_count,
                extraction_method,
                confidence_score
            ) VALUES (?, ?, ?, ?, ?)
        """, (
            document_id,
            raw_text,
            metadata.get('page_count'),
            metadata.get('extraction_method'),
            metadata.get('confidence_score')
        ))

        conn.commit()
        conn.close()


def process_job(job_data: Dict):
    """
    Process a single ingest job.

    Args:
        job_data: Dict with job_id, document_id, file_path, mime_type
    """
    job_id = job_data['job_id']
    document_id = job_data['document_id']
    file_path = Path(job_data['file_path'])
    mime_type = job_data['mime_type']

    logger.info(f"Processing job {job_id} for document {document_id}")

    try:
        # Update status to processing
        update_job_status(job_id, "processing")

        # Extract text
        logger.info(f"Extracting text from {file_path}")
        raw_text, metadata = extract_text(file_path, mime_type)

        if not raw_text or len(raw_text.strip()) < 10:
            raise ValueError("No text extracted or text too short")

        logger.info(f"Extracted {len(raw_text)} characters, {metadata['page_count']} pages")

        # Save to database
        save_extraction(document_id, raw_text, metadata)

        # Update status to completed
        update_job_status(job_id, "completed")

        logger.info(f"Job {job_id} completed successfully")

    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)}"
        logger.error(f"Job {job_id} failed: {error_msg}")
        logger.error(traceback.format_exc())

        # Update status to failed, then re-raise so RQ also marks the job failed
        update_job_status(job_id, "failed", error_msg)
        raise
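
The producer side of this queue is not in this file. A minimal enqueue sketch, assuming the API has already written the ingest_jobs row and uses the same queue name the worker listens on (the job_id, document_id, and file path are illustrative):

from redis import Redis
from rq import Queue

from backend.worker.job_processor import process_job

q = Queue("invoice_ingest", connection=Redis(host="redis", port=6379))

# job_data keys must match what process_job expects.
q.enqueue(process_job, {
    "job_id": "job-123",
    "document_id": 1,
    "file_path": "/app/data/docs/invoice_001.pdf",
    "mime_type": "application/pdf",
})
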
backend/worker/text_extractor.py
ADDED
@@ -0,0 +1,129 @@
"""
Text extraction utilities for PDFs and images.
Supports both digital PDFs and scanned documents (OCR).
NOTE: pytesseract is a thin wrapper around the tesseract-ocr binary,
which must be installed on the system for the OCR paths to work.
"""

import pdfplumber
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from pathlib import Path
from typing import Dict, Tuple
import logging

logger = logging.getLogger(__name__)


def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a PDF using pdfplumber (for digital PDFs).

    Returns:
        (raw_text, metadata)
    """
    try:
        text_pages = []
        page_count = 0

        with pdfplumber.open(str(file_path)) as pdf:
            page_count = len(pdf.pages)

            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    text_pages.append(text)

        raw_text = "\n\n".join(text_pages)

        metadata = {
            "page_count": page_count,
            "extraction_method": "pdfplumber",
            "confidence_score": 1.0 if len(raw_text) > 50 else 0.5
        }

        # If no text was extracted, it might be a scanned PDF
        if not raw_text.strip():
            logger.info("No text found with pdfplumber, trying OCR...")
            return extract_text_from_pdf_ocr(file_path)

        return raw_text, metadata

    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        raise


def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a scanned PDF using OCR (PyMuPDF + Tesseract).
    """
    try:
        text_pages = []
        doc = fitz.open(str(file_path))
        page_count = len(doc)

        for page_num in range(page_count):
            page = doc[page_num]
            # Render the page to an image
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # OCR
            text = pytesseract.image_to_string(img)
            text_pages.append(text)

        doc.close()
        raw_text = "\n\n".join(text_pages)

        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7  # OCR is typically less confident
        }

        return raw_text, metadata

    except Exception as e:
        logger.error(f"OCR extraction failed: {e}")
        raise


def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from an image using OCR (Tesseract).
    """
    try:
        img = Image.open(str(file_path))
        raw_text = pytesseract.image_to_string(img)

        metadata = {
            "page_count": 1,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7
        }

        return raw_text, metadata

    except Exception as e:
        logger.error(f"Image OCR failed: {e}")
        raise


def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Main entry point for text extraction.
    Routes to the appropriate extractor based on file type.

    Args:
        file_path: Path to the document
        mime_type: MIME type of the document

    Returns:
        (raw_text, metadata_dict)
    """
    if mime_type == "application/pdf":
        return extract_text_from_pdf(file_path)
    elif mime_type in ["image/png", "image/jpeg", "image/jpg"]:
        return extract_text_from_image(file_path)
    else:
        raise ValueError(f"Unsupported file type: {mime_type}")
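
Callers pass the MIME type explicitly, so routing is a one-liner at the call site. A usage sketch (the path is illustrative):

from pathlib import Path

from backend.worker.text_extractor import extract_text

text, meta = extract_text(Path("/app/data/docs/invoice_001.pdf"), "application/pdf")
print(meta["extraction_method"], meta["page_count"], len(text))
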
backend/worker/worker.py
ADDED
@@ -0,0 +1,30 @@
"""
RQ worker entry point: listens on the ingest queue and runs its jobs.
"""

import os
import sys
from pathlib import Path
from redis import Redis
from rq import Worker, Connection

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Import job processor so the function referenced by queued jobs resolves
from backend.worker.job_processor import process_job  # noqa: F401

# Redis connection
REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_DB = int(os.getenv('REDIS_DB', 0))
QUEUE_NAME = os.getenv('REDIS_QUEUE_NAME', 'invoice_ingest')

redis_conn = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)


if __name__ == '__main__':
    print(f"🚀 Starting worker for queue: {QUEUE_NAME}")
    print(f"📡 Redis: {REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}")

    with Connection(redis_conn):
        worker = Worker([QUEUE_NAME])
        worker.work()
requirements.txt
ADDED
@@ -0,0 +1,47 @@
# Web Framework
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0

# ML & Data
pandas==2.1.3
numpy==1.26.2
scikit-learn==1.6.1
lightgbm==4.1.0
joblib==1.3.2

# Utilities
python-dateutil==2.8.2
filelock==3.13.1
python-multipart==0.0.6

# Testing (optional)
httpx==0.25.2
pytest==7.4.3

# Queue
redis==5.0.1
rq==1.15.1

# NEW: Text extraction (Item 1)
# NOTE: pytesseract also needs the tesseract-ocr system binary installed.
pdfplumber==0.10.3
PyMuPDF==1.23.8
pytesseract==0.3.10
Pillow==10.1.0

# NEW: Utilities
python-magic==0.4.27  # File type detection
# uuid==1.30  # removed: uuid is in the standard library; this old PyPI package shadows it
requests==2.31.0
python-dotenv==1.0.0

# NEW: Database
psycopg2-binary==2.9.7
SQLAlchemy==2.0.20
alembic==1.11.1

google-generativeai>=0.8.0