Dipan04 committed
Commit 8a859a8 · 1 Parent(s): ec7fdf7

Deploy Invoice Digitization Agent
.dockerignore ADDED
@@ -0,0 +1,18 @@
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ venv/
+ env/
+ .venv
+ .git
+ .gitignore
+ .vscode
+ .idea
+ *.log
+ *.db-journal
+ .env
+ Dockerfile
+ docker-compose.yml
+ README*.md
+ *.md
.gitignore ADDED
@@ -0,0 +1,3 @@
+
+ __pycache__/
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     sqlite3 \
+     ca-certificates \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python packages
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Upgrade Gemini SDK for v1 API
+ RUN pip install --no-cache-dir --upgrade google-generativeai google-ai-generativelanguage
+
+ # Copy application code
+ COPY . .
+
+ # Create necessary directories
+ RUN mkdir -p /app/data/logs /app/data/docs && chmod -R 777 /app/data
+
+ # Create __init__.py files (including the agent directory)
+ RUN touch backend/__init__.py \
+     && touch backend/feature_builder/__init__.py \
+     && touch backend/app/__init__.py \
+     && touch backend/app/api/__init__.py \
+     && touch backend/app/agent/__init__.py \
+     && touch backend/app/wrappers/__init__.py \
+     && touch backend/ingest/__init__.py
+
+ # Verify agent files exist (will fail the build if missing)
+ RUN test -f backend/app/agent/agent_orchestrator.py || \
+     (echo "ERROR: agent_orchestrator.py not found! Add it before building." && exit 1)
+
+ # Initialize database if it doesn't exist
+ RUN if [ ! -f /app/data/invoices.db ]; then \
+     sqlite3 /app/data/invoices.db < backend/database/init_schema_sqlite.sql; \
+     fi
+
+ # Expose port
+ EXPOSE 7860
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "75"]
app.py ADDED
@@ -0,0 +1,415 @@
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import Optional, Dict
+ import sqlite3
+ import joblib
+ import pandas as pd
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from filelock import FileLock
+ from fastapi.responses import JSONResponse
+ import json
+ import sys
+
+ import logging
+
+ # Setup logging for entire app
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True
+ )
+
+ # Setup paths
+ BASE_DIR = Path(__file__).parent
+ DB_PATH = BASE_DIR / "data" / "invoices.db"  # Inside container: /app/data/invoices.db
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
+ MODEL_PATH = BASE_DIR / "ml" / "models" / "payment_predictor_model_20251124_194847.pkl"
+ LOG_DIR = BASE_DIR / "data" / "logs"
+ PREDICTIONS_LOG = LOG_DIR / "predictions.csv"
+
+ # Ensure directories exist
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
+
+ # Add backend to path
+ sys.path.append(str(BASE_DIR / "backend"))
+
+ # Import feature builder
+ from backend.feature_builder.feature_builder import build_features, features_to_dataframe
+ from backend.ingest.ingest_invoice_sqlite import ingest_invoice as ingest_func
+
+ # ============================================
+ # IMPORT INGEST ROUTER (NEW)
+ # ============================================
+ from backend.app.api.ingest import router as ingest_router
+
+ # Load ML model
+ print("🤖 Loading ML model...")
+ try:
+     model_artifacts = joblib.load(MODEL_PATH)
+     model = model_artifacts['model']
+     print(f"✅ Model loaded: {MODEL_PATH.name}")
+ except Exception as e:
+     print(f"❌ Failed to load model: {e}")
+     model = None
+
+ # FastAPI app
+ app = FastAPI(
+     title="Invoice Payment Predictor",
+     description="Predicts payment clearing time for invoices",
+     version="1.0.0"
+ )
+
+ # ============================================
+ # REGISTER INGEST ROUTER (NEW)
+ # ============================================
+ app.include_router(ingest_router)
+
+
+ # ============================================
+ # Pydantic Models
+ # ============================================
+
+ class InvoiceIngest(BaseModel):
+     invoice_id: int
+     business_code: str
+     cust_number: str
+     name_customer: Optional[str] = None
+     posting_date: str
+     document_create_date: Optional[str] = None
+     document_create_date_alt: Optional[str] = None
+     due_in_date: Optional[str] = None
+     baseline_create_date: Optional[str] = None
+     clear_date: Optional[str] = None
+     total_open_amount: float
+     invoice_currency: str = "USD"
+     document_type: Optional[str] = "RV"
+     cust_payment_terms: Optional[str] = None
+     posting_id: Optional[float] = None
+     business_year: Optional[int] = None
+
+
+ class PredictionRequest(BaseModel):
+     invoice_id: Optional[int] = None
+     cust_number: str
+     posting_date: str
+     total_open_amount: float
+     business_code: str = "U001"
+     cust_payment_terms: str = "NAH4"
+     invoice_currency: str = "USD"
+     document_type: str = "RV"
+     due_in_date: Optional[str] = None
+     business_year: Optional[int] = None
+
+
+ # ============================================
+ # Helper Functions
+ # ============================================
+
+ def get_customer_aggregates(cust_number: str) -> Optional[Dict]:
+     """Fetch customer aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM customer_aggregates WHERE cust_number = ?
+             """, (cust_number,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching customer aggregates: {e}")
+
+     return None
+
+
+ def get_payment_terms_aggregates(payment_terms: str) -> Optional[Dict]:
+     """Fetch payment terms aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM payment_terms_aggregates WHERE cust_payment_terms = ?
+             """, (payment_terms,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching payment terms: {e}")
+
+     return None
+
+
+ def get_business_code_aggregates(business_code: str) -> Optional[Dict]:
+     """Fetch business code aggregates from SQLite."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT * FROM business_code_aggregates WHERE business_code = ?
+             """, (business_code,))
+
+             row = cursor.fetchone()
+             conn.close()
+
+             if row:
+                 return dict(row)
+     except Exception as e:
+         print(f"Error fetching business code: {e}")
+
+     return None
+
+
+ def log_prediction_to_csv(prediction_data: Dict):
+     """Append prediction to CSV log."""
+     df = pd.DataFrame([prediction_data])
+
+     if not PREDICTIONS_LOG.exists():
+         df.to_csv(PREDICTIONS_LOG, index=False)
+     else:
+         df.to_csv(PREDICTIONS_LOG, mode='a', header=False, index=False)
+
+
+ def log_prediction_to_db(prediction_data: Dict):
+     """Insert prediction into SQLite predictions_log."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 INSERT INTO predictions_log (
+                     invoice_id, cust_number, posting_date, total_open_amount,
+                     business_code, cust_payment_terms, predicted_days_to_clear,
+                     predicted_clear_date, model_version, features_json
+                 ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, (
+                 prediction_data.get('invoice_id'),
+                 prediction_data['cust_number'],
+                 prediction_data['posting_date'],
+                 prediction_data['total_open_amount'],
+                 prediction_data.get('business_code'),
+                 prediction_data.get('cust_payment_terms'),
+                 prediction_data['predicted_days_to_clear'],
+                 prediction_data['predicted_clear_date'],
+                 prediction_data.get('model_version', 'v1.0'),
+                 json.dumps(prediction_data.get('features', {}))
+             ))
+
+             prediction_id = cursor.lastrowid
+             conn.commit()
+             conn.close()
+
+             return prediction_id
+     except Exception as e:
+         print(f"Error logging to DB: {e}")
+         return None
+
+
+ # ============================================
+ # API Endpoints
+ # ============================================
+
+ @app.get("/")
+ def root():
+     """Root endpoint."""
+     return {
+         "service": "Invoice Payment Predictor",
+         "version": "1.0.0",
+         "status": "operational",
+         "model_loaded": model is not None
+     }
+
+
+ @app.get("/health")
+ def health():
+     return JSONResponse(
+         content={
+             "status": "ok",
+             "model_loaded": model is not None,
+             "db_exists": DB_PATH.exists()
+         },
+         media_type="application/json"
+     )
+
+
+ @app.post("/ingest")
+ def ingest_invoice(invoice: InvoiceIngest):
+     """
+     Ingest invoice into SQLite database.
+     Computes derived fields and stores data.
+     """
+     try:
+         result = ingest_func(invoice.dict())
+
+         return {
+             "status": "success",
+             "message": "Invoice ingested successfully",
+             "data": result
+         }
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}")
+
+
+ @app.get("/features/{cust_number}")
+ def get_features(cust_number: str):
+     """
+     Get customer aggregate features.
+     Returns cached aggregates or defaults for new customers.
+     """
+     customer_agg = get_customer_aggregates(cust_number)
+
+     if not customer_agg:
+         return {
+             "cust_number": cust_number,
+             "status": "new_customer",
+             "message": "No historical data found, using defaults",
+             "features": {
+                 "cust_avg_days": 18.0,
+                 "cust_median_days": 15.0,
+                 "cust_invoice_count": 0
+             }
+         }
+
+     return {
+         "cust_number": cust_number,
+         "status": "existing_customer",
+         "features": customer_agg
+     }
+
+
+ @app.post("/predict")
+ def predict(request: PredictionRequest):
+     """
+     Predict payment clearing time for an invoice.
+
+     Returns:
+     - predicted_days_to_clear
+     - predicted_clear_date
+     - confidence info
+     """
+     if model is None:
+         raise HTTPException(status_code=503, detail="ML model not loaded")
+
+     try:
+         # Fetch aggregates
+         customer_agg = get_customer_aggregates(request.cust_number)
+         payment_agg = get_payment_terms_aggregates(request.cust_payment_terms)
+         business_agg = get_business_code_aggregates(request.business_code)
+
+         # Build invoice data dict
+         invoice_data = request.dict()
+
+         # Compute days_posting_to_due if due_in_date provided
+         if request.due_in_date:
+             posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
+             due_dt = datetime.strptime(request.due_in_date, "%Y-%m-%d")
+             invoice_data['days_posting_to_due'] = (due_dt - posting_dt).days
+         else:
+             invoice_data['days_posting_to_due'] = 15  # Default
+
+         # Build features
+         features = build_features(invoice_data, customer_agg, payment_agg, business_agg)
+         features_df = features_to_dataframe(features)
+
+         # Predict
+         predicted_days = float(model.predict(features_df)[0])
+
+         # Calculate predicted clear date
+         posting_dt = datetime.strptime(request.posting_date, "%Y-%m-%d")
+         predicted_clear_dt = posting_dt + timedelta(days=predicted_days)
+
+         # Prepare response
+         response = {
+             "invoice_id": request.invoice_id,
+             "cust_number": request.cust_number,
+             "posting_date": request.posting_date,
+             "total_open_amount": request.total_open_amount,
+             "predicted_days_to_clear": round(predicted_days, 2),
+             "predicted_clear_date": predicted_clear_dt.strftime("%Y-%m-%d"),
+             "customer_history": "available" if customer_agg else "new_customer",
+             "model_version": "v1.0"
+         }
+
+         # Log prediction
+         log_prediction_to_csv(response)
+         prediction_id = log_prediction_to_db({
+             **response,
+             'business_code': request.business_code,
+             'cust_payment_terms': request.cust_payment_terms,
+             'features': features
+         })
+
+         response['prediction_id'] = prediction_id
+
+         return response
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
+
+
+ @app.get("/predictions/recent")
+ def get_recent_predictions(limit: int = 10):
+     """Get recent predictions from log."""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 SELECT
+                     prediction_id,
+                     cust_number,
+                     posting_date,
+                     predicted_days_to_clear,
+                     predicted_clear_date,
+                     predicted_at
+                 FROM predictions_log
+                 ORDER BY predicted_at DESC
+                 LIMIT ?
+             """, (limit,))
+
+             rows = cursor.fetchall()
+             conn.close()
+
+             return {
+                 "count": len(rows),
+                 "predictions": [dict(row) for row in rows]
+             }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to fetch predictions: {str(e)}")
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(
+         app,
+         host="0.0.0.0",
+         port=7860,
+         timeout_keep_alive=75,
+         timeout_graceful_shutdown=10
+     )
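
For reference, a minimal client sketch for the POST /predict endpoint above, assuming the service is reachable on localhost:7860 (the port the Dockerfile exposes). Field names follow the PredictionRequest model; the values are illustrative.

import httpx

payload = {
    "cust_number": "CUST-001",      # required; illustrative customer id
    "posting_date": "2024-03-01",   # required; YYYY-MM-DD, as the endpoint parses it
    "total_open_amount": 1250.50,   # required
    "due_in_date": "2024-03-31",    # optional; drives days_posting_to_due
}

resp = httpx.post("http://localhost:7860/predict", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print(result["predicted_days_to_clear"], result["predicted_clear_date"])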
backend/__init__.py ADDED
File without changes
backend/app/__init__.py ADDED
File without changes
backend/app/agent/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Agent module for autonomous invoice processing.
+ """
+
+ from .agent_orchestrator import (
+     InvoiceAgent,
+     AgentState,
+     AgentDecision,
+     create_agent,
+     run_agent_pipeline
+ )
+
+ __all__ = [
+     'InvoiceAgent',
+     'AgentState',
+     'AgentDecision',
+     'create_agent',
+     'run_agent_pipeline'
+ ]
backend/app/agent/agent_orchestrator.py ADDED
@@ -0,0 +1,476 @@
+ """
+ True End-to-End Agent Orchestrator
+ ===================================
+ Autonomous agent that:
+ 1. Decides which tools to use based on document analysis
+ 2. Validates its own output
+ 3. Self-corrects when confidence is low
+ 4. Learns from patterns
+ """
+
+ import json
+ import sys
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+ from enum import Enum
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True
+ )
+ logger = logging.getLogger(__name__)
+
+
+ class AgentDecision(Enum):
+     """Agent's possible decisions"""
+     EXTRACT_TEXT = "extract_text"
+     EXTRACT_TABLES = "extract_tables"
+     RUN_NER = "run_ner"
+     USE_GEMINI = "use_gemini"
+     USE_REGEX = "use_regex"
+     VALIDATE = "validate"
+     RETRY = "retry"
+     COMPLETE = "complete"
+     HUMAN_REVIEW = "human_review"
+
+
+ @dataclass
+ class AgentState:
+     """Agent's internal state"""
+     doc_id: str
+     file_path: Path
+
+     # Extracted data
+     raw_text: Optional[str] = None
+     tables: Optional[List] = None
+     entities: Optional[List] = None
+     entity_map: Optional[Dict] = None
+
+     # Mapped fields
+     fields: Optional[Dict] = None
+     confidence_map: Optional[Dict] = None
+
+     # Decision tracking
+     attempts: int = 0
+     max_attempts: int = 3
+     history: List[str] = None
+     errors: List[str] = None
+
+     def __post_init__(self):
+         if self.history is None:
+             self.history = []
+         if self.errors is None:
+             self.errors = []
+
+
+ class InvoiceAgent:
+     """
+     Autonomous agent that processes invoices with self-correction.
+     """
+
+     def __init__(self, text_extractor, table_extractor, ner_extractor, gemini_mapper):
+         """
+         Args:
+             text_extractor: Function(file_path) -> (success, text, error)
+             table_extractor: Function(file_path) -> (success, tables, error)
+             ner_extractor: Function(text) -> (success, entities, entity_map, error)
+             gemini_mapper: Function(text, entities, entity_map, tables) -> (success, fields, error)
+         """
+         self.text_extractor = text_extractor
+         self.table_extractor = table_extractor
+         self.ner_extractor = ner_extractor
+         self.gemini_mapper = gemini_mapper
+
+         # Minimum confidence thresholds
+         self.MIN_CONFIDENCE = {
+             'cust_number': 0.6,
+             'posting_date': 0.7,
+             'total_open_amount': 0.7,
+             'cust_payment_terms': 0.5
+         }
+
+     def process(self, state: AgentState) -> AgentState:
+         """
+         Main agent loop - autonomous decision-making and execution.
+         """
+         logger.info("=" * 70)
+         logger.info(f"**** AGENT STARTING: {state.file_path.name}")
+         logger.info("=" * 70)
+
+         while state.attempts < state.max_attempts:
+             state.attempts += 1
+             logger.info(f"\n**** ATTEMPT {state.attempts}/{state.max_attempts}")
+
+             # Step 1: Decide next action
+             decision = self._decide_next_action(state)
+             logger.info(f"**** DECISION: {decision.value}")
+             state.history.append(decision.value)
+
+             # Step 2: Execute action
+             success = self._execute_action(decision, state)
+
+             if not success:
+                 logger.warning(f"**** Action {decision.value} failed")
+                 continue
+
+             # Step 3: Check if we're done
+             if decision == AgentDecision.COMPLETE:
+                 logger.info("**** AGENT COMPLETE")
+                 break
+
+             if decision == AgentDecision.HUMAN_REVIEW:
+                 logger.info("**** AGENT REQUESTING HUMAN REVIEW")
+                 break
+
+         logger.info("=" * 70)
+         logger.info(f"**** Final confidence: {self._calculate_overall_confidence(state):.2f}")
+         logger.info(f"**** Actions taken: {' → '.join(state.history)}")
+         logger.info("=" * 70)
+
+         return state
+
+     def _decide_next_action(self, state: AgentState) -> AgentDecision:
+         """
+         Agent's brain - decides what to do next based on current state.
+         """
+         # 1. If no text, extract it
+         if state.raw_text is None:
+             return AgentDecision.EXTRACT_TEXT
+
+         # 2. If text exists but no entities, run NER
+         if state.entities is None:
+             return AgentDecision.RUN_NER
+
+         # 3. If no fields mapped yet, try Gemini first
+         if state.fields is None:
+             return AgentDecision.USE_GEMINI
+
+         # 4. If fields exist, validate them
+         if not self._is_validated(state):
+             return AgentDecision.VALIDATE
+
+         # 5. Check confidence - retry if low
+         overall_confidence = self._calculate_overall_confidence(state)
+
+         if overall_confidence < 0.6 and state.attempts < state.max_attempts:
+             # Try alternative approach
+             if 'use_gemini' in state.history and 'use_regex' not in state.history:
+                 return AgentDecision.USE_REGEX
+             elif 'extract_tables' not in state.history:
+                 return AgentDecision.EXTRACT_TABLES
+             else:
+                 return AgentDecision.RETRY
+
+         # 6. If still low confidence, request human review
+         if overall_confidence < 0.5:
+             return AgentDecision.HUMAN_REVIEW
+
+         # 7. Otherwise, we're done!
+         return AgentDecision.COMPLETE
+
+     def _execute_action(self, decision: AgentDecision, state: AgentState) -> bool:
+         """Execute the decided action."""
+         try:
+             if decision == AgentDecision.EXTRACT_TEXT:
+                 return self._extract_text(state)
+
+             elif decision == AgentDecision.EXTRACT_TABLES:
+                 return self._extract_tables(state)
+
+             elif decision == AgentDecision.RUN_NER:
+                 return self._run_ner(state)
+
+             elif decision == AgentDecision.USE_GEMINI:
+                 return self._use_gemini(state)
+
+             elif decision == AgentDecision.USE_REGEX:
+                 return self._use_regex(state)
+
+             elif decision == AgentDecision.VALIDATE:
+                 return self._validate_fields(state)
+
+             elif decision == AgentDecision.RETRY:
+                 # Clear fields and try again with different approach
+                 state.fields = None
+                 state.confidence_map = None
+                 return True
+
+             elif decision in [AgentDecision.COMPLETE, AgentDecision.HUMAN_REVIEW]:
+                 return True
+
+             return False
+
+         except Exception as e:
+             logger.error(f"**** Action failed: {e}")
+             state.errors.append(str(e))
+             return False
+
+     def _extract_text(self, state: AgentState) -> bool:
+         """Extract text from document."""
+         logger.info("**** Extracting text...")
+         success, text, error = self.text_extractor(state.file_path)
+
+         if success and text and len(text.strip()) > 10:
+             state.raw_text = text
+             logger.info(f"**** Extracted {len(text)} characters")
+             return True
+
+         state.errors.append(f"Text extraction failed: {error}")
+         return False
+
+     def _extract_tables(self, state: AgentState) -> bool:
+         """Extract tables from document."""
+         logger.info("**** Extracting tables...")
+         success, tables, error = self.table_extractor(state.file_path)
+
+         if success:
+             state.tables = tables
+             logger.info(f"**** Extracted {len(tables)} tables")
+             return True
+
+         logger.warning(f"**** Table extraction failed: {error}")
+         state.tables = []
+         return True  # Non-critical, continue
+
+     def _run_ner(self, state: AgentState) -> bool:
+         """Run Named Entity Recognition."""
+         logger.info("**** Running NER...")
+         success, entities, entity_map, error = self.ner_extractor(state.raw_text)
+
+         if success:
+             state.entities = entities
+             state.entity_map = entity_map
+             logger.info(f"**** Found {len(entities)} entities")
+             return True
+
+         logger.warning(f"**** NER failed: {error}")
+         state.entities = []
+         state.entity_map = {}
+         return True  # Non-critical, continue
+
+     def _use_gemini(self, state: AgentState) -> bool:
+         """Use Gemini for intelligent mapping."""
+         logger.info("**** Using Gemini mapping...")
+
+         success, result, error = self.gemini_mapper(
+             state.raw_text,
+             state.entities or [],
+             state.entity_map or {},
+             state.tables or []
+         )
+
+         if success and result:
+             state.fields = {
+                 'cust_number': result.get('customer_name', 'UNKNOWN')[:20],
+                 'posting_date': result.get('date', '2024-01-01'),
+                 'total_open_amount': float(result.get('total_amount', 0.0)),
+                 'business_code': 'U001',
+                 'cust_payment_terms': result.get('payment_terms', 'NAH4')[:10]
+             }
+
+             # High confidence from Gemini
+             state.confidence_map = {
+                 'cust_number': 0.9,
+                 'posting_date': 0.9,
+                 'total_open_amount': 0.9,
+                 'business_code': 0.3,
+                 'cust_payment_terms': 0.8
+             }
+
+             logger.info(f"**** Gemini mapped: {state.fields}")
+             return True
+
+         logger.warning(f"**** Gemini failed: {error}")
+         state.errors.append(f"Gemini mapping failed: {error}")
+         return False
+
+     def _use_regex(self, state: AgentState) -> bool:
+         """Fallback regex-based extraction."""
+         logger.info("**** Using regex fallback...")
+
+         from backend.app.api.ingest import map_with_regex
+
+         fields, confidence = map_with_regex(state.raw_text, state.entities or [])
+         state.fields = fields
+         state.confidence_map = confidence
+
+         logger.info(f"**** Regex mapped: {fields}")
+         return True
+
+     def _validate_fields(self, state: AgentState) -> bool:
+         """
+         Validate extracted fields using business rules.
+         Agent learns if data makes sense.
+         """
+         logger.info("✓ Validating fields...")
+
+         if not state.fields:
+             return False
+
+         validation_results = {}
+
+         # 1. Customer number shouldn't be empty or generic
+         cust = state.fields.get('cust_number', '')
+         if cust and cust != 'UNKNOWN' and len(cust) > 2:
+             validation_results['cust_number'] = True
+         else:
+             validation_results['cust_number'] = False
+             logger.warning("**** Customer number looks invalid")
+
+         # 2. Date should be reasonable (not default)
+         date = state.fields.get('posting_date', '')
+         if date and date != '2024-01-01':
+             validation_results['posting_date'] = True
+         else:
+             validation_results['posting_date'] = False
+             logger.warning("**** Date looks like default value")
+
+         # 3. Amount should be > 0
+         amount = state.fields.get('total_open_amount', 0.0)
+         if amount > 0:
+             validation_results['total_open_amount'] = True
+         else:
+             validation_results['total_open_amount'] = False
+             logger.warning("**** Amount is zero or missing")
+
+         # Adjust confidence based on validation
+         for field, is_valid in validation_results.items():
+             if not is_valid and state.confidence_map:
+                 state.confidence_map[field] *= 0.5  # Reduce confidence
+
+         # Mark as validated
+         state.history.append('validated')
+
+         success_count = sum(validation_results.values())
+         logger.info(f"✓ Validation: {success_count}/{len(validation_results)} checks passed")
+
+         return success_count >= 2  # At least 2 fields should be valid
+
+     def _is_validated(self, state: AgentState) -> bool:
+         """Check if validation has been performed."""
+         return 'validated' in state.history
+
+     def _calculate_overall_confidence(self, state: AgentState) -> float:
+         """Calculate overall confidence score."""
+         if not state.confidence_map:
+             return 0.0
+
+         # Weighted average (important fields have more weight)
+         weights = {
+             'cust_number': 0.3,
+             'posting_date': 0.2,
+             'total_open_amount': 0.3,
+             'cust_payment_terms': 0.1,
+             'business_code': 0.1
+         }
+
+         total_confidence = 0.0
+         total_weight = 0.0
+
+         for field, weight in weights.items():
+             if field in state.confidence_map:
+                 total_confidence += state.confidence_map[field] * weight
+                 total_weight += weight
+
+         return total_confidence / total_weight if total_weight > 0 else 0.0
+
+
+ # ==============================================
+ # Integration with existing code
+ # ==============================================
+
+ def create_agent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn):
+     """
+     Factory function to create agent with your existing functions.
+
+     Usage:
+         from backend.app.api.ingest import (
+             call_text_extractor, call_table_extractor,
+             call_ner, map_with_gemini
+         )
+
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         state = AgentState(doc_id="doc123", file_path=Path("invoice.pdf"))
+         result_state = agent.process(state)
+     """
+     return InvoiceAgent(text_extractor_fn, table_extractor_fn, ner_fn, gemini_fn)
+
+
+ def run_agent_pipeline(job_id: str, doc_id: str, file_path: Path):
+     """
+     Replace your existing process_document() with this agentic version.
+     """
+     from backend.app.api.ingest import (
+         call_text_extractor, call_table_extractor,
+         call_ner, map_with_gemini,
+         save_extraction, save_invoice_fields,
+         update_job_status
+     )
+
+     try:
+         update_job_status(job_id, 'processing')
+
+         # Create agent
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         # Initialize state
+         state = AgentState(doc_id=doc_id, file_path=file_path)
+
+         # Let agent decide and execute autonomously
+         result_state = agent.process(state)
+
+         # Save results
+         if result_state.fields:
+             save_extraction(
+                 doc_id,
+                 result_state.raw_text,
+                 result_state.tables or [],
+                 result_state.entities or [],
+                 {
+                     'method': 'autonomous_agent',
+                     'attempts': result_state.attempts,
+                     'actions': result_state.history,
+                     'confidence': agent._calculate_overall_confidence(result_state)
+                 },
+                 None
+             )
+
+             save_invoice_fields(
+                 doc_id,
+                 result_state.fields,
+                 result_state.confidence_map or {}
+             )
+
+             # Check if needs human review
+             if AgentDecision.HUMAN_REVIEW.value in result_state.history:
+                 update_job_status(job_id, 'needs_review')
+             else:
+                 update_job_status(job_id, 'completed')
+
+             logger.info(f"**** Agent completed with {len(result_state.history)} actions")
+         else:
+             update_job_status(job_id, 'failed', 'Agent could not extract fields')
+
+     except Exception as e:
+         logger.error(f"**** Agent failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
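
A minimal sketch of driving the orchestrator with stub tools; the stub names and return values are illustrative, and the real callables live in backend.app.api.ingest. Note that each iteration of process() executes exactly one action, so the default max_attempts=3 stops after Gemini mapping; raising it lets validation and COMPLETE run.

from pathlib import Path
from backend.app.agent import AgentState, create_agent

def stub_text(path):            # matches text_extractor: (success, text, error)
    return True, "INVOICE\nBill To: Acme Corp\nTotal: $1,250.50", None

def stub_tables(path):          # matches table_extractor: (success, tables, error)
    return True, [], None

def stub_ner(text):             # matches ner_extractor: (success, entities, entity_map, error)
    return True, [{"entity_type": "ORG", "text": "Acme Corp"}], {"ORG": ["Acme Corp"]}, None

def stub_gemini(text, entities, entity_map, tables):  # (success, fields, error)
    return True, {"customer_name": "Acme Corp", "date": "2024-03-01",
                  "total_amount": 1250.50, "payment_terms": "NET30"}, None

agent = create_agent(stub_text, stub_tables, stub_ner, stub_gemini)
state = AgentState(doc_id="doc123", file_path=Path("invoice.pdf"), max_attempts=8)
result = agent.process(state)

# Expected decisions: extract_text -> run_ner -> use_gemini -> validate -> complete.
# With the confidence map _use_gemini assigns (0.9/0.9/0.9/0.3/0.8) and the weights in
# _calculate_overall_confidence (0.3/0.2/0.3/0.1/0.1), the overall score is
# 0.3*0.9 + 0.2*0.9 + 0.3*0.9 + 0.1*0.8 + 0.1*0.3 = 0.83, above the 0.6 retry threshold.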
backend/app/api/__init__.py ADDED
File without changes
backend/app/api/ingest.py ADDED
@@ -0,0 +1,1459 @@
+ """
+ Complete ingest pipeline with AUTONOMOUS AGENT INTEGRATION
+ ✅ Step 1: HF agents extract raw text
+ ✅ Step 2: HF NER finds entities
+ ✅ Step 3: Gemini maps to structured invoice fields
+ ✅ NEW: Autonomous agent orchestrates, validates, and self-corrects
+ ✅ UPDATED: Retry logic with progressively longer timeouts + local OCR fallback
+ """
+
+ import os
+ import uuid
+ import json
+ import sqlite3
+ import logging
+ import csv
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Optional, Dict, List, Any
+ from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
+ from pydantic import BaseModel
+ from filelock import FileLock
+ import httpx
+ import re
+
+ import sys
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler(sys.stdout)],
+     force=True  # Override any existing config
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Setup
+ BASE_DIR = Path(__file__).parent.parent.parent.parent
+ STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
+ DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
+ PREDICT_ENDPOINT = 'http://localhost:7860/predict'
+
+ STORAGE_PATH.mkdir(parents=True, exist_ok=True)
+
+ router = APIRouter(prefix="/api", tags=["ingest"])
+
+
+ # ============================================
+ # LOCAL OCR FALLBACK (EasyOCR + Tesseract)
+ # ============================================
+
+ def extract_text_with_easyocr(file_path: Path) -> tuple:
+     """
+     EasyOCR - best free open-source OCR:
+     - Works offline
+     - 80+ languages
+     - GPU/CPU support
+     - Better accuracy than Tesseract for invoices
+     """
+     try:
+         import easyocr
+
+         logger.info("🔧 Using EasyOCR (best free OCR)...")
+
+         # Initialize reader (downloads models on first run).
+         # CPU by default; set gpu=True if CUDA is available.
+         reader = easyocr.Reader(['en'], gpu=False)
+
+         # Read image
+         result = reader.readtext(str(file_path), detail=0, paragraph=True)
+
+         # Join all text
+         text = '\n'.join(result)
+
+         if text and len(text.strip()) >= 10:
+             logger.info(f"✅ EasyOCR extracted {len(text)} characters")
+             return True, text, None
+
+         return False, None, "EasyOCR produced no usable text"
+
+     except ImportError:
+         logger.warning("⚠️ easyocr not installed. Install with: pip install easyocr")
+         return False, None, "easyocr not available"
+     except Exception as e:
+         logger.error(f"❌ EasyOCR failed: {e}")
+         return False, None, str(e)
+
+
+ def extract_text_with_tesseract(file_path: Path) -> tuple:
+     """
+     Tesseract OCR - fallback option.
+     Faster but less accurate than EasyOCR.
+     """
+     try:
+         import pytesseract
+         from PIL import Image
+
+         logger.info("🔧 Using Tesseract OCR as secondary fallback...")
+
+         image = Image.open(file_path)
+         text = pytesseract.image_to_string(image)
+
+         if text and len(text.strip()) >= 10:
+             logger.info(f"✅ Tesseract extracted {len(text)} characters")
+             return True, text, None
+
+         return False, None, "Tesseract produced no usable text"
+
+     except ImportError:
+         logger.warning("⚠️ pytesseract not installed. Install with: pip install pytesseract pillow")
+         return False, None, "pytesseract not available"
+     except Exception as e:
+         logger.error(f"❌ Tesseract failed: {e}")
+         return False, None, str(e)
+
+
+ def extract_text_with_local_ocr(file_path: Path) -> tuple:
+     """
+     Multi-tier local OCR fallback system:
+     1. Try EasyOCR (best accuracy)
+     2. Try Tesseract (faster, less accurate)
+     3. Give up
+     """
+     logger.info("=" * 70)
+     logger.info("🔄 HF extraction failed - trying local OCR fallbacks...")
+     logger.info("=" * 70)
+
+     # Priority 1: EasyOCR (best for invoices)
+     success, text, error = extract_text_with_easyocr(file_path)
+     if success:
+         logger.info("✅ EasyOCR succeeded!")
+         return True, text, None
+     else:
+         logger.warning(f"⚠️ EasyOCR failed: {error}")
+
+     # Priority 2: Tesseract (faster fallback)
+     success, text, error = extract_text_with_tesseract(file_path)
+     if success:
+         logger.info("✅ Tesseract succeeded!")
+         return True, text, None
+     else:
+         logger.warning(f"⚠️ Tesseract failed: {error}")
+
+     # All local OCR failed
+     logger.error("❌ All local OCR methods failed")
+     return False, None, "All local OCR methods failed"
+
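The chain above is one instance of a general tiered-fallback pattern: try each backend in priority order and return the first (success, text, error) triple that succeeds. A generic sketch with a hypothetical helper name, should more OCR engines be added later:

# Hypothetical helper, not part of the module; each backend is a callable
# file_path -> (success, text, error).
def run_with_fallbacks(file_path, backends):
    errors = []
    for name, backend in backends:
        ok, text, err = backend(file_path)
        if ok:
            return True, text, None
        errors.append(f"{name}: {err}")
    return False, None, "; ".join(errors)

# Equivalent to extract_text_with_local_ocr:
# run_with_fallbacks(path, [("easyocr", extract_text_with_easyocr),
#                           ("tesseract", extract_text_with_tesseract)])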
+ # ============================================
+ # STEP 1: HF Agent Text Extraction (UPDATED)
+ # ============================================
+
+ def get_agent_headers():
+     """Get headers with HF token"""
+     token = (
+         os.getenv('HF_TOKEN') or
+         os.getenv('HUGGINGFACE_API_TOKEN') or
+         os.getenv('AGENT_BEARER_TOKEN') or
+         ''
+     )
+     return {'Authorization': f'Bearer {token}'} if token else {}
+
+
+ def get_mime_type(file_path: Path) -> str:
+     """Get MIME type"""
+     ext = file_path.suffix.lower()
+     mime_map = {
+         '.pdf': 'application/pdf',
+         '.jpg': 'image/jpeg',
+         '.jpeg': 'image/jpeg',
+         '.png': 'image/png'
+     }
+     return mime_map.get(ext, 'application/octet-stream')
+
+
+ def call_text_extractor(file_path: Path, max_retries=3):
+     """
+     HF text extraction with retry logic and progressively longer timeouts.
+     Falls back to local OCR if all retries fail.
+     """
+     url = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         # Progressive timeout: 120s, 180s, 240s
+         timeout = base_timeout + (60 * attempt)
+
+         try:
+             logger.info(f"📄 Extracting text from {file_path.name} (attempt {attempt + 1}/{max_retries}, timeout={timeout}s)...")
+
+             filename = file_path.name
+             mime_type = get_mime_type(file_path)
+
+             with open(file_path, 'rb') as f:
+                 files = {'file': (filename, f, mime_type)}
+                 data = {
+                     'filename': filename,
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 headers = get_agent_headers()
+
+                 response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+                 text = result.get('result') or result.get('text') or result.get('extracted_text') or ''
+
+                 if text and len(text.strip()) >= 10:
+                     logger.info(f"✅ Extracted {len(text)} characters")
+                     return True, text, None
+
+                 logger.warning("⚠️ No text extracted from response")
+                 if attempt < max_retries - 1:
+                     continue
+                 return False, None, "No text extracted"
+
+             logger.warning(f"⚠️ HTTP {response.status_code}: {response.text[:200]}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ Timeout after {timeout}s on attempt {attempt + 1}")
+             if attempt < max_retries - 1:
+                 logger.info("🔄 Retrying with longer timeout...")
+                 continue
+         except Exception as e:
+             logger.error(f"❌ Error on attempt {attempt + 1}: {e}")
+             if attempt < max_retries - 1:
+                 logger.info("🔄 Retrying...")
+                 continue
+
+     # All retries failed - try local OCR fallback
+     logger.warning(f"⚠️ All {max_retries} HF extraction attempts failed, trying local OCR fallback...")
+     return extract_text_with_local_ocr(file_path)
+
+
+ def call_table_extractor(file_path: Path, max_retries=2):
+     """
+     HF table extraction with retry logic.
+     Non-critical, so fewer retries.
+     """
+     url = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         timeout = base_timeout + (60 * attempt)
+
+         try:
+             logger.info(f"📊 Extracting tables from {file_path.name} (attempt {attempt + 1}/{max_retries})...")
+
+             filename = file_path.name
+             mime_type = get_mime_type(file_path)
+
+             with open(file_path, 'rb') as f:
+                 files = {'file': (filename, f, mime_type)}
+                 data = {
+                     'filename': filename,
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 headers = get_agent_headers()
+
+                 response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+                 tables = result.get('result') or result.get('tables') or []
+                 logger.info(f"✅ Extracted {len(tables)} tables")
+                 return True, tables, None
+
+             logger.warning(f"⚠️ HTTP {response.status_code}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ Table extraction timeout on attempt {attempt + 1}")
+         except Exception as e:
+             logger.warning(f"⚠️ Table extraction error: {e}")
+
+     # Non-critical - return empty list
+     logger.info("ℹ️ Table extraction failed, continuing without tables")
+     return False, [], "Table extraction failed (non-critical)"
+
+
+ # ============================================
+ # STEP 2: HF NER (Named Entity Recognition)
+ # ============================================
+
+ def call_ner(text: str, file_path: Path = None, max_retries=2) -> tuple:
+     """
+     Extract named entities using HF NER agent with retry logic.
+     """
+     url = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
+     base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
+
+     for attempt in range(max_retries):
+         timeout = base_timeout + (30 * attempt)
+
+         try:
+             logger.info(f"🔍 Running NER to find entities (attempt {attempt + 1}/{max_retries})...")
+
+             headers = get_agent_headers()
+
+             # NER expects multipart/form-data with file OR text
+             if file_path and file_path.exists():
+                 # Send file
+                 filename = file_path.name
+                 mime_type = get_mime_type(file_path)
+
+                 with open(file_path, 'rb') as f:
+                     files = {'file': (filename, f, mime_type)}
+                     data = {
+                         'text': text[:5000],
+                         'filename': filename,
+                         'start_page': 1,
+                         'end_page': 1
+                     }
+                     response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
+             else:
+                 # Send just text as form data
+                 data = {
+                     'text': text[:5000],
+                     'filename': 'document.txt',
+                     'start_page': 1,
+                     'end_page': 1
+                 }
+                 response = httpx.post(url, data=data, headers=headers, timeout=timeout)
+
+             if response.status_code == 200:
+                 result = response.json()
+
+                 # FIX: Handle both dict and string responses
+                 if isinstance(result, str):
+                     try:
+                         result = json.loads(result)
+                     except:
+                         logger.warning(f"⚠️ NER returned unparseable string: {result[:100]}")
+                         if attempt < max_retries - 1:
+                             continue
+                         return False, [], {}, "Invalid response format"
+
+                 # Extract entities
+                 entities = result.get('entities') or result.get('result') or []
+
+                 # Handle case where entities might also be a string
+                 if isinstance(entities, str):
+                     try:
+                         entities = json.loads(entities)
+                     except:
+                         entities = []
+
+                 logger.info(f"✅ Found {len(entities)} entities")
+
+                 # Group entities by type
+                 entity_map = {
+                     'PERSON': [],
+                     'ORG': [],
+                     'DATE': [],
+                     'MONEY': [],
+                     'CARDINAL': []
+                 }
+
+                 for entity in entities:
+                     if not isinstance(entity, dict):
+                         continue
+
+                     ent_type = entity.get('entity_type') or entity.get('label')
+                     ent_text = entity.get('text') or entity.get('word')
+
+                     if ent_type in entity_map and ent_text:
+                         entity_map[ent_type].append(ent_text)
+
+                 logger.info(f"📋 Entity summary: PERSON={len(entity_map['PERSON'])}, ORG={len(entity_map['ORG'])}, DATE={len(entity_map['DATE'])}, MONEY={len(entity_map['MONEY'])}")
+
+                 return True, entities, entity_map, None
+
+             logger.warning(f"⚠️ NER HTTP {response.status_code}")
+
+         except httpx.TimeoutException:
+             logger.warning(f"⚠️ NER timeout on attempt {attempt + 1}")
+         except Exception as e:
+             logger.error(f"❌ NER error on attempt {attempt + 1}: {e}")
+
+     # NER failed - return empty (non-critical)
+     logger.warning("⚠️ NER failed after retries, continuing without entities")
+     return False, [], {}, "NER failed (non-critical)"
+
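For reference, the entity_map grouping built above is a plain dict from entity label to a list of surface strings; a hypothetical example for a one-page invoice (all values illustrative):

entity_map = {
    'PERSON':   ['John Smith'],
    'ORG':      ['Acme Corp', 'Globex Ltd'],
    'DATE':     ['03/01/2024'],
    'MONEY':    ['$1,250.50', '$125.05'],
    'CARDINAL': ['12345'],
}
# map_with_gemini receives this grouping alongside the raw text, and map_with_regex
# below prefers the same ORG/DATE/MONEY lists over its regex patterns.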
+ # ============================================
+ # STEP 3: Gemini Intelligent Mapping
+ # ============================================
+
+ def map_with_gemini(text: str, entities: List, entity_map: Dict, tables: List):
+     """Use Gemini to intelligently map extracted data to invoice fields"""
+     try:
+         import google.generativeai as genai
+
+         api_key = os.getenv('GEMINI_API_KEY')
+         if not api_key:
+             logger.warning("⚠️ No Gemini API key configured")
+             return False, None, "No Gemini API key"
+
+         logger.info("🧠 Using Gemini for intelligent field mapping...")
+
+         genai.configure(api_key=api_key)
+         model = genai.GenerativeModel('models/gemini-2.5-flash')
+
+         # Build context for Gemini
+         context = f"""
+ EXTRACTED TEXT:
+ {text[:3000]}
+
+ NAMED ENTITIES FOUND:
+ - Organizations: {entity_map.get('ORG', [])}
+ - People: {entity_map.get('PERSON', [])}
+ - Dates: {entity_map.get('DATE', [])}
+ - Money amounts: {entity_map.get('MONEY', [])}
+ - Numbers: {entity_map.get('CARDINAL', [])}
+
+ TABLES:
+ {json.dumps(tables[:2], indent=2) if tables else 'None'}
+ """
+
+         prompt = f"""You are an expert at analyzing invoice data. Given the extracted text and entities below, map them to invoice fields.
+
+ {context}
+
+ Analyze the above data and return ONLY a valid JSON object with these exact fields:
+
+ {{
+     "customer_name": "the client/customer company name (check ORG entities first)",
+     "invoice_number": "the invoice number (check CARDINAL entities)",
+     "date": "invoice date in YYYY-MM-DD format (check DATE entities)",
+     "total_amount": numeric total amount only (check MONEY entities, no currency symbol),
+     "payment_terms": "payment terms like NET30, NET60, or NAH4 if not found",
+     "reasoning": "brief explanation of how you identified each field"
+ }}
+
+ Rules:
+ 1. Prefer entities over raw text when available
+ 2. Customer name is usually the first ORG after "Bill To" or "Client"
+ 3. Total amount is usually the largest MONEY value
+ 4. Date should be in YYYY-MM-DD format
+ 5. If uncertain, use these defaults: customer_name="UNKNOWN", date="2024-01-01", total_amount=0.0, payment_terms="NAH4"
+
+ Return ONLY the JSON object, no markdown, no explanation outside the JSON."""
+
+         response = model.generate_content(prompt)
+         text_response = response.text.strip()
+
+         # Remove markdown if present
+         text_response = text_response.replace('```json', '').replace('```', '').strip()
+
+         result = json.loads(text_response)
+
+         logger.info(f"✅ Gemini mapped: Customer={result.get('customer_name')}, Amount=${result.get('total_amount')}")
+         logger.info(f"💡 Reasoning: {result.get('reasoning', 'N/A')[:100]}")
+
+         return True, result, None
+
+     except json.JSONDecodeError as e:
+         logger.error(f"❌ Gemini returned invalid JSON: {e}")
+         logger.error(f"Response: {text_response[:500]}")
+         return False, None, f"Invalid JSON: {e}"
+     except Exception as e:
+         logger.error(f"❌ Gemini mapping failed: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         return False, None, str(e)
+
+
+ # ============================================
+ # Fallback: Regex Mapping
+ # ============================================
+
+ def map_with_regex(text: str, entities: List) -> tuple:
+     """Fallback regex-based field extraction"""
+     logger.info("🔤 Using regex fallback for field mapping...")
+
+     fields = {}
+     confidence = {}
+
+     # CUSTOMER NAME - try to use ORG entities first
+     org_entities = [e.get('text') or e.get('word') for e in entities
+                     if (e.get('entity_type') or e.get('label')) == 'ORG']
+
+     if org_entities:
+         fields['cust_number'] = org_entities[0][:20]
+         confidence['cust_number'] = 0.8
+     else:
+         # Regex fallback
+         client_patterns = [
+             r'(?:Client|Bill\s+To|Customer)[:\s]+(.*?)(?:\n|Tax|IBAN)',
+             r'(?:customer|client)[\s:]+([A-Za-z][A-Za-z\s,&-]+?)(?:\n|$)',
+         ]
+
+         for pattern in client_patterns:
+             match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+             if match:
+                 client = match.group(1).strip()
+                 words = [w.strip() for w in client.replace(',', ' ').split() if len(w.strip()) > 2]
+                 if words:
+                     fields['cust_number'] = words[0][:20]
+                     confidence['cust_number'] = 0.6
+                     break
+
+     if 'cust_number' not in fields:
+         fields['cust_number'] = 'UNKNOWN'
+         confidence['cust_number'] = 0.1
+
+     # DATE - try DATE entities first
+     date_entities = [e.get('text') or e.get('word') for e in entities
+                      if (e.get('entity_type') or e.get('label')) == 'DATE']
+
+     if date_entities:
+         date_str = date_entities[0]
+         for fmt in ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d', '%m-%d-%Y']:
+             try:
+                 dt = datetime.strptime(date_str, fmt)
+                 fields['posting_date'] = dt.strftime('%Y-%m-%d')
+                 confidence['posting_date'] = 0.8
+                 break
+             except:
+                 continue
+
+     if 'posting_date' not in fields:
+         date_patterns = [
+             r'(?:Date\s+of\s+issue|Invoice\s+Date|Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
+         ]
+
+         for pattern in date_patterns:
+             match = re.search(pattern, text, re.IGNORECASE)
+             if match:
+                 date_str = match.group(1)
+                 for fmt in ['%m/%d/%Y', '%d/%m/%Y']:
+                     try:
+                         dt = datetime.strptime(date_str, fmt)
+                         fields['posting_date'] = dt.strftime('%Y-%m-%d')
+                         confidence['posting_date'] = 0.7
+                         break
+                     except:
+                         continue
+                 if 'posting_date' in fields:
+                     break
+
+     if 'posting_date' not in fields:
+         fields['posting_date'] = datetime.now().strftime('%Y-%m-%d')
+         confidence['posting_date'] = 0.1
+
+     # AMOUNT - try MONEY entities first
+     money_entities = [e.get('text') or e.get('word') for e in entities
+                       if (e.get('entity_type') or e.get('label')) == 'MONEY']
+
+     if money_entities:
+         amounts = []
+         for money_str in money_entities:
+             try:
+                 # Remove currency symbols and parse
+                 amt_str = re.sub(r'[^\d.]', '', money_str)
+                 amt = float(amt_str)
+                 if amt > 10:
+                     amounts.append(amt)
+             except:
+                 pass
+
+         if amounts:
+             fields['total_open_amount'] = max(amounts)
+             confidence['total_open_amount'] = 0.8
+             logger.info(f"✅ Found amount from MONEY entity: ${fields['total_open_amount']}")
+
+     if 'total_open_amount' not in fields:
+         # Regex fallback
+         pattern = r'\$\s*([0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2})'
+         amounts = []
+         for match in re.finditer(pattern, text):
+             try:
+                 amt = float(match.group(1).replace(',', ''))
+                 if amt > 50:
+                     amounts.append(amt)
+             except:
+                 pass
+
+         if amounts:
+             fields['total_open_amount'] = max(amounts)
+             confidence['total_open_amount'] = 0.6
+         else:
+             fields['total_open_amount'] = 0.0
+             confidence['total_open_amount'] = 0.0
+             logger.warning("⚠️ No amount found!")
+
+     # PAYMENT TERMS
+     terms_match = re.search(r'(NET\s?\d{1,2}|N\d{2}|NAH\d)', text, re.IGNORECASE)
+     fields['cust_payment_terms'] = terms_match.group(1).upper() if terms_match else 'NAH4'
+     confidence['cust_payment_terms'] = 0.7 if terms_match else 0.2
+
+     # BUSINESS CODE
+     fields['business_code'] = 'U001'
+     confidence['business_code'] = 0.2
+
+     return fields, confidence
+
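A quick toy run of the fallback mapper above (no NER entities are passed, so the pure regex paths fire; expected values follow from the patterns shown):

sample = """INVOICE
Date of issue: 03/01/2024
Bill To: Acme Corporation
Subtotal: $1,125.45
Total: $1,250.50
Terms: NET30
"""
fields, confidence = map_with_regex(sample, entities=[])
# fields should come out roughly as:
# {'cust_number': 'Acme', 'posting_date': '2024-03-01',
#  'total_open_amount': 1250.5, 'cust_payment_terms': 'NET30',
#  'business_code': 'U001'}
# with confidence scores of 0.6 / 0.7 / 0.6 / 0.7 / 0.2 respectively.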
604
+ # ============================================
605
+ # Database Functions
606
+ # ============================================
607
+
608
+ def update_job_status(job_id: str, status: str, error_text: str = None):
609
+ """Update job status"""
610
+ with FileLock(str(LOCK_PATH), timeout=10):
611
+ conn = sqlite3.connect(str(DB_PATH))
612
+ cursor = conn.cursor()
613
+ cursor.execute("""
614
+ UPDATE ingest_jobs
615
+ SET status = ?, error_text = ?, updated_at = CURRENT_TIMESTAMP
616
+ WHERE job_id = ?
617
+ """, (status, error_text, job_id))
618
+ conn.commit()
619
+ conn.close()
620
+
621
+
622
+ def save_extraction(doc_id: str, raw_text: str, tables: list, entities: list, classification: dict, summary: str = None):
623
+ """Save extraction results"""
624
+ with FileLock(str(LOCK_PATH), timeout=10):
625
+ conn = sqlite3.connect(str(DB_PATH))
626
+ cursor = conn.cursor()
627
+ cursor.execute("""
628
+ INSERT OR REPLACE INTO extractions (
629
+ doc_id, raw_text, tables_json, entities_json,
630
+ classification_json, summary_text
631
+ ) VALUES (?, ?, ?, ?, ?, ?)
632
+ """, (
633
+ doc_id,
634
+ raw_text,
635
+ json.dumps(tables) if tables else None,
636
+ json.dumps(entities) if entities else None,
637
+ json.dumps(classification) if classification else None,
638
+ summary
639
+ ))
640
+ conn.commit()
641
+ conn.close()
642
+
643
+
644
+ def save_invoice_fields(doc_id: str, fields: Dict, confidence_map: Dict):
645
+ """Save invoice fields"""
646
+ with FileLock(str(LOCK_PATH), timeout=10):
647
+ conn = sqlite3.connect(str(DB_PATH))
648
+ cursor = conn.cursor()
649
+ cursor.execute("""
650
+ INSERT INTO invoice_fields (
651
+ doc_id, cust_number, posting_date, total_open_amount,
652
+ business_code, cust_payment_terms, confidence_map
653
+ ) VALUES (?, ?, ?, ?, ?, ?, ?)
654
+ """, (
655
+ doc_id,
656
+ fields.get('cust_number'),
657
+ fields.get('posting_date'),
658
+ fields.get('total_open_amount'),
659
+ fields.get('business_code'),
660
+ fields.get('cust_payment_terms'),
661
+ json.dumps(confidence_map)
662
+ ))
663
+ conn.commit()
664
+ conn.close()
665
+
666
+
667
+ # ============================================
668
+ # AGENT MODE FLAG (Environment Variable)
669
+ # ============================================
670
+
671
+ USE_AGENT_MODE = os.getenv('USE_AGENT_MODE', 'true').lower() == 'true'
672
+
673
+
674
+ # ============================================
675
+ # Main Processing Pipeline
676
+ # ============================================
677
+
+ def process_document_legacy(job_id: str, doc_id: str, file_path: Path):
+     """
+     LEGACY PIPELINE (Original Implementation):
+     1. HF agents extract text + tables
+     2. HF NER finds entities
+     3. Gemini maps entities to invoice fields
+     """
+     logger.info("=" * 70)
+     logger.info(f"🚀 Starting LEGACY pipeline for {file_path.name}")
+     logger.info("=" * 70)
+
+     try:
+         update_job_status(job_id, 'processing')
+
+         # STEP 1: Extract text with HF agents
+         logger.info("STEP 1: HF TEXT + TABLE EXTRACTION")
+         logger.info("-" * 70)
+
+         success, raw_text, error = call_text_extractor(file_path)
+         if not success or not raw_text:
+             update_job_status(job_id, 'failed', f"Text extraction failed: {error}")
+             return
+
+         # Extract tables (optional, won't fail the job if it doesn't work)
+         _, tables, _ = call_table_extractor(file_path)
+
+         # STEP 2: NER to find entities
+         logger.info("-" * 70)
+         logger.info("STEP 2: NER - NAMED ENTITY RECOGNITION")
+         logger.info("-" * 70)
+
+         ner_success, entities, entity_map, ner_error = call_ner(raw_text, file_path)
+
+         if not ner_success:
+             logger.warning(f"⚠️ NER failed: {ner_error}, continuing without entities")
+             entities = []
+             entity_map = {}
+
+         # STEP 3: Gemini intelligent mapping
+         logger.info("-" * 70)
+         logger.info("STEP 3: GEMINI INTELLIGENT MAPPING")
+         logger.info("-" * 70)
+
+         gemini_success, gemini_result, gemini_error = map_with_gemini(
+             raw_text, entities, entity_map, tables
+         )
+
+         if gemini_success and gemini_result:
+             # Use Gemini's mapping ('or' guards against explicit None values)
+             fields = {
+                 'cust_number': (gemini_result.get('customer_name') or 'UNKNOWN')[:20],
+                 'posting_date': gemini_result.get('date') or datetime.now().strftime('%Y-%m-%d'),
+                 'total_open_amount': float(gemini_result.get('total_amount') or 0.0),
+                 'business_code': 'U001',
+                 'cust_payment_terms': (gemini_result.get('payment_terms') or 'NAH4')[:10]
+             }
+
+             confidence_map = {
+                 'cust_number': 0.95,
+                 'posting_date': 0.95,
+                 'total_open_amount': 0.95,
+                 'business_code': 0.2,
+                 'cust_payment_terms': 0.8
+             }
+
+             method = 'hf_ner_gemini'
+
+         else:
+             # Fallback to regex mapping
+             logger.warning(f"⚠️ Gemini mapping failed: {gemini_error}")
+             logger.info("-" * 70)
+             logger.info("FALLBACK: REGEX MAPPING")
+             logger.info("-" * 70)
+
+             fields, confidence_map = map_with_regex(raw_text, entities)
+             method = 'hf_ner_regex'
+
+         # Save results
+         save_extraction(
+             doc_id, raw_text, tables, entities,
+             {'method': method, 'entity_count': len(entities)},
+             None
+         )
+         save_invoice_fields(doc_id, fields, confidence_map)
+
+         logger.info("=" * 70)
+         logger.info(f"✅ EXTRACTION COMPLETE - Method: {method}")
+         logger.info(f"📋 Fields: {fields}")
+         logger.info("=" * 70)
+
+         # Prediction call is disabled in the legacy pipeline
+         # logger.info("🔮 Calling payment prediction...")
+         # try:
+         #     pred_response = httpx.post(PREDICT_ENDPOINT, json=fields, timeout=30)
+         #     if pred_response.status_code == 200:
+         #         pred_result = pred_response.json()
+         #         logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
+         # except Exception as e:
+         #     logger.error(f"⚠️ Prediction failed: {e}")
+
+         update_job_status(job_id, 'completed')
+         logger.info(f"🎉 Job {job_id} completed successfully")
+
+     except Exception as e:
+         logger.error(f"❌ Job {job_id} failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
+
+
+ def process_document_agent(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
+     """
+     NEW AUTONOMOUS AGENT PIPELINE with optional output-filtering wrapper
+     """
+     try:
+         # Clean up user_message (treat sentinel strings from form data as "no message")
+         if user_message in [None, 'None', '', 'null', 'undefined']:
+             user_message = None
+         else:
+             user_message = str(user_message).strip()
+             if not user_message:
+                 user_message = None
+
+         logger.info("=" * 70)
+         logger.info(f"🔍 AGENT - Processing with message: '{user_message}'")
+         logger.info(f"🔍 Type: {type(user_message)}")
+         logger.info(f"🔍 Is None: {user_message is None}")
+         logger.info("=" * 70)
+
+         from backend.app.agent.agent_orchestrator import (
+             InvoiceAgent, AgentState, create_agent
+         )
+
+         logger.info("=" * 70)
+         logger.info(f"🤖 AUTONOMOUS AGENT MODE for {file_path.name}")
+         logger.info("=" * 70)
+
+         update_job_status(job_id, 'processing')
+
+         # Create agent
+         agent = create_agent(
+             call_text_extractor,
+             call_table_extractor,
+             call_ner,
+             map_with_gemini
+         )
+
+         # Initialize state
+         state = AgentState(doc_id=doc_id, file_path=file_path)
+
+         # Let the agent autonomously decide and execute
+         result_state = agent.process(state)
+
+         # ============================================
+         # WRAPPER INTEGRATION
+         # ============================================
+
+         full_extraction = result_state.fields
+         final_result = full_extraction
+         wrapper_used = False
+
+         # Apply the Gemini output filter only when a user message was provided
+         if user_message is not None and len(user_message) > 0:
+             logger.info("=" * 70)
+             logger.info(f"💬 USER MESSAGE DETECTED: '{user_message}'")
+             logger.info("🎯 Activating Gemini wrapper to filter output...")
+             logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
+             logger.info("=" * 70)
+
+             try:
+                 from backend.app.wrappers.gemini_output_filter import GeminiOutputFilter
+
+                 wrapper = GeminiOutputFilter()
+                 final_result = wrapper.filter_output(user_message, full_extraction)
+                 wrapper_used = True
+
+                 logger.info("=" * 70)
+                 logger.info("✅ WRAPPER SUCCESS!")
+                 logger.info(f"📤 Original fields: {list(full_extraction.keys())}")
+                 logger.info(f"🎯 Filtered fields: {list(final_result.keys())}")
+                 logger.info(f"📋 Filtered result: {json.dumps(final_result, indent=2)}")
+                 logger.info("=" * 70)
+
+             except Exception as wrapper_error:
+                 logger.error("=" * 70)
+                 logger.error(f"❌ WRAPPER FAILED: {wrapper_error}")
+                 logger.error("=" * 70)
+                 import traceback
+                 logger.error(traceback.format_exc())
+                 logger.warning("📦 Falling back to full extraction")
+                 final_result = full_extraction
+                 wrapper_used = False
+         else:
+             logger.info("=" * 70)
+             logger.info("ℹ️ No user message provided - returning full extraction")
+             logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
+             logger.info("=" * 70)
+
+         # ============================================
+         # Save results
+         # ============================================
+
+         if result_state.fields:
+             # Determine method
+             if 'use_gemini' in result_state.history:
+                 method = 'autonomous_agent_gemini'
+             elif 'use_regex' in result_state.history:
+                 method = 'autonomous_agent_regex'
+             else:
+                 method = 'autonomous_agent'
+
+             if wrapper_used:
+                 method += '_with_wrapper'
+
+             save_extraction(
+                 doc_id,
+                 result_state.raw_text or '',
+                 result_state.tables or [],
+                 result_state.entities or [],
+                 {
+                     'method': method,
+                     'attempts': result_state.attempts,
+                     'actions': result_state.history,
+                     'confidence': agent._calculate_overall_confidence(result_state),
+                     'errors': result_state.errors,
+                     'user_message': user_message,
+                     'wrapper_used': wrapper_used,
+                     'full_extraction_keys': list(full_extraction.keys()) if full_extraction else [],
+                     'filtered_keys': list(final_result.keys()) if wrapper_used else None
+                 },
+                 None
+             )
+
+             # Save the (possibly filtered) result
+             save_invoice_fields(
+                 doc_id,
+                 final_result,
+                 result_state.confidence_map or {}
+             )
+
+             # Call prediction
+             logger.info("🔮 Calling payment prediction...")
+             try:
+                 pred_response = httpx.post(PREDICT_ENDPOINT, json=final_result, timeout=30)
+
+                 if pred_response.status_code == 200:
+                     pred_result = pred_response.json()
+                     logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
+             except Exception as e:
+                 logger.error(f"⚠️ Prediction failed: {e}")
+
+             # Check status
+             from backend.app.agent.agent_orchestrator import AgentDecision
+             if AgentDecision.HUMAN_REVIEW.value in result_state.history:
+                 update_job_status(job_id, 'needs_review')
+                 logger.info("👤 Agent requesting human review")
+             else:
+                 update_job_status(job_id, 'completed')
+                 logger.info(f"✅ Agent completed with confidence: {agent._calculate_overall_confidence(result_state):.2f}")
+         else:
+             update_job_status(job_id, 'failed', 'Agent could not extract fields')
+             logger.error("❌ Agent failed to extract any fields")
+
+     except ImportError as e:
+         logger.error(f"❌ Agent module not found: {e}")
+         logger.info("⚠️ Falling back to legacy pipeline...")
+         process_document_legacy(job_id, doc_id, file_path)
+     except Exception as e:
+         logger.error(f"❌ Agent failed: {e}")
+         import traceback
+         traceback.print_exc()
+         update_job_status(job_id, 'failed', str(e))
+
+ def process_document(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
+     """
+     Main entry point - routes to the agent or legacy pipeline.
+     """
+     # Clean up user_message
+     if user_message in [None, 'None', '', 'null', 'undefined']:
+         user_message = None
+     else:
+         user_message = str(user_message).strip()
+         if not user_message:
+             user_message = None
+
+     logger.info("=" * 70)
+     logger.info(f"🔍 PROCESS_DOCUMENT - Cleaned user_message: '{user_message}'")
+     logger.info(f"🔍 Type: {type(user_message)}")
+     logger.info(f"🔍 Is None: {user_message is None}")
+     logger.info("=" * 70)
+
+     if USE_AGENT_MODE:
+         logger.info("🤖 Using AUTONOMOUS AGENT mode")
+         process_document_agent(job_id, doc_id, file_path, user_message=user_message)
+     else:
+         logger.info("📋 Using LEGACY pipeline mode")
+         process_document_legacy(job_id, doc_id, file_path)
+
+
+ # ============================================
+ # API Endpoints
+ # ============================================
+
+ from typing import Any, List  # ensure List/Any are available for the batch models below
+
+
+ class IngestResponse(BaseModel):
+     job_id: str
+     doc_id: str
+     filename: str
+     status: str
+     message: str
+
+
+ class JobStatusResponse(BaseModel):
+     job_id: str
+     doc_id: str
+     filename: str
+     status: str
+     error_text: Optional[str] = None
+     created_at: str
+     updated_at: str
+     extraction: Optional[Dict] = None
+     invoice_fields: Optional[Dict] = None
+
+
+ class BatchIngestResponse(BaseModel):
+     batch_id: str
+     total_files: int
+     jobs: List[Dict[str, str]]
+     message: str
+
+
+ class BatchStatusResponse(BaseModel):
+     batch_id: str
+     total_files: int
+     completed: int
+     processing: int
+     failed: int
+     queued: int
+     jobs: List[Dict[str, Any]]
+
+ @router.post("/ingest", response_model=IngestResponse)
+ async def ingest_document(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(...),
+     message: str = Form(None)  # Form(None) rather than Optional[str] = None so multipart forms bind correctly
+ ):
+     """Upload a single document; an optional message narrows the returned fields."""
+
+     # Clean message parameter
+     cleaned_message = None
+     if message and message not in ['None', 'null', '', 'undefined']:
+         cleaned_message = message.strip()
+         if not cleaned_message:
+             cleaned_message = None
+
+     logger.info("=" * 70)
+     logger.info(f"📨 API ENDPOINT - Raw message: '{message}'")
+     logger.info(f"✨ Cleaned message: '{cleaned_message}'")
+     logger.info(f"🔍 Message type: {type(cleaned_message)}")
+     logger.info(f"❓ Is None: {cleaned_message is None}")
+     logger.info("=" * 70)
+
+     try:
+         allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
+         if file.content_type not in allowed_types:
+             raise HTTPException(400, f"Invalid file type: {file.content_type}")
+
+         job_id = f"job_{uuid.uuid4().hex[:12]}"
+         doc_id = f"doc_{uuid.uuid4().hex[:12]}"
+         file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
+
+         stored_filename = f"{doc_id}.{file_ext}"
+         file_path = STORAGE_PATH / stored_filename
+
+         content = await file.read()
+         with open(file_path, 'wb') as f:
+             f.write(content)
+
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             cursor.execute("""
+                 INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
+                 VALUES (?, ?, ?, 'queued')
+             """, (job_id, doc_id, file.filename))
+
+             cursor.execute("""
+                 INSERT INTO documents (doc_id, job_id, path, filename, content_type)
+                 VALUES (?, ?, ?, ?, ?)
+             """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
+
+             conn.commit()
+             conn.close()
+
+         # Start processing with the cleaned message
+         background_tasks.add_task(
+             process_document,
+             job_id,
+             doc_id,
+             file_path,
+             user_message=cleaned_message
+         )
+
+         logger.info(f"🚀 Background task started with message: '{cleaned_message}'")
+
+         mode = "autonomous agent"
+         if cleaned_message:
+             mode += " with intelligent filtering"
+             logger.info(f"🎯 User wants: '{cleaned_message}'")
+
+         return IngestResponse(
+             job_id=job_id,
+             doc_id=doc_id,
+             filename=file.filename,
+             status='queued',
+             message=f'Document uploaded. Processing with {mode}.'
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Ingest endpoint error: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/{job_id}", response_model=JobStatusResponse)
+ def get_ingest_status(job_id: str):
+     """Get job status with agent decision history (if applicable)"""
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             cursor.execute("SELECT * FROM ingest_jobs WHERE job_id = ?", (job_id,))
+             job = cursor.fetchone()
+             if not job:
+                 conn.close()
+                 raise HTTPException(404, "Job not found")
+
+             job_data = dict(job)
+             doc_id = job_data['doc_id']
+
+             if job_data['status'] in ['completed', 'needs_review']:
+                 cursor.execute("SELECT * FROM extractions WHERE doc_id = ?", (doc_id,))
+                 extraction = cursor.fetchone()
+                 if extraction:
+                     ext_dict = dict(extraction)
+                     if ext_dict.get('raw_text'):
+                         ext_dict['raw_text'] = ext_dict['raw_text'][:500] + "..."
+                     job_data['extraction'] = ext_dict
+
+                 cursor.execute("SELECT * FROM invoice_fields WHERE doc_id = ?", (doc_id,))
+                 invoice = cursor.fetchone()
+                 if invoice:
+                     inv_dict = dict(invoice)
+                     if inv_dict.get('confidence_map'):
+                         inv_dict['confidence_map'] = json.loads(inv_dict['confidence_map'])
+                     job_data['invoice_fields'] = inv_dict
+
+             conn.close()
+             return JobStatusResponse(**job_data)
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Job status error: {e}")
+         raise HTTPException(500, str(e))
+
+
+ @router.post("/ingest/batch", response_model=BatchIngestResponse)
+ async def ingest_batch_documents(
+     background_tasks: BackgroundTasks,
+     files: List[UploadFile] = File(...),
+     message: str = Form(None)
+ ):
+     """
+     Upload multiple documents (up to 50 per batch) for batch processing.
+
+     Examples:
+     1. Batch upload without filtering:
+        curl -F "files=@invoice1.jpg" -F "files=@invoice2.pdf" -F "files=@invoice3.png" \
+             http://localhost:7860/api/ingest/batch
+
+     2. Batch upload with the same extraction rule for all files:
+        curl -F "files=@invoice1.jpg" -F "files=@invoice2.jpg" \
+             -F "message=extract only total and date" \
+             http://localhost:7860/api/ingest/batch
+     """
+     # Validate batch size
+     if len(files) > 50:
+         raise HTTPException(400, "Maximum 50 files per batch")
+
+     if len(files) == 0:
+         raise HTTPException(400, "No files provided")
+
+     # Clean message
+     cleaned_message = None
+     if message and message not in ['None', 'null', '', 'undefined']:
+         cleaned_message = message.strip()
+         if not cleaned_message:
+             cleaned_message = None
+
+     batch_id = f"batch_{uuid.uuid4().hex[:12]}"
+     jobs = []
+
+     logger.info("=" * 70)
+     logger.info(f"📦 BATCH UPLOAD - {len(files)} files")
+     logger.info(f"📦 Batch ID: {batch_id}")
+     logger.info(f"📦 Message: '{cleaned_message}'")
+     logger.info("=" * 70)
+
+     try:
+         allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
+
+         for idx, file in enumerate(files):
+             # Validate each file
+             if file.content_type not in allowed_types:
+                 logger.warning(f"⚠️ Skipping {file.filename} - invalid type: {file.content_type}")
+                 continue
+
+             # Create a job for this file
+             job_id = f"job_{uuid.uuid4().hex[:12]}"
+             doc_id = f"doc_{uuid.uuid4().hex[:12]}"
+             file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
+
+             stored_filename = f"{doc_id}.{file_ext}"
+             file_path = STORAGE_PATH / stored_filename
+
+             # Save file
+             content = await file.read()
+             with open(file_path, 'wb') as f:
+                 f.write(content)
+
+             # Save to database
+             with FileLock(str(LOCK_PATH), timeout=10):
+                 conn = sqlite3.connect(str(DB_PATH))
+                 cursor = conn.cursor()
+
+                 cursor.execute("""
+                     INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
+                     VALUES (?, ?, ?, 'queued')
+                 """, (job_id, doc_id, file.filename))
+
+                 cursor.execute("""
+                     INSERT INTO documents (doc_id, job_id, path, filename, content_type)
+                     VALUES (?, ?, ?, ?, ?)
+                 """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
+
+                 conn.commit()
+                 conn.close()
+
+             # Queue processing
+             background_tasks.add_task(
+                 process_document,
+                 job_id,
+                 doc_id,
+                 file_path,
+                 user_message=cleaned_message
+             )
+
+             jobs.append({
+                 'job_id': job_id,
+                 'doc_id': doc_id,
+                 'filename': file.filename,
+                 'status': 'queued'
+             })
+
+             logger.info(f"✅ [{idx+1}/{len(files)}] Queued: {file.filename}")
+
+         if not jobs:
+             raise HTTPException(400, "No valid files to process")
+
+         # Save batch metadata
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             cursor = conn.cursor()
+
+             # Create batch_jobs table if it doesn't exist
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS batch_jobs (
+                     batch_id TEXT PRIMARY KEY,
+                     total_files INTEGER,
+                     message TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                 )
+             """)
+
+             cursor.execute("""
+                 INSERT INTO batch_jobs (batch_id, total_files, message)
+                 VALUES (?, ?, ?)
+             """, (batch_id, len(jobs), cleaned_message))
+
+             # Link jobs to batch
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS batch_job_mapping (
+                     batch_id TEXT,
+                     job_id TEXT,
+                     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+                 )
+             """)
+
+             for job in jobs:
+                 cursor.execute("""
+                     INSERT INTO batch_job_mapping (batch_id, job_id)
+                     VALUES (?, ?)
+                 """, (batch_id, job['job_id']))
+
+             conn.commit()
+             conn.close()
+
+         mode = "autonomous agent"
+         if cleaned_message:
+             mode += " with intelligent filtering"
+
+         logger.info(f"🚀 Batch {batch_id} processing started with {len(jobs)} files")
+
+         return BatchIngestResponse(
+             batch_id=batch_id,
+             total_files=len(jobs),
+             jobs=jobs,
+             message=f'Batch of {len(jobs)} documents uploaded. Processing with {mode}.'
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Batch ingest error: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/batch/{batch_id}", response_model=BatchStatusResponse)
+ def get_batch_status(batch_id: str):
+     """
+     Get the status of all jobs in a batch.
+
+     Example:
+     curl http://localhost:7860/api/ingest/batch/batch_abc123
+     """
+     try:
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             # Get batch info
+             cursor.execute("SELECT * FROM batch_jobs WHERE batch_id = ?", (batch_id,))
+             batch = cursor.fetchone()
+             if not batch:
+                 conn.close()
+                 raise HTTPException(404, "Batch not found")
+
+             # Get all jobs in the batch
+             cursor.execute("""
+                 SELECT j.* FROM ingest_jobs j
+                 JOIN batch_job_mapping bm ON j.job_id = bm.job_id
+                 WHERE bm.batch_id = ?
+             """, (batch_id,))
+
+             jobs = cursor.fetchall()
+             conn.close()
+
+             # Count statuses
+             status_counts = {
+                 'completed': 0,
+                 'processing': 0,
+                 'failed': 0,
+                 'queued': 0,
+                 'needs_review': 0
+             }
+
+             jobs_list = []
+             for job in jobs:
+                 job_dict = dict(job)
+                 status = job_dict['status']
+                 status_counts[status] = status_counts.get(status, 0) + 1
+
+                 jobs_list.append({
+                     'job_id': job_dict['job_id'],
+                     'doc_id': job_dict['doc_id'],
+                     'filename': job_dict['filename'],
+                     'status': status,
+                     'error_text': job_dict.get('error_text'),
+                     'created_at': job_dict['created_at'],
+                     'updated_at': job_dict['updated_at']
+                 })
+
+             return BatchStatusResponse(
+                 batch_id=batch_id,
+                 total_files=len(jobs),
+                 completed=status_counts['completed'],
+                 processing=status_counts['processing'],
+                 failed=status_counts['failed'],
+                 queued=status_counts['queued'],
+                 jobs=jobs_list
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"❌ Batch status error: {e}")
+         raise HTTPException(500, str(e))
+
+
+ @router.get("/ingest/batch/{batch_id}/download")
+ def download_batch_results(batch_id: str):
+     """
+     Download all extracted data from a batch as CSV.
+
+     Example:
+     curl http://localhost:7860/api/ingest/batch/batch_abc123/download -o results.csv
+     """
+     try:
+         import csv
+         from io import StringIO
+         from fastapi.responses import StreamingResponse
+
+         with FileLock(str(LOCK_PATH), timeout=10):
+             conn = sqlite3.connect(str(DB_PATH))
+             conn.row_factory = sqlite3.Row
+             cursor = conn.cursor()
+
+             # Get all completed jobs in the batch
+             cursor.execute("""
+                 SELECT j.*, f.* FROM ingest_jobs j
+                 JOIN batch_job_mapping bm ON j.job_id = bm.job_id
+                 LEFT JOIN invoice_fields f ON j.doc_id = f.doc_id
+                 WHERE bm.batch_id = ? AND j.status = 'completed'
+             """, (batch_id,))
+
+             results = cursor.fetchall()
+             conn.close()
+
+         if not results:
+             raise HTTPException(404, "No completed jobs found in batch")
+
+         # Create CSV
+         output = StringIO()
+         writer = csv.writer(output)
+
+         # Header
+         writer.writerow([
+             'filename', 'doc_id', 'customer', 'date', 'amount',
+             'payment_terms', 'business_code', 'status'
+         ])
+
+         # Data rows
+         for row in results:
+             writer.writerow([
+                 row['filename'],
+                 row['doc_id'],
+                 row['cust_number'] or 'N/A',
+                 row['posting_date'] or 'N/A',
+                 row['total_open_amount'] or 0.0,
+                 row['cust_payment_terms'] or 'N/A',
+                 row['business_code'] or 'N/A',
+                 row['status']
+             ])
+
+         output.seek(0)
+
+         return StreamingResponse(
+             iter([output.getvalue()]),
+             media_type="text/csv",
+             headers={
+                 "Content-Disposition": f"attachment; filename=batch_{batch_id}_results.csv"
+             }
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(500, str(e))
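Usage sketch (illustrative, not part of the diff): a client can exercise the single-document endpoint above and poll the job until the background task settles. It assumes the service is reachable on port 7860 (the port the Dockerfile exposes) and that invoice.pdf is a placeholder file; httpx is already a backend dependency.

import time
import httpx

BASE = "http://localhost:7860/api"

# Upload one document with an optional filtering message.
with open("invoice.pdf", "rb") as f:
    resp = httpx.post(
        f"{BASE}/ingest",
        files={"file": ("invoice.pdf", f, "application/pdf")},
        data={"message": "extract only total and date"},
    )
resp.raise_for_status()
job_id = resp.json()["job_id"]

# Poll until the background task reaches a terminal status.
while True:
    status = httpx.get(f"{BASE}/ingest/{job_id}").json()
    if status["status"] in ("completed", "needs_review", "failed"):
        break
    time.sleep(2)

print(status.get("invoice_fields"))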
backend/app/utils/__init__.py ADDED
File without changes
backend/app/utils/agent_client.py ADDED
@@ -0,0 +1,153 @@
+ """
+ HF Agent client with proper environment variable support.
+ """
+
+ import httpx
+ import os
+ import time
+ import logging
+ from typing import Dict, Optional, Tuple
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+ # Load from environment
+ TEXT_EXTRACTOR_URL = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
+ TABLE_EXTRACTOR_URL = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
+ NER_URL = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
+ CLASSIFY_URL = os.getenv('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
+ SUMMARIZER_URL = os.getenv('SUMMARIZER_URL', '')  # Optional
+
+ AGENT_BEARER_TOKEN = os.getenv('AGENT_BEARER_TOKEN', '')
+ AGENT_TIMEOUT_SECONDS = int(os.getenv('AGENT_TIMEOUT_SECONDS', '30'))
+
+
+ def get_headers() -> Dict:
+     """Get headers with an optional bearer token."""
+     headers = {}
+     if AGENT_BEARER_TOKEN:
+         headers['Authorization'] = f'Bearer {AGENT_BEARER_TOKEN}'
+     return headers
+
+
+ def call_agent_with_retry(
+     url: str,
+     files: Optional[Dict] = None,
+     data: Optional[Dict] = None,
+     json: Optional[Dict] = None,
+     max_retries: int = 1
+ ) -> Tuple[bool, Optional[Dict], Optional[str]]:
+     """Call an agent endpoint with retry logic."""
+     headers = get_headers()
+
+     for attempt in range(max_retries + 1):
+         try:
+             with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
+                 if files:
+                     response = client.post(url, headers=headers, files=files, data=data)
+                 elif json:
+                     response = client.post(url, headers=headers, json=json)
+                 else:
+                     response = client.post(url, headers=headers, data=data)
+
+                 if response.status_code == 200:
+                     return True, response.json(), None
+                 elif response.status_code == 429:
+                     if attempt < max_retries:
+                         time.sleep(2)
+                         continue
+                     return False, None, "Rate limited"
+                 else:
+                     return False, None, f"HTTP {response.status_code}: {response.text[:200]}"
+
+         except httpx.TimeoutException:
+             if attempt < max_retries:
+                 time.sleep(1)
+                 continue
+             return False, None, f"Timeout after {AGENT_TIMEOUT_SECONDS}s"
+         except Exception as e:
+             if attempt < max_retries:
+                 time.sleep(1)
+                 continue
+             return False, None, str(e)
+
+     return False, None, "Max retries exceeded"
+
+
+ def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
+     """Extract text using the HF agent."""
+     try:
+         with open(file_path, 'rb') as f:
+             files = {'file': (file_path.name, f, 'application/pdf')}
+             data = {'filename': file_path.name}
+
+             success, response, error = call_agent_with_retry(TEXT_EXTRACTOR_URL, files=files, data=data)
+
+         if success and response:
+             text = response.get('text', '')
+             if not text or len(text.strip()) < 10:
+                 return False, None, "No text extracted"
+             return True, text, None
+         else:
+             return False, None, error or "Text extraction failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
+     """Extract tables using the HF agent."""
+     try:
+         with open(file_path, 'rb') as f:
+             files = {'file': (file_path.name, f, 'application/pdf')}
+             data = {'filename': file_path.name}
+
+             success, response, error = call_agent_with_retry(TABLE_EXTRACTOR_URL, files=files, data=data)
+
+         if success and response:
+             return True, response.get('tables', []), None
+         else:
+             return False, None, error or "Table extraction failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
+     """Extract entities using the NER agent."""
+     try:
+         success, response, error = call_agent_with_retry(NER_URL, json={'text': text})
+
+         if success and response:
+             return True, response.get('entities', []), None
+         else:
+             return False, None, error or "NER failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
+     """Classify a document using the classifier agent."""
+     try:
+         success, response, error = call_agent_with_retry(CLASSIFY_URL, json={'text': text[:2000]})
+
+         if success and response:
+             return True, response, None
+         else:
+             return False, None, error or "Classification failed"
+     except Exception as e:
+         return False, None, str(e)
+
+
+ def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
+     """Summarize text (optional)."""
+     if not SUMMARIZER_URL:
+         return True, None, None
+
+     try:
+         success, response, error = call_agent_with_retry(SUMMARIZER_URL, json={'text': text[:5000]})
+
+         if success and response:
+             return True, response.get('summary', ''), None
+         else:
+             return False, None, error or "Summarization failed"
+     except Exception as e:
+         return False, None, str(e)
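Each helper above returns a (success, payload, error) triple instead of raising, so callers branch on the flag. A minimal sketch chaining two of them (invoice.pdf is a placeholder path):

from pathlib import Path

from backend.app.utils.agent_client import (
    extract_entities_from_text,
    extract_text_from_file,
)

ok, text, err = extract_text_from_file(Path("invoice.pdf"))
if not ok:
    print(f"Text extraction failed: {err}")
else:
    ok, entities, err = extract_entities_from_text(text)
    print(entities if ok else f"NER failed: {err}")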
backend/app/wrappers/__init__.py ADDED
File without changes
backend/app/wrappers/gemini_output_filter.py ADDED
@@ -0,0 +1,349 @@
+ import json
+ import os
+ import re
+ import logging
+ import time
+ from typing import Dict, Optional
+
+ try:
+     import google.generativeai as genai
+ except ImportError:
+     raise ImportError("Install google-generativeai: pip install google-generativeai")
+
+ logger = logging.getLogger(__name__)
+
+
+ class GeminiOutputFilter:
+     """
+     Context-aware output filter that adapts to any invoice format.
+     No hardcoded field glossary - Gemini discovers fields dynamically.
+     WITH RATE LIMIT HANDLING
+     """
+
+     def __init__(self):
+         """Initialize the Gemini model"""
+         api_key = os.getenv('GEMINI_API_KEY')
+         if not api_key:
+             raise ValueError("GEMINI_API_KEY environment variable not set")
+
+         genai.configure(api_key=api_key)
+         self.model = genai.GenerativeModel('gemini-2.5-flash')
+
+         logger.info("✅ GeminiOutputFilter initialized")
+
+     def filter_output(self, user_message: str, full_extraction: Dict, max_retries: int = 3) -> Dict:
+         """
+         Filter the extraction based on the user message, with retry logic for rate limits.
+
+         Args:
+             user_message: What the user wants (e.g., "I need total and date")
+             full_extraction: Complete extraction from the agent (any format)
+             max_retries: Maximum number of retry attempts for rate limits
+
+         Returns:
+             Filtered result with only the requested fields
+         """
+         logger.info(f"🔍 Filtering request: '{user_message}'")
+         logger.info(f"📊 Available fields: {list(full_extraction.keys())}")
+
+         # Build the context-aware prompt
+         prompt = self._build_prompt(user_message, full_extraction)
+
+         for attempt in range(max_retries):
+             try:
+                 logger.info(f"🤖 Calling Gemini (attempt {attempt + 1}/{max_retries})...")
+
+                 # Call Gemini
+                 response = self.model.generate_content(prompt)
+                 response_text = response.text.strip()
+
+                 # Clean markdown fences if present
+                 response_text = response_text.replace('```json', '').replace('```', '').strip()
+
+                 # Parse JSON
+                 filtered_result = json.loads(response_text)
+
+                 logger.info(f"✅ Filtered result: {list(filtered_result.keys())}")
+                 return filtered_result
+
+             except json.JSONDecodeError as e:
+                 logger.error(f"❌ JSON parse error: {e}")
+                 logger.error(f"Response was: {response_text[:300]}")
+                 return {
+                     "_error": "Failed to parse AI response",
+                     "_debug": response_text[:300],
+                     "_fallback": full_extraction
+                 }
+
+             except Exception as e:
+                 error_msg = str(e)
+
+                 # Check if it's a rate limit error (429)
+                 is_rate_limit = (
+                     "429" in error_msg or
+                     "quota" in error_msg.lower() or
+                     "rate limit" in error_msg.lower() or
+                     "exceeded" in error_msg.lower()
+                 )
+
+                 if is_rate_limit:
+                     # Extract the wait time from the error message
+                     wait_time = self._extract_retry_delay(error_msg)
+
+                     if attempt < max_retries - 1:
+                         logger.warning(f"⚠️ Rate limit hit (attempt {attempt + 1}/{max_retries})")
+                         logger.info(f"⏳ Waiting {wait_time:.1f}s before retry...")
+                         time.sleep(wait_time)
+                         continue
+                     else:
+                         # Max retries exhausted
+                         logger.error(f"❌ Rate limit exceeded after {max_retries} attempts")
+                         logger.error(f"Full error: {error_msg}")
+                         return {
+                             "_error": f"Filtering failed: {error_msg}",
+                             "_fallback": full_extraction
+                         }
+                 else:
+                     # Non-rate-limit error - fail immediately
+                     logger.error(f"❌ Filtering failed: {e}")
+                     return {
+                         "_error": f"Filtering failed: {str(e)}",
+                         "_fallback": full_extraction
+                     }
+
+         # Should not be reached, but kept as a safety net
+         return {
+             "_error": "Max retries exceeded",
+             "_fallback": full_extraction
+         }
+
+     def _extract_retry_delay(self, error_message: str) -> float:
+         """
+         Extract the retry delay from a Gemini error message.
+
+         Gemini rate-limit errors include text like: "Please retry in 50.923950003s"
+         """
+         # Look for the pattern: "retry in X.Xs" or "retry in Xs"
+         match = re.search(r'retry in ([\d.]+)s', error_message, re.IGNORECASE)
+         if match:
+             retry_seconds = float(match.group(1))
+             # Add a small buffer (2 seconds) to be safe
+             wait_time = retry_seconds + 2
+             logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
+             return wait_time
+
+         # Look for alternative patterns in the error
+         match = re.search(r'(\d+)\s*(?:second|sec)', error_message, re.IGNORECASE)
+         if match:
+             retry_seconds = float(match.group(1))
+             wait_time = retry_seconds + 2
+             logger.info(f"📍 Extracted wait time from error: {retry_seconds}s (using {wait_time}s with buffer)")
+             return wait_time
+
+         # Default: fixed fallback wait (the attempt number is not available here)
+         default_wait = 10.0
+         logger.warning(f"⚠️ Could not extract retry delay, using default: {default_wait}s")
+         return default_wait
+
+     def _build_prompt(self, user_message: str, full_extraction: Dict) -> str:
+         """Build the context-aware Gemini prompt"""
+         return f"""You are an intelligent output filter for an invoice extraction system that handles invoices from MANY different companies with DIFFERENT formats and field names.
+
+ YOUR TASK:
+ Our agent has extracted data from an invoice. The fields extracted depend on the invoice format - different companies use different field names and structures. You need to understand what the user wants and map it to whatever fields are available in THIS specific extraction.
+
+ ====================
+ USER'S REQUEST:
+ ====================
+ "{user_message}"
+
+ ====================
+ EXTRACTED DATA (from this specific invoice):
+ ====================
+ {json.dumps(full_extraction, indent=2)}
+
+ ====================
+ YOUR JOB:
+ ====================
+ 1. ANALYZE the extracted fields to understand what data is available
+ 2. UNDERSTAND what the user is asking for
+ 3. MAP the user's request to the actual field names in this extraction
+ 4. RETURN only the fields the user requested
+
+ ====================
+ IMPORTANT CONTEXT AWARENESS:
+ ====================
+ Different invoices have different field names. You must be flexible and understand INTENT:
+
+ USER ASKS FOR "total" or "amount":
+ - Could be: total_open_amount, total, amount, grand_total, net_amount, invoice_total, final_amount, etc.
+ - Look for fields that contain: "total", "amount", "price", "sum", or numeric values that seem like totals
+
+ USER ASKS FOR "date":
+ - Could be: posting_date, invoice_date, date, issue_date, date_of_issue, created_date, created, etc.
+ - Look for fields with: "date", "created", "issue" or date-like values (YYYY-MM-DD format)
+
+ USER ASKS FOR "customer" or "client":
+ - Could be: cust_number, customer, client, customer_name, client_name, bill_to, buyer, purchaser, etc.
+ - Look for fields with: "cust", "client", "customer", "buyer", "bill", "purchaser"
+
+ USER ASKS FOR "invoice number":
+ - Could be: invoice_id, invoice_no, invoice_number, doc_no, document_number, reference, ref_no, doc_reference, etc.
+ - Look for fields with: "invoice", "doc", "number", "id", "ref", "reference"
+
+ USER ASKS FOR "payment terms":
+ - Could be: payment_terms, terms, due_terms, payment_conditions, net_terms, etc.
+ - Look for fields with: "payment", "terms", "due", "net"
+
+ ====================
+ STRATEGY:
+ ====================
+ 1. First, list out all available fields from the extraction
+ 2. For each field, infer what type of data it contains based on:
+    - Field name (does it contain keywords like "total", "date", "customer"?)
+    - Value type (is it a number? date? string?)
+    - Value content (does it look like money? a date? a name?)
+ 3. Match the user's request to the best-fitting available fields
+ 4. If multiple fields could match, pick the most likely one (e.g., "grand_total" over "subtotal")
+ 5. If NO fields match, explain what's available
+
+ ====================
+ FLEXIBILITY EXAMPLES:
+ ====================
+
+ Example 1 - Simple mapping:
+ Extraction: {{"total_amount": 500, "customer_name": "ABC Corp"}}
+ User: "show me total"
+ Response: {{"total_amount": 500}}
+
+ Example 2 - Different field name:
+ Extraction: {{"grand_total": 500, "bill_to": "ABC Corp"}}
+ User: "show me total"
+ Response: {{"grand_total": 500}}
+
+ Example 3 - User-friendly name:
+ Extraction: {{"invoice_amt": 500, "client_id": "ABC Corp"}}
+ User: "what's the amount?"
+ Response: {{"amount": 500}}
+
+ Example 4 - Multiple fields requested:
+ Extraction: {{"total": 500, "date": "2024-01-01", "customer": "ABC"}}
+ User: "I need total and date"
+ Response: {{"total": 500, "date": "2024-01-01"}}
+
+ Example 5 - Extract all:
+ User: "extract all information" OR "give me everything" OR "show full data"
+ Response: {{entire extraction unchanged}}
+
+ Example 6 - Field not found:
+ Extraction: {{"total": 500, "date": "2024-01-01"}}
+ User: "show me shipping address"
+ Response: {{
+   "_error": "No shipping address found in this invoice",
+   "_available_fields": {{
+     "total": "appears to be invoice total amount",
+     "date": "appears to be invoice date"
+   }},
+   "_suggestion": "Available data: total, date"
+ }}
+
+ ====================
+ RESPONSE FORMAT:
+ ====================
+ Return ONLY valid JSON, no markdown, no extra text.
+
+ If successful:
+ {{
+   "field_name": value
+ }}
+
+ You CAN rename fields to be user-friendly:
+ {{
+   "total": 500  // even if the original field was "invoice_amt"
+ }}
+
+ If the field is not found:
+ {{
+   "_error": "...",
+   "_available_fields": {{
+     "field1": "what this field appears to contain",
+     "field2": "what this field appears to contain"
+   }}
+ }}
+
+ ====================
+ CRITICAL RULES:
+ ====================
+ 1. DO NOT assume what fields exist. ONLY work with the fields present in the extraction JSON above.
+ 2. Be intelligent about inferring what each field means based on its name and value.
+ 3. If the user asks for "all" or "everything", return the ENTIRE extraction unchanged.
+ 4. Always return valid JSON only - no explanations outside the JSON.
+
+ Now process the user's request."""
+
+     def analyze_extraction(self, extraction: Dict) -> Dict:
+         """
+         Optional utility: get Gemini's analysis of what the fields mean.
+         Useful for debugging or showing users what's available.
+         """
+         prompt = f"""Analyze this invoice extraction and explain what each field likely contains:
+
+ {json.dumps(extraction, indent=2)}
+
+ For each field, provide:
+ - Field name
+ - Likely meaning (what data it contains)
+ - Data type
+ - User-friendly name suggestion
+
+ Return as JSON:
+ {{
+   "field_name": {{
+     "meaning": "...",
+     "type": "...",
+     "user_friendly_name": "..."
+   }}
+ }}"""
+
+         try:
+             response = self.model.generate_content(prompt)
+             response_text = response.text.strip().replace('```json', '').replace('```', '')
+             analysis = json.loads(response_text)
+             return analysis
+         except Exception as e:
+             logger.error(f"Analysis failed: {e}")
+             return {"error": f"Could not analyze extraction: {str(e)}"}
+
+
+ # ============================================
+ # Usage Example (for testing)
+ # ============================================
+
+ if __name__ == "__main__":
+     # Test the wrapper with retry logic
+     extraction = {
+         "cust_number": "Martinez Rosales, An",
+         "posting_date": "2015-07-21",
+         "total_open_amount": 442.93,
+         "business_code": "U001",
+         "cust_payment_terms": "NAH4"
+     }
+
+     wrapper = GeminiOutputFilter()
+
+     print("\n" + "=" * 60)
+     print("TEST: User asks 'show me who the customer is'")
+     print("=" * 60)
+     result = wrapper.filter_output("show me who the customer is", extraction, max_retries=3)
+     print(f"Result: {json.dumps(result, indent=2)}")
backend/database/__init__.py ADDED
File without changes
backend/database/migration_ingest_v1.sql ADDED
@@ -0,0 +1,67 @@
+ -- ============================================
+ -- Minimal Ingest Pipeline Tables
+ -- Version: 1.0 (Idempotent)
+ -- ============================================
+
+ -- Table 1: ingest_jobs (job tracking)
+ CREATE TABLE IF NOT EXISTS ingest_jobs (
+     job_id TEXT PRIMARY KEY,
+     doc_id INTEGER,
+     filename TEXT NOT NULL,
+     status TEXT NOT NULL DEFAULT 'queued',
+     error_text TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Drop indexes if they exist, then recreate
+ DROP INDEX IF EXISTS idx_ingest_jobs_status;
+ DROP INDEX IF EXISTS idx_ingest_jobs_created;
+ CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
+ CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);
+
+ -- Table 2: documents (file metadata)
+ CREATE TABLE IF NOT EXISTS documents (
+     doc_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     job_id TEXT NOT NULL,
+     path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     content_type TEXT NOT NULL,
+     uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+ );
+
+ DROP INDEX IF EXISTS idx_documents_job_id;
+ CREATE INDEX idx_documents_job_id ON documents(job_id);
+
+ -- Table 3: extractions (agent artifacts)
+ CREATE TABLE IF NOT EXISTS extractions (
+     doc_id INTEGER PRIMARY KEY,
+     raw_text TEXT,
+     tables_json TEXT,
+     entities_json TEXT,
+     classification_json TEXT,
+     summary_text TEXT,
+     extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Table 4: invoice_fields (mapped fields for prediction)
+ CREATE TABLE IF NOT EXISTS invoice_fields (
+     invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     doc_id INTEGER NOT NULL,
+     cust_number TEXT,
+     posting_date TEXT,
+     total_open_amount REAL,
+     business_code TEXT,
+     cust_payment_terms TEXT,
+     invoice_currency TEXT DEFAULT 'USD',
+     due_in_date TEXT,
+     confidence_map TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
+ CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);
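Note: this v1 schema declares doc_id as INTEGER (with documents.doc_id as an AUTOINCREMENT key), while app.py generates string ids of the form doc_<12 hex chars>. SQLite's type affinity would still store those strings, but the v2 migration below switches doc_id to TEXT so the declared types match the data.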
backend/database/migration_ingest_v2.sql ADDED
@@ -0,0 +1,63 @@
+ -- ============================================
+ -- Invoice Ingest Pipeline - Complete Schema
+ -- Version: 2.0
+ -- ============================================
+
+ -- Table 1: ingest_jobs
+ CREATE TABLE IF NOT EXISTS ingest_jobs (
+     job_id TEXT PRIMARY KEY,
+     doc_id TEXT,
+     filename TEXT NOT NULL,
+     status TEXT NOT NULL DEFAULT 'queued',
+     error_text TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+ );
+
+ DROP INDEX IF EXISTS idx_ingest_jobs_status;
+ DROP INDEX IF EXISTS idx_ingest_jobs_created;
+ CREATE INDEX idx_ingest_jobs_status ON ingest_jobs(status);
+ CREATE INDEX idx_ingest_jobs_created ON ingest_jobs(created_at DESC);
+
+ -- Table 2: documents
+ CREATE TABLE IF NOT EXISTS documents (
+     doc_id TEXT PRIMARY KEY,
+     job_id TEXT NOT NULL,
+     path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     content_type TEXT NOT NULL,
+     uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
+ );
+
+ DROP INDEX IF EXISTS idx_documents_job_id;
+ CREATE INDEX idx_documents_job_id ON documents(job_id);
+
+ -- Table 3: extractions
+ CREATE TABLE IF NOT EXISTS extractions (
+     doc_id TEXT PRIMARY KEY,
+     raw_text TEXT,
+     tables_json TEXT,
+     entities_json TEXT,
+     classification_json TEXT,
+     summary_text TEXT,
+     extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ -- Table 4: invoice_fields
+ CREATE TABLE IF NOT EXISTS invoice_fields (
+     invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     doc_id TEXT NOT NULL,
+     cust_number TEXT,
+     posting_date TEXT,
+     total_open_amount REAL,
+     business_code TEXT,
+     cust_payment_terms TEXT,
+     confidence_map TEXT,
+     created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
+ );
+
+ DROP INDEX IF EXISTS idx_invoice_fields_doc_id;
+ CREATE INDEX idx_invoice_fields_doc_id ON invoice_fields(doc_id);
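Sketch of applying this migration from Python, mirroring the Dockerfile step that seeds the schema with the sqlite3 CLI (the paths are the container defaults and an assumption for local runs):

import sqlite3

conn = sqlite3.connect("/app/data/invoices.db")
with open("backend/database/migration_ingest_v2.sql") as f:
    # executescript runs the whole multi-statement migration in one call.
    conn.executescript(f.read())
conn.commit()
conn.close()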
backend/database/queries.sql ADDED
@@ -0,0 +1,354 @@
1
+ -- Fix for the overdue percentage calculation
2
+ -- Original line 32 had syntax error
3
+
4
+ -- CORRECTED Query 1:
5
+ WITH customer_stats AS (
6
+ SELECT
7
+ cust_number,
8
+ COUNT(*) as total_invoices,
9
+ COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,
10
+
11
+ AVG(days_to_clear) as avg_days,
12
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
13
+ STDDEV(days_to_clear) as std_days,
14
+ MIN(days_to_clear) as min_days,
15
+ MAX(days_to_clear) as max_days,
16
+
17
+ AVG(total_open_amount) as avg_amount,
18
+ SUM(total_open_amount) as total_amount,
19
+
20
+ -- FIXED: Overdue percentage calculation
21
+ CASE
22
+ WHEN COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
23
+ THEN (CAST(COUNT(CASE WHEN is_overdue = TRUE THEN 1 END) AS NUMERIC) /
24
+ CAST(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) AS NUMERIC) * 100)
25
+ ELSE 0.0
26
+ END as pct_overdue,
27
+
28
+ (SELECT cust_payment_terms FROM invoices_history WHERE cust_number = $1 GROUP BY cust_payment_terms ORDER BY COUNT(*) DESC LIMIT 1) as most_common_payment_term,
29
+ (SELECT business_code FROM invoices_history WHERE cust_number = $1 GROUP BY business_code ORDER BY COUNT(*) DESC LIMIT 1) as most_common_business_code,
30
+ (SELECT invoice_currency FROM invoices_history WHERE cust_number = $1 GROUP BY invoice_currency ORDER BY COUNT(*) DESC LIMIT 1) as most_common_currency
31
+
32
+ FROM invoices_history
33
+ WHERE cust_number = $1
34
+ GROUP BY cust_number
35
+ )
36
+ SELECT
37
+ cust_number,
38
+ total_invoices as cust_invoice_count,
39
+ cleared_count as cust_cleared_count,
40
+ ROUND(avg_days, 2) as cust_avg_days,
41
+ ROUND(median_days, 2) as cust_median_days,
42
+ ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
43
+ min_days as cust_min_days,
44
+ max_days as cust_max_days,
45
+ ROUND(avg_amount, 2) as cust_avg_amount,
46
+ ROUND(total_amount, 2) as cust_total_amount,
47
+ ROUND(pct_overdue, 2) as cust_pct_overdue,
48
+ most_common_payment_term,
49
+ most_common_business_code,
50
+ most_common_currency
51
+ FROM customer_stats;
52
+ -- ============================================
53
+ -- QUERY 2: Batch Compute All Customer Aggregates
54
+ -- Usage: Nightly ETL job
55
+ -- ============================================
56
+
57
+ -- Name: compute_all_customer_aggregates
58
+ -- Description: Computes aggregates for ALL customers with cleared invoices
59
+ WITH customer_stats AS (
60
+ SELECT
61
+ cust_number,
62
+ COUNT(*) as total_invoices,
63
+ COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) as cleared_count,
64
+
65
+ AVG(days_to_clear) as avg_days,
66
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
67
+ STDDEV(days_to_clear) as std_days,
68
+ MIN(days_to_clear) as min_days,
69
+ MAX(days_to_clear) as max_days,
70
+
71
+ AVG(total_open_amount) as avg_amount,
72
+ SUM(total_open_amount) as total_amount,
73
+
74
+ COUNT(CASE WHEN is_overdue = TRUE THEN 1 END)::NUMERIC /
75
+ NULLIF(COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END), 0) * 100 as pct_overdue,
76
+
77
+ MODE() WITHIN GROUP (ORDER BY cust_payment_terms) as most_common_payment_term,
78
+ MODE() WITHIN GROUP (ORDER BY business_code) as most_common_business_code,
79
+ MODE() WITHIN GROUP (ORDER BY invoice_currency) as most_common_currency
80
+
81
+ FROM invoices_history
82
+ WHERE clear_date IS NOT NULL -- Only customers with history
83
+ GROUP BY cust_number
84
+ HAVING COUNT(CASE WHEN clear_date IS NOT NULL THEN 1 END) > 0
85
+ )
86
+ SELECT
87
+ cust_number,
88
+ total_invoices as cust_invoice_count,
89
+ cleared_count as cust_cleared_count,
90
+ ROUND(avg_days, 2) as cust_avg_days,
91
+ ROUND(median_days, 2) as cust_median_days,
92
+ ROUND(COALESCE(std_days, 0), 2) as cust_std_days,
93
+ min_days as cust_min_days,
94
+ max_days as cust_max_days,
95
+ ROUND(avg_amount, 2) as cust_avg_amount,
96
+ ROUND(total_amount, 2) as cust_total_amount,
97
+ ROUND(COALESCE(pct_overdue, 0), 2) as cust_pct_overdue,
98
+ most_common_payment_term,
99
+ most_common_business_code,
100
+ most_common_currency,
101
+ NOW() as last_computed_at
102
+ FROM customer_stats;
103
+
104
+ -- ============================================
105
+ -- QUERY 3: Upsert Customer Aggregates
106
+ -- Usage: Insert or update customer_aggregates table
107
+ -- ============================================
108
+
109
+ -- Name: upsert_customer_aggregates
110
+ -- Description: Insert/update aggregates with conflict handling
111
+ -- Parameters: All customer aggregate fields
112
+ INSERT INTO customer_aggregates (
113
+ cust_number,
114
+ cust_invoice_count,
115
+ cust_cleared_count,
116
+ cust_avg_days,
117
+ cust_median_days,
118
+ cust_std_days,
119
+ cust_min_days,
120
+ cust_max_days,
121
+ cust_avg_amount,
122
+ cust_total_amount,
123
+ cust_pct_overdue,
124
+ most_common_payment_term,
125
+ most_common_business_code,
126
+ most_common_currency,
127
+ last_computed_at
128
+ ) VALUES (
129
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW()
130
+ )
131
+ ON CONFLICT (cust_number)
132
+ DO UPDATE SET
133
+ cust_invoice_count = EXCLUDED.cust_invoice_count,
134
+ cust_cleared_count = EXCLUDED.cust_cleared_count,
135
+ cust_avg_days = EXCLUDED.cust_avg_days,
136
+ cust_median_days = EXCLUDED.cust_median_days,
137
+ cust_std_days = EXCLUDED.cust_std_days,
138
+ cust_min_days = EXCLUDED.cust_min_days,
139
+ cust_max_days = EXCLUDED.cust_max_days,
140
+ cust_avg_amount = EXCLUDED.cust_avg_amount,
141
+ cust_total_amount = EXCLUDED.cust_total_amount,
142
+ cust_pct_overdue = EXCLUDED.cust_pct_overdue,
143
+ most_common_payment_term = EXCLUDED.most_common_payment_term,
144
+ most_common_business_code = EXCLUDED.most_common_business_code,
145
+ most_common_currency = EXCLUDED.most_common_currency,
146
+ last_computed_at = NOW();
147
+
148
+ -- ============================================
149
+ -- QUERY 4: Compute Payment Terms Aggregates
150
+ -- Usage: Pre-compute payment term statistics
151
+ -- ============================================
152
+
153
+ -- Name: compute_payment_terms_aggregates
154
+ WITH payment_stats AS (
155
+ SELECT
156
+ cust_payment_terms,
157
+ AVG(days_to_clear) as avg_days,
158
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
159
+ COUNT(*) as invoice_count
160
+ FROM invoices_history
161
+ WHERE clear_date IS NOT NULL
162
+ AND cust_payment_terms IS NOT NULL
163
+ GROUP BY cust_payment_terms
164
+ )
165
+ SELECT
166
+ cust_payment_terms,
167
+ ROUND(avg_days, 2) as payment_terms_avg_days,
168
+ ROUND(median_days::NUMERIC, 2) as payment_terms_median_days,
169
+ invoice_count as payment_terms_count,
170
+ NOW() as last_computed_at
171
+ FROM payment_stats;
172
+
173
+ -- ============================================
174
+ -- QUERY 5: Compute Business Code Aggregates
175
+ -- Usage: Pre-compute business code statistics
176
+ -- ============================================
177
+
178
+ -- Name: compute_business_code_aggregates
179
+ WITH business_stats AS (
180
+ SELECT
181
+ business_code,
182
+ AVG(days_to_clear) as avg_days,
183
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY days_to_clear) as median_days,
184
+ COUNT(*) as invoice_count
185
+ FROM invoices_history
186
+ WHERE clear_date IS NOT NULL
187
+ AND business_code IS NOT NULL
188
+ GROUP BY business_code
189
+ )
190
+ SELECT
191
+ business_code,
192
+ ROUND(avg_days, 2) as business_avg_days,
193
+ ROUND(median_days::NUMERIC, 2) as business_median_days,
194
+ invoice_count as business_count,
195
+ NOW() as last_computed_at
196
+ FROM business_stats;
197
+
198
+ -- ============================================
199
+ -- QUERY 6: Get Customer Features (for inference)
200
+ -- Usage: Retrieve all features for a customer
201
+ -- ============================================
202
+
203
+ -- Name: get_customer_features
204
+ -- Description: Get customer aggregates for prediction
205
+ -- Parameters: $1 = cust_number
206
+ SELECT
207
+ cust_number,
208
+ cust_invoice_count,
209
+ cust_cleared_count,
210
+ cust_avg_days,
211
+ cust_median_days,
212
+ cust_std_days,
213
+ cust_min_days,
214
+ cust_max_days,
215
+ cust_avg_amount,
216
+ cust_total_amount,
217
+ cust_pct_overdue,
218
+ most_common_payment_term,
219
+ most_common_business_code,
220
+ most_common_currency,
221
+ last_computed_at
222
+ FROM customer_aggregates
223
+ WHERE cust_number = $1;
224
+
225
+ -- ============================================
226
+ -- QUERY 7: Get Payment Terms Features
227
+ -- Usage: Retrieve payment term stats
228
+ -- ============================================
229
+
230
+ -- Name: get_payment_terms_features
231
+ -- Parameters: $1 = cust_payment_terms
232
+ SELECT
233
+ cust_payment_terms,
234
+ payment_terms_avg_days,
235
+ payment_terms_median_days,
236
+ payment_terms_count
237
+ FROM payment_terms_aggregates
238
+ WHERE cust_payment_terms = $1;
239
+
240
+ -- ============================================
241
+ -- QUERY 8: Get Business Code Features
242
+ -- Usage: Retrieve business code stats
243
+ -- ============================================
244
+
245
+ -- Name: get_business_code_features
246
+ -- Parameters: $1 = business_code
247
+ SELECT
248
+ business_code,
249
+ business_avg_days,
250
+ business_median_days,
251
+ business_count
252
+ FROM business_code_aggregates
253
+ WHERE business_code = $1;
254
+
255
+ -- ============================================
256
+ -- QUERY 9: Insert Invoice (with upsert)
257
+ -- Usage: Ingest new invoice data
258
+ -- ============================================
259
+
260
+ -- Name: upsert_invoice
261
+ -- Parameters: All invoice fields
262
+ INSERT INTO invoices_history (
263
+ invoice_id,
264
+ business_code,
265
+ cust_number,
266
+ name_customer,
267
+ posting_date,
268
+ document_create_date,
269
+ document_create_date_alt,
270
+ due_in_date,
271
+ baseline_create_date,
272
+ clear_date,
273
+ total_open_amount,
274
+ invoice_currency,
275
+ document_type,
276
+ cust_payment_terms,
277
+ posting_id,
278
+ business_year
279
+ ) VALUES (
280
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16
281
+ )
282
+ ON CONFLICT (invoice_id)
283
+ DO UPDATE SET
284
+ clear_date = EXCLUDED.clear_date,
285
+ is_open = CASE WHEN EXCLUDED.clear_date IS NULL THEN 1 ELSE 0 END,  -- is_open is absent from the insert column list, so EXCLUDED.is_open would only yield the column default; derive it from clear_date instead
286
+ updated_at = NOW();
287
+
288
+ -- ============================================
289
+ -- QUERY 10: Insert Prediction Log
290
+ -- Usage: Record prediction for monitoring
291
+ -- ============================================
292
+
293
+ -- Name: insert_prediction_log
294
+ -- Parameters: prediction fields
295
+ INSERT INTO predictions_log (
296
+ invoice_id,
297
+ cust_number,
298
+ posting_date,
299
+ total_open_amount,
300
+ business_code,
301
+ cust_payment_terms,
302
+ features_json,
303
+ predicted_days_to_clear,
304
+ predicted_clear_date,
305
+ model_version,
306
+ model_path
307
+ ) VALUES (
308
+ $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11
309
+ ) RETURNING prediction_id;
310
+
311
+ -- ============================================
312
+ -- QUERY 11: Update Prediction with Actual Outcome
313
+ -- Usage: Record actual outcome for model monitoring
314
+ -- ============================================
315
+
316
+ -- Name: update_prediction_outcome
317
+ -- Parameters: $1 = prediction_id, $2 = actual_clear_date
318
+ UPDATE predictions_log
319
+ SET
320
+ actual_clear_date = $2,
321
+ actual_days_to_clear = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER,
322
+ prediction_error = EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear,
323
+ absolute_error = ABS(EXTRACT(DAY FROM ($2 - posting_date))::INTEGER - predicted_days_to_clear),
324
+ outcome_recorded_at = NOW()
325
+ WHERE prediction_id = $1;
326
+
327
+ -- ============================================
328
+ -- QUERY 12: Get Recent Predictions Performance
329
+ -- Usage: Monitor model accuracy
330
+ -- ============================================
331
+
332
+ -- Name: get_prediction_metrics
333
+ -- Description: Calculate model performance over last N days
334
+ -- Parameters: $1 = days_back (e.g., 30)
335
+ SELECT
336
+ COUNT(*) as total_predictions,
337
+ COUNT(actual_days_to_clear) as predictions_with_outcome,
338
+ ROUND(AVG(ABS(prediction_error))::NUMERIC, 2) as mae,
339
+ ROUND(SQRT(AVG(prediction_error * prediction_error))::NUMERIC, 2) as rmse,  -- cast to NUMERIC: two-argument ROUND is not defined for double precision in PostgreSQL
340
+ ROUND(AVG(CASE
341
+ WHEN ABS(prediction_error) <= 3 THEN 1.0
342
+ ELSE 0.0
343
+ END) * 100, 2) as pct_within_3_days,
344
+ ROUND(AVG(CASE
345
+ WHEN ABS(prediction_error) <= 7 THEN 1.0
346
+ ELSE 0.0
347
+ END) * 100, 2) as pct_within_7_days
348
+ FROM predictions_log
349
+ WHERE predicted_at >= NOW() - make_interval(days => $1)  -- a bind parameter cannot appear inside an INTERVAL string literal
350
+ AND actual_days_to_clear IS NOT NULL;
351
+
352
+ -- ============================================
353
+ -- End of Query Templates
354
+ -- ============================================
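These templates use libpq-style $1 placeholders, so they can be passed to asyncpg as-is; with the psycopg2 pin in requirements.txt they first have to be rewritten to %s. A minimal sketch of a template loader, assuming the templates live in a queries.sql file (the filename is an assumption) and a DATABASE_URL environment variable:

import os
import re
import psycopg2

def load_template(path: str, name: str) -> str:
    """Extract the SQL that follows a '-- Name: <name>' marker, up to the first ';'."""
    text = open(path).read()
    match = re.search(rf"-- Name: {name}\b(.*?;)", text, re.S)
    if not match:
        raise KeyError(f"template not found: {name}")
    # psycopg2 expects %s placeholders; this naive rewrite is only safe for
    # templates whose parameters appear in ascending order exactly once
    # (e.g. get_customer_features); update_prediction_outcome reuses $2 and
    # would need named placeholders instead
    return re.sub(r"\$\d+", "%s", match.group(1))

conn = psycopg2.connect(os.environ["DATABASE_URL"])
with conn, conn.cursor() as cur:
    cur.execute(load_template("queries.sql", "get_customer_features"), ("0200769623",))
    row = cur.fetchone()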
backend/database/schema_sqlite.sql ADDED
@@ -0,0 +1,132 @@
1
+ -- ============================================
2
+ -- Invoice Payment Prediction System - SQLite Schema
3
+ -- Version: 1.0 (SQLite)
4
+ -- ============================================
5
+
6
+ -- Drop existing tables
7
+ DROP TABLE IF EXISTS predictions_log;
8
+ DROP TABLE IF EXISTS business_code_aggregates;
9
+ DROP TABLE IF EXISTS payment_terms_aggregates;
10
+ DROP TABLE IF EXISTS customer_aggregates;
11
+ DROP TABLE IF EXISTS invoices_history;
12
+
13
+ -- ============================================
14
+ -- Table 1: invoices_history
15
+ -- ============================================
16
+ CREATE TABLE invoices_history (
17
+ invoice_id INTEGER PRIMARY KEY,
18
+ business_code TEXT NOT NULL,
19
+ cust_number TEXT NOT NULL,
20
+ name_customer TEXT,
21
+
22
+ -- Dates (stored as TEXT in ISO format: YYYY-MM-DD HH:MM:SS)
23
+ posting_date TEXT NOT NULL,
24
+ document_create_date TEXT,
25
+ document_create_date_alt TEXT,
26
+ due_in_date TEXT,
27
+ baseline_create_date TEXT,
28
+ clear_date TEXT,
29
+
30
+ -- Financial
31
+ total_open_amount REAL NOT NULL,
32
+ invoice_currency TEXT DEFAULT 'USD',
33
+
34
+ -- Metadata
35
+ document_type TEXT,
36
+ cust_payment_terms TEXT,
37
+ posting_id REAL,
38
+ is_open INTEGER DEFAULT 1,
39
+ business_year INTEGER,
40
+
41
+ -- Computed fields
42
+ days_to_clear INTEGER,
43
+ days_posting_to_due INTEGER,
44
+ days_create_to_posting INTEGER,
45
+ days_baseline_to_posting INTEGER,
46
+ is_overdue INTEGER DEFAULT 0,
47
+
48
+ -- Audit
49
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
50
+ updated_at TEXT DEFAULT CURRENT_TIMESTAMP
51
+ );
52
+
53
+ CREATE INDEX idx_invoices_cust ON invoices_history(cust_number);
54
+ CREATE INDEX idx_invoices_posting ON invoices_history(posting_date);
55
+ CREATE INDEX idx_invoices_cleared ON invoices_history(cust_number, posting_date) WHERE clear_date IS NOT NULL;
56
+
57
+ -- ============================================
58
+ -- Table 2: customer_aggregates
59
+ -- ============================================
60
+ CREATE TABLE customer_aggregates (
61
+ cust_number TEXT PRIMARY KEY,
62
+ cust_invoice_count INTEGER DEFAULT 0,
63
+ cust_cleared_count INTEGER DEFAULT 0,
64
+
65
+ cust_avg_days REAL,
66
+ cust_median_days REAL,
67
+ cust_std_days REAL,
68
+ cust_min_days INTEGER,
69
+ cust_max_days INTEGER,
70
+
71
+ cust_avg_amount REAL,
72
+ cust_total_amount REAL,
73
+ cust_pct_overdue REAL DEFAULT 0.0,
74
+
75
+ most_common_payment_term TEXT,
76
+ most_common_business_code TEXT,
77
+ most_common_currency TEXT,
78
+
79
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP,
80
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP
81
+ );
82
+
83
+ -- ============================================
84
+ -- Table 3: payment_terms_aggregates
85
+ -- ============================================
86
+ CREATE TABLE payment_terms_aggregates (
87
+ cust_payment_terms TEXT PRIMARY KEY,
88
+ payment_terms_avg_days REAL,
89
+ payment_terms_median_days REAL,
90
+ payment_terms_count INTEGER DEFAULT 0,
91
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
92
+ );
93
+
94
+ -- ============================================
95
+ -- Table 4: business_code_aggregates
96
+ -- ============================================
97
+ CREATE TABLE business_code_aggregates (
98
+ business_code TEXT PRIMARY KEY,
99
+ business_avg_days REAL,
100
+ business_median_days REAL,
101
+ business_count INTEGER DEFAULT 0,
102
+ last_computed_at TEXT DEFAULT CURRENT_TIMESTAMP
103
+ );
104
+
105
+ -- ============================================
106
+ -- Table 5: predictions_log
107
+ -- ============================================
108
+ CREATE TABLE predictions_log (
109
+ prediction_id INTEGER PRIMARY KEY AUTOINCREMENT,
110
+ invoice_id INTEGER,
111
+ cust_number TEXT NOT NULL,
112
+ posting_date TEXT NOT NULL,
113
+ total_open_amount REAL NOT NULL,
114
+ business_code TEXT,
115
+ cust_payment_terms TEXT,
116
+
117
+ predicted_days_to_clear REAL NOT NULL,
118
+ predicted_clear_date TEXT NOT NULL,
119
+
120
+ model_version TEXT,
121
+ features_json TEXT,
122
+
123
+ actual_clear_date TEXT,
124
+ actual_days_to_clear INTEGER,
125
+ prediction_error REAL,
126
+ absolute_error REAL,
127
+
128
+ predicted_at TEXT DEFAULT CURRENT_TIMESTAMP
129
+ );
130
+
131
+ CREATE INDEX idx_predictions_cust ON predictions_log(cust_number);
132
+ CREATE INDEX idx_predictions_date ON predictions_log(predicted_at);
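A minimal sketch of applying this schema with the stdlib sqlite3 module, mirroring the Dockerfile's sqlite3-based bootstrap; the paths are assumptions based on the repo layout:

import sqlite3
from pathlib import Path

db_path = Path("data") / "invoices.db"
schema_path = Path("backend") / "database" / "schema_sqlite.sql"

conn = sqlite3.connect(str(db_path))
try:
    # executescript runs the whole DROP TABLE / CREATE TABLE / CREATE INDEX batch
    conn.executescript(schema_path.read_text())
    conn.commit()
finally:
    conn.close()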
backend/etl/__init__.py ADDED
File without changes
backend/etl/update_customer_aggregates_sqlite.py ADDED
@@ -0,0 +1,189 @@
1
+
2
+
3
+ import sqlite3
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ from filelock import FileLock
7
+ from datetime import datetime
8
+
9
+ DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
10
+ LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"
11
+
12
+
13
+ def get_most_common(series):
14
+ """Get mode (most common value)."""
15
+ if series.empty:
16
+ return None
17
+ return series.mode()[0] if not series.mode().empty else None
18
+
19
+
20
+ def update_customer_aggregates():
21
+ """Compute and update customer aggregates."""
22
+
23
+ print("🔄 Starting customer aggregates computation...")
24
+
25
+ with FileLock(str(LOCK_PATH)):
26
+ conn = sqlite3.connect(str(DB_PATH))
27
+
28
+ # Load cleared invoices
29
+ df = pd.read_sql_query("""
30
+ SELECT
31
+ cust_number,
32
+ days_to_clear,
33
+ total_open_amount,
34
+ is_overdue,
35
+ cust_payment_terms,
36
+ business_code,
37
+ invoice_currency
38
+ FROM invoices_history
39
+ WHERE clear_date IS NOT NULL
40
+ """, conn)
41
+
42
+ if df.empty:
43
+ print("⚠️ No cleared invoices found")
44
+ conn.close()
45
+ return
46
+
47
+ print(f"📊 Processing {len(df)} cleared invoices...")
48
+
49
+ # Compute aggregates per customer
50
+ agg_results = []
51
+
52
+ for cust_number, group in df.groupby('cust_number'):
53
+ agg = {
54
+ 'cust_number': cust_number,
55
+ 'cust_invoice_count': len(group),
56
+ 'cust_cleared_count': len(group),
57
+ 'cust_avg_days': round(group['days_to_clear'].mean(), 2),
58
+ 'cust_median_days': round(group['days_to_clear'].median(), 2),
59
+ 'cust_std_days': round(group['days_to_clear'].std(), 2) if len(group) > 1 else 0.0,
60
+ 'cust_min_days': int(group['days_to_clear'].min()),
61
+ 'cust_max_days': int(group['days_to_clear'].max()),
62
+ 'cust_avg_amount': round(group['total_open_amount'].mean(), 2),
63
+ 'cust_total_amount': round(group['total_open_amount'].sum(), 2),
64
+ 'cust_pct_overdue': round((group['is_overdue'].sum() / len(group)) * 100, 2),
65
+ 'most_common_payment_term': get_most_common(group['cust_payment_terms']),
66
+ 'most_common_business_code': get_most_common(group['business_code']),
67
+ 'most_common_currency': get_most_common(group['invoice_currency'])
68
+ }
69
+ agg_results.append(agg)
70
+
71
+ # Upsert into customer_aggregates
72
+ cursor = conn.cursor()
73
+ for agg in agg_results:
74
+ cursor.execute("""
75
+ INSERT OR REPLACE INTO customer_aggregates (
76
+ cust_number, cust_invoice_count, cust_cleared_count,
77
+ cust_avg_days, cust_median_days, cust_std_days,
78
+ cust_min_days, cust_max_days,
79
+ cust_avg_amount, cust_total_amount, cust_pct_overdue,
80
+ most_common_payment_term, most_common_business_code,
81
+ most_common_currency, last_computed_at
82
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
83
+ """, (
84
+ agg['cust_number'], agg['cust_invoice_count'], agg['cust_cleared_count'],
85
+ agg['cust_avg_days'], agg['cust_median_days'], agg['cust_std_days'],
86
+ agg['cust_min_days'], agg['cust_max_days'],
87
+ agg['cust_avg_amount'], agg['cust_total_amount'], agg['cust_pct_overdue'],
88
+ agg['most_common_payment_term'], agg['most_common_business_code'],
89
+ agg['most_common_currency']
90
+ ))
91
+
92
+ conn.commit()
93
+ print(f"✅ Updated {len(agg_results)} customer aggregates")
94
+
95
+ conn.close()
96
+
97
+
98
+ def update_payment_terms_aggregates():
99
+ """Compute and update payment terms aggregates."""
100
+
101
+ print("🔄 Computing payment terms aggregates...")
102
+
103
+ with FileLock(str(LOCK_PATH)):
104
+ conn = sqlite3.connect(str(DB_PATH))
105
+
106
+ df = pd.read_sql_query("""
107
+ SELECT cust_payment_terms, days_to_clear
108
+ FROM invoices_history
109
+ WHERE clear_date IS NOT NULL AND cust_payment_terms IS NOT NULL
110
+ """, conn)
111
+
112
+ if df.empty:
113
+ print("⚠️ No data for payment terms")
114
+ conn.close()
115
+ return
116
+
117
+ agg = df.groupby('cust_payment_terms')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
118
+ agg.columns = ['cust_payment_terms', 'payment_terms_avg_days', 'payment_terms_median_days', 'payment_terms_count']
119
+
120
+ cursor = conn.cursor()
121
+ for _, row in agg.iterrows():
122
+ cursor.execute("""
123
+ INSERT OR REPLACE INTO payment_terms_aggregates (
124
+ cust_payment_terms, payment_terms_avg_days, payment_terms_median_days,
125
+ payment_terms_count, last_computed_at
126
+ ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
127
+ """, (
128
+ row['cust_payment_terms'],
129
+ round(row['payment_terms_avg_days'], 2),
130
+ round(row['payment_terms_median_days'], 2),
131
+ int(row['payment_terms_count'])
132
+ ))
133
+
134
+ conn.commit()
135
+ print(f"✅ Updated {len(agg)} payment terms aggregates")
136
+ conn.close()
137
+
138
+
139
+ def update_business_code_aggregates():
140
+ """Compute and update business code aggregates."""
141
+
142
+ print("🔄 Computing business code aggregates...")
143
+
144
+ with FileLock(str(LOCK_PATH)):
145
+ conn = sqlite3.connect(str(DB_PATH))
146
+
147
+ df = pd.read_sql_query("""
148
+ SELECT business_code, days_to_clear
149
+ FROM invoices_history
150
+ WHERE clear_date IS NOT NULL AND business_code IS NOT NULL
151
+ """, conn)
152
+
153
+ if df.empty:
154
+ print("⚠️ No data for business codes")
155
+ conn.close()
156
+ return
157
+
158
+ agg = df.groupby('business_code')['days_to_clear'].agg(['mean', 'median', 'count']).reset_index()
159
+ agg.columns = ['business_code', 'business_avg_days', 'business_median_days', 'business_count']
160
+
161
+ cursor = conn.cursor()
162
+ for _, row in agg.iterrows():
163
+ cursor.execute("""
164
+ INSERT OR REPLACE INTO business_code_aggregates (
165
+ business_code, business_avg_days, business_median_days,
166
+ business_count, last_computed_at
167
+ ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
168
+ """, (
169
+ row['business_code'],
170
+ round(row['business_avg_days'], 2),
171
+ round(row['business_median_days'], 2),
172
+ int(row['business_count'])
173
+ ))
174
+
175
+ conn.commit()
176
+ print(f"✅ Updated {len(agg)} business code aggregates")
177
+ conn.close()
178
+
179
+
180
+ if __name__ == "__main__":
181
+ print("="*60)
182
+ print("🚀 ETL: Updating Aggregates")
183
+ print("="*60)
184
+
185
+ update_customer_aggregates()
186
+ update_payment_terms_aggregates()
187
+ update_business_code_aggregates()
188
+
189
+ print("\n✅ All aggregates updated successfully!")
backend/feature_builder/__init__.py ADDED
File without changes
backend/feature_builder/feature_builder.py ADDED
@@ -0,0 +1,312 @@
1
+ """
2
+ Feature builder that matches ML training pipeline exactly.
3
+ Generates features for inference from invoice data + aggregates.
4
+ FIXED: Handles None values properly with robust defaults.
5
+ """
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+ from datetime import datetime
10
+ from typing import Dict, Optional
11
+
12
+
13
+ # Default values for new customers (from training)
14
+ DEFAULTS = {
15
+ 'cust_avg_days': 18.0,
16
+ 'cust_median_days': 15.0,
17
+ 'cust_std_days': 0.0,
18
+ 'cust_min_days': 12,
19
+ 'cust_max_days': 25,
20
+ 'cust_invoice_count': 1,
21
+ 'cust_avg_amount': 30000.0,
22
+ 'cust_total_amount': 30000.0,
23
+ 'cust_pct_overdue': 0.0,
24
+ 'payment_terms_avg_days': 15.0,
25
+ 'payment_terms_median_days': 15.0,
26
+ 'payment_terms_count': 100,
27
+ 'business_avg_days': 17.0,
28
+ 'business_median_days': 15.0,
29
+ 'business_count': 1000
30
+ }
31
+
32
+
33
+ def safe_float(value, default=0.0):
34
+ """Safely convert to float with default."""
35
+ if value is None:
36
+ return float(default)
37
+ try:
38
+ return float(value)
39
+ except (ValueError, TypeError):
40
+ return float(default)
41
+
42
+
43
+ def safe_int(value, default=0):
44
+ """Safely convert to int with default."""
45
+ if value is None:
46
+ return int(default)
47
+ try:
48
+ return int(value)
49
+ except (ValueError, TypeError):
50
+ return int(default)
51
+
52
+
53
+ def parse_date(date_str: str) -> datetime:
54
+ """Parse date string to datetime."""
55
+ if isinstance(date_str, datetime):
56
+ return date_str
57
+
58
+ for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
59
+ try:
60
+ return datetime.strptime(str(date_str), fmt)
61
+ except ValueError:
62
+ continue
63
+
64
+ raise ValueError(f"Cannot parse date: {date_str}")
65
+
66
+
67
+ def build_features(
68
+ invoice_data: Dict,
69
+ customer_agg: Optional[Dict] = None,
70
+ payment_terms_agg: Optional[Dict] = None,
71
+ business_code_agg: Optional[Dict] = None
72
+ ) -> Dict:
73
+ """
74
+ Build feature vector matching ML training pipeline.
75
+
76
+ Args:
77
+ invoice_data: Invoice details (posting_date, amount, etc.)
78
+ customer_agg: Customer aggregates from DB (or None for defaults)
79
+ payment_terms_agg: Payment terms aggregates from DB
80
+ business_code_agg: Business code aggregates from DB
81
+
82
+ Returns:
83
+ Dict of features ready for model.predict()
84
+ """
85
+
86
+ # Parse dates
87
+ posting_date = parse_date(invoice_data['posting_date'])
88
+
89
+ # Use provided aggregates or empty dicts (will use defaults)
90
+ cust_agg = customer_agg or {}
91
+ pmt_agg = payment_terms_agg or {}
92
+ biz_agg = business_code_agg or {}
93
+
94
+ # Build feature dictionary
95
+ features = {}
96
+
97
+ # ============================================
98
+ # Categorical Features (encoded as integers)
99
+ # ============================================
100
+
101
+ # Business code mapping
102
+ business_code = invoice_data.get('business_code', 'U001')
103
+ business_code_map = {'U001': 0, 'U002': 1, 'U005': 2, 'U007': 3, 'U013': 4, 'CA02': 5}
104
+ features['business_code'] = business_code_map.get(business_code, 0)
105
+
106
+ # Payment terms (simplified hash encoding; note built-in hash() is salted per process unless PYTHONHASHSEED is fixed, so codes can differ across restarts)
107
+ payment_terms = invoice_data.get('cust_payment_terms', 'NAH4')
108
+ features['cust_payment_terms'] = abs(hash(payment_terms)) % 74
109
+
110
+ # Currency
111
+ currency_map = {'USD': 0, 'CAD': 1}
112
+ features['invoice_currency'] = currency_map.get(invoice_data.get('invoice_currency', 'USD'), 0)
113
+
114
+ # Document type
115
+ doc_type_map = {'RV': 0, 'AB': 1}
116
+ features['document_type'] = doc_type_map.get(invoice_data.get('document_type', 'RV'), 0)
117
+
118
+ # Amount category
119
+ amount = safe_float(invoice_data.get('total_open_amount'), 30000.0)
120
+ if amount < 5000:
121
+ amount_cat = 0 # small
122
+ elif amount < 20000:
123
+ amount_cat = 1 # medium
124
+ elif amount < 50000:
125
+ amount_cat = 2 # large
126
+ else:
127
+ amount_cat = 3 # very_large
128
+ features['amount_category'] = amount_cat
129
+
130
+ # ============================================
131
+ # Numerical Features
132
+ # ============================================
133
+
134
+ features['buisness_year'] = safe_float(invoice_data.get('business_year', posting_date.year))  # 'buisness_year' spelling is intentional: it matches the training column name
135
+ features['total_open_amount'] = amount
136
+ features['amount_log'] = float(np.log1p(amount))
137
+
138
+ # Temporal features
139
+ features['posting_year'] = posting_date.year
140
+ features['posting_month'] = posting_date.month
141
+ features['posting_quarter'] = (posting_date.month - 1) // 3 + 1
142
+ features['posting_day'] = posting_date.day
143
+ features['posting_dayofweek'] = posting_date.weekday()
144
+ features['posting_is_weekend'] = 1 if posting_date.weekday() >= 5 else 0
145
+ features['posting_is_month_end'] = 1 if posting_date.day >= 28 else 0
146
+ features['posting_is_month_start'] = 1 if posting_date.day <= 3 else 0
147
+
148
+ # Days between dates
149
+ features['days_posting_to_due'] = safe_int(invoice_data.get('days_posting_to_due'), 15)
150
+ features['days_create_to_posting'] = safe_int(invoice_data.get('days_create_to_posting'), 0)
151
+ features['days_baseline_to_posting'] = safe_int(invoice_data.get('days_baseline_to_posting'), 0)
152
+
153
+ # Document create date alt (as integer YYYYMMDD)
154
+ doc_create_alt = invoice_data.get('document_create_date_alt')
155
+ if doc_create_alt:
156
+ try:
157
+ cleaned = str(doc_create_alt).replace('-', '').replace(' ', '').replace(':', '')[:8]
158
+ features['document_create_date.1'] = int(cleaned)
159
+ except (TypeError, ValueError):
160
+ features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))
161
+ else:
162
+ features['document_create_date.1'] = int(posting_date.strftime('%Y%m%d'))
163
+
164
+ # ============================================
165
+ # Customer Aggregates (with robust defaults)
166
+ # ============================================
167
+
168
+ features['cust_avg_days'] = safe_float(
169
+ cust_agg.get('cust_avg_days'),
170
+ DEFAULTS['cust_avg_days']
171
+ )
172
+ features['cust_median_days'] = safe_float(
173
+ cust_agg.get('cust_median_days'),
174
+ DEFAULTS['cust_median_days']
175
+ )
176
+ features['cust_std_days'] = safe_float(
177
+ cust_agg.get('cust_std_days'),
178
+ DEFAULTS['cust_std_days']
179
+ )
180
+ features['cust_min_days'] = safe_int(
181
+ cust_agg.get('cust_min_days'),
182
+ DEFAULTS['cust_min_days']
183
+ )
184
+ features['cust_max_days'] = safe_int(
185
+ cust_agg.get('cust_max_days'),
186
+ DEFAULTS['cust_max_days']
187
+ )
188
+ features['cust_invoice_count'] = safe_int(
189
+ cust_agg.get('cust_invoice_count'),
190
+ DEFAULTS['cust_invoice_count']
191
+ )
192
+ features['cust_avg_amount'] = safe_float(
193
+ cust_agg.get('cust_avg_amount'),
194
+ DEFAULTS['cust_avg_amount']
195
+ )
196
+ features['cust_total_amount'] = safe_float(
197
+ cust_agg.get('cust_total_amount'),
198
+ DEFAULTS['cust_total_amount']
199
+ )
200
+
201
+ # ============================================
202
+ # Payment Terms Aggregates
203
+ # ============================================
204
+
205
+ features['payment_terms_avg_days'] = safe_float(
206
+ pmt_agg.get('payment_terms_avg_days'),
207
+ DEFAULTS['payment_terms_avg_days']
208
+ )
209
+ features['payment_terms_median_days'] = safe_float(
210
+ pmt_agg.get('payment_terms_median_days'),
211
+ DEFAULTS['payment_terms_median_days']
212
+ )
213
+ features['payment_terms_count'] = safe_int(
214
+ pmt_agg.get('payment_terms_count'),
215
+ DEFAULTS['payment_terms_count']
216
+ )
217
+
218
+ # ============================================
219
+ # Business Code Aggregates
220
+ # ============================================
221
+
222
+ features['business_avg_days'] = safe_float(
223
+ biz_agg.get('business_avg_days'),
224
+ DEFAULTS['business_avg_days']
225
+ )
226
+ features['business_median_days'] = safe_float(
227
+ biz_agg.get('business_median_days'),
228
+ DEFAULTS['business_median_days']
229
+ )
230
+ features['business_count'] = safe_int(
231
+ biz_agg.get('business_count'),
232
+ DEFAULTS['business_count']
233
+ )
234
+
235
+ # ============================================
236
+ # Interaction Features
237
+ # ============================================
238
+
239
+ cust_avg_amt = features['cust_avg_amount']
240
+ if cust_avg_amt > 0:
241
+ features['amount_vs_cust_avg'] = float(amount / cust_avg_amt)
242
+ else:
243
+ features['amount_vs_cust_avg'] = 1.0
244
+
245
+ features['is_large_for_customer'] = 1 if amount > cust_avg_amt * 1.5 else 0
246
+
247
+ # ============================================
248
+ # Other required fields
249
+ # ============================================
250
+
251
+ features['isOpen'] = safe_int(invoice_data.get('is_open'), 1)
252
+ features['posting_id'] = safe_float(invoice_data.get('posting_id'), 1.0)
253
+
254
+ return features
255
+
256
+
257
+ def features_to_dataframe(features: Dict) -> pd.DataFrame:
258
+ """
259
+ Convert feature dict to DataFrame with correct column order.
260
+ Must match training feature order exactly.
261
+ """
262
+
263
+ # Expected column order from training
264
+ COLUMN_ORDER = [
265
+ 'business_code', 'buisness_year', 'document_create_date.1',
266
+ 'invoice_currency', 'document_type', 'total_open_amount',
267
+ 'cust_payment_terms', 'isOpen', 'posting_year', 'posting_month',
268
+ 'posting_quarter', 'posting_day', 'posting_dayofweek',
269
+ 'posting_is_weekend', 'posting_is_month_end', 'posting_is_month_start',
270
+ 'days_posting_to_due', 'days_create_to_posting', 'days_baseline_to_posting',
271
+ 'amount_log', 'amount_category', 'cust_avg_days', 'cust_median_days',
272
+ 'cust_std_days', 'cust_min_days', 'cust_max_days', 'cust_invoice_count',
273
+ 'cust_avg_amount', 'cust_total_amount', 'payment_terms_avg_days',
274
+ 'payment_terms_median_days', 'payment_terms_count', 'business_avg_days',
275
+ 'business_median_days', 'business_count', 'amount_vs_cust_avg',
276
+ 'is_large_for_customer'
277
+ ]
278
+
279
+ # Ensure all columns present with safe defaults
280
+ for col in COLUMN_ORDER:
281
+ if col not in features:
282
+ features[col] = 0.0 # Fallback
283
+
284
+ # Create DataFrame with correct order
285
+ df = pd.DataFrame([features])[COLUMN_ORDER]
286
+
287
+ return df
288
+
289
+
290
+ if __name__ == "__main__":
291
+ # Test with minimal data
292
+ test_invoice = {
293
+ 'posting_date': '2024-01-15',
294
+ 'total_open_amount': 50000.0,
295
+ 'business_code': 'U001',
296
+ 'cust_payment_terms': 'NAH4',
297
+ 'invoice_currency': 'USD',
298
+ 'document_type': 'RV',
299
+ 'business_year': 2024,
300
+ 'days_posting_to_due': 15,
301
+ 'is_open': 1
302
+ }
303
+
304
+ # Test with no aggregates (should use defaults)
305
+ features = build_features(test_invoice, None, None, None)
306
+ df = features_to_dataframe(features)
307
+
308
+ print("✅ Features built successfully:")
309
+ print(f"Shape: {df.shape}")
310
+ print(f"Columns: {len(df.columns)}")
311
+ print(f"\nSample features:")
312
+ print(df[['cust_avg_days', 'payment_terms_avg_days', 'business_avg_days']].T)
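A hedged sketch of the inference path this feature builder feeds; the model filename is hypothetical, and the aggregate dicts would normally come from the SQLite lookup queries rather than literals:

import joblib

from backend.feature_builder.feature_builder import build_features, features_to_dataframe

model = joblib.load("models/lgbm_days_to_clear.joblib")  # hypothetical path

invoice = {
    "posting_date": "2024-01-15",
    "total_open_amount": 50000.0,
    "business_code": "U001",
    "cust_payment_terms": "NAH4",
}
customer_agg = {"cust_avg_days": 21.5, "cust_invoice_count": 12}  # from customer_aggregates

X = features_to_dataframe(build_features(invoice, customer_agg, None, None))
predicted_days = float(model.predict(X)[0])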
backend/ingest/__init__.py ADDED
File without changes
backend/ingest/ingest_invoice_sqlite.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Invoice ingestion helper for SQLite.
3
+ Handles insert/update with computed fields.
4
+ """
5
+
6
+ import sqlite3
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Dict, Optional
10
+ from filelock import FileLock
11
+
12
+ DB_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db"
13
+ LOCK_PATH = Path(__file__).parent.parent.parent / "data" / "invoices.db.lock"
14
+
15
+ def parse_date(date_input) -> Optional[str]:
16
+ """Convert various date formats to ISO string."""
17
+ if not date_input:
18
+ return None
19
+
20
+ if isinstance(date_input, str):
21
+ # Try parsing common formats
22
+ for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y%m%d"]:
23
+ try:
24
+ dt = datetime.strptime(date_input, fmt)
25
+ return dt.strftime("%Y-%m-%d %H:%M:%S")
26
+ except ValueError:
27
+ continue
28
+ return date_input # Return as-is if parsing fails
29
+
30
+ if isinstance(date_input, datetime):
31
+ return date_input.strftime("%Y-%m-%d %H:%M:%S")
32
+
33
+ return str(date_input)
34
+
35
+
36
+ def compute_days_diff(date1_str: Optional[str], date2_str: Optional[str]) -> Optional[int]:
37
+ """Compute day difference between two ISO date strings."""
38
+ if not date1_str or not date2_str:
39
+ return None
40
+
41
+ try:
42
+ d1 = datetime.strptime(date1_str, "%Y-%m-%d %H:%M:%S")
43
+ d2 = datetime.strptime(date2_str, "%Y-%m-%d %H:%M:%S")
44
+ return (d1 - d2).days
45
+ except:
46
+ return None
47
+
48
+
49
+ def ingest_invoice(invoice_data: Dict) -> Dict:
50
+ """
51
+ Insert or update invoice in SQLite with computed fields.
52
+
53
+ Args:
54
+ invoice_data: Dict with invoice fields
55
+
56
+ Returns:
57
+ Dict with status and invoice_id
58
+ """
59
+
60
+ # Parse dates
61
+ posting_date = parse_date(invoice_data.get("posting_date"))
62
+ clear_date = parse_date(invoice_data.get("clear_date"))
63
+ due_in_date = parse_date(invoice_data.get("due_in_date"))
64
+ document_create_date = parse_date(invoice_data.get("document_create_date"))
65
+ baseline_create_date = parse_date(invoice_data.get("baseline_create_date"))
66
+
67
+ # Compute derived fields
68
+ days_to_clear = compute_days_diff(clear_date, posting_date) if clear_date else None
69
+ days_posting_to_due = compute_days_diff(due_in_date, posting_date)
70
+ days_create_to_posting = compute_days_diff(posting_date, document_create_date)
71
+ days_baseline_to_posting = compute_days_diff(posting_date, baseline_create_date)
72
+
73
+ is_open = 0 if clear_date else 1
74
+ is_overdue = 0
75
+ if clear_date and due_in_date:
76
+ try:
77
+ cd = datetime.strptime(clear_date, "%Y-%m-%d %H:%M:%S")
78
+ dd = datetime.strptime(due_in_date, "%Y-%m-%d %H:%M:%S")
79
+ is_overdue = 1 if cd > dd else 0
80
+ except:
81
+ pass
82
+
83
+ # Prepare data
84
+ invoice_id = invoice_data.get("invoice_id")
85
+ if not invoice_id:
86
+ raise ValueError("invoice_id is required")
87
+
88
+ # SQLite write with lock
89
+ with FileLock(str(LOCK_PATH)):
90
+ conn = sqlite3.connect(str(DB_PATH))
91
+ cursor = conn.cursor()
92
+
93
+ cursor.execute("""
94
+ INSERT OR REPLACE INTO invoices_history (
95
+ invoice_id, business_code, cust_number, name_customer,
96
+ posting_date, document_create_date, document_create_date_alt,
97
+ due_in_date, baseline_create_date, clear_date,
98
+ total_open_amount, invoice_currency, document_type,
99
+ cust_payment_terms, posting_id, business_year,
100
+ days_to_clear, days_posting_to_due, days_create_to_posting,
101
+ days_baseline_to_posting, is_overdue, is_open,
102
+ updated_at
103
+ ) VALUES (
104
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
105
+ ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP
106
+ )
107
+ """, (
108
+ invoice_id,
109
+ invoice_data.get("business_code"),
110
+ invoice_data.get("cust_number"),
111
+ invoice_data.get("name_customer"),
112
+ posting_date,
113
+ document_create_date,
114
+ invoice_data.get("document_create_date_alt"),
115
+ due_in_date,
116
+ baseline_create_date,
117
+ clear_date,
118
+ invoice_data.get("total_open_amount"),
119
+ invoice_data.get("invoice_currency", "USD"),
120
+ invoice_data.get("document_type"),
121
+ invoice_data.get("cust_payment_terms"),
122
+ invoice_data.get("posting_id"),
123
+ invoice_data.get("business_year"),
124
+ days_to_clear,
125
+ days_posting_to_due,
126
+ days_create_to_posting,
127
+ days_baseline_to_posting,
128
+ is_overdue,
129
+ is_open
130
+ ))
131
+
132
+ conn.commit()
133
+ conn.close()
134
+
135
+ return {
136
+ "status": "success",
137
+ "invoice_id": invoice_id,
138
+ "is_open": bool(is_open),
139
+ "days_to_clear": days_to_clear
140
+ }
141
+
142
+
143
+ if __name__ == "__main__":
144
+ # Test
145
+ test_invoice = {
146
+ "invoice_id": 12345,
147
+ "business_code": "U001",
148
+ "cust_number": "0200769623",
149
+ "name_customer": "Test Customer",
150
+ "posting_date": "2024-01-15",
151
+ "clear_date": "2024-02-01",
152
+ "due_in_date": "2024-01-30",
153
+ "total_open_amount": 50000.0,
154
+ "cust_payment_terms": "NAH4"
155
+ }
156
+
157
+ result = ingest_invoice(test_invoice)
158
+ print(result)
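Because aggregates are refreshed only by the ETL job, a hedged sketch of chaining the two so a newly cleared invoice is reflected immediately:

from backend.etl.update_customer_aggregates_sqlite import update_customer_aggregates
from backend.ingest.ingest_invoice_sqlite import ingest_invoice

result = ingest_invoice({
    "invoice_id": 67890,
    "business_code": "U001",
    "cust_number": "0200769623",
    "posting_date": "2024-03-01",
    "clear_date": "2024-03-20",
    "due_in_date": "2024-03-16",
    "total_open_amount": 12500.0,
})
if result["status"] == "success":
    update_customer_aggregates()  # fold the new cleared invoice into the stats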
backend/worker/job_processor.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Background worker for processing ingest jobs.
3
+ Consumes jobs from Redis queue and processes them.
4
+ """
5
+
6
+ import sqlite3
7
+ import logging
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from filelock import FileLock
11
+ from typing import Dict, Optional
12
+ import traceback
13
+
14
+ from .text_extractor import extract_text
15
+
16
+ # Setup logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
20
+ )
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Paths
24
+ BASE_DIR = Path(__file__).parent.parent.parent
25
+ DB_PATH = BASE_DIR / "data" / "invoices.db"
26
+ LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
27
+
28
+
29
+ def update_job_status(job_id: str, status: str, error_message: Optional[str] = None):
30
+ """Update job status in database."""
31
+ with FileLock(str(LOCK_PATH), timeout=10):
32
+ conn = sqlite3.connect(str(DB_PATH))
33
+ cursor = conn.cursor()
34
+
35
+ if status == "processing":
36
+ cursor.execute("""
37
+ UPDATE ingest_jobs
38
+ SET status = ?, started_at = CURRENT_TIMESTAMP
39
+ WHERE job_id = ?
40
+ """, (status, job_id))
41
+ elif status == "completed":
42
+ cursor.execute("""
43
+ UPDATE ingest_jobs
44
+ SET status = ?, completed_at = CURRENT_TIMESTAMP
45
+ WHERE job_id = ?
46
+ """, (status, job_id))
47
+ elif status == "failed":
48
+ cursor.execute("""
49
+ UPDATE ingest_jobs
50
+ SET status = ?, error_message = ?, completed_at = CURRENT_TIMESTAMP
51
+ WHERE job_id = ?
52
+ """, (status, error_message, job_id))
53
+
54
+ conn.commit()
55
+ conn.close()
56
+
57
+
58
+ def save_extraction(document_id: int, raw_text: str, metadata: Dict):
59
+ """Save extracted text to database."""
60
+ with FileLock(str(LOCK_PATH), timeout=10):
61
+ conn = sqlite3.connect(str(DB_PATH))
62
+ cursor = conn.cursor()
63
+
64
+ cursor.execute("""
65
+ INSERT INTO extractions (
66
+ document_id,
67
+ raw_text,
68
+ page_count,
69
+ extraction_method,
70
+ confidence_score
71
+ ) VALUES (?, ?, ?, ?, ?)
72
+ """, (
73
+ document_id,
74
+ raw_text,
75
+ metadata.get('page_count'),
76
+ metadata.get('extraction_method'),
77
+ metadata.get('confidence_score')
78
+ ))
79
+
80
+ conn.commit()
81
+ conn.close()
82
+
83
+
84
+ def process_job(job_data: Dict):
85
+ """
86
+ Process a single ingest job.
87
+
88
+ Args:
89
+ job_data: Dict with job_id, document_id, file_path, mime_type
90
+ """
91
+ job_id = job_data['job_id']
92
+ document_id = job_data['document_id']
93
+ file_path = Path(job_data['file_path'])
94
+ mime_type = job_data['mime_type']
95
+
96
+ logger.info(f"Processing job {job_id} for document {document_id}")
97
+
98
+ try:
99
+ # Update status to processing
100
+ update_job_status(job_id, "processing")
101
+
102
+ # Extract text
103
+ logger.info(f"Extracting text from {file_path}")
104
+ raw_text, metadata = extract_text(file_path, mime_type)
105
+
106
+ if not raw_text or len(raw_text.strip()) < 10:
107
+ raise ValueError("No text extracted or text too short")
108
+
109
+ logger.info(f"Extracted {len(raw_text)} characters, {metadata['page_count']} pages")
110
+
111
+ # Save to database
112
+ save_extraction(document_id, raw_text, metadata)
113
+
114
+ # Update status to completed
115
+ update_job_status(job_id, "completed")
116
+
117
+ logger.info(f"Job {job_id} completed successfully")
118
+
119
+ except Exception as e:
120
+ error_msg = f"{type(e).__name__}: {str(e)}"
121
+ logger.error(f"Job {job_id} failed: {error_msg}")
122
+ logger.error(traceback.format_exc())
123
+
124
+ # Update status to failed
125
+ update_job_status(job_id, "failed", error_msg)
126
+ raise
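For reference, a hedged sketch of the producer side that feeds this worker; the job_id and file_path values are hypothetical, and the Redis settings mirror backend/worker/worker.py:

import os

from redis import Redis
from rq import Queue

from backend.worker.job_processor import process_job

redis_conn = Redis(
    host=os.getenv("REDIS_HOST", "redis"),
    port=int(os.getenv("REDIS_PORT", 6379)),
)
queue = Queue(os.getenv("REDIS_QUEUE_NAME", "invoice_ingest"), connection=redis_conn)

queue.enqueue(process_job, {
    "job_id": "hypothetical-uuid",   # generated by the upload API
    "document_id": 1,
    "file_path": "/app/data/docs/invoice.pdf",
    "mime_type": "application/pdf",
})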
backend/worker/text_extractor.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Text extraction utilities for PDF and images.
3
+ Supports both digital PDFs and scanned documents (OCR).
4
+ """
5
+
6
+ import pdfplumber
7
+ import fitz # PyMuPDF
8
+ import pytesseract
9
+ from PIL import Image
10
+ from pathlib import Path
11
+ from typing import Dict, Tuple
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
18
+ """
19
+ Extract text from PDF using pdfplumber (for digital PDFs).
20
+
21
+ Returns:
22
+ (raw_text, metadata)
23
+ """
24
+ try:
25
+ text_pages = []
26
+ page_count = 0
27
+
28
+ with pdfplumber.open(str(file_path)) as pdf:
29
+ page_count = len(pdf.pages)
30
+
31
+ for page in pdf.pages:
32
+ text = page.extract_text()
33
+ if text:
34
+ text_pages.append(text)
35
+
36
+ raw_text = "\n\n".join(text_pages)
37
+
38
+ metadata = {
39
+ "page_count": page_count,
40
+ "extraction_method": "pdfplumber",
41
+ "confidence_score": 1.0 if len(raw_text) > 50 else 0.5
42
+ }
43
+
44
+ # If no text extracted, it might be a scanned PDF
45
+ if not raw_text.strip():
46
+ logger.info("No text found with pdfplumber, trying OCR...")
47
+ return extract_text_from_pdf_ocr(file_path)
48
+
49
+ return raw_text, metadata
50
+
51
+ except Exception as e:
52
+ logger.error(f"PDF extraction failed: {e}")
53
+ raise
54
+
55
+
56
+ def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
57
+ """
58
+ Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).
59
+ """
60
+ try:
61
+ text_pages = []
62
+ doc = fitz.open(str(file_path))
63
+ page_count = len(doc)
64
+
65
+ for page_num in range(page_count):
66
+ page = doc[page_num]
67
+ # Convert page to image
68
+ pix = page.get_pixmap(dpi=300)
69
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
70
+
71
+ # OCR
72
+ text = pytesseract.image_to_string(img)
73
+ text_pages.append(text)
74
+
75
+ doc.close()
76
+ raw_text = "\n\n".join(text_pages)
77
+
78
+ metadata = {
79
+ "page_count": page_count,
80
+ "extraction_method": "tesseract_ocr",
81
+ "confidence_score": 0.7 # OCR typically less confident
82
+ }
83
+
84
+ return raw_text, metadata
85
+
86
+ except Exception as e:
87
+ logger.error(f"OCR extraction failed: {e}")
88
+ raise
89
+
90
+
91
+ def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
92
+ """
93
+ Extract text from image using OCR (Tesseract).
94
+ """
95
+ try:
96
+ img = Image.open(str(file_path))
97
+ raw_text = pytesseract.image_to_string(img)
98
+
99
+ metadata = {
100
+ "page_count": 1,
101
+ "extraction_method": "tesseract_ocr",
102
+ "confidence_score": 0.7
103
+ }
104
+
105
+ return raw_text, metadata
106
+
107
+ except Exception as e:
108
+ logger.error(f"Image OCR failed: {e}")
109
+ raise
110
+
111
+
112
+ def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
113
+ """
114
+ Main entry point for text extraction.
115
+ Routes to appropriate extractor based on file type.
116
+
117
+ Args:
118
+ file_path: Path to document
119
+ mime_type: MIME type of document
120
+
121
+ Returns:
122
+ (raw_text, metadata_dict)
123
+ """
124
+ if mime_type == "application/pdf":
125
+ return extract_text_from_pdf(file_path)
126
+ elif mime_type in ["image/png", "image/jpeg", "image/jpg"]:
127
+ return extract_text_from_image(file_path)
128
+ else:
129
+ raise ValueError(f"Unsupported file type: {mime_type}")
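A short usage sketch, assuming python-magic (already pinned in requirements.txt) supplies the MIME type:

from pathlib import Path

import magic

from backend.worker.text_extractor import extract_text

path = Path("/app/data/docs/invoice.pdf")  # hypothetical document
mime_type = magic.from_file(str(path), mime=True)
raw_text, meta = extract_text(path, mime_type)
print(meta["extraction_method"], meta["page_count"], len(raw_text))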
backend/worker/worker.py ADDED
@@ -0,0 +1,30 @@
1
+
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from redis import Redis
7
+ from rq import Worker, Connection  # Queue is unused here; enqueueing happens on the API side
8
+
9
+ # Add parent directory to path
10
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
11
+
12
+ # Import job processor
13
+ from backend.worker.job_processor import process_job
14
+
15
+ # Redis connection
16
+ REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
17
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
18
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
19
+ QUEUE_NAME = os.getenv('REDIS_QUEUE_NAME', 'invoice_ingest')
20
+
21
+ redis_conn = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
22
+
23
+
24
+ if __name__ == '__main__':
25
+ print(f"🚀 Starting worker for queue: {QUEUE_NAME}")
26
+ print(f"📡 Redis: {REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}")
27
+
28
+ with Connection(redis_conn):
29
+ worker = Worker([QUEUE_NAME])
30
+ worker.work()
requirements.txt ADDED
@@ -0,0 +1,47 @@
1
+ # Web Framework
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ pydantic==2.5.0
5
+
6
+ # ML & Data
7
+ pandas==2.1.3
8
+ numpy==1.26.2
9
+ scikit-learn==1.6.1
10
+
11
+ lightgbm==4.1.0
12
+ joblib==1.3.2
13
+
14
+ # Utilities
15
+ python-dateutil==2.8.2
16
+ filelock==3.13.1
17
+ python-multipart==0.0.6
18
+
19
+ # Testing (optional)
20
+ httpx==0.25.2
21
+ pytest==7.4.3
22
+
23
+ redis==5.0.1
24
+ rq==1.15.1
25
+
26
+ # NEW: Text extraction (Item 1)
27
+ pdfplumber==0.10.3
28
+ PyMuPDF==1.23.8
29
+ pytesseract==0.3.10
30
+ Pillow==10.1.0
31
+
32
+ # NEW: Utilities
33
+ python-magic==0.4.27 # File type detection
34
+ # uuid removed: it is part of the Python standard library, and the PyPI "uuid" package is an obsolete Python 2 shim
35
+ requests==2.31.0
36
+ python-dotenv==1.0.0
37
+ # NEW: Database
38
+ psycopg2-binary==2.9.7
39
+ SQLAlchemy==2.0.20
40
+ alembic==1.11.1
41
+
42
+
43
+ google-generativeai>=0.8.0
44
+
45
+
46
+
47
+