# backend/app/api/ingest.py
# Commit aef305f ("Update backend/app/api/ingest.py", verified)
import os
import uuid
import json
import sqlite3
import logging
import csv
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, List, Any
from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
from pydantic import BaseModel
from filelock import FileLock
import httpx
import re
import sys
# Root logger setup: force=True overrides any handlers installed earlier
# (e.g. by uvicorn) so this format wins; logs go to stdout so container /
# HF Space log capture picks them up.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True
)
logger = logging.getLogger(__name__)

# Filesystem layout: <repo_root>/data holds uploaded documents, the SQLite
# database, and its companion lock file.
BASE_DIR = Path(__file__).parent.parent.parent.parent
STORAGE_PATH = Path(os.getenv('STORAGE_PATH', str(BASE_DIR / "data" / "docs")))
DB_PATH = Path(os.getenv('DB_PATH', str(BASE_DIR / "data" / "invoices.db")))
LOCK_PATH = BASE_DIR / "data" / "invoices.db.lock"
# Env-overridable for parity with TEXT_EXTRACTOR_URL / TABLE_EXTRACTOR_URL /
# NER_URL below; defaults to the local prediction service.
PREDICT_ENDPOINT = os.getenv('PREDICT_ENDPOINT', 'http://localhost:7860/predict')
STORAGE_PATH.mkdir(parents=True, exist_ok=True)
router = APIRouter(prefix="/api", tags=["ingest"])
def _init_db_tables():
    """Create the SQLite schema if it does not exist yet.

    Runs at module import because HF Spaces start from a fresh filesystem.
    Serialized via the shared file lock so concurrent workers do not race on
    CREATE TABLE.  Never raises: any failure is logged and swallowed so the
    module can still be imported and report errors at request time.
    """
    try:
        logger.info("🔍 Checking if database tables exist...")
        DB_PATH.parent.mkdir(parents=True, exist_ok=True)
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            try:
                cursor = conn.cursor()
                # Fast path: if the first table exists, assume the whole
                # schema was created by a previous run.
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='ingest_jobs'")
                if cursor.fetchone():
                    logger.info("✅ Database tables already exist")
                    return
                logger.warning("⚠️ Database tables not found, creating...")
                # Create all tables
                tables_sql = [
                    """CREATE TABLE IF NOT EXISTS ingest_jobs (
                    job_id TEXT PRIMARY KEY,
                    doc_id TEXT,
                    filename TEXT NOT NULL,
                    status TEXT NOT NULL DEFAULT 'queued',
                    error_text TEXT,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )""",
                    """CREATE TABLE IF NOT EXISTS documents (
                    doc_id TEXT PRIMARY KEY,
                    job_id TEXT NOT NULL,
                    path TEXT NOT NULL,
                    filename TEXT NOT NULL,
                    content_type TEXT NOT NULL,
                    uploaded_at TEXT DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
                    )""",
                    """CREATE TABLE IF NOT EXISTS extractions (
                    doc_id TEXT PRIMARY KEY,
                    raw_text TEXT,
                    tables_json TEXT,
                    entities_json TEXT,
                    classification_json TEXT,
                    summary_text TEXT,
                    extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
                    )""",
                    """CREATE TABLE IF NOT EXISTS invoice_fields (
                    invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    doc_id TEXT NOT NULL,
                    cust_number TEXT,
                    posting_date TEXT,
                    total_open_amount REAL,
                    business_code TEXT,
                    cust_payment_terms TEXT,
                    confidence_map TEXT,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (doc_id) REFERENCES documents(doc_id)
                    )""",
                    """CREATE TABLE IF NOT EXISTS batch_jobs (
                    batch_id TEXT PRIMARY KEY,
                    total_files INTEGER,
                    message TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )""",
                    """CREATE TABLE IF NOT EXISTS batch_job_mapping (
                    batch_id TEXT,
                    job_id TEXT,
                    FOREIGN KEY (batch_id) REFERENCES batch_jobs(batch_id),
                    FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
                    )""",
                    # ML feature tables
                    """CREATE TABLE IF NOT EXISTS customer_aggregates (
                    cust_number TEXT PRIMARY KEY,
                    cust_avg_days REAL,
                    cust_median_days REAL,
                    cust_invoice_count INTEGER,
                    last_updated TEXT DEFAULT CURRENT_TIMESTAMP
                    )""",
                    """CREATE TABLE IF NOT EXISTS payment_terms_aggregates (
                    cust_payment_terms TEXT PRIMARY KEY,
                    terms_avg_days REAL,
                    terms_median_days REAL,
                    terms_invoice_count INTEGER,
                    last_updated TEXT DEFAULT CURRENT_TIMESTAMP
                    )""",
                    """CREATE TABLE IF NOT EXISTS business_code_aggregates (
                    business_code TEXT PRIMARY KEY,
                    bc_avg_days REAL,
                    bc_median_days REAL,
                    bc_invoice_count INTEGER,
                    last_updated TEXT DEFAULT CURRENT_TIMESTAMP
                    )""",
                    """CREATE TABLE IF NOT EXISTS predictions_log (
                    prediction_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    invoice_id INTEGER,
                    cust_number TEXT,
                    posting_date TEXT,
                    total_open_amount REAL,
                    business_code TEXT,
                    cust_payment_terms TEXT,
                    predicted_days_to_clear REAL,
                    predicted_clear_date TEXT,
                    model_version TEXT,
                    features_json TEXT,
                    predicted_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )"""
                ]
                # Execute all CREATE TABLE statements
                for sql in tables_sql:
                    cursor.execute(sql)
                # Create indexes
                indexes_sql = [
                    "CREATE INDEX IF NOT EXISTS idx_ingest_jobs_status ON ingest_jobs(status)",
                    "CREATE INDEX IF NOT EXISTS idx_ingest_jobs_created ON ingest_jobs(created_at DESC)",
                    "CREATE INDEX IF NOT EXISTS idx_documents_job_id ON documents(job_id)",
                    "CREATE INDEX IF NOT EXISTS idx_invoice_fields_doc_id ON invoice_fields(doc_id)",
                    "CREATE INDEX IF NOT EXISTS idx_batch_mapping_batch ON batch_job_mapping(batch_id)",
                    "CREATE INDEX IF NOT EXISTS idx_predictions_log_cust ON predictions_log(cust_number)"
                ]
                for sql in indexes_sql:
                    cursor.execute(sql)
                conn.commit()
                logger.info("✅ Database tables created successfully!")
            finally:
                # BUGFIX: always release the connection, even when a CREATE
                # statement raises (previously the connection leaked).
                conn.close()
    except Exception as e:
        # Deliberately swallowed: module import must never fail here.
        logger.error(f"❌ Failed to create tables: {e}")
        import traceback
        logger.error(traceback.format_exc())
# Run on module import: the HF Space container must have a working schema
# before the first request arrives.
logger.info("🔍 Initializing database on module load...")
try:
    _init_db_tables()
    logger.info("✅ Database initialization complete")
except Exception as e:
    # _init_db_tables already swallows its own errors; this guard is
    # belt-and-braces so a failure can never prevent the module import.
    logger.error(f"❌ Database initialization failed: {e}")
    logger.warning("⚠️ Application may not work correctly!")
# ============================================
# LOCAL OCR FALLBACK (EasyOCR + Tesseract)
# ============================================
def extract_text_with_easyocr(file_path: Path) -> tuple:
    """Run EasyOCR over a document image; return (success, text, error).

    EasyOCR is the preferred offline fallback: no network needed, 80+
    languages, GPU or CPU, and generally better invoice accuracy than
    Tesseract.  Model weights download automatically on first use.
    """
    try:
        import easyocr

        logger.info("🔧 Using EasyOCR (best free OCR)...")
        # CPU keeps the deployment portable; flip gpu=True on CUDA hosts.
        ocr = easyocr.Reader(['en'], gpu=False)
        fragments = ocr.readtext(str(file_path), detail=0, paragraph=True)
        text = '\n'.join(fragments)
        if not (text and len(text.strip()) >= 10):
            return False, None, "EasyOCR produced no usable text"
        logger.info(f"✅ EasyOCR extracted {len(text)} characters")
        return True, text, None
    except ImportError:
        logger.warning("⚠️ easyocr not installed. Install with: pip install easyocr")
        return False, None, "easyocr not available"
    except Exception as e:
        logger.error(f"❌ EasyOCR failed: {e}")
        return False, None, str(e)
def extract_text_with_tesseract(file_path: Path) -> tuple:
    """Run Tesseract OCR over a document image; return (success, text, error).

    Secondary fallback after EasyOCR: faster, but usually less accurate.
    """
    try:
        import pytesseract
        from PIL import Image

        logger.info("🔧 Using Tesseract OCR as secondary fallback...")
        page = Image.open(file_path)
        text = pytesseract.image_to_string(page)
        if not (text and len(text.strip()) >= 10):
            return False, None, "Tesseract produced no usable text"
        logger.info(f"✅ Tesseract extracted {len(text)} characters")
        return True, text, None
    except ImportError:
        logger.warning("⚠️ pytesseract not installed. Install with: pip install pytesseract pillow")
        return False, None, "pytesseract not available"
    except Exception as e:
        logger.error(f"❌ Tesseract failed: {e}")
        return False, None, str(e)
def extract_text_with_local_ocr(file_path: Path) -> tuple:
    """Cascade through the local OCR engines; return (success, text, error).

    Order of preference:
      1. EasyOCR   - best accuracy on invoices
      2. Tesseract - faster, less accurate
    Returns a failure tuple only when every engine has failed.
    """
    banner = "=" * 70
    logger.info(banner)
    logger.info("🔄 HF extraction failed - trying local OCR fallbacks...")
    logger.info(banner)

    engines = (
        ("EasyOCR", extract_text_with_easyocr),
        ("Tesseract", extract_text_with_tesseract),
    )
    for name, engine in engines:
        ok, text, err = engine(file_path)
        if ok:
            logger.info(f"✅ {name} succeeded!")
            return True, text, None
        logger.warning(f"⚠️ {name} failed: {err}")

    # All local OCR failed
    logger.error("❌ All local OCR methods failed")
    return False, None, "All local OCR methods failed"
# ============================================
# STEP 1: HF Agent Text Extraction (UPDATED)
# ============================================
def get_agent_headers():
    """Build the Authorization header for HF agent calls.

    Checks HF_TOKEN, HUGGINGFACE_API_TOKEN and AGENT_BEARER_TOKEN in that
    order; returns an empty dict when no (non-empty) token is configured.
    """
    for var in ('HF_TOKEN', 'HUGGINGFACE_API_TOKEN', 'AGENT_BEARER_TOKEN'):
        token = os.getenv(var)
        if token:
            return {'Authorization': f'Bearer {token}'}
    return {}
def get_mime_type(file_path: Path) -> str:
    """Map the file extension to a MIME type; octet-stream for unknowns."""
    known_types = {
        '.pdf': 'application/pdf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
    }
    return known_types.get(file_path.suffix.lower(), 'application/octet-stream')
def call_text_extractor(file_path: Path, max_retries=3):
    """Extract raw text via the HF text-extraction Space.

    Retries with a progressively longer timeout (base, base+60, base+120)
    and falls back to local OCR (EasyOCR then Tesseract) once every HF
    attempt has failed — including 200 responses that carry no usable text.

    Returns (success, text, error).
    """
    url = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
    for attempt in range(max_retries):
        # Progressive timeout: 120s, 180s, 240s
        timeout = base_timeout + (60 * attempt)
        try:
            logger.info(f"📄 Extracting text from {file_path.name} (attempt {attempt + 1}/{max_retries}, timeout={timeout}s)...")
            filename = file_path.name
            mime_type = get_mime_type(file_path)
            with open(file_path, 'rb') as f:
                files = {'file': (filename, f, mime_type)}
                data = {
                    'filename': filename,
                    'start_page': 1,
                    'end_page': 1
                }
                headers = get_agent_headers()
                response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
            if response.status_code == 200:
                result = response.json()
                # The Space has used several response keys over time.
                text = result.get('result') or result.get('text') or result.get('extracted_text') or ''
                if text and len(text.strip()) >= 10:
                    logger.info(f"✅ Extracted {len(text)} characters")
                    return True, text, None
                # BUGFIX: a final 200-but-empty response previously returned
                # failure without trying local OCR; now every exhausted path
                # falls through to the fallback below.
                logger.warning("⚠️ No text extracted from response")
            else:
                logger.warning(f"⚠️ HTTP {response.status_code}: {response.text[:200]}")
        except httpx.TimeoutException:
            logger.warning(f"⚠️ Timeout after {timeout}s on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                logger.info("🔄 Retrying with longer timeout...")
        except Exception as e:
            logger.error(f"❌ Error on attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                logger.info("🔄 Retrying...")
    # All retries failed - try local OCR fallback
    logger.warning(f"⚠️ All {max_retries} HF extraction attempts failed, trying local OCR fallback...")
    return extract_text_with_local_ocr(file_path)
def call_table_extractor(file_path: Path, max_retries=2):
    """Extract tables via the HF table-extraction Space.

    Tables are nice-to-have, so this uses fewer retries than the text
    extractor and on failure returns (False, [], message) instead of
    aborting the pipeline.
    """
    url = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
    for attempt in range(max_retries):
        timeout = base_timeout + (60 * attempt)
        try:
            logger.info(f"📊 Extracting tables from {file_path.name} (attempt {attempt + 1}/{max_retries})...")
            mime_type = get_mime_type(file_path)
            form_fields = {
                'filename': file_path.name,
                'start_page': 1,
                'end_page': 1
            }
            with open(file_path, 'rb') as fh:
                upload = {'file': (file_path.name, fh, mime_type)}
                response = httpx.post(
                    url,
                    files=upload,
                    data=form_fields,
                    headers=get_agent_headers(),
                    timeout=timeout,
                )
            if response.status_code == 200:
                body = response.json()
                tables = body.get('result') or body.get('tables') or []
                logger.info(f"✅ Extracted {len(tables)} tables")
                return True, tables, None
            logger.warning(f"⚠️ HTTP {response.status_code}")
        except httpx.TimeoutException:
            logger.warning(f"⚠️ Table extraction timeout on attempt {attempt + 1}")
        except Exception as e:
            logger.warning(f"⚠️ Table extraction error: {e}")
    # Non-critical - return empty list
    logger.info("ℹ️ Table extraction failed, continuing without tables")
    return False, [], "Table extraction failed (non-critical)"
# ============================================
# STEP 2: HF NER (Named Entity Recognition)
# ============================================
def call_ner(text: str, file_path: Path = None, max_retries=2) -> tuple:
    """Run the HF NER agent; return (success, entities, entity_map, error).

    Sends the document file plus a text excerpt when the file exists,
    otherwise just the text as form data.  entity_map groups surface text
    by entity type (PERSON/ORG/DATE/MONEY/CARDINAL) for downstream
    consumers.  Non-critical: after all retries the pipeline continues
    without entities.
    """
    url = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
    base_timeout = int(os.getenv('AGENT_TIMEOUT_SECONDS', '120'))
    for attempt in range(max_retries):
        timeout = base_timeout + (30 * attempt)
        try:
            logger.info(f"🔍 Running NER to find entities (attempt {attempt + 1}/{max_retries})...")
            headers = get_agent_headers()
            # NER expects multipart/form-data with file OR text
            if file_path and file_path.exists():
                # Send file
                filename = file_path.name
                mime_type = get_mime_type(file_path)
                with open(file_path, 'rb') as f:
                    files = {'file': (filename, f, mime_type)}
                    data = {
                        'text': text[:5000],
                        'filename': filename,
                        'start_page': 1,
                        'end_page': 1
                    }
                    response = httpx.post(url, files=files, data=data, headers=headers, timeout=timeout)
            else:
                # Send just text as form data
                data = {
                    'text': text[:5000],
                    'filename': 'document.txt',
                    'start_page': 1,
                    'end_page': 1
                }
                response = httpx.post(url, data=data, headers=headers, timeout=timeout)
            if response.status_code == 200:
                result = response.json()
                # Some deployments double-encode the payload as a JSON string.
                if isinstance(result, str):
                    try:
                        result = json.loads(result)
                    # BUGFIX: was a bare `except:` that swallowed everything
                    # including KeyboardInterrupt/SystemExit.
                    except json.JSONDecodeError:
                        logger.warning(f"⚠️ NER returned unparseable string: {result[:100]}")
                        if attempt < max_retries - 1:
                            continue
                        return False, [], {}, "Invalid response format"
                # Extract entities
                entities = result.get('entities') or result.get('result') or []
                # Handle case where entities might also be a string
                if isinstance(entities, str):
                    try:
                        entities = json.loads(entities)
                    except json.JSONDecodeError:  # was a bare except
                        entities = []
                logger.info(f"✅ Found {len(entities)} entities")
                # Group entities by type
                entity_map = {
                    'PERSON': [],
                    'ORG': [],
                    'DATE': [],
                    'MONEY': [],
                    'CARDINAL': []
                }
                for entity in entities:
                    if not isinstance(entity, dict):
                        continue
                    # Responses use entity_type/text or label/word keys.
                    ent_type = entity.get('entity_type') or entity.get('label')
                    ent_text = entity.get('text') or entity.get('word')
                    if ent_type in entity_map and ent_text:
                        entity_map[ent_type].append(ent_text)
                logger.info(f"📋 Entity summary: PERSON={len(entity_map['PERSON'])}, ORG={len(entity_map['ORG'])}, DATE={len(entity_map['DATE'])}, MONEY={len(entity_map['MONEY'])}")
                return True, entities, entity_map, None
            logger.warning(f"⚠️ NER HTTP {response.status_code}")
        except httpx.TimeoutException:
            logger.warning(f"⚠️ NER timeout on attempt {attempt + 1}")
        except Exception as e:
            logger.error(f"❌ NER error on attempt {attempt + 1}: {e}")
    # NER failed - return empty (non-critical)
    logger.warning("⚠️ NER failed after retries, continuing without entities")
    return False, [], {}, "NER failed (non-critical)"
# ============================================
# STEP 3: Gemini Intelligent Mapping
# ============================================
def map_with_gemini(text: str, entities: List, entity_map: Dict, tables: List):
    """Ask Gemini to map extracted text/entities/tables to invoice fields.

    Returns (success, result_dict, error).  On success result_dict carries
    customer_name, invoice_number, date, total_amount, payment_terms and a
    free-text "reasoning" string, per the JSON schema in the prompt.  Fails
    soft — (False, None, message) — when the API key is absent, the SDK
    raises, or the response is not valid JSON.
    """
    try:
        import google.generativeai as genai
        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            logger.warning("⚠️ No Gemini API key configured")
            return False, None, "No Gemini API key"
        logger.info("🧠 Using Gemini for intelligent field mapping...")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('models/gemini-2.5-flash')
        # Build context for Gemini: truncated text plus NER output and at
        # most two tables to keep the prompt small.
        context = f"""
EXTRACTED TEXT:
{text[:3000]}
NAMED ENTITIES FOUND:
- Organizations: {entity_map.get('ORG', [])}
- People: {entity_map.get('PERSON', [])}
- Dates: {entity_map.get('DATE', [])}
- Money amounts: {entity_map.get('MONEY', [])}
- Numbers: {entity_map.get('CARDINAL', [])}
TABLES:
{json.dumps(tables[:2], indent=2) if tables else 'None'}
"""
        prompt = f"""You are an expert at analyzing invoice data. Given the extracted text and entities below, map them to invoice fields.
{context}
Analyze the above data and return ONLY a valid JSON object with these exact fields:
{{
"customer_name": "the client/customer company name (check ORG entities first)",
"invoice_number": "the invoice number (check CARDINAL entities)",
"date": "invoice date in YYYY-MM-DD format (check DATE entities)",
"total_amount": numeric total amount only (check MONEY entities, no currency symbol),
"payment_terms": "payment terms like NET30, NET60, or NAH4 if not found",
"reasoning": "brief explanation of how you identified each field"
}}
Rules:
1. Prefer entities over raw text when available
2. Customer name is usually the first ORG after "Bill To" or "Client"
3. Total amount is usually the largest MONEY value
4. Date should be in YYYY-MM-DD format
5. If uncertain, use these defaults: customer_name="UNKNOWN", date="2024-01-01", total_amount=0.0, payment_terms="NAH4"
Return ONLY the JSON object, no markdown, no explanation outside the JSON."""
        response = model.generate_content(prompt)
        text_response = response.text.strip()
        # Remove markdown if present (Gemini often wraps output in ```json fences).
        text_response = text_response.replace('```json', '').replace('```', '').strip()
        result = json.loads(text_response)
        logger.info(f"✅ Gemini mapped: Customer={result.get('customer_name')}, Amount=${result.get('total_amount')}")
        logger.info(f"💡 Reasoning: {result.get('reasoning', 'N/A')[:100]}")
        return True, result, None
    except json.JSONDecodeError as e:
        # text_response is always bound here: json.loads is the only raiser
        # of JSONDecodeError in the try block.
        logger.error(f"❌ Gemini returned invalid JSON: {e}")
        logger.error(f"Response: {text_response[:500]}")
        return False, None, f"Invalid JSON: {e}"
    except Exception as e:
        logger.error(f"❌ Gemini mapping failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False, None, str(e)
# ============================================
# Fallback: Regex Mapping
# ============================================
def map_with_regex(text: str, entities: List) -> tuple:
    """Fallback field mapper: NER entities first, then raw-text regexes.

    Returns (fields, confidence): fields holds cust_number, posting_date,
    total_open_amount, cust_payment_terms and business_code; confidence
    maps each field name to a heuristic score in [0, 1].  Always produces
    a complete fields dict, substituting low-confidence defaults when
    nothing can be found.
    """
    logger.info("🔤 Using regex fallback for field mapping...")
    fields = {}
    confidence = {}

    def _entity_texts(label):
        # Surface text of every entity with the given label; NER responses
        # use either entity_type/text or label/word key pairs.
        return [e.get('text') or e.get('word') for e in entities
                if (e.get('entity_type') or e.get('label')) == label]

    # CUSTOMER NAME - prefer ORG entities over regexes
    org_entities = _entity_texts('ORG')
    if org_entities:
        fields['cust_number'] = org_entities[0][:20]
        confidence['cust_number'] = 0.8
    else:
        # Regex fallback
        client_patterns = [
            r'(?:Client|Bill\s+To|Customer)[:\s]+(.*?)(?:\n|Tax|IBAN)',
            r'(?:customer|client)[\s:]+([A-Za-z][A-Za-z\s,&-]+?)(?:\n|$)',
        ]
        for pattern in client_patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                client = match.group(1).strip()
                # Keep the first word longer than 2 chars as a pseudo id.
                words = [w.strip() for w in client.replace(',', ' ').split() if len(w.strip()) > 2]
                if words:
                    fields['cust_number'] = words[0][:20]
                    confidence['cust_number'] = 0.6
                    break
    if 'cust_number' not in fields:
        fields['cust_number'] = 'UNKNOWN'
        confidence['cust_number'] = 0.1

    # DATE - try DATE entities first
    date_entities = _entity_texts('DATE')
    if date_entities:
        date_str = date_entities[0]
        for fmt in ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d', '%m-%d-%Y']:
            try:
                dt = datetime.strptime(date_str, fmt)
                fields['posting_date'] = dt.strftime('%Y-%m-%d')
                confidence['posting_date'] = 0.8
                break
            # BUGFIX: was a bare `except:`; strptime failures are
            # ValueError (TypeError if the entity text isn't a string).
            except (ValueError, TypeError):
                continue
    if 'posting_date' not in fields:
        date_patterns = [
            r'(?:Date\s+of\s+issue|Invoice\s+Date|Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
        ]
        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                date_str = match.group(1)
                for fmt in ['%m/%d/%Y', '%d/%m/%Y']:
                    try:
                        dt = datetime.strptime(date_str, fmt)
                        fields['posting_date'] = dt.strftime('%Y-%m-%d')
                        confidence['posting_date'] = 0.7
                        break
                    except ValueError:
                        continue
                if 'posting_date' in fields:
                    break
    if 'posting_date' not in fields:
        # Last resort: today's date with near-zero confidence.
        fields['posting_date'] = datetime.now().strftime('%Y-%m-%d')
        confidence['posting_date'] = 0.1

    # AMOUNT - try MONEY entities first
    money_entities = _entity_texts('MONEY')
    if money_entities:
        amounts = []
        for money_str in money_entities:
            try:
                # Remove currency symbols and parse
                amt = float(re.sub(r'[^\d.]', '', money_str))
                if amt > 10:
                    amounts.append(amt)
            except (ValueError, TypeError):  # was a bare except
                pass
        if amounts:
            fields['total_open_amount'] = max(amounts)
            confidence['total_open_amount'] = 0.8
            logger.info(f"✅ Found amount from MONEY entity: ${fields['total_open_amount']}")
    if 'total_open_amount' not in fields:
        # Regex fallback: dollar amounts like $1,234.56; the largest wins.
        pattern = r'\$\s*([0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2})'
        amounts = []
        for match in re.finditer(pattern, text):
            try:
                amt = float(match.group(1).replace(',', ''))
                if amt > 50:
                    amounts.append(amt)
            except ValueError:  # was a bare except
                pass
        if amounts:
            fields['total_open_amount'] = max(amounts)
            confidence['total_open_amount'] = 0.6
        else:
            fields['total_open_amount'] = 0.0
            confidence['total_open_amount'] = 0.0
            logger.warning("⚠️ No amount found!")

    # PAYMENT TERMS like NET30 / N30 / NAH4 (project default when absent).
    terms_match = re.search(r'(NET\s?\d{1,2}|N\d{2}|NAH\d)', text, re.IGNORECASE)
    fields['cust_payment_terms'] = terms_match.group(1).upper() if terms_match else 'NAH4'
    confidence['cust_payment_terms'] = 0.7 if terms_match else 0.2

    # BUSINESS CODE has no reliable textual signal; use the house default.
    fields['business_code'] = 'U001'
    confidence['business_code'] = 0.2
    return fields, confidence
# ============================================
# Database Functions
# ============================================
def update_job_status(job_id: str, status: str, error_text: str = None):
    """Set a job's status (and optional error text), bumping updated_at.

    Serialized through the shared file lock because background tasks and
    request handlers write to the same SQLite file.
    """
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        try:
            conn.execute("""
                UPDATE ingest_jobs
                SET status = ?, error_text = ?, updated_at = CURRENT_TIMESTAMP
                WHERE job_id = ?
            """, (status, error_text, job_id))
            conn.commit()
        finally:
            # BUGFIX: close even when the UPDATE raises, so the DB handle
            # is not leaked while the file lock has already been released.
            conn.close()
def save_extraction(doc_id: str, raw_text: str, tables: list, entities: list, classification: dict, summary: str = None):
    """Upsert the extraction results row for a document.

    tables/entities/classification are stored as JSON strings (NULL when
    empty); INSERT OR REPLACE keeps one row per doc_id.
    """
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        try:
            conn.execute("""
                INSERT OR REPLACE INTO extractions (
                    doc_id, raw_text, tables_json, entities_json,
                    classification_json, summary_text
                ) VALUES (?, ?, ?, ?, ?, ?)
            """, (
                doc_id,
                raw_text,
                json.dumps(tables) if tables else None,
                json.dumps(entities) if entities else None,
                json.dumps(classification) if classification else None,
                summary
            ))
            conn.commit()
        finally:
            # BUGFIX: close in finally so a failed INSERT cannot leak the
            # connection.
            conn.close()
def save_invoice_fields(doc_id: str, fields: Dict, confidence_map: Dict):
    """Append an invoice_fields row for a document.

    Plain INSERT (not upsert): each processing run adds a new row keyed by
    the AUTOINCREMENT invoice_id.  confidence_map is stored as JSON.
    """
    with FileLock(str(LOCK_PATH), timeout=10):
        conn = sqlite3.connect(str(DB_PATH))
        try:
            conn.execute("""
                INSERT INTO invoice_fields (
                    doc_id, cust_number, posting_date, total_open_amount,
                    business_code, cust_payment_terms, confidence_map
                ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (
                doc_id,
                fields.get('cust_number'),
                fields.get('posting_date'),
                fields.get('total_open_amount'),
                fields.get('business_code'),
                fields.get('cust_payment_terms'),
                json.dumps(confidence_map)
            ))
            conn.commit()
        finally:
            # BUGFIX: close in finally so a failed INSERT cannot leak the
            # connection.
            conn.close()
# ============================================
# AGENT MODE FLAG (Environment Variable)
# ============================================
# Defaults to the autonomous-agent pipeline; set USE_AGENT_MODE=false to
# force the legacy HF -> NER -> Gemini pipeline in process_document().
USE_AGENT_MODE = os.getenv('USE_AGENT_MODE', 'true').lower() == 'true'
# ============================================
# Main Processing Pipeline
# ============================================
def process_document_legacy(job_id: str, doc_id: str, file_path: Path):
    """Original three-stage pipeline.

    1. HF text extraction (+ optional table extraction)
    2. HF NER for entities
    3. Gemini field mapping, with regex mapping as fallback

    Persists the extraction and invoice fields, then marks the job
    completed/failed.  Never raises: failures are recorded on the job row.
    """
    logger.info("=" * 70)
    logger.info(f"🚀 Starting LEGACY pipeline for {file_path.name}")
    logger.info("=" * 70)
    try:
        update_job_status(job_id, 'processing')
        # STEP 1: Extract text with HF agents
        logger.info("STEP 1: HF TEXT + TABLE EXTRACTION")
        logger.info("-" * 70)
        success, raw_text, error = call_text_extractor(file_path)
        if not success or not raw_text:
            update_job_status(job_id, 'failed', f"Text extraction failed: {error}")
            return
        # Extract tables (optional, won't fail if it doesn't work)
        _, tables, _ = call_table_extractor(file_path)
        # STEP 2: NER to find entities
        logger.info("-" * 70)
        logger.info("STEP 2: NER - NAMED ENTITY RECOGNITION")
        logger.info("-" * 70)
        ner_success, entities, entity_map, ner_error = call_ner(raw_text, file_path)
        if not ner_success:
            logger.warning(f"⚠️ NER failed: {ner_error}, continuing without entities")
            entities = []
            entity_map = {}
        # STEP 3: Gemini intelligent mapping
        logger.info("-" * 70)
        logger.info("STEP 3: GEMINI INTELLIGENT MAPPING")
        logger.info("-" * 70)
        gemini_success, gemini_result, gemini_error = map_with_gemini(
            raw_text, entities, entity_map, tables
        )
        if gemini_success and gemini_result:
            # Use Gemini's mapping.  BUGFIX: `or`-guards instead of .get()
            # defaults — Gemini can return explicit JSON nulls, in which
            # case .get('key', default) yields None and slicing/float()
            # would raise TypeError.
            fields = {
                'cust_number': (gemini_result.get('customer_name') or 'UNKNOWN')[:20],
                'posting_date': gemini_result.get('date') or datetime.now().strftime('%Y-%m-%d'),
                'total_open_amount': float(gemini_result.get('total_amount') or 0.0),
                'business_code': 'U001',
                'cust_payment_terms': (gemini_result.get('payment_terms') or 'NAH4')[:10]
            }
            confidence_map = {
                'cust_number': 0.95,
                'posting_date': 0.95,
                'total_open_amount': 0.95,
                'business_code': 0.2,
                'cust_payment_terms': 0.8
            }
            method = 'hf_ner_gemini'
        else:
            # Fallback to regex mapping
            logger.warning(f"⚠️ Gemini mapping failed: {gemini_error}")
            logger.info("-" * 70)
            logger.info("FALLBACK: REGEX MAPPING")
            logger.info("-" * 70)
            fields, confidence_map = map_with_regex(raw_text, entities)
            method = 'hf_ner_regex'
        # Save results
        save_extraction(
            doc_id, raw_text, tables, entities,
            {'method': method, 'entity_count': len(entities)},
            None
        )
        save_invoice_fields(doc_id, fields, confidence_map)
        logger.info("=" * 70)
        logger.info(f"✅ EXTRACTION COMPLETE - Method: {method}")
        logger.info(f"📋 Fields: {fields}")
        logger.info("=" * 70)
        # NOTE: prediction is only triggered from the agent pipeline; the
        # legacy pipeline just records the extraction.
        update_job_status(job_id, 'completed')
        logger.info(f"🎉 Job {job_id} completed successfully")
    except Exception as e:
        logger.error(f"❌ Job {job_id} failed: {e}")
        import traceback
        traceback.print_exc()
        update_job_status(job_id, 'failed', str(e))
def process_document_agent(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
    """Autonomous-agent pipeline with an optional Gemini output filter.

    The agent (backend.app.agent.agent_orchestrator) decides which tools
    (text/table extraction, NER, Gemini mapping) to run.  If user_message
    is supplied, a Gemini wrapper filters the extracted fields down to what
    the user asked for before persisting.  Falls back to the legacy
    pipeline when the agent module cannot be imported; any other error
    marks the job failed.
    """
    try:
        # Clean up user_message: form posts may deliver placeholder strings
        # ('None', 'null', 'undefined') instead of an absent value.
        if user_message in [None, 'None', '', 'null', 'undefined']:
            user_message = None
        else:
            user_message = str(user_message).strip()
            if not user_message:
                user_message = None
        logger.info("=" * 70)
        logger.info(f"🔍 AGENT - Processing with message: '{user_message}'")
        logger.info(f"🔍 Type: {type(user_message)}")
        logger.info(f"🔍 Is None: {user_message is None}")
        logger.info("=" * 70)
        # Imported lazily so a missing agent package degrades to legacy mode
        # via the ImportError handler below.
        from backend.app.agent.agent_orchestrator import (
            InvoiceAgent, AgentState, create_agent
        )
        logger.info("=" * 70)
        logger.info(f"🤖 AUTONOMOUS AGENT MODE for {file_path.name}")
        logger.info("=" * 70)
        update_job_status(job_id, 'processing')
        # Create agent, wiring in this module's tool functions.
        agent = create_agent(
            call_text_extractor,
            call_table_extractor,
            call_ner,
            map_with_gemini
        )
        # Initialize state
        state = AgentState(doc_id=doc_id, file_path=file_path)
        # Let agent autonomously decide and execute
        result_state = agent.process(state)
        # ============================================
        # WRAPPER INTEGRATION
        # ============================================
        full_extraction = result_state.fields
        final_result = full_extraction
        wrapper_used = False
        # Check if user_message is actually provided.
        # NOTE(review): the key-listing logs below assume full_extraction is
        # a dict; if the agent produced no fields (None) they raise and the
        # outer except marks the job failed — confirm that is intended.
        if user_message is not None and len(user_message) > 0:
            logger.info("=" * 70)
            logger.info(f"💬 USER MESSAGE DETECTED: '{user_message}'")
            logger.info("🎯 Activating Gemini wrapper to filter output...")
            logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
            logger.info("=" * 70)
            try:
                from backend.app.wrappers.gemini_output_filter import GeminiOutputFilter
                wrapper = GeminiOutputFilter()
                final_result = wrapper.filter_output(user_message, full_extraction)
                wrapper_used = True
                logger.info("=" * 70)
                logger.info(f"✅ WRAPPER SUCCESS!")
                logger.info(f"📤 Original fields: {list(full_extraction.keys())}")
                logger.info(f"🎯 Filtered fields: {list(final_result.keys())}")
                logger.info(f"📋 Filtered result: {json.dumps(final_result, indent=2)}")
                logger.info("=" * 70)
            except Exception as wrapper_error:
                # Wrapper failures are non-fatal: keep the full extraction.
                logger.error("=" * 70)
                logger.error(f"❌ WRAPPER FAILED: {wrapper_error}")
                logger.error("=" * 70)
                import traceback
                logger.error(traceback.format_exc())
                logger.warning("📦 Falling back to full extraction")
                final_result = full_extraction
                wrapper_used = False
        else:
            logger.info("=" * 70)
            logger.info("ℹ️ No user message provided - returning full extraction")
            logger.info(f"📦 Full extraction fields: {list(full_extraction.keys())}")
            logger.info("=" * 70)
        # ============================================
        # Save results
        # ============================================
        if result_state.fields:
            # Determine method from the agent's decision history.
            if 'use_gemini' in result_state.history:
                method = 'autonomous_agent_gemini'
            elif 'use_regex' in result_state.history:
                method = 'autonomous_agent_regex'
            else:
                method = 'autonomous_agent'
            if wrapper_used:
                method += '_with_wrapper'
            save_extraction(
                doc_id,
                result_state.raw_text or '',
                result_state.tables or [],
                result_state.entities or [],
                {
                    'method': method,
                    'attempts': result_state.attempts,
                    'actions': result_state.history,
                    'confidence': agent._calculate_overall_confidence(result_state),
                    'errors': result_state.errors,
                    'user_message': user_message,
                    'wrapper_used': wrapper_used,
                    'full_extraction_keys': list(full_extraction.keys()) if full_extraction else [],
                    'filtered_keys': list(final_result.keys()) if wrapper_used else None
                },
                None
            )
            # Save filtered result (full extraction when no wrapper ran).
            save_invoice_fields(
                doc_id,
                final_result,
                result_state.confidence_map or {}
            )
            # Call prediction (best-effort; failure only logs).
            logger.info("🔮 Calling payment prediction...")
            try:
                pred_response = httpx.post(PREDICT_ENDPOINT, json=final_result, timeout=30)
                if pred_response.status_code == 200:
                    pred_result = pred_response.json()
                    logger.info(f"✅ Prediction: {pred_result.get('predicted_days_to_clear')} days")
            except Exception as e:
                logger.error(f"⚠️ Prediction failed: {e}")
            # Check status: the agent may have asked for a human in the loop.
            from backend.app.agent.agent_orchestrator import AgentDecision
            if AgentDecision.HUMAN_REVIEW.value in result_state.history:
                update_job_status(job_id, 'needs_review')
                logger.info("👤 Agent requesting human review")
            else:
                update_job_status(job_id, 'completed')
                logger.info(f"✅ Agent completed with confidence: {agent._calculate_overall_confidence(result_state):.2f}")
        else:
            update_job_status(job_id, 'failed', 'Agent could not extract fields')
            logger.error("❌ Agent failed to extract any fields")
    except ImportError as e:
        # Agent package not deployed: degrade gracefully to the old pipeline.
        logger.error(f"❌ Agent module not found: {e}")
        logger.info("⚠️ Falling back to legacy pipeline...")
        process_document_legacy(job_id, doc_id, file_path)
    except Exception as e:
        logger.error(f"❌ Agent failed: {e}")
        import traceback
        traceback.print_exc()
        update_job_status(job_id, 'failed', str(e))
def process_document(job_id: str, doc_id: str, file_path: Path, user_message: Optional[str] = None):
    """Dispatch a queued document to the agent or legacy pipeline.

    Normalizes user_message (treating 'None'/'null'/'undefined'/'' from
    form submissions as absent) before routing based on USE_AGENT_MODE.
    """
    # Normalize placeholder strings the frontend may send for "no message".
    if user_message in [None, 'None', '', 'null', 'undefined']:
        user_message = None
    else:
        user_message = str(user_message).strip() or None

    banner = "=" * 70
    logger.info(banner)
    logger.info(f"🔍 PROCESS_DOCUMENT - Cleaned user_message: '{user_message}'")
    logger.info(f"🔍 Type: {type(user_message)}")
    logger.info(f"🔍 Is None: {user_message is None}")
    logger.info(banner)

    if USE_AGENT_MODE:
        logger.info("🤖 Using AUTONOMOUS AGENT mode")
        process_document_agent(job_id, doc_id, file_path, user_message=user_message)
    else:
        logger.info("📋 Using LEGACY pipeline mode")
        process_document_legacy(job_id, doc_id, file_path)
# ============================================
# API Endpoints
# ============================================
class IngestResponse(BaseModel):
job_id: str
doc_id: str
filename: str
status: str
message: str
class JobStatusResponse(BaseModel):
    """Response body for job polling (GET /api/ingest/{job_id})."""
    job_id: str
    doc_id: str
    filename: str
    # One of: queued / processing / completed / needs_review / failed.
    status: str
    # Populated only when the job failed.
    error_text: Optional[str] = None
    created_at: str
    updated_at: str
    # OCR/extraction row (raw_text truncated); present once completed.
    extraction: Optional[Dict] = None
    # Structured invoice fields with parsed confidence_map; present once completed.
    invoice_fields: Optional[Dict] = None
class BatchIngestResponse(BaseModel):
    """Response body for a batch upload (POST /api/ingest/batch)."""
    batch_id: str
    # Number of files actually accepted (invalid types are skipped).
    total_files: int
    # One {job_id, doc_id, filename, status} entry per accepted file.
    jobs: List[Dict[str, str]]
    message: str
class BatchStatusResponse(BaseModel):
    """Response body for batch polling (GET /api/ingest/batch/{batch_id})."""
    batch_id: str
    total_files: int
    # Per-status counters; NOTE 'needs_review' jobs are not broken out here.
    completed: int
    processing: int
    failed: int
    queued: int
    # Full per-job detail rows.
    jobs: List[Dict[str, Any]]
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    message: str = Form(None)  # Form(None) so multipart clients may omit it
):
    """
    Upload a single document (PDF/PNG/JPEG) and queue background processing.

    The optional ``message`` form field requests intelligent field filtering
    (e.g. "extract only total and date"); sentinel strings sent by JS clients
    ('None', 'null', 'undefined', '') are treated as no message.

    Returns:
        IngestResponse with the new job/doc identifiers; extraction itself
        runs asynchronously via ``process_document``.

    Raises:
        HTTPException(400) for unsupported content types,
        HTTPException(500) for unexpected failures.
    """
    # Normalize the optional message to a real None.
    cleaned_message = None
    if message and message not in ['None', 'null', '', 'undefined']:
        cleaned_message = message.strip() or None
    logger.info("=" * 70)
    logger.info(f"📨 API ENDPOINT - Raw message: '{message}'")
    logger.info(f"✨ Cleaned message: '{cleaned_message}'")
    logger.info(f"🔍 Message type: {type(cleaned_message)}")
    logger.info(f"❓ Is None: {cleaned_message is None}")
    logger.info("=" * 70)
    try:
        allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
        if file.content_type not in allowed_types:
            raise HTTPException(400, f"Invalid file type: {file.content_type}")
        job_id = f"job_{uuid.uuid4().hex[:12]}"
        doc_id = f"doc_{uuid.uuid4().hex[:12]}"
        file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
        stored_filename = f"{doc_id}.{file_ext}"
        file_path = STORAGE_PATH / stored_filename
        content = await file.read()
        with open(file_path, 'wb') as f:
            f.write(content)
        # Record job + document under the cross-process lock; close the
        # connection even if an INSERT raises (fix: previously leaked).
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
                    VALUES (?, ?, ?, 'queued')
                """, (job_id, doc_id, file.filename))
                cursor.execute("""
                    INSERT INTO documents (doc_id, job_id, path, filename, content_type)
                    VALUES (?, ?, ?, ?, ?)
                """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
                conn.commit()
            finally:
                conn.close()
        # Start processing with cleaned message.
        background_tasks.add_task(
            process_document,
            job_id,
            doc_id,
            file_path,
            user_message=cleaned_message
        )
        logger.info(f"🚀 Background task started with message: '{cleaned_message}'")
        mode = "autonomous agent"
        if cleaned_message:
            mode += " with intelligent filtering"
            logger.info(f"🎯 User wants: '{cleaned_message}'")
        return IngestResponse(
            job_id=job_id,
            doc_id=doc_id,
            filename=file.filename,
            status='queued',
            message=f'Document uploaded. Processing with {mode}.'
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Ingest endpoint error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(500, str(e))
@router.get("/ingest/{job_id}", response_model=JobStatusResponse)
def get_ingest_status(job_id: str):
    """
    Get job status with agent decision history (if applicable).

    For completed / needs_review jobs the extraction row (raw_text truncated
    to 500 chars) and parsed invoice fields are attached to the response.

    Raises:
        HTTPException(404) when the job does not exist,
        HTTPException(500) for unexpected failures.
    """
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            # Ensure the connection is closed on every path (fix: previously
            # leaked if e.g. json.loads raised mid-query).
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM ingest_jobs WHERE job_id = ?", (job_id,))
                job = cursor.fetchone()
                if not job:
                    raise HTTPException(404, "Job not found")
                job_data = dict(job)
                doc_id = job_data['doc_id']
                if job_data['status'] in ['completed', 'needs_review']:
                    cursor.execute("SELECT * FROM extractions WHERE doc_id = ?", (doc_id,))
                    extraction = cursor.fetchone()
                    if extraction:
                        ext_dict = dict(extraction)
                        raw_text = ext_dict.get('raw_text')
                        # Only mark truncation when text was actually cut
                        # (fix: "..." was previously appended unconditionally).
                        if raw_text and len(raw_text) > 500:
                            ext_dict['raw_text'] = raw_text[:500] + "..."
                        job_data['extraction'] = ext_dict
                    cursor.execute("SELECT * FROM invoice_fields WHERE doc_id = ?", (doc_id,))
                    invoice = cursor.fetchone()
                    if invoice:
                        inv_dict = dict(invoice)
                        if inv_dict.get('confidence_map'):
                            inv_dict['confidence_map'] = json.loads(inv_dict['confidence_map'])
                        job_data['invoice_fields'] = inv_dict
            finally:
                conn.close()
        return JobStatusResponse(**job_data)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Job status error: {e}")
        raise HTTPException(500, str(e))
@router.post("/ingest/batch", response_model=BatchIngestResponse)
async def ingest_batch_documents(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    message: str = Form(None)
):
    """
    Upload multiple documents for batch processing.

    Files with unsupported content types are skipped (not failed); each
    accepted file gets its own job and is queued through the same
    ``process_document`` pipeline as single uploads. The optional ``message``
    applies the same extraction rule to every file in the batch.

    Examples:
    1. Batch upload without filtering:
       curl -F "files=@invoice1.jpg" -F "files=@invoice2.pdf" -F "files=@invoice3.png" \
       http://localhost:7860/api/ingest/batch
    2. Batch upload with same extraction rule for all:
       curl -F "files=@invoice1.jpg" -F "files=@invoice2.jpg" \
       -F "message=extract only total and date" \
       http://localhost:7860/api/ingest/batch
    3. Maximum 50 files per batch

    Raises:
        HTTPException(400) for empty/oversized batches or no valid files,
        HTTPException(500) for unexpected failures.
    """
    # Validate batch size
    if len(files) > 50:
        raise HTTPException(400, "Maximum 50 files per batch")
    if len(files) == 0:
        raise HTTPException(400, "No files provided")
    # Normalize the optional message (JS sentinels -> None).
    cleaned_message = None
    if message and message not in ['None', 'null', '', 'undefined']:
        cleaned_message = message.strip() or None
    batch_id = f"batch_{uuid.uuid4().hex[:12]}"
    jobs = []
    logger.info("=" * 70)
    logger.info(f"📦 BATCH UPLOAD - {len(files)} files")
    logger.info(f"📦 Batch ID: {batch_id}")
    logger.info(f"📦 Message: '{cleaned_message}'")
    logger.info("=" * 70)
    try:
        allowed_types = ['application/pdf', 'image/png', 'image/jpeg']
        for idx, file in enumerate(files):
            # Skip (don't fail) files with an unsupported type.
            if file.content_type not in allowed_types:
                logger.warning(f"⚠️ Skipping {file.filename} - invalid type: {file.content_type}")
                continue
            # Create job for this file
            job_id = f"job_{uuid.uuid4().hex[:12]}"
            doc_id = f"doc_{uuid.uuid4().hex[:12]}"
            file_ext = file.filename.split('.')[-1] if '.' in file.filename else 'pdf'
            stored_filename = f"{doc_id}.{file_ext}"
            file_path = STORAGE_PATH / stored_filename
            # Save file to disk
            content = await file.read()
            with open(file_path, 'wb') as f:
                f.write(content)
            # Save to database; close the connection even if an INSERT
            # raises (fix: previously leaked on failure).
            with FileLock(str(LOCK_PATH), timeout=10):
                conn = sqlite3.connect(str(DB_PATH))
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        INSERT INTO ingest_jobs (job_id, doc_id, filename, status)
                        VALUES (?, ?, ?, 'queued')
                    """, (job_id, doc_id, file.filename))
                    cursor.execute("""
                        INSERT INTO documents (doc_id, job_id, path, filename, content_type)
                        VALUES (?, ?, ?, ?, ?)
                    """, (doc_id, job_id, str(file_path), file.filename, file.content_type))
                    conn.commit()
                finally:
                    conn.close()
            # Queue processing
            background_tasks.add_task(
                process_document,
                job_id,
                doc_id,
                file_path,
                user_message=cleaned_message
            )
            jobs.append({
                'job_id': job_id,
                'doc_id': doc_id,
                'filename': file.filename,
                'status': 'queued'
            })
            logger.info(f"✅ [{idx+1}/{len(files)}] Queued: {file.filename}")
        if not jobs:
            raise HTTPException(400, "No valid files to process")
        # Save batch metadata (tables are created lazily on first batch).
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS batch_jobs (
                        batch_id TEXT PRIMARY KEY,
                        total_files INTEGER,
                        message TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                """)
                cursor.execute("""
                    INSERT INTO batch_jobs (batch_id, total_files, message)
                    VALUES (?, ?, ?)
                """, (batch_id, len(jobs), cleaned_message))
                # Link jobs to batch
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS batch_job_mapping (
                        batch_id TEXT,
                        job_id TEXT,
                        FOREIGN KEY (job_id) REFERENCES ingest_jobs(job_id)
                    )
                """)
                for job in jobs:
                    cursor.execute("""
                        INSERT INTO batch_job_mapping (batch_id, job_id)
                        VALUES (?, ?)
                    """, (batch_id, job['job_id']))
                conn.commit()
            finally:
                conn.close()
        mode = "autonomous agent"
        if cleaned_message:
            mode += " with intelligent filtering"
        logger.info(f"🚀 Batch {batch_id} processing started with {len(jobs)} files")
        return BatchIngestResponse(
            batch_id=batch_id,
            total_files=len(jobs),
            jobs=jobs,
            message=f'Batch of {len(jobs)} documents uploaded. Processing with {mode}.'
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Batch ingest error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(500, str(e))
@router.get("/ingest/batch/{batch_id}", response_model=BatchStatusResponse)
def get_batch_status(batch_id: str):
    """
    Get status of all jobs in a batch.

    Example:
        curl http://localhost:7860/api/ingest/batch/batch_abc123

    Raises:
        HTTPException(404) when the batch does not exist,
        HTTPException(500) for unexpected failures.
    """
    try:
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            # Close the connection on every path (fix: previously leaked
            # if a query raised before the explicit close).
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()
                # Get batch info
                cursor.execute("SELECT * FROM batch_jobs WHERE batch_id = ?", (batch_id,))
                batch = cursor.fetchone()
                if not batch:
                    raise HTTPException(404, "Batch not found")
                # Get all jobs in batch
                cursor.execute("""
                    SELECT j.* FROM ingest_jobs j
                    JOIN batch_job_mapping bm ON j.job_id = bm.job_id
                    WHERE bm.batch_id = ?
                """, (batch_id,))
                jobs = cursor.fetchall()
            finally:
                conn.close()
        # Tally per-status counts. NOTE(review): 'needs_review' is counted
        # here but BatchStatusResponse has no field for it, so those jobs
        # only appear in the detailed jobs list — confirm this is intended.
        status_counts = {
            'completed': 0,
            'processing': 0,
            'failed': 0,
            'queued': 0,
            'needs_review': 0
        }
        jobs_list = []
        for job in jobs:
            job_dict = dict(job)
            status = job_dict['status']
            status_counts[status] = status_counts.get(status, 0) + 1
            jobs_list.append({
                'job_id': job_dict['job_id'],
                'doc_id': job_dict['doc_id'],
                'filename': job_dict['filename'],
                'status': status,
                'error_text': job_dict.get('error_text'),
                'created_at': job_dict['created_at'],
                'updated_at': job_dict['updated_at']
            })
        return BatchStatusResponse(
            batch_id=batch_id,
            total_files=len(jobs),
            completed=status_counts['completed'],
            processing=status_counts['processing'],
            failed=status_counts['failed'],
            queued=status_counts['queued'],
            jobs=jobs_list
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Batch status error: {e}")
        raise HTTPException(500, str(e))
@router.get("/ingest/batch/{batch_id}/download")
def download_batch_results(batch_id: str):
    """
    Download all extracted data from a batch as CSV.

    Only jobs with status 'completed' are included; invoice fields are
    LEFT JOINed, so missing values are rendered as 'N/A' (or 0.0 for the
    amount column).

    Example:
        curl http://localhost:7860/api/ingest/batch/batch_abc123/download -o results.csv

    Raises:
        HTTPException(404) when the batch has no completed jobs,
        HTTPException(500) for unexpected failures.
    """
    try:
        # csv is already imported at module level; only these two are local.
        from io import StringIO
        from fastapi.responses import StreamingResponse
        with FileLock(str(LOCK_PATH), timeout=10):
            conn = sqlite3.connect(str(DB_PATH))
            # Close the connection on every path (fix: previously leaked
            # if the query raised).
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()
                # Get all completed jobs in batch with their invoice fields.
                cursor.execute("""
                    SELECT j.*, f.* FROM ingest_jobs j
                    JOIN batch_job_mapping bm ON j.job_id = bm.job_id
                    LEFT JOIN invoice_fields f ON j.doc_id = f.doc_id
                    WHERE bm.batch_id = ? AND j.status = 'completed'
                """, (batch_id,))
                results = cursor.fetchall()
            finally:
                conn.close()
        if not results:
            raise HTTPException(404, "No completed jobs found in batch")
        # Build the CSV in memory.
        output = StringIO()
        writer = csv.writer(output)
        # Header
        writer.writerow([
            'filename', 'doc_id', 'customer', 'date', 'amount',
            'payment_terms', 'business_code', 'status'
        ])
        # Data rows
        for row in results:
            writer.writerow([
                row['filename'],
                row['doc_id'],
                row['cust_number'] or 'N/A',
                row['posting_date'] or 'N/A',
                row['total_open_amount'] or 0.0,
                row['cust_payment_terms'] or 'N/A',
                row['business_code'] or 'N/A',
                row['status']
            ])
        output.seek(0)
        return StreamingResponse(
            iter([output.getvalue()]),
            media_type="text/csv",
            headers={
                "Content-Disposition": f"attachment; filename=batch_{batch_id}_results.csv"
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        # Log before converting to a 500 (fix: previously swallowed silently,
        # unlike every other endpoint in this module).
        logger.error(f"❌ Batch download error: {e}")
        raise HTTPException(500, str(e))