Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,8 +7,11 @@ import tempfile
|
|
| 7 |
import uuid
|
| 8 |
import asyncio
|
| 9 |
from typing import List, Dict, Optional, Tuple
|
| 10 |
-
from collections import Counter
|
| 11 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
|
| 14 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -16,7 +19,10 @@ from fastapi.responses import JSONResponse
|
|
| 16 |
from starlette.requests import Request
|
| 17 |
import fitz # PyMuPDF
|
| 18 |
import google.generativeai as genai
|
|
|
|
| 19 |
from PIL import Image
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Azure Blob Storage
|
| 22 |
try:
|
|
@@ -40,6 +46,10 @@ except ImportError:
|
|
| 40 |
|
| 41 |
from datetime import datetime, timedelta
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
|
| 44 |
|
| 45 |
# Increase request body size limit
|
|
@@ -54,7 +64,7 @@ app.add_middleware(
|
|
| 54 |
)
|
| 55 |
|
| 56 |
# ============================================================================
|
| 57 |
-
# β CONFIGURATION
|
| 58 |
# ============================================================================
|
| 59 |
|
| 60 |
# Gemini API Key - REQUIRED for image-based PDFs
|
|
@@ -84,6 +94,10 @@ USE_SMART_SAMPLING = os.environ.get(
|
|
| 84 |
HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
|
| 85 |
PORT = int(os.environ.get("PORT", "7860")) # Hugging Face default port
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
# ============================================================================
|
| 88 |
# GLOBAL VARIABLES
|
| 89 |
# ============================================================================
|
|
@@ -92,6 +106,196 @@ gemini_model = None
|
|
| 92 |
blob_service_client = None
|
| 93 |
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# ============================================================================
|
| 96 |
# STARTUP VALIDATION
|
| 97 |
# ============================================================================
|
|
@@ -102,35 +306,28 @@ def validate_configuration():
|
|
| 102 |
warnings = []
|
| 103 |
errors = []
|
| 104 |
|
| 105 |
-
# Check Gemini API Key
|
| 106 |
if not GEMINI_API_KEY:
|
| 107 |
warnings.append(
|
| 108 |
"β οΈ GEMINI_API_KEY not set - image-based PDFs will not work")
|
| 109 |
else:
|
| 110 |
print(f"β
GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
|
| 111 |
|
| 112 |
-
# Check Azure credentials
|
| 113 |
if not AZURE_STORAGE_CONNECTION_STRING:
|
| 114 |
if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
|
| 115 |
-
errors.append(
|
| 116 |
-
"β Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")
|
| 117 |
else:
|
| 118 |
print(
|
| 119 |
f"β
Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
|
| 120 |
else:
|
| 121 |
print(f"β
Azure connection string configured")
|
| 122 |
|
| 123 |
-
# Print all warnings
|
| 124 |
for warning in warnings:
|
| 125 |
print(warning)
|
| 126 |
-
|
| 127 |
-
# Print all errors
|
| 128 |
for error in errors:
|
| 129 |
print(error)
|
| 130 |
|
| 131 |
if errors:
|
| 132 |
print("\nβ οΈ WARNING: Some required credentials are missing!")
|
| 133 |
-
print(" Set them in Hugging Face Spaces Settings > Repository secrets")
|
| 134 |
|
| 135 |
return len(errors) == 0
|
| 136 |
|
|
@@ -156,9 +353,7 @@ def get_blob_service_client():
|
|
| 156 |
elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
|
| 157 |
account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
|
| 158 |
blob_service_client = BlobServiceClient(
|
| 159 |
-
account_url=account_url,
|
| 160 |
-
credential=AZURE_STORAGE_ACCOUNT_KEY
|
| 161 |
-
)
|
| 162 |
print("β
Azure Blob Storage initialized with account key")
|
| 163 |
else:
|
| 164 |
print("β οΈ WARNING: No Azure credentials configured")
|
|
@@ -199,17 +394,13 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
|
|
| 199 |
raise HTTPException(
|
| 200 |
status_code=500, detail="Azure Blob Storage not configured")
|
| 201 |
|
| 202 |
-
# Clean filename for folder name
|
| 203 |
base_filename = os.path.splitext(filename)[0]
|
| 204 |
safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
|
| 205 |
-
|
| 206 |
blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
|
| 207 |
|
| 208 |
-
# Get blob client
|
| 209 |
blob_client = client.get_blob_client(
|
| 210 |
container=container_name, blob=blob_name)
|
| 211 |
|
| 212 |
-
# Upload PDF
|
| 213 |
print(f"π€ Uploading raw PDF to: {blob_name}")
|
| 214 |
blob_client.upload_blob(
|
| 215 |
pdf_bytes,
|
|
@@ -223,7 +414,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
|
|
| 223 |
}
|
| 224 |
)
|
| 225 |
|
| 226 |
-
# Generate SAS URL (valid for 24 hours)
|
| 227 |
expiry_hours = 24
|
| 228 |
sas_token = generate_blob_sas(
|
| 229 |
account_name=AZURE_STORAGE_ACCOUNT_NAME,
|
|
@@ -234,7 +424,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
|
|
| 234 |
expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
|
| 235 |
)
|
| 236 |
|
| 237 |
-
# Construct URLs
|
| 238 |
blob_url = blob_client.url
|
| 239 |
download_url = f"{blob_url}?{sas_token}"
|
| 240 |
expires_at = (datetime.utcnow() +
|
|
@@ -273,17 +462,13 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
|
|
| 273 |
raise HTTPException(
|
| 274 |
status_code=500, detail="Azure Blob Storage not configured")
|
| 275 |
|
| 276 |
-
# Clean original filename for folder name
|
| 277 |
base_filename = os.path.splitext(original_filename)[0]
|
| 278 |
safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
|
| 279 |
-
|
| 280 |
blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
|
| 281 |
|
| 282 |
-
# Get blob client
|
| 283 |
blob_client = client.get_blob_client(
|
| 284 |
container=container_name, blob=blob_name)
|
| 285 |
|
| 286 |
-
# Upload PDF
|
| 287 |
blob_client.upload_blob(
|
| 288 |
pdf_bytes,
|
| 289 |
overwrite=True,
|
|
@@ -297,7 +482,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
|
|
| 297 |
}
|
| 298 |
)
|
| 299 |
|
| 300 |
-
# Generate SAS URL (valid for 24 hours)
|
| 301 |
expiry_hours = 24
|
| 302 |
sas_token = generate_blob_sas(
|
| 303 |
account_name=AZURE_STORAGE_ACCOUNT_NAME,
|
|
@@ -308,7 +492,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
|
|
| 308 |
expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
|
| 309 |
)
|
| 310 |
|
| 311 |
-
# Construct URLs
|
| 312 |
blob_url = blob_client.url
|
| 313 |
download_url = f"{blob_url}?{sas_token}"
|
| 314 |
expires_at = (datetime.utcnow() +
|
|
@@ -344,7 +527,6 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
|
|
| 344 |
return
|
| 345 |
|
| 346 |
container_client = client.get_container_client(container_name)
|
| 347 |
-
|
| 348 |
prefix = f"{ROOT_FOLDER}/{batch_id}/"
|
| 349 |
blobs = container_client.list_blobs(name_starts_with=prefix)
|
| 350 |
|
|
@@ -365,103 +547,88 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
|
|
| 365 |
|
| 366 |
|
| 367 |
def get_gemini_model():
|
| 368 |
-
"""Get or create Gemini model instance
|
| 369 |
-
global gemini_model
|
| 370 |
|
| 371 |
if not GEMINI_AVAILABLE:
|
| 372 |
return None
|
| 373 |
|
| 374 |
-
if
|
| 375 |
-
|
| 376 |
-
return None
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
# Use Gemini 2.5 Flash
|
| 381 |
-
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
|
| 382 |
-
print("β
Google Gemini initialized")
|
| 383 |
-
except Exception as e:
|
| 384 |
-
print(f"β Failed to initialize Gemini: {e}")
|
| 385 |
-
return None
|
| 386 |
|
| 387 |
-
|
|
|
|
| 388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
-
img = None
|
| 397 |
try:
|
| 398 |
-
# Reduced resolution for faster processing
|
| 399 |
pix = page.get_pixmap(matrix=fitz.Matrix(
|
| 400 |
GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
|
| 401 |
img_bytes = pix.tobytes("png")
|
| 402 |
pix = None
|
| 403 |
-
img = Image.open(io.BytesIO(img_bytes))
|
| 404 |
|
| 405 |
-
# β ENHANCED PROMPT: More specific instructions
|
| 406 |
prompt = """Look at this invoice image and extract ONLY the invoice number.
|
|
|
|
| 407 |
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
- The invoice number is the value RIGHT AFTER these labels
|
| 411 |
-
- DO NOT extract: ACK numbers, Account numbers (A/C No), Order numbers
|
| 412 |
-
- Return ONLY the invoice number (letters and numbers, e.g., G031663, DHC036747)
|
| 413 |
-
- If not found, return: NONE
|
| 414 |
-
|
| 415 |
-
Invoice Number:"""
|
| 416 |
-
|
| 417 |
-
response = model.generate_content([prompt, img])
|
| 418 |
-
if response and response.text:
|
| 419 |
-
extracted_text = response.text.strip()
|
| 420 |
-
|
| 421 |
-
# Clean up the response
|
| 422 |
-
cleaned = extracted_text.replace(
|
| 423 |
-
"*", "").replace("#", "").replace("Invoice Number:", "").strip()
|
| 424 |
-
|
| 425 |
-
print(f" π€ Gemini raw response: '{extracted_text}'")
|
| 426 |
-
print(f" π€ Gemini cleaned: '{cleaned}'")
|
| 427 |
-
|
| 428 |
-
# Basic validation
|
| 429 |
-
if cleaned and cleaned.upper() != "NONE" and len(cleaned) >= 3:
|
| 430 |
-
# Remove any remaining labels
|
| 431 |
-
cleaned = re.sub(
|
| 432 |
-
r'^(Invoice|Bill|Document)\s+(No\.?|Number)[\s\.:]*', '', cleaned, flags=re.IGNORECASE)
|
| 433 |
-
cleaned = cleaned.strip(".,;:-_")
|
| 434 |
-
|
| 435 |
-
if len(cleaned) >= 3:
|
| 436 |
-
print(f" β
Gemini extracted: {cleaned}")
|
| 437 |
-
img.close()
|
| 438 |
-
return cleaned.upper()
|
| 439 |
-
|
| 440 |
-
# β FALLBACK: Full OCR + regex extraction
|
| 441 |
-
print(" β οΈ Gemini direct extraction failed, trying full OCR...")
|
| 442 |
-
ocr_prompt = """Extract ALL text from this invoice image.
|
| 443 |
-
Return the complete text content exactly as it appears, preserving all labels and values."""
|
| 444 |
-
|
| 445 |
-
ocr_response = model.generate_content([ocr_prompt, img])
|
| 446 |
-
if ocr_response and ocr_response.text:
|
| 447 |
-
ocr_text = ocr_response.text
|
| 448 |
-
print(
|
| 449 |
-
f"\n π Gemini OCR text (first 500 chars):\n{ocr_text[:500]}\n")
|
| 450 |
|
| 451 |
-
|
| 452 |
-
inv = try_extract_invoice_from_text(ocr_text)
|
| 453 |
-
if inv:
|
| 454 |
-
img.close()
|
| 455 |
-
return inv
|
| 456 |
|
| 457 |
-
if
|
| 458 |
-
|
| 459 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
except Exception as e:
|
| 462 |
-
print(f"
|
| 463 |
-
if img:
|
| 464 |
-
img.close()
|
| 465 |
return None
|
| 466 |
|
| 467 |
|
|
@@ -471,7 +638,6 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
|
|
| 471 |
page_invoice_nos = []
|
| 472 |
|
| 473 |
if not is_image_pdf:
|
| 474 |
-
# Fast text-based extraction (no parallelization needed)
|
| 475 |
print(f" π Text-based extraction (sequential)")
|
| 476 |
for i in range(doc.page_count):
|
| 477 |
if i % 50 == 0:
|
|
@@ -484,37 +650,29 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
|
|
| 484 |
gc.collect()
|
| 485 |
return page_invoice_nos
|
| 486 |
|
| 487 |
-
# Image-based PDF: Use parallel Gemini processing
|
| 488 |
print(f" π Image-based extraction (parallel, batch_size={batch_size})")
|
| 489 |
|
| 490 |
-
# Use ThreadPoolExecutor for parallel API calls
|
| 491 |
with ThreadPoolExecutor(max_workers=batch_size) as executor:
|
| 492 |
futures = []
|
| 493 |
|
| 494 |
-
# Submit all pages to thread pool
|
| 495 |
for i in range(doc.page_count):
|
| 496 |
page = doc.load_page(i)
|
| 497 |
-
# First try text extraction (fast)
|
| 498 |
text_result = extract_invoice_text_based(page)
|
| 499 |
if text_result:
|
| 500 |
futures.append((i, None, text_result))
|
| 501 |
else:
|
| 502 |
-
# Submit to Gemini thread pool
|
| 503 |
future = executor.submit(extract_invoice_gemini_sync, page)
|
| 504 |
futures.append((i, future, None))
|
| 505 |
|
| 506 |
-
# Collect results in order
|
| 507 |
page_invoice_nos = [None] * doc.page_count
|
| 508 |
completed = 0
|
| 509 |
|
| 510 |
for i, future, text_result in futures:
|
| 511 |
try:
|
| 512 |
if text_result:
|
| 513 |
-
# Already extracted from text
|
| 514 |
page_invoice_nos[i] = text_result
|
| 515 |
completed += 1
|
| 516 |
else:
|
| 517 |
-
# Wait for Gemini result
|
| 518 |
result = future.result(timeout=30)
|
| 519 |
page_invoice_nos[i] = result
|
| 520 |
completed += 1
|
|
@@ -540,12 +698,10 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
|
|
| 540 |
|
| 541 |
page_invoice_nos = [None] * doc.page_count
|
| 542 |
|
| 543 |
-
# Always extract from first page
|
| 544 |
page = doc.load_page(0)
|
| 545 |
page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
|
| 546 |
print(f" β Page 1: {page_invoice_nos[0]}")
|
| 547 |
|
| 548 |
-
# Sample every Nth page to detect changes
|
| 549 |
sample_interval = max(3, doc.page_count // 20)
|
| 550 |
print(f" Sampling interval: every {sample_interval} pages")
|
| 551 |
|
|
@@ -557,7 +713,6 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
|
|
| 557 |
if i % 10 == 0:
|
| 558 |
print(f" Sampling page {i+1}/{doc.page_count}...")
|
| 559 |
|
| 560 |
-
# If invoice changed, extract nearby pages to find exact boundary
|
| 561 |
prev_known_idx = i - sample_interval
|
| 562 |
while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
|
| 563 |
prev_known_idx -= 1
|
|
@@ -571,13 +726,11 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
|
|
| 571 |
page_invoice_nos[idx] = extract_invoice_no_from_page(
|
| 572 |
page, is_image_pdf)
|
| 573 |
|
| 574 |
-
# Also check last page
|
| 575 |
if page_invoice_nos[-1] is None:
|
| 576 |
page = doc.load_page(doc.page_count - 1)
|
| 577 |
page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
|
| 578 |
print(f" β Last page: {page_invoice_nos[-1]}")
|
| 579 |
|
| 580 |
-
# Forward-fill gaps
|
| 581 |
last_known = page_invoice_nos[0]
|
| 582 |
filled = 0
|
| 583 |
for i in range(len(page_invoice_nos)):
|
|
@@ -591,7 +744,7 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
|
|
| 591 |
return page_invoice_nos
|
| 592 |
|
| 593 |
# ============================================================================
|
| 594 |
-
# PDF PROCESSING FUNCTIONS
|
| 595 |
# ============================================================================
|
| 596 |
|
| 597 |
|
|
@@ -633,48 +786,26 @@ def is_valid_invoice_number(candidate: str) -> bool:
|
|
| 633 |
has_digit = any(c.isdigit() for c in candidate)
|
| 634 |
return has_letter and has_digit
|
| 635 |
|
| 636 |
-
# β KEEP YOUR ORIGINAL EXTRACTION FUNCTION (Works for other invoices)
|
| 637 |
-
|
| 638 |
|
| 639 |
def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
| 640 |
-
"""
|
| 641 |
-
β UNIVERSAL LABEL-FIRST EXTRACTION with Smart Prioritization
|
| 642 |
-
|
| 643 |
-
Strategy:
|
| 644 |
-
1. Find invoice-related labels (Invoice No, Bill No, etc.)
|
| 645 |
-
2. Extract ALL potential candidates after the label
|
| 646 |
-
3. TWO-PASS: Prioritize pure numeric 12-14 digit numbers (common for invoices)
|
| 647 |
-
4. Filter out noise patterns (ACK, PH, A/C, state codes, etc.)
|
| 648 |
-
5. Return the first valid candidate
|
| 649 |
-
|
| 650 |
-
Works for ANY invoice format!
|
| 651 |
-
"""
|
| 652 |
if not text:
|
| 653 |
return None
|
| 654 |
|
| 655 |
text_norm = normalize_text_for_search(text)
|
| 656 |
|
| 657 |
-
# β DEBUG: Print first 800 chars
|
| 658 |
if len(text_norm) > 0:
|
| 659 |
print(f"\n{'='*70}")
|
| 660 |
print(f"π ANALYZING TEXT (first 800 chars):")
|
| 661 |
print(f"{text_norm[:800]}")
|
| 662 |
print(f"{'='*70}\n")
|
| 663 |
|
| 664 |
-
# ============================================================================
|
| 665 |
-
# β PRIORITY 1: LABELED VALUE EXTRACTION (UNIVERSAL APPROACH)
|
| 666 |
-
# ============================================================================
|
| 667 |
-
|
| 668 |
-
# Define label patterns in PRIORITY ORDER
|
| 669 |
label_patterns = [
|
| 670 |
-
# Invoice labels (highest priority)
|
| 671 |
(r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
|
| 672 |
(r"Inv\s*(?:No\.?|Number)", "Inv No", True),
|
| 673 |
(r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
|
| 674 |
(r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
|
| 675 |
(r"Document\s*(?:No\.?|Number)", "Document No", True),
|
| 676 |
-
|
| 677 |
-
# Other labels (lower priority)
|
| 678 |
(r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
|
| 679 |
(r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
|
| 680 |
(r"Reference\s*(?:No\.?|Number)", "Reference No", False),
|
|
@@ -685,14 +816,11 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 685 |
|
| 686 |
for label_pattern, label_name, is_invoice_label in label_patterns:
|
| 687 |
header_text = text_norm[:2000]
|
| 688 |
-
|
| 689 |
-
# Find ALL matches of this label
|
| 690 |
label_matches = list(re.finditer(
|
| 691 |
label_pattern, header_text, re.IGNORECASE))
|
| 692 |
|
| 693 |
for label_match in label_matches:
|
| 694 |
start_pos = label_match.end()
|
| 695 |
-
# Get a larger chunk of text after the label (200 chars)
|
| 696 |
text_after_label = header_text[start_pos:start_pos + 200]
|
| 697 |
|
| 698 |
print(
|
|
@@ -700,22 +828,12 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 700 |
print(
|
| 701 |
f" Text after label (first 80 chars): '{text_after_label[:80]}...'")
|
| 702 |
|
| 703 |
-
# β UNIVERSAL APPROACH: Extract ALL potential candidates (alphanumeric tokens)
|
| 704 |
all_candidates = re.findall(
|
| 705 |
-
r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b',
|
| 706 |
-
text_after_label,
|
| 707 |
-
re.IGNORECASE
|
| 708 |
-
)
|
| 709 |
|
| 710 |
print(
|
| 711 |
f" Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
|
| 712 |
|
| 713 |
-
# ============================================================================
|
| 714 |
-
# β TWO-PASS SMART PRIORITIZATION
|
| 715 |
-
# Pass 1: Pure numeric 12-14 digit numbers (very common for invoices)
|
| 716 |
-
# Pass 2: Alphanumeric candidates (only if Pass 1 fails)
|
| 717 |
-
# ============================================================================
|
| 718 |
-
|
| 719 |
for pass_number in [1, 2]:
|
| 720 |
if pass_number == 2 and len(all_candidates) > 0:
|
| 721 |
print(f" π Second pass: Trying alphanumeric candidates...")
|
|
@@ -723,31 +841,22 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 723 |
for candidate in all_candidates:
|
| 724 |
invoice_num = candidate.strip(".,;:-_")
|
| 725 |
|
| 726 |
-
# Skip if too short
|
| 727 |
if len(invoice_num) < 3:
|
| 728 |
continue
|
| 729 |
|
| 730 |
-
# β SMART FILTERING: First pass only accepts pure numeric 12-14 digits
|
| 731 |
is_pure_numeric = invoice_num.isdigit()
|
| 732 |
is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
|
| 733 |
|
| 734 |
if pass_number == 1:
|
| 735 |
-
# First pass: Only consider pure numeric 12-14 digits
|
| 736 |
if not (is_pure_numeric and is_ideal_invoice_length):
|
| 737 |
continue
|
| 738 |
print(
|
| 739 |
f" β¨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
|
| 740 |
else:
|
| 741 |
-
# Second pass: Skip ones already checked in first pass
|
| 742 |
if is_pure_numeric and is_ideal_invoice_length:
|
| 743 |
continue
|
| 744 |
print(f" π Evaluating candidate: '{invoice_num}'")
|
| 745 |
|
| 746 |
-
# ====================================================================
|
| 747 |
-
# β COMPREHENSIVE BLACKLIST FILTER
|
| 748 |
-
# ====================================================================
|
| 749 |
-
|
| 750 |
-
# Skip noise words
|
| 751 |
if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
|
| 752 |
"INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
|
| 753 |
"DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
|
|
@@ -755,204 +864,135 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 755 |
print(f" β οΈ Skipped: noise word")
|
| 756 |
continue
|
| 757 |
|
| 758 |
-
# Context-aware batch pattern filter
|
| 759 |
if not is_invoice_label:
|
| 760 |
if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
|
| 761 |
print(
|
| 762 |
f" β οΈ Skipped: batch pattern (non-invoice context)")
|
| 763 |
continue
|
| 764 |
|
| 765 |
-
# Skip license patterns (XX-XXX-123456)
|
| 766 |
if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
|
| 767 |
print(f" β οΈ Skipped: license pattern")
|
| 768 |
continue
|
| 769 |
|
| 770 |
-
# β NEW: Skip state code / UIN patterns (MHMY-4501110485 format)
|
| 771 |
if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
|
| 772 |
-
print(
|
| 773 |
-
f" β οΈ Skipped: state code/UIN pattern (XXXX-nnnnnnnnnn)")
|
| 774 |
continue
|
| 775 |
|
| 776 |
-
|
| 777 |
-
if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
|
| 778 |
-
text_norm, re.IGNORECASE):
|
| 779 |
print(f" β οΈ Skipped: ACK number")
|
| 780 |
continue
|
| 781 |
|
| 782 |
-
|
| 783 |
-
if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
|
| 784 |
-
text_norm, re.IGNORECASE):
|
| 785 |
print(f" β οΈ Skipped: PH number")
|
| 786 |
continue
|
| 787 |
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
text_norm, re.IGNORECASE):
|
| 791 |
-
print(f" β οΈ Skipped: UIN/UID/State Code/D.L.No")
|
| 792 |
continue
|
| 793 |
|
| 794 |
-
|
| 795 |
-
if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}",
|
| 796 |
-
text_norm, re.IGNORECASE):
|
| 797 |
print(f" β οΈ Skipped: A/C number")
|
| 798 |
continue
|
| 799 |
|
| 800 |
-
# Skip phone numbers (10-11 digits)
|
| 801 |
if re.match(r'^[0-9]{10,11}$', invoice_num):
|
| 802 |
-
# 10 digits starting with 6-9 (mobile)
|
| 803 |
if len(invoice_num) == 10 and invoice_num[0] in '6789':
|
| 804 |
print(f" β οΈ Skipped: mobile number")
|
| 805 |
continue
|
| 806 |
-
# 11 digits starting with 0 (landline with STD code)
|
| 807 |
if len(invoice_num) == 11 and invoice_num[0] == '0':
|
| 808 |
print(f" β οΈ Skipped: landline number")
|
| 809 |
continue
|
| 810 |
|
| 811 |
-
# Skip dates (8 digits starting with 20)
|
| 812 |
if re.match(r'^20\d{6}$', invoice_num):
|
| 813 |
-
print(f" β οΈ Skipped: date pattern
|
| 814 |
continue
|
| 815 |
|
| 816 |
-
# Skip date formats (dd/mm/yyyy or dd-mm-yyyy)
|
| 817 |
if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
|
| 818 |
-
print(f" β οΈ Skipped: date format
|
| 819 |
continue
|
| 820 |
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
invoice_num, re.IGNORECASE):
|
| 824 |
-
print(f" β οΈ Skipped: GST number (15 chars)")
|
| 825 |
continue
|
| 826 |
|
| 827 |
-
# β
VALID INVOICE NUMBER FOUND!
|
| 828 |
print(f" β
β
β
ACCEPTED: '{invoice_num}'")
|
| 829 |
return invoice_num.upper()
|
| 830 |
|
| 831 |
print(f" β οΈ No valid candidates found after '{label_name}'")
|
| 832 |
|
| 833 |
-
# ============================================================================
|
| 834 |
-
# β PRIORITY 2: FALLBACK - Unlabeled extraction
|
| 835 |
-
# ============================================================================
|
| 836 |
-
|
| 837 |
print("\nβ οΈ No labeled invoice number found, trying fallback extraction...")
|
| 838 |
|
| 839 |
top_text = text_norm[:1000]
|
| 840 |
|
| 841 |
-
# Try CREDIT numbers (12-20 digits, excluding 14-digit account numbers)
|
| 842 |
credit_match = re.search(
|
| 843 |
-
r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
|
| 844 |
-
text_norm,
|
| 845 |
-
re.IGNORECASE
|
| 846 |
-
)
|
| 847 |
if credit_match:
|
| 848 |
credit_num = credit_match.group(1).strip()
|
| 849 |
-
# Allow 12-14 digits, exclude exactly 14 if it might be account number
|
| 850 |
if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
|
| 851 |
print(f"β Fallback: Found CREDIT number: {credit_num}")
|
| 852 |
return credit_num.upper()
|
| 853 |
|
| 854 |
-
# Try long numerics (12-20 digits), excluding problematic patterns
|
| 855 |
long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
|
| 856 |
for num in long_numerics:
|
| 857 |
-
|
| 858 |
-
if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
|
| 859 |
-
text_norm, re.IGNORECASE):
|
| 860 |
print(f"β οΈ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
|
| 861 |
continue
|
| 862 |
-
|
| 863 |
print(f"β Fallback: Found long numeric: {num}")
|
| 864 |
return num.upper()
|
| 865 |
|
| 866 |
-
# Try medium numerics (10-15 digits, excluding phones and dates)
|
| 867 |
medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
|
| 868 |
for num in medium_numerics:
|
| 869 |
-
# Skip phone numbers
|
| 870 |
if len(num) == 10 and num[0] in '6789':
|
| 871 |
continue
|
| 872 |
if len(num) == 11 and num[0] == '0':
|
| 873 |
continue
|
| 874 |
-
|
| 875 |
-
# Skip dates
|
| 876 |
if len(num) == 8 and num.startswith('20'):
|
| 877 |
continue
|
| 878 |
-
|
| 879 |
-
# Skip if labeled as problematic
|
| 880 |
-
if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
|
| 881 |
-
text_norm, re.IGNORECASE):
|
| 882 |
continue
|
| 883 |
-
|
| 884 |
print(f"β Fallback: Found medium numeric: {num}")
|
| 885 |
return num.upper()
|
| 886 |
|
| 887 |
print("β No invoice number found (labeled or unlabeled)")
|
| 888 |
return None
|
| 889 |
|
| 890 |
-
# β ENHANCED FUNCTION: Add Zydus Healthcare fallback (works with table layouts)
|
| 891 |
-
|
| 892 |
|
| 893 |
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
|
| 894 |
-
"""
|
| 895 |
-
Extract invoice number from TEXT-BASED PDF.
|
| 896 |
-
|
| 897 |
-
β ZYDUS HEALTHCARE PRIORITY EXTRACTION:
|
| 898 |
-
Zydus Healthcare invoices have a specific pattern: 10-digit numbers starting with '23'
|
| 899 |
-
(e.g., 2310763135, 2310763275). These must be extracted BEFORE the original logic
|
| 900 |
-
runs, because the original logic will pick up 14-digit Order IDs instead.
|
| 901 |
-
"""
|
| 902 |
text = page.get_text("text") or ""
|
| 903 |
text_norm = normalize_text_for_search(text)
|
| 904 |
|
| 905 |
-
# β STEP 1: ALWAYS check for Zydus pattern FIRST (before any other extraction)
|
| 906 |
-
# Look for 10-digit number starting with '23' in first 2500 chars
|
| 907 |
header_text = text_norm[:2500]
|
| 908 |
-
|
| 909 |
-
# Find ALL occurrences of 23xxxxxxxx pattern
|
| 910 |
zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
|
| 911 |
|
| 912 |
if zydus_candidates:
|
| 913 |
-
# β CRITICAL: If we found any 23xxxxxxxx pattern, this is a Zydus invoice
|
| 914 |
-
# Return the FIRST occurrence (most likely to be the invoice number)
|
| 915 |
zydus_number = zydus_candidates[0]
|
| 916 |
print(f" β
ZYDUS INVOICE DETECTED: {zydus_number}")
|
| 917 |
return zydus_number.upper()
|
| 918 |
|
| 919 |
-
# β STEP 2: If NO Zydus pattern found, use original extraction logic
|
| 920 |
inv = try_extract_invoice_from_text(text)
|
| 921 |
|
| 922 |
-
# β NEW: BLACKLIST FILTER for Zydus Healthcare invoices
|
| 923 |
-
# Reject 14-digit Order IDs (pattern: 107xxxxxxxxxx or 10xxxxxxxxxx with 14 digits)
|
| 924 |
if inv:
|
| 925 |
-
# Check if this is a 14-digit number starting with '10' or '107'
|
| 926 |
if re.match(r'^10\d{12}$', inv):
|
| 927 |
-
print(
|
| 928 |
-
f" β οΈ REJECTED Order ID (14-digit): {inv} - Looking for Zydus pattern instead...")
|
| 929 |
-
# This is likely a Zydus invoice page without the invoice number visible
|
| 930 |
-
# Skip this extraction and try other methods
|
| 931 |
inv = None
|
| 932 |
else:
|
| 933 |
-
# Valid invoice number from original extraction
|
| 934 |
return inv
|
| 935 |
|
| 936 |
-
# β STEP 3: Try block-level extraction (original logic)
|
| 937 |
for block in (page.get_text("blocks") or []):
|
| 938 |
block_text = block[4] if len(block) > 4 else ""
|
| 939 |
if block_text:
|
| 940 |
inv = try_extract_invoice_from_text(block_text)
|
| 941 |
if inv:
|
| 942 |
-
# Check blacklist again
|
| 943 |
if re.match(r'^10\d{12}$', inv):
|
| 944 |
-
print(
|
| 945 |
-
|
| 946 |
-
continue # Skip this block, try next one
|
| 947 |
else:
|
| 948 |
return inv
|
| 949 |
|
| 950 |
-
# β STEP 4: Final fallback - try Zydus pattern in text blocks
|
| 951 |
-
# (For continuation pages where invoice number might be in a different block)
|
| 952 |
blocks = page.get_text("blocks") or []
|
| 953 |
sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
|
| 954 |
|
| 955 |
-
for block in sorted_blocks[:15]:
|
| 956 |
block_text = block[4] if len(block) > 4 else ""
|
| 957 |
if block_text:
|
| 958 |
block_norm = normalize_text_for_search(block_text)
|
|
@@ -962,14 +1002,12 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
|
|
| 962 |
print(f" β
ZYDUS BLOCK DETECTION: {number}")
|
| 963 |
return number.upper()
|
| 964 |
|
| 965 |
-
# β STEP 5: Last resort - if still nothing found, return None
|
| 966 |
-
# The forward-fill logic will assign this page to the previous invoice
|
| 967 |
print(f" β οΈ No valid invoice found on this page (will use forward-fill)")
|
| 968 |
return None
|
| 969 |
|
| 970 |
|
| 971 |
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
|
| 972 |
-
"""Extract invoice number from a single page
|
| 973 |
text_result = extract_invoice_text_based(page)
|
| 974 |
if text_result:
|
| 975 |
return text_result
|
|
@@ -996,21 +1034,13 @@ def remove_file(path: str):
|
|
| 996 |
except Exception as e:
|
| 997 |
print(f"β οΈ Cleanup warning: {e}")
|
| 998 |
|
| 999 |
-
# ============================================================================
|
| 1000 |
-
# β NEW: MERGE FUNCTION FOR NULL FIRST GROUP
|
| 1001 |
-
# ============================================================================
|
| 1002 |
-
|
| 1003 |
|
| 1004 |
def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
|
| 1005 |
-
"""
|
| 1006 |
-
If the first group has invoice_no = None and the second group has a valid invoice,
|
| 1007 |
-
merge them together (page 1 is likely the cover page of the first invoice).
|
| 1008 |
-
"""
|
| 1009 |
if len(groups) >= 2:
|
| 1010 |
first_group = groups[0]
|
| 1011 |
second_group = groups[1]
|
| 1012 |
|
| 1013 |
-
# Check if first group is null and second group has invoice number
|
| 1014 |
if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
|
| 1015 |
print(f"\nπ§ AUTO-FIX: Merging null first page(s) with first invoice")
|
| 1016 |
print(
|
|
@@ -1018,11 +1048,8 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
|
|
| 1018 |
print(
|
| 1019 |
f" First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
|
| 1020 |
|
| 1021 |
-
# Merge: Add first group's pages to second group
|
| 1022 |
merged_pages = first_group["pages"] + second_group["pages"]
|
| 1023 |
second_group["pages"] = merged_pages
|
| 1024 |
-
|
| 1025 |
-
# Remove first null group
|
| 1026 |
groups.pop(0)
|
| 1027 |
|
| 1028 |
print(
|
|
@@ -1039,35 +1066,25 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
|
|
| 1039 |
async def split_invoices(
|
| 1040 |
background_tasks: BackgroundTasks,
|
| 1041 |
file: UploadFile = File(...),
|
| 1042 |
-
batch_id: str = Form(...,
|
| 1043 |
-
description="Batch ID (required) - used for folder structure"),
|
| 1044 |
use_blob_storage: bool = Form(
|
| 1045 |
True, description="Upload PDFs to Azure Blob Storage"),
|
| 1046 |
blob_container: Optional[str] = Form(
|
| 1047 |
-
None, description="Custom Azure container
|
| 1048 |
include_base64: bool = Form(
|
| 1049 |
False, description="Include base64 in response"),
|
| 1050 |
parallel_batch_size: int = Form(
|
| 1051 |
-
MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls
|
| 1052 |
use_smart_sampling: bool = Form(
|
| 1053 |
-
USE_SMART_SAMPLING, description="Use smart sampling
|
| 1054 |
max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
|
| 1055 |
):
|
| 1056 |
-
"""
|
| 1057 |
-
β UNIVERSAL INVOICE SPLITTER
|
| 1058 |
-
|
| 1059 |
-
Works for ALL invoice types:
|
| 1060 |
-
- Standard invoices (original extraction)
|
| 1061 |
-
- Zydus Healthcare invoices (enhanced fallback for 23xxxxxxxx pattern)
|
| 1062 |
-
- Auto-merges null first pages
|
| 1063 |
-
"""
|
| 1064 |
|
| 1065 |
if not file.filename:
|
| 1066 |
raise HTTPException(status_code=400, detail="No filename provided")
|
| 1067 |
|
| 1068 |
filename_lower = file.filename.lower()
|
| 1069 |
-
|
| 1070 |
-
# Supported formats
|
| 1071 |
SUPPORTED_EXTENSIONS = ['.pdf', '.png',
|
| 1072 |
'.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 1073 |
|
|
@@ -1079,18 +1096,14 @@ async def split_invoices(
|
|
| 1079 |
|
| 1080 |
if not file_extension:
|
| 1081 |
raise HTTPException(
|
| 1082 |
-
status_code=400,
|
| 1083 |
-
detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
|
| 1084 |
-
)
|
| 1085 |
|
| 1086 |
is_image_file = file_extension in [
|
| 1087 |
'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 1088 |
|
| 1089 |
if is_image_file and not GEMINI_AVAILABLE:
|
| 1090 |
raise HTTPException(
|
| 1091 |
-
status_code=500,
|
| 1092 |
-
detail="Image processing requires PIL. Install: pip install Pillow"
|
| 1093 |
-
)
|
| 1094 |
|
| 1095 |
if use_blob_storage and not get_blob_service_client():
|
| 1096 |
raise HTTPException(
|
|
@@ -1198,7 +1211,6 @@ async def split_invoices(
|
|
| 1198 |
print(
|
| 1199 |
f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
|
| 1200 |
|
| 1201 |
-
# Step 1: Normalize extracted invoice numbers (only filter GST numbers)
|
| 1202 |
page_invoice_nos_normalized = []
|
| 1203 |
for v in page_invoice_nos:
|
| 1204 |
if v and v.upper().startswith("GST"):
|
|
@@ -1209,7 +1221,6 @@ async def split_invoices(
|
|
| 1209 |
else:
|
| 1210 |
page_invoice_nos_normalized.append(None)
|
| 1211 |
|
| 1212 |
-
# Step 2: Smart forward-fill for failed extractions
|
| 1213 |
page_invoice_nos_filled = []
|
| 1214 |
last_known_invoice = None
|
| 1215 |
|
|
@@ -1230,7 +1241,6 @@ async def split_invoices(
|
|
| 1230 |
page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
|
| 1231 |
print(f" β’ {inv_no}: {page_count} pages")
|
| 1232 |
|
| 1233 |
-
# Step 3: Group consecutive pages by invoice number
|
| 1234 |
groups = []
|
| 1235 |
current_group = []
|
| 1236 |
current_invoice = None
|
|
@@ -1241,10 +1251,8 @@ async def split_invoices(
|
|
| 1241 |
current_group = [idx]
|
| 1242 |
else:
|
| 1243 |
if inv != current_invoice:
|
| 1244 |
-
groups.append({
|
| 1245 |
-
|
| 1246 |
-
"pages": current_group[:]
|
| 1247 |
-
})
|
| 1248 |
print(
|
| 1249 |
f" π Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1250 |
current_invoice = inv
|
|
@@ -1253,27 +1261,21 @@ async def split_invoices(
|
|
| 1253 |
current_group.append(idx)
|
| 1254 |
|
| 1255 |
if current_group:
|
| 1256 |
-
groups.append({
|
| 1257 |
-
|
| 1258 |
-
"pages": current_group[:]
|
| 1259 |
-
})
|
| 1260 |
print(
|
| 1261 |
f" π Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1262 |
|
| 1263 |
if len(groups) == 1 and groups[0]["invoice_no"] is None:
|
| 1264 |
-
groups = [{
|
| 1265 |
-
|
| 1266 |
-
"pages": list(range(doc.page_count))
|
| 1267 |
-
}]
|
| 1268 |
|
| 1269 |
-
# β NEW: Auto-merge first null group
|
| 1270 |
groups = merge_first_null_group(groups)
|
| 1271 |
|
| 1272 |
print(f"\nβ
Created {len(groups)} invoice groups (after auto-merge)")
|
| 1273 |
print(
|
| 1274 |
f" Forward-filled {filled_count} pages with missing invoice numbers")
|
| 1275 |
|
| 1276 |
-
# Build and upload split PDFs
|
| 1277 |
print(f"\nπ¨ Building and uploading split invoices...")
|
| 1278 |
all_parts = []
|
| 1279 |
|
|
@@ -1355,7 +1357,8 @@ async def split_invoices(
|
|
| 1355 |
"unique_invoice_numbers": len(unique_invoices),
|
| 1356 |
"extraction_method": "gemini" if is_image_pdf else "text",
|
| 1357 |
"pages_forward_filled": filled_count,
|
| 1358 |
-
"storage_type": "azure_blob" if use_blob_storage else "base64"
|
|
|
|
| 1359 |
},
|
| 1360 |
"performance": {
|
| 1361 |
"total_time_seconds": round(total_time, 2),
|
|
@@ -1377,6 +1380,7 @@ async def split_invoices(
|
|
| 1377 |
f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
|
| 1378 |
print(f" Split invoices: {len(all_parts)}")
|
| 1379 |
print(f" Unique invoice numbers: {len(unique_invoices)}")
|
|
|
|
| 1380 |
print(f" Total time: {total_time:.1f}s")
|
| 1381 |
print(
|
| 1382 |
f" Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
|
|
@@ -1406,7 +1410,7 @@ async def cleanup_batch(
|
|
| 1406 |
background_tasks: BackgroundTasks,
|
| 1407 |
container_name: Optional[str] = Form(None)
|
| 1408 |
):
|
| 1409 |
-
"""Delete all blobs for a specific batch
|
| 1410 |
if container_name is None:
|
| 1411 |
container_name = AZURE_CONTAINER_NAME
|
| 1412 |
|
|
@@ -1421,38 +1425,79 @@ async def cleanup_batch(
|
|
| 1421 |
})
|
| 1422 |
|
| 1423 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1424 |
@app.get("/")
|
| 1425 |
async def root():
|
| 1426 |
return {
|
| 1427 |
-
"service": "Universal Invoice Splitter API",
|
| 1428 |
-
"version": "
|
| 1429 |
"status": "running",
|
| 1430 |
"features": {
|
| 1431 |
"multi_format_support": True,
|
| 1432 |
"zydus_healthcare_support": True,
|
| 1433 |
"auto_merge_null_groups": True,
|
| 1434 |
"azure_blob_storage": True,
|
| 1435 |
-
"parallel_processing": True
|
| 1436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1437 |
}
|
| 1438 |
|
| 1439 |
|
| 1440 |
@app.get("/health")
|
| 1441 |
async def health():
|
|
|
|
| 1442 |
return {
|
| 1443 |
"status": "healthy",
|
| 1444 |
"timestamp": datetime.now().isoformat(),
|
| 1445 |
"gemini_configured": bool(GEMINI_API_KEY),
|
| 1446 |
-
"azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
|
|
|
|
|
|
|
| 1447 |
}
|
| 1448 |
|
| 1449 |
if __name__ == "__main__":
|
| 1450 |
import uvicorn
|
| 1451 |
-
|
| 1452 |
-
|
| 1453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1454 |
print(f"β
Supports ALL invoice types")
|
| 1455 |
print(f"β
Zydus Healthcare fallback (23xxxxxxxx pattern)")
|
| 1456 |
print(f"β
Auto-merge null first pages")
|
| 1457 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1458 |
uvicorn.run(app, host=HOST, port=PORT, log_level="info")
|
|
|
|
| 7 |
import uuid
|
| 8 |
import asyncio
|
| 9 |
from typing import List, Dict, Optional, Tuple
|
| 10 |
+
from collections import Counter, deque
|
| 11 |
from concurrent.futures import ThreadPoolExecutor
|
| 12 |
+
from threading import Lock, Thread, Event
|
| 13 |
+
import time
|
| 14 |
+
import logging
|
| 15 |
|
| 16 |
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
|
| 17 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 19 |
from starlette.requests import Request
|
| 20 |
import fitz # PyMuPDF
|
| 21 |
import google.generativeai as genai
|
| 22 |
+
from google.api_core import exceptions as google_exceptions
|
| 23 |
from PIL import Image
|
| 24 |
+
import requests
|
| 25 |
+
import base64
|
| 26 |
|
| 27 |
# Azure Blob Storage
|
| 28 |
try:
|
|
|
|
| 46 |
|
| 47 |
from datetime import datetime, timedelta
|
| 48 |
|
| 49 |
+
# Configure logging
|
| 50 |
+
logging.basicConfig(level=logging.INFO)
|
| 51 |
+
logger = logging.getLogger(__name__)
|
| 52 |
+
|
| 53 |
app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
|
| 54 |
|
| 55 |
# Increase request body size limit
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
# ============================================================================
|
| 67 |
+
# β CONFIGURATION
|
| 68 |
# ============================================================================
|
| 69 |
|
| 70 |
# Gemini API Key - REQUIRED for image-based PDFs
|
|
|
|
| 94 |
HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
|
| 95 |
PORT = int(os.environ.get("PORT", "7860")) # Hugging Face default port
|
| 96 |
|
| 97 |
+
MAX_WAIT_TIME = 300 # 5 minutes max wait for quota
|
| 98 |
+
model_lock = Lock()
|
| 99 |
+
quota_manager_lock = Lock()
|
| 100 |
+
|
| 101 |
# ============================================================================
|
| 102 |
# GLOBAL VARIABLES
|
| 103 |
# ============================================================================
|
|
|
|
| 106 |
blob_service_client = None
|
| 107 |
|
| 108 |
|
| 109 |
+
GEMINI_REST_URL = "https://generativelanguage.googleapis.com/v1/models/{model}:generateContent?key={key}"


def call_gemini_25(model_name: str, image_bytes: bytes, prompt: str) -> str:
    """Send one PNG image plus a text prompt to the Gemini REST endpoint.

    The request always targets the model returned by
    ``get_current_model_config()``. On HTTP 429/503 that model is marked as
    exhausted for the current minute and the call rotates to the next model
    reported by ``get_next_available_model()``; when none is available it
    sleeps and retries, but for at most ``MAX_WAIT_TIME`` seconds in total.

    Args:
        model_name: Accepted for call-site compatibility; it is NOT used to
            select the model (the shared rotation state decides).
        image_bytes: Image bytes, sent base64-encoded with mime type
            ``image/png``.
        prompt: Instruction text sent alongside the image.

    Returns:
        The ``text`` of the first part of the first candidate.

    Raises:
        Exception: for any HTTP status other than 200/429/503, for a 200
            response that carries no text, or when every model stays
            exhausted past the wait budget. Errors raised by
            ``requests.post`` propagate unchanged.
    """
    # The payload is identical for every model, so encode the image once
    # instead of on every retry.
    encoded = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "contents": [
            {
                "parts": [
                    {"inline_data": {"mime_type": "image/png", "data": encoded}},
                    {"text": prompt},
                ]
            }
        ],
        "generationConfig": {"temperature": 0},
    }

    # Upper bound on the time spent sleeping for quota, so a worker thread
    # can no longer spin forever while the API keeps answering 429/503.
    deadline = time.monotonic() + MAX_WAIT_TIME

    while True:
        model_config = get_current_model_config()
        # NOTE: the API key is part of the query string - never log `url`.
        url = GEMINI_REST_URL.format(
            model=model_config["name"], key=GEMINI_API_KEY)

        r = requests.post(url, json=payload, timeout=model_config["timeout"])

        # SUCCESS
        if r.status_code == 200:
            record_model_request(model_config)
            data = r.json()
            try:
                return data["candidates"][0]["content"]["parts"][0]["text"]
            except (KeyError, IndexError, TypeError) as exc:
                # A 200 response without the expected candidate/part layout.
                raise Exception(
                    f"Gemini returned no text for {model_config['name']}: "
                    f"{str(data)[:500]}") from exc

        # QUOTA HIT -> SWITCH MODEL
        if r.status_code in (429, 503):
            print(
                f"⚠️ RPM exhausted for {model_config['name']} β switching model")

            # Mark the model as used up for this minute. Done under the quota
            # lock like every other counter update; opening the minute window
            # here keeps the mark from being wiped by the first window
            # initialisation in reset_model_quota_counters().
            with quota_manager_lock:
                model_config["current_rpm"] = model_config["max_requests_per_minute"]
                if model_config["last_rpm_reset"] is None:
                    model_config["last_rpm_reset"] = datetime.now()

            next_model = get_next_available_model()
            if next_model:
                print(f"π Switched to {next_model['name']}")
                continue

            # All models exhausted -> wait, but never past the deadline.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise Exception(
                    f"Gemini quota exhausted on all models after waiting {MAX_WAIT_TIME}s")
            wait_seconds = min(60, remaining)
            print(f"⏳ All models exhausted. Waiting {wait_seconds:.0f}s...")
            time.sleep(wait_seconds)
            continue

        # Other error
        raise Exception(f"Gemini error {r.status_code}: {r.text}")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def get_next_available_model():
    """Select the first model, scanning from the current one, that has quota.

    Walks ``GEMINI_MODELS`` once in ring order starting at
    ``current_model_index`` (the current model itself is checked first).
    The first entry accepted by ``can_use_model()`` becomes the new current
    model and is returned; ``None`` is returned when no entry is accepted.
    """
    global current_model_index

    model_count = len(GEMINI_MODELS)
    offset = 0
    while offset < model_count:
        candidate_index = (current_model_index + offset) % model_count
        candidate = GEMINI_MODELS[candidate_index]
        if can_use_model(candidate):
            current_model_index = candidate_index
            return candidate
        offset += 1

    return None
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# Model configuration with quota tracking
def _gemini_model_entry(name, *, rpm, rpd, max_output_tokens, timeout,
                        description, skip_on_error):
    """Build one model entry: static limits plus fresh quota-tracking state.

    Every entry gets its own copy of the mutable tracking fields, so the
    entries cannot drift apart or accidentally share state.
    """
    return {
        "name": name,
        "max_requests_per_minute": rpm,
        "max_requests_per_day": rpd,
        "max_output_tokens": max_output_tokens,
        # Passed as the HTTP timeout (seconds) when this model is called.
        "timeout": timeout,
        "description": description,
        # Request counters for the current minute / current day.
        "current_rpm": 0,
        "current_rpd": 0,
        # Start of the current minute / day window; None until first use.
        "last_rpm_reset": None,
        "last_rpd_reset": None,
        # NOTE(review): not read by the quota helpers next to this table -
        # confirm whether anything else in the file uses these two fields.
        "quota_reset_time": None,
        "skip_on_error": skip_on_error,
    }


GEMINI_MODELS = [
    _gemini_model_entry(
        "gemini-2.5-flash-lite",
        rpm=120,
        rpd=10000,
        max_output_tokens=16384,
        timeout=60,
        description="Stage 1 - Pre-classification / validation / cheap parsing",
        skip_on_error=True,
    ),
    _gemini_model_entry(
        "gemini-2.5-flash-image",
        rpm=50,
        rpd=1500,
        max_output_tokens=65536,
        timeout=300,
        description="Stage 2 - Primary invoice OCR extraction",
        skip_on_error=False,
    ),
    _gemini_model_entry(
        "gemini-2.5-pro",
        rpm=10,
        rpd=1000,
        max_output_tokens=65536,
        timeout=300,
        description="Stage 3 - Complex invoice reasoning fallback",
        skip_on_error=False,
    ),
]

# Index into GEMINI_MODELS of the model currently used for requests.
current_model_index = 0
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ============================================================================
|
| 228 |
+
# β QUOTA MANAGEMENT FUNCTIONS
|
| 229 |
+
# ============================================================================
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def reset_model_quota_counters(model_config):
    """Roll the per-minute and per-day quota windows of one model entry.

    The minute counter is cleared once 60 seconds have passed since
    ``last_rpm_reset``; the day counter is cleared when the local calendar
    date has moved past the date of ``last_rpd_reset``. All updates happen
    under ``quota_manager_lock``.

    When a window has never been opened (its timestamp is ``None``) only the
    timestamp is set. The counter is left untouched so that requests already
    recorded, or an exhaustion mark already written, are not discarded by the
    very first check.

    Args:
        model_config: One entry of ``GEMINI_MODELS``; mutated in place.
    """
    now = datetime.now()
    with quota_manager_lock:
        minute_window_start = model_config["last_rpm_reset"]
        if minute_window_start is None:
            # Open the first window without wiping what was counted so far.
            model_config["last_rpm_reset"] = now
        elif (now - minute_window_start).total_seconds() >= 60:
            model_config["current_rpm"] = 0
            model_config["last_rpm_reset"] = now
            logger.debug(f"π Reset RPM for {model_config['name']}")

        day_window_start = model_config["last_rpd_reset"]
        if day_window_start is None:
            model_config["last_rpd_reset"] = now
        elif now.date() > day_window_start.date():
            model_config["current_rpd"] = 0
            model_config["last_rpd_reset"] = now
            logger.info(f"π Reset daily quota for {model_config['name']}")
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def can_use_model(model_config):
    """Tell whether a model entry is below both of its request limits.

    The time windows are rolled first via ``reset_model_quota_counters()``;
    the counters are then compared with the per-minute and per-day maxima
    while holding ``quota_manager_lock``.
    """
    reset_model_quota_counters(model_config)

    with quota_manager_lock:
        under_minute_cap = not (
            model_config["current_rpm"] >= model_config["max_requests_per_minute"])
        under_day_cap = not (
            model_config["current_rpd"] >= model_config["max_requests_per_day"])

    return under_minute_cap and under_day_cap
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def record_model_request(model_config):
    """Count one request against the model's minute and day counters.

    Both counters are incremented, and the debug line is emitted, while
    holding ``quota_manager_lock``.
    """
    with quota_manager_lock:
        for counter_key in ("current_rpm", "current_rpd"):
            model_config[counter_key] += 1
        logger.debug(
            f"π {model_config['name']}: RPM={model_config['current_rpm']}/{model_config['max_requests_per_minute']}")
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def wait_for_quota_renewal(max_wait=MAX_WAIT_TIME):
    """Block until some model has quota again, or until the budget runs out.

    Polls every entry of ``GEMINI_MODELS`` with ``can_use_model()``, sleeping
    up to 10 seconds between rounds. The sleep is clipped to the time that is
    left, so the call does not overshoot ``max_wait``, and one last poll is
    made after the final sleep before giving up.

    Args:
        max_wait: Maximum number of seconds to wait.

    Returns:
        ``(True, index)`` with the index into ``GEMINI_MODELS`` of the first
        model found usable, or ``(False, -1)`` on timeout.
    """
    # Monotonic clock: elapsed time must not jump if the wall clock changes.
    start = time.monotonic()
    logger.info(
        f"⏳ All models quota exhausted. Waiting for renewal (max {max_wait}s)...")

    while True:
        for i, model in enumerate(GEMINI_MODELS):
            if can_use_model(model):
                elapsed = time.monotonic() - start
                logger.info(
                    f"✅ {model['name']} quota available after {elapsed:.1f}s")
                return True, i

        elapsed = time.monotonic() - start
        remaining = max_wait - elapsed
        if remaining <= 0:
            break

        logger.info(
            f"⏰ Waiting... (elapsed: {elapsed:.0f}s, remaining: {remaining:.0f}s)")
        time.sleep(min(10, remaining))

    logger.error(f"β Timeout: No quota available after {max_wait}s")
    return False, -1
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def get_current_model_config():
    """Return the ``GEMINI_MODELS`` entry selected by ``current_model_index``."""
    active_index = current_model_index
    return GEMINI_MODELS[active_index]
|
| 298 |
+
|
| 299 |
# ============================================================================
|
| 300 |
# STARTUP VALIDATION
|
| 301 |
# ============================================================================
|
|
|
|
| 306 |
warnings = []
|
| 307 |
errors = []
|
| 308 |
|
|
|
|
| 309 |
if not GEMINI_API_KEY:
|
| 310 |
warnings.append(
|
| 311 |
"β οΈ GEMINI_API_KEY not set - image-based PDFs will not work")
|
| 312 |
else:
|
| 313 |
print(f"β
GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
|
| 314 |
|
|
|
|
| 315 |
if not AZURE_STORAGE_CONNECTION_STRING:
|
| 316 |
if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
|
| 317 |
+
errors.append("β Azure credentials missing")
|
|
|
|
| 318 |
else:
|
| 319 |
print(
|
| 320 |
f"β
Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
|
| 321 |
else:
|
| 322 |
print(f"β
Azure connection string configured")
|
| 323 |
|
|
|
|
| 324 |
for warning in warnings:
|
| 325 |
print(warning)
|
|
|
|
|
|
|
| 326 |
for error in errors:
|
| 327 |
print(error)
|
| 328 |
|
| 329 |
if errors:
|
| 330 |
print("\nβ οΈ WARNING: Some required credentials are missing!")
|
|
|
|
| 331 |
|
| 332 |
return len(errors) == 0
|
| 333 |
|
|
|
|
| 353 |
elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
|
| 354 |
account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
|
| 355 |
blob_service_client = BlobServiceClient(
|
| 356 |
+
account_url=account_url, credential=AZURE_STORAGE_ACCOUNT_KEY)
|
|
|
|
|
|
|
| 357 |
print("β
Azure Blob Storage initialized with account key")
|
| 358 |
else:
|
| 359 |
print("β οΈ WARNING: No Azure credentials configured")
|
|
|
|
| 394 |
raise HTTPException(
|
| 395 |
status_code=500, detail="Azure Blob Storage not configured")
|
| 396 |
|
|
|
|
| 397 |
base_filename = os.path.splitext(filename)[0]
|
| 398 |
safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
|
|
|
|
| 399 |
blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
|
| 400 |
|
|
|
|
| 401 |
blob_client = client.get_blob_client(
|
| 402 |
container=container_name, blob=blob_name)
|
| 403 |
|
|
|
|
| 404 |
print(f"π€ Uploading raw PDF to: {blob_name}")
|
| 405 |
blob_client.upload_blob(
|
| 406 |
pdf_bytes,
|
|
|
|
| 414 |
}
|
| 415 |
)
|
| 416 |
|
|
|
|
| 417 |
expiry_hours = 24
|
| 418 |
sas_token = generate_blob_sas(
|
| 419 |
account_name=AZURE_STORAGE_ACCOUNT_NAME,
|
|
|
|
| 424 |
expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
|
| 425 |
)
|
| 426 |
|
|
|
|
| 427 |
blob_url = blob_client.url
|
| 428 |
download_url = f"{blob_url}?{sas_token}"
|
| 429 |
expires_at = (datetime.utcnow() +
|
|
|
|
| 462 |
raise HTTPException(
|
| 463 |
status_code=500, detail="Azure Blob Storage not configured")
|
| 464 |
|
|
|
|
| 465 |
base_filename = os.path.splitext(original_filename)[0]
|
| 466 |
safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
|
|
|
|
| 467 |
blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
|
| 468 |
|
|
|
|
| 469 |
blob_client = client.get_blob_client(
|
| 470 |
container=container_name, blob=blob_name)
|
| 471 |
|
|
|
|
| 472 |
blob_client.upload_blob(
|
| 473 |
pdf_bytes,
|
| 474 |
overwrite=True,
|
|
|
|
| 482 |
}
|
| 483 |
)
|
| 484 |
|
|
|
|
| 485 |
expiry_hours = 24
|
| 486 |
sas_token = generate_blob_sas(
|
| 487 |
account_name=AZURE_STORAGE_ACCOUNT_NAME,
|
|
|
|
| 492 |
expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
|
| 493 |
)
|
| 494 |
|
|
|
|
| 495 |
blob_url = blob_client.url
|
| 496 |
download_url = f"{blob_url}?{sas_token}"
|
| 497 |
expires_at = (datetime.utcnow() +
|
|
|
|
| 527 |
return
|
| 528 |
|
| 529 |
container_client = client.get_container_client(container_name)
|
|
|
|
| 530 |
prefix = f"{ROOT_FOLDER}/{batch_id}/"
|
| 531 |
blobs = container_client.list_blobs(name_starts_with=prefix)
|
| 532 |
|
|
|
|
| 547 |
|
| 548 |
|
| 549 |
def get_gemini_model():
    """Get or create the SDK model object for a model that still has quota.

    Selection order: the current model if ``can_use_model()`` accepts it,
    otherwise the first other entry of ``GEMINI_MODELS`` that is accepted,
    otherwise whatever ``wait_for_quota_renewal(MAX_WAIT_TIME)`` reports.
    The cached ``gemini_model`` is rebuilt whenever it is missing or was
    built for a different model name.

    Returns:
        The ``genai.GenerativeModel`` instance, or ``None`` when Gemini is
        unavailable, no API key is set, no quota became available in time,
        or the model object could not be created.
    """
    global gemini_model, current_model_index

    if not GEMINI_AVAILABLE:
        return None

    if not GEMINI_API_KEY:
        return None

    # Check quota before handing out a model.
    model_config = get_current_model_config()

    if not can_use_model(model_config):
        logger.warning(f"⚠️ {model_config['name']} quota exhausted")

        # Try other models
        for i, alt_model in enumerate(GEMINI_MODELS):
            if i != current_model_index and can_use_model(alt_model):
                current_model_index = i
                model_config = alt_model
                logger.info(f"π Switched to {model_config['name']}")
                break
        else:
            # No other model was usable - wait for any quota to renew.
            success, new_index = wait_for_quota_renewal(MAX_WAIT_TIME)
            if not success:
                logger.error("β All models quota exhausted")
                return None
            current_model_index = new_index
            model_config = GEMINI_MODELS[new_index]

        # Force recreation for the newly selected model. The cached object is
        # only ever touched while holding model_lock.
        with model_lock:
            gemini_model = None

    wanted_name = model_config['name']

    # Create/recreate the model if needed. The result is kept in a local so
    # that a concurrent reset of the global after the lock is released
    # cannot turn this call's return value into None.
    with model_lock:
        model = gemini_model
        if model is None or getattr(model, '_model_name', None) != wanted_name:
            try:
                genai.configure(api_key=GEMINI_API_KEY)
                model = genai.GenerativeModel(wanted_name)
                # Remember which model name this object was built for.
                model._model_name = wanted_name
            except Exception as e:
                logger.error(
                    f"β Failed to initialize {wanted_name}: {e}")
                return None
            gemini_model = model
            logger.info(f"✅ Using {wanted_name}")

    # Creating the model object is deliberately not recorded as a request.
    return model
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
def extract_invoice_gemini_sync(page):
    """Extract the invoice number from one rendered PDF page via Gemini.

    The page is rendered to PNG and sent with a prompt asking for the invoice
    number only. The reply is sanitised first (label removed, everything but
    letters, digits, ``-`` and ``/`` dropped) and only then validated, so a
    reply such as ``NONE.`` or one that shrinks below three characters is no
    longer returned as an invoice number. When the reply is not usable, a
    second request asks for all visible text and the result is passed to
    ``try_extract_invoice_from_text()``.

    Args:
        page: Page object providing ``get_pixmap()``.

    Returns:
        The upper-cased invoice number from the direct reply, otherwise
        whatever ``try_extract_invoice_from_text()`` returns for the OCR
        text, or ``None`` if anything raises.
    """
    model_config = get_current_model_config()

    try:
        pix = page.get_pixmap(matrix=fitz.Matrix(
            GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
        img_bytes = pix.tobytes("png")
        pix = None  # drop the pixmap reference as soon as the PNG exists

        prompt = (
            "Look at this invoice image and extract ONLY the invoice number.\n"
            "Return ONLY the invoice number. If not found return NONE."
        )

        text = call_gemini_25(model_config["name"], img_bytes, prompt)
        cleaned = text.strip().replace("Invoice Number:", "").strip()

        print(f"π€ Gemini raw response: {text}")

        # Sanitise BEFORE validating: punctuation around "NONE" or a reply
        # made mostly of symbols must not pass the checks below.
        cleaned = re.sub(r'[^A-Za-z0-9\-/]', '', cleaned)

        if len(cleaned) >= 3 and cleaned.upper() != "NONE":
            print(f"✅ Gemini extracted: {cleaned}")
            return cleaned.upper()

        # Fallback OCR
        ocr = call_gemini_25(
            model_config["name"], img_bytes, "Extract all visible text from this image")
        return try_extract_invoice_from_text(ocr)

    except Exception as e:
        # Best effort by design: a failed page yields None instead of
        # aborting the whole document.
        print(f"β Gemini error: {e}")
        return None
|
| 633 |
|
| 634 |
|
|
|
|
| 638 |
page_invoice_nos = []
|
| 639 |
|
| 640 |
if not is_image_pdf:
|
|
|
|
| 641 |
print(f" π Text-based extraction (sequential)")
|
| 642 |
for i in range(doc.page_count):
|
| 643 |
if i % 50 == 0:
|
|
|
|
| 650 |
gc.collect()
|
| 651 |
return page_invoice_nos
|
| 652 |
|
|
|
|
| 653 |
print(f" π Image-based extraction (parallel, batch_size={batch_size})")
|
| 654 |
|
|
|
|
| 655 |
with ThreadPoolExecutor(max_workers=batch_size) as executor:
|
| 656 |
futures = []
|
| 657 |
|
|
|
|
| 658 |
for i in range(doc.page_count):
|
| 659 |
page = doc.load_page(i)
|
|
|
|
| 660 |
text_result = extract_invoice_text_based(page)
|
| 661 |
if text_result:
|
| 662 |
futures.append((i, None, text_result))
|
| 663 |
else:
|
|
|
|
| 664 |
future = executor.submit(extract_invoice_gemini_sync, page)
|
| 665 |
futures.append((i, future, None))
|
| 666 |
|
|
|
|
| 667 |
page_invoice_nos = [None] * doc.page_count
|
| 668 |
completed = 0
|
| 669 |
|
| 670 |
for i, future, text_result in futures:
|
| 671 |
try:
|
| 672 |
if text_result:
|
|
|
|
| 673 |
page_invoice_nos[i] = text_result
|
| 674 |
completed += 1
|
| 675 |
else:
|
|
|
|
| 676 |
result = future.result(timeout=30)
|
| 677 |
page_invoice_nos[i] = result
|
| 678 |
completed += 1
|
|
|
|
| 698 |
|
| 699 |
page_invoice_nos = [None] * doc.page_count
|
| 700 |
|
|
|
|
| 701 |
page = doc.load_page(0)
|
| 702 |
page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
|
| 703 |
print(f" β Page 1: {page_invoice_nos[0]}")
|
| 704 |
|
|
|
|
| 705 |
sample_interval = max(3, doc.page_count // 20)
|
| 706 |
print(f" Sampling interval: every {sample_interval} pages")
|
| 707 |
|
|
|
|
| 713 |
if i % 10 == 0:
|
| 714 |
print(f" Sampling page {i+1}/{doc.page_count}...")
|
| 715 |
|
|
|
|
| 716 |
prev_known_idx = i - sample_interval
|
| 717 |
while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
|
| 718 |
prev_known_idx -= 1
|
|
|
|
| 726 |
page_invoice_nos[idx] = extract_invoice_no_from_page(
|
| 727 |
page, is_image_pdf)
|
| 728 |
|
|
|
|
| 729 |
if page_invoice_nos[-1] is None:
|
| 730 |
page = doc.load_page(doc.page_count - 1)
|
| 731 |
page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
|
| 732 |
print(f" β Last page: {page_invoice_nos[-1]}")
|
| 733 |
|
|
|
|
| 734 |
last_known = page_invoice_nos[0]
|
| 735 |
filled = 0
|
| 736 |
for i in range(len(page_invoice_nos)):
|
|
|
|
| 744 |
return page_invoice_nos
|
| 745 |
|
| 746 |
# ============================================================================
|
| 747 |
+
# PDF PROCESSING FUNCTIONS
|
| 748 |
# ============================================================================
|
| 749 |
|
| 750 |
|
|
|
|
| 786 |
has_digit = any(c.isdigit() for c in candidate)
|
| 787 |
return has_letter and has_digit
|
| 788 |
|
|
|
|
|
|
|
| 789 |
|
| 790 |
def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
| 791 |
+
"""Universal label-first extraction with smart prioritization"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
if not text:
|
| 793 |
return None
|
| 794 |
|
| 795 |
text_norm = normalize_text_for_search(text)
|
| 796 |
|
|
|
|
| 797 |
if len(text_norm) > 0:
|
| 798 |
print(f"\n{'='*70}")
|
| 799 |
print(f"π ANALYZING TEXT (first 800 chars):")
|
| 800 |
print(f"{text_norm[:800]}")
|
| 801 |
print(f"{'='*70}\n")
|
| 802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
label_patterns = [
|
|
|
|
| 804 |
(r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
|
| 805 |
(r"Inv\s*(?:No\.?|Number)", "Inv No", True),
|
| 806 |
(r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
|
| 807 |
(r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
|
| 808 |
(r"Document\s*(?:No\.?|Number)", "Document No", True),
|
|
|
|
|
|
|
| 809 |
(r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
|
| 810 |
(r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
|
| 811 |
(r"Reference\s*(?:No\.?|Number)", "Reference No", False),
|
|
|
|
| 816 |
|
| 817 |
for label_pattern, label_name, is_invoice_label in label_patterns:
|
| 818 |
header_text = text_norm[:2000]
|
|
|
|
|
|
|
| 819 |
label_matches = list(re.finditer(
|
| 820 |
label_pattern, header_text, re.IGNORECASE))
|
| 821 |
|
| 822 |
for label_match in label_matches:
|
| 823 |
start_pos = label_match.end()
|
|
|
|
| 824 |
text_after_label = header_text[start_pos:start_pos + 200]
|
| 825 |
|
| 826 |
print(
|
|
|
|
| 828 |
print(
|
| 829 |
f" Text after label (first 80 chars): '{text_after_label[:80]}...'")
|
| 830 |
|
|
|
|
| 831 |
all_candidates = re.findall(
|
| 832 |
+
r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b', text_after_label, re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
| 833 |
|
| 834 |
print(
|
| 835 |
f" Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
|
| 836 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
for pass_number in [1, 2]:
|
| 838 |
if pass_number == 2 and len(all_candidates) > 0:
|
| 839 |
print(f" π Second pass: Trying alphanumeric candidates...")
|
|
|
|
| 841 |
for candidate in all_candidates:
|
| 842 |
invoice_num = candidate.strip(".,;:-_")
|
| 843 |
|
|
|
|
| 844 |
if len(invoice_num) < 3:
|
| 845 |
continue
|
| 846 |
|
|
|
|
| 847 |
is_pure_numeric = invoice_num.isdigit()
|
| 848 |
is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
|
| 849 |
|
| 850 |
if pass_number == 1:
|
|
|
|
| 851 |
if not (is_pure_numeric and is_ideal_invoice_length):
|
| 852 |
continue
|
| 853 |
print(
|
| 854 |
f" β¨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
|
| 855 |
else:
|
|
|
|
| 856 |
if is_pure_numeric and is_ideal_invoice_length:
|
| 857 |
continue
|
| 858 |
print(f" π Evaluating candidate: '{invoice_num}'")
|
| 859 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
|
| 861 |
"INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
|
| 862 |
"DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
|
|
|
|
| 864 |
print(f" β οΈ Skipped: noise word")
|
| 865 |
continue
|
| 866 |
|
|
|
|
| 867 |
if not is_invoice_label:
|
| 868 |
if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
|
| 869 |
print(
|
| 870 |
f" β οΈ Skipped: batch pattern (non-invoice context)")
|
| 871 |
continue
|
| 872 |
|
|
|
|
| 873 |
if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
|
| 874 |
print(f" β οΈ Skipped: license pattern")
|
| 875 |
continue
|
| 876 |
|
|
|
|
| 877 |
if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
|
| 878 |
+
print(f" β οΈ Skipped: state code/UIN pattern")
|
|
|
|
| 879 |
continue
|
| 880 |
|
| 881 |
+
if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
|
|
|
|
|
|
|
| 882 |
print(f" β οΈ Skipped: ACK number")
|
| 883 |
continue
|
| 884 |
|
| 885 |
+
if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
|
|
|
|
|
|
|
| 886 |
print(f" β οΈ Skipped: PH number")
|
| 887 |
continue
|
| 888 |
|
| 889 |
+
if re.search(rf"(?:UIN|UID|State\s*Code|D\.L\.No)\.?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
|
| 890 |
+
print(f" β οΈ Skipped: UIN/UID/State Code")
|
|
|
|
|
|
|
| 891 |
continue
|
| 892 |
|
| 893 |
+
if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
|
|
|
|
|
|
|
| 894 |
print(f" β οΈ Skipped: A/C number")
|
| 895 |
continue
|
| 896 |
|
|
|
|
| 897 |
if re.match(r'^[0-9]{10,11}$', invoice_num):
|
|
|
|
| 898 |
if len(invoice_num) == 10 and invoice_num[0] in '6789':
|
| 899 |
print(f" β οΈ Skipped: mobile number")
|
| 900 |
continue
|
|
|
|
| 901 |
if len(invoice_num) == 11 and invoice_num[0] == '0':
|
| 902 |
print(f" β οΈ Skipped: landline number")
|
| 903 |
continue
|
| 904 |
|
|
|
|
| 905 |
if re.match(r'^20\d{6}$', invoice_num):
|
| 906 |
+
print(f" β οΈ Skipped: date pattern")
|
| 907 |
continue
|
| 908 |
|
|
|
|
| 909 |
if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
|
| 910 |
+
print(f" β οΈ Skipped: date format")
|
| 911 |
continue
|
| 912 |
|
| 913 |
+
if len(invoice_num) == 15 and re.match(r'^\d{2}[A-Z]{5}\d{4}[A-Z]\d[A-Z]\d$', invoice_num, re.IGNORECASE):
|
| 914 |
+
print(f" β οΈ Skipped: GST number")
|
|
|
|
|
|
|
| 915 |
continue
|
| 916 |
|
|
|
|
| 917 |
print(f" β
β
β
ACCEPTED: '{invoice_num}'")
|
| 918 |
return invoice_num.upper()
|
| 919 |
|
| 920 |
print(f" β οΈ No valid candidates found after '{label_name}'")
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
print("\nβ οΈ No labeled invoice number found, trying fallback extraction...")
|
| 923 |
|
| 924 |
top_text = text_norm[:1000]
|
| 925 |
|
|
|
|
| 926 |
credit_match = re.search(
|
| 927 |
+
r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})", text_norm, re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
| 928 |
if credit_match:
|
| 929 |
credit_num = credit_match.group(1).strip()
|
|
|
|
| 930 |
if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
|
| 931 |
print(f"β Fallback: Found CREDIT number: {credit_num}")
|
| 932 |
return credit_num.upper()
|
| 933 |
|
|
|
|
| 934 |
long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
|
| 935 |
for num in long_numerics:
|
| 936 |
+
if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
|
|
|
|
|
|
|
| 937 |
print(f"β οΈ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
|
| 938 |
continue
|
|
|
|
| 939 |
print(f"β Fallback: Found long numeric: {num}")
|
| 940 |
return num.upper()
|
| 941 |
|
|
|
|
| 942 |
medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
|
| 943 |
for num in medium_numerics:
|
|
|
|
| 944 |
if len(num) == 10 and num[0] in '6789':
|
| 945 |
continue
|
| 946 |
if len(num) == 11 and num[0] == '0':
|
| 947 |
continue
|
|
|
|
|
|
|
| 948 |
if len(num) == 8 and num.startswith('20'):
|
| 949 |
continue
|
| 950 |
+
if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
|
|
|
|
|
|
|
|
|
|
| 951 |
continue
|
|
|
|
| 952 |
print(f"β Fallback: Found medium numeric: {num}")
|
| 953 |
return num.upper()
|
| 954 |
|
| 955 |
print("β No invoice number found (labeled or unlabeled)")
|
| 956 |
return None
|
| 957 |
|
|
|
|
|
|
|
| 958 |
|
| 959 |
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
|
| 960 |
+
"""Extract invoice number from TEXT-BASED PDF with Zydus fallback"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
text = page.get_text("text") or ""
|
| 962 |
text_norm = normalize_text_for_search(text)
|
| 963 |
|
|
|
|
|
|
|
| 964 |
header_text = text_norm[:2500]
|
|
|
|
|
|
|
| 965 |
zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
|
| 966 |
|
| 967 |
if zydus_candidates:
|
|
|
|
|
|
|
| 968 |
zydus_number = zydus_candidates[0]
|
| 969 |
print(f" β
ZYDUS INVOICE DETECTED: {zydus_number}")
|
| 970 |
return zydus_number.upper()
|
| 971 |
|
|
|
|
| 972 |
inv = try_extract_invoice_from_text(text)
|
| 973 |
|
|
|
|
|
|
|
| 974 |
if inv:
|
|
|
|
| 975 |
if re.match(r'^10\d{12}$', inv):
|
| 976 |
+
print(f" β οΈ REJECTED Order ID (14-digit): {inv}")
|
|
|
|
|
|
|
|
|
|
| 977 |
inv = None
|
| 978 |
else:
|
|
|
|
| 979 |
return inv
|
| 980 |
|
|
|
|
| 981 |
for block in (page.get_text("blocks") or []):
|
| 982 |
block_text = block[4] if len(block) > 4 else ""
|
| 983 |
if block_text:
|
| 984 |
inv = try_extract_invoice_from_text(block_text)
|
| 985 |
if inv:
|
|
|
|
| 986 |
if re.match(r'^10\d{12}$', inv):
|
| 987 |
+
print(f" β οΈ REJECTED Order ID from block: {inv}")
|
| 988 |
+
continue
|
|
|
|
| 989 |
else:
|
| 990 |
return inv
|
| 991 |
|
|
|
|
|
|
|
| 992 |
blocks = page.get_text("blocks") or []
|
| 993 |
sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
|
| 994 |
|
| 995 |
+
for block in sorted_blocks[:15]:
|
| 996 |
block_text = block[4] if len(block) > 4 else ""
|
| 997 |
if block_text:
|
| 998 |
block_norm = normalize_text_for_search(block_text)
|
|
|
|
| 1002 |
print(f" β
ZYDUS BLOCK DETECTION: {number}")
|
| 1003 |
return number.upper()
|
| 1004 |
|
|
|
|
|
|
|
| 1005 |
print(f" β οΈ No valid invoice found on this page (will use forward-fill)")
|
| 1006 |
return None
|
| 1007 |
|
| 1008 |
|
| 1009 |
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
|
| 1010 |
+
"""Extract invoice number from a single page"""
|
| 1011 |
text_result = extract_invoice_text_based(page)
|
| 1012 |
if text_result:
|
| 1013 |
return text_result
|
|
|
|
| 1034 |
except Exception as e:
|
| 1035 |
print(f"β οΈ Cleanup warning: {e}")
|
| 1036 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1037 |
|
| 1038 |
def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
|
| 1039 |
+
"""Merge null first page with first invoice"""
|
|
|
|
|
|
|
|
|
|
| 1040 |
if len(groups) >= 2:
|
| 1041 |
first_group = groups[0]
|
| 1042 |
second_group = groups[1]
|
| 1043 |
|
|
|
|
| 1044 |
if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
|
| 1045 |
print(f"\nπ§ AUTO-FIX: Merging null first page(s) with first invoice")
|
| 1046 |
print(
|
|
|
|
| 1048 |
print(
|
| 1049 |
f" First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
|
| 1050 |
|
|
|
|
| 1051 |
merged_pages = first_group["pages"] + second_group["pages"]
|
| 1052 |
second_group["pages"] = merged_pages
|
|
|
|
|
|
|
| 1053 |
groups.pop(0)
|
| 1054 |
|
| 1055 |
print(
|
|
|
|
| 1066 |
async def split_invoices(
|
| 1067 |
background_tasks: BackgroundTasks,
|
| 1068 |
file: UploadFile = File(...),
|
| 1069 |
+
batch_id: str = Form(..., description="Batch ID (required)"),
|
|
|
|
| 1070 |
use_blob_storage: bool = Form(
|
| 1071 |
True, description="Upload PDFs to Azure Blob Storage"),
|
| 1072 |
blob_container: Optional[str] = Form(
|
| 1073 |
+
None, description="Custom Azure container"),
|
| 1074 |
include_base64: bool = Form(
|
| 1075 |
False, description="Include base64 in response"),
|
| 1076 |
parallel_batch_size: int = Form(
|
| 1077 |
+
MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls"),
|
| 1078 |
use_smart_sampling: bool = Form(
|
| 1079 |
+
USE_SMART_SAMPLING, description="Use smart sampling"),
|
| 1080 |
max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
|
| 1081 |
):
|
| 1082 |
+
"""Universal Invoice Splitter with RPM Management"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
|
| 1084 |
if not file.filename:
|
| 1085 |
raise HTTPException(status_code=400, detail="No filename provided")
|
| 1086 |
|
| 1087 |
filename_lower = file.filename.lower()
|
|
|
|
|
|
|
| 1088 |
SUPPORTED_EXTENSIONS = ['.pdf', '.png',
|
| 1089 |
'.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 1090 |
|
|
|
|
| 1096 |
|
| 1097 |
if not file_extension:
|
| 1098 |
raise HTTPException(
|
| 1099 |
+
status_code=400, detail=f"Unsupported file format. Supported: {', '.join(SUPPORTED_EXTENSIONS)}")
|
|
|
|
|
|
|
| 1100 |
|
| 1101 |
is_image_file = file_extension in [
|
| 1102 |
'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 1103 |
|
| 1104 |
if is_image_file and not GEMINI_AVAILABLE:
|
| 1105 |
raise HTTPException(
|
| 1106 |
+
status_code=500, detail="Image processing requires PIL")
|
|
|
|
|
|
|
| 1107 |
|
| 1108 |
if use_blob_storage and not get_blob_service_client():
|
| 1109 |
raise HTTPException(
|
|
|
|
| 1211 |
print(
|
| 1212 |
f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
|
| 1213 |
|
|
|
|
| 1214 |
page_invoice_nos_normalized = []
|
| 1215 |
for v in page_invoice_nos:
|
| 1216 |
if v and v.upper().startswith("GST"):
|
|
|
|
| 1221 |
else:
|
| 1222 |
page_invoice_nos_normalized.append(None)
|
| 1223 |
|
|
|
|
| 1224 |
page_invoice_nos_filled = []
|
| 1225 |
last_known_invoice = None
|
| 1226 |
|
|
|
|
| 1241 |
page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
|
| 1242 |
print(f" β’ {inv_no}: {page_count} pages")
|
| 1243 |
|
|
|
|
| 1244 |
groups = []
|
| 1245 |
current_group = []
|
| 1246 |
current_invoice = None
|
|
|
|
| 1251 |
current_group = [idx]
|
| 1252 |
else:
|
| 1253 |
if inv != current_invoice:
|
| 1254 |
+
groups.append({"invoice_no": current_invoice,
|
| 1255 |
+
"pages": current_group[:]})
|
|
|
|
|
|
|
| 1256 |
print(
|
| 1257 |
f" π Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1258 |
current_invoice = inv
|
|
|
|
| 1261 |
current_group.append(idx)
|
| 1262 |
|
| 1263 |
if current_group:
|
| 1264 |
+
groups.append({"invoice_no": current_invoice,
|
| 1265 |
+
"pages": current_group[:]})
|
|
|
|
|
|
|
| 1266 |
print(
|
| 1267 |
f" π Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1268 |
|
| 1269 |
if len(groups) == 1 and groups[0]["invoice_no"] is None:
|
| 1270 |
+
groups = [{"invoice_no": None,
|
| 1271 |
+
"pages": list(range(doc.page_count))}]
|
|
|
|
|
|
|
| 1272 |
|
|
|
|
| 1273 |
groups = merge_first_null_group(groups)
|
| 1274 |
|
| 1275 |
print(f"\nβ
Created {len(groups)} invoice groups (after auto-merge)")
|
| 1276 |
print(
|
| 1277 |
f" Forward-filled {filled_count} pages with missing invoice numbers")
|
| 1278 |
|
|
|
|
| 1279 |
print(f"\nπ¨ Building and uploading split invoices...")
|
| 1280 |
all_parts = []
|
| 1281 |
|
|
|
|
| 1357 |
"unique_invoice_numbers": len(unique_invoices),
|
| 1358 |
"extraction_method": "gemini" if is_image_pdf else "text",
|
| 1359 |
"pages_forward_filled": filled_count,
|
| 1360 |
+
"storage_type": "azure_blob" if use_blob_storage else "base64",
|
| 1361 |
+
"model_used": get_current_model_config()['name']
|
| 1362 |
},
|
| 1363 |
"performance": {
|
| 1364 |
"total_time_seconds": round(total_time, 2),
|
|
|
|
| 1380 |
f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
|
| 1381 |
print(f" Split invoices: {len(all_parts)}")
|
| 1382 |
print(f" Unique invoice numbers: {len(unique_invoices)}")
|
| 1383 |
+
print(f" Model used: {get_current_model_config()['name']}")
|
| 1384 |
print(f" Total time: {total_time:.1f}s")
|
| 1385 |
print(
|
| 1386 |
f" Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
|
|
|
|
| 1410 |
background_tasks: BackgroundTasks,
|
| 1411 |
container_name: Optional[str] = Form(None)
|
| 1412 |
):
|
| 1413 |
+
"""Delete all blobs for a specific batch"""
|
| 1414 |
if container_name is None:
|
| 1415 |
container_name = AZURE_CONTAINER_NAME
|
| 1416 |
|
|
|
|
| 1425 |
})
|
| 1426 |
|
| 1427 |
|
| 1428 |
+
@app.get("/quota-status")
def quota_status():
    """Report request-quota usage for every configured Gemini model.

    Each entry of ``GEMINI_MODELS`` is first passed to
    ``reset_model_quota_counters`` and then read while holding
    ``quota_manager_lock``.

    Returns:
        JSONResponse whose ``models`` list holds, per model, its name, whether
        it is the currently selected model, RPM/RPD usage against the
        configured limits and the result of ``can_use_model``; plus an ISO
        formatted ``timestamp``.
    """
    snapshots = []
    for position, cfg in enumerate(GEMINI_MODELS):
        # Runs before the snapshot is taken.
        # NOTE(review): presumably rolls over expired RPM/RPD windows - confirm.
        reset_model_quota_counters(cfg)
        with quota_manager_lock:
            entry = {"model": cfg["name"]}
            entry["is_current"] = position == current_model_index
            entry["rpm"] = {
                "used": cfg["current_rpm"],
                "limit": cfg["max_requests_per_minute"],
            }
            entry["rpd"] = {
                "used": cfg["current_rpd"],
                "limit": cfg["max_requests_per_day"],
            }
            entry["available"] = can_use_model(cfg)
            snapshots.append(entry)

    payload = {"models": snapshots, "timestamp": datetime.now().isoformat()}
    return JSONResponse(payload)
|
| 1443 |
+
|
| 1444 |
+
|
| 1445 |
@app.get("/")
async def root():
    """Return service metadata for the index route.

    The response names the service and version, lists the feature flags (all
    reported as enabled), the names of every model in ``GEMINI_MODELS`` and the
    name of the model returned by ``get_current_model_config``.
    """
    feature_names = (
        "multi_format_support",
        "zydus_healthcare_support",
        "auto_merge_null_groups",
        "azure_blob_storage",
        "parallel_processing",
        "rpm_management",
        "multi_model_fallback",
    )

    info = {
        "service": "Universal Invoice Splitter API with RPM Management",
        "version": "4.0",
        "status": "running",
        # Every advertised feature is reported as switched on.
        "features": dict.fromkeys(feature_names, True),
    }
    info["models"] = [cfg["name"] for cfg in GEMINI_MODELS]
    info["current_model"] = get_current_model_config()['name']
    return info
|
| 1463 |
|
| 1464 |
|
| 1465 |
@app.get("/health")
async def health():
    """Return a liveness report with configuration and quota flags.

    The report states whether a Gemini API key is set, whether Azure storage
    credentials are present (a connection string, or an account name together
    with an account key), which model is currently selected and whether
    ``can_use_model`` accepts that model.
    """
    active_model = get_current_model_config()

    report = {"status": "healthy", "timestamp": datetime.now().isoformat()}
    report["gemini_configured"] = bool(GEMINI_API_KEY)

    # A connection string alone is sufficient; otherwise both the account
    # name and the account key must be provided.
    if AZURE_STORAGE_CONNECTION_STRING:
        azure_ready = True
    else:
        azure_ready = bool(
            AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY)
    report["azure_configured"] = azure_ready

    report["current_model"] = active_model['name']
    report["quota_available"] = can_use_model(active_model)
    return report
|
| 1476 |
|
| 1477 |
if __name__ == "__main__":
    import uvicorn

    # Script entry point: seed the quota windows, print a startup banner and
    # launch the ASGI server on HOST/PORT.

    # Stamp every model with the start of its per-minute and per-day quota
    # windows. One shared timestamp keeps all windows aligned.
    # NOTE(review): this only runs when the file is executed as a script; if
    # the app is served via an external `uvicorn app:app`, confirm these keys
    # are initialised elsewhere.
    startup_time = datetime.now()
    for model in GEMINI_MODELS:
        model["last_rpm_reset"] = startup_time
        model["last_rpd_reset"] = startup_time

    banner = "=" * 80

    # NOTE(review): the leading glyph of the "π ..." messages below is garbled
    # in the source and is kept verbatim - restore the intended emoji.
    print("\n" + banner)
    print("π Starting Universal Invoice Splitter API with RPM Management")
    print(banner)
    print("✅ Supports ALL invoice types")
    print("✅ Zydus Healthcare fallback (23xxxxxxxx pattern)")
    print("✅ Auto-merge null first pages")
    print("✅ RPM/RPD quota management")
    print("✅ Multi-model fallback")
    print(banner)
    print("π Model Chain:")
    for rank, model in enumerate(GEMINI_MODELS, start=1):
        print(f" {rank}. {model['name']}")
        print(
            f" RPM: {model['max_requests_per_minute']}, RPD: {model['max_requests_per_day']}")
    print(banner)
    # Report the address the server actually binds to instead of a
    # hard-coded 127.0.0.1:8000 that does not match HOST/PORT.
    print(f"π Server: http://{HOST}:{PORT}")
    print(banner + "\n")

    uvicorn.run(app, host=HOST, port=PORT, log_level="info")
|