Spaces:

anujakkulkarni
/

splitpdffile

Paused

App Files Community

anujakkulkarni committed on Feb 14

Commit

0e0c157

verified ·

1 Parent(s): 91ed191

Update app.py

Browse files

Files changed (1) hide show

app.py +359 -314

app.py CHANGED Viewed

@@ -7,8 +7,11 @@ import tempfile
 import uuid
 import asyncio
 from typing import List, Dict, Optional, Tuple
-from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
@@ -16,7 +19,10 @@ from fastapi.responses import JSONResponse
 from starlette.requests import Request
 import fitz  # PyMuPDF
 import google.generativeai as genai
 from PIL import Image
 # Azure Blob Storage
 try:
@@ -40,6 +46,10 @@ except ImportError:
 from datetime import datetime, timedelta
 app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
 # Increase request body size limit
@@ -54,7 +64,7 @@ app.add_middleware(
 )
 # ============================================================================
-# ⭐ CONFIGURATION FROM ENVIRONMENT VARIABLES (Hugging Face Secrets)
 # ============================================================================
 # Gemini API Key - REQUIRED for image-based PDFs
@@ -84,6 +94,10 @@ USE_SMART_SAMPLING = os.environ.get(
 HOST = os.environ.get("HOST", "0.0.0.0")  # Hugging Face uses 0.0.0.0
 PORT = int(os.environ.get("PORT", "7860"))  # Hugging Face default port
 # ============================================================================
 # GLOBAL VARIABLES
 # ============================================================================
@@ -92,6 +106,196 @@ gemini_model = None
 blob_service_client = None
 # ============================================================================
 # STARTUP VALIDATION
 # ============================================================================
@@ -102,35 +306,28 @@ def validate_configuration():
     warnings = []
     errors = []
-    # Check Gemini API Key
     if not GEMINI_API_KEY:
         warnings.append(
             "⚠️  GEMINI_API_KEY not set - image-based PDFs will not work")
     else:
         print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
-    # Check Azure credentials
     if not AZURE_STORAGE_CONNECTION_STRING:
         if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
-            errors.append(
-                "❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")
         else:
             print(
                 f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
     else:
         print(f"✅ Azure connection string configured")
-    # Print all warnings
     for warning in warnings:
         print(warning)
-    # Print all errors
     for error in errors:
         print(error)
     if errors:
         print("\n⚠️  WARNING: Some required credentials are missing!")
-        print("   Set them in Hugging Face Spaces Settings > Repository secrets")
     return len(errors) == 0
@@ -156,9 +353,7 @@ def get_blob_service_client():
             elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
                 account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
                 blob_service_client = BlobServiceClient(
-                    account_url=account_url,
-                    credential=AZURE_STORAGE_ACCOUNT_KEY
-                )
                 print("✅ Azure Blob Storage initialized with account key")
             else:
                 print("⚠️ WARNING: No Azure credentials configured")
@@ -199,17 +394,13 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
             raise HTTPException(
                 status_code=500, detail="Azure Blob Storage not configured")
-        # Clean filename for folder name
         base_filename = os.path.splitext(filename)[0]
         safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
         blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
-        # Get blob client
         blob_client = client.get_blob_client(
             container=container_name, blob=blob_name)
-        # Upload PDF
         print(f"📤 Uploading raw PDF to: {blob_name}")
         blob_client.upload_blob(
             pdf_bytes,
@@ -223,7 +414,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
             }
         )
-        # Generate SAS URL (valid for 24 hours)
         expiry_hours = 24
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT_NAME,
@@ -234,7 +424,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
             expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
         )
-        # Construct URLs
         blob_url = blob_client.url
         download_url = f"{blob_url}?{sas_token}"
         expires_at = (datetime.utcnow() +
@@ -273,17 +462,13 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
             raise HTTPException(
                 status_code=500, detail="Azure Blob Storage not configured")
-        # Clean original filename for folder name
         base_filename = os.path.splitext(original_filename)[0]
         safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
         blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
-        # Get blob client
         blob_client = client.get_blob_client(
             container=container_name, blob=blob_name)
-        # Upload PDF
         blob_client.upload_blob(
             pdf_bytes,
             overwrite=True,
@@ -297,7 +482,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
             }
         )
-        # Generate SAS URL (valid for 24 hours)
         expiry_hours = 24
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT_NAME,
@@ -308,7 +492,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
             expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
         )
-        # Construct URLs
         blob_url = blob_client.url
         download_url = f"{blob_url}?{sas_token}"
         expires_at = (datetime.utcnow() +
@@ -344,7 +527,6 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
             return
         container_client = client.get_container_client(container_name)
         prefix = f"{ROOT_FOLDER}/{batch_id}/"
         blobs = container_client.list_blobs(name_starts_with=prefix)
@@ -365,103 +547,88 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
 def get_gemini_model():
-    """Get or create Gemini model instance."""
-    global gemini_model
     if not GEMINI_AVAILABLE:
         return None
-    if gemini_model is None:
-        if not GEMINI_API_KEY:
-            return None
-        try:
-            genai.configure(api_key=GEMINI_API_KEY)
-            # Use Gemini 2.5 Flash
-            gemini_model = genai.GenerativeModel('gemini-2.5-flash')
-            print("✅ Google Gemini initialized")
-        except Exception as e:
-            print(f"❌ Failed to initialize Gemini: {e}")
-            return None
-    return gemini_model
-def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
-    """Enhanced Gemini extraction with improved prompts and fallback."""
-    model = get_gemini_model()
-    if not model:
-        return None
-    img = None
     try:
-        # Reduced resolution for faster processing
         pix = page.get_pixmap(matrix=fitz.Matrix(
             GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
         img_bytes = pix.tobytes("png")
         pix = None
-        img = Image.open(io.BytesIO(img_bytes))
-        # ⭐ ENHANCED PROMPT: More specific instructions
         prompt = """Look at this invoice image and extract ONLY the invoice number.
-IMPORTANT:
-- Look for labels: "Invoice No", "Invoice Number", "Bill No", "Document No"
-- The invoice number is the value RIGHT AFTER these labels
-- DO NOT extract: ACK numbers, Account numbers (A/C No), Order numbers
-- Return ONLY the invoice number (letters and numbers, e.g., G031663, DHC036747)
-- If not found, return: NONE
-Invoice Number:"""
-        response = model.generate_content([prompt, img])
-        if response and response.text:
-            extracted_text = response.text.strip()
-            # Clean up the response
-            cleaned = extracted_text.replace(
-                "*", "").replace("#", "").replace("Invoice Number:", "").strip()
-            print(f"    🤖 Gemini raw response: '{extracted_text}'")
-            print(f"    🤖 Gemini cleaned: '{cleaned}'")
-            # Basic validation
-            if cleaned and cleaned.upper() != "NONE" and len(cleaned) >= 3:
-                # Remove any remaining labels
-                cleaned = re.sub(
-                    r'^(Invoice|Bill|Document)\s+(No\.?|Number)[\s\.:]*', '', cleaned, flags=re.IGNORECASE)
-                cleaned = cleaned.strip(".,;:-_")
-                if len(cleaned) >= 3:
-                    print(f"    ✅ Gemini extracted: {cleaned}")
-                    img.close()
-                    return cleaned.upper()
-        # ⭐ FALLBACK: Full OCR + regex extraction
-        print("    ⚠️ Gemini direct extraction failed, trying full OCR...")
-        ocr_prompt = """Extract ALL text from this invoice image.
-Return the complete text content exactly as it appears, preserving all labels and values."""
-        ocr_response = model.generate_content([ocr_prompt, img])
-        if ocr_response and ocr_response.text:
-            ocr_text = ocr_response.text
-            print(
-                f"\n    🔍 Gemini OCR text (first 500 chars):\n{ocr_text[:500]}\n")
-            # Try our extraction function on the OCR text
-            inv = try_extract_invoice_from_text(ocr_text)
-            if inv:
-                img.close()
-                return inv
-        if img:
-            img.close()
-        return None
     except Exception as e:
-        print(f"    ❌ Gemini error: {e}")
-        if img:
-            img.close()
         return None
@@ -471,7 +638,6 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
     page_invoice_nos = []
     if not is_image_pdf:
-        # Fast text-based extraction (no parallelization needed)
         print(f"  📝 Text-based extraction (sequential)")
         for i in range(doc.page_count):
             if i % 50 == 0:
@@ -484,37 +650,29 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
                 gc.collect()
         return page_invoice_nos
-    # Image-based PDF: Use parallel Gemini processing
     print(f"  🚀 Image-based extraction (parallel, batch_size={batch_size})")
-    # Use ThreadPoolExecutor for parallel API calls
     with ThreadPoolExecutor(max_workers=batch_size) as executor:
         futures = []
-        # Submit all pages to thread pool
         for i in range(doc.page_count):
             page = doc.load_page(i)
-            # First try text extraction (fast)
             text_result = extract_invoice_text_based(page)
             if text_result:
                 futures.append((i, None, text_result))
             else:
-                # Submit to Gemini thread pool
                 future = executor.submit(extract_invoice_gemini_sync, page)
                 futures.append((i, future, None))
-        # Collect results in order
         page_invoice_nos = [None] * doc.page_count
         completed = 0
         for i, future, text_result in futures:
             try:
                 if text_result:
-                    # Already extracted from text
                     page_invoice_nos[i] = text_result
                     completed += 1
                 else:
-                    # Wait for Gemini result
                     result = future.result(timeout=30)
                     page_invoice_nos[i] = result
                     completed += 1
@@ -540,12 +698,10 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
     page_invoice_nos = [None] * doc.page_count
-    # Always extract from first page
     page = doc.load_page(0)
     page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
     print(f"  ✓ Page 1: {page_invoice_nos[0]}")
-    # Sample every Nth page to detect changes
     sample_interval = max(3, doc.page_count // 20)
     print(f"  Sampling interval: every {sample_interval} pages")
@@ -557,7 +713,6 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
         if i % 10 == 0:
             print(f"  Sampling page {i+1}/{doc.page_count}...")
-        # If invoice changed, extract nearby pages to find exact boundary
         prev_known_idx = i - sample_interval
         while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
             prev_known_idx -= 1
@@ -571,13 +726,11 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
                     page_invoice_nos[idx] = extract_invoice_no_from_page(
                         page, is_image_pdf)
-    # Also check last page
     if page_invoice_nos[-1] is None:
         page = doc.load_page(doc.page_count - 1)
         page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
         print(f"  ✓ Last page: {page_invoice_nos[-1]}")
-    # Forward-fill gaps
     last_known = page_invoice_nos[0]
     filled = 0
     for i in range(len(page_invoice_nos)):
@@ -591,7 +744,7 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
     return page_invoice_nos
 # ============================================================================
-# PDF PROCESSING FUNCTIONS (KEEP ORIGINAL + ADD ZYDUS FALLBACK)
 # ============================================================================
@@ -633,48 +786,26 @@ def is_valid_invoice_number(candidate: str) -> bool:
     has_digit = any(c.isdigit() for c in candidate)
     return has_letter and has_digit
-# ⭐ KEEP YOUR ORIGINAL EXTRACTION FUNCTION (Works for other invoices)
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
-    """
-    ⭐ UNIVERSAL LABEL-FIRST EXTRACTION with Smart Prioritization
-    Strategy:
-    1. Find invoice-related labels (Invoice No, Bill No, etc.)
-    2. Extract ALL potential candidates after the label
-    3. TWO-PASS: Prioritize pure numeric 12-14 digit numbers (common for invoices)
-    4. Filter out noise patterns (ACK, PH, A/C, state codes, etc.)
-    5. Return the first valid candidate
-    Works for ANY invoice format!
-    """
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
-    # ⭐ DEBUG: Print first 800 chars
     if len(text_norm) > 0:
         print(f"\n{'='*70}")
         print(f"🔍 ANALYZING TEXT (first 800 chars):")
         print(f"{text_norm[:800]}")
         print(f"{'='*70}\n")
-    # ============================================================================
-    # ⭐ PRIORITY 1: LABELED VALUE EXTRACTION (UNIVERSAL APPROACH)
-    # ============================================================================
-    # Define label patterns in PRIORITY ORDER
     label_patterns = [
-        # Invoice labels (highest priority)
         (r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
         (r"Inv\s*(?:No\.?|Number)", "Inv No", True),
         (r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
         (r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
         (r"Document\s*(?:No\.?|Number)", "Document No", True),
-        # Other labels (lower priority)
         (r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
         (r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
         (r"Reference\s*(?:No\.?|Number)", "Reference No", False),
@@ -685,14 +816,11 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
     for label_pattern, label_name, is_invoice_label in label_patterns:
         header_text = text_norm[:2000]
-        # Find ALL matches of this label
         label_matches = list(re.finditer(
             label_pattern, header_text, re.IGNORECASE))
         for label_match in label_matches:
             start_pos = label_match.end()
-            # Get a larger chunk of text after the label (200 chars)
             text_after_label = header_text[start_pos:start_pos + 200]
             print(
@@ -700,22 +828,12 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
             print(
                 f"   Text after label (first 80 chars): '{text_after_label[:80]}...'")
-            # ⭐ UNIVERSAL APPROACH: Extract ALL potential candidates (alphanumeric tokens)
             all_candidates = re.findall(
-                r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b',
-                text_after_label,
-                re.IGNORECASE
-            )
             print(
                 f"   Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
-            # ============================================================================
-            # ⭐ TWO-PASS SMART PRIORITIZATION
-            # Pass 1: Pure numeric 12-14 digit numbers (very common for invoices)
-            # Pass 2: Alphanumeric candidates (only if Pass 1 fails)
-            # ============================================================================
             for pass_number in [1, 2]:
                 if pass_number == 2 and len(all_candidates) > 0:
                     print(f"   🔄 Second pass: Trying alphanumeric candidates...")
@@ -723,31 +841,22 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
                 for candidate in all_candidates:
                     invoice_num = candidate.strip(".,;:-_")
-                    # Skip if too short
                     if len(invoice_num) < 3:
                         continue
-                    # ⭐ SMART FILTERING: First pass only accepts pure numeric 12-14 digits
                     is_pure_numeric = invoice_num.isdigit()
                     is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
                     if pass_number == 1:
-                        # First pass: Only consider pure numeric 12-14 digits
                         if not (is_pure_numeric and is_ideal_invoice_length):
                             continue
                         print(
                             f"   ✨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
                     else:
-                        # Second pass: Skip ones already checked in first pass
                         if is_pure_numeric and is_ideal_invoice_length:
                             continue
                         print(f"   🔍 Evaluating candidate: '{invoice_num}'")
-                    # ====================================================================
-                    # ⭐ COMPREHENSIVE BLACKLIST FILTER
-                    # ====================================================================
-                    # Skip noise words
                     if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
                                                "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
                                                "DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
@@ -755,204 +864,135 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
                         print(f"      ⚠️ Skipped: noise word")
                         continue
-                    # Context-aware batch pattern filter
                     if not is_invoice_label:
                         if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
                             print(
                                 f"      ⚠️ Skipped: batch pattern (non-invoice context)")
                             continue
-                    # Skip license patterns (XX-XXX-123456)
                     if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: license pattern")
                         continue
-                    # ⭐ NEW: Skip state code / UIN patterns (MHMY-4501110485 format)
                     if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
-                        print(
-                            f"      ⚠️ Skipped: state code/UIN pattern (XXXX-nnnnnnnnnn)")
                         continue
-                    # Skip ACK numbers
-                    if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
-                                 text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: ACK number")
                         continue
-                    # Skip PH (Phone) numbers
-                    if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
-                                 text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: PH number")
                         continue
-                    # ⭐ Skip UIN/UID/State Code
-                    if re.search(rf"(?:UIN|UID|State\s*Code|D\.L\.No)\.?\s*:?\s*{re.escape(invoice_num)}",
-                                 text_norm, re.IGNORECASE):
-                        print(f"      ⚠️ Skipped: UIN/UID/State Code/D.L.No")
                         continue
-                    # Skip A/C (Account) numbers
-                    if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}",
-                                 text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: A/C number")
                         continue
-                    # Skip phone numbers (10-11 digits)
                     if re.match(r'^[0-9]{10,11}$', invoice_num):
-                        # 10 digits starting with 6-9 (mobile)
                         if len(invoice_num) == 10 and invoice_num[0] in '6789':
                             print(f"      ⚠️ Skipped: mobile number")
                             continue
-                        # 11 digits starting with 0 (landline with STD code)
                         if len(invoice_num) == 11 and invoice_num[0] == '0':
                             print(f"      ⚠️ Skipped: landline number")
                             continue
-                    # Skip dates (8 digits starting with 20)
                     if re.match(r'^20\d{6}$', invoice_num):
-                        print(f"      ⚠️ Skipped: date pattern (20xxxxxx)")
                         continue
-                    # Skip date formats (dd/mm/yyyy or dd-mm-yyyy)
                     if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
-                        print(f"      ⚠️ Skipped: date format (dd/mm/yyyy)")
                         continue
-                    # Skip GST numbers (15 alphanumeric)
-                    if len(invoice_num) == 15 and re.match(r'^\d{2}[A-Z]{5}\d{4}[A-Z]\d[A-Z]\d$',
-                                                           invoice_num, re.IGNORECASE):
-                        print(f"      ⚠️ Skipped: GST number (15 chars)")
                         continue
-                    # ✅ VALID INVOICE NUMBER FOUND!
                     print(f"      ✅✅✅ ACCEPTED: '{invoice_num}'")
                     return invoice_num.upper()
             print(f"   ⚠️ No valid candidates found after '{label_name}'")
-    # ============================================================================
-    # ⭐ PRIORITY 2: FALLBACK - Unlabeled extraction
-    # ============================================================================
     print("\n⚠️ No labeled invoice number found, trying fallback extraction...")
     top_text = text_norm[:1000]
-    # Try CREDIT numbers (12-20 digits, excluding 14-digit account numbers)
     credit_match = re.search(
-        r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
-        text_norm,
-        re.IGNORECASE
-    )
     if credit_match:
         credit_num = credit_match.group(1).strip()
-        # Allow 12-14 digits, exclude exactly 14 if it might be account number
         if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
             print(f"✓ Fallback: Found CREDIT number: {credit_num}")
             return credit_num.upper()
-    # Try long numerics (12-20 digits), excluding problematic patterns
     long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
     for num in long_numerics:
-        # Skip if labeled as ACK, PH, A/C, UIN, etc.
-        if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
-                     text_norm, re.IGNORECASE):
             print(f"⚠️ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
             continue
         print(f"✓ Fallback: Found long numeric: {num}")
         return num.upper()
-    # Try medium numerics (10-15 digits, excluding phones and dates)
     medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
     for num in medium_numerics:
-        # Skip phone numbers
         if len(num) == 10 and num[0] in '6789':
             continue
         if len(num) == 11 and num[0] == '0':
             continue
-        # Skip dates
         if len(num) == 8 and num.startswith('20'):
             continue
-        # Skip if labeled as problematic
-        if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
-                     text_norm, re.IGNORECASE):
             continue
         print(f"✓ Fallback: Found medium numeric: {num}")
         return num.upper()
     print("✗ No invoice number found (labeled or unlabeled)")
     return None
-# ⭐ ENHANCED FUNCTION: Add Zydus Healthcare fallback (works with table layouts)
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
-    """
-    Extract invoice number from TEXT-BASED PDF.
-    ⭐ ZYDUS HEALTHCARE PRIORITY EXTRACTION:
-    Zydus Healthcare invoices have a specific pattern: 10-digit numbers starting with '23'
-    (e.g., 2310763135, 2310763275). These must be extracted BEFORE the original logic
-    runs, because the original logic will pick up 14-digit Order IDs instead.
-    """
     text = page.get_text("text") or ""
     text_norm = normalize_text_for_search(text)
-    # ⭐ STEP 1: ALWAYS check for Zydus pattern FIRST (before any other extraction)
-    # Look for 10-digit number starting with '23' in first 2500 chars
     header_text = text_norm[:2500]
-    # Find ALL occurrences of 23xxxxxxxx pattern
     zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
     if zydus_candidates:
-        # ⭐ CRITICAL: If we found any 23xxxxxxxx pattern, this is a Zydus invoice
-        # Return the FIRST occurrence (most likely to be the invoice number)
         zydus_number = zydus_candidates[0]
         print(f"    ✅ ZYDUS INVOICE DETECTED: {zydus_number}")
         return zydus_number.upper()
-    # ⭐ STEP 2: If NO Zydus pattern found, use original extraction logic
     inv = try_extract_invoice_from_text(text)
-    # ⭐ NEW: BLACKLIST FILTER for Zydus Healthcare invoices
-    # Reject 14-digit Order IDs (pattern: 107xxxxxxxxxx or 10xxxxxxxxxx with 14 digits)
     if inv:
-        # Check if this is a 14-digit number starting with '10' or '107'
         if re.match(r'^10\d{12}$', inv):
-            print(
-                f"    ⚠️ REJECTED Order ID (14-digit): {inv} - Looking for Zydus pattern instead...")
-            # This is likely a Zydus invoice page without the invoice number visible
-            # Skip this extraction and try other methods
             inv = None
         else:
-            # Valid invoice number from original extraction
             return inv
-    # ⭐ STEP 3: Try block-level extraction (original logic)
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
         if block_text:
             inv = try_extract_invoice_from_text(block_text)
             if inv:
-                # Check blacklist again
                 if re.match(r'^10\d{12}$', inv):
-                    print(
-                        f"    ⚠️ REJECTED Order ID from block (14-digit): {inv}")
-                    continue  # Skip this block, try next one
                 else:
                     return inv
-    # ⭐ STEP 4: Final fallback - try Zydus pattern in text blocks
-    # (For continuation pages where invoice number might be in a different block)
     blocks = page.get_text("blocks") or []
     sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
-    for block in sorted_blocks[:15]:  # Check first 15 blocks
         block_text = block[4] if len(block) > 4 else ""
         if block_text:
             block_norm = normalize_text_for_search(block_text)
@@ -962,14 +1002,12 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
                 print(f"    ✅ ZYDUS BLOCK DETECTION: {number}")
                 return number.upper()
-    # ⭐ STEP 5: Last resort - if still nothing found, return None
-    # The forward-fill logic will assign this page to the previous invoice
     print(f"    ⚠️ No valid invoice found on this page (will use forward-fill)")
     return None
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    """Extract invoice number from a single page (used by smart sampling)."""
     text_result = extract_invoice_text_based(page)
     if text_result:
         return text_result
@@ -996,21 +1034,13 @@ def remove_file(path: str):
     except Exception as e:
         print(f"⚠️ Cleanup warning: {e}")
-# ============================================================================
-# ⭐ NEW: MERGE FUNCTION FOR NULL FIRST GROUP
-# ============================================================================
 def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
-    """
-    If the first group has invoice_no = None and the second group has a valid invoice,
-    merge them together (page 1 is likely the cover page of the first invoice).
-    """
     if len(groups) >= 2:
         first_group = groups[0]
         second_group = groups[1]
-        # Check if first group is null and second group has invoice number
         if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
             print(f"\n🔧 AUTO-FIX: Merging null first page(s) with first invoice")
             print(
@@ -1018,11 +1048,8 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
             print(
                 f"   First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
-            # Merge: Add first group's pages to second group
             merged_pages = first_group["pages"] + second_group["pages"]
             second_group["pages"] = merged_pages
-            # Remove first null group
             groups.pop(0)
             print(
@@ -1039,35 +1066,25 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
 async def split_invoices(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(...),
-    batch_id: str = Form(...,
-                         description="Batch ID (required) - used for folder structure"),
     use_blob_storage: bool = Form(
         True, description="Upload PDFs to Azure Blob Storage"),
     blob_container: Optional[str] = Form(
-        None, description="Custom Azure container (optional)"),
     include_base64: bool = Form(
         False, description="Include base64 in response"),
     parallel_batch_size: int = Form(
-        MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls (1-10)"),
     use_smart_sampling: bool = Form(
-        USE_SMART_SAMPLING, description="Use smart sampling (faster, ~95% accurate)"),
     max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
 ):
-    """
-    ⭐ UNIVERSAL INVOICE SPLITTER
-    Works for ALL invoice types:
-    - Standard invoices (original extraction)
-    - Zydus Healthcare invoices (enhanced fallback for 23xxxxxxxx pattern)
-    - Auto-merges null first pages
-    """
     if not file.filename:
         raise HTTPException(status_code=400, detail="No filename provided")
     filename_lower = file.filename.lower()
-    # Supported formats
     SUPPORTED_EXTENSIONS = ['.pdf', '.png',
                             '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
@@ -1079,18 +1096,14 @@ async def split_invoices(
     if not file_extension:
         raise HTTPException(
-            status_code=400,
-            detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
-        )
     is_image_file = file_extension in [
         '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
     if is_image_file and not GEMINI_AVAILABLE:
         raise HTTPException(
-            status_code=500,
-            detail="Image processing requires PIL. Install: pip install Pillow"
-        )
     if use_blob_storage and not get_blob_service_client():
         raise HTTPException(
@@ -1198,7 +1211,6 @@ async def split_invoices(
             print(
                 f"   ... (showing first 10 of {len(page_invoice_nos)} pages)")
-        # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
         page_invoice_nos_normalized = []
         for v in page_invoice_nos:
             if v and v.upper().startswith("GST"):
@@ -1209,7 +1221,6 @@ async def split_invoices(
             else:
                 page_invoice_nos_normalized.append(None)
-        # Step 2: Smart forward-fill for failed extractions
         page_invoice_nos_filled = []
         last_known_invoice = None
@@ -1230,7 +1241,6 @@ async def split_invoices(
             page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
             print(f"      • {inv_no}: {page_count} pages")
-        # Step 3: Group consecutive pages by invoice number
         groups = []
         current_group = []
         current_invoice = None
@@ -1241,10 +1251,8 @@ async def split_invoices(
                 current_group = [idx]
             else:
                 if inv != current_invoice:
-                    groups.append({
-                        "invoice_no": current_invoice,
-                        "pages": current_group[:]
-                    })
                     print(
                         f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
                     current_invoice = inv
@@ -1253,27 +1261,21 @@ async def split_invoices(
                     current_group.append(idx)
         if current_group:
-            groups.append({
-                "invoice_no": current_invoice,
-                "pages": current_group[:]
-            })
             print(
                 f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
         if len(groups) == 1 and groups[0]["invoice_no"] is None:
-            groups = [{
-                "invoice_no": None,
-                "pages": list(range(doc.page_count))
-            }]
-        # ⭐ NEW: Auto-merge first null group
         groups = merge_first_null_group(groups)
         print(f"\n✅ Created {len(groups)} invoice groups (after auto-merge)")
         print(
             f"   Forward-filled {filled_count} pages with missing invoice numbers")
-        # Build and upload split PDFs
         print(f"\n🔨 Building and uploading split invoices...")
         all_parts = []
@@ -1355,7 +1357,8 @@ async def split_invoices(
                 "unique_invoice_numbers": len(unique_invoices),
                 "extraction_method": "gemini" if is_image_pdf else "text",
                 "pages_forward_filled": filled_count,
-                "storage_type": "azure_blob" if use_blob_storage else "base64"
             },
             "performance": {
                 "total_time_seconds": round(total_time, 2),
@@ -1377,6 +1380,7 @@ async def split_invoices(
             f"   Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
         print(f"   Split invoices: {len(all_parts)}")
         print(f"   Unique invoice numbers: {len(unique_invoices)}")
         print(f"   Total time: {total_time:.1f}s")
         print(
             f"   Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
@@ -1406,7 +1410,7 @@ async def cleanup_batch(
     background_tasks: BackgroundTasks,
     container_name: Optional[str] = Form(None)
 ):
-    """Delete all blobs for a specific batch (entire POD/{batch_id}/ folder)."""
     if container_name is None:
         container_name = AZURE_CONTAINER_NAME
@@ -1421,38 +1425,79 @@ async def cleanup_batch(
     })
 @app.get("/")
 async def root():
     return {
-        "service": "Universal Invoice Splitter API",
-        "version": "3.2",
         "status": "running",
         "features": {
             "multi_format_support": True,
             "zydus_healthcare_support": True,
             "auto_merge_null_groups": True,
             "azure_blob_storage": True,
-            "parallel_processing": True
-        }
     }
 @app.get("/health")
 async def health():
     return {
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "gemini_configured": bool(GEMINI_API_KEY),
-        "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
     }
 if __name__ == "__main__":
     import uvicorn
-    print("\n" + "="*70)
-    print("🚀 Starting Universal Invoice Splitter API")
-    print("="*70)
     print(f"✅ Supports ALL invoice types")
     print(f"✅ Zydus Healthcare fallback (23xxxxxxxx pattern)")
     print(f"✅ Auto-merge null first pages")
-    print("="*70 + "\n")
     uvicorn.run(app, host=HOST, port=PORT, log_level="info")

 import uuid
 import asyncio
 from typing import List, Dict, Optional, Tuple
+from collections import Counter, deque
 from concurrent.futures import ThreadPoolExecutor
+from threading import Lock, Thread, Event
+import time
+import logging
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.requests import Request
 import fitz  # PyMuPDF
 import google.generativeai as genai
+from google.api_core import exceptions as google_exceptions
 from PIL import Image
+import requests
+import base64
 # Azure Blob Storage
 try:
 from datetime import datetime, timedelta
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
 # Increase request body size limit
 )
 # ============================================================================
+# ⭐ CONFIGURATION
 # ============================================================================
 # Gemini API Key - REQUIRED for image-based PDFs
 HOST = os.environ.get("HOST", "0.0.0.0")  # Hugging Face uses 0.0.0.0
 PORT = int(os.environ.get("PORT", "7860"))  # Hugging Face default port
+MAX_WAIT_TIME = 300  # 5 minutes max wait for quota
+model_lock = Lock()
+quota_manager_lock = Lock()
 # ============================================================================
 # GLOBAL VARIABLES
 # ============================================================================
 blob_service_client = None
+GEMINI_REST_URL = "https://generativelanguage.googleapis.com/v1/models/{model}:generateContent?key={key}"
+def call_gemini_25(model_name: str, image_bytes: bytes, prompt: str) -> str:
+    """Send one image plus a prompt to the Gemini REST API and return the reply text.
+
+    The model queried on each attempt is whatever ``get_current_model_config()``
+    returns at that moment; ``model_name`` is accepted so existing callers keep
+    working but it does not select the model.
+
+    On HTTP 429/503 the current model is marked as having used up its
+    per-minute quota and the next model with quota is tried. When no model has
+    quota the call sleeps 60 seconds and retries, for at most ``MAX_WAIT_TIME``
+    seconds of sleeping in total.
+
+    Args:
+        model_name: Unused; kept for backward compatibility.
+        image_bytes: Image data, sent as ``image/png`` inline data.
+        prompt: Text instruction sent alongside the image.
+
+    Returns:
+        The text of the first part of the first candidate in the response.
+
+    Raises:
+        RuntimeError: On any status other than 200/429/503, on a 200 response
+            that carries no candidate text, or when quota stays exhausted
+            after ``MAX_WAIT_TIME`` seconds of waiting.
+    """
+    # The request body does not depend on the model, so build it once.
+    encoded = base64.b64encode(image_bytes).decode("utf-8")
+    payload = {
+        "contents": [
+            {
+                "parts": [
+                    {"inline_data": {"mime_type": "image/png", "data": encoded}},
+                    {"text": prompt}
+                ]
+            }
+        ],
+        "generationConfig": {"temperature": 0}
+    }
+    retry_pause = 60  # seconds slept when every model is out of quota
+    waited = 0
+    while True:
+        model_config = get_current_model_config()
+        url = GEMINI_REST_URL.format(
+            model=model_config["name"], key=GEMINI_API_KEY)
+        r = requests.post(url, json=payload, timeout=model_config["timeout"])
+        # SUCCESS
+        if r.status_code == 200:
+            record_model_request(model_config)
+            data = r.json()
+            try:
+                return data["candidates"][0]["content"]["parts"][0]["text"]
+            except (KeyError, IndexError, TypeError) as exc:
+                # e.g. a blocked or empty response: report it clearly instead
+                # of surfacing a bare KeyError/IndexError.
+                raise RuntimeError(
+                    f"Gemini returned no text for {model_config['name']}: {str(data)[:200]}") from exc
+        # QUOTA HIT → SWITCH MODEL
+        if r.status_code in (429, 503):
+            print(
+                f"⚠️ RPM exhausted for {model_config['name']} → switching model")
+            # Mark the model exhausted under the same lock the quota helpers
+            # use, and start its minute window now so the mark survives the
+            # next quota check and expires 60 seconds after this rejection.
+            with quota_manager_lock:
+                model_config["current_rpm"] = model_config["max_requests_per_minute"]
+                model_config["last_rpm_reset"] = datetime.now()
+            next_model = get_next_available_model()
+            if next_model:
+                print(f"🔄 Switched to {next_model['name']}")
+                continue
+            # All models exhausted → wait, but never longer than MAX_WAIT_TIME
+            if waited >= MAX_WAIT_TIME:
+                raise RuntimeError(
+                    f"Gemini quota still exhausted after waiting {waited}s")
+            print("⏳ All models exhausted. Waiting 60s...")
+            time.sleep(retry_pause)
+            waited += retry_pause
+            continue
+        # Other error
+        raise RuntimeError(f"Gemini error {r.status_code}: {r.text}")
+def get_next_available_model():
+    """Select the first model with remaining quota, scanning from the current one.
+
+    Walks ``GEMINI_MODELS`` in circular order beginning at
+    ``current_model_index``. The first entry accepted by ``can_use_model``
+    becomes the current model and is returned; ``None`` is returned when no
+    entry qualifies.
+    """
+    global current_model_index
+    offset = 0
+    while offset < len(GEMINI_MODELS):
+        candidate_idx = (current_model_index + offset) % len(GEMINI_MODELS)
+        candidate = GEMINI_MODELS[candidate_idx]
+        if can_use_model(candidate):
+            current_model_index = candidate_idx
+            return candidate
+        offset += 1
+    return None
+# Model table: static limits per Gemini model plus mutable quota counters.
+def _fresh_quota_state():
+    """Return a new dict of zeroed quota-tracking fields for one model entry."""
+    return {
+        "current_rpm": 0,
+        "current_rpd": 0,
+        "last_rpm_reset": None,
+        "last_rpd_reset": None,
+        "quota_reset_time": None,
+    }
+GEMINI_MODELS = [
+    {
+        "name": "gemini-2.5-flash-lite",
+        "max_requests_per_minute": 120,
+        "max_requests_per_day": 10000,
+        "max_output_tokens": 16384,
+        "timeout": 60,
+        "description": "Stage 1 - Pre-classification / validation / cheap parsing",
+        **_fresh_quota_state(),
+        "skip_on_error": True,
+    },
+    {
+        "name": "gemini-2.5-flash-image",
+        "max_requests_per_minute": 50,
+        "max_requests_per_day": 1500,
+        "max_output_tokens": 65536,
+        "timeout": 300,
+        "description": "Stage 2 - Primary invoice OCR extraction",
+        **_fresh_quota_state(),
+        "skip_on_error": False,
+    },
+    {
+        "name": "gemini-2.5-pro",
+        "max_requests_per_minute": 10,
+        "max_requests_per_day": 1000,
+        "max_output_tokens": 65536,
+        "timeout": 300,
+        "description": "Stage 3 - Complex invoice reasoning fallback",
+        **_fresh_quota_state(),
+        "skip_on_error": False,
+    },
+]
+# Index into GEMINI_MODELS of the model currently in use.
+current_model_index = 0
+# ============================================================================
+# ⭐ QUOTA MANAGEMENT FUNCTIONS
+# ============================================================================
+def reset_model_quota_counters(model_config):
+    """Roll over a model's per-minute and per-day counters when their window ends.
+
+    The per-minute counter is zeroed once 60 seconds have passed since
+    ``last_rpm_reset``; the per-day counter is zeroed when the calendar date
+    has advanced past ``last_rpd_reset``. All reads and writes happen under
+    ``quota_manager_lock``.
+
+    Args:
+        model_config: One entry of ``GEMINI_MODELS``; mutated in place.
+    """
+    now = datetime.now()
+    with quota_manager_lock:
+        minute_start = model_config["last_rpm_reset"]
+        if minute_start is None:
+            # First check: only start the window. The counter must not be
+            # zeroed here, otherwise requests (or an exhaustion mark) recorded
+            # before the first check would be silently discarded.
+            model_config["last_rpm_reset"] = now
+        elif (now - minute_start).total_seconds() >= 60:
+            model_config["current_rpm"] = 0
+            model_config["last_rpm_reset"] = now
+            logger.debug(f"🔄 Reset RPM for {model_config['name']}")
+        day_start = model_config["last_rpd_reset"]
+        if day_start is None:
+            # Same reasoning as above for the daily counter.
+            model_config["last_rpd_reset"] = now
+        elif now.date() > day_start.date():
+            model_config["current_rpd"] = 0
+            model_config["last_rpd_reset"] = now
+            logger.info(f"🔄 Reset daily quota for {model_config['name']}")
+def can_use_model(model_config):
+    """Report whether the model is below both its per-minute and per-day limits.
+
+    Counters are first rolled over via ``reset_model_quota_counters``; the
+    comparison itself is done under ``quota_manager_lock``.
+    """
+    reset_model_quota_counters(model_config)
+    with quota_manager_lock:
+        minute_used = model_config["current_rpm"]
+        minute_cap = model_config["max_requests_per_minute"]
+        day_used = model_config["current_rpd"]
+        day_cap = model_config["max_requests_per_day"]
+        if minute_used < minute_cap:
+            return day_used < day_cap
+        return False
+def record_model_request(model_config):
+    """Count one request against the model's per-minute and per-day counters.
+
+    Both counters are incremented under ``quota_manager_lock`` and the new
+    per-minute usage is logged at debug level.
+    """
+    with quota_manager_lock:
+        for counter_key in ("current_rpm", "current_rpd"):
+            model_config[counter_key] = model_config[counter_key] + 1
+        usage_line = f"📊 {model_config['name']}: RPM={model_config['current_rpm']}/{model_config['max_requests_per_minute']}"
+        logger.debug(usage_line)
+def wait_for_quota_renewal(max_wait=MAX_WAIT_TIME):
+    """Poll every 10 seconds until some model has quota or ``max_wait`` elapses.
+
+    Returns:
+        ``(True, index)`` with the position in ``GEMINI_MODELS`` of the first
+        model accepted by ``can_use_model``, or ``(False, -1)`` on timeout.
+    """
+    began = time.time()
+    logger.info(
+        f"⏳ All models quota exhausted. Waiting for renewal (max {max_wait}s)...")
+    while True:
+        if not (time.time() - began < max_wait):
+            break
+        # Index of the first model that currently has quota, if any.
+        ready_idx = next(
+            (pos for pos, entry in enumerate(GEMINI_MODELS)
+             if can_use_model(entry)),
+            None)
+        if ready_idx is not None:
+            model = GEMINI_MODELS[ready_idx]
+            elapsed = time.time() - began
+            logger.info(
+                f"✅ {model['name']} quota available after {elapsed:.1f}s")
+            return True, ready_idx
+        elapsed = time.time() - began
+        remaining = max_wait - elapsed
+        logger.info(
+            f"⏰ Waiting... (elapsed: {elapsed:.0f}s, remaining: {remaining:.0f}s)")
+        time.sleep(10)
+    logger.error(f"❌ Timeout: No quota available after {max_wait}s")
+    return False, -1
+def get_current_model_config():
+    """Return the ``GEMINI_MODELS`` entry selected by ``current_model_index``."""
+    active_index = current_model_index
+    return GEMINI_MODELS[active_index]
 # ============================================================================
 # STARTUP VALIDATION
 # ============================================================================
     warnings = []
     errors = []
     if not GEMINI_API_KEY:
         warnings.append(
             "⚠️  GEMINI_API_KEY not set - image-based PDFs will not work")
     else:
         print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
     if not AZURE_STORAGE_CONNECTION_STRING:
         if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
+            errors.append("❌ Azure credentials missing")
         else:
             print(
                 f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
     else:
         print(f"✅ Azure connection string configured")
     for warning in warnings:
         print(warning)
     for error in errors:
         print(error)
     if errors:
         print("\n⚠️  WARNING: Some required credentials are missing!")
     return len(errors) == 0
             elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
                 account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
                 blob_service_client = BlobServiceClient(
+                    account_url=account_url, credential=AZURE_STORAGE_ACCOUNT_KEY)
                 print("✅ Azure Blob Storage initialized with account key")
             else:
                 print("⚠️ WARNING: No Azure credentials configured")
             raise HTTPException(
                 status_code=500, detail="Azure Blob Storage not configured")
         base_filename = os.path.splitext(filename)[0]
         safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
         blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
         blob_client = client.get_blob_client(
             container=container_name, blob=blob_name)
         print(f"📤 Uploading raw PDF to: {blob_name}")
         blob_client.upload_blob(
             pdf_bytes,
             }
         )
         expiry_hours = 24
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT_NAME,
             expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
         )
         blob_url = blob_client.url
         download_url = f"{blob_url}?{sas_token}"
         expires_at = (datetime.utcnow() +
             raise HTTPException(
                 status_code=500, detail="Azure Blob Storage not configured")
         base_filename = os.path.splitext(original_filename)[0]
         safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
         blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
         blob_client = client.get_blob_client(
             container=container_name, blob=blob_name)
         blob_client.upload_blob(
             pdf_bytes,
             overwrite=True,
             }
         )
         expiry_hours = 24
         sas_token = generate_blob_sas(
             account_name=AZURE_STORAGE_ACCOUNT_NAME,
             expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
         )
         blob_url = blob_client.url
         download_url = f"{blob_url}?{sas_token}"
         expires_at = (datetime.utcnow() +
             return
         container_client = client.get_container_client(container_name)
         prefix = f"{ROOT_FOLDER}/{batch_id}/"
         blobs = container_client.list_blobs(name_starts_with=prefix)
 def get_gemini_model():
+    """Return a ``genai.GenerativeModel`` for a Gemini model that has quota.
+
+    Returns ``None`` when ``GEMINI_AVAILABLE`` is false, when no API key is
+    set, when quota does not come back within ``MAX_WAIT_TIME`` seconds, or
+    when creating the model object fails.
+
+    If the current model has no quota, the first other entry of
+    ``GEMINI_MODELS`` with quota becomes current; if there is none, the call
+    blocks in ``wait_for_quota_renewal``. The cached instance is rebuilt
+    whenever it is missing or was created for a different model name.
+    """
+    global gemini_model, current_model_index
     if not GEMINI_AVAILABLE:
         return None
+    if not GEMINI_API_KEY:
+        return None
+    # Check quota before handing out a model.
+    model_config = get_current_model_config()
+    if not can_use_model(model_config):
+        logger.warning(f"⚠️ {model_config['name']} quota exhausted")
+        # Try the other models, in table order.
+        for i, alt_model in enumerate(GEMINI_MODELS):
+            if i != current_model_index and can_use_model(alt_model):
+                current_model_index = i
+                model_config = alt_model
+                logger.info(f"🔄 Switched to {model_config['name']}")
+                gemini_model = None  # Force recreation
+                break
+        else:
+            # for/else: reached only when the loop found no usable model,
+            # so block until some model's quota window rolls over.
+            success, new_index = wait_for_quota_renewal(MAX_WAIT_TIME)
+            if success:
+                current_model_index = new_index
+                model_config = GEMINI_MODELS[new_index]
+                gemini_model = None
+            else:
+                logger.error("❌ All models quota exhausted")
+                return None
+    # Create/recreate the model object if needed (serialized by model_lock).
+    with model_lock:
+        if gemini_model is None or not hasattr(gemini_model, '_model_name') or gemini_model._model_name != model_config['name']:
+            try:
+                genai.configure(api_key=GEMINI_API_KEY)
+                gemini_model = genai.GenerativeModel(model_config['name'])
+                # Tag the instance so a later call can tell which model it was built for.
+                gemini_model._model_name = model_config['name']
+                logger.info(f"✅ Using {model_config['name']}")
+            except Exception as e:
+                logger.error(
+                    f"❌ Failed to initialize {model_config['name']}: {e}")
+                return None
+        # Deliberately no record_model_request() here: obtaining the model
+        # object is not counted as a request against the quota.
+        return gemini_model
+def extract_invoice_gemini_sync(page):
+    """Extract the invoice number from a page image using Gemini.
+
+    The page is rendered to PNG and sent with a prompt asking for the invoice
+    number only. If the sanitized reply is not usable, a second request asks
+    for all visible text and that text is parsed with
+    ``try_extract_invoice_from_text``.
+
+    Args:
+        page: The page to render (``get_pixmap`` is called on it).
+
+    Returns:
+        The upper-cased invoice number, the result of the text fallback, or
+        ``None`` if any step raises.
+    """
+    model_config = get_current_model_config()
     try:
         pix = page.get_pixmap(matrix=fitz.Matrix(
             GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
         img_bytes = pix.tobytes("png")
         pix = None
         prompt = """Look at this invoice image and extract ONLY the invoice number.
+Return ONLY the invoice number. If not found return NONE."""
+        text = call_gemini_25(model_config["name"], img_bytes, prompt)
+        cleaned = text.strip().replace("Invoice Number:", "").strip()
+        print(f"🤖 Gemini raw response: {text}")
+        # Sanitize BEFORE validating: otherwise a reply such as "NONE." passes
+        # the checks and is returned as the invoice number "NONE", and a reply
+        # made only of stripped characters is returned as an empty string.
+        cleaned = re.sub(r'[^A-Za-z0-9\-/]', '', cleaned)
+        if cleaned.upper() != "NONE" and len(cleaned) >= 3:
+            print(f"✅ Gemini extracted: {cleaned}")
+            return cleaned.upper()
+        # Fallback OCR
+        ocr = call_gemini_25(
+            model_config["name"], img_bytes, "Extract all visible text from this image")
+        return try_extract_invoice_from_text(ocr)
     except Exception as e:
+        print(f"❌ Gemini error: {e}")
         return None
     page_invoice_nos = []
     if not is_image_pdf:
         print(f"  📝 Text-based extraction (sequential)")
         for i in range(doc.page_count):
             if i % 50 == 0:
                 gc.collect()
         return page_invoice_nos
     print(f"  🚀 Image-based extraction (parallel, batch_size={batch_size})")
     with ThreadPoolExecutor(max_workers=batch_size) as executor:
         futures = []
         for i in range(doc.page_count):
             page = doc.load_page(i)
             text_result = extract_invoice_text_based(page)
             if text_result:
                 futures.append((i, None, text_result))
             else:
                 future = executor.submit(extract_invoice_gemini_sync, page)
                 futures.append((i, future, None))
         page_invoice_nos = [None] * doc.page_count
         completed = 0
         for i, future, text_result in futures:
             try:
                 if text_result:
                     page_invoice_nos[i] = text_result
                     completed += 1
                 else:
                     result = future.result(timeout=30)
                     page_invoice_nos[i] = result
                     completed += 1
     page_invoice_nos = [None] * doc.page_count
     page = doc.load_page(0)
     page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
     print(f"  ✓ Page 1: {page_invoice_nos[0]}")
     sample_interval = max(3, doc.page_count // 20)
     print(f"  Sampling interval: every {sample_interval} pages")
         if i % 10 == 0:
             print(f"  Sampling page {i+1}/{doc.page_count}...")
         prev_known_idx = i - sample_interval
         while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
             prev_known_idx -= 1
                     page_invoice_nos[idx] = extract_invoice_no_from_page(
                         page, is_image_pdf)
     if page_invoice_nos[-1] is None:
         page = doc.load_page(doc.page_count - 1)
         page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
         print(f"  ✓ Last page: {page_invoice_nos[-1]}")
     last_known = page_invoice_nos[0]
     filled = 0
     for i in range(len(page_invoice_nos)):
     return page_invoice_nos
 # ============================================================================
+# PDF PROCESSING FUNCTIONS
 # ============================================================================
     has_digit = any(c.isdigit() for c in candidate)
     return has_letter and has_digit
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
+    """Universal label-first extraction with smart prioritization"""
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
     if len(text_norm) > 0:
         print(f"\n{'='*70}")
         print(f"🔍 ANALYZING TEXT (first 800 chars):")
         print(f"{text_norm[:800]}")
         print(f"{'='*70}\n")
     label_patterns = [
         (r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
         (r"Inv\s*(?:No\.?|Number)", "Inv No", True),
         (r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
         (r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
         (r"Document\s*(?:No\.?|Number)", "Document No", True),
         (r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
         (r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
         (r"Reference\s*(?:No\.?|Number)", "Reference No", False),
     for label_pattern, label_name, is_invoice_label in label_patterns:
         header_text = text_norm[:2000]
         label_matches = list(re.finditer(
             label_pattern, header_text, re.IGNORECASE))
         for label_match in label_matches:
             start_pos = label_match.end()
             text_after_label = header_text[start_pos:start_pos + 200]
             print(
             print(
                 f"   Text after label (first 80 chars): '{text_after_label[:80]}...'")
             all_candidates = re.findall(
+                r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b', text_after_label, re.IGNORECASE)
             print(
                 f"   Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
             for pass_number in [1, 2]:
                 if pass_number == 2 and len(all_candidates) > 0:
                     print(f"   🔄 Second pass: Trying alphanumeric candidates...")
                 for candidate in all_candidates:
                     invoice_num = candidate.strip(".,;:-_")
                     if len(invoice_num) < 3:
                         continue
                     is_pure_numeric = invoice_num.isdigit()
                     is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
                     if pass_number == 1:
                         if not (is_pure_numeric and is_ideal_invoice_length):
                             continue
                         print(
                             f"   ✨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
                     else:
                         if is_pure_numeric and is_ideal_invoice_length:
                             continue
                         print(f"   🔍 Evaluating candidate: '{invoice_num}'")
                     if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
                                                "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
                                                "DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
                         print(f"      ⚠️ Skipped: noise word")
                         continue
                     if not is_invoice_label:
                         if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
                             print(
                                 f"      ⚠️ Skipped: batch pattern (non-invoice context)")
                             continue
                     if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: license pattern")
                         continue
                     if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
+                        print(f"      ⚠️ Skipped: state code/UIN pattern")
                         continue
+                    if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: ACK number")
                         continue
+                    if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: PH number")
                         continue
+                    if re.search(rf"(?:UIN|UID|State\s*Code|D\.L\.No)\.?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
+                        print(f"      ⚠️ Skipped: UIN/UID/State Code")
                         continue
+                    if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
                         print(f"      ⚠️ Skipped: A/C number")
                         continue
                     if re.match(r'^[0-9]{10,11}$', invoice_num):
                         if len(invoice_num) == 10 and invoice_num[0] in '6789':
                             print(f"      ⚠️ Skipped: mobile number")
                             continue
                         if len(invoice_num) == 11 and invoice_num[0] == '0':
                             print(f"      ⚠️ Skipped: landline number")
                             continue
                     if re.match(r'^20\d{6}$', invoice_num):
+                        print(f"      ⚠️ Skipped: date pattern")
                         continue
                     if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
+                        print(f"      ⚠️ Skipped: date format")
                         continue
+                    if len(invoice_num) == 15 and re.match(r'^\d{2}[A-Z]{5}\d{4}[A-Z]\d[A-Z]\d$', invoice_num, re.IGNORECASE):
+                        print(f"      ⚠️ Skipped: GST number")
                         continue
                     print(f"      ✅✅✅ ACCEPTED: '{invoice_num}'")
                     return invoice_num.upper()
             print(f"   ⚠️ No valid candidates found after '{label_name}'")
     print("\n⚠️ No labeled invoice number found, trying fallback extraction...")
     top_text = text_norm[:1000]
     credit_match = re.search(
+        r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})", text_norm, re.IGNORECASE)
     if credit_match:
         credit_num = credit_match.group(1).strip()
         if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
             print(f"✓ Fallback: Found CREDIT number: {credit_num}")
             return credit_num.upper()
     long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
     for num in long_numerics:
+        if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
             print(f"⚠️ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
             continue
         print(f"✓ Fallback: Found long numeric: {num}")
         return num.upper()
     medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
     for num in medium_numerics:
         if len(num) == 10 and num[0] in '6789':
             continue
         if len(num) == 11 and num[0] == '0':
             continue
         if len(num) == 8 and num.startswith('20'):
             continue
+        if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
             continue
         print(f"✓ Fallback: Found medium numeric: {num}")
         return num.upper()
     print("✗ No invoice number found (labeled or unlabeled)")
     return None
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
+    """Extract invoice number from TEXT-BASED PDF with Zydus fallback"""
     text = page.get_text("text") or ""
     text_norm = normalize_text_for_search(text)
     header_text = text_norm[:2500]
     zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
     if zydus_candidates:
         zydus_number = zydus_candidates[0]
         print(f"    ✅ ZYDUS INVOICE DETECTED: {zydus_number}")
         return zydus_number.upper()
     inv = try_extract_invoice_from_text(text)
     if inv:
         if re.match(r'^10\d{12}$', inv):
+            print(f"    ⚠️ REJECTED Order ID (14-digit): {inv}")
             inv = None
         else:
             return inv
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
         if block_text:
             inv = try_extract_invoice_from_text(block_text)
             if inv:
                 if re.match(r'^10\d{12}$', inv):
+                    print(f"    ⚠️ REJECTED Order ID from block: {inv}")
+                    continue
                 else:
                     return inv
     blocks = page.get_text("blocks") or []
     sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
+    for block in sorted_blocks[:15]:
         block_text = block[4] if len(block) > 4 else ""
         if block_text:
             block_norm = normalize_text_for_search(block_text)
                 print(f"    ✅ ZYDUS BLOCK DETECTION: {number}")
                 return number.upper()
     print(f"    ⚠️ No valid invoice found on this page (will use forward-fill)")
     return None
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
+    """Extract invoice number from a single page"""
     text_result = extract_invoice_text_based(page)
     if text_result:
         return text_result
     except Exception as e:
         print(f"⚠️ Cleanup warning: {e}")
 def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
+    """Merge null first page with first invoice"""
     if len(groups) >= 2:
         first_group = groups[0]
         second_group = groups[1]
         if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
             print(f"\n🔧 AUTO-FIX: Merging null first page(s) with first invoice")
             print(
             print(
                 f"   First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
             merged_pages = first_group["pages"] + second_group["pages"]
             second_group["pages"] = merged_pages
             groups.pop(0)
             print(
 async def split_invoices(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(...),
+    batch_id: str = Form(..., description="Batch ID (required)"),
     use_blob_storage: bool = Form(
         True, description="Upload PDFs to Azure Blob Storage"),
     blob_container: Optional[str] = Form(
+        None, description="Custom Azure container"),
     include_base64: bool = Form(
         False, description="Include base64 in response"),
     parallel_batch_size: int = Form(
+        MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls"),
     use_smart_sampling: bool = Form(
+        USE_SMART_SAMPLING, description="Use smart sampling"),
     max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
 ):
+    """Universal Invoice Splitter with RPM Management"""
     if not file.filename:
         raise HTTPException(status_code=400, detail="No filename provided")
     filename_lower = file.filename.lower()
     SUPPORTED_EXTENSIONS = ['.pdf', '.png',
                             '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
     if not file_extension:
         raise HTTPException(
+            status_code=400, detail=f"Unsupported file format. Supported: {', '.join(SUPPORTED_EXTENSIONS)}")
     is_image_file = file_extension in [
         '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
     if is_image_file and not GEMINI_AVAILABLE:
         raise HTTPException(
+            status_code=500, detail="Image processing requires PIL")
     if use_blob_storage and not get_blob_service_client():
         raise HTTPException(
             print(
                 f"   ... (showing first 10 of {len(page_invoice_nos)} pages)")
         page_invoice_nos_normalized = []
         for v in page_invoice_nos:
             if v and v.upper().startswith("GST"):
             else:
                 page_invoice_nos_normalized.append(None)
         page_invoice_nos_filled = []
         last_known_invoice = None
             page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
             print(f"      • {inv_no}: {page_count} pages")
         groups = []
         current_group = []
         current_invoice = None
                 current_group = [idx]
             else:
                 if inv != current_invoice:
+                    groups.append({"invoice_no": current_invoice,
+                                  "pages": current_group[:]})
                     print(
                         f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
                     current_invoice = inv
                     current_group.append(idx)
         if current_group:
+            groups.append({"invoice_no": current_invoice,
+                          "pages": current_group[:]})
             print(
                 f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
         if len(groups) == 1 and groups[0]["invoice_no"] is None:
+            groups = [{"invoice_no": None,
+                       "pages": list(range(doc.page_count))}]
         groups = merge_first_null_group(groups)
         print(f"\n✅ Created {len(groups)} invoice groups (after auto-merge)")
         print(
             f"   Forward-filled {filled_count} pages with missing invoice numbers")
         print(f"\n🔨 Building and uploading split invoices...")
         all_parts = []
                 "unique_invoice_numbers": len(unique_invoices),
                 "extraction_method": "gemini" if is_image_pdf else "text",
                 "pages_forward_filled": filled_count,
+                "storage_type": "azure_blob" if use_blob_storage else "base64",
+                "model_used": get_current_model_config()['name']
             },
             "performance": {
                 "total_time_seconds": round(total_time, 2),
             f"   Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
         print(f"   Split invoices: {len(all_parts)}")
         print(f"   Unique invoice numbers: {len(unique_invoices)}")
+        print(f"   Model used: {get_current_model_config()['name']}")
         print(f"   Total time: {total_time:.1f}s")
         print(
             f"   Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
     background_tasks: BackgroundTasks,
     container_name: Optional[str] = Form(None)
 ):
+    """Delete all blobs for a specific batch"""
     if container_name is None:
         container_name = AZURE_CONTAINER_NAME
     })
+@app.get("/quota-status")
+def quota_status():
+    """Get quota status for all models"""
+    status = []
+    for i, model in enumerate(GEMINI_MODELS):
+        reset_model_quota_counters(model)
+        with quota_manager_lock:
+            status.append({
+                "model": model["name"],
+                "is_current": i == current_model_index,
+                "rpm": {"used": model["current_rpm"], "limit": model["max_requests_per_minute"]},
+                "rpd": {"used": model["current_rpd"], "limit": model["max_requests_per_day"]},
+                "available": can_use_model(model)
+            })
+    return JSONResponse({"models": status, "timestamp": datetime.now().isoformat()})
 @app.get("/")
 async def root():
     return {
+        "service": "Universal Invoice Splitter API with RPM Management",
+        "version": "4.0",
         "status": "running",
         "features": {
             "multi_format_support": True,
             "zydus_healthcare_support": True,
             "auto_merge_null_groups": True,
             "azure_blob_storage": True,
+            "parallel_processing": True,
+            "rpm_management": True,
+            "multi_model_fallback": True
+        },
+        "models": [m["name"] for m in GEMINI_MODELS],
+        "current_model": get_current_model_config()['name']
     }
 @app.get("/health")
 async def health():
+    model_config = get_current_model_config()
     return {
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "gemini_configured": bool(GEMINI_API_KEY),
+        "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY)),
+        "current_model": model_config['name'],
+        "quota_available": can_use_model(model_config)
     }
 if __name__ == "__main__":
     import uvicorn
+    # Initialize model quota tracking
+    for model in GEMINI_MODELS:
+        model["last_rpm_reset"] = datetime.now()
+        model["last_rpd_reset"] = datetime.now()
+    print("\n" + "="*80)
+    print("🚀 Starting Universal Invoice Splitter API with RPM Management")
+    print("="*80)
     print(f"✅ Supports ALL invoice types")
     print(f"✅ Zydus Healthcare fallback (23xxxxxxxx pattern)")
     print(f"✅ Auto-merge null first pages")
+    print(f"✅ RPM/RPD quota management")
+    print(f"✅ Multi-model fallback")
+    print("="*80)
+    print(f"📋 Model Chain:")
+    for i, model in enumerate(GEMINI_MODELS):
+        print(f"   {i+1}. {model['name']}")
+        print(
+            f"      RPM: {model['max_requests_per_minute']}, RPD: {model['max_requests_per_day']}")
+    print("="*80)
+    print(f"🌐 Server: http://127.0.0.1:8000")
+    print("="*80 + "\n")
     uvicorn.run(app, host=HOST, port=PORT, log_level="info")