Spaces:

anujakkulkarni
/

splitpdffile

Paused

App Files Community

anujakkulkarni committed on Jan 12

Commit

5e18860

verified ·

1 Parent(s): c1e3bdc

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -459

app.py CHANGED Viewed

@@ -4,10 +4,13 @@ import re
 import base64
 import time
 import datetime
 from typing import List, Dict, Optional, Tuple
 from collections import deque
-from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
@@ -17,9 +20,9 @@ try:
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
-except ImportError:
     GEMINI_AVAILABLE = False
-    print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
@@ -37,28 +40,22 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 # Model fallback list (in priority order)
 GEMINI_MODELS = [
     {
-        "name": "gemini-2.5-flash-image",  # PRIMARY - Recommended by Google
-        "max_requests_per_minute": 50,  # Higher quota limit
-        "timeout": 300,
-        "description": "Primary model with higher quota"
-    },
-    {
-        "name": "gemini-2.0-flash",  # Fallback
         "max_requests_per_minute": 15,
         "timeout": 300,
-        "description": "Pro fallback"
     },
     {
-        "name": "gemini-3-flash",  # Fallback
-        "max_requests_per_minute": 15,
         "timeout": 300,
-        "description": "Pro fallback"
     },
     {
-        "name": "gemini-2.0-flash-exp",  # FALLBACK 1 - Your original choice
-        "max_requests_per_minute": 9,  # Conservative (under 10 limit)
         "timeout": 300,
-        "description": "Fallback experimental model"
     }
 ]
@@ -78,7 +75,6 @@ class SimpleRateLimiter:
     def allow_request(self):
         now = time.time()
-        # Remove old requests outside time window
         while self.requests and self.requests[0] < now - self.window_seconds:
             self.requests.popleft()
@@ -94,14 +90,13 @@ class SimpleRateLimiter:
         return max(0, self.window_seconds - (time.time() - oldest))
     def reset(self):
-        self.requests. clear()
         self.quota_error_count = 0
     def record_quota_error(self):
         self.quota_error_count += 1
-# Initialize rate limiter for current model
 gemini_rate_limiter = SimpleRateLimiter(
     max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
     window_seconds=60
@@ -110,9 +105,7 @@ gemini_rate_limiter = SimpleRateLimiter(
 # --- Daily Quota Management ---
 def check_daily_quota():
-    """Check if we should reset daily quota flag."""
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
     if last_quota_reset is None:
@@ -120,12 +113,10 @@ def check_daily_quota():
         daily_quota_exhausted = False
         return True
-    # Reset at midnight
     if now.date() > last_quota_reset.date():
         print("🔄 Daily quota reset detected")
         last_quota_reset = now
         daily_quota_exhausted = False
-        # Also reset to primary model
         reset_to_primary_model()
         return True
@@ -133,349 +124,189 @@ def check_daily_quota():
 def mark_daily_quota_exhausted():
-    """Mark daily quota as exhausted."""
     global daily_quota_exhausted
     daily_quota_exhausted = True
-    next_reset = (datetime.datetime.now() + datetime.timedelta(days=1)).replace(
-        hour=0, minute=0, second=0
-    )
-    print(f"❌ Daily quota exhausted - resets at {next_reset. strftime('%Y-%m-%d %H:%M')}")
-# --- Model Management Functions ---
 def get_gemini_model():
-    """Get or create Gemini model instance with auto-fallback."""
     global gemini_model, current_model_index
-    if not GEMINI_AVAILABLE:
-        print("Gemini SDK not available")
-        return None
-    if not GEMINI_API_KEY:
-        print("Warning:  Gemini API key not found in environment variables.")
         return None
-    # Check daily quota first
     if not check_daily_quota():
-        print("Daily quota exhausted, Gemini unavailable until reset")
         return None
-    # Try to initialize model if not already done
     if gemini_model is None:
         model_config = GEMINI_MODELS[current_model_index]
         try:
             genai.configure(api_key=GEMINI_API_KEY)
             gemini_model = genai.GenerativeModel(model_config["name"])
-            print(f"✓ Initialized:  {model_config['name']} ({model_config['description']})")
-        except Exception as e:
             print(f"Failed to initialize {model_config['name']}: {e}")
             return None
     return gemini_model
 def switch_to_next_model():
-    """Switch to next available model in fallback chain."""
     global gemini_model, current_model_index, gemini_rate_limiter
     if current_model_index < len(GEMINI_MODELS) - 1:
         current_model_index += 1
         model_config = GEMINI_MODELS[current_model_index]
-        # Reset rate limiter with new model's limits
         gemini_rate_limiter = SimpleRateLimiter(
             max_requests=model_config["max_requests_per_minute"],
             window_seconds=60
         )
-        # Force reinitialization
         gemini_model = None
-        print(f"🔄 SWITCHED TO MODEL: {model_config['name']} ({model_config['description']})")
         return get_gemini_model()
-    else:
-        print("❌ All models exhausted!")
-        return None
 def reset_to_primary_model():
-    """Reset back to primary model."""
     global gemini_model, current_model_index, gemini_rate_limiter
     if current_model_index != 0:
-        old_model = GEMINI_MODELS[current_model_index]['name']
         current_model_index = 0
         model_config = GEMINI_MODELS[0]
         gemini_rate_limiter = SimpleRateLimiter(
             max_requests=model_config["max_requests_per_minute"],
             window_seconds=60
         )
         gemini_model = None
-        print(f"🔄 Reset from {old_model} to primary model:  {model_config['name']}")
         return True
     return False
-# --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
-    r"""
-    (?:
-        Invoice\s*No\.?|
-        Inv\. ?\s*No\.?|
-        Bill\s*No\.?|
-        Document\s*No\.?|
-        Doc\s*No\.?|
-        Tax\s*Invoice\s*No\.?
-    )
-    \s*[:\-]?\s*
-    ([A-Z0-9][A-Z0-9\-\/]{3,})
-    """,
     re.IGNORECASE | re.VERBOSE
 )
-PREFIXED_INVOICE_RE = re.compile(
-    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
-)
-GST_LIKE_RE = re.compile(
-    r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b",
-    re.IGNORECASE
-)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
-    """
-    Detect if PDF is image-based or text-based by sampling pages.
-    Returns (is_image_based, avg_text_length).
-    """
     total_text_length = 0
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
         text = doc.load_page(i).get_text("text") or ""
-        total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
-    is_image_based = avg_text_length < 50
-    print(f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
-    print(f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
-    return is_image_based, avg_text_length
-# ============================================================================
-# TEXT-BASED PDF EXTRACTION
-# ============================================================================
 def normalize_text_for_search(s: str) -> str:
-    """Light normalization:  collapse whitespace and normalize common separators."""
-    if not s:
-        return s
-    s = s.replace("\u00A0", " ")  # non-breaking space
-    s = re.sub(r"[\r\n\t]+", " ", s)
-    s = re.sub(r"[ ]{2,}", " ", s).strip()
-    return s
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
-    """Extract invoice number from text using regex patterns."""
-    if not text:
-        return None
     text_norm = normalize_text_for_search(text)
-    # 1) Labeled invoice like "Invoice No", "Inv No."
-    m = INVOICE_NO_RE. search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
-        if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
             return inv
-    # 2) Search top portion for prefixed invoice codes
-    top_text = text_norm[: 600]
-    m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
-    # 3) Last-resort: GST detection
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
-        gst_val = gm.group(2) or ""
-        gst_val = gst_val.replace(" ", "").strip().upper()
-        if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
             return f"GST:{gst_val}"
-    return None
-def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
-    """Extract invoice number from TEXT-BASED PDF."""
-    # Try full-page text
-    text = page.get_text("text") or ""
-    inv = try_extract_invoice_from_text(text)
-    if inv:
-        return inv
-    # Try block-level text
-    for block in (page.get_text("blocks") or []):
-        block_text = block[4] if len(block) > 4 else ""
-        if block_text:
-            inv = try_extract_invoice_from_text(block_text)
-            if inv:
-                return inv
     return None
-# ============================================================================
-# IMAGE-BASED PDF EXTRACTION (Google Gemini with Auto-Switching)
-# ============================================================================
 def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
-    """
-    Extract invoice number from IMAGE-BASED PDF using Google Gemini.
-    With automatic model switching on quota exhaustion.
-    """
-    # Check daily quota first
-    if not check_daily_quota():
-        print("    ❌ Daily quota exhausted, skipping Gemini")
-        return None
     model = get_gemini_model()
-    if not model:
-        print("    Gemini model not available")
-        return None
-    # Check rate limit
     if not gemini_rate_limiter.allow_request():
         wait_time = gemini_rate_limiter.wait_time()
-        print(f"    ⏱ Rate limit reached, waiting {int(wait_time)}s...")
         time.sleep(wait_time + 1)
         return extract_invoice_gemini(page, retry_count)
     try:
-        # Convert page to image
-        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
         img = Image.open(io.BytesIO(img_bytes))
-        # Prompt for Gemini
-        prompt = """
-        Extract the invoice number from this image. Look for:
-        - Invoice No, Invoice Number, Bill No, Bill Number, Document No
-        - Any alphanumeric code that appears to be an invoice identifier
-        - Purchase Order numbers if no invoice number is found
-        Return ONLY the invoice number/identifier itself, nothing else.
-        If no invoice number is found, return "NOT_FOUND".
-        """
-        model_name = GEMINI_MODELS[current_model_index]["name"]
-        print(f"    Calling Gemini API (model: {model_name})...")
         response = model.generate_content([prompt, img])
         if response and response.text:
-            extracted_text = response.text.strip()
-            print(f"    Gemini response: {extracted_text}")
-            if extracted_text and extracted_text != "NOT_FOUND":
-                invoice_no = extracted_text. replace("*", "").replace("#", "").strip()
-                if invoice_no and len(invoice_no) > 2:
-                    print(f"    ✓ Gemini found invoice:  {invoice_no}")
-                    return invoice_no
-            # Fallback:  Get full OCR text and try regex
-            ocr_prompt = "Extract all text from this invoice image.  Return the complete text content."
-            ocr_response = model.generate_content([ocr_prompt, img])
-            if ocr_response and ocr_response.text:
-                print(f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
-                inv = try_extract_invoice_from_text(ocr_response.text)
-                if inv:
-                    print(f"    ✓ Found via regex on Gemini text: {inv}")
-                    return inv
-        print("    ✗ Gemini:  No invoice found")
         return None
-    except Exception as e:
         error_str = str(e).lower()
-        # Handle quota exhausted errors
-        if "429" in str(e) or "quota" in error_str or "resource" in error_str:
-            print(f"    ❌ QUOTA ERROR: {e}")
             gemini_rate_limiter.record_quota_error()
-            # Check if it's daily quota
-            if "per_day" in error_str or "limit:  0" in str(e):
-                print("    ❌ DAILY quota exhausted")
                 mark_daily_quota_exhausted()
                 return None
-            # Per-minute quota - try switching model
             if retry_count < len(GEMINI_MODELS) - 1:
-                print(f"    🔄 Switching to fallback model (attempt {retry_count + 1})...")
                 if switch_to_next_model():
-                    time.sleep(2)  # Brief delay before retry
                     return extract_invoice_gemini(page, retry_count + 1)
-            # Wait and retry once more with current model
-            if retry_count < len(GEMINI_MODELS):
-                retry_delay = 30
-                # Try to extract retry delay from error
-                import re as regex
-                match = regex.search(r'seconds:\s*(\d+)', str(e))
-                if match:
-                    retry_delay = int(match.group(1)) + 2
-                print(f"    ⏰ Waiting {retry_delay}s before final retry...")
-                time.sleep(retry_delay)
-                return extract_invoice_gemini(page, retry_count + 1)
-            print("    ❌ All retry attempts exhausted")
-            return None
-        # Other errors
-        print(f"    ✗ Gemini extraction failed: {e}")
         return None
-# ============================================================================
-# UNIFIED EXTRACTION LOGIC
-# ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    """Try text extraction first, then Gemini as fallback"""
-    # ALWAYS try text extraction first (fast, no API cost)
-    text_result = extract_invoice_text_based(page)
-    if text_result:
-        print(f"  ✓ Found via text extraction: {text_result}")
-        return text_result
-    # If text fails AND PDF seems image-based, try Gemini
-    if is_image_pdf:
-        gemini_result = extract_invoice_gemini(page)
-        if gemini_result:
-            print(f"  ✓ Found via Gemini: {gemini_result}")
-            return gemini_result
     return None
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
-    """Create a new PDF with the given pages (0-based indices)."""
     out = fitz.open()
     for i in page_indices:
         out.insert_pdf(src_doc, from_page=i, to_page=i)
-    pdf_bytes = out.tobytes()
-    out.close()
-    return pdf_bytes
 # ============================================================================
@@ -484,248 +315,119 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
-    initial_dpi: int = Form(300),
 ):
-    """
-    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
-    Features:
-    - Text-based PDFs:  Fast text extraction
-    - Image-based PDFs: Google Gemini with auto-model switching
-    - Auto-switches between models when quota exhausted
-    - Daily quota tracking with auto-reset
-    """
     if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="only PDF is supported")
-    file_bytes = await file.read()
-    if not file_bytes:
-        raise HTTPException(status_code=400, detail="empty file")
     try:
-        doc = fitz. open(stream=file_bytes, filetype="pdf")
-        if doc. page_count == 0:
-            raise HTTPException(status_code=400, detail="no pages found")
-        print(f"\n{'='*60}")
-        print(f"Processing PDF: {file.filename}")
-        print(f"Total pages: {doc.page_count}")
-        if GEMINI_AVAILABLE:
-            model_status = GEMINI_MODELS[current_model_index]["name"]
-            print(f"Current Gemini model: {model_status}")
-            print(f"Daily quota exhausted: {daily_quota_exhausted}")
-        print(f"{'='*60}")
-        # Step 1: Detect PDF type
-        is_image_pdf, avg_text_len = is_image_based_pdf(doc)
-        if is_image_pdf and not get_gemini_model():
-            if daily_quota_exhausted:
-                raise HTTPException(
-                    status_code=429,
-                    detail="Image-based PDF detected but Gemini API daily quota is exhausted. "
-                           "Please try again tomorrow or use text-based PDFs."
-                )
-            else:
-                raise HTTPException(
-                    status_code=500,
-                    detail="Image-based PDF detected but Google Gemini is not configured. "
-                           "Please add GEMINI_API_KEY to your environment variables."
-                )
-        # Step 2: Extract invoice numbers from each page
-        page_invoice_nos:  List[Optional[str]] = []
-        for i in range(doc.page_count):
-            print(f"\n--- Page {i+1}/{doc.page_count} ---")
-            inv = extract_invoice_no_from_page(doc. load_page(i), is_image_pdf)
-            if inv:
-                print(f"  ✓ Raw extracted id: {inv}")
-            else:
-                print(f"  ✗ No invoice found")
-            page_invoice_nos.append(inv)
-        print(f"\n{'='*60}")
-        print(f"Raw Extraction Results:  {page_invoice_nos}")
-        print(f"{'='*60}")
-        # Step 3: Filter GST values
-        page_invoice_nos_filtered:  List[Optional[str]] = []
-        for v in page_invoice_nos:
-            if v is None:
-                page_invoice_nos_filtered.append(None)
-            else:
-                if isinstance(v, str) and v.upper().startswith("GST:"):
-                    page_invoice_nos_filtered.append(None)
-                else:
-                    page_invoice_nos_filtered. append(v)
-        print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
-        # Step 4: Group pages by invoice number
-        groups: List[Dict] = []
-        current_group_pages:  List[int] = []
-        current_invoice:  Optional[str] = None
-        for idx, inv in enumerate(page_invoice_nos_filtered):
-            if current_invoice is None:
-                current_invoice = inv
-                current_group_pages = [idx]
             else:
-                if inv is not None and inv != current_invoice:
-                    groups.append({
-                        "invoice_no": current_invoice,
-                        "pages": current_group_pages[: ],
-                    })
-                    current_invoice = inv
-                    current_group_pages = [idx]
                 else:
-                    current_group_pages.append(idx)
-        # Save last group
-        if current_group_pages:
-            groups.append({
-                "invoice_no":  current_invoice,
-                "pages": current_group_pages[:]
-            })
-        # Post-process groups
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
-            groups. pop(0)
-        if all(g["invoice_no"] is None for g in groups):
-            print("\n⚠ Warning: No invoices detected in any page!")
-            print("  Returning entire PDF as single part")
-            groups = [{
-                "invoice_no": None,
-                "pages": list(range(doc.page_count))
-            }]
-        # Step 5: Build response parts
         parts = []
-        for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
                 "pages": [p + 1 for p in g["pages"]],
-                "num_pages": len(g["pages"]),
-                "size_bytes": len(part_bytes),
             }
             if include_pdf:
                 info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
             parts.append(info)
-            print(f"\nPart {idx+1}:")
-            print(f"  Invoice: {g['invoice_no']}")
-            print(f"  Pages: {info['pages']}")
-            print(f"  Size: {len(part_bytes):,} bytes")
         doc.close()
-        print(f"\n{'='*60}")
-        print(f"✓ Successfully split into {len(parts)} part(s)")
-        print(f"{'='*60}\n")
         return JSONResponse({
             "count": len(parts),
-            "pdf_type": "image-based" if is_image_pdf else "text-based",
-            "current_model":  GEMINI_MODELS[current_model_index]["name"] if GEMINI_AVAILABLE else None,
-            "quota_status": {
-                "daily_exhausted": daily_quota_exhausted,
-                "current_model_index": current_model_index,
-                "total_models": len(GEMINI_MODELS)
-            },
-            "parts": parts
         })
-    except HTTPException:
-        raise
     except Exception as e:
-        print(f"\n✗ Error:  {str(e)}")
         import traceback
         traceback.print_exc()
         return JSONResponse({"error": str(e)}, status_code=500)
-@app.get("/health")
-async def health_check():
-    """Health check endpoint to verify Gemini configuration."""
-    gemini_status = "not available"
-    current_model_name = None
-    if GEMINI_AVAILABLE and get_gemini_model():
-        gemini_status = "configured"
-        current_model_name = GEMINI_MODELS[current_model_index]["name"]
-    return {
-        "status": "healthy",
-        "gemini_available": GEMINI_AVAILABLE,
-        "gemini_status": gemini_status,
-        "current_model": current_model_name,
-        "current_model_index": current_model_index,
-        "total_models": len(GEMINI_MODELS),
-        "daily_quota_exhausted": daily_quota_exhausted,
-        "quota_errors": gemini_rate_limiter.quota_error_count if GEMINI_AVAILABLE else 0,
-    }
-@app.post("/admin/reset-model")
-async def admin_reset_model():
-    """Reset to primary Gemini model."""
-    if reset_to_primary_model():
-        return {
-            "message": "Successfully reset to primary model",
-            "current_model": GEMINI_MODELS[current_model_index]["name"],
-            "status": "success"
-        }
-    else:
-        return {
-            "message": "Already on primary model",
-            "current_model": GEMINI_MODELS[current_model_index]["name"],
-            "status": "info"
-        }
-@app. get("/status")
-async def get_status():
-    """Get detailed status of Gemini models and quota."""
-    return {
-        "current_model":  {
-            "name": GEMINI_MODELS[current_model_index]["name"],
-            "description": GEMINI_MODELS[current_model_index]["description"],
-            "index": current_model_index,
-            "max_rpm": GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
-        },
-        "all_models": [
-            {
-                "name":  m["name"],
-                "description":  m["description"],
-                "max_rpm": m["max_requests_per_minute"],
-                "is_active": i == current_model_index
-            }
-            for i, m in enumerate(GEMINI_MODELS)
-        ],
-        "quota_status": {
-            "daily_exhausted": daily_quota_exhausted,
-            "last_reset": last_quota_reset. isoformat() if last_quota_reset else None,
-            "quota_errors": gemini_rate_limiter.quota_error_count,
-        },
-        "timestamp": datetime.datetime.now().isoformat()
-    }
 if __name__ == "__main__":
     import uvicorn
-    print("="*80)
-    print("🚀 Starting Invoice Splitter API")
-    print("="*80)
-    print(f"📋 Available Gemini Models:")
-    for i, model in enumerate(GEMINI_MODELS):
-        prefix = "🎯 PRIMARY" if i == 0 else f"🔄 FALLBACK {i}"
-        print(f"   {prefix}:  {model['name']} - {model['description']}")
-        print(f"      Rate Limit: {model['max_requests_per_minute']} req/min")
-    print("="*80)
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import base64
 import time
 import datetime
+import shutil
+import tempfile
 from typing import List, Dict, Optional, Tuple
 from collections import deque
+from pathlib import Path
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
+except ImportError:
     GEMINI_AVAILABLE = False
+    print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
 # Model fallback list (in priority order)
 GEMINI_MODELS = [
     {
+        "name": "gemini-1.5-flash",  # UPDATED: Current standard fast model
         "max_requests_per_minute": 15,
         "timeout": 300,
+        "description": "Primary fast model"
     },
     {
+        "name": "gemini-2.0-flash-exp",  # Fallback experimental
+        "max_requests_per_minute": 10,
         "timeout": 300,
+        "description": "Experimental fallback"
     },
     {
+        "name": "gemini-1.5-pro",  # Slower fallback
+        "max_requests_per_minute": 2,
         "timeout": 300,
+        "description": "Pro fallback (slower)"
     }
 ]
     def allow_request(self):
         now = time.time()
         while self.requests and self.requests[0] < now - self.window_seconds:
             self.requests.popleft()
         return max(0, self.window_seconds - (time.time() - oldest))
     def reset(self):
+        self.requests.clear()
         self.quota_error_count = 0
     def record_quota_error(self):
         self.quota_error_count += 1
 gemini_rate_limiter = SimpleRateLimiter(
     max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
     window_seconds=60
 # --- Daily Quota Management ---
 def check_daily_quota():
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
     if last_quota_reset is None:
         daily_quota_exhausted = False
         return True
     if now.date() > last_quota_reset.date():
         print("🔄 Daily quota reset detected")
         last_quota_reset = now
         daily_quota_exhausted = False
         reset_to_primary_model()
         return True
 def mark_daily_quota_exhausted():
     global daily_quota_exhausted
     daily_quota_exhausted = True
+    print(f"❌ Daily quota exhausted")
+# --- Model Management ---
 def get_gemini_model():
     global gemini_model, current_model_index
+    if not GEMINI_AVAILABLE or not GEMINI_API_KEY:
         return None
     if not check_daily_quota():
         return None
     if gemini_model is None:
         model_config = GEMINI_MODELS[current_model_index]
         try:
             genai.configure(api_key=GEMINI_API_KEY)
             gemini_model = genai.GenerativeModel(model_config["name"])
+            print(f"✓ Initialized: {model_config['name']}")
+        except Exception as e:
             print(f"Failed to initialize {model_config['name']}: {e}")
             return None
     return gemini_model
 def switch_to_next_model():
     global gemini_model, current_model_index, gemini_rate_limiter
     if current_model_index < len(GEMINI_MODELS) - 1:
         current_model_index += 1
         model_config = GEMINI_MODELS[current_model_index]
         gemini_rate_limiter = SimpleRateLimiter(
             max_requests=model_config["max_requests_per_minute"],
             window_seconds=60
         )
         gemini_model = None
+        print(f"🔄 SWITCHED TO MODEL: {model_config['name']}")
         return get_gemini_model()
+    return None
 def reset_to_primary_model():
     global gemini_model, current_model_index, gemini_rate_limiter
     if current_model_index != 0:
         current_model_index = 0
         model_config = GEMINI_MODELS[0]
         gemini_rate_limiter = SimpleRateLimiter(
             max_requests=model_config["max_requests_per_minute"],
             window_seconds=60
         )
         gemini_model = None
         return True
     return False
+# --- Regex Patterns ---
 INVOICE_NO_RE = re.compile(
+    r"""(?:Invoice\s*No\.?|Inv\. ?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
     re.IGNORECASE | re.VERBOSE
 )
+PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
+GST_LIKE_RE = re.compile(r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
     total_text_length = 0
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
         text = doc.load_page(i).get_text("text") or ""
+        total_text_length += len(text.strip())
     avg_text_length = total_text_length / pages_to_check
+    return avg_text_length < 50, avg_text_length
+# --- Extraction Logic ---
 def normalize_text_for_search(s: str) -> str:
+    if not s: return s
+    s = s.replace("\u00A0", " ")
+    return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
+    if not text: return None
     text_norm = normalize_text_for_search(text)
+    m = INVOICE_NO_RE.search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
+        if inv and len(inv) > 2 and inv.lower() not in ("invoice", "bill"):
             return inv
+    m = PREFIXED_INVOICE_RE.search(text_norm[:600])
     if m:
         inv = (m.group(1) or "").strip()
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
+        gst_val = gm.group(2).replace(" ", "").strip().upper()
+        if len(gst_val) == 15:
             return f"GST:{gst_val}"
     return None
 def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
+    if not check_daily_quota(): return None
     model = get_gemini_model()
+    if not model: return None
     if not gemini_rate_limiter.allow_request():
         wait_time = gemini_rate_limiter.wait_time()
+        print(f"    ⏱ Rate limit, waiting {int(wait_time)}s...")
         time.sleep(wait_time + 1)
         return extract_invoice_gemini(page, retry_count)
     try:
+        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
         img_bytes = pix.tobytes("png")
         img = Image.open(io.BytesIO(img_bytes))
+        prompt = """Extract the invoice number. Return ONLY the number. If not found, return 'NOT_FOUND'."""
         response = model.generate_content([prompt, img])
         if response and response.text:
+            txt = response.text.strip().replace("*", "").replace("#", "")
+            if txt and txt != "NOT_FOUND" and len(txt) > 2:
+                return txt
+        # Fallback to OCR text
+        ocr_resp = model.generate_content(["Extract all text.", img])
+        if ocr_resp and ocr_resp.text:
+            return try_extract_invoice_from_text(ocr_resp.text)
         return None
+    except Exception as e:
         error_str = str(e).lower()
+        if "429" in str(e) or "quota" in error_str:
             gemini_rate_limiter.record_quota_error()
+            if "per_day" in error_str:
                 mark_daily_quota_exhausted()
                 return None
             if retry_count < len(GEMINI_MODELS) - 1:
                 if switch_to_next_model():
                     return extract_invoice_gemini(page, retry_count + 1)
+        print(f"    ✗ Gemini Error: {e}")
         return None
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
+    # 1. Try Text Extraction (Fastest)
+    text = page.get_text("text") or ""
+    inv = try_extract_invoice_from_text(text)
+    if inv: return inv
+    # 2. Try Block Extraction
+    for block in (page.get_text("blocks") or []):
+        if len(block) > 4 and block[4]:
+            inv = try_extract_invoice_from_text(block[4])
+            if inv: return inv
+    # 3. Gemini Fallback (Only if enabled and seemingly image-based)
+    if is_image_pdf:
+        return extract_invoice_gemini(page)
     return None
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
     out = fitz.open()
     for i in page_indices:
         out.insert_pdf(src_doc, from_page=i, to_page=i)
+    return out.tobytes()
+# --- File Cleanup Utility ---
def remove_file(path: str):
    """Delete the file at *path*, reporting the outcome on stdout.

    Best-effort: any failure is printed as a warning and never raised, so it
    is safe to call from a ``finally`` block.
    """
    try:
        os.unlink(path)
        print(f"🧹 Cleaned up temp file: {path}")
    except Exception as err:
        print(f"Warning: Could not remove temp file {path}: {err}")
 # ============================================================================
 @app.post("/split-invoices")
 async def split_invoices(
+    background_tasks: BackgroundTasks,
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
 ):
     if not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF supported")
+    # --- FIX FOR 100MB FILES: STREAM TO DISK ---
+    # Create a temporary file to store the upload
+    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
+    os.close(fd)  # Close the low-level file descriptor immediately
     try:
+        # Stream upload chunks to disk to keep RAM low
+        print(f"📥 Receiving large file: {file.filename}")
+        with open(temp_path, "wb") as buffer:
+            # Read in 1MB chunks
+            while content := await file.read(1024 * 1024):
+                buffer.write(content)
+        print(f"💾 Saved to temp disk: {temp_path}")
+        # Open Document from DISK (Lazy loading)
+        doc = fitz.open(temp_path)
+        if doc.page_count == 0:
+            raise HTTPException(status_code=400, detail="Empty PDF")
+        print(f"Processing {doc.page_count} pages...")
+        # Step 1: Detect Type
+        is_image_pdf, _ = is_image_based_pdf(doc)
+        # Step 2: Extraction Loop
+        page_invoice_nos = []
+        for i in range(doc.page_count):
+            # Load only one page into memory at a time
+            page = doc.load_page(i)
+            inv = extract_invoice_no_from_page(page, is_image_pdf)
+            page_invoice_nos.append(inv)
+            # Explicitly dereference page to help garbage collector
+            del page
+        # Step 3: Filtering & Grouping
+        clean_invs = [
+            None if (v and v.upper().startswith("GST:")) else v
+            for v in page_invoice_nos
+        ]
+        groups = []
+        current_group = []
+        current_inv = None
+        for idx, inv in enumerate(clean_invs):
+            if current_inv is None:
+                current_inv = inv
+                current_group = [idx]
             else:
+                if inv is not None and inv != current_inv:
+                    # Save previous group
+                    groups.append({"invoice_no": current_inv, "pages": current_group})
+                    # Start new group
+                    current_inv = inv
+                    current_group = [idx]
                 else:
+                    current_group.append(idx)
+        if current_group:
+            groups.append({"invoice_no": current_inv, "pages": current_group})
+        # Logic Fix: If first page has no invoice, merge with second group if valid
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
+            groups.pop(0)
+        # Step 4: Build Response
         parts = []
+        for g in groups:
+            # Generate bytes only for specific pages
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
                 "pages": [p + 1 for p in g["pages"]],
+                "size_bytes": len(part_bytes)
             }
             if include_pdf:
                 info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
             parts.append(info)
         doc.close()
         return JSONResponse({
             "count": len(parts),
+            "parts": parts,
+            "quota_status": {"daily_exhausted": daily_quota_exhausted}
         })
     except Exception as e:
+        print(f"Critical Error: {e}")
         import traceback
         traceback.print_exc()
         return JSONResponse({"error": str(e)}, status_code=500)
+    finally:
+        # --- CRITICAL CLEANUP ---
+        # Ensure temp file is deleted even if code crashes
+        # Use background task to delete file after response is sent if you want,
+        # but here we do it synchronously to be safe.
+        remove_file(temp_path)
 if __name__ == "__main__":
     import uvicorn
+    print("🚀 Starting High-Performance Invoice Splitter")
+    # Workers=1 ensures rate limiter works correctly
+    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)