Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed on Jan 7

Commit

e531516

verified ·

1 Parent(s): 63e2ea5

Update app.py

Browse files

Files changed (1) hide show

app.py +380 -124

app.py CHANGED Viewed

@@ -2,7 +2,10 @@ import os
 import io
 import re
 import base64
 from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
@@ -14,7 +17,7 @@ try:
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
-except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
@@ -29,48 +32,202 @@ app.add_middleware(
 )
 # --- Google Gemini Configuration ---
-# This will be automatically loaded from environment variables
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 gemini_model = None
 def get_gemini_model():
-    """Get or create Gemini model instance."""
-    global gemini_model
     if not GEMINI_AVAILABLE:
         print("Gemini SDK not available")
         return None
-    if gemini_model is None:
-        # Check if API key is configured via environment variables
-        if not GEMINI_API_KEY:
-            print("Warning:  Gemini API key not found in environment variables.")
-            print("Please configure GEMINI_API_KEY in your environment variables.")
-            return None
         try:
             genai.configure(api_key=GEMINI_API_KEY)
-            gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
-            print("✓ Google Gemini Flash 2.0 initialized")
-        except Exception as e:
-            print(f"Failed to initialize Gemini model: {e}")
             return None
     return gemini_model
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
     r"""
-    (?:
         Invoice\s*No\.?|
-        Inv\.?\s*No\.?|
         Bill\s*No\.?|
-        Document\s*No\.?|       # ✅ ADD THIS
         Doc\s*No\.?|
-        Tax\s*Invoice\s*No\.?
     )
     \s*[:\-]?\s*
     ([A-Z0-9][A-Z0-9\-\/]{3,})
@@ -78,50 +235,43 @@ INVOICE_NO_RE = re.compile(
     re.IGNORECASE | re.VERBOSE
 )
 PREFIXED_INVOICE_RE = re.compile(
-    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
 GST_LIKE_RE = re.compile(
-    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
     """
     Detect if PDF is image-based or text-based by sampling pages.
     Returns (is_image_based, avg_text_length).
-    Strategy:
-    - Sample first few pages
-    - If average extractable text < 50 chars per page, it's likely image-based
-    - If text > 200 chars per page, it's text-based
     """
     total_text_length = 0
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
-        text = doc. load_page(i).get_text("text") or ""
         total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
     is_image_based = avg_text_length < 50
-    print(
-        f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
-    print(
-        f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
     return is_image_based, avg_text_length
 # ============================================================================
-# TEXT-BASED PDF EXTRACTION (Original Code)
 # ============================================================================
 def normalize_text_for_search(s: str) -> str:
-    """Light normalization: collapse whitespace and normalize common separators."""
     if not s:
         return s
     s = s.replace("\u00A0", " ")  # non-breaking space
@@ -131,51 +281,40 @@ def normalize_text_for_search(s: str) -> str:
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
-    """
-    Extract invoice number from text using regex patterns.
-    - Prefer explicit labeled Invoice/Bill patterns.
-    - Prefer prefixed invoice formats found in the top of the page.
-    - Use GST only as a last resort and tag it so it won't be mistaken for an invoice id.
-    """
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
     # 1) Labeled invoice like "Invoice No", "Inv No."
-    m = INVOICE_NO_RE.search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
             return inv
-    # 2) Search top portion for prefixed invoice codes (WN-1234, 5EN19710, etc.)
-    top_text = text_norm[:600]  # bigger top area to be robust
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
-        # extra length check so tiny numeric matches don't pass
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
-    # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
         gst_val = gm.group(2) or ""
         gst_val = gst_val.replace(" ", "").strip().upper()
-        # Only accept if 15 alnum chars (typical Indian GSTIN length)
         if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
-            # tag it so grouping won't treat GST same as invoice ID
             return f"GST:{gst_val}"
     return None
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
-    """
-    Extract invoice number from TEXT-BASED PDF.
-    Uses the original fast text extraction method.
-    """
     # Try full-page text
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
@@ -194,30 +333,41 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
 # ============================================================================
-# IMAGE-BASED PDF EXTRACTION (Google Gemini)
 # ============================================================================
-def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
     """
-    Extract invoice number from IMAGE-BASED PDF using Google Gemini Flash 2.0.
     """
     model = get_gemini_model()
     if not model:
         print("    Gemini model not available")
         return None
     try:
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
-        # Convert to PIL Image for Gemini
         img = Image.open(io.BytesIO(img_bytes))
-        # Prompt for Gemini to extract invoice number
         prompt = """
-        Extract the invoice number from this image.  Look for:
-        - Invoice No, Invoice Number, Bill No, Bill Number
         - Any alphanumeric code that appears to be an invoice identifier
         - Purchase Order numbers if no invoice number is found
@@ -225,7 +375,9 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
         If no invoice number is found, return "NOT_FOUND".
         """
-        print("    Calling Google Gemini API...")
         response = model.generate_content([prompt, img])
         if response and response.text:
@@ -233,20 +385,17 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
             print(f"    Gemini response: {extracted_text}")
             if extracted_text and extracted_text != "NOT_FOUND":
-                # Clean up the response
-                invoice_no = extracted_text.replace(
-                    "*", "").replace("#", "").strip()
                 if invoice_no and len(invoice_no) > 2:
-                    print(f"    ✓ Gemini found invoice: {invoice_no}")
                     return invoice_no
             # Fallback:  Get full OCR text and try regex
-            ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
             ocr_response = model.generate_content([ocr_prompt, img])
             if ocr_response and ocr_response.text:
-                print(
-                    f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
                 inv = try_extract_invoice_from_text(ocr_response.text)
                 if inv:
                     print(f"    ✓ Found via regex on Gemini text: {inv}")
@@ -255,7 +404,44 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
         print("    ✗ Gemini:  No invoice found")
         return None
-    except Exception as e:
         print(f"    ✗ Gemini extraction failed: {e}")
         return None
@@ -266,7 +452,6 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
     """Try text extraction first, then Gemini as fallback"""
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
     if text_result:
@@ -274,7 +459,7 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
         return text_result
     # If text fails AND PDF seems image-based, try Gemini
-    if is_image_pdf:
         gemini_result = extract_invoice_gemini(page)
         if gemini_result:
             print(f"  ✓ Found via Gemini: {gemini_result}")
@@ -294,23 +479,23 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
 # ============================================================================
-# API ENDPOINT
 # ============================================================================
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
-    initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     """
     Split a multi-invoice PDF into separate PDFs based on invoice numbers.
-    - Text-based PDFs: Uses fast text extraction
-    - Image-based PDFs: Uses Google Gemini Flash 2.0 (if configured)
-    Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
-    are ignored for splitting by default (so repeated company GST won't prevent splits).
     """
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
@@ -320,119 +505,116 @@ async def split_invoices(
         raise HTTPException(status_code=400, detail="empty file")
     try:
-        doc = fitz.open(stream=file_bytes, filetype="pdf")
-        if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
         print(f"\n{'='*60}")
         print(f"Processing PDF: {file.filename}")
         print(f"Total pages: {doc.page_count}")
         print(f"{'='*60}")
-        # Step 1: Detect PDF type (text-based vs image-based)
         is_image_pdf, avg_text_len = is_image_based_pdf(doc)
         if is_image_pdf and not get_gemini_model():
-            raise HTTPException(
-                status_code=500,
-                detail="Image-based PDF detected but Google Gemini is not configured.  "
-                       "Please add GEMINI_API_KEY to your environment variables."
-            )
         # Step 2: Extract invoice numbers from each page
-        page_invoice_nos: List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
-            inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
-            # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
-            if inv:
                 print(f"  ✓ Raw extracted id: {inv}")
             else:
-                print(f"  ✗ No invoice found (raw)")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
         print(f"Raw Extraction Results:  {page_invoice_nos}")
         print(f"{'='*60}")
-        # ---------------------------------------------------------
-        # Post-process extracted ids before grouping
-        # - Treat GST:<value> as a LAST-RESORT marker and ignore it for splitting
-        #   (convert to None) so repeated company GST doesn't group pages together.
-        # - Keep actual invoice ids like '5EN19710' intact.
-        # ---------------------------------------------------------
-        page_invoice_nos_filtered: List[Optional[str]] = []
-        for v in page_invoice_nos:
             if v is None:
                 page_invoice_nos_filtered.append(None)
             else:
-                # If GST-tagged value (we returned "GST:..."), ignore it for splitting
                 if isinstance(v, str) and v.upper().startswith("GST:"):
                     page_invoice_nos_filtered.append(None)
                 else:
-                    page_invoice_nos_filtered.append(v)
         print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
-        # Step 3: Group pages by invoice number (use filtered ids)
         groups: List[Dict] = []
-        current_group_pages: List[int] = []
-        current_invoice: Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos_filtered):
             if current_invoice is None:
-                # Start a new group (even if inv is None)
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
-                # If a new non-empty invoice appears and differs -> close current group
                 if inv is not None and inv != current_invoice:
                     groups.append({
                         "invoice_no": current_invoice,
-                        "pages": current_group_pages[:],
                     })
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
-                    # Continue current group (same invoice or both None)
                     current_group_pages.append(idx)
         # Save last group
         if current_group_pages:
             groups.append({
-                "invoice_no": current_invoice,
                 "pages": current_group_pages[:]
             })
-        # Post-process groups:
-        # If first group has invoice_no None and next group has non-None -> merge leading None
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
-            groups.pop(0)
-        # If, after filtering, all groups are None (no invoice detected), return whole doc as one part
         if all(g["invoice_no"] is None for g in groups):
-            print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
             print("  Returning entire PDF as single part")
             groups = [{
                 "invoice_no": None,
                 "pages": list(range(doc.page_count))
             }]
-        # Step 4: Build response parts
         parts = []
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
-                # Keep invoice_no as detected in filtered set (None or actual invoice id)
                 "invoice_no": g["invoice_no"],
-                "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
                 "num_pages": len(g["pages"]),
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
-                info["pdf_base64"] = base64.b64encode(
-                    part_bytes).decode("ascii")
             parts.append(info)
             print(f"\nPart {idx+1}:")
             print(f"  Invoice: {g['invoice_no']}")
@@ -448,13 +630,19 @@ async def split_invoices(
         return JSONResponse({
             "count": len(parts),
             "pdf_type": "image-based" if is_image_pdf else "text-based",
             "parts": parts
         })
-    except HTTPException:
         raise
     except Exception as e:
-        print(f"\n✗ Error: {str(e)}")
         import traceback
         traceback.print_exc()
         return JSONResponse({"error": str(e)}, status_code=500)
@@ -463,13 +651,81 @@ async def split_invoices(
 @app.get("/health")
 async def health_check():
     """Health check endpoint to verify Gemini configuration."""
-    gemini_status = "configured" if get_gemini_model() else "not configured"
     return {
         "status": "healthy",
-        "gemini_flash": gemini_status,
         "gemini_available": GEMINI_AVAILABLE,
     }
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8001)

 import io
 import re
 import base64
+import time
+import datetime
 from typing import List, Dict, Optional, Tuple
+from collections import deque
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
+except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
 )
 # --- Google Gemini Configuration ---
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
+# Model fallback list (in priority order)
+GEMINI_MODELS = [
+    {
+        "name": "gemini-2.5-flash-image",  # PRIMARY - Recommended by Google
+        "max_requests_per_minute": 50,  # Higher quota limit
+        "timeout": 300,
+        "description": "Primary model with higher quota"
+    },
+    {
+        "name": "gemini-2.0-flash",  # Fallback
+        "max_requests_per_minute": 15,
+        "timeout": 300,
+        "description": "Pro fallback"
+    },
+    {
+        "name": "gemini-3-flash",  # Fallback
+        "max_requests_per_minute": 15,
+        "timeout": 300,
+        "description": "Pro fallback"
+    },
+    {
+        "name": "gemini-2.0-flash-exp",  # FALLBACK 1 - Your original choice
+        "max_requests_per_minute": 9,  # Conservative (under 10 limit)
+        "timeout": 300,
+        "description": "Fallback experimental model"
+    }
+]
+current_model_index = 0
 gemini_model = None
+last_quota_reset = None
+daily_quota_exhausted = False
+# --- Rate Limiter Class ---
+class SimpleRateLimiter:
+    def __init__(self, max_requests=10, window_seconds=60):
+        self.max_requests = max_requests
+        self.window_seconds = window_seconds
+        self.requests = deque()
+        self.quota_error_count = 0
+    def allow_request(self):
+        now = time.time()
+        # Remove old requests outside time window
+        while self.requests and self.requests[0] < now - self.window_seconds:
+            self.requests.popleft()
+        if len(self.requests) < self.max_requests:
+            self.requests.append(now)
+            return True
+        return False
+    def wait_time(self):
+        if not self.requests:
+            return 0
+        oldest = self.requests[0]
+        return max(0, self.window_seconds - (time.time() - oldest))
+    def reset(self):
+        self.requests.clear()
+        self.quota_error_count = 0
+    def record_quota_error(self):
+        self.quota_error_count += 1
+# Initialize rate limiter for current model
+gemini_rate_limiter = SimpleRateLimiter(
+    max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
+    window_seconds=60
+)
+# --- Daily Quota Management ---
+def check_daily_quota():
+    """Check if we should reset daily quota flag."""
+    global last_quota_reset, daily_quota_exhausted
+    now = datetime.datetime.now()
+    if last_quota_reset is None:
+        last_quota_reset = now
+        daily_quota_exhausted = False
+        return True
+    # Reset at midnight
+    if now.date() > last_quota_reset.date():
+        print("🔄 Daily quota reset detected")
+        last_quota_reset = now
+        daily_quota_exhausted = False
+        # Also reset to primary model
+        reset_to_primary_model()
+        return True
+    return not daily_quota_exhausted
+def mark_daily_quota_exhausted():
+    """Mark daily quota as exhausted."""
+    global daily_quota_exhausted
+    daily_quota_exhausted = True
+    next_reset = (datetime.datetime.now() + datetime.timedelta(days=1)).replace(
+        hour=0, minute=0, second=0
+    )
+    print(f"❌ Daily quota exhausted - resets at {next_reset.strftime('%Y-%m-%d %H:%M')}")
+# --- Model Management Functions ---
 def get_gemini_model():
+    """Get or create Gemini model instance with auto-fallback."""
+    global gemini_model, current_model_index
     if not GEMINI_AVAILABLE:
         print("Gemini SDK not available")
         return None
+    if not GEMINI_API_KEY:
+        print("Warning:  Gemini API key not found in environment variables.")
+        return None
+    # Check daily quota first
+    if not check_daily_quota():
+        print("Daily quota exhausted, Gemini unavailable until reset")
+        return None
+    # Try to initialize model if not already done
+    if gemini_model is None:
+        model_config = GEMINI_MODELS[current_model_index]
         try:
             genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel(model_config["name"])
+            print(f"✓ Initialized:  {model_config['name']} ({model_config['description']})")
+        except Exception as e:
+            print(f"Failed to initialize {model_config['name']}: {e}")
             return None
     return gemini_model
+def switch_to_next_model():
+    """Switch to next available model in fallback chain."""
+    global gemini_model, current_model_index, gemini_rate_limiter
+    if current_model_index < len(GEMINI_MODELS) - 1:
+        current_model_index += 1
+        model_config = GEMINI_MODELS[current_model_index]
+        # Reset rate limiter with new model's limits
+        gemini_rate_limiter = SimpleRateLimiter(
+            max_requests=model_config["max_requests_per_minute"],
+            window_seconds=60
+        )
+        # Force reinitialization
+        gemini_model = None
+        print(f"🔄 SWITCHED TO MODEL: {model_config['name']} ({model_config['description']})")
+        return get_gemini_model()
+    else:
+        print("❌ All models exhausted!")
+        return None
+def reset_to_primary_model():
+    """Reset back to primary model."""
+    global gemini_model, current_model_index, gemini_rate_limiter
+    if current_model_index != 0:
+        old_model = GEMINI_MODELS[current_model_index]['name']
+        current_model_index = 0
+        model_config = GEMINI_MODELS[0]
+        gemini_rate_limiter = SimpleRateLimiter(
+            max_requests=model_config["max_requests_per_minute"],
+            window_seconds=60
+        )
+        gemini_model = None
+        print(f"🔄 Reset from {old_model} to primary model:  {model_config['name']}")
+        return True
+    return False
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
     r"""
+    (?:
         Invoice\s*No\.?|
+        Inv\.?\s*No\.?|
         Bill\s*No\.?|
+        Document\s*No\.?|
         Doc\s*No\.?|
+        Tax\s*Invoice\s*No\.?
     )
     \s*[:\-]?\s*
     ([A-Z0-9][A-Z0-9\-\/]{3,})
     re.IGNORECASE | re.VERBOSE
 )
 PREFIXED_INVOICE_RE = re.compile(
+    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
 GST_LIKE_RE = re.compile(
+    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b",
+    re.IGNORECASE
+)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
     """
     Detect if PDF is image-based or text-based by sampling pages.
     Returns (is_image_based, avg_text_length).
     """
     total_text_length = 0
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
+        text = doc.load_page(i).get_text("text") or ""
         total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
     is_image_based = avg_text_length < 50
+    print(f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
+    print(f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
     return is_image_based, avg_text_length
 # ============================================================================
+# TEXT-BASED PDF EXTRACTION
 # ============================================================================
 def normalize_text_for_search(s: str) -> str:
+    """Light normalization:  collapse whitespace and normalize common separators."""
     if not s:
         return s
     s = s.replace("\u00A0", " ")  # non-breaking space
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
+    """Extract invoice number from text using regex patterns."""
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
     # 1) Labeled invoice like "Invoice No", "Inv No."
+    m = INVOICE_NO_RE.search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
             return inv
+    # 2) Search top portion for prefixed invoice codes
+    top_text = text_norm[:600]
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
+    # 3) Last-resort: GST detection
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
         gst_val = gm.group(2) or ""
         gst_val = gst_val.replace(" ", "").strip().upper()
         if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
             return f"GST:{gst_val}"
     return None
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
+    """Extract invoice number from TEXT-BASED PDF."""
     # Try full-page text
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
 # ============================================================================
+# IMAGE-BASED PDF EXTRACTION (Google Gemini with Auto-Switching)
 # ============================================================================
+def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
     """
+    Extract invoice number from IMAGE-BASED PDF using Google Gemini.
+    With automatic model switching on quota exhaustion.
     """
+    # Check daily quota first
+    if not check_daily_quota():
+        print("    ❌ Daily quota exhausted, skipping Gemini")
+        return None
     model = get_gemini_model()
     if not model:
         print("    Gemini model not available")
         return None
+    # Check rate limit
+    if not gemini_rate_limiter.allow_request():
+        wait_time = gemini_rate_limiter.wait_time()
+        print(f"    ⏱ Rate limit reached, waiting {int(wait_time)}s...")
+        time.sleep(wait_time + 1)
+        return extract_invoice_gemini(page, retry_count)
     try:
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
         img = Image.open(io.BytesIO(img_bytes))
+        # Prompt for Gemini
         prompt = """
+        Extract the invoice number from this image. Look for:
+        - Invoice No, Invoice Number, Bill No, Bill Number, Document No
         - Any alphanumeric code that appears to be an invoice identifier
         - Purchase Order numbers if no invoice number is found
         If no invoice number is found, return "NOT_FOUND".
         """
+        model_name = GEMINI_MODELS[current_model_index]["name"]
+        print(f"    Calling Gemini API (model: {model_name})...")
         response = model.generate_content([prompt, img])
         if response and response.text:
             print(f"    Gemini response: {extracted_text}")
             if extracted_text and extracted_text != "NOT_FOUND":
+                invoice_no = extracted_text.replace("*", "").replace("#", "").strip()
                 if invoice_no and len(invoice_no) > 2:
+                    print(f"    ✓ Gemini found invoice:  {invoice_no}")
                     return invoice_no
             # Fallback:  Get full OCR text and try regex
+            ocr_prompt = "Extract all text from this invoice image.  Return the complete text content."
             ocr_response = model.generate_content([ocr_prompt, img])
             if ocr_response and ocr_response.text:
+                print(f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
                 inv = try_extract_invoice_from_text(ocr_response.text)
                 if inv:
                     print(f"    ✓ Found via regex on Gemini text: {inv}")
         print("    ✗ Gemini:  No invoice found")
         return None
+    except Exception as e:
+        error_str = str(e).lower()
+        # Handle quota exhausted errors
+        if "429" in str(e) or "quota" in error_str or "resource" in error_str:
+            print(f"    ❌ QUOTA ERROR: {e}")
+            gemini_rate_limiter.record_quota_error()
+            # Check if it's daily quota
+            if "per_day" in error_str or "limit: 0" in str(e):
+                print("    ❌ DAILY quota exhausted")
+                mark_daily_quota_exhausted()
+                return None
+            # Per-minute quota - try switching model
+            if retry_count < len(GEMINI_MODELS) - 1:
+                print(f"    🔄 Switching to fallback model (attempt {retry_count + 1})...")
+                if switch_to_next_model():
+                    time.sleep(2)  # Brief delay before retry
+                    return extract_invoice_gemini(page, retry_count + 1)
+            # Wait and retry once more with current model
+            if retry_count < len(GEMINI_MODELS):
+                retry_delay = 30
+                # Try to extract retry delay from error
+                import re as regex
+                match = regex.search(r'seconds:\s*(\d+)', str(e))
+                if match:
+                    retry_delay = int(match.group(1)) + 2
+                print(f"    ⏰ Waiting {retry_delay}s before final retry...")
+                time.sleep(retry_delay)
+                return extract_invoice_gemini(page, retry_count + 1)
+            print("    ❌ All retry attempts exhausted")
+            return None
+        # Other errors
         print(f"    ✗ Gemini extraction failed: {e}")
         return None
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
     """Try text extraction first, then Gemini as fallback"""
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
     if text_result:
         return text_result
     # If text fails AND PDF seems image-based, try Gemini
+    if is_image_pdf:
         gemini_result = extract_invoice_gemini(page)
         if gemini_result:
             print(f"  ✓ Found via Gemini: {gemini_result}")
 # ============================================================================
+# API ENDPOINTS
 # ============================================================================
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
+    initial_dpi: int = Form(300),
 ):
     """
     Split a multi-invoice PDF into separate PDFs based on invoice numbers.
+    Features:
+    - Text-based PDFs:  Fast text extraction
+    - Image-based PDFs: Google Gemini with auto-model switching
+    - Auto-switches between models when quota exhausted
+    - Daily quota tracking with auto-reset
     """
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
         raise HTTPException(status_code=400, detail="empty file")
     try:
+        doc = fitz.open(stream=file_bytes, filetype="pdf")
+        if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
         print(f"\n{'='*60}")
         print(f"Processing PDF: {file.filename}")
         print(f"Total pages: {doc.page_count}")
+        if GEMINI_AVAILABLE:
+            model_status = GEMINI_MODELS[current_model_index]["name"]
+            print(f"Current Gemini model: {model_status}")
+            print(f"Daily quota exhausted: {daily_quota_exhausted}")
         print(f"{'='*60}")
+        # Step 1: Detect PDF type
         is_image_pdf, avg_text_len = is_image_based_pdf(doc)
         if is_image_pdf and not get_gemini_model():
+            if daily_quota_exhausted:
+                raise HTTPException(
+                    status_code=429,
+                    detail="Image-based PDF detected but Gemini API daily quota is exhausted. "
+                           "Please try again tomorrow or use text-based PDFs."
+                )
+            else:
+                raise HTTPException(
+                    status_code=500,
+                    detail="Image-based PDF detected but Google Gemini is not configured. "
+                           "Please add GEMINI_API_KEY to your environment variables."
+                )
         # Step 2: Extract invoice numbers from each page
+        page_invoice_nos:  List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
+            inv = extract_invoice_no_from_page(doc. load_page(i), is_image_pdf)
+            if inv:
                 print(f"  ✓ Raw extracted id: {inv}")
             else:
+                print(f"  ✗ No invoice found")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
         print(f"Raw Extraction Results:  {page_invoice_nos}")
         print(f"{'='*60}")
+        # Step 3: Filter GST values
+        page_invoice_nos_filtered:  List[Optional[str]] = []
+        for v in page_invoice_nos:
             if v is None:
                 page_invoice_nos_filtered.append(None)
             else:
                 if isinstance(v, str) and v.upper().startswith("GST:"):
                     page_invoice_nos_filtered.append(None)
                 else:
+                    page_invoice_nos_filtered. append(v)
         print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
+        # Step 4: Group pages by invoice number
         groups: List[Dict] = []
+        current_group_pages:  List[int] = []
+        current_invoice:  Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos_filtered):
             if current_invoice is None:
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
                 if inv is not None and inv != current_invoice:
                     groups.append({
                         "invoice_no": current_invoice,
+                        "pages": current_group_pages[: ],
                     })
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
                     current_group_pages.append(idx)
         # Save last group
         if current_group_pages:
             groups.append({
+                "invoice_no":  current_invoice,
                 "pages": current_group_pages[:]
             })
+        # Post-process groups
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
+            groups. pop(0)
         if all(g["invoice_no"] is None for g in groups):
+            print("\n⚠ Warning: No invoices detected in any page!")
             print("  Returning entire PDF as single part")
             groups = [{
                 "invoice_no": None,
                 "pages": list(range(doc.page_count))
             }]
+        # Step 5: Build response parts
         parts = []
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
+                "pages": [p + 1 for p in g["pages"]],
                 "num_pages": len(g["pages"]),
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
+                info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
             parts.append(info)
             print(f"\nPart {idx+1}:")
             print(f"  Invoice: {g['invoice_no']}")
         return JSONResponse({
             "count": len(parts),
             "pdf_type": "image-based" if is_image_pdf else "text-based",
+            "current_model":  GEMINI_MODELS[current_model_index]["name"] if GEMINI_AVAILABLE else None,
+            "quota_status": {
+                "daily_exhausted": daily_quota_exhausted,
+                "current_model_index": current_model_index,
+                "total_models": len(GEMINI_MODELS)
+            },
             "parts": parts
         })
+    except HTTPException:
         raise
     except Exception as e:
+        print(f"\n✗ Error:  {str(e)}")
         import traceback
         traceback.print_exc()
         return JSONResponse({"error": str(e)}, status_code=500)
 @app.get("/health")
 async def health_check():
     """Report liveness plus the current Gemini model and quota indicators.

     "gemini_status" is "configured" only when the SDK is available and
     ``get_gemini_model()`` returns a truthy value; otherwise it is
     "not available" and "current_model" is ``None``.
     """
     # Short-circuits exactly like the original: no model lookup without the SDK.
     gemini_ready = GEMINI_AVAILABLE and get_gemini_model()
     if gemini_ready:
         status_label = "configured"
         active_name = GEMINI_MODELS[current_model_index]["name"]
     else:
         status_label = "not available"
         active_name = None
     report = {
         "status": "healthy",
         "gemini_available": GEMINI_AVAILABLE,
         "gemini_status": status_label,
         "current_model": active_name,
         "current_model_index": current_model_index,
         "total_models": len(GEMINI_MODELS),
         "daily_quota_exhausted": daily_quota_exhausted,
         "quota_errors": gemini_rate_limiter.quota_error_count if GEMINI_AVAILABLE else 0,
     }
     return report
@app.post("/admin/reset-model")
async def admin_reset_model():
    """Invoke ``reset_to_primary_model()`` and describe the outcome.

    A truthy result is reported with status "success"; a falsy one with
    status "info" ("Already on primary model"). Both responses include the
    name of the model that is current after the call.
    """
    was_reset = reset_to_primary_model()
    if was_reset:
        message, status = "Successfully reset to primary model", "success"
    else:
        message, status = "Already on primary model", "info"
    # The model name is read after the reset attempt, as in both original branches.
    return {
        "message": message,
        "current_model": GEMINI_MODELS[current_model_index]["name"],
        "status": status,
    }
@app.get("/status")
async def get_status():
    """Return the active Gemini model, the full model list and quota counters.

    The response also carries a "timestamp" taken from
    ``datetime.datetime.now()`` at the moment the response is built.
    """
    active = GEMINI_MODELS[current_model_index]
    active_summary = {
        "name": active["name"],
        "description": active["description"],
        "index": current_model_index,
        "max_rpm": active["max_requests_per_minute"],
    }
    model_rows = []
    for position, entry in enumerate(GEMINI_MODELS):
        model_rows.append({
            "name": entry["name"],
            "description": entry["description"],
            "max_rpm": entry["max_requests_per_minute"],
            "is_active": position == current_model_index,
        })
    # last_quota_reset may be unset; only format it when it is truthy.
    if last_quota_reset:
        reset_stamp = last_quota_reset.isoformat()
    else:
        reset_stamp = None
    quota_summary = {
        "daily_exhausted": daily_quota_exhausted,
        "last_reset": reset_stamp,
        "quota_errors": gemini_rate_limiter.quota_error_count,
    }
    return {
        "current_model": active_summary,
        "all_models": model_rows,
        "quota_status": quota_summary,
        "timestamp": datetime.datetime.now().isoformat(),
    }
 if __name__ == "__main__":
     # Script entry point: print the configured model line-up, then serve the
     # app with uvicorn on all interfaces, port 7860.
     import uvicorn
     rule = "=" * 80
     print(rule)
     print("🚀 Starting Invoice Splitter API")
     print(rule)
     print("📋 Available Gemini Models:")
     for position, model in enumerate(GEMINI_MODELS):
         # Index 0 is labelled as the primary model; the rest are numbered fallbacks.
         label = f"🔄 FALLBACK {position}" if position != 0 else "🎯 PRIMARY"
         print(f"   {label}:  {model['name']} - {model['description']}")
         print(f"      Rate Limit: {model['max_requests_per_minute']} req/min")
     print(rule)
     uvicorn.run(app, host="0.0.0.0", port=7860)