Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed 17 days ago

Commit

0da7d43

verified ·

1 Parent(s): 7e49357

Update app.py

Browse files

Files changed (1) hide show

app.py +191 -211

app.py CHANGED Viewed

@@ -61,7 +61,8 @@ app.add_middleware(
 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
 # Azure Blob Storage Configuration - REQUIRED for blob storage
-AZURE_STORAGE_CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
 AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
 AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
@@ -72,9 +73,12 @@ AZURE_CONTAINER_NAME = os.environ.get("AZURE_CONTAINER_NAME", "invoice-splits")
 ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD")  # Root folder name
 # ⭐ PERFORMANCE CONFIGURATION
-MAX_PARALLEL_GEMINI_CALLS = int(os.environ.get("MAX_PARALLEL_GEMINI_CALLS", "5"))
-GEMINI_IMAGE_RESOLUTION = float(os.environ.get("GEMINI_IMAGE_RESOLUTION", "1.2"))
-USE_SMART_SAMPLING = os.environ.get("USE_SMART_SAMPLING", "false").lower() == "true"
 # ⭐ SERVER CONFIGURATION
 HOST = os.environ.get("HOST", "0.0.0.0")  # Hugging Face uses 0.0.0.0
@@ -91,38 +95,42 @@ blob_service_client = None
 # STARTUP VALIDATION
 # ============================================================================
 def validate_configuration():
     """Validate configuration and warn about missing credentials."""
     warnings = []
     errors = []
     # Check Gemini API Key
     if not GEMINI_API_KEY:
-        warnings.append("⚠️  GEMINI_API_KEY not set - image-based PDFs will not work")
     else:
         print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
     # Check Azure credentials
     if not AZURE_STORAGE_CONNECTION_STRING:
         if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
-            errors.append("❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")
         else:
-            print(f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
     else:
         print(f"✅ Azure connection string configured")
     # Print all warnings
     for warning in warnings:
         print(warning)
     # Print all errors
     for error in errors:
         print(error)
     if errors:
         print("\n⚠️  WARNING: Some required credentials are missing!")
         print("   Set them in Hugging Face Spaces Settings > Repository secrets")
     return len(errors) == 0
@@ -410,16 +418,48 @@ def get_gemini_model():
     return gemini_model
 def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
     """
     Optimized synchronous Gemini extraction for thread pool execution.
     - Reduced image resolution for faster processing
     - Simplified prompt for quicker responses
     """
     model = get_gemini_model()
     if not model:
         return None
     try:
         # Reduced resolution for faster processing
         pix = page.get_pixmap(matrix=fitz.Matrix(
@@ -428,26 +468,36 @@ def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
         pix = None
         img = Image.open(io.BytesIO(img_bytes))
-        # Optimized prompt for faster response
         prompt = """Extract ONLY the invoice number from this image.
-Look for: Invoice No, Bill No, Tax Invoice No, or Document No.
-Return ONLY the number/code. If not found, return: NONE"""
         response = model.generate_content([prompt, img])
         if response and response.text:
             extracted_text = response.text.strip()
-            if extracted_text and extracted_text not in ("NOT_FOUND", "NONE", "N/A", "NA"):
-                invoice_no = extracted_text.replace(
-                    "*", "").replace("#", "").replace("Invoice No:", "").replace(":", "").strip()
-                if invoice_no and len(invoice_no) > 2:
-                    img.close()
-                    return invoice_no
-        img.close()
         return None
     except Exception as e:
         print(f"Gemini error: {e}")
         return None
@@ -634,49 +684,121 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
-    label_match = re.search(
-        r"(?:Invoice\s*No\.?|Inv\.?\s*No\.?|Bill\s*No\.?|Doc\s*No\.?|Document\s*No\.?|Tax\s*Invoice\s*No\.?)[\s:\-]*(\d{6,15})",
         text_norm, re.IGNORECASE
     )
-    if label_match:
-        invoice_num = label_match.group(1).strip()
-        if is_valid_invoice_number(invoice_num):
-            return invoice_num.upper()
     label_match = re.search(
-        r"(?:Invoice|Inv|Bill|Doc|Document|Tax\s*Invoice)\s*(?:No|#|\.|:\s*)",
         text_norm, re.IGNORECASE
     )
     if label_match:
         start_idx = label_match.end()
-        candidate_text = text_norm[start_idx:start_idx + 60]
-        clean_candidates = re.sub(r"[:\-\(\)\[\]]", " ", candidate_text)
-        words = clean_candidates.split()
-        for word in words:
-            word = word.strip(".,;")
-            if word.lower() in ("order", "ref", "no", "date", "dt", "inv", "bill", "account"):
                 continue
-            if len(word) > 2 and is_valid_invoice_number(word):
-                return word.upper()
-    top_text = text_norm[:800]
-    digit_matches = re.findall(r'\b(\d{6,15})\b', top_text)
-    for match in digit_matches:
-        if is_valid_invoice_number(match):
-            if not re.match(r'^(19|20)\d{6}$', match):
-                if not re.match(r'^[6-9]\d{9}$', match):
-                    return match.upper()
-    top_text = text_norm[:600]
-    m = re.search(r"\b([A-Z0-9][A-Z0-9\-\/]{4,})\b", top_text)
-    if m:
-        inv = m.group(1).upper()
-        if is_valid_invoice_number(inv):
-            return inv
     return None
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
@@ -879,15 +1001,16 @@ async def split_invoices(
         # ============================================================================
         # 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
         # ============================================================================
         print(f"\n🔧 Grouping invoices...")
         # DEBUG: Show raw extraction results
         print(f"\n🔍 DEBUG - Raw extraction results:")
         for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
             print(f"   Page {idx+1}: {inv if inv else '(not found)'}")
         if len(page_invoice_nos) > 10:
-            print(f"   ... (showing first 10 of {len(page_invoice_nos)} pages)")
         # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
         page_invoice_nos_normalized = []
@@ -918,10 +1041,11 @@ async def split_invoices(
         # Count how many pages were forward-filled
         filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
-                          if page_invoice_nos_normalized[i] is None and page_invoice_nos_filled[i] is not None)
         # Debug: Count unique invoice numbers
-        unique_invoices = set([v for v in page_invoice_nos_filled if v is not None])
         print(f"\n   📊 Found {len(unique_invoices)} unique invoice numbers:")
         for inv_no in sorted(unique_invoices) if unique_invoices else []:
             page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
@@ -944,7 +1068,8 @@ async def split_invoices(
                         "invoice_no": current_invoice,
                         "pages": current_group[:]
                     })
-                    print(f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
                     current_invoice = inv
                     current_group = [idx]
                 else:
@@ -957,7 +1082,8 @@ async def split_invoices(
                 "invoice_no": current_invoice,
                 "pages": current_group[:]
             })
-            print(f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
         # Handle edge case: entire PDF has no invoice numbers
         if len(groups) == 1 and groups[0]["invoice_no"] is None:
@@ -967,7 +1093,8 @@ async def split_invoices(
             }]
         print(f"\n✅ Created {len(groups)} invoice groups")
-        print(f"   Forward-filled {filled_count} pages with missing invoice numbers")
         # Build and upload split PDFs
         print(f"\n🔨 Building and uploading split invoices...")
@@ -1118,151 +1245,4 @@ async def cleanup_batch(
         "batch_id": batch_id,
         "folder_path": f"{ROOT_FOLDER}/{batch_id}/",
         "container": container_name
-    })
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    gemini_status = "configured" if get_gemini_model() else "not configured"
-    blob_status = "not configured"
-    blob_details = None
-    try:
-        client = get_blob_service_client()
-        if client:
-            blob_status = "configured"
-            blob_details = {
-                "account_name": AZURE_STORAGE_ACCOUNT_NAME,
-                "container": AZURE_CONTAINER_NAME,
-                "root_folder": ROOT_FOLDER,
-                "available": True
-            }
-    except Exception as e:
-        blob_status = f"error: {str(e)}"
-    return {
-        "status": "healthy",
-        "timestamp": datetime.now().isoformat(),
-        "services": {
-            "gemini": {
-                "status": gemini_status,
-                "available": GEMINI_AVAILABLE,
-                "model": "gemini-2.5-flash",
-                "api_key_set": bool(GEMINI_API_KEY)
-            },
-            "azure_blob_storage": {
-                "status": blob_status,
-                "available": AZURE_AVAILABLE,
-                "details": blob_details,
-                "credentials_set": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
-            }
-        },
-        "performance": {
-            "max_parallel_gemini_calls": MAX_PARALLEL_GEMINI_CALLS,
-            "gemini_image_resolution": GEMINI_IMAGE_RESOLUTION,
-            "smart_sampling_default": USE_SMART_SAMPLING
-        },
-        "environment": {
-            "host": HOST,
-            "port": PORT
-        }
-    }
-@app.get("/")
-async def root():
-    """Root endpoint."""
-    return {
-        "name": "Invoice Splitter API",
-        "version": "6.0.0 - Fixed Grouping Logic",
-        "description": "Split PDF invoices with Azure Blob Storage - Splits on invoice number change",
-        "features": {
-            "parallel_processing": f"Up to {MAX_PARALLEL_GEMINI_CALLS} concurrent Gemini API calls",
-            "smart_sampling": "Optional fast mode for large PDFs (~5-10x faster)",
-            "optimized_prompts": "Faster Gemini responses",
-            "reduced_resolution": f"Image processing at {GEMINI_IMAGE_RESOLUTION}x for speed",
-            "no_aggressive_filtering": "Keeps all extracted invoice numbers (fixed bug)"
-        },
-        "folder_structure": {
-            "format": "POD/{batch_id}/{filename}/Raw|Splitted/",
-            "raw_folder": "Contains original uploaded PDF",
-            "split_folder": "Contains individual split invoice PDFs"
-        },
-        "endpoints": {
-            "split_invoices": "/split-invoices",
-            "cleanup_batch": "/cleanup-batch/{batch_id}",
-            "health": "/health"
-        },
-        "configuration": {
-            "gemini_configured": bool(GEMINI_API_KEY),
-            "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY)),
-            "environment_ready": validate_configuration()
-        }
-    }
-if __name__ == "__main__":
-    import uvicorn
-    print("\n" + "="*70)
-    print("🚀 Invoice Splitter API - v6.0 FIXED (Hugging Face)")
-    print("="*70)
-    # Validate configuration
-    config_valid = validate_configuration()
-    print(f"\n⚡ Performance Features:")
-    print(
-        f"   • Parallel Gemini API calls: {MAX_PARALLEL_GEMINI_CALLS} workers")
-    print(f"   • Image resolution: {GEMINI_IMAGE_RESOLUTION}x (optimized)")
-    print(
-        f"   • Smart sampling: {'Enabled' if USE_SMART_SAMPLING else 'Disabled'} (optional)")
-    print(f"   • Expected speed: 5-10x faster for image PDFs")
-    print(f"\n🔧 Bug Fixes:")
-    print(f"   • ✅ Removed aggressive frequency filtering")
-    print(f"   • ✅ Splits on every invoice number change")
-    print(f"   • ✅ Keeps all extracted invoice numbers")
-    print(f"   • ✅ Added detailed debug logging")
-    print(f"\n📁 Folder Structure:")
-    print(f"   {ROOT_FOLDER}/{{batch_id}}/{{filename}}/")
-    print(f"     ├── Raw/ (original PDF)")
-    print(f"     └── Splitted/ (split invoices)")
-    print(f"\n📦 Azure Configuration:")
-    print(f"   Account: {AZURE_STORAGE_ACCOUNT_NAME or 'Not set'}")
-    print(f"   Container: {AZURE_CONTAINER_NAME}")
-    if get_blob_service_client():
-        print(f"   ✅ Azure Blob Storage: Connected")
-    else:
-        print(f"   ⚠️ Azure Blob Storage: Not configured")
-    if get_gemini_model():
-        print(f"   ✅ Gemini AI: Connected (gemini-2.5-flash)")
-    else:
-        print(f"   ⚠️ Gemini AI: Not configured")
-    print(f"\n🌐 Server Configuration:")
-    print(f"   Host: {HOST}")
-    print(f"   Port: {PORT}")
-    if not config_valid:
-        print(f"\n⚠️  WARNING: Some credentials are missing!")
-        print(f"   For Hugging Face deployment:")
-        print(f"   1. Go to your Space Settings > Repository secrets")
-        print(f"   2. Add the following secrets:")
-        print(f"      - GEMINI_API_KEY")
-        print(f"      - AZURE_STORAGE_CONNECTION_STRING (or)")
-        print(f"      - AZURE_STORAGE_ACCOUNT_NAME + AZURE_STORAGE_ACCOUNT_KEY")
-    print("\n" + "="*70 + "\n")
-    uvicorn.run(
-        app,
-        host=HOST,
-        port=PORT,
-        workers=1,
-        timeout_keep_alive=600
-    )

 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
 # Azure Blob Storage Configuration - REQUIRED for blob storage
+AZURE_STORAGE_CONNECTION_STRING = os.environ.get(
+    "AZURE_STORAGE_CONNECTION_STRING", "")
 AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
 AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
 ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD")  # Root folder name
 # ⭐ PERFORMANCE CONFIGURATION
+MAX_PARALLEL_GEMINI_CALLS = int(
+    os.environ.get("MAX_PARALLEL_GEMINI_CALLS", "5"))
+GEMINI_IMAGE_RESOLUTION = float(
+    os.environ.get("GEMINI_IMAGE_RESOLUTION", "1.2"))
+USE_SMART_SAMPLING = os.environ.get(
+    "USE_SMART_SAMPLING", "false").lower() == "true"
 # ⭐ SERVER CONFIGURATION
 HOST = os.environ.get("HOST", "0.0.0.0")  # Hugging Face uses 0.0.0.0
 # STARTUP VALIDATION
 # ============================================================================
 def validate_configuration():
     """Report which credentials are configured and whether startup can proceed.

     A status line is printed immediately for every credential group that is
     present. Problems are collected first and printed afterwards: warnings
     (a missing Gemini key) before errors (missing Azure credentials). When at
     least one error was collected, a hint about where to set the secrets is
     printed as well.

     Returns:
         bool: True when no error was collected, False otherwise. A missing
         Gemini key only produces a warning and does not affect the result.
     """
     warning_messages = []
     error_messages = []

     # Gemini: absence is only a warning.
     if GEMINI_API_KEY:
         print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
     else:
         warning_messages.append(
             "⚠️  GEMINI_API_KEY not set - image-based PDFs will not work")

     # Azure: a connection string wins; otherwise account name AND key are
     # both required. Anything less is an error.
     if AZURE_STORAGE_CONNECTION_STRING:
         print("✅ Azure connection string configured")
     elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
         print(
             f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
     else:
         error_messages.append(
             "❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")

     # Warnings are emitted before errors, each on its own line.
     for message in warning_messages + error_messages:
         print(message)

     if error_messages:
         print("\n⚠️  WARNING: Some required credentials are missing!")
         print("   Set them in Hugging Face Spaces Settings > Repository secrets")

     return not error_messages
     return gemini_model
+def _clean_gemini_invoice_text(text: str) -> Optional[str]:
+    if not text:
+        return None
+    cleaned = text.strip()
+    cleaned = cleaned.replace("*", "").replace("#", "")
+    cleaned = re.sub(
+        r"(?i)\b(invoice|inv|bill|document|doc|tax\s*invoice)\s*(no|number)?\b",
+        "",
+        cleaned
+    )
+    cleaned = re.sub(r"[:\-\(\)\[\]]", " ", cleaned)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    # Extract candidates
+    tokens = re.findall(r"[A-Z0-9][A-Z0-9\-\/]{2,}", cleaned.upper())
+    # Prefer alphanumeric invoice IDs first
+    for token in tokens:
+        if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
+            return token
+    # Fallback to numeric-only (6-15 digits)
+    for token in tokens:
+        if token.isdigit() and 6 <= len(token) <= 15:
+            return token
+    return None
 def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
     """
     Optimized synchronous Gemini extraction for thread pool execution.
     - Reduced image resolution for faster processing
     - Simplified prompt for quicker responses
+    - OCR fallback for better accuracy
     """
     model = get_gemini_model()
     if not model:
         return None
+    img = None
     try:
         # Reduced resolution for faster processing
         pix = page.get_pixmap(matrix=fitz.Matrix(
         pix = None
         img = Image.open(io.BytesIO(img_bytes))
+        # Updated prompt to prioritize labeled alphanumeric invoice numbers
         prompt = """Extract ONLY the invoice number from this image.
+Prefer the value next to labels like: Invoice No, Invoice Number, Bill No, Document No.
+Return ONLY the identifier (keep letters, e.g., A07966). If not found, return: NONE."""
         response = model.generate_content([prompt, img])
         if response and response.text:
             extracted_text = response.text.strip()
+            candidate = _clean_gemini_invoice_text(extracted_text)
+            if candidate and len(candidate) > 2:
+                img.close()
+                return candidate
+        # OCR Fallback: Extract full text then run regex
+        ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
+        ocr_response = model.generate_content([ocr_prompt, img])
+        if ocr_response and ocr_response.text:
+            inv = try_extract_invoice_from_text(ocr_response.text)
+            if inv:
+                img.close()
+                return inv
+        if img:
+            img.close()
         return None
     except Exception as e:
         print(f"Gemini error: {e}")
+        if img:
+            img.close()
         return None
     if not text:
         return None
     text_norm = normalize_text_for_search(text)
+    # DEBUG: Print first 600 chars
+    print(f"\n🔍 DEBUG - Extracted text (first 600 chars):\n{text_norm[:600]}\n")
+    # PRIORITY 1: Look for CREDIT number (14 digits, common in pharma invoices)
+    credit_match = re.search(
+        r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
         text_norm, re.IGNORECASE
     )
+    if credit_match:
+        credit_num = credit_match.group(1).strip()
+        print(f"✓ Found CREDIT number: {credit_num}")
+        if 12 <= len(credit_num) <= 20:
+            return credit_num.upper()
+    # PRIORITY 2: Look for "Invoice No" or "Bill No" followed by long numeric (12-20 digits)
+    invoice_patterns = [
+        r"Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
+        r"Bill\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
+        r"Tax\s*Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
+    ]
+    for pattern in invoice_patterns:
+        match = re.search(pattern, text_norm, re.IGNORECASE)
+        if match:
+            num = match.group(1).strip()
+            print(f"✓ Found labeled long numeric invoice: {num}")
+            return num.upper()
+    # PRIORITY 3: Look for "Invoice No" with alphanumeric (but EXCLUDE batch patterns)
+    label_patterns = [
+        r"Invoice\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
+        r"Bill\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
+    ]
+    for pattern in label_patterns:
+        match = re.search(pattern, text_norm, re.IGNORECASE)
+        if match:
+            invoice_num = match.group(1).strip()
+            # EXCLUDE batch number patterns (single letter + 6 digits: F500256, I500734, etc.)
+            if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
+                print(f"⚠️ Skipping (batch pattern): {invoice_num}")
+                continue
+            # EXCLUDE license patterns (KA-MY2-157424)
+            if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
+                print(f"⚠️ Skipping (license pattern): {invoice_num}")
+                continue
+            print(f"✓ Found labeled alphanumeric: {invoice_num}")
+            if any(c.isalpha() for c in invoice_num) and any(c.isdigit() for c in invoice_num):
+                if 3 <= len(invoice_num) <= 20:
+                    return invoice_num.upper()
+    # PRIORITY 4: Look for long numeric values (12-20 digits) in top area
+    top_text = text_norm[:1000]
+    long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
+    if long_numerics:
+        # Take the longest one (most likely to be invoice number)
+        longest = max(long_numerics, key=len)
+        print(f"✓ Found long numeric value: {longest}")
+        return longest.upper()
+    # PRIORITY 5: Look near "Invoice" label for tokens, EXCLUDE batch patterns
     label_match = re.search(
+        r"(?:Invoice|Bill|Tax\s*Invoice)\s*(?:No|Number|#|\.|:\s*)",
         text_norm, re.IGNORECASE
     )
     if label_match:
         start_idx = label_match.end()
+        candidate_text = text_norm[start_idx:start_idx + 100]
+        print(f"🔍 Text after label: '{candidate_text[:50]}...'")
+        tokens = re.findall(r"\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b", candidate_text, re.IGNORECASE)
+        print(f"🔍 Tokens found: {tokens}")
+        for token in tokens:
+            token = token.strip(".,;:-*")
+            # Skip common words
+            if token.upper() in ("ORDER", "REF", "NO", "DATE", "DT", "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF"):
+                continue
+            # EXCLUDE batch patterns (F500256, I500734)
+            if re.match(r'^[A-Z]\d{6}$', token, re.IGNORECASE):
+                print(f"⚠️ Skipping (batch pattern): {token}")
+                continue
+            # EXCLUDE license patterns
+            if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', token, re.IGNORECASE):
+                print(f"⚠️ Skipping (license pattern): {token}")
                 continue
+            if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
+                if 3 <= len(token) <= 20:
+                    print(f"✓ Selected token: {token}")
+                    return token.upper()
+    # PRIORITY 6: Medium-length numeric (10-15 digits)
+    medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
+    for num in medium_numerics:
+        # Exclude phone numbers (10 digits starting with 6-9)
+        if len(num) == 10 and num[0] in '6789':
+            continue
+        # Exclude dates (8 digits starting with 20)
+        if len(num) == 8 and num.startswith('20'):
+            continue
+        print(f"✓ Found medium numeric value: {num}")
+        return num.upper()
+    print("✗ No invoice number found")
     return None
 def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
         # ============================================================================
         # 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
         # ============================================================================
         print(f"\n🔧 Grouping invoices...")
         # DEBUG: Show raw extraction results
         print(f"\n🔍 DEBUG - Raw extraction results:")
         for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
             print(f"   Page {idx+1}: {inv if inv else '(not found)'}")
         if len(page_invoice_nos) > 10:
+            print(
+                f"   ... (showing first 10 of {len(page_invoice_nos)} pages)")
         # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
         page_invoice_nos_normalized = []
         # Count how many pages were forward-filled
         filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
+                           if page_invoice_nos_normalized[i] is None and page_invoice_nos_filled[i] is not None)
         # Debug: Count unique invoice numbers
+        unique_invoices = set(
+            [v for v in page_invoice_nos_filled if v is not None])
         print(f"\n   📊 Found {len(unique_invoices)} unique invoice numbers:")
         for inv_no in sorted(unique_invoices) if unique_invoices else []:
             page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
                         "invoice_no": current_invoice,
                         "pages": current_group[:]
                     })
+                    print(
+                        f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
                     current_invoice = inv
                     current_group = [idx]
                 else:
                 "invoice_no": current_invoice,
                 "pages": current_group[:]
             })
+            print(
+                f"   📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
         # Handle edge case: entire PDF has no invoice numbers
         if len(groups) == 1 and groups[0]["invoice_no"] is None:
             }]
         print(f"\n✅ Created {len(groups)} invoice groups")
+        print(
+            f"   Forward-filled {filled_count} pages with missing invoice numbers")
         # Build and upload split PDFs
         print(f"\n🔨 Building and uploading split invoices...")
         "batch_id": batch_id,
         "folder_path": f"{ROOT_FOLDER}/{batch_id}/",
         "container": container_name
+    })