anujakkulkarni committed on
Commit
0e0c157
·
verified ·
1 Parent(s): 91ed191

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -314
app.py CHANGED
@@ -7,8 +7,11 @@ import tempfile
7
  import uuid
8
  import asyncio
9
  from typing import List, Dict, Optional, Tuple
10
- from collections import Counter
11
  from concurrent.futures import ThreadPoolExecutor
 
 
 
12
 
13
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
14
  from fastapi.middleware.cors import CORSMiddleware
@@ -16,7 +19,10 @@ from fastapi.responses import JSONResponse
16
  from starlette.requests import Request
17
  import fitz # PyMuPDF
18
  import google.generativeai as genai
 
19
  from PIL import Image
 
 
20
 
21
  # Azure Blob Storage
22
  try:
@@ -40,6 +46,10 @@ except ImportError:
40
 
41
  from datetime import datetime, timedelta
42
 
 
 
 
 
43
  app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
44
 
45
  # Increase request body size limit
@@ -54,7 +64,7 @@ app.add_middleware(
54
  )
55
 
56
  # ============================================================================
57
- # ⭐ CONFIGURATION FROM ENVIRONMENT VARIABLES (Hugging Face Secrets)
58
  # ============================================================================
59
 
60
  # Gemini API Key - REQUIRED for image-based PDFs
@@ -84,6 +94,10 @@ USE_SMART_SAMPLING = os.environ.get(
84
  HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
85
  PORT = int(os.environ.get("PORT", "7860")) # Hugging Face default port
86
 
 
 
 
 
87
  # ============================================================================
88
  # GLOBAL VARIABLES
89
  # ============================================================================
@@ -92,6 +106,196 @@ gemini_model = None
92
  blob_service_client = None
93
 
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  # ============================================================================
96
  # STARTUP VALIDATION
97
  # ============================================================================
@@ -102,35 +306,28 @@ def validate_configuration():
102
  warnings = []
103
  errors = []
104
 
105
- # Check Gemini API Key
106
  if not GEMINI_API_KEY:
107
  warnings.append(
108
  "⚠️ GEMINI_API_KEY not set - image-based PDFs will not work")
109
  else:
110
  print(f"βœ… GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
111
 
112
- # Check Azure credentials
113
  if not AZURE_STORAGE_CONNECTION_STRING:
114
  if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
115
- errors.append(
116
- "❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")
117
  else:
118
  print(
119
  f"βœ… Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
120
  else:
121
  print(f"βœ… Azure connection string configured")
122
 
123
- # Print all warnings
124
  for warning in warnings:
125
  print(warning)
126
-
127
- # Print all errors
128
  for error in errors:
129
  print(error)
130
 
131
  if errors:
132
  print("\n⚠️ WARNING: Some required credentials are missing!")
133
- print(" Set them in Hugging Face Spaces Settings > Repository secrets")
134
 
135
  return len(errors) == 0
136
 
@@ -156,9 +353,7 @@ def get_blob_service_client():
156
  elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
157
  account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
158
  blob_service_client = BlobServiceClient(
159
- account_url=account_url,
160
- credential=AZURE_STORAGE_ACCOUNT_KEY
161
- )
162
  print("βœ… Azure Blob Storage initialized with account key")
163
  else:
164
  print("⚠️ WARNING: No Azure credentials configured")
@@ -199,17 +394,13 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
199
  raise HTTPException(
200
  status_code=500, detail="Azure Blob Storage not configured")
201
 
202
- # Clean filename for folder name
203
  base_filename = os.path.splitext(filename)[0]
204
  safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
205
-
206
  blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
207
 
208
- # Get blob client
209
  blob_client = client.get_blob_client(
210
  container=container_name, blob=blob_name)
211
 
212
- # Upload PDF
213
  print(f"πŸ“€ Uploading raw PDF to: {blob_name}")
214
  blob_client.upload_blob(
215
  pdf_bytes,
@@ -223,7 +414,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
223
  }
224
  )
225
 
226
- # Generate SAS URL (valid for 24 hours)
227
  expiry_hours = 24
228
  sas_token = generate_blob_sas(
229
  account_name=AZURE_STORAGE_ACCOUNT_NAME,
@@ -234,7 +424,6 @@ def upload_raw_pdf_to_blob(pdf_bytes: bytes, filename: str, batch_id: str, conta
234
  expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
235
  )
236
 
237
- # Construct URLs
238
  blob_url = blob_client.url
239
  download_url = f"{blob_url}?{sas_token}"
240
  expires_at = (datetime.utcnow() +
@@ -273,17 +462,13 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
273
  raise HTTPException(
274
  status_code=500, detail="Azure Blob Storage not configured")
275
 
276
- # Clean original filename for folder name
277
  base_filename = os.path.splitext(original_filename)[0]
278
  safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
279
-
280
  blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
281
 
282
- # Get blob client
283
  blob_client = client.get_blob_client(
284
  container=container_name, blob=blob_name)
285
 
286
- # Upload PDF
287
  blob_client.upload_blob(
288
  pdf_bytes,
289
  overwrite=True,
@@ -297,7 +482,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
297
  }
298
  )
299
 
300
- # Generate SAS URL (valid for 24 hours)
301
  expiry_hours = 24
302
  sas_token = generate_blob_sas(
303
  account_name=AZURE_STORAGE_ACCOUNT_NAME,
@@ -308,7 +492,6 @@ def upload_split_pdf_to_blob(pdf_bytes: bytes, invoice_filename: str, original_f
308
  expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
309
  )
310
 
311
- # Construct URLs
312
  blob_url = blob_client.url
313
  download_url = f"{blob_url}?{sas_token}"
314
  expires_at = (datetime.utcnow() +
@@ -344,7 +527,6 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
344
  return
345
 
346
  container_client = client.get_container_client(container_name)
347
-
348
  prefix = f"{ROOT_FOLDER}/{batch_id}/"
349
  blobs = container_client.list_blobs(name_starts_with=prefix)
350
 
@@ -365,103 +547,88 @@ async def cleanup_old_blobs(batch_id: str, container_name: str = None):
365
 
366
 
367
  def get_gemini_model():
368
- """Get or create Gemini model instance."""
369
- global gemini_model
370
 
371
  if not GEMINI_AVAILABLE:
372
  return None
373
 
374
- if gemini_model is None:
375
- if not GEMINI_API_KEY:
376
- return None
377
 
378
- try:
379
- genai.configure(api_key=GEMINI_API_KEY)
380
- # Use Gemini 2.5 Flash
381
- gemini_model = genai.GenerativeModel('gemini-2.5-flash')
382
- print("βœ… Google Gemini initialized")
383
- except Exception as e:
384
- print(f"❌ Failed to initialize Gemini: {e}")
385
- return None
386
 
387
- return gemini_model
 
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
- def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
391
- """Enhanced Gemini extraction with improved prompts and fallback."""
392
- model = get_gemini_model()
393
- if not model:
394
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
- img = None
397
  try:
398
- # Reduced resolution for faster processing
399
  pix = page.get_pixmap(matrix=fitz.Matrix(
400
  GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
401
  img_bytes = pix.tobytes("png")
402
  pix = None
403
- img = Image.open(io.BytesIO(img_bytes))
404
 
405
- # ⭐ ENHANCED PROMPT: More specific instructions
406
  prompt = """Look at this invoice image and extract ONLY the invoice number.
 
407
 
408
- IMPORTANT:
409
- - Look for labels: "Invoice No", "Invoice Number", "Bill No", "Document No"
410
- - The invoice number is the value RIGHT AFTER these labels
411
- - DO NOT extract: ACK numbers, Account numbers (A/C No), Order numbers
412
- - Return ONLY the invoice number (letters and numbers, e.g., G031663, DHC036747)
413
- - If not found, return: NONE
414
-
415
- Invoice Number:"""
416
-
417
- response = model.generate_content([prompt, img])
418
- if response and response.text:
419
- extracted_text = response.text.strip()
420
-
421
- # Clean up the response
422
- cleaned = extracted_text.replace(
423
- "*", "").replace("#", "").replace("Invoice Number:", "").strip()
424
-
425
- print(f" πŸ€– Gemini raw response: '{extracted_text}'")
426
- print(f" πŸ€– Gemini cleaned: '{cleaned}'")
427
-
428
- # Basic validation
429
- if cleaned and cleaned.upper() != "NONE" and len(cleaned) >= 3:
430
- # Remove any remaining labels
431
- cleaned = re.sub(
432
- r'^(Invoice|Bill|Document)\s+(No\.?|Number)[\s\.:]*', '', cleaned, flags=re.IGNORECASE)
433
- cleaned = cleaned.strip(".,;:-_")
434
-
435
- if len(cleaned) >= 3:
436
- print(f" βœ… Gemini extracted: {cleaned}")
437
- img.close()
438
- return cleaned.upper()
439
-
440
- # ⭐ FALLBACK: Full OCR + regex extraction
441
- print(" ⚠️ Gemini direct extraction failed, trying full OCR...")
442
- ocr_prompt = """Extract ALL text from this invoice image.
443
- Return the complete text content exactly as it appears, preserving all labels and values."""
444
-
445
- ocr_response = model.generate_content([ocr_prompt, img])
446
- if ocr_response and ocr_response.text:
447
- ocr_text = ocr_response.text
448
- print(
449
- f"\n πŸ” Gemini OCR text (first 500 chars):\n{ocr_text[:500]}\n")
450
 
451
- # Try our extraction function on the OCR text
452
- inv = try_extract_invoice_from_text(ocr_text)
453
- if inv:
454
- img.close()
455
- return inv
456
 
457
- if img:
458
- img.close()
459
- return None
 
 
 
 
 
 
460
 
461
  except Exception as e:
462
- print(f" ❌ Gemini error: {e}")
463
- if img:
464
- img.close()
465
  return None
466
 
467
 
@@ -471,7 +638,6 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
471
  page_invoice_nos = []
472
 
473
  if not is_image_pdf:
474
- # Fast text-based extraction (no parallelization needed)
475
  print(f" πŸ“ Text-based extraction (sequential)")
476
  for i in range(doc.page_count):
477
  if i % 50 == 0:
@@ -484,37 +650,29 @@ async def extract_invoices_batch_async(doc: fitz.Document, is_image_pdf: bool,
484
  gc.collect()
485
  return page_invoice_nos
486
 
487
- # Image-based PDF: Use parallel Gemini processing
488
  print(f" πŸš€ Image-based extraction (parallel, batch_size={batch_size})")
489
 
490
- # Use ThreadPoolExecutor for parallel API calls
491
  with ThreadPoolExecutor(max_workers=batch_size) as executor:
492
  futures = []
493
 
494
- # Submit all pages to thread pool
495
  for i in range(doc.page_count):
496
  page = doc.load_page(i)
497
- # First try text extraction (fast)
498
  text_result = extract_invoice_text_based(page)
499
  if text_result:
500
  futures.append((i, None, text_result))
501
  else:
502
- # Submit to Gemini thread pool
503
  future = executor.submit(extract_invoice_gemini_sync, page)
504
  futures.append((i, future, None))
505
 
506
- # Collect results in order
507
  page_invoice_nos = [None] * doc.page_count
508
  completed = 0
509
 
510
  for i, future, text_result in futures:
511
  try:
512
  if text_result:
513
- # Already extracted from text
514
  page_invoice_nos[i] = text_result
515
  completed += 1
516
  else:
517
- # Wait for Gemini result
518
  result = future.result(timeout=30)
519
  page_invoice_nos[i] = result
520
  completed += 1
@@ -540,12 +698,10 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
540
 
541
  page_invoice_nos = [None] * doc.page_count
542
 
543
- # Always extract from first page
544
  page = doc.load_page(0)
545
  page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
546
  print(f" βœ“ Page 1: {page_invoice_nos[0]}")
547
 
548
- # Sample every Nth page to detect changes
549
  sample_interval = max(3, doc.page_count // 20)
550
  print(f" Sampling interval: every {sample_interval} pages")
551
 
@@ -557,7 +713,6 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
557
  if i % 10 == 0:
558
  print(f" Sampling page {i+1}/{doc.page_count}...")
559
 
560
- # If invoice changed, extract nearby pages to find exact boundary
561
  prev_known_idx = i - sample_interval
562
  while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
563
  prev_known_idx -= 1
@@ -571,13 +726,11 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
571
  page_invoice_nos[idx] = extract_invoice_no_from_page(
572
  page, is_image_pdf)
573
 
574
- # Also check last page
575
  if page_invoice_nos[-1] is None:
576
  page = doc.load_page(doc.page_count - 1)
577
  page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
578
  print(f" βœ“ Last page: {page_invoice_nos[-1]}")
579
 
580
- # Forward-fill gaps
581
  last_known = page_invoice_nos[0]
582
  filled = 0
583
  for i in range(len(page_invoice_nos)):
@@ -591,7 +744,7 @@ def extract_invoices_smart_sampling(doc: fitz.Document, is_image_pdf: bool) -> L
591
  return page_invoice_nos
592
 
593
  # ============================================================================
594
- # PDF PROCESSING FUNCTIONS (KEEP ORIGINAL + ADD ZYDUS FALLBACK)
595
  # ============================================================================
596
 
597
 
@@ -633,48 +786,26 @@ def is_valid_invoice_number(candidate: str) -> bool:
633
  has_digit = any(c.isdigit() for c in candidate)
634
  return has_letter and has_digit
635
 
636
- # ⭐ KEEP YOUR ORIGINAL EXTRACTION FUNCTION (Works for other invoices)
637
-
638
 
639
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
640
- """
641
- ⭐ UNIVERSAL LABEL-FIRST EXTRACTION with Smart Prioritization
642
-
643
- Strategy:
644
- 1. Find invoice-related labels (Invoice No, Bill No, etc.)
645
- 2. Extract ALL potential candidates after the label
646
- 3. TWO-PASS: Prioritize pure numeric 12-14 digit numbers (common for invoices)
647
- 4. Filter out noise patterns (ACK, PH, A/C, state codes, etc.)
648
- 5. Return the first valid candidate
649
-
650
- Works for ANY invoice format!
651
- """
652
  if not text:
653
  return None
654
 
655
  text_norm = normalize_text_for_search(text)
656
 
657
- # ⭐ DEBUG: Print first 800 chars
658
  if len(text_norm) > 0:
659
  print(f"\n{'='*70}")
660
  print(f"πŸ” ANALYZING TEXT (first 800 chars):")
661
  print(f"{text_norm[:800]}")
662
  print(f"{'='*70}\n")
663
 
664
- # ============================================================================
665
- # ⭐ PRIORITY 1: LABELED VALUE EXTRACTION (UNIVERSAL APPROACH)
666
- # ============================================================================
667
-
668
- # Define label patterns in PRIORITY ORDER
669
  label_patterns = [
670
- # Invoice labels (highest priority)
671
  (r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
672
  (r"Inv\s*(?:No\.?|Number)", "Inv No", True),
673
  (r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
674
  (r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
675
  (r"Document\s*(?:No\.?|Number)", "Document No", True),
676
-
677
- # Other labels (lower priority)
678
  (r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
679
  (r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
680
  (r"Reference\s*(?:No\.?|Number)", "Reference No", False),
@@ -685,14 +816,11 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
685
 
686
  for label_pattern, label_name, is_invoice_label in label_patterns:
687
  header_text = text_norm[:2000]
688
-
689
- # Find ALL matches of this label
690
  label_matches = list(re.finditer(
691
  label_pattern, header_text, re.IGNORECASE))
692
 
693
  for label_match in label_matches:
694
  start_pos = label_match.end()
695
- # Get a larger chunk of text after the label (200 chars)
696
  text_after_label = header_text[start_pos:start_pos + 200]
697
 
698
  print(
@@ -700,22 +828,12 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
700
  print(
701
  f" Text after label (first 80 chars): '{text_after_label[:80]}...'")
702
 
703
- # ⭐ UNIVERSAL APPROACH: Extract ALL potential candidates (alphanumeric tokens)
704
  all_candidates = re.findall(
705
- r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b',
706
- text_after_label,
707
- re.IGNORECASE
708
- )
709
 
710
  print(
711
  f" Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
712
 
713
- # ============================================================================
714
- # ⭐ TWO-PASS SMART PRIORITIZATION
715
- # Pass 1: Pure numeric 12-14 digit numbers (very common for invoices)
716
- # Pass 2: Alphanumeric candidates (only if Pass 1 fails)
717
- # ============================================================================
718
-
719
  for pass_number in [1, 2]:
720
  if pass_number == 2 and len(all_candidates) > 0:
721
  print(f" πŸ”„ Second pass: Trying alphanumeric candidates...")
@@ -723,31 +841,22 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
723
  for candidate in all_candidates:
724
  invoice_num = candidate.strip(".,;:-_")
725
 
726
- # Skip if too short
727
  if len(invoice_num) < 3:
728
  continue
729
 
730
- # ⭐ SMART FILTERING: First pass only accepts pure numeric 12-14 digits
731
  is_pure_numeric = invoice_num.isdigit()
732
  is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
733
 
734
  if pass_number == 1:
735
- # First pass: Only consider pure numeric 12-14 digits
736
  if not (is_pure_numeric and is_ideal_invoice_length):
737
  continue
738
  print(
739
  f" ✨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
740
  else:
741
- # Second pass: Skip ones already checked in first pass
742
  if is_pure_numeric and is_ideal_invoice_length:
743
  continue
744
  print(f" πŸ” Evaluating candidate: '{invoice_num}'")
745
 
746
- # ====================================================================
747
- # ⭐ COMPREHENSIVE BLACKLIST FILTER
748
- # ====================================================================
749
-
750
- # Skip noise words
751
  if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
752
  "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
753
  "DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
@@ -755,204 +864,135 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
755
  print(f" ⚠️ Skipped: noise word")
756
  continue
757
 
758
- # Context-aware batch pattern filter
759
  if not is_invoice_label:
760
  if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
761
  print(
762
  f" ⚠️ Skipped: batch pattern (non-invoice context)")
763
  continue
764
 
765
- # Skip license patterns (XX-XXX-123456)
766
  if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
767
  print(f" ⚠️ Skipped: license pattern")
768
  continue
769
 
770
- # ⭐ NEW: Skip state code / UIN patterns (MHMY-4501110485 format)
771
  if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
772
- print(
773
- f" ⚠️ Skipped: state code/UIN pattern (XXXX-nnnnnnnnnn)")
774
  continue
775
 
776
- # Skip ACK numbers
777
- if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
778
- text_norm, re.IGNORECASE):
779
  print(f" ⚠️ Skipped: ACK number")
780
  continue
781
 
782
- # Skip PH (Phone) numbers
783
- if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}",
784
- text_norm, re.IGNORECASE):
785
  print(f" ⚠️ Skipped: PH number")
786
  continue
787
 
788
- # ⭐ Skip UIN/UID/State Code
789
- if re.search(rf"(?:UIN|UID|State\s*Code|D\.L\.No)\.?\s*:?\s*{re.escape(invoice_num)}",
790
- text_norm, re.IGNORECASE):
791
- print(f" ⚠️ Skipped: UIN/UID/State Code/D.L.No")
792
  continue
793
 
794
- # Skip A/C (Account) numbers
795
- if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}",
796
- text_norm, re.IGNORECASE):
797
  print(f" ⚠️ Skipped: A/C number")
798
  continue
799
 
800
- # Skip phone numbers (10-11 digits)
801
  if re.match(r'^[0-9]{10,11}$', invoice_num):
802
- # 10 digits starting with 6-9 (mobile)
803
  if len(invoice_num) == 10 and invoice_num[0] in '6789':
804
  print(f" ⚠️ Skipped: mobile number")
805
  continue
806
- # 11 digits starting with 0 (landline with STD code)
807
  if len(invoice_num) == 11 and invoice_num[0] == '0':
808
  print(f" ⚠️ Skipped: landline number")
809
  continue
810
 
811
- # Skip dates (8 digits starting with 20)
812
  if re.match(r'^20\d{6}$', invoice_num):
813
- print(f" ⚠️ Skipped: date pattern (20xxxxxx)")
814
  continue
815
 
816
- # Skip date formats (dd/mm/yyyy or dd-mm-yyyy)
817
  if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
818
- print(f" ⚠️ Skipped: date format (dd/mm/yyyy)")
819
  continue
820
 
821
- # Skip GST numbers (15 alphanumeric)
822
- if len(invoice_num) == 15 and re.match(r'^\d{2}[A-Z]{5}\d{4}[A-Z]\d[A-Z]\d$',
823
- invoice_num, re.IGNORECASE):
824
- print(f" ⚠️ Skipped: GST number (15 chars)")
825
  continue
826
 
827
- # βœ… VALID INVOICE NUMBER FOUND!
828
  print(f" βœ…βœ…βœ… ACCEPTED: '{invoice_num}'")
829
  return invoice_num.upper()
830
 
831
  print(f" ⚠️ No valid candidates found after '{label_name}'")
832
 
833
- # ============================================================================
834
- # ⭐ PRIORITY 2: FALLBACK - Unlabeled extraction
835
- # ============================================================================
836
-
837
  print("\n⚠️ No labeled invoice number found, trying fallback extraction...")
838
 
839
  top_text = text_norm[:1000]
840
 
841
- # Try CREDIT numbers (12-20 digits, excluding 14-digit account numbers)
842
  credit_match = re.search(
843
- r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
844
- text_norm,
845
- re.IGNORECASE
846
- )
847
  if credit_match:
848
  credit_num = credit_match.group(1).strip()
849
- # Allow 12-14 digits, exclude exactly 14 if it might be account number
850
  if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
851
  print(f"βœ“ Fallback: Found CREDIT number: {credit_num}")
852
  return credit_num.upper()
853
 
854
- # Try long numerics (12-20 digits), excluding problematic patterns
855
  long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
856
  for num in long_numerics:
857
- # Skip if labeled as ACK, PH, A/C, UIN, etc.
858
- if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
859
- text_norm, re.IGNORECASE):
860
  print(f"⚠️ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
861
  continue
862
-
863
  print(f"βœ“ Fallback: Found long numeric: {num}")
864
  return num.upper()
865
 
866
- # Try medium numerics (10-15 digits, excluding phones and dates)
867
  medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
868
  for num in medium_numerics:
869
- # Skip phone numbers
870
  if len(num) == 10 and num[0] in '6789':
871
  continue
872
  if len(num) == 11 and num[0] == '0':
873
  continue
874
-
875
- # Skip dates
876
  if len(num) == 8 and num.startswith('20'):
877
  continue
878
-
879
- # Skip if labeled as problematic
880
- if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}",
881
- text_norm, re.IGNORECASE):
882
  continue
883
-
884
  print(f"βœ“ Fallback: Found medium numeric: {num}")
885
  return num.upper()
886
 
887
  print("βœ— No invoice number found (labeled or unlabeled)")
888
  return None
889
 
890
- # ⭐ ENHANCED FUNCTION: Add Zydus Healthcare fallback (works with table layouts)
891
-
892
 
893
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
894
- """
895
- Extract invoice number from TEXT-BASED PDF.
896
-
897
- ⭐ ZYDUS HEALTHCARE PRIORITY EXTRACTION:
898
- Zydus Healthcare invoices have a specific pattern: 10-digit numbers starting with '23'
899
- (e.g., 2310763135, 2310763275). These must be extracted BEFORE the original logic
900
- runs, because the original logic will pick up 14-digit Order IDs instead.
901
- """
902
  text = page.get_text("text") or ""
903
  text_norm = normalize_text_for_search(text)
904
 
905
- # ⭐ STEP 1: ALWAYS check for Zydus pattern FIRST (before any other extraction)
906
- # Look for 10-digit number starting with '23' in first 2500 chars
907
  header_text = text_norm[:2500]
908
-
909
- # Find ALL occurrences of 23xxxxxxxx pattern
910
  zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
911
 
912
  if zydus_candidates:
913
- # ⭐ CRITICAL: If we found any 23xxxxxxxx pattern, this is a Zydus invoice
914
- # Return the FIRST occurrence (most likely to be the invoice number)
915
  zydus_number = zydus_candidates[0]
916
  print(f" βœ… ZYDUS INVOICE DETECTED: {zydus_number}")
917
  return zydus_number.upper()
918
 
919
- # ⭐ STEP 2: If NO Zydus pattern found, use original extraction logic
920
  inv = try_extract_invoice_from_text(text)
921
 
922
- # ⭐ NEW: BLACKLIST FILTER for Zydus Healthcare invoices
923
- # Reject 14-digit Order IDs (pattern: 107xxxxxxxxxx or 10xxxxxxxxxx with 14 digits)
924
  if inv:
925
- # Check if this is a 14-digit number starting with '10' or '107'
926
  if re.match(r'^10\d{12}$', inv):
927
- print(
928
- f" ⚠️ REJECTED Order ID (14-digit): {inv} - Looking for Zydus pattern instead...")
929
- # This is likely a Zydus invoice page without the invoice number visible
930
- # Skip this extraction and try other methods
931
  inv = None
932
  else:
933
- # Valid invoice number from original extraction
934
  return inv
935
 
936
- # ⭐ STEP 3: Try block-level extraction (original logic)
937
  for block in (page.get_text("blocks") or []):
938
  block_text = block[4] if len(block) > 4 else ""
939
  if block_text:
940
  inv = try_extract_invoice_from_text(block_text)
941
  if inv:
942
- # Check blacklist again
943
  if re.match(r'^10\d{12}$', inv):
944
- print(
945
- f" ⚠️ REJECTED Order ID from block (14-digit): {inv}")
946
- continue # Skip this block, try next one
947
  else:
948
  return inv
949
 
950
- # ⭐ STEP 4: Final fallback - try Zydus pattern in text blocks
951
- # (For continuation pages where invoice number might be in a different block)
952
  blocks = page.get_text("blocks") or []
953
  sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
954
 
955
- for block in sorted_blocks[:15]: # Check first 15 blocks
956
  block_text = block[4] if len(block) > 4 else ""
957
  if block_text:
958
  block_norm = normalize_text_for_search(block_text)
@@ -962,14 +1002,12 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
962
  print(f" βœ… ZYDUS BLOCK DETECTION: {number}")
963
  return number.upper()
964
 
965
- # ⭐ STEP 5: Last resort - if still nothing found, return None
966
- # The forward-fill logic will assign this page to the previous invoice
967
  print(f" ⚠️ No valid invoice found on this page (will use forward-fill)")
968
  return None
969
 
970
 
971
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
972
- """Extract invoice number from a single page (used by smart sampling)."""
973
  text_result = extract_invoice_text_based(page)
974
  if text_result:
975
  return text_result
@@ -996,21 +1034,13 @@ def remove_file(path: str):
996
  except Exception as e:
997
  print(f"⚠️ Cleanup warning: {e}")
998
 
999
- # ============================================================================
1000
- # ⭐ NEW: MERGE FUNCTION FOR NULL FIRST GROUP
1001
- # ============================================================================
1002
-
1003
 
1004
  def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
1005
- """
1006
- If the first group has invoice_no = None and the second group has a valid invoice,
1007
- merge them together (page 1 is likely the cover page of the first invoice).
1008
- """
1009
  if len(groups) >= 2:
1010
  first_group = groups[0]
1011
  second_group = groups[1]
1012
 
1013
- # Check if first group is null and second group has invoice number
1014
  if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
1015
  print(f"\nπŸ”§ AUTO-FIX: Merging null first page(s) with first invoice")
1016
  print(
@@ -1018,11 +1048,8 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
1018
  print(
1019
  f" First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
1020
 
1021
- # Merge: Add first group's pages to second group
1022
  merged_pages = first_group["pages"] + second_group["pages"]
1023
  second_group["pages"] = merged_pages
1024
-
1025
- # Remove first null group
1026
  groups.pop(0)
1027
 
1028
  print(
@@ -1039,35 +1066,25 @@ def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
1039
  async def split_invoices(
1040
  background_tasks: BackgroundTasks,
1041
  file: UploadFile = File(...),
1042
- batch_id: str = Form(...,
1043
- description="Batch ID (required) - used for folder structure"),
1044
  use_blob_storage: bool = Form(
1045
  True, description="Upload PDFs to Azure Blob Storage"),
1046
  blob_container: Optional[str] = Form(
1047
- None, description="Custom Azure container (optional)"),
1048
  include_base64: bool = Form(
1049
  False, description="Include base64 in response"),
1050
  parallel_batch_size: int = Form(
1051
- MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls (1-10)"),
1052
  use_smart_sampling: bool = Form(
1053
- USE_SMART_SAMPLING, description="Use smart sampling (faster, ~95% accurate)"),
1054
  max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
1055
  ):
1056
- """
1057
- ⭐ UNIVERSAL INVOICE SPLITTER
1058
-
1059
- Works for ALL invoice types:
1060
- - Standard invoices (original extraction)
1061
- - Zydus Healthcare invoices (enhanced fallback for 23xxxxxxxx pattern)
1062
- - Auto-merges null first pages
1063
- """
1064
 
1065
  if not file.filename:
1066
  raise HTTPException(status_code=400, detail="No filename provided")
1067
 
1068
  filename_lower = file.filename.lower()
1069
-
1070
- # Supported formats
1071
  SUPPORTED_EXTENSIONS = ['.pdf', '.png',
1072
  '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
1073
 
@@ -1079,18 +1096,14 @@ async def split_invoices(
1079
 
1080
  if not file_extension:
1081
  raise HTTPException(
1082
- status_code=400,
1083
- detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
1084
- )
1085
 
1086
  is_image_file = file_extension in [
1087
  '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
1088
 
1089
  if is_image_file and not GEMINI_AVAILABLE:
1090
  raise HTTPException(
1091
- status_code=500,
1092
- detail="Image processing requires PIL. Install: pip install Pillow"
1093
- )
1094
 
1095
  if use_blob_storage and not get_blob_service_client():
1096
  raise HTTPException(
@@ -1198,7 +1211,6 @@ async def split_invoices(
1198
  print(
1199
  f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
1200
 
1201
- # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
1202
  page_invoice_nos_normalized = []
1203
  for v in page_invoice_nos:
1204
  if v and v.upper().startswith("GST"):
@@ -1209,7 +1221,6 @@ async def split_invoices(
1209
  else:
1210
  page_invoice_nos_normalized.append(None)
1211
 
1212
- # Step 2: Smart forward-fill for failed extractions
1213
  page_invoice_nos_filled = []
1214
  last_known_invoice = None
1215
 
@@ -1230,7 +1241,6 @@ async def split_invoices(
1230
  page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
1231
  print(f" β€’ {inv_no}: {page_count} pages")
1232
 
1233
- # Step 3: Group consecutive pages by invoice number
1234
  groups = []
1235
  current_group = []
1236
  current_invoice = None
@@ -1241,10 +1251,8 @@ async def split_invoices(
1241
  current_group = [idx]
1242
  else:
1243
  if inv != current_invoice:
1244
- groups.append({
1245
- "invoice_no": current_invoice,
1246
- "pages": current_group[:]
1247
- })
1248
  print(
1249
  f" πŸ“„ Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1250
  current_invoice = inv
@@ -1253,27 +1261,21 @@ async def split_invoices(
1253
  current_group.append(idx)
1254
 
1255
  if current_group:
1256
- groups.append({
1257
- "invoice_no": current_invoice,
1258
- "pages": current_group[:]
1259
- })
1260
  print(
1261
  f" πŸ“„ Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1262
 
1263
  if len(groups) == 1 and groups[0]["invoice_no"] is None:
1264
- groups = [{
1265
- "invoice_no": None,
1266
- "pages": list(range(doc.page_count))
1267
- }]
1268
 
1269
- # ⭐ NEW: Auto-merge first null group
1270
  groups = merge_first_null_group(groups)
1271
 
1272
  print(f"\nβœ… Created {len(groups)} invoice groups (after auto-merge)")
1273
  print(
1274
  f" Forward-filled {filled_count} pages with missing invoice numbers")
1275
 
1276
- # Build and upload split PDFs
1277
  print(f"\nπŸ”¨ Building and uploading split invoices...")
1278
  all_parts = []
1279
 
@@ -1355,7 +1357,8 @@ async def split_invoices(
1355
  "unique_invoice_numbers": len(unique_invoices),
1356
  "extraction_method": "gemini" if is_image_pdf else "text",
1357
  "pages_forward_filled": filled_count,
1358
- "storage_type": "azure_blob" if use_blob_storage else "base64"
 
1359
  },
1360
  "performance": {
1361
  "total_time_seconds": round(total_time, 2),
@@ -1377,6 +1380,7 @@ async def split_invoices(
1377
  f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
1378
  print(f" Split invoices: {len(all_parts)}")
1379
  print(f" Unique invoice numbers: {len(unique_invoices)}")
 
1380
  print(f" Total time: {total_time:.1f}s")
1381
  print(
1382
  f" Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
@@ -1406,7 +1410,7 @@ async def cleanup_batch(
1406
  background_tasks: BackgroundTasks,
1407
  container_name: Optional[str] = Form(None)
1408
  ):
1409
- """Delete all blobs for a specific batch (entire POD/{batch_id}/ folder)."""
1410
  if container_name is None:
1411
  container_name = AZURE_CONTAINER_NAME
1412
 
@@ -1421,38 +1425,79 @@ async def cleanup_batch(
1421
  })
1422
 
1423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1424
  @app.get("/")
1425
  async def root():
1426
  return {
1427
- "service": "Universal Invoice Splitter API",
1428
- "version": "3.2",
1429
  "status": "running",
1430
  "features": {
1431
  "multi_format_support": True,
1432
  "zydus_healthcare_support": True,
1433
  "auto_merge_null_groups": True,
1434
  "azure_blob_storage": True,
1435
- "parallel_processing": True
1436
- }
 
 
 
 
1437
  }
1438
 
1439
 
1440
  @app.get("/health")
1441
  async def health():
 
1442
  return {
1443
  "status": "healthy",
1444
  "timestamp": datetime.now().isoformat(),
1445
  "gemini_configured": bool(GEMINI_API_KEY),
1446
- "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
 
 
1447
  }
1448
 
1449
  if __name__ == "__main__":
1450
  import uvicorn
1451
- print("\n" + "="*70)
1452
- print("πŸš€ Starting Universal Invoice Splitter API")
1453
- print("="*70)
 
 
 
 
 
 
1454
  print(f"βœ… Supports ALL invoice types")
1455
  print(f"βœ… Zydus Healthcare fallback (23xxxxxxxx pattern)")
1456
  print(f"βœ… Auto-merge null first pages")
1457
- print("="*70 + "\n")
 
 
 
 
 
 
 
 
 
 
 
1458
  uvicorn.run(app, host=HOST, port=PORT, log_level="info")
 
7
  import uuid
8
  import asyncio
9
  from typing import List, Dict, Optional, Tuple
10
+ from collections import Counter, deque
11
  from concurrent.futures import ThreadPoolExecutor
12
+ from threading import Lock, Thread, Event
13
+ import time
14
+ import logging
15
 
16
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
17
  from fastapi.middleware.cors import CORSMiddleware
 
19
  from starlette.requests import Request
20
  import fitz # PyMuPDF
21
  import google.generativeai as genai
22
+ from google.api_core import exceptions as google_exceptions
23
  from PIL import Image
24
+ import requests
25
+ import base64
26
 
27
  # Azure Blob Storage
28
  try:
 
46
 
47
  from datetime import datetime, timedelta
48
 
49
+ # Configure logging
50
+ logging.basicConfig(level=logging.INFO)
51
+ logger = logging.getLogger(__name__)
52
+
53
  app = FastAPI(title="Invoice Splitter API with Azure Blob Storage - Optimized")
54
 
55
  # Increase request body size limit
 
64
  )
65
 
66
  # ============================================================================
67
+ # ⭐ CONFIGURATION
68
  # ============================================================================
69
 
70
  # Gemini API Key - REQUIRED for image-based PDFs
 
94
  HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
95
  PORT = int(os.environ.get("PORT", "7860")) # Hugging Face default port
96
 
97
+ MAX_WAIT_TIME = 300 # 5 minutes max wait for quota
98
+ model_lock = Lock()
99
+ quota_manager_lock = Lock()
100
+
101
  # ============================================================================
102
  # GLOBAL VARIABLES
103
  # ============================================================================
 
106
  blob_service_client = None
107
 
108
 
109
+ GEMINI_REST_URL = "https://generativelanguage.googleapis.com/v1/models/{model}:generateContent?key={key}"
110
+
111
+
112
def call_gemini_25(model_name: str, image_bytes: bytes, prompt: str) -> str:
    """Send one image + prompt to the Gemini REST endpoint and return the reply text.

    The request always goes to the model returned by
    ``get_current_model_config()``; ``model_name`` is accepted for interface
    compatibility only and does not select the model.

    On HTTP 429/503 the current model is marked as exhausted for the current
    minute and the call is retried on the next model that still has quota.
    When no model is available the function sleeps and retries, for at most
    ``MAX_WAIT_TIME`` seconds in total.

    Args:
        model_name: Unused; kept so existing callers keep working.
        image_bytes: PNG-encoded image to send as inline data.
        prompt: Text instruction sent alongside the image.

    Returns:
        The text of the first part of the first candidate in the response.

    Raises:
        TimeoutError: Every model stayed exhausted past ``MAX_WAIT_TIME``.
        Exception: Transport failure, a non-retryable HTTP status, or a
            200 response that carries no candidate text.
    """
    # The payload does not depend on the selected model, so build it once
    # instead of re-encoding the image on every retry.
    encoded = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "contents": [
            {
                "parts": [
                    {"inline_data": {"mime_type": "image/png", "data": encoded}},
                    {"text": prompt}
                ]
            }
        ],
        "generationConfig": {"temperature": 0}
    }

    deadline = time.monotonic() + MAX_WAIT_TIME

    while True:
        model_config = get_current_model_config()
        url = GEMINI_REST_URL.format(
            model=model_config["name"], key=GEMINI_API_KEY)

        try:
            r = requests.post(url, json=payload,
                              timeout=model_config["timeout"])
        except requests.RequestException as exc:
            # requests embeds the full URL (including the ?key=... query) in
            # its exception text; re-raise without it so the key is not
            # printed by callers that log the exception.
            raise Exception(
                f"Gemini request failed: {type(exc).__name__}") from None

        if r.status_code == 200:
            record_model_request(model_config)
            data = r.json()
            try:
                return data["candidates"][0]["content"]["parts"][0]["text"]
            except (KeyError, IndexError, TypeError) as exc:
                raise Exception(
                    f"Gemini returned no text for {model_config['name']}") from exc

        if r.status_code in (429, 503):
            print(
                f"⚠️ RPM exhausted for {model_config['name']} β†’ switching model")

            with quota_manager_lock:
                model_config["current_rpm"] = model_config["max_requests_per_minute"]
                # Start the minute window now; otherwise the next quota check
                # may reset the counter and hand back the same model.
                model_config["last_rpm_reset"] = datetime.now()

            next_model = get_next_available_model()
            if next_model:
                print(f"πŸ”„ Switched to {next_model['name']}")
                continue

            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"All Gemini models exhausted for more than {MAX_WAIT_TIME}s")

            print("⏳ All models exhausted. Waiting 60s...")
            time.sleep(60)
            continue

        raise Exception(f"Gemini error {r.status_code}: {r.text}")
163
+
164
+
165
def get_next_available_model():
    """Select the first model with remaining quota, scanning from the current one.

    The scan starts at ``current_model_index`` (so the current model itself is
    considered first) and wraps around ``GEMINI_MODELS``. When a usable model
    is found, ``current_model_index`` is updated to point at it.

    Returns:
        The chosen model config dict, or ``None`` if ``can_use_model`` rejects
        every entry.
    """
    global current_model_index

    offset = 0
    while offset < len(GEMINI_MODELS):
        candidate_index = (current_model_index + offset) % len(GEMINI_MODELS)
        candidate = GEMINI_MODELS[candidate_index]
        if can_use_model(candidate):
            current_model_index = candidate_index
            return candidate
        offset += 1

    return None
175
+
176
+
177
+ # Model configuration with quota tracking
178
def _quota_tracked_model(name, rpm, rpd, max_output_tokens, timeout,
                         description, skip_on_error):
    """Build one model entry: static limits plus zeroed quota-tracking state."""
    return {
        "name": name,
        "max_requests_per_minute": rpm,
        "max_requests_per_day": rpd,
        "max_output_tokens": max_output_tokens,
        "timeout": timeout,
        "description": description,
        # Mutable usage state, updated by the quota management functions.
        "current_rpm": 0,
        "current_rpd": 0,
        "last_rpm_reset": None,
        "last_rpd_reset": None,
        "quota_reset_time": None,
        "skip_on_error": skip_on_error
    }


# Models in the order they are tried; each entry carries its own counters.
GEMINI_MODELS = [
    _quota_tracked_model(
        "gemini-2.5-flash-lite", 120, 10000, 16384, 60,
        "Stage 1 - Pre-classification / validation / cheap parsing", True),
    _quota_tracked_model(
        "gemini-2.5-flash-image", 50, 1500, 65536, 300,
        "Stage 2 - Primary invoice OCR extraction", False),
    _quota_tracked_model(
        "gemini-2.5-pro", 10, 1000, 65536, 300,
        "Stage 3 - Complex invoice reasoning fallback", False),
]
222
+
223
+ current_model_index = 0
224
+
225
+
226
+
227
+ # ============================================================================
228
+ # ⭐ QUOTA MANAGEMENT FUNCTIONS
229
+ # ============================================================================
230
+
231
+
232
def reset_model_quota_counters(model_config):
    """Roll over the per-minute and per-day counters of one model entry.

    The per-minute counter is cleared once 60 seconds have passed since
    ``last_rpm_reset``; the per-day counter is cleared when the local calendar
    date has advanced past ``last_rpd_reset``. The first call for an entry
    only starts both windows: counts already recorded (or an exhaustion mark
    set by a caller) are kept rather than being zeroed.

    Args:
        model_config: One entry of ``GEMINI_MODELS``; mutated in place.
    """
    now = datetime.now()
    with quota_manager_lock:
        last_minute_reset = model_config["last_rpm_reset"]
        if last_minute_reset is None:
            # First observation: open the window without discarding counts.
            model_config["last_rpm_reset"] = now
        elif (now - last_minute_reset).total_seconds() >= 60:
            model_config["current_rpm"] = 0
            model_config["last_rpm_reset"] = now
            logger.debug(f"πŸ”„ Reset RPM for {model_config['name']}")

        last_day_reset = model_config["last_rpd_reset"]
        if last_day_reset is None:
            model_config["last_rpd_reset"] = now
        elif now.date() > last_day_reset.date():
            model_config["current_rpd"] = 0
            model_config["last_rpd_reset"] = now
            logger.info(f"πŸ”„ Reset daily quota for {model_config['name']}")
251
+
252
+
253
def can_use_model(model_config):
    """Return True when the model is below both its per-minute and per-day limits.

    Expired quota windows are rolled over first via
    ``reset_model_quota_counters``.
    """
    reset_model_quota_counters(model_config)
    with quota_manager_lock:
        minute_budget_left = (
            model_config["current_rpm"] < model_config["max_requests_per_minute"])
        day_budget_left = (
            model_config["current_rpd"] < model_config["max_requests_per_day"])
    return minute_budget_left and day_budget_left
260
+
261
+
262
def record_model_request(model_config):
    """Count one request against the model's per-minute and per-day usage."""
    with quota_manager_lock:
        for counter_key in ("current_rpm", "current_rpd"):
            model_config[counter_key] += 1
    logger.debug(
        f"πŸ“Š {model_config['name']}: RPM={model_config['current_rpm']}/{model_config['max_requests_per_minute']}")
269
+
270
+
271
def wait_for_quota_renewal(max_wait=MAX_WAIT_TIME):
    """Block until some model in ``GEMINI_MODELS`` has quota, or time runs out.

    Models are polled roughly every 10 seconds. Elapsed time is measured with
    a monotonic clock so system clock adjustments cannot shorten or extend the
    wait, and the final sleep is shortened so the call does not run past
    ``max_wait``.

    Args:
        max_wait: Maximum number of seconds to wait.

    Returns:
        ``(True, index)`` with the index of the first usable model, or
        ``(False, -1)`` on timeout.
    """
    start = time.monotonic()
    logger.info(
        f"⏳ All models quota exhausted. Waiting for renewal (max {max_wait}s)...")

    while time.monotonic() - start < max_wait:
        for i, model in enumerate(GEMINI_MODELS):
            if can_use_model(model):
                elapsed = time.monotonic() - start
                logger.info(
                    f"βœ… {model['name']} quota available after {elapsed:.1f}s")
                return True, i

        elapsed = time.monotonic() - start
        remaining = max_wait - elapsed
        logger.info(
            f"⏰ Waiting... (elapsed: {elapsed:.0f}s, remaining: {remaining:.0f}s)")
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(10, remaining)))

    logger.error(f"❌ Timeout: No quota available after {max_wait}s")
    return False, -1
293
+
294
+
295
def get_current_model_config():
    """Return the ``GEMINI_MODELS`` entry selected by ``current_model_index``."""
    active_index = current_model_index
    return GEMINI_MODELS[active_index]
298
+
299
  # ============================================================================
300
  # STARTUP VALIDATION
301
  # ============================================================================
 
306
  warnings = []
307
  errors = []
308
 
 
309
  if not GEMINI_API_KEY:
310
  warnings.append(
311
  "⚠️ GEMINI_API_KEY not set - image-based PDFs will not work")
312
  else:
313
  print(f"βœ… GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
314
 
 
315
  if not AZURE_STORAGE_CONNECTION_STRING:
316
  if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
317
+ errors.append("❌ Azure credentials missing")
 
318
  else:
319
  print(
320
  f"βœ… Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
321
  else:
322
  print(f"βœ… Azure connection string configured")
323
 
 
324
  for warning in warnings:
325
  print(warning)
 
 
326
  for error in errors:
327
  print(error)
328
 
329
  if errors:
330
  print("\n⚠️ WARNING: Some required credentials are missing!")
 
331
 
332
  return len(errors) == 0
333
 
 
353
  elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
354
  account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
355
  blob_service_client = BlobServiceClient(
356
+ account_url=account_url, credential=AZURE_STORAGE_ACCOUNT_KEY)
 
 
357
  print("βœ… Azure Blob Storage initialized with account key")
358
  else:
359
  print("⚠️ WARNING: No Azure credentials configured")
 
394
  raise HTTPException(
395
  status_code=500, detail="Azure Blob Storage not configured")
396
 
 
397
  base_filename = os.path.splitext(filename)[0]
398
  safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
 
399
  blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Raw/{filename}"
400
 
 
401
  blob_client = client.get_blob_client(
402
  container=container_name, blob=blob_name)
403
 
 
404
  print(f"πŸ“€ Uploading raw PDF to: {blob_name}")
405
  blob_client.upload_blob(
406
  pdf_bytes,
 
414
  }
415
  )
416
 
 
417
  expiry_hours = 24
418
  sas_token = generate_blob_sas(
419
  account_name=AZURE_STORAGE_ACCOUNT_NAME,
 
424
  expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
425
  )
426
 
 
427
  blob_url = blob_client.url
428
  download_url = f"{blob_url}?{sas_token}"
429
  expires_at = (datetime.utcnow() +
 
462
  raise HTTPException(
463
  status_code=500, detail="Azure Blob Storage not configured")
464
 
 
465
  base_filename = os.path.splitext(original_filename)[0]
466
  safe_folder_name = re.sub(r'[<>:"/\\|?*]', '_', base_filename)
 
467
  blob_name = f"{ROOT_FOLDER}/{batch_id}/{safe_folder_name}/Splitted/{invoice_filename}"
468
 
 
469
  blob_client = client.get_blob_client(
470
  container=container_name, blob=blob_name)
471
 
 
472
  blob_client.upload_blob(
473
  pdf_bytes,
474
  overwrite=True,
 
482
  }
483
  )
484
 
 
485
  expiry_hours = 24
486
  sas_token = generate_blob_sas(
487
  account_name=AZURE_STORAGE_ACCOUNT_NAME,
 
492
  expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
493
  )
494
 
 
495
  blob_url = blob_client.url
496
  download_url = f"{blob_url}?{sas_token}"
497
  expires_at = (datetime.utcnow() +
 
527
  return
528
 
529
  container_client = client.get_container_client(container_name)
 
530
  prefix = f"{ROOT_FOLDER}/{batch_id}/"
531
  blobs = container_client.list_blobs(name_starts_with=prefix)
532
 
 
547
 
548
 
549
def get_gemini_model():
    """Return a ``genai.GenerativeModel`` for a model that currently has quota.

    If the current model is out of quota, the first other model with quota is
    selected; if none has quota, the call blocks in ``wait_for_quota_renewal``
    for up to ``MAX_WAIT_TIME`` seconds. The SDK object is cached in the
    module-level ``gemini_model`` and rebuilt whenever the selected model name
    changes. Obtaining the model object is not counted as a request.

    Returns:
        The cached or newly created model object, or ``None`` when the SDK or
        API key is unavailable, no quota became available in time, or the
        model could not be initialised.
    """
    global gemini_model, current_model_index

    if not GEMINI_AVAILABLE:
        return None

    if not GEMINI_API_KEY:
        return None

    model_config = get_current_model_config()

    if not can_use_model(model_config):
        logger.warning(f"⚠️ {model_config['name']} quota exhausted")

        # First other model that still has quota, if any.
        fallback_index = next(
            (i for i, alt_model in enumerate(GEMINI_MODELS)
             if i != current_model_index and can_use_model(alt_model)),
            None)

        if fallback_index is not None:
            current_model_index = fallback_index
            model_config = GEMINI_MODELS[fallback_index]
            logger.info(f"πŸ”„ Switched to {model_config['name']}")
            gemini_model = None  # force recreation for the new model
        else:
            success, new_index = wait_for_quota_renewal(MAX_WAIT_TIME)
            if not success:
                logger.error("❌ All models quota exhausted")
                return None
            current_model_index = new_index
            model_config = GEMINI_MODELS[new_index]
            gemini_model = None

    with model_lock:
        # Remember which model the cached object was built for on this
        # function rather than on the SDK object. NOTE(review): the SDK
        # appears to keep its own `_model_name` on GenerativeModel, so
        # overwriting that attribute could corrupt the object - confirm.
        cached_for = getattr(get_gemini_model, "_cached_model_name", None)
        if gemini_model is None or cached_for != model_config['name']:
            try:
                genai.configure(api_key=GEMINI_API_KEY)
                gemini_model = genai.GenerativeModel(model_config['name'])
                get_gemini_model._cached_model_name = model_config['name']
                logger.info(f"βœ… Using {model_config['name']}")
            except Exception as e:
                logger.error(
                    f"❌ Failed to initialize {model_config['name']}: {e}")
                return None

    return gemini_model
601
+
602
+
603
def extract_invoice_gemini_sync(page):
    """Extract an invoice number from a rendered PDF page using Gemini.

    The page is rasterised to PNG and sent with a prompt asking for the
    invoice number only. The reply is sanitised to letters, digits, ``-`` and
    ``/`` *before* it is validated, so replies such as ``NONE.`` or
    punctuation-only text are treated as "not found" instead of being returned
    as an invoice number. When no usable number comes back, a second request
    asks for all visible text, which is passed to
    ``try_extract_invoice_from_text``.

    Args:
        page: A PyMuPDF page object.

    Returns:
        The upper-cased invoice number, the result of the text fallback, or
        ``None`` if any step raises.
    """
    model_config = get_current_model_config()

    try:
        pix = page.get_pixmap(matrix=fitz.Matrix(
            GEMINI_IMAGE_RESOLUTION, GEMINI_IMAGE_RESOLUTION))
        img_bytes = pix.tobytes("png")
        pix = None  # drop the pixmap reference as soon as the bytes are copied

        prompt = (
            "Look at this invoice image and extract ONLY the invoice number.\n"
            "Return ONLY the invoice number. If not found return NONE."
        )

        text = call_gemini_25(model_config["name"], img_bytes, prompt)
        print(f"πŸ€– Gemini raw response: {text}")

        cleaned = text.strip().replace("Invoice Number:", "").strip()
        # Sanitise first so the length / NONE checks see the final value.
        cleaned = re.sub(r'[^A-Za-z0-9\-/]', '', cleaned)

        if len(cleaned) >= 3 and cleaned.upper() != "NONE":
            print(f"βœ… Gemini extracted: {cleaned}")
            return cleaned.upper()

        # Fallback: ask for the full page text and parse it locally.
        ocr = call_gemini_25(
            model_config["name"], img_bytes, "Extract all visible text from this image")
        return try_extract_invoice_from_text(ocr)

    except Exception as e:
        print(f"❌ Gemini error: {e}")
        return None
633
 
634
 
 
638
  page_invoice_nos = []
639
 
640
  if not is_image_pdf:
 
641
  print(f" πŸ“ Text-based extraction (sequential)")
642
  for i in range(doc.page_count):
643
  if i % 50 == 0:
 
650
  gc.collect()
651
  return page_invoice_nos
652
 
 
653
  print(f" πŸš€ Image-based extraction (parallel, batch_size={batch_size})")
654
 
 
655
  with ThreadPoolExecutor(max_workers=batch_size) as executor:
656
  futures = []
657
 
 
658
  for i in range(doc.page_count):
659
  page = doc.load_page(i)
 
660
  text_result = extract_invoice_text_based(page)
661
  if text_result:
662
  futures.append((i, None, text_result))
663
  else:
 
664
  future = executor.submit(extract_invoice_gemini_sync, page)
665
  futures.append((i, future, None))
666
 
 
667
  page_invoice_nos = [None] * doc.page_count
668
  completed = 0
669
 
670
  for i, future, text_result in futures:
671
  try:
672
  if text_result:
 
673
  page_invoice_nos[i] = text_result
674
  completed += 1
675
  else:
 
676
  result = future.result(timeout=30)
677
  page_invoice_nos[i] = result
678
  completed += 1
 
698
 
699
  page_invoice_nos = [None] * doc.page_count
700
 
 
701
  page = doc.load_page(0)
702
  page_invoice_nos[0] = extract_invoice_no_from_page(page, is_image_pdf)
703
  print(f" βœ“ Page 1: {page_invoice_nos[0]}")
704
 
 
705
  sample_interval = max(3, doc.page_count // 20)
706
  print(f" Sampling interval: every {sample_interval} pages")
707
 
 
713
  if i % 10 == 0:
714
  print(f" Sampling page {i+1}/{doc.page_count}...")
715
 
 
716
  prev_known_idx = i - sample_interval
717
  while prev_known_idx >= 0 and page_invoice_nos[prev_known_idx] is None:
718
  prev_known_idx -= 1
 
726
  page_invoice_nos[idx] = extract_invoice_no_from_page(
727
  page, is_image_pdf)
728
 
 
729
  if page_invoice_nos[-1] is None:
730
  page = doc.load_page(doc.page_count - 1)
731
  page_invoice_nos[-1] = extract_invoice_no_from_page(page, is_image_pdf)
732
  print(f" βœ“ Last page: {page_invoice_nos[-1]}")
733
 
 
734
  last_known = page_invoice_nos[0]
735
  filled = 0
736
  for i in range(len(page_invoice_nos)):
 
744
  return page_invoice_nos
745
 
746
  # ============================================================================
747
+ # PDF PROCESSING FUNCTIONS
748
  # ============================================================================
749
 
750
 
 
786
  has_digit = any(c.isdigit() for c in candidate)
787
  return has_letter and has_digit
788
 
 
 
789
 
790
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
791
+ """Universal label-first extraction with smart prioritization"""
 
 
 
 
 
 
 
 
 
 
 
792
  if not text:
793
  return None
794
 
795
  text_norm = normalize_text_for_search(text)
796
 
 
797
  if len(text_norm) > 0:
798
  print(f"\n{'='*70}")
799
  print(f"πŸ” ANALYZING TEXT (first 800 chars):")
800
  print(f"{text_norm[:800]}")
801
  print(f"{'='*70}\n")
802
 
 
 
 
 
 
803
  label_patterns = [
 
804
  (r"Invoice\s*(?:No\.?|Number|Num)", "Invoice No", True),
805
  (r"Inv\s*(?:No\.?|Number)", "Inv No", True),
806
  (r"Bill\s*(?:No\.?|Number|Num)", "Bill No", True),
807
  (r"Tax\s*Invoice\s*(?:No\.?|Number)", "Tax Invoice No", True),
808
  (r"Document\s*(?:No\.?|Number)", "Document No", True),
 
 
809
  (r"Receipt\s*(?:No\.?|Number)", "Receipt No", False),
810
  (r"Voucher\s*(?:No\.?|Number)", "Voucher No", False),
811
  (r"Reference\s*(?:No\.?|Number)", "Reference No", False),
 
816
 
817
  for label_pattern, label_name, is_invoice_label in label_patterns:
818
  header_text = text_norm[:2000]
 
 
819
  label_matches = list(re.finditer(
820
  label_pattern, header_text, re.IGNORECASE))
821
 
822
  for label_match in label_matches:
823
  start_pos = label_match.end()
 
824
  text_after_label = header_text[start_pos:start_pos + 200]
825
 
826
  print(
 
828
  print(
829
  f" Text after label (first 80 chars): '{text_after_label[:80]}...'")
830
 
 
831
  all_candidates = re.findall(
832
+ r'\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b', text_after_label, re.IGNORECASE)
 
 
 
833
 
834
  print(
835
  f" Found {len(all_candidates)} potential candidates: {all_candidates[:5]}")
836
 
 
 
 
 
 
 
837
  for pass_number in [1, 2]:
838
  if pass_number == 2 and len(all_candidates) > 0:
839
  print(f" πŸ”„ Second pass: Trying alphanumeric candidates...")
 
841
  for candidate in all_candidates:
842
  invoice_num = candidate.strip(".,;:-_")
843
 
 
844
  if len(invoice_num) < 3:
845
  continue
846
 
 
847
  is_pure_numeric = invoice_num.isdigit()
848
  is_ideal_invoice_length = 12 <= len(invoice_num) <= 14
849
 
850
  if pass_number == 1:
 
851
  if not (is_pure_numeric and is_ideal_invoice_length):
852
  continue
853
  print(
854
  f" ✨ PRIORITY candidate (12-14 digit numeric): '{invoice_num}'")
855
  else:
 
856
  if is_pure_numeric and is_ideal_invoice_length:
857
  continue
858
  print(f" πŸ” Evaluating candidate: '{invoice_num}'")
859
 
 
 
 
 
 
860
  if invoice_num.upper() in ("ORDER", "REF", "NO", "NUMBER", "DATE", "DT", "AND",
861
  "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF",
862
  "DOCUMENT", "DOC", "GST", "GSTIN", "ACK", "USER",
 
864
  print(f" ⚠️ Skipped: noise word")
865
  continue
866
 
 
867
  if not is_invoice_label:
868
  if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
869
  print(
870
  f" ⚠️ Skipped: batch pattern (non-invoice context)")
871
  continue
872
 
 
873
  if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
874
  print(f" ⚠️ Skipped: license pattern")
875
  continue
876
 
 
877
  if re.match(r'^[A-Z]{2,4}-\d{10}$', invoice_num, re.IGNORECASE):
878
+ print(f" ⚠️ Skipped: state code/UIN pattern")
 
879
  continue
880
 
881
+ if re.search(rf"Ack\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
 
 
882
  print(f" ⚠️ Skipped: ACK number")
883
  continue
884
 
885
+ if re.search(rf"PH\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
 
 
886
  print(f" ⚠️ Skipped: PH number")
887
  continue
888
 
889
+ if re.search(rf"(?:UIN|UID|State\s*Code|D\.L\.No)\.?\s*:?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
890
+ print(f" ⚠️ Skipped: UIN/UID/State Code")
 
 
891
  continue
892
 
893
+ if re.search(rf"A[\s\/]*C\s*(?:No\.?|Number)?\s*[\-:\.]?\s*{re.escape(invoice_num)}", text_norm, re.IGNORECASE):
 
 
894
  print(f" ⚠️ Skipped: A/C number")
895
  continue
896
 
 
897
  if re.match(r'^[0-9]{10,11}$', invoice_num):
 
898
  if len(invoice_num) == 10 and invoice_num[0] in '6789':
899
  print(f" ⚠️ Skipped: mobile number")
900
  continue
 
901
  if len(invoice_num) == 11 and invoice_num[0] == '0':
902
  print(f" ⚠️ Skipped: landline number")
903
  continue
904
 
 
905
  if re.match(r'^20\d{6}$', invoice_num):
906
+ print(f" ⚠️ Skipped: date pattern")
907
  continue
908
 
 
909
  if re.match(r'^\d{2}[\/\-]\d{2}[\/\-]\d{4}$', invoice_num):
910
+ print(f" ⚠️ Skipped: date format")
911
  continue
912
 
913
+ if len(invoice_num) == 15 and re.match(r'^\d{2}[A-Z]{5}\d{4}[A-Z]\d[A-Z]\d$', invoice_num, re.IGNORECASE):
914
+ print(f" ⚠️ Skipped: GST number")
 
 
915
  continue
916
 
 
917
  print(f" βœ…βœ…βœ… ACCEPTED: '{invoice_num}'")
918
  return invoice_num.upper()
919
 
920
  print(f" ⚠️ No valid candidates found after '{label_name}'")
921
 
 
 
 
 
922
  print("\n⚠️ No labeled invoice number found, trying fallback extraction...")
923
 
924
  top_text = text_norm[:1000]
925
 
 
926
  credit_match = re.search(
927
+ r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})", text_norm, re.IGNORECASE)
 
 
 
928
  if credit_match:
929
  credit_num = credit_match.group(1).strip()
 
930
  if 12 <= len(credit_num) <= 20 and len(credit_num) != 14:
931
  print(f"βœ“ Fallback: Found CREDIT number: {credit_num}")
932
  return credit_num.upper()
933
 
 
934
  long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
935
  for num in long_numerics:
936
+ if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID|State\s*Code|D\.L\.No)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
 
 
937
  print(f"⚠️ Fallback: Skipping (labeled as ACK/PH/A/C/UIN): {num}")
938
  continue
 
939
  print(f"βœ“ Fallback: Found long numeric: {num}")
940
  return num.upper()
941
 
 
942
  medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
943
  for num in medium_numerics:
 
944
  if len(num) == 10 and num[0] in '6789':
945
  continue
946
  if len(num) == 11 and num[0] == '0':
947
  continue
 
 
948
  if len(num) == 8 and num.startswith('20'):
949
  continue
950
+ if re.search(rf"(?:Ack|PH|A[\s\/]*C|UIN|UID)\.?\s*(?:No\.?|Number)?\s*:?\s*{re.escape(num)}", text_norm, re.IGNORECASE):
 
 
 
951
  continue
 
952
  print(f"βœ“ Fallback: Found medium numeric: {num}")
953
  return num.upper()
954
 
955
  print("βœ— No invoice number found (labeled or unlabeled)")
956
  return None
957
 
 
 
958
 
959
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
960
+ """Extract invoice number from TEXT-BASED PDF with Zydus fallback"""
 
 
 
 
 
 
 
961
  text = page.get_text("text") or ""
962
  text_norm = normalize_text_for_search(text)
963
 
 
 
964
  header_text = text_norm[:2500]
 
 
965
  zydus_candidates = re.findall(r'\b(23\d{8})\b', header_text)
966
 
967
  if zydus_candidates:
 
 
968
  zydus_number = zydus_candidates[0]
969
  print(f" βœ… ZYDUS INVOICE DETECTED: {zydus_number}")
970
  return zydus_number.upper()
971
 
 
972
  inv = try_extract_invoice_from_text(text)
973
 
 
 
974
  if inv:
 
975
  if re.match(r'^10\d{12}$', inv):
976
+ print(f" ⚠️ REJECTED Order ID (14-digit): {inv}")
 
 
 
977
  inv = None
978
  else:
 
979
  return inv
980
 
 
981
  for block in (page.get_text("blocks") or []):
982
  block_text = block[4] if len(block) > 4 else ""
983
  if block_text:
984
  inv = try_extract_invoice_from_text(block_text)
985
  if inv:
 
986
  if re.match(r'^10\d{12}$', inv):
987
+ print(f" ⚠️ REJECTED Order ID from block: {inv}")
988
+ continue
 
989
  else:
990
  return inv
991
 
 
 
992
  blocks = page.get_text("blocks") or []
993
  sorted_blocks = sorted(blocks, key=lambda b: b[1] if len(b) > 1 else 0)
994
 
995
+ for block in sorted_blocks[:15]:
996
  block_text = block[4] if len(block) > 4 else ""
997
  if block_text:
998
  block_norm = normalize_text_for_search(block_text)
 
1002
  print(f" βœ… ZYDUS BLOCK DETECTION: {number}")
1003
  return number.upper()
1004
 
 
 
1005
  print(f" ⚠️ No valid invoice found on this page (will use forward-fill)")
1006
  return None
1007
 
1008
 
1009
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
1010
+ """Extract invoice number from a single page"""
1011
  text_result = extract_invoice_text_based(page)
1012
  if text_result:
1013
  return text_result
 
1034
  except Exception as e:
1035
  print(f"⚠️ Cleanup warning: {e}")
1036
 
 
 
 
 
1037
 
1038
  def merge_first_null_group(groups: List[Dict]) -> List[Dict]:
1039
+ """Merge null first page with first invoice"""
 
 
 
1040
  if len(groups) >= 2:
1041
  first_group = groups[0]
1042
  second_group = groups[1]
1043
 
 
1044
  if first_group["invoice_no"] is None and second_group["invoice_no"] is not None:
1045
  print(f"\nπŸ”§ AUTO-FIX: Merging null first page(s) with first invoice")
1046
  print(
 
1048
  print(
1049
  f" First invoice: {second_group['invoice_no']}, Pages {[p+1 for p in second_group['pages']]}")
1050
 
 
1051
  merged_pages = first_group["pages"] + second_group["pages"]
1052
  second_group["pages"] = merged_pages
 
 
1053
  groups.pop(0)
1054
 
1055
  print(
 
1066
  async def split_invoices(
1067
  background_tasks: BackgroundTasks,
1068
  file: UploadFile = File(...),
1069
+ batch_id: str = Form(..., description="Batch ID (required)"),
 
1070
  use_blob_storage: bool = Form(
1071
  True, description="Upload PDFs to Azure Blob Storage"),
1072
  blob_container: Optional[str] = Form(
1073
+ None, description="Custom Azure container"),
1074
  include_base64: bool = Form(
1075
  False, description="Include base64 in response"),
1076
  parallel_batch_size: int = Form(
1077
+ MAX_PARALLEL_GEMINI_CALLS, description="Parallel Gemini API calls"),
1078
  use_smart_sampling: bool = Form(
1079
+ USE_SMART_SAMPLING, description="Use smart sampling"),
1080
  max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
1081
  ):
1082
+ """Universal Invoice Splitter with RPM Management"""
 
 
 
 
 
 
 
1083
 
1084
  if not file.filename:
1085
  raise HTTPException(status_code=400, detail="No filename provided")
1086
 
1087
  filename_lower = file.filename.lower()
 
 
1088
  SUPPORTED_EXTENSIONS = ['.pdf', '.png',
1089
  '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
1090
 
 
1096
 
1097
  if not file_extension:
1098
  raise HTTPException(
1099
+ status_code=400, detail=f"Unsupported file format. Supported: {', '.join(SUPPORTED_EXTENSIONS)}")
 
 
1100
 
1101
  is_image_file = file_extension in [
1102
  '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
1103
 
1104
  if is_image_file and not GEMINI_AVAILABLE:
1105
  raise HTTPException(
1106
+ status_code=500, detail="Image processing requires PIL")
 
 
1107
 
1108
  if use_blob_storage and not get_blob_service_client():
1109
  raise HTTPException(
 
1211
  print(
1212
  f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
1213
 
 
1214
  page_invoice_nos_normalized = []
1215
  for v in page_invoice_nos:
1216
  if v and v.upper().startswith("GST"):
 
1221
  else:
1222
  page_invoice_nos_normalized.append(None)
1223
 
 
1224
  page_invoice_nos_filled = []
1225
  last_known_invoice = None
1226
 
 
1241
  page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
1242
  print(f" β€’ {inv_no}: {page_count} pages")
1243
 
 
1244
  groups = []
1245
  current_group = []
1246
  current_invoice = None
 
1251
  current_group = [idx]
1252
  else:
1253
  if inv != current_invoice:
1254
+ groups.append({"invoice_no": current_invoice,
1255
+ "pages": current_group[:]})
 
 
1256
  print(
1257
  f" πŸ“„ Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1258
  current_invoice = inv
 
1261
  current_group.append(idx)
1262
 
1263
  if current_group:
1264
+ groups.append({"invoice_no": current_invoice,
1265
+ "pages": current_group[:]})
 
 
1266
  print(
1267
  f" πŸ“„ Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1268
 
1269
  if len(groups) == 1 and groups[0]["invoice_no"] is None:
1270
+ groups = [{"invoice_no": None,
1271
+ "pages": list(range(doc.page_count))}]
 
 
1272
 
 
1273
  groups = merge_first_null_group(groups)
1274
 
1275
  print(f"\nβœ… Created {len(groups)} invoice groups (after auto-merge)")
1276
  print(
1277
  f" Forward-filled {filled_count} pages with missing invoice numbers")
1278
 
 
1279
  print(f"\nπŸ”¨ Building and uploading split invoices...")
1280
  all_parts = []
1281
 
 
1357
  "unique_invoice_numbers": len(unique_invoices),
1358
  "extraction_method": "gemini" if is_image_pdf else "text",
1359
  "pages_forward_filled": filled_count,
1360
+ "storage_type": "azure_blob" if use_blob_storage else "base64",
1361
+ "model_used": get_current_model_config()['name']
1362
  },
1363
  "performance": {
1364
  "total_time_seconds": round(total_time, 2),
 
1380
  f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
1381
  print(f" Split invoices: {len(all_parts)}")
1382
  print(f" Unique invoice numbers: {len(unique_invoices)}")
1383
+ print(f" Model used: {get_current_model_config()['name']}")
1384
  print(f" Total time: {total_time:.1f}s")
1385
  print(
1386
  f" Extraction time: {extraction_time:.1f}s ({total_pages_count / extraction_time:.1f} pages/sec)")
 
1410
  background_tasks: BackgroundTasks,
1411
  container_name: Optional[str] = Form(None)
1412
  ):
1413
+ """Delete all blobs for a specific batch"""
1414
  if container_name is None:
1415
  container_name = AZURE_CONTAINER_NAME
1416
 
 
1425
  })
1426
 
1427
 
1428
@app.get("/quota-status")
def quota_status():
    """Report request-quota usage for every model in GEMINI_MODELS.

    For each model the counters are first passed through
    reset_model_quota_counters(), then a snapshot of that model is taken
    while holding quota_manager_lock.

    Returns:
        JSONResponse whose body has a "models" list (one entry per model
        with its name, whether it is the currently selected model, RPM and
        RPD usage against their limits, and the can_use_model() verdict)
        plus an ISO-formatted "timestamp".
    """
    report = []
    for position, cfg in enumerate(GEMINI_MODELS):
        # Counter refresh is performed before the lock is taken.
        reset_model_quota_counters(cfg)
        with quota_manager_lock:
            # NOTE(review): can_use_model() runs with the lock held; confirm
            # it does not acquire quota_manager_lock itself (that would
            # deadlock if the lock is not reentrant).
            entry = {"model": cfg["name"]}
            entry["is_current"] = position == current_model_index
            entry["rpm"] = {
                "used": cfg["current_rpm"],
                "limit": cfg["max_requests_per_minute"],
            }
            entry["rpd"] = {
                "used": cfg["current_rpd"],
                "limit": cfg["max_requests_per_day"],
            }
            entry["available"] = can_use_model(cfg)
            report.append(entry)

    payload = {"models": report, "timestamp": datetime.now().isoformat()}
    return JSONResponse(payload)
1443
+
1444
+
1445
  @app.get("/")
1446
  async def root():
1447
  return {
1448
+ "service": "Universal Invoice Splitter API with RPM Management",
1449
+ "version": "4.0",
1450
  "status": "running",
1451
  "features": {
1452
  "multi_format_support": True,
1453
  "zydus_healthcare_support": True,
1454
  "auto_merge_null_groups": True,
1455
  "azure_blob_storage": True,
1456
+ "parallel_processing": True,
1457
+ "rpm_management": True,
1458
+ "multi_model_fallback": True
1459
+ },
1460
+ "models": [m["name"] for m in GEMINI_MODELS],
1461
+ "current_model": get_current_model_config()['name']
1462
  }
1463
 
1464
 
1465
  @app.get("/health")
1466
  async def health():
1467
+ model_config = get_current_model_config()
1468
  return {
1469
  "status": "healthy",
1470
  "timestamp": datetime.now().isoformat(),
1471
  "gemini_configured": bool(GEMINI_API_KEY),
1472
+ "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY)),
1473
+ "current_model": model_config['name'],
1474
+ "quota_available": can_use_model(model_config)
1475
  }
1476
 
1477
  if __name__ == "__main__":
1478
  import uvicorn
1479
+
1480
+ # Initialize model quota tracking
1481
+ for model in GEMINI_MODELS:
1482
+ model["last_rpm_reset"] = datetime.now()
1483
+ model["last_rpd_reset"] = datetime.now()
1484
+
1485
+ print("\n" + "="*80)
1486
+ print("πŸš€ Starting Universal Invoice Splitter API with RPM Management")
1487
+ print("="*80)
1488
  print(f"βœ… Supports ALL invoice types")
1489
  print(f"βœ… Zydus Healthcare fallback (23xxxxxxxx pattern)")
1490
  print(f"βœ… Auto-merge null first pages")
1491
+ print(f"βœ… RPM/RPD quota management")
1492
+ print(f"βœ… Multi-model fallback")
1493
+ print("="*80)
1494
+ print(f"πŸ“‹ Model Chain:")
1495
+ for i, model in enumerate(GEMINI_MODELS):
1496
+ print(f" {i+1}. {model['name']}")
1497
+ print(
1498
+ f" RPM: {model['max_requests_per_minute']}, RPD: {model['max_requests_per_day']}")
1499
+ print("="*80)
1500
+ print(f"🌐 Server: http://127.0.0.1:8000")
1501
+ print("="*80 + "\n")
1502
+
1503
  uvicorn.run(app, host=HOST, port=PORT, log_level="info")