anujakkulkarni committed on
Commit
0da7d43
·
verified ·
1 Parent(s): 7e49357

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -211
app.py CHANGED
@@ -61,7 +61,8 @@ app.add_middleware(
61
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
62
 
63
  # Azure Blob Storage Configuration - REQUIRED for blob storage
64
- AZURE_STORAGE_CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
 
65
  AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
66
  AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
67
 
@@ -72,9 +73,12 @@ AZURE_CONTAINER_NAME = os.environ.get("AZURE_CONTAINER_NAME", "invoice-splits")
72
  ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD") # Root folder name
73
 
74
  # ⭐ PERFORMANCE CONFIGURATION
75
- MAX_PARALLEL_GEMINI_CALLS = int(os.environ.get("MAX_PARALLEL_GEMINI_CALLS", "5"))
76
- GEMINI_IMAGE_RESOLUTION = float(os.environ.get("GEMINI_IMAGE_RESOLUTION", "1.2"))
77
- USE_SMART_SAMPLING = os.environ.get("USE_SMART_SAMPLING", "false").lower() == "true"
 
 
 
78
 
79
  # ⭐ SERVER CONFIGURATION
80
  HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
@@ -91,38 +95,42 @@ blob_service_client = None
91
  # STARTUP VALIDATION
92
  # ============================================================================
93
 
 
94
  def validate_configuration():
95
  """Validate configuration and warn about missing credentials."""
96
  warnings = []
97
  errors = []
98
-
99
  # Check Gemini API Key
100
  if not GEMINI_API_KEY:
101
- warnings.append("⚠️ GEMINI_API_KEY not set - image-based PDFs will not work")
 
102
  else:
103
  print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
104
-
105
  # Check Azure credentials
106
  if not AZURE_STORAGE_CONNECTION_STRING:
107
  if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
108
- errors.append("❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")
 
109
  else:
110
- print(f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
 
111
  else:
112
  print(f"✅ Azure connection string configured")
113
-
114
  # Print all warnings
115
  for warning in warnings:
116
  print(warning)
117
-
118
  # Print all errors
119
  for error in errors:
120
  print(error)
121
-
122
  if errors:
123
  print("\n⚠️ WARNING: Some required credentials are missing!")
124
  print(" Set them in Hugging Face Spaces Settings > Repository secrets")
125
-
126
  return len(errors) == 0
127
 
128
 
@@ -410,16 +418,48 @@ def get_gemini_model():
410
  return gemini_model
411
 
412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
414
  """
415
  Optimized synchronous Gemini extraction for thread pool execution.
416
  - Reduced image resolution for faster processing
417
  - Simplified prompt for quicker responses
 
418
  """
419
  model = get_gemini_model()
420
  if not model:
421
  return None
422
 
 
423
  try:
424
  # Reduced resolution for faster processing
425
  pix = page.get_pixmap(matrix=fitz.Matrix(
@@ -428,26 +468,36 @@ def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
428
  pix = None
429
  img = Image.open(io.BytesIO(img_bytes))
430
 
431
- # Optimized prompt for faster response
432
  prompt = """Extract ONLY the invoice number from this image.
433
- Look for: Invoice No, Bill No, Tax Invoice No, or Document No.
434
- Return ONLY the number/code. If not found, return: NONE"""
435
 
436
  response = model.generate_content([prompt, img])
437
  if response and response.text:
438
  extracted_text = response.text.strip()
439
- if extracted_text and extracted_text not in ("NOT_FOUND", "NONE", "N/A", "NA"):
440
- invoice_no = extracted_text.replace(
441
- "*", "").replace("#", "").replace("Invoice No:", "").replace(":", "").strip()
442
- if invoice_no and len(invoice_no) > 2:
443
- img.close()
444
- return invoice_no
445
-
446
- img.close()
 
 
 
 
 
 
 
 
447
  return None
448
 
449
  except Exception as e:
450
  print(f"Gemini error: {e}")
 
 
451
  return None
452
 
453
 
@@ -634,49 +684,121 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
634
  if not text:
635
  return None
636
  text_norm = normalize_text_for_search(text)
 
 
 
637
 
638
- label_match = re.search(
639
- r"(?:Invoice\s*No\.?|Inv\.?\s*No\.?|Bill\s*No\.?|Doc\s*No\.?|Document\s*No\.?|Tax\s*Invoice\s*No\.?)[\s:\-]*(\d{6,15})",
 
640
  text_norm, re.IGNORECASE
641
  )
642
- if label_match:
643
- invoice_num = label_match.group(1).strip()
644
- if is_valid_invoice_number(invoice_num):
645
- return invoice_num.upper()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
 
647
  label_match = re.search(
648
- r"(?:Invoice|Inv|Bill|Doc|Document|Tax\s*Invoice)\s*(?:No|#|\.|:\s*)",
649
  text_norm, re.IGNORECASE
650
  )
651
  if label_match:
652
  start_idx = label_match.end()
653
- candidate_text = text_norm[start_idx:start_idx + 60]
654
- clean_candidates = re.sub(r"[:\-\(\)\[\]]", " ", candidate_text)
655
- words = clean_candidates.split()
656
- for word in words:
657
- word = word.strip(".,;")
658
- if word.lower() in ("order", "ref", "no", "date", "dt", "inv", "bill", "account"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  continue
660
- if len(word) > 2 and is_valid_invoice_number(word):
661
- return word.upper()
662
-
663
- top_text = text_norm[:800]
664
- digit_matches = re.findall(r'\b(\d{6,15})\b', top_text)
665
- for match in digit_matches:
666
- if is_valid_invoice_number(match):
667
- if not re.match(r'^(19|20)\d{6}$', match):
668
- if not re.match(r'^[6-9]\d{9}$', match):
669
- return match.upper()
670
-
671
- top_text = text_norm[:600]
672
- m = re.search(r"\b([A-Z0-9][A-Z0-9\-\/]{4,})\b", top_text)
673
- if m:
674
- inv = m.group(1).upper()
675
- if is_valid_invoice_number(inv):
676
- return inv
 
 
677
  return None
678
 
679
-
680
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
681
  text = page.get_text("text") or ""
682
  inv = try_extract_invoice_from_text(text)
@@ -879,15 +1001,16 @@ async def split_invoices(
879
  # ============================================================================
880
  # 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
881
  # ============================================================================
882
-
883
  print(f"\n🔧 Grouping invoices...")
884
-
885
  # DEBUG: Show raw extraction results
886
  print(f"\n🔍 DEBUG - Raw extraction results:")
887
  for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
888
  print(f" Page {idx+1}: {inv if inv else '(not found)'}")
889
  if len(page_invoice_nos) > 10:
890
- print(f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
 
891
 
892
  # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
893
  page_invoice_nos_normalized = []
@@ -918,10 +1041,11 @@ async def split_invoices(
918
 
919
  # Count how many pages were forward-filled
920
  filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
921
- if page_invoice_nos_normalized[i] is None and page_invoice_nos_filled[i] is not None)
922
 
923
  # Debug: Count unique invoice numbers
924
- unique_invoices = set([v for v in page_invoice_nos_filled if v is not None])
 
925
  print(f"\n 📊 Found {len(unique_invoices)} unique invoice numbers:")
926
  for inv_no in sorted(unique_invoices) if unique_invoices else []:
927
  page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
@@ -944,7 +1068,8 @@ async def split_invoices(
944
  "invoice_no": current_invoice,
945
  "pages": current_group[:]
946
  })
947
- print(f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
 
948
  current_invoice = inv
949
  current_group = [idx]
950
  else:
@@ -957,7 +1082,8 @@ async def split_invoices(
957
  "invoice_no": current_invoice,
958
  "pages": current_group[:]
959
  })
960
- print(f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
 
961
 
962
  # Handle edge case: entire PDF has no invoice numbers
963
  if len(groups) == 1 and groups[0]["invoice_no"] is None:
@@ -967,7 +1093,8 @@ async def split_invoices(
967
  }]
968
 
969
  print(f"\n✅ Created {len(groups)} invoice groups")
970
- print(f" Forward-filled {filled_count} pages with missing invoice numbers")
 
971
 
972
  # Build and upload split PDFs
973
  print(f"\n🔨 Building and uploading split invoices...")
@@ -1118,151 +1245,4 @@ async def cleanup_batch(
1118
  "batch_id": batch_id,
1119
  "folder_path": f"{ROOT_FOLDER}/{batch_id}/",
1120
  "container": container_name
1121
- })
1122
-
1123
-
1124
- @app.get("/health")
1125
- async def health_check():
1126
- """Health check endpoint."""
1127
- gemini_status = "configured" if get_gemini_model() else "not configured"
1128
-
1129
- blob_status = "not configured"
1130
- blob_details = None
1131
- try:
1132
- client = get_blob_service_client()
1133
- if client:
1134
- blob_status = "configured"
1135
- blob_details = {
1136
- "account_name": AZURE_STORAGE_ACCOUNT_NAME,
1137
- "container": AZURE_CONTAINER_NAME,
1138
- "root_folder": ROOT_FOLDER,
1139
- "available": True
1140
- }
1141
- except Exception as e:
1142
- blob_status = f"error: {str(e)}"
1143
-
1144
- return {
1145
- "status": "healthy",
1146
- "timestamp": datetime.now().isoformat(),
1147
- "services": {
1148
- "gemini": {
1149
- "status": gemini_status,
1150
- "available": GEMINI_AVAILABLE,
1151
- "model": "gemini-2.5-flash",
1152
- "api_key_set": bool(GEMINI_API_KEY)
1153
- },
1154
- "azure_blob_storage": {
1155
- "status": blob_status,
1156
- "available": AZURE_AVAILABLE,
1157
- "details": blob_details,
1158
- "credentials_set": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
1159
- }
1160
- },
1161
- "performance": {
1162
- "max_parallel_gemini_calls": MAX_PARALLEL_GEMINI_CALLS,
1163
- "gemini_image_resolution": GEMINI_IMAGE_RESOLUTION,
1164
- "smart_sampling_default": USE_SMART_SAMPLING
1165
- },
1166
- "environment": {
1167
- "host": HOST,
1168
- "port": PORT
1169
- }
1170
- }
1171
-
1172
-
1173
- @app.get("/")
1174
- async def root():
1175
- """Root endpoint."""
1176
- return {
1177
- "name": "Invoice Splitter API",
1178
- "version": "6.0.0 - Fixed Grouping Logic",
1179
- "description": "Split PDF invoices with Azure Blob Storage - Splits on invoice number change",
1180
- "features": {
1181
- "parallel_processing": f"Up to {MAX_PARALLEL_GEMINI_CALLS} concurrent Gemini API calls",
1182
- "smart_sampling": "Optional fast mode for large PDFs (~5-10x faster)",
1183
- "optimized_prompts": "Faster Gemini responses",
1184
- "reduced_resolution": f"Image processing at {GEMINI_IMAGE_RESOLUTION}x for speed",
1185
- "no_aggressive_filtering": "Keeps all extracted invoice numbers (fixed bug)"
1186
- },
1187
- "folder_structure": {
1188
- "format": "POD/{batch_id}/{filename}/Raw|Splitted/",
1189
- "raw_folder": "Contains original uploaded PDF",
1190
- "split_folder": "Contains individual split invoice PDFs"
1191
- },
1192
- "endpoints": {
1193
- "split_invoices": "/split-invoices",
1194
- "cleanup_batch": "/cleanup-batch/{batch_id}",
1195
- "health": "/health"
1196
- },
1197
- "configuration": {
1198
- "gemini_configured": bool(GEMINI_API_KEY),
1199
- "azure_configured": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY)),
1200
- "environment_ready": validate_configuration()
1201
- }
1202
- }
1203
-
1204
-
1205
- if __name__ == "__main__":
1206
- import uvicorn
1207
-
1208
- print("\n" + "="*70)
1209
- print("🚀 Invoice Splitter API - v6.0 FIXED (Hugging Face)")
1210
- print("="*70)
1211
-
1212
- # Validate configuration
1213
- config_valid = validate_configuration()
1214
-
1215
- print(f"\n⚡ Performance Features:")
1216
- print(
1217
- f" • Parallel Gemini API calls: {MAX_PARALLEL_GEMINI_CALLS} workers")
1218
- print(f" • Image resolution: {GEMINI_IMAGE_RESOLUTION}x (optimized)")
1219
- print(
1220
- f" • Smart sampling: {'Enabled' if USE_SMART_SAMPLING else 'Disabled'} (optional)")
1221
- print(f" • Expected speed: 5-10x faster for image PDFs")
1222
-
1223
- print(f"\n🔧 Bug Fixes:")
1224
- print(f" • ✅ Removed aggressive frequency filtering")
1225
- print(f" • ✅ Splits on every invoice number change")
1226
- print(f" • ✅ Keeps all extracted invoice numbers")
1227
- print(f" • ✅ Added detailed debug logging")
1228
-
1229
- print(f"\n📁 Folder Structure:")
1230
- print(f" {ROOT_FOLDER}/{{batch_id}}/{{filename}}/")
1231
- print(f" ├── Raw/ (original PDF)")
1232
- print(f" └── Splitted/ (split invoices)")
1233
- print(f"\n📦 Azure Configuration:")
1234
- print(f" Account: {AZURE_STORAGE_ACCOUNT_NAME or 'Not set'}")
1235
- print(f" Container: {AZURE_CONTAINER_NAME}")
1236
-
1237
- if get_blob_service_client():
1238
- print(f" ✅ Azure Blob Storage: Connected")
1239
- else:
1240
- print(f" ⚠️ Azure Blob Storage: Not configured")
1241
-
1242
- if get_gemini_model():
1243
- print(f" ✅ Gemini AI: Connected (gemini-2.5-flash)")
1244
- else:
1245
- print(f" ⚠️ Gemini AI: Not configured")
1246
-
1247
- print(f"\n🌐 Server Configuration:")
1248
- print(f" Host: {HOST}")
1249
- print(f" Port: {PORT}")
1250
-
1251
- if not config_valid:
1252
- print(f"\n⚠️ WARNING: Some credentials are missing!")
1253
- print(f" For Hugging Face deployment:")
1254
- print(f" 1. Go to your Space Settings > Repository secrets")
1255
- print(f" 2. Add the following secrets:")
1256
- print(f" - GEMINI_API_KEY")
1257
- print(f" - AZURE_STORAGE_CONNECTION_STRING (or)")
1258
- print(f" - AZURE_STORAGE_ACCOUNT_NAME + AZURE_STORAGE_ACCOUNT_KEY")
1259
-
1260
- print("\n" + "="*70 + "\n")
1261
-
1262
- uvicorn.run(
1263
- app,
1264
- host=HOST,
1265
- port=PORT,
1266
- workers=1,
1267
- timeout_keep_alive=600
1268
- )
 
61
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
62
 
63
  # Azure Blob Storage Configuration - REQUIRED for blob storage
64
+ AZURE_STORAGE_CONNECTION_STRING = os.environ.get(
65
+ "AZURE_STORAGE_CONNECTION_STRING", "")
66
  AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
67
  AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
68
 
 
73
  ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD") # Root folder name
74
 
75
  # ⭐ PERFORMANCE CONFIGURATION
76
+ MAX_PARALLEL_GEMINI_CALLS = int(
77
+ os.environ.get("MAX_PARALLEL_GEMINI_CALLS", "5"))
78
+ GEMINI_IMAGE_RESOLUTION = float(
79
+ os.environ.get("GEMINI_IMAGE_RESOLUTION", "1.2"))
80
+ USE_SMART_SAMPLING = os.environ.get(
81
+ "USE_SMART_SAMPLING", "false").lower() == "true"
82
 
83
  # ⭐ SERVER CONFIGURATION
84
  HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
 
95
  # STARTUP VALIDATION
96
  # ============================================================================
97
 
98
+
99
def validate_configuration():
    """Report which credentials are configured and which are missing.

    Prints a confirmation line for every credential that is present, then
    all collected warnings followed by all collected errors.

    Returns:
        True when no errors were collected, False otherwise. A missing
        GEMINI_API_KEY is only a warning; missing Azure credentials are
        an error.
    """
    notices = []
    problems = []

    # Gemini: absence is tolerated, so it is reported as a warning only.
    if GEMINI_API_KEY:
        print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
    else:
        notices.append(
            "⚠️ GEMINI_API_KEY not set - image-based PDFs will not work")

    # Azure: a connection string is sufficient on its own; otherwise both
    # the account name and the account key must be present.
    if AZURE_STORAGE_CONNECTION_STRING:
        print("✅ Azure connection string configured")
    elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
        print(
            f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
    else:
        problems.append(
            "❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")

    # Warnings are emitted before errors.
    for message in notices + problems:
        print(message)

    if problems:
        print("\n⚠️ WARNING: Some required credentials are missing!")
        print(" Set them in Hugging Face Spaces Settings > Repository secrets")

    return not problems
135
 
136
 
 
418
  return gemini_model
419
 
420
 
421
+ def _clean_gemini_invoice_text(text: str) -> Optional[str]:
422
+ if not text:
423
+ return None
424
+
425
+ cleaned = text.strip()
426
+ cleaned = cleaned.replace("*", "").replace("#", "")
427
+ cleaned = re.sub(
428
+ r"(?i)\b(invoice|inv|bill|document|doc|tax\s*invoice)\s*(no|number)?\b",
429
+ "",
430
+ cleaned
431
+ )
432
+ cleaned = re.sub(r"[:\-\(\)\[\]]", " ", cleaned)
433
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
434
+
435
+ # Extract candidates
436
+ tokens = re.findall(r"[A-Z0-9][A-Z0-9\-\/]{2,}", cleaned.upper())
437
+
438
+ # Prefer alphanumeric invoice IDs first
439
+ for token in tokens:
440
+ if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
441
+ return token
442
+
443
+ # Fallback to numeric-only (6-15 digits)
444
+ for token in tokens:
445
+ if token.isdigit() and 6 <= len(token) <= 15:
446
+ return token
447
+
448
+ return None
449
+
450
+
451
  def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
452
  """
453
  Optimized synchronous Gemini extraction for thread pool execution.
454
  - Reduced image resolution for faster processing
455
  - Simplified prompt for quicker responses
456
+ - OCR fallback for better accuracy
457
  """
458
  model = get_gemini_model()
459
  if not model:
460
  return None
461
 
462
+ img = None
463
  try:
464
  # Reduced resolution for faster processing
465
  pix = page.get_pixmap(matrix=fitz.Matrix(
 
468
  pix = None
469
  img = Image.open(io.BytesIO(img_bytes))
470
 
471
+ # Updated prompt to prioritize labeled alphanumeric invoice numbers
472
  prompt = """Extract ONLY the invoice number from this image.
473
+ Prefer the value next to labels like: Invoice No, Invoice Number, Bill No, Document No.
474
+ Return ONLY the identifier (keep letters, e.g., A07966). If not found, return: NONE."""
475
 
476
  response = model.generate_content([prompt, img])
477
  if response and response.text:
478
  extracted_text = response.text.strip()
479
+ candidate = _clean_gemini_invoice_text(extracted_text)
480
+ if candidate and len(candidate) > 2:
481
+ img.close()
482
+ return candidate
483
+
484
+ # OCR Fallback: Extract full text then run regex
485
+ ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
486
+ ocr_response = model.generate_content([ocr_prompt, img])
487
+ if ocr_response and ocr_response.text:
488
+ inv = try_extract_invoice_from_text(ocr_response.text)
489
+ if inv:
490
+ img.close()
491
+ return inv
492
+
493
+ if img:
494
+ img.close()
495
  return None
496
 
497
  except Exception as e:
498
  print(f"Gemini error: {e}")
499
+ if img:
500
+ img.close()
501
  return None
502
 
503
 
 
684
  if not text:
685
  return None
686
  text_norm = normalize_text_for_search(text)
687
+
688
+ # DEBUG: Print first 600 chars
689
+ print(f"\n🔍 DEBUG - Extracted text (first 600 chars):\n{text_norm[:600]}\n")
690
 
691
+ # PRIORITY 1: Look for CREDIT number (14 digits, common in pharma invoices)
692
+ credit_match = re.search(
693
+ r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
694
  text_norm, re.IGNORECASE
695
  )
696
+ if credit_match:
697
+ credit_num = credit_match.group(1).strip()
698
+ print(f"✓ Found CREDIT number: {credit_num}")
699
+ if 12 <= len(credit_num) <= 20:
700
+ return credit_num.upper()
701
+
702
+ # PRIORITY 2: Look for "Invoice No" or "Bill No" followed by long numeric (12-20 digits)
703
+ invoice_patterns = [
704
+ r"Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
705
+ r"Bill\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
706
+ r"Tax\s*Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
707
+ ]
708
+
709
+ for pattern in invoice_patterns:
710
+ match = re.search(pattern, text_norm, re.IGNORECASE)
711
+ if match:
712
+ num = match.group(1).strip()
713
+ print(f"✓ Found labeled long numeric invoice: {num}")
714
+ return num.upper()
715
+
716
+ # PRIORITY 3: Look for "Invoice No" with alphanumeric (but EXCLUDE batch patterns)
717
+ label_patterns = [
718
+ r"Invoice\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
719
+ r"Bill\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
720
+ ]
721
+
722
+ for pattern in label_patterns:
723
+ match = re.search(pattern, text_norm, re.IGNORECASE)
724
+ if match:
725
+ invoice_num = match.group(1).strip()
726
+
727
+ # EXCLUDE batch number patterns (single letter + 6 digits: F500256, I500734, etc.)
728
+ if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
729
+ print(f"⚠️ Skipping (batch pattern): {invoice_num}")
730
+ continue
731
+
732
+ # EXCLUDE license patterns (KA-MY2-157424)
733
+ if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
734
+ print(f"⚠️ Skipping (license pattern): {invoice_num}")
735
+ continue
736
+
737
+ print(f"✓ Found labeled alphanumeric: {invoice_num}")
738
+ if any(c.isalpha() for c in invoice_num) and any(c.isdigit() for c in invoice_num):
739
+ if 3 <= len(invoice_num) <= 20:
740
+ return invoice_num.upper()
741
+
742
+ # PRIORITY 4: Look for long numeric values (12-20 digits) in top area
743
+ top_text = text_norm[:1000]
744
+ long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
745
+
746
+ if long_numerics:
747
+ # Take the longest one (most likely to be invoice number)
748
+ longest = max(long_numerics, key=len)
749
+ print(f"✓ Found long numeric value: {longest}")
750
+ return longest.upper()
751
 
752
+ # PRIORITY 5: Look near "Invoice" label for tokens, EXCLUDE batch patterns
753
  label_match = re.search(
754
+ r"(?:Invoice|Bill|Tax\s*Invoice)\s*(?:No|Number|#|\.|:\s*)",
755
  text_norm, re.IGNORECASE
756
  )
757
  if label_match:
758
  start_idx = label_match.end()
759
+ candidate_text = text_norm[start_idx:start_idx + 100]
760
+ print(f"🔍 Text after label: '{candidate_text[:50]}...'")
761
+
762
+ tokens = re.findall(r"\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b", candidate_text, re.IGNORECASE)
763
+ print(f"🔍 Tokens found: {tokens}")
764
+
765
+ for token in tokens:
766
+ token = token.strip(".,;:-*")
767
+
768
+ # Skip common words
769
+ if token.upper() in ("ORDER", "REF", "NO", "DATE", "DT", "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF"):
770
+ continue
771
+
772
+ # EXCLUDE batch patterns (F500256, I500734)
773
+ if re.match(r'^[A-Z]\d{6}$', token, re.IGNORECASE):
774
+ print(f"⚠️ Skipping (batch pattern): {token}")
775
+ continue
776
+
777
+ # EXCLUDE license patterns
778
+ if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', token, re.IGNORECASE):
779
+ print(f"⚠️ Skipping (license pattern): {token}")
780
  continue
781
+
782
+ if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
783
+ if 3 <= len(token) <= 20:
784
+ print(f"✓ Selected token: {token}")
785
+ return token.upper()
786
+
787
+ # PRIORITY 6: Medium-length numeric (10-15 digits)
788
+ medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
789
+ for num in medium_numerics:
790
+ # Exclude phone numbers (10 digits starting with 6-9)
791
+ if len(num) == 10 and num[0] in '6789':
792
+ continue
793
+ # Exclude dates (8 digits starting with 20)
794
+ if len(num) == 8 and num.startswith('20'):
795
+ continue
796
+ print(f"✓ Found medium numeric value: {num}")
797
+ return num.upper()
798
+
799
+ print("✗ No invoice number found")
800
  return None
801
 
 
802
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
803
  text = page.get_text("text") or ""
804
  inv = try_extract_invoice_from_text(text)
 
1001
  # ============================================================================
1002
  # 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
1003
  # ============================================================================
1004
+
1005
  print(f"\n🔧 Grouping invoices...")
1006
+
1007
  # DEBUG: Show raw extraction results
1008
  print(f"\n🔍 DEBUG - Raw extraction results:")
1009
  for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
1010
  print(f" Page {idx+1}: {inv if inv else '(not found)'}")
1011
  if len(page_invoice_nos) > 10:
1012
+ print(
1013
+ f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
1014
 
1015
  # Step 1: Normalize extracted invoice numbers (only filter GST numbers)
1016
  page_invoice_nos_normalized = []
 
1041
 
1042
  # Count how many pages were forward-filled
1043
  filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
1044
+ if page_invoice_nos_normalized[i] is None and page_invoice_nos_filled[i] is not None)
1045
 
1046
  # Debug: Count unique invoice numbers
1047
+ unique_invoices = set(
1048
+ [v for v in page_invoice_nos_filled if v is not None])
1049
  print(f"\n 📊 Found {len(unique_invoices)} unique invoice numbers:")
1050
  for inv_no in sorted(unique_invoices) if unique_invoices else []:
1051
  page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
 
1068
  "invoice_no": current_invoice,
1069
  "pages": current_group[:]
1070
  })
1071
+ print(
1072
+ f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1073
  current_invoice = inv
1074
  current_group = [idx]
1075
  else:
 
1082
  "invoice_no": current_invoice,
1083
  "pages": current_group[:]
1084
  })
1085
+ print(
1086
+ f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
1087
 
1088
  # Handle edge case: entire PDF has no invoice numbers
1089
  if len(groups) == 1 and groups[0]["invoice_no"] is None:
 
1093
  }]
1094
 
1095
  print(f"\n✅ Created {len(groups)} invoice groups")
1096
+ print(
1097
+ f" Forward-filled {filled_count} pages with missing invoice numbers")
1098
 
1099
  # Build and upload split PDFs
1100
  print(f"\n🔨 Building and uploading split invoices...")
 
1245
  "batch_id": batch_id,
1246
  "folder_path": f"{ROOT_FOLDER}/{batch_id}/",
1247
  "container": container_name
1248
+ })