Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -61,7 +61,8 @@ app.add_middleware(
|
|
| 61 |
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
|
| 62 |
|
| 63 |
# Azure Blob Storage Configuration - REQUIRED for blob storage
|
| 64 |
-
AZURE_STORAGE_CONNECTION_STRING = os.environ.get(
|
|
|
|
| 65 |
AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
|
| 66 |
AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
|
| 67 |
|
|
@@ -72,9 +73,12 @@ AZURE_CONTAINER_NAME = os.environ.get("AZURE_CONTAINER_NAME", "invoice-splits")
|
|
| 72 |
ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD") # Root folder name
|
| 73 |
|
| 74 |
# ⭐ PERFORMANCE CONFIGURATION
|
| 75 |
-
MAX_PARALLEL_GEMINI_CALLS = int(
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# ⭐ SERVER CONFIGURATION
|
| 80 |
HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
|
|
@@ -91,38 +95,42 @@ blob_service_client = None
|
|
| 91 |
# STARTUP VALIDATION
|
| 92 |
# ============================================================================
|
| 93 |
|
|
|
|
| 94 |
def validate_configuration():
|
| 95 |
"""Validate configuration and warn about missing credentials."""
|
| 96 |
warnings = []
|
| 97 |
errors = []
|
| 98 |
-
|
| 99 |
# Check Gemini API Key
|
| 100 |
if not GEMINI_API_KEY:
|
| 101 |
-
warnings.append(
|
|
|
|
| 102 |
else:
|
| 103 |
print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
|
| 104 |
-
|
| 105 |
# Check Azure credentials
|
| 106 |
if not AZURE_STORAGE_CONNECTION_STRING:
|
| 107 |
if not (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY):
|
| 108 |
-
errors.append(
|
|
|
|
| 109 |
else:
|
| 110 |
-
print(
|
|
|
|
| 111 |
else:
|
| 112 |
print(f"✅ Azure connection string configured")
|
| 113 |
-
|
| 114 |
# Print all warnings
|
| 115 |
for warning in warnings:
|
| 116 |
print(warning)
|
| 117 |
-
|
| 118 |
# Print all errors
|
| 119 |
for error in errors:
|
| 120 |
print(error)
|
| 121 |
-
|
| 122 |
if errors:
|
| 123 |
print("\n⚠️ WARNING: Some required credentials are missing!")
|
| 124 |
print(" Set them in Hugging Face Spaces Settings > Repository secrets")
|
| 125 |
-
|
| 126 |
return len(errors) == 0
|
| 127 |
|
| 128 |
|
|
@@ -410,16 +418,48 @@ def get_gemini_model():
|
|
| 410 |
return gemini_model
|
| 411 |
|
| 412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
|
| 414 |
"""
|
| 415 |
Optimized synchronous Gemini extraction for thread pool execution.
|
| 416 |
- Reduced image resolution for faster processing
|
| 417 |
- Simplified prompt for quicker responses
|
|
|
|
| 418 |
"""
|
| 419 |
model = get_gemini_model()
|
| 420 |
if not model:
|
| 421 |
return None
|
| 422 |
|
|
|
|
| 423 |
try:
|
| 424 |
# Reduced resolution for faster processing
|
| 425 |
pix = page.get_pixmap(matrix=fitz.Matrix(
|
|
@@ -428,26 +468,36 @@ def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
|
|
| 428 |
pix = None
|
| 429 |
img = Image.open(io.BytesIO(img_bytes))
|
| 430 |
|
| 431 |
-
#
|
| 432 |
prompt = """Extract ONLY the invoice number from this image.
|
| 433 |
-
|
| 434 |
-
Return ONLY the
|
| 435 |
|
| 436 |
response = model.generate_content([prompt, img])
|
| 437 |
if response and response.text:
|
| 438 |
extracted_text = response.text.strip()
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
return None
|
| 448 |
|
| 449 |
except Exception as e:
|
| 450 |
print(f"Gemini error: {e}")
|
|
|
|
|
|
|
| 451 |
return None
|
| 452 |
|
| 453 |
|
|
@@ -634,49 +684,121 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 634 |
if not text:
|
| 635 |
return None
|
| 636 |
text_norm = normalize_text_for_search(text)
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
-
|
| 639 |
-
|
|
|
|
| 640 |
text_norm, re.IGNORECASE
|
| 641 |
)
|
| 642 |
-
if
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
|
|
|
|
| 647 |
label_match = re.search(
|
| 648 |
-
r"(?:Invoice|
|
| 649 |
text_norm, re.IGNORECASE
|
| 650 |
)
|
| 651 |
if label_match:
|
| 652 |
start_idx = label_match.end()
|
| 653 |
-
candidate_text = text_norm[start_idx:start_idx +
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
continue
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
|
|
|
|
|
|
| 677 |
return None
|
| 678 |
|
| 679 |
-
|
| 680 |
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
|
| 681 |
text = page.get_text("text") or ""
|
| 682 |
inv = try_extract_invoice_from_text(text)
|
|
@@ -879,15 +1001,16 @@ async def split_invoices(
|
|
| 879 |
# ============================================================================
|
| 880 |
# 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
|
| 881 |
# ============================================================================
|
| 882 |
-
|
| 883 |
print(f"\n🔧 Grouping invoices...")
|
| 884 |
-
|
| 885 |
# DEBUG: Show raw extraction results
|
| 886 |
print(f"\n🔍 DEBUG - Raw extraction results:")
|
| 887 |
for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
|
| 888 |
print(f" Page {idx+1}: {inv if inv else '(not found)'}")
|
| 889 |
if len(page_invoice_nos) > 10:
|
| 890 |
-
print(
|
|
|
|
| 891 |
|
| 892 |
# Step 1: Normalize extracted invoice numbers (only filter GST numbers)
|
| 893 |
page_invoice_nos_normalized = []
|
|
@@ -918,10 +1041,11 @@ async def split_invoices(
|
|
| 918 |
|
| 919 |
# Count how many pages were forward-filled
|
| 920 |
filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
|
| 921 |
-
|
| 922 |
|
| 923 |
# Debug: Count unique invoice numbers
|
| 924 |
-
unique_invoices = set(
|
|
|
|
| 925 |
print(f"\n 📊 Found {len(unique_invoices)} unique invoice numbers:")
|
| 926 |
for inv_no in sorted(unique_invoices) if unique_invoices else []:
|
| 927 |
page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
|
|
@@ -944,7 +1068,8 @@ async def split_invoices(
|
|
| 944 |
"invoice_no": current_invoice,
|
| 945 |
"pages": current_group[:]
|
| 946 |
})
|
| 947 |
-
print(
|
|
|
|
| 948 |
current_invoice = inv
|
| 949 |
current_group = [idx]
|
| 950 |
else:
|
|
@@ -957,7 +1082,8 @@ async def split_invoices(
|
|
| 957 |
"invoice_no": current_invoice,
|
| 958 |
"pages": current_group[:]
|
| 959 |
})
|
| 960 |
-
print(
|
|
|
|
| 961 |
|
| 962 |
# Handle edge case: entire PDF has no invoice numbers
|
| 963 |
if len(groups) == 1 and groups[0]["invoice_no"] is None:
|
|
@@ -967,7 +1093,8 @@ async def split_invoices(
|
|
| 967 |
}]
|
| 968 |
|
| 969 |
print(f"\n✅ Created {len(groups)} invoice groups")
|
| 970 |
-
print(
|
|
|
|
| 971 |
|
| 972 |
# Build and upload split PDFs
|
| 973 |
print(f"\n🔨 Building and uploading split invoices...")
|
|
@@ -1118,151 +1245,4 @@ async def cleanup_batch(
|
|
| 1118 |
"batch_id": batch_id,
|
| 1119 |
"folder_path": f"{ROOT_FOLDER}/{batch_id}/",
|
| 1120 |
"container": container_name
|
| 1121 |
-
})
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
@app.get("/health")
async def health_check():
    """Report service health plus Gemini, Azure and runtime settings.

    Returns:
        A dict with an overall ``status``, an ISO-formatted ``timestamp``,
        per-service details under ``services``, tuning values under
        ``performance`` and the bind address under ``environment``.
    """
    gemini_state = "configured" if get_gemini_model() else "not configured"

    # Probe blob storage; any failure is reported in the payload instead of
    # failing the health endpoint itself.
    storage_state, storage_info = "not configured", None
    try:
        if get_blob_service_client():
            storage_state = "configured"
            storage_info = {
                "account_name": AZURE_STORAGE_ACCOUNT_NAME,
                "container": AZURE_CONTAINER_NAME,
                "root_folder": ROOT_FOLDER,
                "available": True
            }
    except Exception as exc:
        storage_state = f"error: {str(exc)}"

    gemini_section = {
        "status": gemini_state,
        "available": GEMINI_AVAILABLE,
        "model": "gemini-2.5-flash",
        "api_key_set": bool(GEMINI_API_KEY)
    }
    azure_section = {
        "status": storage_state,
        "available": AZURE_AVAILABLE,
        "details": storage_info,
        "credentials_set": bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
    }
    performance_section = {
        "max_parallel_gemini_calls": MAX_PARALLEL_GEMINI_CALLS,
        "gemini_image_resolution": GEMINI_IMAGE_RESOLUTION,
        "smart_sampling_default": USE_SMART_SAMPLING
    }

    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "services": {
            "gemini": gemini_section,
            "azure_blob_storage": azure_section
        },
        "performance": performance_section,
        "environment": {
            "host": HOST,
            "port": PORT
        }
    }
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
-
@app.get("/")
async def root():
    """Describe the API: name, version, features, layout and endpoints.

    Returns:
        A dict of static descriptive fields plus a ``configuration`` section
        whose ``environment_ready`` flag is the result of
        ``validate_configuration()``.
    """
    feature_summary = {
        "parallel_processing": f"Up to {MAX_PARALLEL_GEMINI_CALLS} concurrent Gemini API calls",
        "smart_sampling": "Optional fast mode for large PDFs (~5-10x faster)",
        "optimized_prompts": "Faster Gemini responses",
        "reduced_resolution": f"Image processing at {GEMINI_IMAGE_RESOLUTION}x for speed",
        "no_aggressive_filtering": "Keeps all extracted invoice numbers (fixed bug)"
    }
    folder_layout = {
        "format": "POD/{batch_id}/{filename}/Raw|Splitted/",
        "raw_folder": "Contains original uploaded PDF",
        "split_folder": "Contains individual split invoice PDFs"
    }
    endpoint_map = {
        "split_invoices": "/split-invoices",
        "cleanup_batch": "/cleanup-batch/{batch_id}",
        "health": "/health"
    }

    azure_ready = bool(AZURE_STORAGE_CONNECTION_STRING or (AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY))
    config_summary = {
        "gemini_configured": bool(GEMINI_API_KEY),
        "azure_configured": azure_ready,
        "environment_ready": validate_configuration()
    }

    return {
        "name": "Invoice Splitter API",
        "version": "6.0.0 - Fixed Grouping Logic",
        "description": "Split PDF invoices with Azure Blob Storage - Splits on invoice number change",
        "features": feature_summary,
        "folder_structure": folder_layout,
        "endpoints": endpoint_map,
        "configuration": config_summary
    }
|
| 1203 |
-
|
| 1204 |
-
|
| 1205 |
-
if __name__ == "__main__":
    # Script entry point: print a startup report, then serve the app.
    import uvicorn

    rule = "=" * 70
    print("\n" + rule, "🚀 Invoice Splitter API - v6.0 FIXED (Hugging Face)", rule, sep="\n")

    # Validate configuration (prints its own status lines).
    config_valid = validate_configuration()

    print(
        "\n⚡ Performance Features:",
        f" • Parallel Gemini API calls: {MAX_PARALLEL_GEMINI_CALLS} workers",
        f" • Image resolution: {GEMINI_IMAGE_RESOLUTION}x (optimized)",
        f" • Smart sampling: {'Enabled' if USE_SMART_SAMPLING else 'Disabled'} (optional)",
        " • Expected speed: 5-10x faster for image PDFs",
        sep="\n",
    )

    print(
        "\n🔧 Bug Fixes:",
        " • ✅ Removed aggressive frequency filtering",
        " • ✅ Splits on every invoice number change",
        " • ✅ Keeps all extracted invoice numbers",
        " • ✅ Added detailed debug logging",
        sep="\n",
    )

    print(
        "\n📁 Folder Structure:",
        f" {ROOT_FOLDER}/{{batch_id}}/{{filename}}/",
        " ├── Raw/ (original PDF)",
        " └── Splitted/ (split invoices)",
        sep="\n",
    )

    print(
        "\n📦 Azure Configuration:",
        f" Account: {AZURE_STORAGE_ACCOUNT_NAME or 'Not set'}",
        f" Container: {AZURE_CONTAINER_NAME}",
        sep="\n",
    )

    # Service probes stay after the section header lines, as in the original,
    # so anything they print appears in the same place.
    azure_line = (
        " ✅ Azure Blob Storage: Connected"
        if get_blob_service_client()
        else " ⚠️ Azure Blob Storage: Not configured"
    )
    print(azure_line)

    gemini_line = (
        " ✅ Gemini AI: Connected (gemini-2.5-flash)"
        if get_gemini_model()
        else " ⚠️ Gemini AI: Not configured"
    )
    print(gemini_line)

    print("\n🌐 Server Configuration:", f" Host: {HOST}", f" Port: {PORT}", sep="\n")

    if not config_valid:
        print(
            "\n⚠️ WARNING: Some credentials are missing!",
            " For Hugging Face deployment:",
            " 1. Go to your Space Settings > Repository secrets",
            " 2. Add the following secrets:",
            " - GEMINI_API_KEY",
            " - AZURE_STORAGE_CONNECTION_STRING (or)",
            " - AZURE_STORAGE_ACCOUNT_NAME + AZURE_STORAGE_ACCOUNT_KEY",
            sep="\n",
        )

    print("\n" + rule + "\n")

    server_options = {
        "host": HOST,
        "port": PORT,
        "workers": 1,
        "timeout_keep_alive": 600,
    }
    uvicorn.run(app, **server_options)
|
|
|
|
| 61 |
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
|
| 62 |
|
| 63 |
# Azure Blob Storage Configuration - REQUIRED for blob storage
|
| 64 |
+
AZURE_STORAGE_CONNECTION_STRING = os.environ.get(
|
| 65 |
+
"AZURE_STORAGE_CONNECTION_STRING", "")
|
| 66 |
AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "")
|
| 67 |
AZURE_STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
|
| 68 |
|
|
|
|
| 73 |
ROOT_FOLDER = os.environ.get("ROOT_FOLDER", "POD") # Root folder name
|
| 74 |
|
| 75 |
# ⭐ PERFORMANCE CONFIGURATION
|
| 76 |
+
MAX_PARALLEL_GEMINI_CALLS = int(
|
| 77 |
+
os.environ.get("MAX_PARALLEL_GEMINI_CALLS", "5"))
|
| 78 |
+
GEMINI_IMAGE_RESOLUTION = float(
|
| 79 |
+
os.environ.get("GEMINI_IMAGE_RESOLUTION", "1.2"))
|
| 80 |
+
USE_SMART_SAMPLING = os.environ.get(
|
| 81 |
+
"USE_SMART_SAMPLING", "false").lower() == "true"
|
| 82 |
|
| 83 |
# ⭐ SERVER CONFIGURATION
|
| 84 |
HOST = os.environ.get("HOST", "0.0.0.0") # Hugging Face uses 0.0.0.0
|
|
|
|
| 95 |
# STARTUP VALIDATION
|
| 96 |
# ============================================================================
|
| 97 |
|
| 98 |
+
|
| 99 |
def validate_configuration():
    """Print the credential status and report whether required ones are set.

    A missing Gemini key is only a warning; missing Azure credentials (neither
    a connection string nor an account name + key pair) are an error.

    Returns:
        True when no errors were recorded, False otherwise.
    """
    pending_warnings = []
    blocking_errors = []

    # Gemini API key: optional, so its absence is queued as a warning.
    if GEMINI_API_KEY:
        print(f"✅ GEMINI_API_KEY configured ({len(GEMINI_API_KEY)} chars)")
    else:
        pending_warnings.append(
            "⚠️ GEMINI_API_KEY not set - image-based PDFs will not work")

    # Azure: a connection string wins; otherwise both account name and key
    # must be present.
    if AZURE_STORAGE_CONNECTION_STRING:
        print("✅ Azure connection string configured")
    elif AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
        print(
            f"✅ Azure credentials configured (account: {AZURE_STORAGE_ACCOUNT_NAME})")
    else:
        blocking_errors.append(
            "❌ Azure credentials missing - set AZURE_STORAGE_CONNECTION_STRING or both AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY")

    # Warnings are emitted first, then errors.
    for message in pending_warnings + blocking_errors:
        print(message)

    if not blocking_errors:
        return True

    print("\n⚠️ WARNING: Some required credentials are missing!")
    print(" Set them in Hugging Face Spaces Settings > Repository secrets")
    return False
|
| 135 |
|
| 136 |
|
|
|
|
| 418 |
return gemini_model
|
| 419 |
|
| 420 |
|
| 421 |
+
def _clean_gemini_invoice_text(text: str) -> Optional[str]:
|
| 422 |
+
if not text:
|
| 423 |
+
return None
|
| 424 |
+
|
| 425 |
+
cleaned = text.strip()
|
| 426 |
+
cleaned = cleaned.replace("*", "").replace("#", "")
|
| 427 |
+
cleaned = re.sub(
|
| 428 |
+
r"(?i)\b(invoice|inv|bill|document|doc|tax\s*invoice)\s*(no|number)?\b",
|
| 429 |
+
"",
|
| 430 |
+
cleaned
|
| 431 |
+
)
|
| 432 |
+
cleaned = re.sub(r"[:\-\(\)\[\]]", " ", cleaned)
|
| 433 |
+
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
| 434 |
+
|
| 435 |
+
# Extract candidates
|
| 436 |
+
tokens = re.findall(r"[A-Z0-9][A-Z0-9\-\/]{2,}", cleaned.upper())
|
| 437 |
+
|
| 438 |
+
# Prefer alphanumeric invoice IDs first
|
| 439 |
+
for token in tokens:
|
| 440 |
+
if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
|
| 441 |
+
return token
|
| 442 |
+
|
| 443 |
+
# Fallback to numeric-only (6-15 digits)
|
| 444 |
+
for token in tokens:
|
| 445 |
+
if token.isdigit() and 6 <= len(token) <= 15:
|
| 446 |
+
return token
|
| 447 |
+
|
| 448 |
+
return None
|
| 449 |
+
|
| 450 |
+
|
| 451 |
def extract_invoice_gemini_sync(page: fitz.Page) -> Optional[str]:
|
| 452 |
"""
|
| 453 |
Optimized synchronous Gemini extraction for thread pool execution.
|
| 454 |
- Reduced image resolution for faster processing
|
| 455 |
- Simplified prompt for quicker responses
|
| 456 |
+
- OCR fallback for better accuracy
|
| 457 |
"""
|
| 458 |
model = get_gemini_model()
|
| 459 |
if not model:
|
| 460 |
return None
|
| 461 |
|
| 462 |
+
img = None
|
| 463 |
try:
|
| 464 |
# Reduced resolution for faster processing
|
| 465 |
pix = page.get_pixmap(matrix=fitz.Matrix(
|
|
|
|
| 468 |
pix = None
|
| 469 |
img = Image.open(io.BytesIO(img_bytes))
|
| 470 |
|
| 471 |
+
# Updated prompt to prioritize labeled alphanumeric invoice numbers
|
| 472 |
prompt = """Extract ONLY the invoice number from this image.
|
| 473 |
+
Prefer the value next to labels like: Invoice No, Invoice Number, Bill No, Document No.
|
| 474 |
+
Return ONLY the identifier (keep letters, e.g., A07966). If not found, return: NONE."""
|
| 475 |
|
| 476 |
response = model.generate_content([prompt, img])
|
| 477 |
if response and response.text:
|
| 478 |
extracted_text = response.text.strip()
|
| 479 |
+
candidate = _clean_gemini_invoice_text(extracted_text)
|
| 480 |
+
if candidate and len(candidate) > 2:
|
| 481 |
+
img.close()
|
| 482 |
+
return candidate
|
| 483 |
+
|
| 484 |
+
# OCR Fallback: Extract full text then run regex
|
| 485 |
+
ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
|
| 486 |
+
ocr_response = model.generate_content([ocr_prompt, img])
|
| 487 |
+
if ocr_response and ocr_response.text:
|
| 488 |
+
inv = try_extract_invoice_from_text(ocr_response.text)
|
| 489 |
+
if inv:
|
| 490 |
+
img.close()
|
| 491 |
+
return inv
|
| 492 |
+
|
| 493 |
+
if img:
|
| 494 |
+
img.close()
|
| 495 |
return None
|
| 496 |
|
| 497 |
except Exception as e:
|
| 498 |
print(f"Gemini error: {e}")
|
| 499 |
+
if img:
|
| 500 |
+
img.close()
|
| 501 |
return None
|
| 502 |
|
| 503 |
|
|
|
|
| 684 |
if not text:
|
| 685 |
return None
|
| 686 |
text_norm = normalize_text_for_search(text)
|
| 687 |
+
|
| 688 |
+
# DEBUG: Print first 600 chars
|
| 689 |
+
print(f"\n🔍 DEBUG - Extracted text (first 600 chars):\n{text_norm[:600]}\n")
|
| 690 |
|
| 691 |
+
# PRIORITY 1: Look for CREDIT number (14 digits, common in pharma invoices)
|
| 692 |
+
credit_match = re.search(
|
| 693 |
+
r"CREDIT\s*(?:NO|NUMBER|#)?\s*[:\-]?\s*(\d{12,20})",
|
| 694 |
text_norm, re.IGNORECASE
|
| 695 |
)
|
| 696 |
+
if credit_match:
|
| 697 |
+
credit_num = credit_match.group(1).strip()
|
| 698 |
+
print(f"✓ Found CREDIT number: {credit_num}")
|
| 699 |
+
if 12 <= len(credit_num) <= 20:
|
| 700 |
+
return credit_num.upper()
|
| 701 |
+
|
| 702 |
+
# PRIORITY 2: Look for "Invoice No" or "Bill No" followed by long numeric (12-20 digits)
|
| 703 |
+
invoice_patterns = [
|
| 704 |
+
r"Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
|
| 705 |
+
r"Bill\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
|
| 706 |
+
r"Tax\s*Invoice\s*(?:No|Number)\.?\s*[:\-]?\s*(\d{12,20})",
|
| 707 |
+
]
|
| 708 |
+
|
| 709 |
+
for pattern in invoice_patterns:
|
| 710 |
+
match = re.search(pattern, text_norm, re.IGNORECASE)
|
| 711 |
+
if match:
|
| 712 |
+
num = match.group(1).strip()
|
| 713 |
+
print(f"✓ Found labeled long numeric invoice: {num}")
|
| 714 |
+
return num.upper()
|
| 715 |
+
|
| 716 |
+
# PRIORITY 3: Look for "Invoice No" with alphanumeric (but EXCLUDE batch patterns)
|
| 717 |
+
label_patterns = [
|
| 718 |
+
r"Invoice\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
|
| 719 |
+
r"Bill\s*No\.?\s*[:\-]\s*([A-Z][A-Z0-9\-\/]{2,20})",
|
| 720 |
+
]
|
| 721 |
+
|
| 722 |
+
for pattern in label_patterns:
|
| 723 |
+
match = re.search(pattern, text_norm, re.IGNORECASE)
|
| 724 |
+
if match:
|
| 725 |
+
invoice_num = match.group(1).strip()
|
| 726 |
+
|
| 727 |
+
# EXCLUDE batch number patterns (single letter + 6 digits: F500256, I500734, etc.)
|
| 728 |
+
if re.match(r'^[A-Z]\d{6}$', invoice_num, re.IGNORECASE):
|
| 729 |
+
print(f"⚠️ Skipping (batch pattern): {invoice_num}")
|
| 730 |
+
continue
|
| 731 |
+
|
| 732 |
+
# EXCLUDE license patterns (KA-MY2-157424)
|
| 733 |
+
if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', invoice_num, re.IGNORECASE):
|
| 734 |
+
print(f"⚠️ Skipping (license pattern): {invoice_num}")
|
| 735 |
+
continue
|
| 736 |
+
|
| 737 |
+
print(f"✓ Found labeled alphanumeric: {invoice_num}")
|
| 738 |
+
if any(c.isalpha() for c in invoice_num) and any(c.isdigit() for c in invoice_num):
|
| 739 |
+
if 3 <= len(invoice_num) <= 20:
|
| 740 |
+
return invoice_num.upper()
|
| 741 |
+
|
| 742 |
+
# PRIORITY 4: Look for long numeric values (12-20 digits) in top area
|
| 743 |
+
top_text = text_norm[:1000]
|
| 744 |
+
long_numerics = re.findall(r'\b(\d{12,20})\b', top_text)
|
| 745 |
+
|
| 746 |
+
if long_numerics:
|
| 747 |
+
# Take the longest one (most likely to be invoice number)
|
| 748 |
+
longest = max(long_numerics, key=len)
|
| 749 |
+
print(f"✓ Found long numeric value: {longest}")
|
| 750 |
+
return longest.upper()
|
| 751 |
|
| 752 |
+
# PRIORITY 5: Look near "Invoice" label for tokens, EXCLUDE batch patterns
|
| 753 |
label_match = re.search(
|
| 754 |
+
r"(?:Invoice|Bill|Tax\s*Invoice)\s*(?:No|Number|#|\.|:\s*)",
|
| 755 |
text_norm, re.IGNORECASE
|
| 756 |
)
|
| 757 |
if label_match:
|
| 758 |
start_idx = label_match.end()
|
| 759 |
+
candidate_text = text_norm[start_idx:start_idx + 100]
|
| 760 |
+
print(f"🔍 Text after label: '{candidate_text[:50]}...'")
|
| 761 |
+
|
| 762 |
+
tokens = re.findall(r"\b([A-Z0-9][A-Z0-9\-\/]{2,20})\b", candidate_text, re.IGNORECASE)
|
| 763 |
+
print(f"🔍 Tokens found: {tokens}")
|
| 764 |
+
|
| 765 |
+
for token in tokens:
|
| 766 |
+
token = token.strip(".,;:-*")
|
| 767 |
+
|
| 768 |
+
# Skip common words
|
| 769 |
+
if token.upper() in ("ORDER", "REF", "NO", "DATE", "DT", "INV", "BILL", "ACCOUNT", "PO", "COPY", "OF"):
|
| 770 |
+
continue
|
| 771 |
+
|
| 772 |
+
# EXCLUDE batch patterns (F500256, I500734)
|
| 773 |
+
if re.match(r'^[A-Z]\d{6}$', token, re.IGNORECASE):
|
| 774 |
+
print(f"⚠️ Skipping (batch pattern): {token}")
|
| 775 |
+
continue
|
| 776 |
+
|
| 777 |
+
# EXCLUDE license patterns
|
| 778 |
+
if re.match(r'^[A-Z]{2,3}-[A-Z0-9]+-\d+$', token, re.IGNORECASE):
|
| 779 |
+
print(f"⚠️ Skipping (license pattern): {token}")
|
| 780 |
continue
|
| 781 |
+
|
| 782 |
+
if any(c.isalpha() for c in token) and any(c.isdigit() for c in token):
|
| 783 |
+
if 3 <= len(token) <= 20:
|
| 784 |
+
print(f"✓ Selected token: {token}")
|
| 785 |
+
return token.upper()
|
| 786 |
+
|
| 787 |
+
# PRIORITY 6: Medium-length numeric (10-15 digits)
|
| 788 |
+
medium_numerics = re.findall(r'\b(\d{10,15})\b', top_text)
|
| 789 |
+
for num in medium_numerics:
|
| 790 |
+
# Exclude phone numbers (10 digits starting with 6-9)
|
| 791 |
+
if len(num) == 10 and num[0] in '6789':
|
| 792 |
+
continue
|
| 793 |
+
# Exclude dates (8 digits starting with 20)
|
| 794 |
+
if len(num) == 8 and num.startswith('20'):
|
| 795 |
+
continue
|
| 796 |
+
print(f"✓ Found medium numeric value: {num}")
|
| 797 |
+
return num.upper()
|
| 798 |
+
|
| 799 |
+
print("✗ No invoice number found")
|
| 800 |
return None
|
| 801 |
|
|
|
|
| 802 |
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
|
| 803 |
text = page.get_text("text") or ""
|
| 804 |
inv = try_extract_invoice_from_text(text)
|
|
|
|
| 1001 |
# ============================================================================
|
| 1002 |
# 🔧 CORRECTED GROUPING LOGIC - NO AGGRESSIVE FILTERING
|
| 1003 |
# ============================================================================
|
| 1004 |
+
|
| 1005 |
print(f"\n🔧 Grouping invoices...")
|
| 1006 |
+
|
| 1007 |
# DEBUG: Show raw extraction results
|
| 1008 |
print(f"\n🔍 DEBUG - Raw extraction results:")
|
| 1009 |
for idx, inv in enumerate(page_invoice_nos[:min(10, len(page_invoice_nos))]):
|
| 1010 |
print(f" Page {idx+1}: {inv if inv else '(not found)'}")
|
| 1011 |
if len(page_invoice_nos) > 10:
|
| 1012 |
+
print(
|
| 1013 |
+
f" ... (showing first 10 of {len(page_invoice_nos)} pages)")
|
| 1014 |
|
| 1015 |
# Step 1: Normalize extracted invoice numbers (only filter GST numbers)
|
| 1016 |
page_invoice_nos_normalized = []
|
|
|
|
| 1041 |
|
| 1042 |
# Count how many pages were forward-filled
|
| 1043 |
filled_count = sum(1 for i in range(len(page_invoice_nos_normalized))
|
| 1044 |
+
if page_invoice_nos_normalized[i] is None and page_invoice_nos_filled[i] is not None)
|
| 1045 |
|
| 1046 |
# Debug: Count unique invoice numbers
|
| 1047 |
+
unique_invoices = set(
|
| 1048 |
+
[v for v in page_invoice_nos_filled if v is not None])
|
| 1049 |
print(f"\n 📊 Found {len(unique_invoices)} unique invoice numbers:")
|
| 1050 |
for inv_no in sorted(unique_invoices) if unique_invoices else []:
|
| 1051 |
page_count = sum(1 for v in page_invoice_nos_filled if v == inv_no)
|
|
|
|
| 1068 |
"invoice_no": current_invoice,
|
| 1069 |
"pages": current_group[:]
|
| 1070 |
})
|
| 1071 |
+
print(
|
| 1072 |
+
f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1073 |
current_invoice = inv
|
| 1074 |
current_group = [idx]
|
| 1075 |
else:
|
|
|
|
| 1082 |
"invoice_no": current_invoice,
|
| 1083 |
"pages": current_group[:]
|
| 1084 |
})
|
| 1085 |
+
print(
|
| 1086 |
+
f" 📄 Group {len(groups)}: Invoice {current_invoice or 'UNKNOWN'} - Pages {current_group[0]+1}-{current_group[-1]+1} ({len(current_group)} pages)")
|
| 1087 |
|
| 1088 |
# Handle edge case: entire PDF has no invoice numbers
|
| 1089 |
if len(groups) == 1 and groups[0]["invoice_no"] is None:
|
|
|
|
| 1093 |
}]
|
| 1094 |
|
| 1095 |
print(f"\n✅ Created {len(groups)} invoice groups")
|
| 1096 |
+
print(
|
| 1097 |
+
f" Forward-filled {filled_count} pages with missing invoice numbers")
|
| 1098 |
|
| 1099 |
# Build and upload split PDFs
|
| 1100 |
print(f"\n🔨 Building and uploading split invoices...")
|
|
|
|
| 1245 |
"batch_id": batch_id,
|
| 1246 |
"folder_path": f"{ROOT_FOLDER}/{batch_id}/",
|
| 1247 |
"container": container_name
|
| 1248 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|