anujakkulkarni committed on
Commit
e531516
·
verified ·
1 Parent(s): 63e2ea5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +380 -124
app.py CHANGED
@@ -2,7 +2,10 @@ import os
2
  import io
3
  import re
4
  import base64
 
 
5
  from typing import List, Dict, Optional, Tuple
 
6
 
7
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
@@ -14,7 +17,7 @@ try:
14
  import google.generativeai as genai
15
  from PIL import Image
16
  GEMINI_AVAILABLE = True
17
- except ImportError:
18
  GEMINI_AVAILABLE = False
19
  print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
20
 
@@ -29,48 +32,202 @@ app.add_middleware(
29
  )
30
 
31
  # --- Google Gemini Configuration ---
32
- # This will be automatically loaded from environment variables
33
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  gemini_model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def get_gemini_model():
39
- """Get or create Gemini model instance."""
40
- global gemini_model
41
 
42
  if not GEMINI_AVAILABLE:
43
  print("Gemini SDK not available")
44
  return None
45
 
46
- if gemini_model is None:
47
- # Check if API key is configured via environment variables
48
- if not GEMINI_API_KEY:
49
- print("Warning: Gemini API key not found in environment variables.")
50
- print("Please configure GEMINI_API_KEY in your environment variables.")
51
- return None
52
 
 
 
 
 
 
 
 
 
53
  try:
54
  genai.configure(api_key=GEMINI_API_KEY)
55
- gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
56
- print("✓ Google Gemini Flash 2.0 initialized")
57
- except Exception as e:
58
- print(f"Failed to initialize Gemini model: {e}")
59
  return None
60
 
61
  return gemini_model
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # --- Regex patterns for text-based PDF extraction ---
65
  INVOICE_NO_RE = re.compile(
66
  r"""
67
- (?:
68
  Invoice\s*No\.?|
69
- Inv\.?\s*No\.?|
70
  Bill\s*No\.?|
71
- Document\s*No\.?| # ✅ ADD THIS
72
  Doc\s*No\.?|
73
- Tax\s*Invoice\s*No\.?
74
  )
75
  \s*[:\-]?\s*
76
  ([A-Z0-9][A-Z0-9\-\/]{3,})
@@ -78,50 +235,43 @@ INVOICE_NO_RE = re.compile(
78
  re.IGNORECASE | re.VERBOSE
79
  )
80
 
81
-
82
  PREFIXED_INVOICE_RE = re.compile(
83
- r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
84
  )
85
 
86
  GST_LIKE_RE = re.compile(
87
- r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 
 
88
 
89
 
90
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
91
  """
92
  Detect if PDF is image-based or text-based by sampling pages.
93
  Returns (is_image_based, avg_text_length).
94
-
95
- Strategy:
96
- - Sample first few pages
97
- - If average extractable text < 50 chars per page, it's likely image-based
98
- - If text > 200 chars per page, it's text-based
99
  """
100
  total_text_length = 0
101
  pages_to_check = min(sample_pages, doc.page_count)
102
 
103
  for i in range(pages_to_check):
104
- text = doc. load_page(i).get_text("text") or ""
105
  total_text_length += len(text. strip())
106
 
107
  avg_text_length = total_text_length / pages_to_check
108
  is_image_based = avg_text_length < 50
109
 
110
- print(
111
- f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
112
- print(
113
- f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
114
 
115
  return is_image_based, avg_text_length
116
 
117
 
118
  # ============================================================================
119
- # TEXT-BASED PDF EXTRACTION (Original Code)
120
  # ============================================================================
121
 
122
-
123
  def normalize_text_for_search(s: str) -> str:
124
- """Light normalization: collapse whitespace and normalize common separators."""
125
  if not s:
126
  return s
127
  s = s.replace("\u00A0", " ") # non-breaking space
@@ -131,51 +281,40 @@ def normalize_text_for_search(s: str) -> str:
131
 
132
 
133
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
134
- """
135
- Extract invoice number from text using regex patterns.
136
- - Prefer explicit labeled Invoice/Bill patterns.
137
- - Prefer prefixed invoice formats found in the top of the page.
138
- - Use GST only as a last resort and tag it so it won't be mistaken for an invoice id.
139
- """
140
  if not text:
141
  return None
142
 
143
  text_norm = normalize_text_for_search(text)
144
 
145
  # 1) Labeled invoice like "Invoice No", "Inv No."
146
- m = INVOICE_NO_RE.search(text_norm)
147
  if m:
148
  inv = (m.group(1) or "").strip()
149
  if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
150
  return inv
151
 
152
- # 2) Search top portion for prefixed invoice codes (WN-1234, 5EN19710, etc.)
153
- top_text = text_norm[:600] # bigger top area to be robust
154
  m = PREFIXED_INVOICE_RE.search(top_text)
155
  if m:
156
  inv = (m.group(1) or "").strip()
157
- # extra length check so tiny numeric matches don't pass
158
  if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
159
  return inv
160
 
161
- # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
162
  gm = GST_LIKE_RE.search(text_norm)
163
  if gm:
164
  gst_val = gm.group(2) or ""
165
  gst_val = gst_val.replace(" ", "").strip().upper()
166
- # Only accept if 15 alnum chars (typical Indian GSTIN length)
167
  if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
168
- # tag it so grouping won't treat GST same as invoice ID
169
  return f"GST:{gst_val}"
170
 
171
  return None
172
 
173
 
174
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
175
- """
176
- Extract invoice number from TEXT-BASED PDF.
177
- Uses the original fast text extraction method.
178
- """
179
  # Try full-page text
180
  text = page.get_text("text") or ""
181
  inv = try_extract_invoice_from_text(text)
@@ -194,30 +333,41 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
194
 
195
 
196
  # ============================================================================
197
- # IMAGE-BASED PDF EXTRACTION (Google Gemini)
198
  # ============================================================================
199
 
200
- def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
201
  """
202
- Extract invoice number from IMAGE-BASED PDF using Google Gemini Flash 2.0.
 
203
  """
 
 
 
 
 
204
  model = get_gemini_model()
205
  if not model:
206
  print(" Gemini model not available")
207
  return None
208
 
 
 
 
 
 
 
 
209
  try:
210
  # Convert page to image
211
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x resolution
212
  img_bytes = pix.tobytes("png")
213
-
214
- # Convert to PIL Image for Gemini
215
  img = Image.open(io.BytesIO(img_bytes))
216
 
217
- # Prompt for Gemini to extract invoice number
218
  prompt = """
219
- Extract the invoice number from this image. Look for:
220
- - Invoice No, Invoice Number, Bill No, Bill Number
221
  - Any alphanumeric code that appears to be an invoice identifier
222
  - Purchase Order numbers if no invoice number is found
223
 
@@ -225,7 +375,9 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
225
  If no invoice number is found, return "NOT_FOUND".
226
  """
227
 
228
- print(" Calling Google Gemini API...")
 
 
229
  response = model.generate_content([prompt, img])
230
 
231
  if response and response.text:
@@ -233,20 +385,17 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
233
  print(f" Gemini response: {extracted_text}")
234
 
235
  if extracted_text and extracted_text != "NOT_FOUND":
236
- # Clean up the response
237
- invoice_no = extracted_text.replace(
238
- "*", "").replace("#", "").strip()
239
  if invoice_no and len(invoice_no) > 2:
240
- print(f" ✓ Gemini found invoice: {invoice_no}")
241
  return invoice_no
242
 
243
  # Fallback: Get full OCR text and try regex
244
- ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
245
  ocr_response = model.generate_content([ocr_prompt, img])
246
 
247
  if ocr_response and ocr_response.text:
248
- print(
249
- f" Gemini extracted {len(ocr_response.text)} chars, trying regex...")
250
  inv = try_extract_invoice_from_text(ocr_response.text)
251
  if inv:
252
  print(f" ✓ Found via regex on Gemini text: {inv}")
@@ -255,7 +404,44 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
255
  print(" ✗ Gemini: No invoice found")
256
  return None
257
 
258
- except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  print(f" ✗ Gemini extraction failed: {e}")
260
  return None
261
 
@@ -266,7 +452,6 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
266
 
267
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
268
  """Try text extraction first, then Gemini as fallback"""
269
-
270
  # ALWAYS try text extraction first (fast, no API cost)
271
  text_result = extract_invoice_text_based(page)
272
  if text_result:
@@ -274,7 +459,7 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
274
  return text_result
275
 
276
  # If text fails AND PDF seems image-based, try Gemini
277
- if is_image_pdf:
278
  gemini_result = extract_invoice_gemini(page)
279
  if gemini_result:
280
  print(f" ✓ Found via Gemini: {gemini_result}")
@@ -294,23 +479,23 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
294
 
295
 
296
  # ============================================================================
297
- # API ENDPOINT
298
  # ============================================================================
299
 
300
  @app.post("/split-invoices")
301
  async def split_invoices(
302
  file: UploadFile = File(...),
303
  include_pdf: bool = Form(True),
304
- initial_dpi: int = Form(300), # Kept for compatibility
305
  ):
306
  """
307
  Split a multi-invoice PDF into separate PDFs based on invoice numbers.
308
-
309
- - Text-based PDFs: Uses fast text extraction
310
- - Image-based PDFs: Uses Google Gemini Flash 2.0 (if configured)
311
-
312
- Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
313
- are ignored for splitting by default (so repeated company GST won't prevent splits).
314
  """
315
  if not file.filename.lower().endswith(".pdf"):
316
  raise HTTPException(status_code=400, detail="only PDF is supported")
@@ -320,119 +505,116 @@ async def split_invoices(
320
  raise HTTPException(status_code=400, detail="empty file")
321
 
322
  try:
323
- doc = fitz.open(stream=file_bytes, filetype="pdf")
324
- if doc.page_count == 0:
325
  raise HTTPException(status_code=400, detail="no pages found")
326
 
327
  print(f"\n{'='*60}")
328
  print(f"Processing PDF: {file.filename}")
329
  print(f"Total pages: {doc.page_count}")
 
 
 
 
330
  print(f"{'='*60}")
331
 
332
- # Step 1: Detect PDF type (text-based vs image-based)
333
  is_image_pdf, avg_text_len = is_image_based_pdf(doc)
334
 
335
  if is_image_pdf and not get_gemini_model():
336
- raise HTTPException(
337
- status_code=500,
338
- detail="Image-based PDF detected but Google Gemini is not configured. "
339
- "Please add GEMINI_API_KEY to your environment variables."
340
- )
 
 
 
 
 
 
 
341
 
342
  # Step 2: Extract invoice numbers from each page
343
- page_invoice_nos: List[Optional[str]] = []
344
  for i in range(doc.page_count):
345
  print(f"\n--- Page {i+1}/{doc.page_count} ---")
346
- inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
347
- # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
348
- if inv:
349
  print(f" ✓ Raw extracted id: {inv}")
350
  else:
351
- print(f" ✗ No invoice found (raw)")
352
  page_invoice_nos.append(inv)
353
 
354
  print(f"\n{'='*60}")
355
  print(f"Raw Extraction Results: {page_invoice_nos}")
356
  print(f"{'='*60}")
357
 
358
- # ---------------------------------------------------------
359
- # Post-process extracted ids before grouping
360
- # - Treat GST:<value> as a LAST-RESORT marker and ignore it for splitting
361
- # (convert to None) so repeated company GST doesn't group pages together.
362
- # - Keep actual invoice ids like '5EN19710' intact.
363
- # ---------------------------------------------------------
364
- page_invoice_nos_filtered: List[Optional[str]] = []
365
- for v in page_invoice_nos:
366
  if v is None:
367
  page_invoice_nos_filtered.append(None)
368
  else:
369
- # If GST-tagged value (we returned "GST:..."), ignore it for splitting
370
  if isinstance(v, str) and v.upper().startswith("GST:"):
371
  page_invoice_nos_filtered.append(None)
372
  else:
373
- page_invoice_nos_filtered.append(v)
374
 
375
  print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
376
 
377
- # Step 3: Group pages by invoice number (use filtered ids)
378
  groups: List[Dict] = []
379
- current_group_pages: List[int] = []
380
- current_invoice: Optional[str] = None
381
 
382
  for idx, inv in enumerate(page_invoice_nos_filtered):
383
  if current_invoice is None:
384
- # Start a new group (even if inv is None)
385
  current_invoice = inv
386
  current_group_pages = [idx]
387
  else:
388
- # If a new non-empty invoice appears and differs -> close current group
389
  if inv is not None and inv != current_invoice:
390
  groups.append({
391
  "invoice_no": current_invoice,
392
- "pages": current_group_pages[:],
393
  })
394
  current_invoice = inv
395
  current_group_pages = [idx]
396
  else:
397
- # Continue current group (same invoice or both None)
398
  current_group_pages.append(idx)
399
 
400
  # Save last group
401
  if current_group_pages:
402
  groups.append({
403
- "invoice_no": current_invoice,
404
  "pages": current_group_pages[:]
405
  })
406
 
407
- # Post-process groups:
408
- # If first group has invoice_no None and next group has non-None -> merge leading None
409
  if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
410
  groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
411
- groups.pop(0)
412
 
413
- # If, after filtering, all groups are None (no invoice detected), return whole doc as one part
414
  if all(g["invoice_no"] is None for g in groups):
415
- print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
416
  print(" Returning entire PDF as single part")
417
  groups = [{
418
  "invoice_no": None,
419
  "pages": list(range(doc.page_count))
420
  }]
421
 
422
- # Step 4: Build response parts
423
  parts = []
424
  for idx, g in enumerate(groups):
425
  part_bytes = build_pdf_from_pages(doc, g["pages"])
426
  info = {
427
- # Keep invoice_no as detected in filtered set (None or actual invoice id)
428
  "invoice_no": g["invoice_no"],
429
- "pages": [p + 1 for p in g["pages"]], # 1-based for humans
430
  "num_pages": len(g["pages"]),
431
  "size_bytes": len(part_bytes),
432
  }
433
  if include_pdf:
434
- info["pdf_base64"] = base64.b64encode(
435
- part_bytes).decode("ascii")
436
  parts.append(info)
437
  print(f"\nPart {idx+1}:")
438
  print(f" Invoice: {g['invoice_no']}")
@@ -448,13 +630,19 @@ async def split_invoices(
448
  return JSONResponse({
449
  "count": len(parts),
450
  "pdf_type": "image-based" if is_image_pdf else "text-based",
 
 
 
 
 
 
451
  "parts": parts
452
  })
453
 
454
- except HTTPException:
455
  raise
456
  except Exception as e:
457
- print(f"\n✗ Error: {str(e)}")
458
  import traceback
459
  traceback.print_exc()
460
  return JSONResponse({"error": str(e)}, status_code=500)
@@ -463,13 +651,81 @@ async def split_invoices(
463
  @app.get("/health")
464
  async def health_check():
465
  """Health check endpoint to verify Gemini configuration."""
466
- gemini_status = "configured" if get_gemini_model() else "not configured"
 
 
 
 
 
 
467
  return {
468
  "status": "healthy",
469
- "gemini_flash": gemini_status,
470
  "gemini_available": GEMINI_AVAILABLE,
 
 
 
 
 
 
471
  }
472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  if __name__ == "__main__":
474
  import uvicorn
475
- uvicorn.run(app, host="0.0.0.0", port=8001)
 
 
 
 
 
 
 
 
 
 
 
 
2
  import io
3
  import re
4
  import base64
5
+ import time
6
+ import datetime
7
  from typing import List, Dict, Optional, Tuple
8
+ from collections import deque
9
 
10
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
 
17
  import google.generativeai as genai
18
  from PIL import Image
19
  GEMINI_AVAILABLE = True
20
+ except ImportError:
21
  GEMINI_AVAILABLE = False
22
  print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
23
 
 
32
  )
33
 
34
  # --- Google Gemini Configuration ---
 
35
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
36
 
37
+ # Model fallback list (in priority order)
38
# Ordered fallback chain of Gemini model configurations. Index 0 is the
# primary model; later entries are selected one at a time by
# switch_to_next_model() when the active model reports a quota error.
# "max_requests_per_minute" sizes the SimpleRateLimiter for that model.
# NOTE(review): "timeout" is not read anywhere in the visible code - confirm
# whether it is still needed.
GEMINI_MODELS = [
    {
        "name": "gemini-2.5-flash-image",  # PRIMARY - Recommended by Google
        "max_requests_per_minute": 50,  # Higher quota limit
        "timeout": 300,
        "description": "Primary model with higher quota",
    },
    {
        "name": "gemini-2.0-flash",  # Fallback
        "max_requests_per_minute": 15,
        "timeout": 300,
        "description": "Pro fallback",
    },
    {
        "name": "gemini-3-flash",  # Fallback
        "max_requests_per_minute": 15,
        "timeout": 300,
        "description": "Pro fallback",
    },
    {
        "name": "gemini-2.0-flash-exp",  # Last fallback - the original model choice
        "max_requests_per_minute": 9,  # Conservative (under 10 limit)
        "timeout": 300,
        "description": "Fallback experimental model",
    },
]

# The list used to be bound to ``MODELS`` while every reader in this module
# looks it up as ``GEMINI_MODELS``, which raised NameError at import time.
# The old name is kept as an alias for any external user.
MODELS = GEMINI_MODELS

# Mutable module state shared by the model-management helpers.
current_model_index = 0        # index into GEMINI_MODELS of the active model
gemini_model = None            # lazily created model instance for that index
last_quota_reset = None        # datetime of the last daily-quota bookkeeping reset
daily_quota_exhausted = False  # True once a daily quota error has been seen
69
+
70
+
71
+ # --- Rate Limiter Class ---
72
class SimpleRateLimiter:
    """Sliding-window request limiter.

    Allows at most ``max_requests`` calls to :meth:`allow_request` within any
    ``window_seconds`` interval. Timestamps come from ``time.monotonic()`` so
    the window is not distorted when the system clock is adjusted.
    """

    def __init__(self, max_requests: int = 10, window_seconds: float = 60) -> None:
        self.max_requests = max_requests
        self.window_seconds = window_seconds
        # Monotonic timestamps of accepted requests, oldest first.
        self.requests = deque()
        # Number of quota errors recorded via record_quota_error().
        self.quota_error_count = 0

    def _prune(self, now: float) -> None:
        """Drop timestamps that have fallen out of the window ending at *now*."""
        cutoff = now - self.window_seconds
        while self.requests and self.requests[0] < cutoff:
            self.requests.popleft()

    def allow_request(self) -> bool:
        """Record and allow a request if the window has room; else return False."""
        now = time.monotonic()
        self._prune(now)
        if len(self.requests) < self.max_requests:
            self.requests.append(now)
            return True
        return False

    def wait_time(self) -> float:
        """Return seconds until the oldest recorded request leaves the window.

        Returns 0 when nothing is recorded or the oldest entry already expired.
        """
        if not self.requests:
            return 0
        elapsed = time.monotonic() - self.requests[0]
        return max(0, self.window_seconds - elapsed)

    def reset(self) -> None:
        """Forget all recorded requests and clear the quota error counter."""
        self.requests.clear()
        self.quota_error_count = 0

    def record_quota_error(self) -> None:
        """Count one quota error reported by the remote API."""
        self.quota_error_count += 1
102
+
103
+
104
+ # Initialize rate limiter for current model
105
# Module-wide limiter sized for the currently selected model; the model
# switching helpers rebind this name whenever the active model changes.
gemini_rate_limiter = SimpleRateLimiter(
    window_seconds=60,
    max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
)
109
+
110
+
111
+ # --- Daily Quota Management ---
112
def check_daily_quota():
    """Report whether the daily quota is usable, resetting the flag on a new day.

    On the first call, or when the local calendar date is later than the date
    of the last reset, the exhausted flag is cleared and True is returned. On
    a date rollover the primary model is also re-selected. Otherwise the
    result is simply the negation of ``daily_quota_exhausted``.
    """
    global last_quota_reset, daily_quota_exhausted

    current_time = datetime.datetime.now()
    first_call = last_quota_reset is None

    # Same day as the last reset: nothing to refresh, just report the flag.
    if not first_call and current_time.date() <= last_quota_reset.date():
        return not daily_quota_exhausted

    if not first_call:
        print("🔄 Daily quota reset detected")

    last_quota_reset = current_time
    daily_quota_exhausted = False

    if not first_call:
        # A new day also means going back to the first model in the chain.
        reset_to_primary_model()

    return True
133
 
134
+
135
def mark_daily_quota_exhausted():
    """Set the daily-quota-exhausted flag and log when it is expected to clear."""
    global daily_quota_exhausted
    daily_quota_exhausted = True

    # The flag is cleared by check_daily_quota() once the local date changes,
    # so the announced reset time is the upcoming local midnight.
    tomorrow = datetime.datetime.now().date() + datetime.timedelta(days=1)
    upcoming_midnight = datetime.datetime.combine(tomorrow, datetime.time.min)
    reset_label = upcoming_midnight.strftime('%Y-%m-%d %H:%M')
    print(f"❌ Daily quota exhausted - resets at {reset_label}")
143
+
144
+
145
+ # --- Model Management Functions ---
146
def get_gemini_model():
    """Return the cached Gemini model for the active index, creating it if needed.

    Returns None (after printing the reason) when the SDK is missing, the API
    key is empty, the daily quota is marked exhausted, or model creation
    raises. The preconditions are evaluated in that order, so the quota check
    only runs when the SDK and key are present.
    """
    global gemini_model

    unavailable_reason = None
    if not GEMINI_AVAILABLE:
        unavailable_reason = "Gemini SDK not available"
    elif not GEMINI_API_KEY:
        unavailable_reason = "Warning: Gemini API key not found in environment variables."
    elif not check_daily_quota():
        unavailable_reason = "Daily quota exhausted, Gemini unavailable until reset"

    if unavailable_reason is not None:
        print(unavailable_reason)
        return None

    # Reuse the instance built earlier for the current model index.
    if gemini_model is not None:
        return gemini_model

    active_config = GEMINI_MODELS[current_model_index]
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel(active_config["name"])
        print(f"✓ Initialized: {active_config['name']} ({active_config['description']})")
    except Exception as init_error:
        print(f"Failed to initialize {active_config['name']}: {init_error}")
        return None

    return gemini_model
175
 
176
 
177
def switch_to_next_model():
    """Advance to the next entry of GEMINI_MODELS and return its model instance.

    Returns None when the active model is already the last one in the chain;
    otherwise returns whatever get_gemini_model() yields for the new index
    (which may itself be None if initialization fails).
    """
    global gemini_model, current_model_index, gemini_rate_limiter

    following_index = current_model_index + 1
    if following_index >= len(GEMINI_MODELS):
        print("❌ All models exhausted!")
        return None

    current_model_index = following_index
    next_config = GEMINI_MODELS[following_index]

    # Each model has its own per-minute allowance, so start a fresh limiter.
    gemini_rate_limiter = SimpleRateLimiter(
        max_requests=next_config["max_requests_per_minute"],
        window_seconds=60,
    )

    # Drop the cached instance so get_gemini_model() builds the new model.
    gemini_model = None

    print(f"🔄 SWITCHED TO MODEL: {next_config['name']} ({next_config['description']})")
    return get_gemini_model()
199
+
200
+
201
def reset_to_primary_model():
    """Re-select GEMINI_MODELS[0] as the active model.

    Returns True when a switch actually happened and False when the primary
    model was already active (in which case nothing is touched).
    """
    global gemini_model, current_model_index, gemini_rate_limiter

    if current_model_index == 0:
        return False

    previous_name = GEMINI_MODELS[current_model_index]['name']
    primary_config = GEMINI_MODELS[0]
    current_model_index = 0

    # Fresh limiter with the primary model's per-minute allowance.
    gemini_rate_limiter = SimpleRateLimiter(
        max_requests=primary_config["max_requests_per_minute"],
        window_seconds=60,
    )

    # Clear the cached instance; it is rebuilt on the next get_gemini_model().
    gemini_model = None
    print(f"🔄 Reset from {previous_name} to primary model: {primary_config['name']}")
    return True
219
+
220
+
221
  # --- Regex patterns for text-based PDF extraction ---
222
  INVOICE_NO_RE = re.compile(
223
  r"""
224
+ (?:
225
  Invoice\s*No\.?|
226
+ Inv\. ?\s*No\.?|
227
  Bill\s*No\.?|
228
+ Document\s*No\.?|
229
  Doc\s*No\.?|
230
+ Tax\s*Invoice\s*No\.?
231
  )
232
  \s*[:\-]?\s*
233
  ([A-Z0-9][A-Z0-9\-\/]{3,})
 
235
  re.IGNORECASE | re.VERBOSE
236
  )
237
 
 
238
# Prefix-style identifiers such as "WN-12345" or "AB/2024/7": 2-4 capital
# letters, a dash or slash, at least four digits, an optional "/digits"
# suffix and optional trailing capitals. Case-sensitive on purpose.
# The non-capturing group must be spelled "(?:" with no space; "(? :" makes
# re.compile() raise "unknown extension" and the module fail to import.
PREFIXED_INVOICE_RE = re.compile(
    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
)

# A GST label followed by a 15-character alphanumeric value.
# Group 1 is the whole labelled match, group 2 is the 15-character value.
GST_LIKE_RE = re.compile(
    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b",
    re.IGNORECASE,
)
246
 
247
 
248
def is_image_based_pdf(doc: "fitz.Document", sample_pages: int = 3) -> Tuple[bool, float]:
    """Classify a PDF as image-based or text-based by sampling its first pages.

    Args:
        doc: Open document exposing ``page_count`` and ``load_page(i)``.
        sample_pages: Maximum number of leading pages to inspect.

    Returns:
        ``(is_image_based, avg_text_length)`` where ``avg_text_length`` is the
        mean number of stripped text characters per sampled page and
        ``is_image_based`` is True when that mean is below 50.

    When there is nothing to sample (empty document or ``sample_pages`` <= 0)
    the average is reported as 0.0 instead of dividing by zero, so the result
    is ``(True, 0.0)``.
    """
    pages_to_check = min(sample_pages, doc.page_count)

    if pages_to_check <= 0:
        avg_text_length = 0.0
    else:
        total_text_length = sum(
            # get_text() may return None/empty for pages without a text layer.
            len((doc.load_page(i).get_text("text") or "").strip())
            for i in range(pages_to_check)
        )
        avg_text_length = total_text_length / pages_to_check

    is_image_based = avg_text_length < 50

    print(f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
    print(f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")

    return is_image_based, avg_text_length
267
 
268
 
269
  # ============================================================================
270
+ # TEXT-BASED PDF EXTRACTION
271
  # ============================================================================
272
 
 
273
  def normalize_text_for_search(s: str) -> str:
274
+ """Light normalization: collapse whitespace and normalize common separators."""
275
  if not s:
276
  return s
277
  s = s.replace("\u00A0", " ") # non-breaking space
 
281
 
282
 
283
def try_extract_invoice_from_text(text: str) -> Optional[str]:
    """Pull an invoice identifier out of free text, or return None.

    Three strategies are tried in order on the normalized text:
      1. a labelled number matched by INVOICE_NO_RE;
      2. a prefixed code matched by PREFIXED_INVOICE_RE in the first 600 chars;
      3. a GST value matched by GST_LIKE_RE, returned with a ``GST:`` prefix so
         callers can tell it apart from a real invoice number.
    """
    if not text:
        return None

    normalized = normalize_text_for_search(text)

    # 1) Labelled invoice number; reject captures that are just the label word.
    labelled = INVOICE_NO_RE.search(normalized)
    if labelled is not None:
        candidate = (labelled.group(1) or "").strip()
        if len(candidate) > 2 and candidate.lower() not in ("invoice", "inv", "bill"):
            return candidate

    # 2) Prefixed code near the top; require at least 5 alphanumeric characters.
    prefixed = PREFIXED_INVOICE_RE.search(normalized[:600])
    if prefixed is not None:
        candidate = (prefixed.group(1) or "").strip()
        alnum_only = re.sub(r"[^A-Za-z0-9]", "", candidate)
        if candidate and len(alnum_only) >= 5:
            return candidate

    # 3) Last resort: a 15-character GST value, upper-cased and tagged.
    gst_match = GST_LIKE_RE.search(normalized)
    if gst_match is not None:
        gst_value = (gst_match.group(2) or "").replace(" ", "").strip().upper()
        if len(gst_value) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_value):
            return f"GST:{gst_value}"

    return None
314
 
315
 
316
  def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
317
+ """Extract invoice number from TEXT-BASED PDF."""
 
 
 
318
  # Try full-page text
319
  text = page.get_text("text") or ""
320
  inv = try_extract_invoice_from_text(text)
 
333
 
334
 
335
  # ============================================================================
336
+ # IMAGE-BASED PDF EXTRACTION (Google Gemini with Auto-Switching)
337
  # ============================================================================
338
 
339
+ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
340
  """
341
+ Extract invoice number from IMAGE-BASED PDF using Google Gemini.
342
+ With automatic model switching on quota exhaustion.
343
  """
344
+ # Check daily quota first
345
+ if not check_daily_quota():
346
+ print(" ❌ Daily quota exhausted, skipping Gemini")
347
+ return None
348
+
349
  model = get_gemini_model()
350
  if not model:
351
  print(" Gemini model not available")
352
  return None
353
 
354
+ # Check rate limit
355
+ if not gemini_rate_limiter.allow_request():
356
+ wait_time = gemini_rate_limiter.wait_time()
357
+ print(f" ⏱ Rate limit reached, waiting {int(wait_time)}s...")
358
+ time.sleep(wait_time + 1)
359
+ return extract_invoice_gemini(page, retry_count)
360
+
361
  try:
362
  # Convert page to image
363
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x resolution
364
  img_bytes = pix.tobytes("png")
 
 
365
  img = Image.open(io.BytesIO(img_bytes))
366
 
367
+ # Prompt for Gemini
368
  prompt = """
369
+ Extract the invoice number from this image. Look for:
370
+ - Invoice No, Invoice Number, Bill No, Bill Number, Document No
371
  - Any alphanumeric code that appears to be an invoice identifier
372
  - Purchase Order numbers if no invoice number is found
373
 
 
375
  If no invoice number is found, return "NOT_FOUND".
376
  """
377
 
378
+ model_name = GEMINI_MODELS[current_model_index]["name"]
379
+ print(f" Calling Gemini API (model: {model_name})...")
380
+
381
  response = model.generate_content([prompt, img])
382
 
383
  if response and response.text:
 
385
  print(f" Gemini response: {extracted_text}")
386
 
387
  if extracted_text and extracted_text != "NOT_FOUND":
388
+ invoice_no = extracted_text. replace("*", "").replace("#", "").strip()
 
 
389
  if invoice_no and len(invoice_no) > 2:
390
+ print(f" ✓ Gemini found invoice: {invoice_no}")
391
  return invoice_no
392
 
393
  # Fallback: Get full OCR text and try regex
394
+ ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
395
  ocr_response = model.generate_content([ocr_prompt, img])
396
 
397
  if ocr_response and ocr_response.text:
398
+ print(f" Gemini extracted {len(ocr_response.text)} chars, trying regex...")
 
399
  inv = try_extract_invoice_from_text(ocr_response.text)
400
  if inv:
401
  print(f" ✓ Found via regex on Gemini text: {inv}")
 
404
  print(" ✗ Gemini: No invoice found")
405
  return None
406
 
407
+ except Exception as e:
408
+ error_str = str(e).lower()
409
+
410
+ # Handle quota exhausted errors
411
+ if "429" in str(e) or "quota" in error_str or "resource" in error_str:
412
+ print(f" ❌ QUOTA ERROR: {e}")
413
+ gemini_rate_limiter.record_quota_error()
414
+
415
+ # Check if it's daily quota
416
+ if "per_day" in error_str or "limit: 0" in str(e):
417
+ print(" ❌ DAILY quota exhausted")
418
+ mark_daily_quota_exhausted()
419
+ return None
420
+
421
+ # Per-minute quota - try switching model
422
+ if retry_count < len(GEMINI_MODELS) - 1:
423
+ print(f" 🔄 Switching to fallback model (attempt {retry_count + 1})...")
424
+ if switch_to_next_model():
425
+ time.sleep(2) # Brief delay before retry
426
+ return extract_invoice_gemini(page, retry_count + 1)
427
+
428
+ # Wait and retry once more with current model
429
+ if retry_count < len(GEMINI_MODELS):
430
+ retry_delay = 30
431
+ # Try to extract retry delay from error
432
+ import re as regex
433
+ match = regex.search(r'seconds:\s*(\d+)', str(e))
434
+ if match:
435
+ retry_delay = int(match.group(1)) + 2
436
+
437
+ print(f" ⏰ Waiting {retry_delay}s before final retry...")
438
+ time.sleep(retry_delay)
439
+ return extract_invoice_gemini(page, retry_count + 1)
440
+
441
+ print(" ❌ All retry attempts exhausted")
442
+ return None
443
+
444
+ # Other errors
445
  print(f" ✗ Gemini extraction failed: {e}")
446
  return None
447
 
 
452
 
453
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
454
  """Try text extraction first, then Gemini as fallback"""
 
455
  # ALWAYS try text extraction first (fast, no API cost)
456
  text_result = extract_invoice_text_based(page)
457
  if text_result:
 
459
  return text_result
460
 
461
  # If text fails AND PDF seems image-based, try Gemini
462
+ if is_image_pdf:
463
  gemini_result = extract_invoice_gemini(page)
464
  if gemini_result:
465
  print(f" ✓ Found via Gemini: {gemini_result}")
 
479
 
480
 
481
  # ============================================================================
482
+ # API ENDPOINTS
483
  # ============================================================================
484
 
485
  @app.post("/split-invoices")
486
  async def split_invoices(
487
  file: UploadFile = File(...),
488
  include_pdf: bool = Form(True),
489
+ initial_dpi: int = Form(300),
490
  ):
491
  """
492
  Split a multi-invoice PDF into separate PDFs based on invoice numbers.
493
+
494
+ Features:
495
+ - Text-based PDFs: Fast text extraction
496
+ - Image-based PDFs: Google Gemini with auto-model switching
497
+ - Auto-switches between models when quota exhausted
498
+ - Daily quota tracking with auto-reset
499
  """
500
  if not file.filename.lower().endswith(".pdf"):
501
  raise HTTPException(status_code=400, detail="only PDF is supported")
 
505
  raise HTTPException(status_code=400, detail="empty file")
506
 
507
  try:
508
+ doc = fitz. open(stream=file_bytes, filetype="pdf")
509
+ if doc. page_count == 0:
510
  raise HTTPException(status_code=400, detail="no pages found")
511
 
512
  print(f"\n{'='*60}")
513
  print(f"Processing PDF: {file.filename}")
514
  print(f"Total pages: {doc.page_count}")
515
+ if GEMINI_AVAILABLE:
516
+ model_status = GEMINI_MODELS[current_model_index]["name"]
517
+ print(f"Current Gemini model: {model_status}")
518
+ print(f"Daily quota exhausted: {daily_quota_exhausted}")
519
  print(f"{'='*60}")
520
 
521
+ # Step 1: Detect PDF type
522
  is_image_pdf, avg_text_len = is_image_based_pdf(doc)
523
 
524
  if is_image_pdf and not get_gemini_model():
525
+ if daily_quota_exhausted:
526
+ raise HTTPException(
527
+ status_code=429,
528
+ detail="Image-based PDF detected but Gemini API daily quota is exhausted. "
529
+ "Please try again tomorrow or use text-based PDFs."
530
+ )
531
+ else:
532
+ raise HTTPException(
533
+ status_code=500,
534
+ detail="Image-based PDF detected but Google Gemini is not configured. "
535
+ "Please add GEMINI_API_KEY to your environment variables."
536
+ )
537
 
538
  # Step 2: Extract invoice numbers from each page
539
+ page_invoice_nos: List[Optional[str]] = []
540
  for i in range(doc.page_count):
541
  print(f"\n--- Page {i+1}/{doc.page_count} ---")
542
+ inv = extract_invoice_no_from_page(doc. load_page(i), is_image_pdf)
543
+ if inv:
 
544
  print(f" ✓ Raw extracted id: {inv}")
545
  else:
546
+ print(f" ✗ No invoice found")
547
  page_invoice_nos.append(inv)
548
 
549
  print(f"\n{'='*60}")
550
  print(f"Raw Extraction Results: {page_invoice_nos}")
551
  print(f"{'='*60}")
552
 
553
+ # Step 3: Filter GST values
554
+ page_invoice_nos_filtered: List[Optional[str]] = []
555
+ for v in page_invoice_nos:
 
 
 
 
 
556
  if v is None:
557
  page_invoice_nos_filtered.append(None)
558
  else:
 
559
  if isinstance(v, str) and v.upper().startswith("GST:"):
560
  page_invoice_nos_filtered.append(None)
561
  else:
562
+ page_invoice_nos_filtered. append(v)
563
 
564
  print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
565
 
566
+ # Step 4: Group pages by invoice number
567
  groups: List[Dict] = []
568
+ current_group_pages: List[int] = []
569
+ current_invoice: Optional[str] = None
570
 
571
  for idx, inv in enumerate(page_invoice_nos_filtered):
572
  if current_invoice is None:
 
573
  current_invoice = inv
574
  current_group_pages = [idx]
575
  else:
 
576
  if inv is not None and inv != current_invoice:
577
  groups.append({
578
  "invoice_no": current_invoice,
579
+ "pages": current_group_pages[: ],
580
  })
581
  current_invoice = inv
582
  current_group_pages = [idx]
583
  else:
 
584
  current_group_pages.append(idx)
585
 
586
  # Save last group
587
  if current_group_pages:
588
  groups.append({
589
+ "invoice_no": current_invoice,
590
  "pages": current_group_pages[:]
591
  })
592
 
593
+ # Post-process groups
 
594
  if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
595
  groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
596
+ groups. pop(0)
597
 
 
598
  if all(g["invoice_no"] is None for g in groups):
599
+ print("\n⚠ Warning: No invoices detected in any page!")
600
  print(" Returning entire PDF as single part")
601
  groups = [{
602
  "invoice_no": None,
603
  "pages": list(range(doc.page_count))
604
  }]
605
 
606
+ # Step 5: Build response parts
607
  parts = []
608
  for idx, g in enumerate(groups):
609
  part_bytes = build_pdf_from_pages(doc, g["pages"])
610
  info = {
 
611
  "invoice_no": g["invoice_no"],
612
+ "pages": [p + 1 for p in g["pages"]],
613
  "num_pages": len(g["pages"]),
614
  "size_bytes": len(part_bytes),
615
  }
616
  if include_pdf:
617
+ info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
 
618
  parts.append(info)
619
  print(f"\nPart {idx+1}:")
620
  print(f" Invoice: {g['invoice_no']}")
 
630
  return JSONResponse({
631
  "count": len(parts),
632
  "pdf_type": "image-based" if is_image_pdf else "text-based",
633
+ "current_model": GEMINI_MODELS[current_model_index]["name"] if GEMINI_AVAILABLE else None,
634
+ "quota_status": {
635
+ "daily_exhausted": daily_quota_exhausted,
636
+ "current_model_index": current_model_index,
637
+ "total_models": len(GEMINI_MODELS)
638
+ },
639
  "parts": parts
640
  })
641
 
642
+ except HTTPException:
643
  raise
644
  except Exception as e:
645
+ print(f"\n✗ Error: {str(e)}")
646
  import traceback
647
  traceback.print_exc()
648
  return JSONResponse({"error": str(e)}, status_code=500)
 
@app.get("/health")
async def health_check():
    """Report service liveness and the state of the Gemini integration.

    Returns:
        A dict whose ``status`` is always ``"healthy"``. ``gemini_status`` is
        ``"configured"`` only when ``GEMINI_AVAILABLE`` is truthy and
        ``get_gemini_model()`` returns a truthy value; otherwise it is
        ``"not available"`` and ``current_model`` is ``None``.
    """
    # Short-circuit: get_gemini_model() is only called when the SDK flag is set.
    gemini_ready = bool(GEMINI_AVAILABLE and get_gemini_model())

    if gemini_ready:
        status_label = "configured"
        # Read the index after get_gemini_model() has run, as the original did.
        active_name = GEMINI_MODELS[current_model_index]["name"]
    else:
        status_label = "not available"
        active_name = None

    report = {
        "status": "healthy",
        "gemini_available": GEMINI_AVAILABLE,
        "gemini_status": status_label,
        "current_model": active_name,
        "current_model_index": current_model_index,
        "total_models": len(GEMINI_MODELS),
        "daily_quota_exhausted": daily_quota_exhausted,
        # The limiter is only consulted when the SDK flag is set.
        "quota_errors": gemini_rate_limiter.quota_error_count if GEMINI_AVAILABLE else 0,
    }
    return report
671
 
672
+
@app.post("/admin/reset-model")
async def admin_reset_model():
    """Ask ``reset_to_primary_model()`` to switch back to the primary model.

    Returns:
        A dict with ``message``, ``current_model`` and ``status``. When the
        reset helper returns a truthy value the status is ``"success"``;
        otherwise the response says the primary model is already active and
        the status is ``"info"``.
    """
    # Call the helper first so the model name below reflects any switch it made.
    was_reset = reset_to_primary_model()

    if was_reset:
        message, outcome = "Successfully reset to primary model", "success"
    else:
        message, outcome = "Already on primary model", "info"

    return {
        "message": message,
        "current_model": GEMINI_MODELS[current_model_index]["name"],
        "status": outcome,
    }
688
+
689
+
@app.get("/status")
async def get_status():
    """Return a detailed snapshot of the Gemini model list and quota state.

    Returns:
        A dict with four keys:

        * ``current_model`` - name, description, index and ``max_rpm`` of the
          entry at ``current_model_index`` in ``GEMINI_MODELS``.
        * ``all_models`` - one entry per configured model, with ``is_active``
          marking the one at ``current_model_index``.
        * ``quota_status`` - the daily-exhausted flag, the ISO-formatted
          ``last_quota_reset`` (``None`` when unset) and the limiter's
          ``quota_error_count``.
        * ``timestamp`` - ``datetime.datetime.now()`` in ISO format.
    """
    active = GEMINI_MODELS[current_model_index]

    active_summary = {
        "name": active["name"],
        "description": active["description"],
        "index": current_model_index,
        "max_rpm": active["max_requests_per_minute"],
    }

    catalogue = [
        {
            "name": entry["name"],
            "description": entry["description"],
            "max_rpm": entry["max_requests_per_minute"],
            "is_active": position == current_model_index,
        }
        for position, entry in enumerate(GEMINI_MODELS)
    ]

    quota_summary = {
        "daily_exhausted": daily_quota_exhausted,
        # A falsy last_quota_reset is reported as None rather than formatted.
        "last_reset": last_quota_reset.isoformat() if last_quota_reset else None,
        "quota_errors": gemini_rate_limiter.quota_error_count,
    }

    return {
        "current_model": active_summary,
        "all_models": catalogue,
        "quota_status": quota_summary,
        "timestamp": datetime.datetime.now().isoformat(),
    }
716
+
717
+
if __name__ == "__main__":
    # Script entry point: print a start-up banner listing every configured
    # Gemini model, then serve the FastAPI app with uvicorn.
    import uvicorn

    divider = "=" * 80

    print(divider)
    print("🚀 Starting Invoice Splitter API")
    print(divider)
    print("📋 Available Gemini Models:")
    for rank, entry in enumerate(GEMINI_MODELS):
        # Index 0 is labelled as the primary model; the rest are numbered fallbacks.
        label = f"🔄 FALLBACK {rank}" if rank != 0 else "🎯 PRIMARY"
        print(f" {label}: {entry['name']} - {entry['description']}")
        print(f" Rate Limit: {entry['max_requests_per_minute']} req/min")
    print(divider)

    # Listen on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)