anujakkulkarni commited on
Commit
5e18860
·
verified ·
1 Parent(s): c1e3bdc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -459
app.py CHANGED
@@ -4,10 +4,13 @@ import re
4
  import base64
5
  import time
6
  import datetime
 
 
7
  from typing import List, Dict, Optional, Tuple
8
  from collections import deque
 
9
 
10
- from fastapi import FastAPI, File, UploadFile, Form, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import JSONResponse
13
  import fitz # PyMuPDF
@@ -17,9 +20,9 @@ try:
17
  import google.generativeai as genai
18
  from PIL import Image
19
  GEMINI_AVAILABLE = True
20
- except ImportError:
21
  GEMINI_AVAILABLE = False
22
- print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
23
 
24
  app = FastAPI(title="Invoice Splitter API")
25
 
@@ -37,28 +40,22 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
37
  # Model fallback list (in priority order)
38
  GEMINI_MODELS = [
39
  {
40
- "name": "gemini-2.5-flash-image", # PRIMARY - Recommended by Google
41
- "max_requests_per_minute": 50, # Higher quota limit
42
- "timeout": 300,
43
- "description": "Primary model with higher quota"
44
- },
45
- {
46
- "name": "gemini-2.0-flash", # Fallback
47
  "max_requests_per_minute": 15,
48
  "timeout": 300,
49
- "description": "Pro fallback"
50
  },
51
  {
52
- "name": "gemini-3-flash", # Fallback
53
- "max_requests_per_minute": 15,
54
  "timeout": 300,
55
- "description": "Pro fallback"
56
  },
57
  {
58
- "name": "gemini-2.0-flash-exp", # FALLBACK 1 - Your original choice
59
- "max_requests_per_minute": 9, # Conservative (under 10 limit)
60
  "timeout": 300,
61
- "description": "Fallback experimental model"
62
  }
63
  ]
64
 
@@ -78,7 +75,6 @@ class SimpleRateLimiter:
78
 
79
  def allow_request(self):
80
  now = time.time()
81
- # Remove old requests outside time window
82
  while self.requests and self.requests[0] < now - self.window_seconds:
83
  self.requests.popleft()
84
 
@@ -94,14 +90,13 @@ class SimpleRateLimiter:
94
  return max(0, self.window_seconds - (time.time() - oldest))
95
 
96
  def reset(self):
97
- self.requests. clear()
98
  self.quota_error_count = 0
99
 
100
  def record_quota_error(self):
101
  self.quota_error_count += 1
102
 
103
 
104
- # Initialize rate limiter for current model
105
  gemini_rate_limiter = SimpleRateLimiter(
106
  max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
107
  window_seconds=60
@@ -110,9 +105,7 @@ gemini_rate_limiter = SimpleRateLimiter(
110
 
111
  # --- Daily Quota Management ---
112
  def check_daily_quota():
113
- """Check if we should reset daily quota flag."""
114
  global last_quota_reset, daily_quota_exhausted
115
-
116
  now = datetime.datetime.now()
117
 
118
  if last_quota_reset is None:
@@ -120,12 +113,10 @@ def check_daily_quota():
120
  daily_quota_exhausted = False
121
  return True
122
 
123
- # Reset at midnight
124
  if now.date() > last_quota_reset.date():
125
  print("🔄 Daily quota reset detected")
126
  last_quota_reset = now
127
  daily_quota_exhausted = False
128
- # Also reset to primary model
129
  reset_to_primary_model()
130
  return True
131
 
@@ -133,349 +124,189 @@ def check_daily_quota():
133
 
134
 
135
  def mark_daily_quota_exhausted():
136
- """Mark daily quota as exhausted."""
137
  global daily_quota_exhausted
138
  daily_quota_exhausted = True
139
- next_reset = (datetime.datetime.now() + datetime.timedelta(days=1)).replace(
140
- hour=0, minute=0, second=0
141
- )
142
- print(f"❌ Daily quota exhausted - resets at {next_reset. strftime('%Y-%m-%d %H:%M')}")
143
 
144
 
145
- # --- Model Management Functions ---
146
  def get_gemini_model():
147
- """Get or create Gemini model instance with auto-fallback."""
148
  global gemini_model, current_model_index
149
-
150
- if not GEMINI_AVAILABLE:
151
- print("Gemini SDK not available")
152
- return None
153
-
154
- if not GEMINI_API_KEY:
155
- print("Warning: Gemini API key not found in environment variables.")
156
  return None
157
-
158
- # Check daily quota first
159
  if not check_daily_quota():
160
- print("Daily quota exhausted, Gemini unavailable until reset")
161
  return None
162
 
163
- # Try to initialize model if not already done
164
  if gemini_model is None:
165
  model_config = GEMINI_MODELS[current_model_index]
166
  try:
167
  genai.configure(api_key=GEMINI_API_KEY)
168
  gemini_model = genai.GenerativeModel(model_config["name"])
169
- print(f"✓ Initialized: {model_config['name']} ({model_config['description']})")
170
- except Exception as e:
171
  print(f"Failed to initialize {model_config['name']}: {e}")
172
  return None
173
-
174
  return gemini_model
175
 
176
 
177
  def switch_to_next_model():
178
- """Switch to next available model in fallback chain."""
179
  global gemini_model, current_model_index, gemini_rate_limiter
180
-
181
  if current_model_index < len(GEMINI_MODELS) - 1:
182
  current_model_index += 1
183
  model_config = GEMINI_MODELS[current_model_index]
184
-
185
- # Reset rate limiter with new model's limits
186
  gemini_rate_limiter = SimpleRateLimiter(
187
  max_requests=model_config["max_requests_per_minute"],
188
  window_seconds=60
189
  )
190
-
191
- # Force reinitialization
192
  gemini_model = None
193
-
194
- print(f"🔄 SWITCHED TO MODEL: {model_config['name']} ({model_config['description']})")
195
  return get_gemini_model()
196
- else:
197
- print("❌ All models exhausted!")
198
- return None
199
 
200
 
201
  def reset_to_primary_model():
202
- """Reset back to primary model."""
203
  global gemini_model, current_model_index, gemini_rate_limiter
204
-
205
  if current_model_index != 0:
206
- old_model = GEMINI_MODELS[current_model_index]['name']
207
  current_model_index = 0
208
  model_config = GEMINI_MODELS[0]
209
-
210
  gemini_rate_limiter = SimpleRateLimiter(
211
  max_requests=model_config["max_requests_per_minute"],
212
  window_seconds=60
213
  )
214
-
215
  gemini_model = None
216
- print(f"🔄 Reset from {old_model} to primary model: {model_config['name']}")
217
  return True
218
  return False
219
 
220
 
221
- # --- Regex patterns for text-based PDF extraction ---
222
  INVOICE_NO_RE = re.compile(
223
- r"""
224
- (?:
225
- Invoice\s*No\.?|
226
- Inv\. ?\s*No\.?|
227
- Bill\s*No\.?|
228
- Document\s*No\.?|
229
- Doc\s*No\.?|
230
- Tax\s*Invoice\s*No\.?
231
- )
232
- \s*[:\-]?\s*
233
- ([A-Z0-9][A-Z0-9\-\/]{3,})
234
- """,
235
  re.IGNORECASE | re.VERBOSE
236
  )
237
-
238
- PREFIXED_INVOICE_RE = re.compile(
239
- r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
240
- )
241
-
242
- GST_LIKE_RE = re.compile(
243
- r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b",
244
- re.IGNORECASE
245
- )
246
 
247
 
248
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
249
- """
250
- Detect if PDF is image-based or text-based by sampling pages.
251
- Returns (is_image_based, avg_text_length).
252
- """
253
  total_text_length = 0
254
  pages_to_check = min(sample_pages, doc.page_count)
255
-
256
  for i in range(pages_to_check):
257
  text = doc.load_page(i).get_text("text") or ""
258
- total_text_length += len(text. strip())
259
-
260
  avg_text_length = total_text_length / pages_to_check
261
- is_image_based = avg_text_length < 50
262
-
263
- print(f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
264
- print(f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
265
-
266
- return is_image_based, avg_text_length
267
 
268
 
269
- # ============================================================================
270
- # TEXT-BASED PDF EXTRACTION
271
- # ============================================================================
272
-
273
  def normalize_text_for_search(s: str) -> str:
274
- """Light normalization: collapse whitespace and normalize common separators."""
275
- if not s:
276
- return s
277
- s = s.replace("\u00A0", " ") # non-breaking space
278
- s = re.sub(r"[\r\n\t]+", " ", s)
279
- s = re.sub(r"[ ]{2,}", " ", s).strip()
280
- return s
281
 
282
 
283
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
284
- """Extract invoice number from text using regex patterns."""
285
- if not text:
286
- return None
287
-
288
  text_norm = normalize_text_for_search(text)
289
-
290
- # 1) Labeled invoice like "Invoice No", "Inv No."
291
- m = INVOICE_NO_RE. search(text_norm)
292
  if m:
293
  inv = (m.group(1) or "").strip()
294
- if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
295
  return inv
296
-
297
- # 2) Search top portion for prefixed invoice codes
298
- top_text = text_norm[: 600]
299
- m = PREFIXED_INVOICE_RE.search(top_text)
300
  if m:
301
  inv = (m.group(1) or "").strip()
302
  if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
303
  return inv
304
-
305
- # 3) Last-resort: GST detection
306
  gm = GST_LIKE_RE.search(text_norm)
307
  if gm:
308
- gst_val = gm.group(2) or ""
309
- gst_val = gst_val.replace(" ", "").strip().upper()
310
- if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
311
  return f"GST:{gst_val}"
312
-
313
- return None
314
-
315
-
316
- def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
317
- """Extract invoice number from TEXT-BASED PDF."""
318
- # Try full-page text
319
- text = page.get_text("text") or ""
320
- inv = try_extract_invoice_from_text(text)
321
- if inv:
322
- return inv
323
-
324
- # Try block-level text
325
- for block in (page.get_text("blocks") or []):
326
- block_text = block[4] if len(block) > 4 else ""
327
- if block_text:
328
- inv = try_extract_invoice_from_text(block_text)
329
- if inv:
330
- return inv
331
-
332
  return None
333
 
334
 
335
- # ============================================================================
336
- # IMAGE-BASED PDF EXTRACTION (Google Gemini with Auto-Switching)
337
- # ============================================================================
338
-
339
  def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
340
- """
341
- Extract invoice number from IMAGE-BASED PDF using Google Gemini.
342
- With automatic model switching on quota exhaustion.
343
- """
344
- # Check daily quota first
345
- if not check_daily_quota():
346
- print(" ❌ Daily quota exhausted, skipping Gemini")
347
- return None
348
-
349
  model = get_gemini_model()
350
- if not model:
351
- print(" Gemini model not available")
352
- return None
353
 
354
- # Check rate limit
355
  if not gemini_rate_limiter.allow_request():
356
  wait_time = gemini_rate_limiter.wait_time()
357
- print(f" ⏱ Rate limit reached, waiting {int(wait_time)}s...")
358
  time.sleep(wait_time + 1)
359
  return extract_invoice_gemini(page, retry_count)
360
 
361
  try:
362
- # Convert page to image
363
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x resolution
364
  img_bytes = pix.tobytes("png")
365
  img = Image.open(io.BytesIO(img_bytes))
366
 
367
- # Prompt for Gemini
368
- prompt = """
369
- Extract the invoice number from this image. Look for:
370
- - Invoice No, Invoice Number, Bill No, Bill Number, Document No
371
- - Any alphanumeric code that appears to be an invoice identifier
372
- - Purchase Order numbers if no invoice number is found
373
-
374
- Return ONLY the invoice number/identifier itself, nothing else.
375
- If no invoice number is found, return "NOT_FOUND".
376
- """
377
-
378
- model_name = GEMINI_MODELS[current_model_index]["name"]
379
- print(f" Calling Gemini API (model: {model_name})...")
380
 
381
  response = model.generate_content([prompt, img])
382
-
383
  if response and response.text:
384
- extracted_text = response.text.strip()
385
- print(f" Gemini response: {extracted_text}")
386
-
387
- if extracted_text and extracted_text != "NOT_FOUND":
388
- invoice_no = extracted_text. replace("*", "").replace("#", "").strip()
389
- if invoice_no and len(invoice_no) > 2:
390
- print(f" ✓ Gemini found invoice: {invoice_no}")
391
- return invoice_no
392
-
393
- # Fallback: Get full OCR text and try regex
394
- ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
395
- ocr_response = model.generate_content([ocr_prompt, img])
396
-
397
- if ocr_response and ocr_response.text:
398
- print(f" Gemini extracted {len(ocr_response.text)} chars, trying regex...")
399
- inv = try_extract_invoice_from_text(ocr_response.text)
400
- if inv:
401
- print(f" ✓ Found via regex on Gemini text: {inv}")
402
- return inv
403
-
404
- print(" ✗ Gemini: No invoice found")
405
  return None
406
 
407
- except Exception as e:
408
  error_str = str(e).lower()
409
-
410
- # Handle quota exhausted errors
411
- if "429" in str(e) or "quota" in error_str or "resource" in error_str:
412
- print(f" ❌ QUOTA ERROR: {e}")
413
  gemini_rate_limiter.record_quota_error()
414
-
415
- # Check if it's daily quota
416
- if "per_day" in error_str or "limit: 0" in str(e):
417
- print(" ❌ DAILY quota exhausted")
418
  mark_daily_quota_exhausted()
419
  return None
420
-
421
- # Per-minute quota - try switching model
422
  if retry_count < len(GEMINI_MODELS) - 1:
423
- print(f" 🔄 Switching to fallback model (attempt {retry_count + 1})...")
424
  if switch_to_next_model():
425
- time.sleep(2) # Brief delay before retry
426
  return extract_invoice_gemini(page, retry_count + 1)
427
-
428
- # Wait and retry once more with current model
429
- if retry_count < len(GEMINI_MODELS):
430
- retry_delay = 30
431
- # Try to extract retry delay from error
432
- import re as regex
433
- match = regex.search(r'seconds:\s*(\d+)', str(e))
434
- if match:
435
- retry_delay = int(match.group(1)) + 2
436
-
437
- print(f" ⏰ Waiting {retry_delay}s before final retry...")
438
- time.sleep(retry_delay)
439
- return extract_invoice_gemini(page, retry_count + 1)
440
-
441
- print(" ❌ All retry attempts exhausted")
442
- return None
443
-
444
- # Other errors
445
- print(f" ✗ Gemini extraction failed: {e}")
446
  return None
447
 
448
 
449
- # ============================================================================
450
- # UNIFIED EXTRACTION LOGIC
451
- # ============================================================================
452
-
453
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
454
- """Try text extraction first, then Gemini as fallback"""
455
- # ALWAYS try text extraction first (fast, no API cost)
456
- text_result = extract_invoice_text_based(page)
457
- if text_result:
458
- print(f" ✓ Found via text extraction: {text_result}")
459
- return text_result
460
-
461
- # If text fails AND PDF seems image-based, try Gemini
462
- if is_image_pdf:
463
- gemini_result = extract_invoice_gemini(page)
464
- if gemini_result:
465
- print(f" ✓ Found via Gemini: {gemini_result}")
466
- return gemini_result
467
 
 
 
 
 
468
  return None
469
 
470
 
471
  def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
472
- """Create a new PDF with the given pages (0-based indices)."""
473
  out = fitz.open()
474
  for i in page_indices:
475
  out.insert_pdf(src_doc, from_page=i, to_page=i)
476
- pdf_bytes = out.tobytes()
477
- out.close()
478
- return pdf_bytes
 
 
 
 
 
 
 
479
 
480
 
481
  # ============================================================================
@@ -484,248 +315,119 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
484
 
485
  @app.post("/split-invoices")
486
  async def split_invoices(
 
487
  file: UploadFile = File(...),
488
  include_pdf: bool = Form(True),
489
- initial_dpi: int = Form(300),
490
  ):
491
- """
492
- Split a multi-invoice PDF into separate PDFs based on invoice numbers.
493
-
494
- Features:
495
- - Text-based PDFs: Fast text extraction
496
- - Image-based PDFs: Google Gemini with auto-model switching
497
- - Auto-switches between models when quota exhausted
498
- - Daily quota tracking with auto-reset
499
- """
500
  if not file.filename.lower().endswith(".pdf"):
501
- raise HTTPException(status_code=400, detail="only PDF is supported")
502
 
503
- file_bytes = await file.read()
504
- if not file_bytes:
505
- raise HTTPException(status_code=400, detail="empty file")
 
506
 
507
  try:
508
- doc = fitz. open(stream=file_bytes, filetype="pdf")
509
- if doc. page_count == 0:
510
- raise HTTPException(status_code=400, detail="no pages found")
511
-
512
- print(f"\n{'='*60}")
513
- print(f"Processing PDF: {file.filename}")
514
- print(f"Total pages: {doc.page_count}")
515
- if GEMINI_AVAILABLE:
516
- model_status = GEMINI_MODELS[current_model_index]["name"]
517
- print(f"Current Gemini model: {model_status}")
518
- print(f"Daily quota exhausted: {daily_quota_exhausted}")
519
- print(f"{'='*60}")
520
-
521
- # Step 1: Detect PDF type
522
- is_image_pdf, avg_text_len = is_image_based_pdf(doc)
523
-
524
- if is_image_pdf and not get_gemini_model():
525
- if daily_quota_exhausted:
526
- raise HTTPException(
527
- status_code=429,
528
- detail="Image-based PDF detected but Gemini API daily quota is exhausted. "
529
- "Please try again tomorrow or use text-based PDFs."
530
- )
531
- else:
532
- raise HTTPException(
533
- status_code=500,
534
- detail="Image-based PDF detected but Google Gemini is not configured. "
535
- "Please add GEMINI_API_KEY to your environment variables."
536
- )
537
-
538
- # Step 2: Extract invoice numbers from each page
539
- page_invoice_nos: List[Optional[str]] = []
540
- for i in range(doc.page_count):
541
- print(f"\n--- Page {i+1}/{doc.page_count} ---")
542
- inv = extract_invoice_no_from_page(doc. load_page(i), is_image_pdf)
543
- if inv:
544
- print(f" ✓ Raw extracted id: {inv}")
545
- else:
546
- print(f" ✗ No invoice found")
547
- page_invoice_nos.append(inv)
548
-
549
- print(f"\n{'='*60}")
550
- print(f"Raw Extraction Results: {page_invoice_nos}")
551
- print(f"{'='*60}")
552
-
553
- # Step 3: Filter GST values
554
- page_invoice_nos_filtered: List[Optional[str]] = []
555
- for v in page_invoice_nos:
556
- if v is None:
557
- page_invoice_nos_filtered.append(None)
558
- else:
559
- if isinstance(v, str) and v.upper().startswith("GST:"):
560
- page_invoice_nos_filtered.append(None)
561
- else:
562
- page_invoice_nos_filtered. append(v)
563
 
564
- print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
 
 
 
 
565
 
566
- # Step 4: Group pages by invoice number
567
- groups: List[Dict] = []
568
- current_group_pages: List[int] = []
569
- current_invoice: Optional[str] = None
570
 
571
- for idx, inv in enumerate(page_invoice_nos_filtered):
572
- if current_invoice is None:
573
- current_invoice = inv
574
- current_group_pages = [idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  else:
576
- if inv is not None and inv != current_invoice:
577
- groups.append({
578
- "invoice_no": current_invoice,
579
- "pages": current_group_pages[: ],
580
- })
581
- current_invoice = inv
582
- current_group_pages = [idx]
583
  else:
584
- current_group_pages.append(idx)
585
-
586
- # Save last group
587
- if current_group_pages:
588
- groups.append({
589
- "invoice_no": current_invoice,
590
- "pages": current_group_pages[:]
591
- })
592
 
593
- # Post-process groups
594
  if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
595
  groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
596
- groups. pop(0)
597
-
598
- if all(g["invoice_no"] is None for g in groups):
599
- print("\n⚠ Warning: No invoices detected in any page!")
600
- print(" Returning entire PDF as single part")
601
- groups = [{
602
- "invoice_no": None,
603
- "pages": list(range(doc.page_count))
604
- }]
605
 
606
- # Step 5: Build response parts
607
  parts = []
608
- for idx, g in enumerate(groups):
 
609
  part_bytes = build_pdf_from_pages(doc, g["pages"])
610
  info = {
611
  "invoice_no": g["invoice_no"],
612
  "pages": [p + 1 for p in g["pages"]],
613
- "num_pages": len(g["pages"]),
614
- "size_bytes": len(part_bytes),
615
  }
616
  if include_pdf:
617
  info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
618
  parts.append(info)
619
- print(f"\nPart {idx+1}:")
620
- print(f" Invoice: {g['invoice_no']}")
621
- print(f" Pages: {info['pages']}")
622
- print(f" Size: {len(part_bytes):,} bytes")
623
 
624
  doc.close()
625
-
626
- print(f"\n{'='*60}")
627
- print(f"✓ Successfully split into {len(parts)} part(s)")
628
- print(f"{'='*60}\n")
629
-
630
  return JSONResponse({
631
  "count": len(parts),
632
- "pdf_type": "image-based" if is_image_pdf else "text-based",
633
- "current_model": GEMINI_MODELS[current_model_index]["name"] if GEMINI_AVAILABLE else None,
634
- "quota_status": {
635
- "daily_exhausted": daily_quota_exhausted,
636
- "current_model_index": current_model_index,
637
- "total_models": len(GEMINI_MODELS)
638
- },
639
- "parts": parts
640
  })
641
 
642
- except HTTPException:
643
- raise
644
  except Exception as e:
645
- print(f"\n✗ Error: {str(e)}")
646
  import traceback
647
  traceback.print_exc()
648
  return JSONResponse({"error": str(e)}, status_code=500)
649
-
650
-
651
- @app.get("/health")
652
- async def health_check():
653
- """Health check endpoint to verify Gemini configuration."""
654
- gemini_status = "not available"
655
- current_model_name = None
656
-
657
- if GEMINI_AVAILABLE and get_gemini_model():
658
- gemini_status = "configured"
659
- current_model_name = GEMINI_MODELS[current_model_index]["name"]
660
-
661
- return {
662
- "status": "healthy",
663
- "gemini_available": GEMINI_AVAILABLE,
664
- "gemini_status": gemini_status,
665
- "current_model": current_model_name,
666
- "current_model_index": current_model_index,
667
- "total_models": len(GEMINI_MODELS),
668
- "daily_quota_exhausted": daily_quota_exhausted,
669
- "quota_errors": gemini_rate_limiter.quota_error_count if GEMINI_AVAILABLE else 0,
670
- }
671
-
672
-
673
- @app.post("/admin/reset-model")
674
- async def admin_reset_model():
675
- """Reset to primary Gemini model."""
676
- if reset_to_primary_model():
677
- return {
678
- "message": "Successfully reset to primary model",
679
- "current_model": GEMINI_MODELS[current_model_index]["name"],
680
- "status": "success"
681
- }
682
- else:
683
- return {
684
- "message": "Already on primary model",
685
- "current_model": GEMINI_MODELS[current_model_index]["name"],
686
- "status": "info"
687
- }
688
-
689
-
690
- @app. get("/status")
691
- async def get_status():
692
- """Get detailed status of Gemini models and quota."""
693
- return {
694
- "current_model": {
695
- "name": GEMINI_MODELS[current_model_index]["name"],
696
- "description": GEMINI_MODELS[current_model_index]["description"],
697
- "index": current_model_index,
698
- "max_rpm": GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
699
- },
700
- "all_models": [
701
- {
702
- "name": m["name"],
703
- "description": m["description"],
704
- "max_rpm": m["max_requests_per_minute"],
705
- "is_active": i == current_model_index
706
- }
707
- for i, m in enumerate(GEMINI_MODELS)
708
- ],
709
- "quota_status": {
710
- "daily_exhausted": daily_quota_exhausted,
711
- "last_reset": last_quota_reset. isoformat() if last_quota_reset else None,
712
- "quota_errors": gemini_rate_limiter.quota_error_count,
713
- },
714
- "timestamp": datetime.datetime.now().isoformat()
715
- }
716
 
717
 
718
  if __name__ == "__main__":
719
  import uvicorn
720
-
721
- print("="*80)
722
- print("🚀 Starting Invoice Splitter API")
723
- print("="*80)
724
- print(f"📋 Available Gemini Models:")
725
- for i, model in enumerate(GEMINI_MODELS):
726
- prefix = "🎯 PRIMARY" if i == 0 else f"🔄 FALLBACK {i}"
727
- print(f" {prefix}: {model['name']} - {model['description']}")
728
- print(f" Rate Limit: {model['max_requests_per_minute']} req/min")
729
- print("="*80)
730
-
731
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
4
  import base64
5
  import time
6
  import datetime
7
+ import shutil
8
+ import tempfile
9
  from typing import List, Dict, Optional, Tuple
10
  from collections import deque
11
+ from pathlib import Path
12
 
13
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
14
  from fastapi.middleware.cors import CORSMiddleware
15
  from fastapi.responses import JSONResponse
16
  import fitz # PyMuPDF
 
20
  import google.generativeai as genai
21
  from PIL import Image
22
  GEMINI_AVAILABLE = True
23
+ except ImportError:
24
  GEMINI_AVAILABLE = False
25
+ print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
26
 
27
  app = FastAPI(title="Invoice Splitter API")
28
 
 
40
  # Model fallback list (in priority order)
41
  GEMINI_MODELS = [
42
  {
43
+ "name": "gemini-1.5-flash", # UPDATED: Current standard fast model
 
 
 
 
 
 
44
  "max_requests_per_minute": 15,
45
  "timeout": 300,
46
+ "description": "Primary fast model"
47
  },
48
  {
49
+ "name": "gemini-2.0-flash-exp", # Fallback experimental
50
+ "max_requests_per_minute": 10,
51
  "timeout": 300,
52
+ "description": "Experimental fallback"
53
  },
54
  {
55
+ "name": "gemini-1.5-pro", # Slower fallback
56
+ "max_requests_per_minute": 2,
57
  "timeout": 300,
58
+ "description": "Pro fallback (slower)"
59
  }
60
  ]
61
 
 
75
 
76
  def allow_request(self):
77
  now = time.time()
 
78
  while self.requests and self.requests[0] < now - self.window_seconds:
79
  self.requests.popleft()
80
 
 
90
  return max(0, self.window_seconds - (time.time() - oldest))
91
 
92
  def reset(self):
93
+ self.requests.clear()
94
  self.quota_error_count = 0
95
 
96
  def record_quota_error(self):
97
  self.quota_error_count += 1
98
 
99
 
 
100
  gemini_rate_limiter = SimpleRateLimiter(
101
  max_requests=GEMINI_MODELS[current_model_index]["max_requests_per_minute"],
102
  window_seconds=60
 
105
 
106
  # --- Daily Quota Management ---
107
  def check_daily_quota():
 
108
  global last_quota_reset, daily_quota_exhausted
 
109
  now = datetime.datetime.now()
110
 
111
  if last_quota_reset is None:
 
113
  daily_quota_exhausted = False
114
  return True
115
 
 
116
  if now.date() > last_quota_reset.date():
117
  print("🔄 Daily quota reset detected")
118
  last_quota_reset = now
119
  daily_quota_exhausted = False
 
120
  reset_to_primary_model()
121
  return True
122
 
 
124
 
125
 
126
  def mark_daily_quota_exhausted():
 
127
  global daily_quota_exhausted
128
  daily_quota_exhausted = True
129
+ print(f"❌ Daily quota exhausted")
 
 
 
130
 
131
 
132
+ # --- Model Management ---
133
  def get_gemini_model():
 
134
  global gemini_model, current_model_index
135
+ if not GEMINI_AVAILABLE or not GEMINI_API_KEY:
 
 
 
 
 
 
136
  return None
 
 
137
  if not check_daily_quota():
 
138
  return None
139
 
 
140
  if gemini_model is None:
141
  model_config = GEMINI_MODELS[current_model_index]
142
  try:
143
  genai.configure(api_key=GEMINI_API_KEY)
144
  gemini_model = genai.GenerativeModel(model_config["name"])
145
+ print(f"✓ Initialized: {model_config['name']}")
146
+ except Exception as e:
147
  print(f"Failed to initialize {model_config['name']}: {e}")
148
  return None
 
149
  return gemini_model
150
 
151
 
152
  def switch_to_next_model():
 
153
  global gemini_model, current_model_index, gemini_rate_limiter
 
154
  if current_model_index < len(GEMINI_MODELS) - 1:
155
  current_model_index += 1
156
  model_config = GEMINI_MODELS[current_model_index]
 
 
157
  gemini_rate_limiter = SimpleRateLimiter(
158
  max_requests=model_config["max_requests_per_minute"],
159
  window_seconds=60
160
  )
 
 
161
  gemini_model = None
162
+ print(f"🔄 SWITCHED TO MODEL: {model_config['name']}")
 
163
  return get_gemini_model()
164
+ return None
 
 
165
 
166
 
167
  def reset_to_primary_model():
 
168
  global gemini_model, current_model_index, gemini_rate_limiter
 
169
  if current_model_index != 0:
 
170
  current_model_index = 0
171
  model_config = GEMINI_MODELS[0]
 
172
  gemini_rate_limiter = SimpleRateLimiter(
173
  max_requests=model_config["max_requests_per_minute"],
174
  window_seconds=60
175
  )
 
176
  gemini_model = None
 
177
  return True
178
  return False
179
 
180
 
181
+ # --- Regex Patterns ---
182
  INVOICE_NO_RE = re.compile(
183
+ r"""(?:Invoice\s*No\.?|Inv\. ?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
 
 
 
 
 
 
 
 
 
 
 
184
  re.IGNORECASE | re.VERBOSE
185
  )
186
+ PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
187
+ GST_LIKE_RE = re.compile(r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 
 
 
 
 
 
 
188
 
189
 
190
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
 
 
 
 
191
  total_text_length = 0
192
  pages_to_check = min(sample_pages, doc.page_count)
 
193
  for i in range(pages_to_check):
194
  text = doc.load_page(i).get_text("text") or ""
195
+ total_text_length += len(text.strip())
 
196
  avg_text_length = total_text_length / pages_to_check
197
+ return avg_text_length < 50, avg_text_length
 
 
 
 
 
198
 
199
 
200
+ # --- Extraction Logic ---
 
 
 
201
  def normalize_text_for_search(s: str) -> str:
202
+ if not s: return s
203
+ s = s.replace("\u00A0", " ")
204
+ return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
 
 
 
 
205
 
206
 
207
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
208
+ if not text: return None
 
 
 
209
  text_norm = normalize_text_for_search(text)
210
+
211
+ m = INVOICE_NO_RE.search(text_norm)
 
212
  if m:
213
  inv = (m.group(1) or "").strip()
214
+ if inv and len(inv) > 2 and inv.lower() not in ("invoice", "bill"):
215
  return inv
216
+
217
+ m = PREFIXED_INVOICE_RE.search(text_norm[:600])
 
 
218
  if m:
219
  inv = (m.group(1) or "").strip()
220
  if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
221
  return inv
222
+
 
223
  gm = GST_LIKE_RE.search(text_norm)
224
  if gm:
225
+ gst_val = gm.group(2).replace(" ", "").strip().upper()
226
+ if len(gst_val) == 15:
 
227
  return f"GST:{gst_val}"
228
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  return None
230
 
231
 
 
 
 
 
232
  def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
233
+ if not check_daily_quota(): return None
 
 
 
 
 
 
 
 
234
  model = get_gemini_model()
235
+ if not model: return None
 
 
236
 
 
237
  if not gemini_rate_limiter.allow_request():
238
  wait_time = gemini_rate_limiter.wait_time()
239
+ print(f" ⏱ Rate limit, waiting {int(wait_time)}s...")
240
  time.sleep(wait_time + 1)
241
  return extract_invoice_gemini(page, retry_count)
242
 
243
  try:
244
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
 
245
  img_bytes = pix.tobytes("png")
246
  img = Image.open(io.BytesIO(img_bytes))
247
 
248
+ prompt = """Extract the invoice number. Return ONLY the number. If not found, return 'NOT_FOUND'."""
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  response = model.generate_content([prompt, img])
 
251
  if response and response.text:
252
+ txt = response.text.strip().replace("*", "").replace("#", "")
253
+ if txt and txt != "NOT_FOUND" and len(txt) > 2:
254
+ return txt
255
+
256
+ # Fallback to OCR text
257
+ ocr_resp = model.generate_content(["Extract all text.", img])
258
+ if ocr_resp and ocr_resp.text:
259
+ return try_extract_invoice_from_text(ocr_resp.text)
260
+
 
 
 
 
 
 
 
 
 
 
 
 
261
  return None
262
 
263
+ except Exception as e:
264
  error_str = str(e).lower()
265
+ if "429" in str(e) or "quota" in error_str:
 
 
 
266
  gemini_rate_limiter.record_quota_error()
267
+ if "per_day" in error_str:
 
 
 
268
  mark_daily_quota_exhausted()
269
  return None
 
 
270
  if retry_count < len(GEMINI_MODELS) - 1:
 
271
  if switch_to_next_model():
 
272
  return extract_invoice_gemini(page, retry_count + 1)
273
+ print(f" ✗ Gemini Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  return None
275
 
276
 
 
 
 
 
277
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """Find an invoice number on ``page``, trying the cheapest source first.

    Order of attempts: the page's plain text, then each text block
    individually, and finally (only when ``is_image_pdf`` is true) the
    Gemini-based extractor.

    Returns:
        The first value produced by any of the attempts, or ``None``.
    """
    # 1. Whole-page text.
    plain_text = page.get_text("text") or ""
    found = try_extract_invoice_from_text(plain_text)
    if found:
        return found

    # 2. Individual blocks; index 4 of each block tuple is the text payload.
    for blk in page.get_text("blocks") or []:
        if len(blk) <= 4 or not blk[4]:
            continue
        found = try_extract_invoice_from_text(blk[4])
        if found:
            return found

    # 3. Image-based documents fall back to Gemini; others give up here.
    if not is_image_pdf:
        return None
    return extract_invoice_gemini(page)
294
 
295
 
296
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """Return a new PDF, as bytes, containing the selected pages of ``src_doc``.

    Args:
        src_doc: The open source document to copy pages from.
        page_indices: Zero-based page numbers, copied in the order given.

    Returns:
        The serialized PDF containing only the requested pages.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # The scratch document was previously never closed, so one leaked
        # per generated part.
        out.close()
301
+
302
+
303
+ # --- File Cleanup Utility ---
304
def remove_file(path: str):
    """Delete ``path`` on a best-effort basis.

    Success and failure are both reported with ``print``; no exception is
    propagated to the caller.
    """
    try:
        os.remove(path)
        success_msg = f"🧹 Cleaned up temp file: {path}"
        print(success_msg)
    except Exception as err:
        failure_msg = f"Warning: Could not remove temp file {path}: {err}"
        print(failure_msg)
310
 
311
 
312
  # ============================================================================
 
315
 
316
def _group_pages_by_invoice(invoice_nos: List[Optional[str]]) -> List[Dict]:
    """Group consecutive page indices that belong to the same invoice.

    A page with no number (``None``) joins the group in progress. Leading
    pages with no number are kept and attached to the first invoice number
    that appears, instead of being dropped.
    """
    groups: List[Dict] = []
    for idx, inv in enumerate(invoice_nos):
        if not groups:
            groups.append({"invoice_no": inv, "pages": [idx]})
            continue

        current = groups[-1]
        if current["invoice_no"] is None:
            # Still in the unnumbered leading run: keep its pages and adopt
            # whatever number (possibly still None) this page carries.
            current["invoice_no"] = inv
            current["pages"].append(idx)
        elif inv is not None and inv != current["invoice_no"]:
            groups.append({"invoice_no": inv, "pages": [idx]})
        else:
            current["pages"].append(idx)
    return groups


@app.post("/split-invoices")
async def split_invoices(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    include_pdf: bool = Form(True),
):
    """Split an uploaded PDF into one part per detected invoice number.

    The upload is streamed to a temporary file in 1 MB chunks, opened from
    disk, and scanned page by page for invoice numbers. Consecutive pages
    sharing a number (or carrying none) form one part.

    Args:
        background_tasks: Accepted as part of the endpoint signature; not
            used by this handler.
        file: The uploaded PDF.
        include_pdf: When true, each part includes its PDF as base64.

    Returns:
        JSON with ``count``, ``parts`` and ``quota_status``; or
        ``{"error": ...}`` with status 500 on unexpected failures.

    Raises:
        HTTPException: 400 when the upload is not a ``.pdf`` or has no pages.
    """
    # filename may be absent on some uploads; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF supported")

    # --- FIX FOR 100MB FILES: STREAM TO DISK ---
    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # Only the path is needed; reopened below for writing.

    doc = None
    try:
        # Stream upload chunks to disk to keep RAM low.
        print(f"📥 Receiving large file: {file.filename}")
        with open(temp_path, "wb") as buffer:
            while content := await file.read(1024 * 1024):
                buffer.write(content)

        print(f"💾 Saved to temp disk: {temp_path}")

        # Open the document from disk rather than from an in-memory buffer.
        doc = fitz.open(temp_path)

        if doc.page_count == 0:
            raise HTTPException(status_code=400, detail="Empty PDF")

        print(f"Processing {doc.page_count} pages...")

        # Step 1: Detect type.
        is_image_pdf, _ = is_image_based_pdf(doc)

        # Step 2: Extract one page at a time.
        page_invoice_nos = []
        for i in range(doc.page_count):
            page = doc.load_page(i)
            page_invoice_nos.append(extract_invoice_no_from_page(page, is_image_pdf))
            # Drop the reference so the page can be collected promptly.
            del page

        # Step 3: GST identifiers are not invoice numbers; treat as "none".
        clean_invs = [
            None if (v and v.upper().startswith("GST:")) else v
            for v in page_invoice_nos
        ]
        groups = _group_pages_by_invoice(clean_invs)

        # Step 4: Build the response, serializing only each group's pages.
        parts = []
        for g in groups:
            part_bytes = build_pdf_from_pages(doc, g["pages"])
            info = {
                "invoice_no": g["invoice_no"],
                "pages": [p + 1 for p in g["pages"]],
                "size_bytes": len(part_bytes),
            }
            if include_pdf:
                info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
            parts.append(info)

        return JSONResponse({
            "count": len(parts),
            "parts": parts,
            "quota_status": {"daily_exhausted": daily_quota_exhausted},
        })

    except HTTPException:
        # Deliberate client errors (e.g. "Empty PDF") must keep their status
        # code instead of being rewritten to a 500 by the handler below.
        raise

    except Exception as e:
        print(f"Critical Error: {e}")
        import traceback
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)

    finally:
        # Close the document on every path before deleting the file backing
        # it; the temp file is removed even if closing fails.
        try:
            if doc is not None:
                doc.close()
        finally:
            remove_file(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
 
429
if __name__ == "__main__":
    import uvicorn

    print("🚀 Starting High-Performance Invoice Splitter")
    # A single worker is requested so the rate limiter works correctly.
    server_options = {"host": "0.0.0.0", "port": 7860, "workers": 1}
    uvicorn.run(app, **server_options)