anujakkulkarni committed on
Commit
b8cd992
·
verified ·
1 Parent(s): 5e18860

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +397 -96
app.py CHANGED
@@ -6,13 +6,15 @@ import time
6
  import datetime
7
  import shutil
8
  import tempfile
 
9
  from typing import List, Dict, Optional, Tuple
10
  from collections import deque
11
  from pathlib import Path
12
 
13
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
14
  from fastapi.middleware.cors import CORSMiddleware
15
- from fastapi.responses import JSONResponse
 
16
  import fitz # PyMuPDF
17
 
18
  # Google Gemini - optional import
@@ -22,10 +24,13 @@ try:
22
  GEMINI_AVAILABLE = True
23
  except ImportError:
24
  GEMINI_AVAILABLE = False
25
- print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
26
 
27
  app = FastAPI(title="Invoice Splitter API")
28
 
 
 
 
29
  app.add_middleware(
30
  CORSMiddleware,
31
  allow_origins=["*"],
@@ -42,7 +47,7 @@ GEMINI_MODELS = [
42
  {
43
  "name": "gemini-1.5-flash", # UPDATED: Current standard fast model
44
  "max_requests_per_minute": 15,
45
- "timeout": 300,
46
  "description": "Primary fast model"
47
  },
48
  {
@@ -90,7 +95,7 @@ class SimpleRateLimiter:
90
  return max(0, self.window_seconds - (time.time() - oldest))
91
 
92
  def reset(self):
93
- self.requests.clear()
94
  self.quota_error_count = 0
95
 
96
  def record_quota_error(self):
@@ -108,12 +113,12 @@ def check_daily_quota():
108
  global last_quota_reset, daily_quota_exhausted
109
  now = datetime.datetime.now()
110
 
111
- if last_quota_reset is None:
112
  last_quota_reset = now
113
  daily_quota_exhausted = False
114
  return True
115
 
116
- if now.date() > last_quota_reset.date():
117
  print("🔄 Daily quota reset detected")
118
  last_quota_reset = now
119
  daily_quota_exhausted = False
@@ -142,8 +147,8 @@ def get_gemini_model():
142
  try:
143
  genai.configure(api_key=GEMINI_API_KEY)
144
  gemini_model = genai.GenerativeModel(model_config["name"])
145
- print(f"✓ Initialized: {model_config['name']}")
146
- except Exception as e:
147
  print(f"Failed to initialize {model_config['name']}: {e}")
148
  return None
149
  return gemini_model
@@ -159,7 +164,7 @@ def switch_to_next_model():
159
  window_seconds=60
160
  )
161
  gemini_model = None
162
- print(f"🔄 SWITCHED TO MODEL: {model_config['name']}")
163
  return get_gemini_model()
164
  return None
165
 
@@ -180,11 +185,11 @@ def reset_to_primary_model():
180
 
181
  # --- Regex Patterns ---
182
  INVOICE_NO_RE = re.compile(
183
- r"""(?:Invoice\s*No\.?|Inv\. ?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
184
  re.IGNORECASE | re.VERBOSE
185
  )
186
  PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
187
- GST_LIKE_RE = re.compile(r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
188
 
189
 
190
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
@@ -192,29 +197,31 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
192
  pages_to_check = min(sample_pages, doc.page_count)
193
  for i in range(pages_to_check):
194
  text = doc.load_page(i).get_text("text") or ""
195
- total_text_length += len(text.strip())
196
  avg_text_length = total_text_length / pages_to_check
197
  return avg_text_length < 50, avg_text_length
198
 
199
 
200
  # --- Extraction Logic ---
201
  def normalize_text_for_search(s: str) -> str:
202
- if not s: return s
 
203
  s = s.replace("\u00A0", " ")
204
  return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
205
 
206
 
207
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
208
- if not text: return None
 
209
  text_norm = normalize_text_for_search(text)
210
 
211
- m = INVOICE_NO_RE.search(text_norm)
212
  if m:
213
  inv = (m.group(1) or "").strip()
214
- if inv and len(inv) > 2 and inv.lower() not in ("invoice", "bill"):
215
  return inv
216
 
217
- m = PREFIXED_INVOICE_RE.search(text_norm[:600])
218
  if m:
219
  inv = (m.group(1) or "").strip()
220
  if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
@@ -229,10 +236,12 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
229
  return None
230
 
231
 
232
- def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
233
- if not check_daily_quota(): return None
 
234
  model = get_gemini_model()
235
- if not model: return None
 
236
 
237
  if not gemini_rate_limiter.allow_request():
238
  wait_time = gemini_rate_limiter.wait_time()
@@ -241,28 +250,40 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
241
  return extract_invoice_gemini(page, retry_count)
242
 
243
  try:
244
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
 
245
  img_bytes = pix.tobytes("png")
 
 
 
 
246
  img = Image.open(io.BytesIO(img_bytes))
247
 
248
- prompt = """Extract the invoice number. Return ONLY the number. If not found, return 'NOT_FOUND'."""
249
 
250
  response = model.generate_content([prompt, img])
 
 
 
251
  if response and response.text:
252
  txt = response.text.strip().replace("*", "").replace("#", "")
253
  if txt and txt != "NOT_FOUND" and len(txt) > 2:
254
- return txt
255
-
256
- # Fallback to OCR text
257
- ocr_resp = model.generate_content(["Extract all text.", img])
258
- if ocr_resp and ocr_resp.text:
259
- return try_extract_invoice_from_text(ocr_resp.text)
260
-
261
- return None
 
 
 
 
262
 
263
  except Exception as e:
264
  error_str = str(e).lower()
265
- if "429" in str(e) or "quota" in error_str:
266
  gemini_rate_limiter.record_quota_error()
267
  if "per_day" in error_str:
268
  mark_daily_quota_exhausted()
@@ -278,89 +299,174 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
278
  # 1. Try Text Extraction (Fastest)
279
  text = page.get_text("text") or ""
280
  inv = try_extract_invoice_from_text(text)
281
- if inv: return inv
 
282
 
283
  # 2. Try Block Extraction
284
  for block in (page.get_text("blocks") or []):
285
- if len(block) > 4 and block[4]:
286
  inv = try_extract_invoice_from_text(block[4])
287
- if inv: return inv
 
288
 
289
  # 3. Gemini Fallback (Only if enabled and seemingly image-based)
290
- if is_image_pdf:
291
  return extract_invoice_gemini(page)
292
 
293
  return None
294
 
295
 
296
  def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
 
297
  out = fitz.open()
298
- for i in page_indices:
299
- out.insert_pdf(src_doc, from_page=i, to_page=i)
300
- return out.tobytes()
 
 
 
 
 
 
301
 
302
 
303
  # --- File Cleanup Utility ---
304
  def remove_file(path: str):
305
  try:
306
- os.remove(path)
307
- print(f"🧹 Cleaned up temp file: {path}")
 
308
  except Exception as e:
309
- print(f"Warning: Could not remove temp file {path}: {e}")
310
 
311
 
312
  # ============================================================================
313
  # API ENDPOINTS
314
  # ============================================================================
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  @app.post("/split-invoices")
317
  async def split_invoices(
318
  background_tasks: BackgroundTasks,
319
  file: UploadFile = File(...),
320
  include_pdf: bool = Form(True),
 
321
  ):
322
- if not file.filename.lower().endswith(".pdf"):
323
- raise HTTPException(status_code=400, detail="Only PDF supported")
 
 
 
 
 
 
 
 
 
 
 
324
 
325
- # --- FIX FOR 100MB FILES: STREAM TO DISK ---
326
- # Create a temporary file to store the upload
327
- fd, temp_path = tempfile.mkstemp(suffix=".pdf")
328
- os.close(fd) # Close the low-level file descriptor immediately
 
 
 
329
 
330
  try:
331
- # Stream upload chunks to disk to keep RAM low
332
- print(f"📥 Receiving large file: {file.filename}")
 
 
333
  with open(temp_path, "wb") as buffer:
334
- # Read in 1MB chunks
335
- while content := await file.read(1024 * 1024):
 
 
 
 
 
 
 
 
 
 
 
336
  buffer.write(content)
 
 
 
 
337
 
338
- print(f"💾 Saved to temp disk: {temp_path}")
 
339
 
340
- # Open Document from DISK (Lazy loading)
341
  doc = fitz.open(temp_path)
342
 
343
- if doc.page_count == 0:
344
- raise HTTPException(status_code=400, detail="Empty PDF")
345
 
346
- print(f"Processing {doc.page_count} pages...")
347
 
348
- # Step 1: Detect Type
349
- is_image_pdf, _ = is_image_based_pdf(doc)
 
 
350
 
351
- # Step 2: Extraction Loop
352
  page_invoice_nos = []
353
- for i in range(doc.page_count):
354
- # Load only one page into memory at a time
355
- page = doc.load_page(i)
356
- inv = extract_invoice_no_from_page(page, is_image_pdf)
357
- page_invoice_nos.append(inv)
358
- # Explicitly dereference page to help garbage collector
359
- del page
360
-
361
- # Step 3: Filtering & Grouping
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  clean_invs = [
363
- None if (v and v.upper().startswith("GST:")) else v
364
  for v in page_invoice_nos
365
  ]
366
 
@@ -369,65 +475,260 @@ async def split_invoices(
369
  current_inv = None
370
 
371
  for idx, inv in enumerate(clean_invs):
372
- if current_inv is None:
373
  current_inv = inv
374
  current_group = [idx]
375
  else:
376
  if inv is not None and inv != current_inv:
377
  # Save previous group
378
- groups.append({"invoice_no": current_inv, "pages": current_group})
379
  # Start new group
380
  current_inv = inv
381
  current_group = [idx]
382
- else:
383
  current_group.append(idx)
384
 
385
  if current_group:
386
- groups.append({"invoice_no": current_inv, "pages": current_group})
387
 
388
- # Logic Fix: If first page has no invoice, merge with second group if valid
389
  if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
 
390
  groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
391
- groups.pop(0)
 
 
392
 
393
- # Step 4: Build Response
394
  parts = []
395
- for g in groups:
396
- # Generate bytes only for specific pages
 
 
 
 
397
  part_bytes = build_pdf_from_pages(doc, g["pages"])
 
398
  info = {
399
  "invoice_no": g["invoice_no"],
400
- "pages": [p + 1 for p in g["pages"]],
401
- "size_bytes": len(part_bytes)
 
 
402
  }
403
- if include_pdf:
404
- info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  parts.append(info)
 
 
 
 
 
 
 
 
 
406
 
407
- doc.close()
408
-
409
  return JSONResponse({
 
410
  "count": len(parts),
411
  "parts": parts,
412
- "quota_status": {"daily_exhausted": daily_quota_exhausted}
 
 
 
 
 
 
 
 
 
413
  })
414
 
415
- except Exception as e:
416
- print(f"Critical Error: {e}")
 
 
 
417
  import traceback
418
  traceback.print_exc()
419
- return JSONResponse({"error": str(e)}, status_code=500)
420
 
421
  finally:
422
- # --- CRITICAL CLEANUP ---
423
- # Ensure temp file is deleted even if code crashes
424
- # Use background task to delete file after response is sent if you want,
425
- # but here we do it synchronously to be safe.
 
 
 
 
 
426
  remove_file(temp_path)
 
 
 
427
 
428
 
429
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  import uvicorn
431
- print("🚀 Starting High-Performance Invoice Splitter")
432
- # Workers=1 ensures rate limiter works correctly
433
- uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import datetime
7
  import shutil
8
  import tempfile
9
+ import gc
10
  from typing import List, Dict, Optional, Tuple
11
  from collections import deque
12
  from pathlib import Path
13
 
14
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
15
  from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.responses import JSONResponse, StreamingResponse
17
+ from starlette.requests import Request
18
  import fitz # PyMuPDF
19
 
20
  # Google Gemini - optional import
 
24
  GEMINI_AVAILABLE = True
25
  except ImportError:
26
  GEMINI_AVAILABLE = False
27
+ print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
28
 
29
  app = FastAPI(title="Invoice Splitter API")
30
 
31
+ # ⭐ Increase max request body size (default is 1MB-2MB)
32
+ Request.max_body_size = 200 * 1024 * 1024 # 200MB limit
33
+
34
  app.add_middleware(
35
  CORSMiddleware,
36
  allow_origins=["*"],
 
47
  {
48
  "name": "gemini-1.5-flash", # UPDATED: Current standard fast model
49
  "max_requests_per_minute": 15,
50
+ "timeout": 300,
51
  "description": "Primary fast model"
52
  },
53
  {
 
95
  return max(0, self.window_seconds - (time.time() - oldest))
96
 
97
  def reset(self):
98
+ self.requests. clear()
99
  self.quota_error_count = 0
100
 
101
  def record_quota_error(self):
 
113
  global last_quota_reset, daily_quota_exhausted
114
  now = datetime.datetime.now()
115
 
116
+ if last_quota_reset is None:
117
  last_quota_reset = now
118
  daily_quota_exhausted = False
119
  return True
120
 
121
+ if now. date() > last_quota_reset.date():
122
  print("🔄 Daily quota reset detected")
123
  last_quota_reset = now
124
  daily_quota_exhausted = False
 
147
  try:
148
  genai.configure(api_key=GEMINI_API_KEY)
149
  gemini_model = genai.GenerativeModel(model_config["name"])
150
+ print(f"✓ Initialized: {model_config['name']}")
151
+ except Exception as e:
152
  print(f"Failed to initialize {model_config['name']}: {e}")
153
  return None
154
  return gemini_model
 
164
  window_seconds=60
165
  )
166
  gemini_model = None
167
+ print(f"🔄 SWITCHED TO MODEL: {model_config['name']}")
168
  return get_gemini_model()
169
  return None
170
 
 
185
 
# --- Regex Patterns ---
# Labelled invoice number: a label such as "Invoice No.", "Inv. No", "Bill No",
# "Document No", "Doc No" or "Tax Invoice No", an optional ":" / "-" separator,
# then the captured identifier (one alphanumeric followed by at least three
# more characters drawn from letters, digits, "-" and "/"). Case-insensitive.
INVOICE_NO_RE = re.compile(
    r"""(?:Invoice\s*No\.?|Inv\.?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
    re.IGNORECASE | re.VERBOSE
)
# Unlabelled identifier: 2-4 capital letters, "-" or "/", 4+ digits, an
# optional "/digits" suffix and optional trailing capitals (case-sensitive).
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
# GST label followed by a 15-character alphanumeric code. Group 1 is the whole
# "label + code" span, group 2 the code alone. This pattern is NOT compiled
# with re.VERBOSE, so it must not contain stray spaces.
GST_LIKE_RE = re.compile(r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
193
 
194
 
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
    """Guess whether *doc* is a scanned (image-only) PDF.

    Extracts plain text from the first ``min(sample_pages, doc.page_count)``
    pages and averages the stripped text length per page.

    Returns:
        ``(is_image_based, avg_text_length)`` where ``is_image_based`` is True
        when the average is below 50 characters per sampled page.
    """
    pages_to_check = min(sample_pages, doc.page_count)
    if pages_to_check <= 0:
        # Nothing to sample (empty document or non-positive sample size):
        # report "no text" instead of dividing by zero below.
        return True, 0.0

    # NOTE(review): this initialiser sits on a line the diff view elided; it is
    # reconstructed from the "+=" below - confirm against the real file.
    total_text_length = 0
    for i in range(pages_to_check):
        text = doc.load_page(i).get_text("text") or ""
        total_text_length += len(text.strip())

    avg_text_length = total_text_length / pages_to_check
    return avg_text_length < 50, avg_text_length
203
 
204
 
205
  # --- Extraction Logic ---
def normalize_text_for_search(s: str) -> str:
    """Flatten *s* into a single line suitable for regex searching.

    Non-breaking spaces become ordinary spaces, every run of CR / LF / TAB
    becomes one space, runs of two or more spaces collapse to one, and the
    result is stripped. Falsy input (``""`` or ``None``) is returned unchanged.
    """
    if not s:
        return s
    s = s.replace("\u00A0", " ")
    # Line breaks and tabs first, so the spaces they introduce are collapsed too.
    s = re.sub(r"[\r\n\t]+", " ", s)
    return re.sub(r"[ ]{2,}", " ", s).strip()
211
 
212
 
213
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
214
+ if not text:
215
+ return None
216
  text_norm = normalize_text_for_search(text)
217
 
218
+ m = INVOICE_NO_RE. search(text_norm)
219
  if m:
220
  inv = (m.group(1) or "").strip()
221
+ if inv and len(inv) > 2 and inv. lower() not in ("invoice", "bill"):
222
  return inv
223
 
224
+ m = PREFIXED_INVOICE_RE.search(text_norm[: 600])
225
  if m:
226
  inv = (m.group(1) or "").strip()
227
  if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
 
236
  return None
237
 
238
 
239
+ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
240
+ if not check_daily_quota():
241
+ return None
242
  model = get_gemini_model()
243
+ if not model:
244
+ return None
245
 
246
  if not gemini_rate_limiter.allow_request():
247
  wait_time = gemini_rate_limiter.wait_time()
 
250
  return extract_invoice_gemini(page, retry_count)
251
 
252
  try:
253
+ # Reduced resolution from 2x to 1.5x to save memory
254
+ pix = page.get_pixmap(matrix=fitz.Matrix(1. 5, 1.5), dpi=150)
255
  img_bytes = pix.tobytes("png")
256
+
257
+ # ⭐ Explicitly free pixmap memory
258
+ pix = None
259
+
260
  img = Image.open(io.BytesIO(img_bytes))
261
 
262
+ prompt = """Extract the invoice number. Return ONLY the number. If not found, return 'NOT_FOUND'."""
263
 
264
  response = model.generate_content([prompt, img])
265
+
266
+ # Try to get invoice number from response
267
+ result = None
268
  if response and response.text:
269
  txt = response.text.strip().replace("*", "").replace("#", "")
270
  if txt and txt != "NOT_FOUND" and len(txt) > 2:
271
+ result = txt
272
+
273
+ # Fallback to OCR text if no result
274
+ if not result:
275
+ ocr_resp = model.generate_content(["Extract all text.", img])
276
+ if ocr_resp and ocr_resp.text:
277
+ result = try_extract_invoice_from_text(ocr_resp.text)
278
+
279
+ # ⭐ Free image memory
280
+ img. close()
281
+
282
+ return result
283
 
284
  except Exception as e:
285
  error_str = str(e).lower()
286
+ if "429" in str(e) or "quota" in error_str:
287
  gemini_rate_limiter.record_quota_error()
288
  if "per_day" in error_str:
289
  mark_daily_quota_exhausted()
 
299
  # 1. Try Text Extraction (Fastest)
300
  text = page.get_text("text") or ""
301
  inv = try_extract_invoice_from_text(text)
302
+ if inv:
303
+ return inv
304
 
305
  # 2. Try Block Extraction
306
  for block in (page.get_text("blocks") or []):
307
+ if len(block) > 4 and block[4]:
308
  inv = try_extract_invoice_from_text(block[4])
309
+ if inv:
310
+ return inv
311
 
312
  # 3. Gemini Fallback (Only if enabled and seemingly image-based)
313
+ if is_image_pdf:
314
  return extract_invoice_gemini(page)
315
 
316
  return None
317
 
318
 
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """Copy the given pages of *src_doc* into a new PDF and return its bytes.

    Args:
        src_doc: Open source document to copy pages from.
        page_indices: 0-based page numbers, copied in the order given.

    Returns:
        The serialized output PDF, written with ``garbage=4, deflate=True``.

    The temporary output document is always closed, even if copying or
    serialization raises.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes(garbage=4, deflate=True)
    finally:
        out.close()
331
 
332
 
333
  # --- File Cleanup Utility ---
def remove_file(path: str):
    """Best-effort deletion of *path*; never raises.

    A path that no longer exists is silently ignored. Any other failure is
    reported on stdout as a warning, and a successful removal is logged.
    """
    try:
        # Attempt the removal directly instead of checking os.path.exists()
        # first: the check-then-act form can race with another deleter.
        os.remove(path)
    except FileNotFoundError:
        # Already gone (e.g. removed by an earlier cleanup) - nothing to do.
        return
    except Exception as e:
        print(f"⚠️ Warning: Could not remove temp file {path}: {e}")
    else:
        print(f"🧹 Cleaned up temp file: {path}")
341
 
342
 
343
  # ============================================================================
344
  # API ENDPOINTS
345
  # ============================================================================
346
 
@app.get("/")
async def root():
    """Return static service metadata and the Gemini availability flags."""
    return {
        "service": "Invoice Splitter API",
        "version": "2.0",
        "max_file_size_mb": 200,
        "gemini_available": GEMINI_AVAILABLE,
        "gemini_configured": bool(GEMINI_API_KEY)
    }
356
+
357
+
@app.get("/health")
async def health():
    """Report liveness plus the current Gemini state.

    The payload exposes whether the Gemini client library imported, whether an
    API key is set, the name of the currently selected model and whether the
    daily quota flag is set.
    """
    return {
        "status": "healthy",
        "gemini_status": {
            "available": GEMINI_AVAILABLE,
            "configured": bool(GEMINI_API_KEY),
            "current_model": GEMINI_MODELS[current_model_index]["name"],
            "daily_quota_exhausted": daily_quota_exhausted
        }
    }
369
+
370
+
@app.post("/split-invoices")
async def split_invoices(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    include_pdf: bool = Form(True),
    max_file_size_mb: int = Form(200)
):
    """
    Split an uploaded PDF into one PDF per invoice and return them as JSON.

    Parameters:
    - file: PDF file to split
    - include_pdf: Include base64-encoded PDFs in the response (default: True)
    - max_file_size_mb: Requested upload limit in MB (default: 200). Values
      above 200 are capped at 200 so a client cannot raise the server limit.

    Returns:
    - JSON with the split parts, source file info and Gemini quota status

    Raises:
    - HTTPException 400 for a non-PDF name, an empty upload or an empty PDF
    - HTTPException 413 when the upload exceeds the size limit
    - HTTPException 500 for any other processing failure
    """
    # filename may be None for some multipart clients; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The form field is client-controlled, so it may only lower the limit.
    max_file_size_mb = min(max_file_size_mb, 200)
    max_size_bytes = max_file_size_mb * 1024 * 1024

    # Create the temporary file; only the path is needed, so close the fd now.
    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)

    doc = None  # Initialised here so the finally block can test it

    try:
        # Stream the upload to disk in chunks, enforcing the size limit as we go.
        print(f"📥 Receiving file: {file.filename}")
        total_size = 0

        with open(temp_path, "wb") as buffer:
            chunk_size = 5 * 1024 * 1024

            while content := await file.read(chunk_size):
                total_size += len(content)

                if total_size > max_size_bytes:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File too large. Maximum size: {max_file_size_mb}MB, received: {total_size / (1024*1024):.1f}MB"
                    )

                buffer.write(content)

                # Progress log roughly every 20MB
                if total_size % (20 * 1024 * 1024) < chunk_size:
                    print(f" 📊 Uploaded: {total_size / (1024*1024):.1f}MB")

        if total_size == 0:
            # Fail with a clear client error instead of a parser exception.
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        file_size_mb = total_size / (1024 * 1024)
        print(f"💾 Saved {file_size_mb:.2f}MB to: {temp_path}")

        # Open the PDF from the temp file on disk
        doc = fitz.open(temp_path)

        if doc.page_count == 0:
            raise HTTPException(status_code=400, detail="PDF file is empty")

        print(f"📄 Processing {doc.page_count} pages...")

        # Step 1: Detect whether the PDF looks image-based (samples up to 3 pages)
        sample_pages = min(3, doc.page_count)
        is_image_pdf, avg_text = is_image_based_pdf(doc, sample_pages)
        print(f" PDF Type: {'Image-based' if is_image_pdf else 'Text-based'} (avg text: {avg_text:.1f} chars)")

        # Step 2: Extract an invoice number (or None) for every page
        page_invoice_nos = []

        for i in range(doc.page_count):
            if i > 0 and i % 50 == 0:
                print(f" 📄 Processed {i}/{doc.page_count} pages")

            page = doc.load_page(i)

            try:
                inv = extract_invoice_no_from_page(page, is_image_pdf)
                page_invoice_nos.append(inv)

                if inv:
                    print(f" Page {i+1}: Found invoice '{inv}'")
            finally:
                # Drop the page reference before loading the next one
                page = None

            if i > 0 and i % 100 == 0:
                gc.collect()

        print(f"✓ Extraction complete. Found {sum(1 for x in page_invoice_nos if x)} invoice numbers")

        # Step 3: Discard "GST:"-prefixed values, then group consecutive pages
        clean_invs = [
            None if (v and v.upper().startswith("GST:")) else v
            for v in page_invoice_nos
        ]

        # NOTE(review): the next two initialisers sit on lines the diff view
        # elided; they are reconstructed from how the loop uses them and from
        # the streaming endpoint - confirm against the real file.
        groups = []
        current_group = []
        current_inv = None

        for idx, inv in enumerate(clean_invs):
            if not current_group:
                # Very first page: open the first group, even if it has no
                # invoice number. Testing the group (not current_inv) keeps
                # leading number-less pages instead of overwriting them.
                current_inv = inv
                current_group = [idx]
            elif inv is not None and inv != current_inv:
                # A different invoice number starts a new group
                groups.append({"invoice_no": current_inv, "pages": current_group})
                current_inv = inv
                current_group = [idx]
            else:
                # Same number, or no number found: page continues the group
                current_group.append(idx)

        if current_group:
            groups.append({"invoice_no": current_inv, "pages": current_group})

        # Leading pages without an invoice number belong to the first real invoice
        if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
            print(f" 🔗 Merging first {len(groups[0]['pages'])} pages with invoice '{groups[1]['invoice_no']}'")
            groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
            groups.pop(0)

        print(f"📦 Created {len(groups)} invoice groups")

        # Step 4: Build one PDF per group and assemble the response
        parts = []
        total_response_size = 0
        max_response_size = 100 * 1024 * 1024  # 100MB response limit

        for idx, g in enumerate(groups):
            print(f" 🔨 Building PDF part {idx+1}/{len(groups)} (Invoice: {g['invoice_no'] or 'Unknown'})")

            part_bytes = build_pdf_from_pages(doc, g["pages"])

            info = {
                "invoice_no": g["invoice_no"],
                "pages": [p + 1 for p in g["pages"]],  # 1-based page numbers
                "page_count": len(g["pages"]),
                "size_bytes": len(part_bytes),
                "size_mb": round(len(part_bytes) / (1024 * 1024), 2)
            }

            if include_pdf:
                # Base64 inflates the payload by 4/3. Once the running total
                # passes the limit, this and every later part are sent
                # without inline data.
                base64_size = len(part_bytes) * 4 / 3
                total_response_size += base64_size

                if total_response_size > max_response_size:
                    print(" ⚠️ Response size exceeds 100MB. Skipping base64 for remaining parts.")
                    info["pdf_base64"] = None
                    info["warning"] = "PDF too large for inline response. Use streaming endpoint or set include_pdf=false"
                else:
                    info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
            else:
                info["pdf_base64"] = None

            parts.append(info)

            # Release the raw bytes before building the next part
            del part_bytes

            if idx % 5 == 0:
                gc.collect()

        print(f"✅ Successfully split into {len(parts)} parts")

        return JSONResponse({
            "success": True,
            "count": len(parts),
            "parts": parts,
            "source_file": {
                "name": file.filename,
                "size_mb": round(file_size_mb, 2),
                "total_pages": doc.page_count,
                "is_image_pdf": is_image_pdf
            },
            "quota_status": {
                "daily_exhausted": daily_quota_exhausted,
                "current_model": GEMINI_MODELS[current_model_index]["name"]
            }
        })

    except HTTPException:
        raise  # Re-raise HTTP exceptions as-is

    except Exception as e:
        print(f"❌ Critical Error: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") from e

    finally:
        # Close the document before deleting the file it was opened from
        if doc:
            try:
                doc.close()
                print("📕 Closed PDF document")
            except Exception as e:
                print(f"⚠️ Error closing document: {e}")

        remove_file(temp_path)
        gc.collect()
584
 
585
 
@app.post("/split-invoices-stream")
async def split_invoices_stream(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    max_file_size_mb: int = Form(200)
):
    """
    Streaming variant of the invoice splitter.

    The upload is saved to a temporary file, then the response is produced as
    NDJSON (newline-delimited JSON), one object per line:

    - {"type": "status", ...}    sent once, before processing starts
    - {"type": "part", ...}      one per invoice group, with the PDF in base64
    - {"type": "complete", ...}  sent after the last part
    - {"type": "error", ...}     sent instead if processing fails

    Each part is serialized and yielded on its own, so the full result is
    never assembled into a single JSON document.

    Parameters:
    - file: PDF file to split
    - max_file_size_mb: Requested upload limit in MB (default: 200). Values
      above 200 are capped at 200 so a client cannot raise the server limit.

    Raises:
    - HTTPException 400 for a non-PDF name, 413 for an oversized upload
    """
    import json

    # filename may be None for some multipart clients; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The form field is client-controlled, so it may only lower the limit.
    max_file_size_mb = min(max_file_size_mb, 200)
    max_size_bytes = max_file_size_mb * 1024 * 1024
    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)

    # Save the upload to disk in chunks, enforcing the size limit as we go.
    try:
        total_size = 0
        with open(temp_path, "wb") as buffer:
            chunk_size = 5 * 1024 * 1024
            while content := await file.read(chunk_size):
                total_size += len(content)
                if total_size > max_size_bytes:
                    raise HTTPException(status_code=413, detail=f"File too large. Max: {max_file_size_mb}MB")
                buffer.write(content)
    except Exception:
        # Single cleanup point, reached after the "with" has closed the file.
        remove_file(temp_path)
        raise

    # Read the name now; the generator below runs after this function returns.
    filename = file.filename

    async def generate_parts():
        doc = None
        try:
            doc = fitz.open(temp_path)

            if doc.page_count == 0:
                # Report this explicitly rather than as a division error from
                # the image-detection step.
                yield json.dumps({
                    "type": "error",
                    "error": "PDF file is empty"
                }) + "\n"
                return

            # Send initial status
            yield json.dumps({
                "type": "status",
                "status": "processing",
                "total_pages": doc.page_count,
                "filename": filename
            }) + "\n"

            # Detect PDF type
            is_image_pdf, _ = is_image_based_pdf(doc)

            # Extract an invoice number (or None) for every page
            page_invoice_nos = []
            for i in range(doc.page_count):
                page = doc.load_page(i)
                inv = extract_invoice_no_from_page(page, is_image_pdf)
                page_invoice_nos.append(inv)
                page = None

                if i > 0 and i % 100 == 0:
                    gc.collect()

            # Discard "GST:"-prefixed values, then group consecutive pages
            clean_invs = [None if (v and v.upper().startswith("GST:")) else v for v in page_invoice_nos]
            groups = []
            current_group = []
            current_inv = None

            for idx, inv in enumerate(clean_invs):
                if not current_group:
                    # Very first page: open the first group, even if it has
                    # no invoice number. Testing the group (not current_inv)
                    # keeps leading number-less pages instead of overwriting
                    # them.
                    current_inv = inv
                    current_group = [idx]
                elif inv is not None and inv != current_inv:
                    # A different invoice number starts a new group
                    groups.append({"invoice_no": current_inv, "pages": current_group})
                    current_inv = inv
                    current_group = [idx]
                else:
                    # Same number, or no number found: page continues the group
                    current_group.append(idx)

            if current_group:
                groups.append({"invoice_no": current_inv, "pages": current_group})

            # Leading pages without an invoice number belong to the first real invoice
            if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
                groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
                groups.pop(0)

            # Stream each part as its own NDJSON line
            for idx, g in enumerate(groups):
                part_bytes = build_pdf_from_pages(doc, g["pages"])

                info = {
                    "type": "part",
                    "part_index": idx,
                    "invoice_no": g["invoice_no"],
                    "pages": [p + 1 for p in g["pages"]],  # 1-based page numbers
                    "page_count": len(g["pages"]),
                    "size_bytes": len(part_bytes),
                    "pdf_base64": base64.b64encode(part_bytes).decode("ascii")
                }

                yield json.dumps(info) + "\n"
                del part_bytes
                gc.collect()

            # Send completion status
            yield json.dumps({
                "type": "complete",
                "total_parts": len(groups)
            }) + "\n"

        except Exception as e:
            # The response has already started, so errors are reported in-band.
            yield json.dumps({
                "type": "error",
                "error": str(e)
            }) + "\n"
        finally:
            # Close the document before deleting the file it was opened from
            if doc:
                doc.close()
            remove_file(temp_path)
            gc.collect()

    return StreamingResponse(
        generate_parts(),
        media_type="application/x-ndjson",
        headers={
            "Content-Disposition": "attachment; filename=invoices-split.ndjson"
        }
    )
716
+
717
+
if __name__ == "__main__":
    # Script entry point: print a short startup banner, then serve the app.
    import uvicorn
    print("🚀 Starting High-Performance Invoice Splitter API")
    print(" Max file size: 200MB")
    print(f" Gemini available: {GEMINI_AVAILABLE}")
    print(f" Gemini configured: {bool(GEMINI_API_KEY)}")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        workers=1,  # Single worker to maintain rate limiter state
        # NOTE(review): keep-alive is the idle time allowed between requests on
        # an open connection; it does not by itself extend how long a single
        # upload may take - confirm this is the setting that was intended.
        timeout_keep_alive=300,
        limit_concurrency=10,
        # NOTE(review): uvicorn terminates the process after serving this many
        # requests; without a supervisor that restarts it, the API goes down
        # after 1000 requests - confirm the deployment restarts the process.
        limit_max_requests=1000
    )