anujakkulkarni committed on
Commit
6c21d15
·
verified ·
1 Parent(s): 1fcd106

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -376
app.py CHANGED
@@ -2,294 +2,190 @@ import os
2
  import io
3
  import re
4
  import base64
 
 
5
  from typing import List, Dict, Optional, Tuple
 
6
 
7
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from fastapi.responses import JSONResponse
10
  import fitz # PyMuPDF
11
 
12
- # Google Gemini - optional import
13
- try:
14
- import google.generativeai as genai
15
- from PIL import Image
16
- GEMINI_AVAILABLE = True
17
- except ImportError:
18
- GEMINI_AVAILABLE = False
19
- print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
20
 
21
- app = FastAPI(title="Invoice Splitter API")
 
 
22
 
23
- app.add_middleware(
24
- CORSMiddleware,
25
- allow_origins=["*"],
26
- allow_credentials=True,
27
- allow_methods=["*"],
28
- allow_headers=["*"],
29
- )
30
 
31
- # --- Google Gemini Configuration ---
32
- # This will be automatically loaded from environment variables
33
- GEMINI_API_KEY = os. getenv("GEMINI_API_KEY", "")
 
 
 
 
 
 
34
 
35
  gemini_model = None
36
 
37
 
38
  def get_gemini_model():
39
- """Get or create Gemini model instance."""
40
  global gemini_model
41
 
42
- if not GEMINI_AVAILABLE:
43
- print("Gemini SDK not available")
44
  return None
45
 
46
  if gemini_model is None:
47
- # Check if API key is configured via environment variables
48
- if not GEMINI_API_KEY:
49
- print("Warning: Gemini API key not found in environment variables.")
50
- print("Please configure GEMINI_API_KEY in your environment variables.")
51
- return None
52
-
53
- try:
54
- genai.configure(api_key=GEMINI_API_KEY)
55
- gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
56
- print("✓ Google Gemini Flash 2.0 initialized")
57
- except Exception as e:
58
- print(f"Failed to initialize Gemini model: {e}")
59
- return None
60
 
61
  return gemini_model
62
 
63
 
64
- # --- Regex patterns for text-based PDF extraction ---
65
- INVOICE_NO_RE = re.compile(
66
- r"""
67
- (?:
68
- Invoice\s*No\.?|
69
- Inv\.?\s*No\.?|
70
- Bill\s*No\.?|
71
- Document\s*No\.?| # ✅ ADD THIS
72
- Doc\s*No\.?|
73
- Tax\s*Invoice\s*No\.?
74
- )
75
- \s*[:\-]?\s*
76
- ([A-Z0-9][A-Z0-9\-\/]{3,})
77
- """,
78
- re.IGNORECASE | re.VERBOSE
79
- )
80
-
81
- PREFIXED_INVOICE_RE = re.compile(
82
- r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
83
- )
84
-
85
- GST_LIKE_RE = re.compile(
86
- r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
87
-
88
-
89
- def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
90
  """
91
- Detect if PDF is image-based or text-based by sampling pages.
92
- Returns (is_image_based, avg_text_length).
93
-
94
- Strategy:
95
- - Sample first few pages
96
- - If average extractable text < 50 chars per page, it's likely image-based
97
- - If text > 200 chars per page, it's text-based
98
  """
99
- total_text_length = 0
100
- pages_to_check = min(sample_pages, doc.page_count)
101
-
102
- for i in range(pages_to_check):
103
- text = doc. load_page(i).get_text("text") or ""
104
- total_text_length += len(text. strip())
105
 
106
- avg_text_length = total_text_length / pages_to_check
107
- is_image_based = avg_text_length < 50
 
 
 
108
 
109
- print(
110
- f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
111
- print(
112
- f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
113
 
114
- return is_image_based, avg_text_length
 
115
 
116
 
117
  # ============================================================================
118
- # TEXT-BASED PDF EXTRACTION (Original Code)
119
  # ============================================================================
120
 
 
121
 
122
- def normalize_text_for_search(s: str) -> str:
123
- """Light normalization: collapse whitespace and normalize common separators."""
124
- if not s:
125
- return s
126
- s = s.replace("\u00A0", " ") # non-breaking space
127
- s = re.sub(r"[\r\n\t]+", " ", s)
128
- s = re.sub(r"[ ]{2,}", " ", s).strip()
129
- return s
130
 
 
 
 
131
 
132
- def try_extract_invoice_from_text(text: str) -> Optional[str]:
133
- """
134
- Extract invoice number from text using regex patterns.
135
- - Prefer explicit labeled Invoice/Bill patterns.
136
- - Prefer prefixed invoice formats found in the top of the page.
137
- - Use GST only as a last resort and tag it so it won't be mistaken for an invoice id.
138
- """
139
- if not text:
140
- return None
141
 
142
- text_norm = normalize_text_for_search(text)
143
 
144
- # 1) Labeled invoice like "Invoice No", "Inv No."
145
- m = INVOICE_NO_RE.search(text_norm)
146
- if m:
147
- inv = (m.group(1) or "").strip()
148
- if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
149
- return inv
150
 
151
- # 2) Search top portion for prefixed invoice codes (WN-1234, 5EN19710, etc.)
152
- top_text = text_norm[:600] # bigger top area to be robust
153
- m = PREFIXED_INVOICE_RE.search(top_text)
154
- if m:
155
- inv = (m.group(1) or "").strip()
156
- # extra length check so tiny numeric matches don't pass
157
- if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
158
- return inv
159
-
160
- # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
161
- gm = GST_LIKE_RE.search(text_norm)
162
- if gm:
163
- gst_val = gm.group(2) or ""
164
- gst_val = gst_val.replace(" ", "").strip().upper()
165
- # Only accept if 15 alnum chars (typical Indian GSTIN length)
166
- if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
167
- # tag it so grouping won't treat GST same as invoice ID
168
- return f"GST:{gst_val}"
169
 
170
- return None
171
 
 
 
 
172
 
173
- def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
174
- """
175
- Extract invoice number from TEXT-BASED PDF.
176
- Uses the original fast text extraction method.
177
- """
178
- # Try full-page text
179
  text = page.get_text("text") or ""
180
- inv = try_extract_invoice_from_text(text)
181
- if inv:
182
- return inv
183
-
184
- # Try block-level text
185
- for block in (page.get_text("blocks") or []):
186
- block_text = block[4] if len(block) > 4 else ""
187
- if block_text:
188
- inv = try_extract_invoice_from_text(block_text)
189
- if inv:
190
- return inv
191
 
192
  return None
193
 
194
 
195
  # ============================================================================
196
- # IMAGE-BASED PDF EXTRACTION (Google Gemini)
197
  # ============================================================================
198
 
199
  def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
200
- """
201
- Extract invoice number from IMAGE-BASED PDF using Google Gemini Flash 2.0.
202
- """
203
- model = get_gemini_model()
204
- if not model:
205
- print(" Gemini model not available")
206
- return None
207
-
208
  try:
209
- # Convert page to image
210
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x resolution
211
- img_bytes = pix.tobytes("png")
212
-
213
- # Convert to PIL Image for Gemini
214
- img = Image.open(io.BytesIO(img_bytes))
215
 
216
- # Prompt for Gemini to extract invoice number
217
  prompt = """
218
- Extract the invoice number from this image. Look for:
219
- - Invoice No, Invoice Number, Bill No, Bill Number
220
- - Any alphanumeric code that appears to be an invoice identifier
221
- - Purchase Order numbers if no invoice number is found
222
-
223
- Return ONLY the invoice number/identifier itself, nothing else.
224
- If no invoice number is found, return "NOT_FOUND".
 
 
 
225
  """
226
 
227
- print(" Calling Google Gemini API...")
228
- response = model.generate_content([prompt, img])
229
 
230
  if response and response.text:
231
- extracted_text = response.text.strip()
232
- print(f" Gemini response: {extracted_text}")
233
-
234
- if extracted_text and extracted_text != "NOT_FOUND":
235
- # Clean up the response
236
- invoice_no = extracted_text.replace(
237
- "*", "").replace("#", "").strip()
238
- if invoice_no and len(invoice_no) > 2:
239
- print(f" ✓ Gemini found invoice: {invoice_no}")
240
- return invoice_no
241
-
242
- # Fallback: Get full OCR text and try regex
243
- ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
244
- ocr_response = model.generate_content([ocr_prompt, img])
245
-
246
- if ocr_response and ocr_response.text:
247
- print(
248
- f" Gemini extracted {len(ocr_response.text)} chars, trying regex...")
249
- inv = try_extract_invoice_from_text(ocr_response.text)
250
- if inv:
251
- print(f" ✓ Found via regex on Gemini text: {inv}")
252
- return inv
253
-
254
- print(" ✗ Gemini: No invoice found")
255
- return None
256
 
257
  except Exception as e:
258
- print(f"Gemini extraction failed: {e}")
259
- return None
 
260
 
261
 
262
  # ============================================================================
263
- # UNIFIED EXTRACTION LOGIC
264
  # ============================================================================
265
 
266
- def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
267
- """Try text extraction first, then Gemini as fallback"""
268
-
269
- # ALWAYS try text extraction first (fast, no API cost)
270
- text_result = extract_invoice_text_based(page)
271
- if text_result:
272
- print(f" ✓ Found via text extraction: {text_result}")
273
- return text_result
274
 
275
- # If text fails AND PDF seems image-based, try Gemini
276
  if is_image_pdf:
277
- gemini_result = extract_invoice_gemini(page)
278
- if gemini_result:
279
- print(f" ✓ Found via Gemini: {gemini_result}")
280
- return gemini_result
281
 
282
  return None
283
 
284
 
285
- def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
286
- """Create a new PDF with the given pages (0-based indices)."""
 
 
 
287
  out = fitz.open()
288
- for i in page_indices:
289
- out.insert_pdf(src_doc, from_page=i, to_page=i)
290
- pdf_bytes = out.tobytes()
291
  out.close()
292
- return pdf_bytes
293
 
294
 
295
  # ============================================================================
@@ -299,176 +195,65 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
299
  @app.post("/split-invoices")
300
  async def split_invoices(
301
  file: UploadFile = File(...),
302
- include_pdf: bool = Form(True),
303
- initial_dpi: int = Form(300), # Kept for compatibility
304
  ):
305
- """
306
- Split a multi-invoice PDF into separate PDFs based on invoice numbers.
307
 
308
- - Text-based PDFs: Uses fast text extraction
309
- - Image-based PDFs: Uses Google Gemini Flash 2.0 (if configured)
310
 
311
- Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
312
- are ignored for splitting by default (so repeated company GST won't prevent splits).
313
- """
314
- if not file.filename.lower().endswith(".pdf"):
315
- raise HTTPException(status_code=400, detail="only PDF is supported")
316
 
317
- file_bytes = await file.read()
318
- if not file_bytes:
319
- raise HTTPException(status_code=400, detail="empty file")
320
 
321
- try:
322
- doc = fitz.open(stream=file_bytes, filetype="pdf")
323
- if doc.page_count == 0:
324
- raise HTTPException(status_code=400, detail="no pages found")
325
-
326
- print(f"\n{'='*60}")
327
- print(f"Processing PDF: {file.filename}")
328
- print(f"Total pages: {doc.page_count}")
329
- print(f"{'='*60}")
330
-
331
- # Step 1: Detect PDF type (text-based vs image-based)
332
- is_image_pdf, avg_text_len = is_image_based_pdf(doc)
333
-
334
- if is_image_pdf and not get_gemini_model():
335
- raise HTTPException(
336
- status_code=500,
337
- detail="Image-based PDF detected but Google Gemini is not configured. "
338
- "Please add GEMINI_API_KEY to your environment variables."
339
- )
340
-
341
- # Step 2: Extract invoice numbers from each page
342
- page_invoice_nos: List[Optional[str]] = []
343
  for i in range(doc.page_count):
344
- print(f"\n--- Page {i+1}/{doc.page_count} ---")
345
- inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
346
- # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
347
- if inv:
348
- print(f" ✓ Raw extracted id: {inv}")
349
- else:
350
- print(f" ✗ No invoice found (raw)")
351
- page_invoice_nos.append(inv)
352
-
353
- print(f"\n{'='*60}")
354
- print(f"Raw Extraction Results: {page_invoice_nos}")
355
- print(f"{'='*60}")
356
-
357
- # ---------------------------------------------------------
358
- # Post-process extracted ids before grouping
359
- # - Treat GST:<value> as a LAST-RESORT marker and ignore it for splitting
360
- # (convert to None) so repeated company GST doesn't group pages together.
361
- # - Keep actual invoice ids like '5EN19710' intact.
362
- # ---------------------------------------------------------
363
- page_invoice_nos_filtered: List[Optional[str]] = []
364
- for v in page_invoice_nos:
365
- if v is None:
366
- page_invoice_nos_filtered.append(None)
367
- else:
368
- # If GST-tagged value (we returned "GST:..."), ignore it for splitting
369
- if isinstance(v, str) and v.upper().startswith("GST:"):
370
- page_invoice_nos_filtered.append(None)
371
- else:
372
- page_invoice_nos_filtered.append(v)
373
-
374
- print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
375
-
376
- # Step 3: Group pages by invoice number (use filtered ids)
377
- groups: List[Dict] = []
378
- current_group_pages: List[int] = []
379
- current_invoice: Optional[str] = None
380
-
381
- for idx, inv in enumerate(page_invoice_nos_filtered):
382
- if current_invoice is None:
383
- # Start a new group (even if inv is None)
384
- current_invoice = inv
385
- current_group_pages = [idx]
386
- else:
387
- # If a new non-empty invoice appears and differs -> close current group
388
- if inv is not None and inv != current_invoice:
389
- groups.append({
390
- "invoice_no": current_invoice,
391
- "pages": current_group_pages[:],
392
- })
393
- current_invoice = inv
394
- current_group_pages = [idx]
395
- else:
396
- # Continue current group (same invoice or both None)
397
- current_group_pages.append(idx)
398
-
399
- # Save last group
400
- if current_group_pages:
401
- groups.append({
402
- "invoice_no": current_invoice,
403
- "pages": current_group_pages[:]
404
- })
405
-
406
- # Post-process groups:
407
- # If first group has invoice_no None and next group has non-None -> merge leading None
408
- if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
409
- groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
410
- groups.pop(0)
411
-
412
- # If, after filtering, all groups are None (no invoice detected), return whole doc as one part
413
- if all(g["invoice_no"] is None for g in groups):
414
- print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
415
- print(" Returning entire PDF as single part")
416
- groups = [{
417
- "invoice_no": None,
418
- "pages": list(range(doc.page_count))
419
- }]
420
-
421
- # Step 4: Build response parts
422
- parts = []
423
- for idx, g in enumerate(groups):
424
- part_bytes = build_pdf_from_pages(doc, g["pages"])
425
- info = {
426
- # Keep invoice_no as detected in filtered set (None or actual invoice id)
427
- "invoice_no": g["invoice_no"],
428
- "pages": [p + 1 for p in g["pages"]], # 1-based for humans
429
- "num_pages": len(g["pages"]),
430
- "size_bytes": len(part_bytes),
431
- }
432
- if include_pdf:
433
- info["pdf_base64"] = base64.b64encode(
434
- part_bytes).decode("ascii")
435
- parts.append(info)
436
- print(f"\nPart {idx+1}:")
437
- print(f" Invoice: {g['invoice_no']}")
438
- print(f" Pages: {info['pages']}")
439
- print(f" Size: {len(part_bytes):,} bytes")
440
-
441
- doc.close()
442
-
443
- print(f"\n{'='*60}")
444
- print(f"✓ Successfully split into {len(parts)} part(s)")
445
- print(f"{'='*60}\n")
446
-
447
- return JSONResponse({
448
- "count": len(parts),
449
- "pdf_type": "image-based" if is_image_pdf else "text-based",
450
- "parts": parts
451
- })
452
-
453
- except HTTPException:
454
- raise
455
- except Exception as e:
456
- print(f"\n✗ Error: {str(e)}")
457
- import traceback
458
- traceback.print_exc()
459
- return JSONResponse({"error": str(e)}, status_code=500)
460
 
461
 
462
  @app.get("/health")
463
- async def health_check():
464
- """Health check endpoint to verify Gemini configuration."""
465
- gemini_status = "configured" if get_gemini_model() else "not configured"
466
  return {
467
- "status": "healthy",
468
- "gemini_flash": gemini_status,
469
- "gemini_available": GEMINI_AVAILABLE,
470
  }
471
-
472
- if __name__ == "__main__":
473
- import uvicorn
474
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  import io
3
  import re
4
  import base64
5
+ import time
6
+ import threading
7
  from typing import List, Dict, Optional, Tuple
8
+ from concurrent.futures import ThreadPoolExecutor
9
 
10
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import JSONResponse
13
  import fitz # PyMuPDF
14
 
15
+ # Gemini
16
+ import google.generativeai as genai
17
+ from PIL import Image
 
 
 
 
 
18
 
19
+ # ============================================================================
20
+ # CONFIG
21
+ # ============================================================================
22
 
23
# Upper bound on Gemini requests allowed in flight at the same time.
MAX_GEMINI_CONCURRENT_CALLS = 2
# Minimum spacing, in seconds, enforced between consecutive Gemini requests.
GEMINI_MIN_INTERVAL_SEC = 1.2

# Read once at import time; an empty string means "not configured".
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# ============================================================================
# THREAD & RATE LIMIT MANAGEMENT
# ============================================================================

# Timestamp of the most recent Gemini request (0.0 until the first call).
last_gemini_call_time = 0.0
# Caps the number of simultaneous Gemini requests.
gemini_semaphore = threading.Semaphore(MAX_GEMINI_CONCURRENT_CALLS)
# Guards the rate-limit bookkeeping shared between worker threads.
gemini_lock = threading.Lock()

# Lazily created model handle; populated by get_gemini_model().
gemini_model = None
37
 
38
 
39
def get_gemini_model():
    """
    Return the shared Gemini model, creating it on first use.

    Returns None when GEMINI_API_KEY is empty. Otherwise the SDK is
    configured with that key, the model is built once, cached in the
    module-level ``gemini_model`` and reused on later calls.
    """
    global gemini_model

    if not GEMINI_API_KEY:
        return None

    # Fast path: the model was already built by an earlier call.
    if gemini_model is not None:
        return gemini_model

    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel(
        model_name="models/gemini-2.5-flash-image"
    )
    print("✓ Gemini 2.5 Flash Image initialized")
    return gemini_model
53
 
54
 
55
def rate_limited_gemini_call(prompt, img):
    """
    Send one prompt + image request to Gemini, throttled across threads.

    Two limits apply:
      * ``gemini_semaphore`` caps how many requests are in flight at once
        (MAX_GEMINI_CONCURRENT_CALLS).
      * ``gemini_lock`` serialises the dispatch bookkeeping so that two
        requests never *start* less than GEMINI_MIN_INTERVAL_SEC apart.

    Args:
        prompt: Text instruction passed to the model.
        img: Image object passed alongside the prompt.

    Returns:
        Whatever ``model.generate_content([prompt, img])`` returns.

    Raises:
        RuntimeError: if no Gemini model is available (API key missing).
        Any exception raised by the Gemini SDK call itself.
    """
    global last_gemini_call_time

    with gemini_semaphore:
        with gemini_lock:
            # Resolve the model under the lock so lazy initialisation cannot
            # run in two worker threads at the same time.
            model = get_gemini_model()
            if model is None:
                raise RuntimeError(
                    "Gemini is not configured (GEMINI_API_KEY is missing)"
                )

            # Monotonic clock: immune to wall-clock adjustments.
            wait = GEMINI_MIN_INTERVAL_SEC - (
                time.monotonic() - last_gemini_call_time
            )
            if wait > 0:
                # Sleeping while holding the lock keeps later callers queued
                # behind this one, preserving the spacing between dispatches.
                time.sleep(wait)

            # Record the dispatch time *before* releasing the lock; the next
            # thread then measures its interval from this request's start
            # rather than from a stale timestamp.
            last_gemini_call_time = time.monotonic()

        # The network call runs outside the lock so that up to
        # MAX_GEMINI_CONCURRENT_CALLS requests can overlap.
        return model.generate_content([prompt, img])
72
 
73
 
74
  # ============================================================================
75
+ # FASTAPI
76
  # ============================================================================
77
 
78
# ASGI application object; route decorators further down attach to it.
app = FastAPI(title="Invoice Splitter API")

# CORS is left fully open: any origin, HTTP method and request header.
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)
 
 
86
 
87
+ # ============================================================================
88
+ # REGEX
89
+ # ============================================================================
90
 
91
# Labelled invoice number, e.g. "Invoice No: INV-1234".
#   group(1) -> the label that matched
#   group(2) -> the identifier
# The identifier must start with a letter or digit and contain at least one
# digit (enforced by the lookahead). Without that, IGNORECASE lets the capture
# swallow the next word ("Invoice No. Date" -> "Date") or a run of separators
# ("----").
INVOICE_NO_RE = re.compile(
    r"(Invoice\s*No\.?|Inv\.?\s*No\.?|Bill\s*No\.?|Tax\s*Invoice\s*No\.?)"
    r"\s*[:\-]?\s*"
    r"((?=[A-Z0-9\-\/]*\d)[A-Z0-9][A-Z0-9\-\/]{3,})",
    re.IGNORECASE,
)

# Unlabelled fallback: 2-5 capital letters, "-" or "/", then 4+ digits
# (e.g. "WN-12345"). Case-sensitive on purpose.
PREFIXED_RE = re.compile(r"\b[A-Z]{2,5}[-/]\d{4,}\b")
97
 
98
+ # ============================================================================
99
+ # PDF TYPE DETECTION
100
+ # ============================================================================
 
 
 
101
 
102
def is_image_based_pdf(doc: "fitz.Document", sample=3):
    """
    Guess whether a PDF is image-based (scanned) rather than text-based.

    The first ``sample`` pages (or fewer, if the document is shorter) are
    read and the average number of extractable, non-whitespace-trimmed
    characters per *inspected* page is compared against a threshold of 50.

    Args:
        doc: Document exposing ``page_count`` and ``load_page(i)``.
        sample: Maximum number of leading pages to inspect.

    Returns:
        True when the average is below 50 characters per inspected page, or
        when there is no page to inspect; False otherwise.
    """
    pages_checked = min(sample, doc.page_count)
    if pages_checked <= 0:
        # Nothing to inspect means no extractable text was found.
        return True

    total = sum(
        # strip() so pages holding only whitespace do not count as text.
        len((doc.load_page(i).get_text("text") or "").strip())
        for i in range(pages_checked)
    )

    # Divide by the pages actually read, not by `sample`: otherwise a short
    # text document has its average diluted and is treated as a scan.
    return total / pages_checked < 50
 
 
 
 
 
 
 
 
 
 
 
 
108
 
 
109
 
110
+ # ============================================================================
111
+ # TEXT EXTRACTION
112
+ # ============================================================================
113
 
114
def extract_text_invoice(page: fitz.Page) -> Optional[str]:
    """
    Look for an invoice number in the page's embedded text.

    A labelled match from INVOICE_NO_RE anywhere on the page wins. Failing
    that, the first 500 characters are scanned with PREFIXED_RE. Returns the
    identifier, or None when neither pattern matches.
    """
    page_text = page.get_text("text") or ""

    labelled = INVOICE_NO_RE.search(page_text)
    if labelled is not None:
        # group(2) is the identifier; group(1) is the label that preceded it.
        return labelled.group(2).strip()

    # Only the top of the page is considered for unlabelled codes.
    prefixed = PREFIXED_RE.search(page_text[:500])
    return prefixed.group(0) if prefixed is not None else None
126
 
127
 
128
  # ============================================================================
129
+ # GEMINI IMAGE EXTRACTION
130
  # ============================================================================
131
 
132
def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
    """
    Ask Gemini to read the invoice number from a rendered page image.

    The page is rasterised at 2x scale, sent with a fixed prompt through
    rate_limited_gemini_call(), and the textual reply is cleaned up.

    Returns:
        The identifier reported by the model, or None when the model answers
        NOT_FOUND (in any casing or decoration), returns nothing usable, or
        any step fails. Failures are printed and swallowed: extraction is
        best-effort.
    """
    try:
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        prompt = """
        Extract the invoice number from this invoice image.

        Look for:
        - Invoice No
        - Bill No
        - Tax Invoice No
        - Purchase Order No (only if invoice not present)

        Return ONLY the identifier.
        If nothing is found, return NOT_FOUND.
        """

        response = rate_limited_gemini_call(prompt, img)

        if response and response.text:
            # Drop markdown emphasis, backticks, quotes and '#' that may wrap
            # the answer, then surrounding whitespace.
            val = response.text.strip().strip("`*\"'#").strip()
            # Case-insensitive containment so "not_found", "NOT_FOUND." and
            # similar variants are never returned as an invoice id.
            if len(val) > 2 and "NOT_FOUND" not in val.upper():
                return val

    except Exception as e:
        # Deliberate best-effort: report and fall through to None.
        print("Gemini failed:", e)

    return None
161
 
162
 
163
  # ============================================================================
164
+ # UNIFIED PAGE EXTRACTION
165
  # ============================================================================
166
 
167
def extract_invoice(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """
    Resolve a page's invoice number: embedded text first, Gemini second.

    Text extraction is always tried. Gemini is consulted only when text
    extraction yields nothing and the document was classified as
    image-based; otherwise the result is None.
    """
    return extract_text_invoice(page) or (
        extract_invoice_gemini(page) if is_image_pdf else None
    )
176
 
177
 
178
+ # ============================================================================
179
+ # PDF BUILDER
180
+ # ============================================================================
181
+
182
def build_pdf(doc, pages):
    """
    Build a new PDF containing the given pages of ``doc``.

    Args:
        doc: Source document to copy pages from.
        pages: Iterable of 0-based page indices, copied in the given order.

    Returns:
        The serialised PDF as bytes.
    """
    out = fitz.open()
    try:
        for p in pages:
            out.insert_pdf(doc, from_page=p, to_page=p)
        return out.tobytes()
    finally:
        # Close the scratch document even if copying or serialising fails.
        out.close()
189
 
190
 
191
  # ============================================================================
 
195
@app.post("/split-invoices")
async def split_invoices(
    file: UploadFile = File(...),
    include_pdf: bool = Form(True)
):
    """
    Split an uploaded multi-invoice PDF into one part per invoice number.

    Each page's invoice number is extracted (text first, Gemini for
    image-based documents). Consecutive pages are grouped: a new group starts
    whenever a page yields a non-None id that differs from the current
    group's id; pages with no id stay in the current group.

    Args:
        file: Uploaded file; its name must end in ".pdf".
        include_pdf: When true, every part carries its PDF as base64.

    Returns:
        Dict with ``count``, ``pdf_type`` ("image-based" / "text-based") and
        ``parts`` (``invoice_no``, 1-based ``pages``, ``num_pages`` and,
        optionally, ``pdf_base64``).

    Raises:
        HTTPException(400): non-PDF filename, empty upload, unreadable PDF,
            or a PDF without pages.
    """
    # filename may be absent on the upload; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(400, "Only PDF allowed")

    data = await file.read()
    if not data:
        raise HTTPException(400, "Empty file")

    try:
        doc = fitz.open(stream=data, filetype="pdf")
    except Exception as e:
        # A corrupt upload is a client error, not a server fault.
        raise HTTPException(400, f"Could not open PDF: {e}") from e

    try:
        if doc.page_count == 0:
            # Guards the invoice_ids[0] access below.
            raise HTTPException(400, "PDF has no pages")

        is_image_pdf = is_image_based_pdf(doc)
        print("PDF Type:", "IMAGE" if is_image_pdf else "TEXT")

        # NOTE(review): pages of one PyMuPDF document are handed to several
        # worker threads here. PyMuPDF documents multithreaded use as
        # unsupported; confirm this is safe or move text extraction and page
        # rendering onto a single thread.
        # NOTE(review): waiting on the futures below blocks the event loop
        # for the duration of the extraction.
        with ThreadPoolExecutor(max_workers=MAX_GEMINI_CONCURRENT_CALLS) as executor:
            futures = [
                executor.submit(extract_invoice, doc.load_page(i), is_image_pdf)
                for i in range(doc.page_count)
            ]
            # Collect in submission order so index i corresponds to page i.
            invoice_ids = [f.result() for f in futures]

        # Group pages
        groups = []
        current_inv = invoice_ids[0]
        current_pages = [0]

        for i, inv in enumerate(invoice_ids[1:], start=1):
            if inv is not None and inv != current_inv:
                groups.append((current_inv, current_pages))
                current_inv, current_pages = inv, [i]
            else:
                current_pages.append(i)

        groups.append((current_inv, current_pages))

        parts = []
        for inv, pages in groups:
            part = {
                "invoice_no": inv,
                "pages": [p + 1 for p in pages],  # 1-based for the client
                "num_pages": len(pages),
            }
            if include_pdf:
                # Build the PDF only when it is actually returned.
                part["pdf_base64"] = base64.b64encode(
                    build_pdf(doc, pages)
                ).decode()
            parts.append(part)

        return {
            "count": len(parts),
            "pdf_type": "image-based" if is_image_pdf else "text-based",
            "parts": parts
        }
    finally:
        # Release the source document on every exit path.
        doc.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
 
254
@app.get("/health")
def health():
    """Liveness probe; also reports whether a Gemini API key is set."""
    gemini_state = "configured" if GEMINI_API_KEY else "missing"
    return {"status": "ok", "gemini": gemini_state}