anujakkulkarni committed on
Commit
63e2ea5
·
verified ·
1 Parent(s): 8b3c611

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +376 -162
app.py CHANGED
@@ -2,192 +2,295 @@ import os
2
  import io
3
  import re
4
  import base64
5
- import time
6
- import threading
7
  from typing import List, Dict, Optional, Tuple
8
- from concurrent.futures import ThreadPoolExecutor
9
 
10
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import JSONResponse
13
  import fitz # PyMuPDF
14
 
15
- # Gemini
16
- import google.generativeai as genai
17
- from PIL import Image
 
 
 
 
 
18
 
19
# ============================================================================
# CONFIG
# ============================================================================

# API key is read once from the environment; an empty string means "not set".
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

MAX_GEMINI_CONCURRENT_CALLS = 2  # hard cap on simultaneous Gemini requests
GEMINI_MIN_INTERVAL_SEC = 1.2  # minimum spacing between Gemini requests (seconds)

# ============================================================================
# THREAD & RATE LIMIT MANAGEMENT
# ============================================================================

gemini_semaphore = threading.Semaphore(MAX_GEMINI_CONCURRENT_CALLS)
gemini_lock = threading.Lock()
last_gemini_call_time = 0.0  # updated by rate_limited_gemini_call()

gemini_model = None  # created on first use by get_gemini_model()
37
 
38
 
def get_gemini_model():
    """Return the shared Gemini model, creating it on the first call.

    Returns:
        The cached ``genai.GenerativeModel`` instance, or ``None`` when
        ``GEMINI_API_KEY`` is empty.
    """
    global gemini_model

    if not GEMINI_API_KEY:
        return None

    if gemini_model is not None:
        return gemini_model

    # First use: configure the SDK and cache the model for later calls.
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel(
        model_name="models/gemini-2.5-flash-image"
    )
    print("✓ Gemini 2.5 Flash Image initialized")
    return gemini_model
53
 
54
 
55
def rate_limited_gemini_call(prompt, img):
    """
    Thread-safe + rate-limited Gemini call.

    At most ``MAX_GEMINI_CONCURRENT_CALLS`` requests are in flight (semaphore),
    and consecutive requests are started at least ``GEMINI_MIN_INTERVAL_SEC``
    seconds apart (lock-protected timestamp).

    Args:
        prompt: Text prompt sent to the model.
        img: Image object sent alongside the prompt.

    Returns:
        The response object returned by ``model.generate_content``.

    Raises:
        RuntimeError: If no Gemini model is available (``get_gemini_model()``
            returned ``None``), instead of failing later with an
            ``AttributeError`` on ``None``.
    """
    global last_gemini_call_time

    with gemini_semaphore:
        # The lock only guards model creation and the "reserve a time slot"
        # step; the slow network call itself runs outside of it so the
        # semaphore's concurrency allowance is actually usable.
        with gemini_lock:
            model = get_gemini_model()
            if model is None:
                raise RuntimeError("Gemini model is not configured")

            wait = GEMINI_MIN_INTERVAL_SEC - (time.time() - last_gemini_call_time)
            if wait > 0:
                time.sleep(wait)
            last_gemini_call_time = time.time()

        return model.generate_content([prompt, img])
72
 
 
 
 
 
73
 
74
# ============================================================================
# FASTAPI
# ============================================================================

app = FastAPI(title="Invoice Splitter API")

# CORS: requests from any origin, with any method and any header, are accepted.
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)
86
 
87
# ============================================================================
# REGEX
# ============================================================================

# Labelled invoice number, e.g. "Invoice No: AB-1234" or "Invoice Number: INV-2024".
#   group(1) -> the label that matched
#   group(2) -> the identifier
# The look-ahead requires at least one digit in the identifier, so the tail of
# a word ("mber" from "Invoice Number") or a following label such as "Date" is
# never captured as the invoice number.
INVOICE_NO_RE = re.compile(
    r"((?:Tax\s*Invoice|Invoice|Inv\.?|Bill)\s*(?:Number|No\.?))"
    r"\s*[:\-]?\s*"
    r"(?=[A-Z0-9\-\/]*\d)"
    r"([A-Z0-9\-\/]{4,})",
    re.IGNORECASE,
)

# Unlabelled fallback: 2-5 capital letters, "-" or "/", then 4+ digits (e.g. "WN-12345").
PREFIXED_RE = re.compile(r"\b[A-Z]{2,5}[-/]\d{4,}\b")
97
 
98
  # ============================================================================
99
- # PDF TYPE DETECTION
100
  # ============================================================================
101
 
102
def is_image_based_pdf(doc: "fitz.Document", sample=3):
    """Return True when the first pages of *doc* hold almost no extractable text.

    Up to ``sample`` leading pages are read with ``get_text("text")``; the PDF
    is treated as image-based when the average text length per page read is
    below 50 characters.

    The average is taken over the pages actually read, not over ``sample``:
    dividing by ``sample`` made a one-page text PDF with 120 characters look
    like an image PDF (120 / 3 = 40).

    Args:
        doc: Open PyMuPDF document (only ``page_count`` and ``load_page`` are used).
        sample: Maximum number of leading pages to inspect.

    Returns:
        bool: True for image-based, False for text-based. A document with no
        pages to sample is reported as image-based (no text was found).
    """
    pages_checked = min(sample, doc.page_count)
    if pages_checked <= 0:
        return True

    total = sum(
        len(doc.load_page(i).get_text("text") or "")
        for i in range(pages_checked)
    )
    return total / pages_checked < 50
108
 
 
 
 
 
 
 
 
 
109
 
110
- # ============================================================================
111
- # TEXT EXTRACTION
112
- # ============================================================================
113
 
114
def extract_text_invoice(page: fitz.Page) -> Optional[str]:
    """Find an invoice number in the page's embedded text.

    A labelled match (``INVOICE_NO_RE``) anywhere on the page wins; otherwise
    the first 500 characters are searched for a prefixed code (``PREFIXED_RE``).

    Returns:
        The identifier, or ``None`` when neither pattern matches.
    """
    page_text = page.get_text("text") or ""

    labelled = INVOICE_NO_RE.search(page_text)
    if labelled is not None:
        return labelled.group(2).strip()

    prefixed = PREFIXED_RE.search(page_text[:500])
    if prefixed is not None:
        return prefixed.group(0)

    return None
126
 
127
 
128
  # ============================================================================
129
- # GEMINI IMAGE EXTRACTION
130
  # ============================================================================
131
 
def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
    """Ask Gemini to read the invoice number from a rendered image of *page*.

    The page is rasterised at 2x scale, sent through
    ``rate_limited_gemini_call`` and the stripped reply is returned unless it
    is ``NOT_FOUND`` or has fewer than three characters. Any exception is
    printed and results in ``None``.
    """
    prompt = """
        Extract the invoice number from this invoice image,
        also consider for reading an Indian GST e-Invoice.

        Look for:
        - Invoice No
        - Bill No
        - Tax Invoice No
        - Document No
        - Purchase Order No (only if invoice not present)

        Return ONLY the identifier.
        If nothing is found, return NOT_FOUND.
        """

    try:
        pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        image = Image.open(io.BytesIO(pixmap.tobytes("png")))

        response = rate_limited_gemini_call(prompt, image)

        answer = response.text.strip() if response and response.text else ""
        if answer != "NOT_FOUND" and len(answer) > 2:
            return answer

    except Exception as e:
        print("Gemini failed:", e)

    return None
163
 
164
 
165
  # ============================================================================
166
- # UNIFIED PAGE EXTRACTION
167
  # ============================================================================
168
 
169
def extract_invoice(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """Return the invoice number for *page*, preferring embedded text.

    Text extraction is always tried first; Gemini is consulted only when that
    fails and the PDF was classified as image-based.
    """
    from_text = extract_text_invoice(page)
    if from_text:
        return from_text

    if not is_image_pdf:
        return None

    return extract_invoice_gemini(page)
178
 
179
 
180
- # ============================================================================
181
- # PDF BUILDER
182
- # ============================================================================
183
-
184
def build_pdf(doc, pages):
    """Return the bytes of a new PDF holding the given pages of *doc*.

    Args:
        doc: Source PyMuPDF document.
        pages: Iterable of 0-based page indices, copied in the given order.

    Returns:
        bytes: The serialised PDF.
    """
    out = fitz.open()
    try:
        for p in pages:
            out.insert_pdf(doc, from_page=p, to_page=p)
        return out.tobytes()
    finally:
        # Close the scratch document even when copying or serialising fails.
        out.close()
191
 
192
 
193
  # ============================================================================
@@ -197,65 +300,176 @@ def build_pdf(doc, pages):
@app.post("/split-invoices")
async def split_invoices(
    file: UploadFile = File(...),
    include_pdf: bool = Form(True)
):
    """Split an uploaded multi-invoice PDF into one part per invoice number.

    Each page is assigned an invoice id via ``extract_invoice``; consecutive
    pages are grouped until a different, non-empty id appears.

    Args:
        file: Uploaded PDF.
        include_pdf: When true, every part carries its PDF as base64.

    Returns:
        dict with ``count``, ``pdf_type`` and ``parts`` (each part has
        ``invoice_no``, 1-based ``pages``, ``num_pages`` and optionally
        ``pdf_base64``).

    Raises:
        HTTPException: 400 when the upload is not a ``.pdf``, cannot be opened
            as a PDF, or has no pages.
    """
    # filename may be missing on some uploads; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(400, "Only PDF allowed")

    data = await file.read()
    try:
        doc = fitz.open(stream=data, filetype="pdf")
    except Exception as exc:
        raise HTTPException(400, "Could not open file as PDF") from exc

    try:
        if doc.page_count == 0:
            # Without this guard the grouping below fails on invoice_ids[0].
            raise HTTPException(400, "PDF has no pages")

        is_image_pdf = is_image_based_pdf(doc)
        print("PDF Type:", "IMAGE" if is_image_pdf else "TEXT")

        # Pages are processed one after another in this thread: PyMuPDF
        # documents and pages are not safe to use from several threads.
        invoice_ids = [
            extract_invoice(doc.load_page(i), is_image_pdf)
            for i in range(doc.page_count)
        ]

        # Group pages: a page with no id (None) or the same id continues the
        # current group; a different non-empty id starts a new one.
        groups = []
        current_inv = invoice_ids[0]
        current_pages = [0]
        for i, inv in enumerate(invoice_ids[1:], start=1):
            if inv is not None and inv != current_inv:
                groups.append((current_inv, current_pages))
                current_inv = inv
                current_pages = [i]
            else:
                current_pages.append(i)
        groups.append((current_inv, current_pages))

        parts = []
        for inv, pages in groups:
            pdf_bytes = build_pdf(doc, pages)
            part = {
                "invoice_no": inv,
                "pages": [p + 1 for p in pages],
                "num_pages": len(pages),
            }
            if include_pdf:
                part["pdf_base64"] = base64.b64encode(pdf_bytes).decode()
            parts.append(part)

        return {
            "count": len(parts),
            "pdf_type": "image-based" if is_image_pdf else "text-based",
            "parts": parts
        }
    finally:
        doc.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
 
@app.get("/health")
def health():
    """Report service liveness and whether a Gemini API key is set."""
    gemini_state = "configured" if GEMINI_API_KEY else "missing"
    return {"status": "ok", "gemini": gemini_state}
 
 
 
 
 
2
  import io
3
  import re
4
  import base64
 
 
5
  from typing import List, Dict, Optional, Tuple
 
6
 
7
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from fastapi.responses import JSONResponse
10
  import fitz # PyMuPDF
11
 
12
+ # Google Gemini - optional import
13
+ try:
14
+ import google.generativeai as genai
15
+ from PIL import Image
16
+ GEMINI_AVAILABLE = True
17
+ except ImportError:
18
+ GEMINI_AVAILABLE = False
19
+ print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
20
 
21
app = FastAPI(title="Invoice Splitter API")

# CORS: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- Google Gemini Configuration ---
# The key is read once, at import time, from the environment; it is an empty
# string when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# Created on first use by get_gemini_model().
gemini_model = None
36
 
37
 
def get_gemini_model():
    """Return the cached Gemini model, creating it on first use.

    Returns:
        The model instance, or ``None`` when the SDK could not be imported,
        ``GEMINI_API_KEY`` is empty, or initialisation raised. A message is
        printed in each of those cases.
    """
    global gemini_model

    if not GEMINI_AVAILABLE:
        print("Gemini SDK not available")
        return None

    if gemini_model is not None:
        return gemini_model

    # Nothing cached yet: an API key is required to build the model.
    if not GEMINI_API_KEY:
        print("Warning: Gemini API key not found in environment variables.")
        print("Please configure GEMINI_API_KEY in your environment variables.")
        return None

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
        print("✓ Google Gemini Flash 2.0 initialized")
    except Exception as e:
        print(f"Failed to initialize Gemini model: {e}")
        return None

    return gemini_model
62
 
63
 
64
# --- Regex patterns for text-based PDF extraction ---

# Labelled invoice number, e.g. "Invoice No: 5EN19710", "Doc No 1234" or
# "Invoice Number: INV-2024/001". group(1) is the identifier.
# The look-ahead requires at least one digit in the identifier, so the tail of
# a word ("mber" from "Invoice Number") or a following label such as "Date" is
# never captured as the invoice number.
INVOICE_NO_RE = re.compile(
    r"""
    (?:
        Tax\s*Invoice|
        Invoice|
        Inv\.?|
        Bill|
        Document|
        Doc
    )
    \s*
    (?:Number|No\.?|\#)
    \s*[:\-]?\s*
    (?=[A-Z0-9\-\/]*\d)
    ([A-Z0-9][A-Z0-9\-\/]{3,})
    """,
    re.IGNORECASE | re.VERBOSE
)

# Unlabelled code: 2-4 capital letters, "-" or "/", 4+ digits, optional
# "/digits" and trailing capitals (e.g. "WN-12345"). group(1) is the code.
PREFIXED_INVOICE_RE = re.compile(
    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
)

# GST label followed by a 15-character alphanumeric value.
# group(1) is label plus value, group(2) is the value alone.
GST_LIKE_RE = re.compile(
    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
88
 
 
 
89
 
90
def is_image_based_pdf(doc: "fitz.Document", sample_pages: int = 3) -> Tuple[bool, float]:
    """
    Detect if PDF is image-based or text-based by sampling pages.

    Up to ``sample_pages`` leading pages are read; the PDF is classified as
    image-based when the average stripped text length per sampled page is
    below 50 characters, and as text-based otherwise.

    Args:
        doc: Open PyMuPDF document (only ``page_count`` and ``load_page`` are used).
        sample_pages: Maximum number of leading pages to inspect.

    Returns:
        (is_image_based, avg_text_length). When there is nothing to sample
        (empty document or ``sample_pages <= 0``) the average is ``0.0`` and
        the PDF is reported as image-based, instead of dividing by zero.
    """
    pages_to_check = max(0, min(sample_pages, doc.page_count))

    total_text_length = sum(
        len((doc.load_page(i).get_text("text") or "").strip())
        for i in range(pages_to_check)
    )

    avg_text_length = total_text_length / pages_to_check if pages_to_check else 0.0
    is_image_based = avg_text_length < 50

    print(
        f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
    print(
        f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")

    return is_image_based, avg_text_length
 
 
 
116
 
 
117
 
118
  # ============================================================================
119
+ # TEXT-BASED PDF EXTRACTION (Original Code)
120
  # ============================================================================
121
 
 
 
 
 
 
 
122
 
123
def normalize_text_for_search(s: str) -> str:
    """Flatten text to a single line for regex searching.

    Non-breaking spaces become ordinary spaces, runs of CR/LF/TAB become one
    space, repeated spaces are collapsed and the result is stripped. Empty or
    ``None`` input is returned unchanged.
    """
    if not s:
        return s

    flattened = re.sub(r"[\r\n\t]+", " ", s.replace("\u00A0", " "))
    return re.sub(r"[ ]{2,}", " ", flattened).strip()
131
 
 
 
 
132
 
133
def try_extract_invoice_from_text(text: str) -> Optional[str]:
    """
    Extract invoice number from text using regex patterns.

    Order of preference:
      1. A labelled invoice number (``INVOICE_NO_RE``) anywhere in the text.
      2. A prefixed code (``PREFIXED_INVOICE_RE``) in the first 600 characters.
      3. Last resort: a 15-character GST value, returned tagged as
         ``"GST:<value>"`` so callers can tell it apart from an invoice id.

    Labelled candidates must contain at least one digit. Without that check a
    word fragment (for example "mber" out of "Invoice Number", or a following
    label like "Date") was accepted as the invoice number. All labelled
    matches are considered, not just the first, so a rejected candidate does
    not hide a valid one later in the text.

    Returns:
        The identifier, the tagged GST value, or ``None`` when nothing matches.
    """
    if not text:
        return None

    text_norm = normalize_text_for_search(text)

    # 1) Labeled invoice like "Invoice No", "Inv No."
    for m in INVOICE_NO_RE.finditer(text_norm):
        inv = (m.group(1) or "").strip()
        if (
            len(inv) > 2
            and inv.lower() not in ("invoice", "inv", "bill")
            and any(ch.isdigit() for ch in inv)
        ):
            return inv

    # 2) Search top portion for prefixed invoice codes (e.g. WN-12345)
    m = PREFIXED_INVOICE_RE.search(text_norm[:600])
    if m:
        inv = (m.group(1) or "").strip()
        # extra length check so tiny matches don't pass
        if len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
            return inv

    # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
    gm = GST_LIKE_RE.search(text_norm)
    if gm:
        gst_val = (gm.group(2) or "").replace(" ", "").strip().upper()
        if re.fullmatch(r"[0-9A-Z]{15}", gst_val):
            # tag it so grouping won't treat GST same as invoice ID
            return f"GST:{gst_val}"

    return None
172
+
173
+
174
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
    """
    Extract invoice number from the embedded text of a page.

    The whole-page text is searched first; if that yields nothing, each text
    block returned by ``page.get_text("blocks")`` is searched on its own.

    Returns:
        The first identifier found, or ``None``.
    """
    whole_page = try_extract_invoice_from_text(page.get_text("text") or "")
    if whole_page:
        return whole_page

    for blk in page.get_text("blocks") or []:
        # Entry 4 of a block tuple is its text; skip short tuples and empty text.
        if len(blk) <= 4 or not blk[4]:
            continue
        found = try_extract_invoice_from_text(blk[4])
        if found:
            return found

    return None
194
 
195
 
196
  # ============================================================================
197
+ # IMAGE-BASED PDF EXTRACTION (Google Gemini)
198
  # ============================================================================
199
 
def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
    """
    Extract invoice number from IMAGE-BASED PDF using Google Gemini.

    Steps:
      1. Render the page at 2x scale and ask Gemini for the invoice number.
      2. If that gives nothing usable, ask Gemini for the full page text and
         run ``try_extract_invoice_from_text`` on it.

    The reply is cleaned of markdown markers and surrounding quotes, and the
    "not found" sentinel is compared case-insensitively after cleaning. The
    prompt itself writes the sentinel in quotes, so a reply of ``"NOT_FOUND"``
    (with quotes) was previously returned as if it were an invoice number.

    Returns:
        The identifier, or ``None`` when the model is unavailable, nothing is
        found, or any exception occurs (the exception is printed).
    """
    model = get_gemini_model()
    if not model:
        print(" Gemini model not available")
        return None

    try:
        # Convert page to image
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
        img_bytes = pix.tobytes("png")

        # Convert to PIL Image for Gemini
        img = Image.open(io.BytesIO(img_bytes))

        # Prompt for Gemini to extract invoice number
        prompt = """
        Extract the invoice number from this image. Look for:
        - Invoice No, Invoice Number, Bill No, Bill Number
        - Any alphanumeric code that appears to be an invoice identifier
        - Purchase Order numbers if no invoice number is found

        Return ONLY the invoice number/identifier itself, nothing else.
        If no invoice number is found, return "NOT_FOUND".
        """

        print(" Calling Google Gemini API...")
        response = model.generate_content([prompt, img])

        if response and response.text:
            extracted_text = response.text.strip()
            print(f" Gemini response: {extracted_text}")

            # Clean up the response: drop markdown markers, then any quotes
            # or backticks wrapped around the answer.
            invoice_no = extracted_text.replace("*", "").replace("#", "").strip()
            invoice_no = invoice_no.strip("`\"'").strip()

            is_sentinel = invoice_no.upper().rstrip(".") == "NOT_FOUND"
            if not is_sentinel and len(invoice_no) > 2:
                print(f" ✓ Gemini found invoice: {invoice_no}")
                return invoice_no

        # Fallback: Get full OCR text and try regex
        ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
        ocr_response = model.generate_content([ocr_prompt, img])

        if ocr_response and ocr_response.text:
            print(
                f" Gemini extracted {len(ocr_response.text)} chars, trying regex...")
            inv = try_extract_invoice_from_text(ocr_response.text)
            if inv:
                print(f" ✓ Found via regex on Gemini text: {inv}")
                return inv

        print(" ✗ Gemini: No invoice found")
        return None

    except Exception as e:
        print(f"Gemini extraction failed: {e}")
        return None
 
261
 
262
 
263
  # ============================================================================
264
+ # UNIFIED EXTRACTION LOGIC
265
  # ============================================================================
266
 
267
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """Return the invoice id for *page*: embedded text first, Gemini second.

    Text extraction is always attempted because it is fast and needs no API
    call. Gemini is used only when text extraction finds nothing and the PDF
    was classified as image-based.
    """
    text_result = extract_invoice_text_based(page)
    if text_result:
        print(f" ✓ Found via text extraction: {text_result}")
        return text_result

    if not is_image_pdf:
        return None

    gemini_result = extract_invoice_gemini(page)
    if not gemini_result:
        return None

    print(f" ✓ Found via Gemini: {gemini_result}")
    return gemini_result
284
 
285
 
286
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """Create a new PDF with the given pages (0-based indices).

    Args:
        src_doc: Source PyMuPDF document.
        page_indices: Pages to copy, in output order.

    Returns:
        The serialised PDF as bytes.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Close the scratch document even when copying or serialising fails.
        out.close()
294
 
295
 
296
  # ============================================================================
 
def _group_pages_by_invoice(page_ids: List[Optional[str]]) -> List[Dict]:
    """Group consecutive page indices under the invoice id they belong to.

    A page whose id is ``None`` (or equals the current id) continues the
    current group. A different non-empty id closes the current group and
    starts a new one. Pages that come before the first identified invoice are
    attached to that first invoice rather than being discarded.

    Returns:
        A list of ``{"invoice_no": ..., "pages": [...]}`` dicts in page order;
        one group with ``invoice_no`` ``None`` when no page has an id; an
        empty list for empty input.
    """
    groups: List[Dict] = []
    current_invoice: Optional[str] = None
    current_pages: List[int] = []

    for idx, inv in enumerate(page_ids):
        starts_new_invoice = inv is not None and inv != current_invoice
        if starts_new_invoice:
            if current_invoice is not None:
                groups.append({"invoice_no": current_invoice, "pages": current_pages})
                current_pages = []
            # else: only unidentified leading pages so far; they join this invoice.
            current_invoice = inv
        current_pages.append(idx)

    if current_pages:
        groups.append({"invoice_no": current_invoice, "pages": current_pages})
    return groups


@app.post("/split-invoices")
async def split_invoices(
    file: UploadFile = File(...),
    include_pdf: bool = Form(True),
    initial_dpi: int = Form(300),  # Kept for compatibility
):
    """
    Split a multi-invoice PDF into separate PDFs based on invoice numbers.

    - Text-based PDFs: Uses fast text extraction
    - Image-based PDFs: Uses Google Gemini (if configured)

    Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
    are ignored for splitting (so repeated company GST won't prevent splits).

    Args:
        file: Uploaded PDF.
        include_pdf: When true, every part carries its PDF as base64.
        initial_dpi: Accepted for compatibility; not used by this function.

    Returns:
        JSON with ``count``, ``pdf_type`` and ``parts``; each part has
        ``invoice_no``, 1-based ``pages``, ``num_pages``, ``size_bytes`` and
        optionally ``pdf_base64``. Unexpected errors yield a 500 response
        ``{"error": ...}``.

    Raises:
        HTTPException: 400 for a non-PDF name, an empty file or a PDF without
            pages; 500 when an image-based PDF is uploaded but Gemini is not
            available.
    """
    # filename may be missing on some uploads; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="only PDF is supported")

    file_bytes = await file.read()
    if not file_bytes:
        raise HTTPException(status_code=400, detail="empty file")

    doc = None
    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        if doc.page_count == 0:
            raise HTTPException(status_code=400, detail="no pages found")

        print(f"\n{'='*60}")
        print(f"Processing PDF: {file.filename}")
        print(f"Total pages: {doc.page_count}")
        print(f"{'='*60}")

        # Step 1: Detect PDF type (text-based vs image-based)
        is_image_pdf, avg_text_len = is_image_based_pdf(doc)

        if is_image_pdf and not get_gemini_model():
            raise HTTPException(
                status_code=500,
                detail="Image-based PDF detected but Google Gemini is not configured. "
                "Please add GEMINI_API_KEY to your environment variables."
            )

        # Step 2: Extract invoice numbers from each page
        page_invoice_nos: List[Optional[str]] = []
        for i in range(doc.page_count):
            print(f"\n--- Page {i+1}/{doc.page_count} ---")
            inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
            # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
            if inv:
                print(f" ✓ Raw extracted id: {inv}")
            else:
                print(f" ✗ No invoice found (raw)")
            page_invoice_nos.append(inv)

        print(f"\n{'='*60}")
        print(f"Raw Extraction Results: {page_invoice_nos}")
        print(f"{'='*60}")

        # GST-tagged values are a last-resort marker: drop them (-> None) so a
        # company GST number repeated on every page does not drive the split.
        page_invoice_nos_filtered: List[Optional[str]] = [
            None if isinstance(v, str) and v.upper().startswith("GST:") else v
            for v in page_invoice_nos
        ]

        print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")

        # Step 3: Group pages by invoice number (use filtered ids)
        groups = _group_pages_by_invoice(page_invoice_nos_filtered)

        # No id on any page: the whole document is already a single group.
        if all(g["invoice_no"] is None for g in groups):
            print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
            print(" Returning entire PDF as single part")
            groups = [{
                "invoice_no": None,
                "pages": list(range(doc.page_count))
            }]

        # Step 4: Build response parts
        parts = []
        for idx, g in enumerate(groups):
            part_bytes = build_pdf_from_pages(doc, g["pages"])
            info = {
                # Keep invoice_no as detected in filtered set (None or actual invoice id)
                "invoice_no": g["invoice_no"],
                "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
                "num_pages": len(g["pages"]),
                "size_bytes": len(part_bytes),
            }
            if include_pdf:
                info["pdf_base64"] = base64.b64encode(
                    part_bytes).decode("ascii")
            parts.append(info)
            print(f"\nPart {idx+1}:")
            print(f" Invoice: {g['invoice_no']}")
            print(f" Pages: {info['pages']}")
            print(f" Size: {len(part_bytes):,} bytes")

        print(f"\n{'='*60}")
        print(f"✓ Successfully split into {len(parts)} part(s)")
        print(f"{'='*60}\n")

        return JSONResponse({
            "count": len(parts),
            "pdf_type": "image-based" if is_image_pdf else "text-based",
            "parts": parts
        })

    except HTTPException:
        raise
    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Release the document on every path, including the error ones.
        if doc is not None:
            doc.close()
461
 
462
 
@app.get("/health")
async def health_check():
    """Health check endpoint to verify Gemini configuration.

    Reports whether ``get_gemini_model()`` currently yields a model and
    whether the Gemini SDK import succeeded.
    """
    model_ready = bool(get_gemini_model())
    return {
        "status": "healthy",
        "gemini_flash": "configured" if model_ready else "not configured",
        "gemini_available": GEMINI_AVAILABLE,
    }
472
+
473
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8001.
    import uvicorn

    uvicorn.run(app, port=8001, host="0.0.0.0")