Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,13 +22,13 @@ try:
|
|
| 22 |
import google.generativeai as genai
|
| 23 |
from PIL import Image
|
| 24 |
GEMINI_AVAILABLE = True
|
| 25 |
-
except ImportError:
|
| 26 |
GEMINI_AVAILABLE = False
|
| 27 |
print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
|
| 28 |
|
| 29 |
app = FastAPI(title="Invoice Splitter API")
|
| 30 |
|
| 31 |
-
#
|
| 32 |
Request.max_body_size = 200 * 1024 * 1024 # 200MB limit
|
| 33 |
|
| 34 |
app.add_middleware(
|
|
@@ -45,20 +45,20 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
|
|
| 45 |
# Model fallback list (in priority order)
|
| 46 |
GEMINI_MODELS = [
|
| 47 |
{
|
| 48 |
-
"name": "gemini-1.5-flash",
|
| 49 |
"max_requests_per_minute": 15,
|
| 50 |
-
"timeout":
|
| 51 |
"description": "Primary fast model"
|
| 52 |
},
|
| 53 |
{
|
| 54 |
-
"name": "gemini-2.0-flash-exp",
|
| 55 |
"max_requests_per_minute": 10,
|
| 56 |
"timeout": 300,
|
| 57 |
"description": "Experimental fallback"
|
| 58 |
},
|
| 59 |
{
|
| 60 |
-
"name": "gemini-1.5-pro",
|
| 61 |
-
"max_requests_per_minute":
|
| 62 |
"timeout": 300,
|
| 63 |
"description": "Pro fallback (slower)"
|
| 64 |
}
|
|
@@ -113,12 +113,12 @@ def check_daily_quota():
|
|
| 113 |
global last_quota_reset, daily_quota_exhausted
|
| 114 |
now = datetime.datetime.now()
|
| 115 |
|
| 116 |
-
if last_quota_reset is None:
|
| 117 |
last_quota_reset = now
|
| 118 |
daily_quota_exhausted = False
|
| 119 |
return True
|
| 120 |
|
| 121 |
-
if now. date() > last_quota_reset.date():
|
| 122 |
print("π Daily quota reset detected")
|
| 123 |
last_quota_reset = now
|
| 124 |
daily_quota_exhausted = False
|
|
@@ -183,13 +183,21 @@ def reset_to_primary_model():
|
|
| 183 |
return False
|
| 184 |
|
| 185 |
|
| 186 |
-
# --- Regex Patterns ---
|
|
|
|
| 187 |
INVOICE_NO_RE = re.compile(
|
| 188 |
-
r"
|
| 189 |
-
re.IGNORECASE
|
| 190 |
)
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
|
|
@@ -215,24 +223,38 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 215 |
return None
|
| 216 |
text_norm = normalize_text_for_search(text)
|
| 217 |
|
|
|
|
| 218 |
m = INVOICE_NO_RE. search(text_norm)
|
| 219 |
if m:
|
| 220 |
inv = (m.group(1) or "").strip()
|
| 221 |
-
|
|
|
|
| 222 |
return inv
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
m = PREFIXED_INVOICE_RE.search(text_norm[: 600])
|
| 225 |
if m:
|
| 226 |
inv = (m.group(1) or "").strip()
|
| 227 |
if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
|
| 228 |
return inv
|
| 229 |
-
|
|
|
|
| 230 |
gm = GST_LIKE_RE.search(text_norm)
|
| 231 |
if gm:
|
| 232 |
-
gst_val = gm.group(
|
| 233 |
if len(gst_val) == 15:
|
| 234 |
return f"GST:{gst_val}"
|
| 235 |
-
|
| 236 |
return None
|
| 237 |
|
| 238 |
|
|
@@ -250,11 +272,11 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
|
| 250 |
return extract_invoice_gemini(page, retry_count)
|
| 251 |
|
| 252 |
try:
|
| 253 |
-
#
|
| 254 |
pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
|
| 255 |
img_bytes = pix.tobytes("png")
|
| 256 |
|
| 257 |
-
#
|
| 258 |
pix = None
|
| 259 |
|
| 260 |
img = Image.open(io.BytesIO(img_bytes))
|
|
@@ -276,7 +298,7 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
|
| 276 |
if ocr_resp and ocr_resp.text:
|
| 277 |
result = try_extract_invoice_from_text(ocr_resp.text)
|
| 278 |
|
| 279 |
-
#
|
| 280 |
img. close()
|
| 281 |
|
| 282 |
return result
|
|
@@ -295,7 +317,7 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
|
| 295 |
return None
|
| 296 |
|
| 297 |
|
| 298 |
-
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
|
| 299 |
# 1. Try Text Extraction (Fastest)
|
| 300 |
text = page.get_text("text") or ""
|
| 301 |
inv = try_extract_invoice_from_text(text)
|
|
@@ -323,10 +345,10 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
|
|
| 323 |
for i in page_indices:
|
| 324 |
out.insert_pdf(src_doc, from_page=i, to_page=i)
|
| 325 |
|
| 326 |
-
#
|
| 327 |
pdf_bytes = out.tobytes(garbage=4, deflate=True)
|
| 328 |
return pdf_bytes
|
| 329 |
-
finally:
|
| 330 |
out.close()
|
| 331 |
|
| 332 |
|
|
@@ -348,7 +370,7 @@ def remove_file(path: str):
|
|
| 348 |
async def root():
|
| 349 |
return {
|
| 350 |
"service": "Invoice Splitter API",
|
| 351 |
-
"version": "2.
|
| 352 |
"max_file_size_mb": 200,
|
| 353 |
"gemini_available": GEMINI_AVAILABLE,
|
| 354 |
"gemini_configured": bool(GEMINI_API_KEY)
|
|
@@ -371,7 +393,7 @@ async def health():
|
|
| 371 |
@app.post("/split-invoices")
|
| 372 |
async def split_invoices(
|
| 373 |
background_tasks: BackgroundTasks,
|
| 374 |
-
file: UploadFile = File(...),
|
| 375 |
include_pdf: bool = Form(True),
|
| 376 |
max_file_size_mb: int = Form(200)
|
| 377 |
):
|
|
@@ -386,7 +408,7 @@ async def split_invoices(
|
|
| 386 |
Returns:
|
| 387 |
- JSON with split invoice parts
|
| 388 |
"""
|
| 389 |
-
if not file.filename.lower().endswith(".pdf"):
|
| 390 |
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
| 391 |
|
| 392 |
max_size_bytes = max_file_size_mb * 1024 * 1024
|
|
@@ -398,37 +420,37 @@ async def split_invoices(
|
|
| 398 |
doc = None # Initialize for finally block
|
| 399 |
|
| 400 |
try:
|
| 401 |
-
#
|
| 402 |
print(f"📥 Receiving file: {file.filename}")
|
| 403 |
total_size = 0
|
| 404 |
|
| 405 |
with open(temp_path, "wb") as buffer:
|
| 406 |
-
#
|
| 407 |
chunk_size = 5 * 1024 * 1024
|
| 408 |
|
| 409 |
while content := await file.read(chunk_size):
|
| 410 |
total_size += len(content)
|
| 411 |
|
| 412 |
-
#
|
| 413 |
if total_size > max_size_bytes:
|
| 414 |
raise HTTPException(
|
| 415 |
-
status_code=413,
|
| 416 |
-
detail=f"File too large.
|
| 417 |
)
|
| 418 |
|
| 419 |
buffer.write(content)
|
| 420 |
|
| 421 |
-
#
|
| 422 |
if total_size % (20 * 1024 * 1024) < chunk_size: # Every ~20MB
|
| 423 |
print(f" π Uploaded: {total_size / (1024*1024):.1f}MB")
|
| 424 |
|
| 425 |
file_size_mb = total_size / (1024 * 1024)
|
| 426 |
print(f"💾 Saved {file_size_mb:.2f}MB to: {temp_path}")
|
| 427 |
|
| 428 |
-
#
|
| 429 |
doc = fitz.open(temp_path)
|
| 430 |
|
| 431 |
-
if doc.page_count == 0:
|
| 432 |
raise HTTPException(status_code=400, detail="PDF file is empty")
|
| 433 |
|
| 434 |
print(f"π Processing {doc.page_count} pages...")
|
|
@@ -441,10 +463,10 @@ async def split_invoices(
|
|
| 441 |
# Step 2: Extract invoice numbers from all pages
|
| 442 |
page_invoice_nos = []
|
| 443 |
|
| 444 |
-
for i in range(doc.page_count):
|
| 445 |
-
#
|
| 446 |
if i > 0 and i % 50 == 0:
|
| 447 |
-
print(f"
|
| 448 |
|
| 449 |
page = doc. load_page(i)
|
| 450 |
|
|
@@ -454,11 +476,13 @@ async def split_invoices(
|
|
| 454 |
|
| 455 |
if inv:
|
| 456 |
print(f" Page {i+1}: Found invoice '{inv}'")
|
|
|
|
|
|
|
| 457 |
finally:
|
| 458 |
-
#
|
| 459 |
page = None
|
| 460 |
|
| 461 |
-
#
|
| 462 |
if i > 0 and i % 100 == 0:
|
| 463 |
gc.collect()
|
| 464 |
|
|
@@ -466,7 +490,7 @@ async def split_invoices(
|
|
| 466 |
|
| 467 |
# Step 3: Filter GST-only entries and group pages
|
| 468 |
clean_invs = [
|
| 469 |
-
None if (v and v.upper().startswith("GST: ")) else v
|
| 470 |
for v in page_invoice_nos
|
| 471 |
]
|
| 472 |
|
|
@@ -491,11 +515,11 @@ async def split_invoices(
|
|
| 491 |
if current_group:
|
| 492 |
groups. append({"invoice_no": current_inv, "pages": current_group})
|
| 493 |
|
| 494 |
-
#
|
| 495 |
if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
|
| 496 |
print(f" π Merging first {len(groups[0]['pages'])} pages with invoice '{groups[1]['invoice_no']}'")
|
| 497 |
groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
|
| 498 |
-
groups.pop(0)
|
| 499 |
|
| 500 |
print(f"📦 Created {len(groups)} invoice groups")
|
| 501 |
|
|
@@ -514,10 +538,10 @@ async def split_invoices(
|
|
| 514 |
"pages": [p + 1 for p in g["pages"]], # 1-based page numbers
|
| 515 |
"page_count": len(g["pages"]),
|
| 516 |
"size_bytes": len(part_bytes),
|
| 517 |
-
"size_mb":
|
| 518 |
}
|
| 519 |
|
| 520 |
-
#
|
| 521 |
if include_pdf:
|
| 522 |
base64_size = len(part_bytes) * 4 / 3 # Base64 encoding overhead
|
| 523 |
total_response_size += base64_size
|
|
@@ -533,10 +557,10 @@ async def split_invoices(
|
|
| 533 |
|
| 534 |
parts.append(info)
|
| 535 |
|
| 536 |
-
#
|
| 537 |
del part_bytes
|
| 538 |
|
| 539 |
-
#
|
| 540 |
if idx % 5 == 0:
|
| 541 |
gc.collect()
|
| 542 |
|
|
@@ -568,18 +592,18 @@ async def split_invoices(
|
|
| 568 |
raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
|
| 569 |
|
| 570 |
finally:
|
| 571 |
-
#
|
| 572 |
if doc:
|
| 573 |
try:
|
| 574 |
doc.close()
|
| 575 |
print("π Closed PDF document")
|
| 576 |
except Exception as e:
|
| 577 |
-
print(f"β οΈ Error closing document:
|
| 578 |
|
| 579 |
# Delete temp file
|
| 580 |
remove_file(temp_path)
|
| 581 |
|
| 582 |
-
#
|
| 583 |
gc.collect()
|
| 584 |
|
| 585 |
|
|
@@ -593,7 +617,7 @@ async def split_invoices_stream(
|
|
| 593 |
Streaming version for extremely large files.
|
| 594 |
Returns NDJSON (newline-delimited JSON) with each part as a separate line.
|
| 595 |
|
| 596 |
-
This avoids building a large JSON response in memory.
|
| 597 |
"""
|
| 598 |
import json
|
| 599 |
|
|
@@ -701,7 +725,7 @@ async def split_invoices_stream(
|
|
| 701 |
"error": str(e)
|
| 702 |
}) + "\n"
|
| 703 |
finally:
|
| 704 |
-
if doc:
|
| 705 |
doc.close()
|
| 706 |
remove_file(temp_path)
|
| 707 |
gc.collect()
|
|
@@ -710,7 +734,7 @@ async def split_invoices_stream(
|
|
| 710 |
generate_parts(),
|
| 711 |
media_type="application/x-ndjson",
|
| 712 |
headers={
|
| 713 |
-
"Content-Disposition":
|
| 714 |
}
|
| 715 |
)
|
| 716 |
|
|
@@ -722,7 +746,7 @@ if __name__ == "__main__":
|
|
| 722 |
print(f" Gemini available: {GEMINI_AVAILABLE}")
|
| 723 |
print(f" Gemini configured: {bool(GEMINI_API_KEY)}")
|
| 724 |
|
| 725 |
-
#
|
| 726 |
uvicorn.run(
|
| 727 |
app,
|
| 728 |
host="0.0.0.0",
|
|
|
|
| 22 |
import google.generativeai as genai
|
| 23 |
from PIL import Image
|
| 24 |
GEMINI_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
GEMINI_AVAILABLE = False
|
| 27 |
print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
|
| 28 |
|
| 29 |
app = FastAPI(title="Invoice Splitter API")
|
| 30 |
|
| 31 |
+
# NOTE(review): intended to allow request bodies up to 200MB; confirm the framework actually honors Request.max_body_size (it may be a no-op attribute)
|
| 32 |
Request.max_body_size = 200 * 1024 * 1024 # 200MB limit
|
| 33 |
|
| 34 |
app.add_middleware(
|
|
|
|
| 45 |
# Model fallback list (in priority order)
|
| 46 |
GEMINI_MODELS = [
|
| 47 |
{
|
| 48 |
+
"name": "gemini-1.5-flash",
|
| 49 |
"max_requests_per_minute": 15,
|
| 50 |
+
"timeout": 300,
|
| 51 |
"description": "Primary fast model"
|
| 52 |
},
|
| 53 |
{
|
| 54 |
+
"name": "gemini-2.0-flash-exp",
|
| 55 |
"max_requests_per_minute": 10,
|
| 56 |
"timeout": 300,
|
| 57 |
"description": "Experimental fallback"
|
| 58 |
},
|
| 59 |
{
|
| 60 |
+
"name": "gemini-1.5-pro",
|
| 61 |
+
"max_requests_per_minute": 2,
|
| 62 |
"timeout": 300,
|
| 63 |
"description": "Pro fallback (slower)"
|
| 64 |
}
|
|
|
|
| 113 |
global last_quota_reset, daily_quota_exhausted
|
| 114 |
now = datetime.datetime.now()
|
| 115 |
|
| 116 |
+
if last_quota_reset is None:
|
| 117 |
last_quota_reset = now
|
| 118 |
daily_quota_exhausted = False
|
| 119 |
return True
|
| 120 |
|
| 121 |
+
if now. date() > last_quota_reset. date():
|
| 122 |
print("π Daily quota reset detected")
|
| 123 |
last_quota_reset = now
|
| 124 |
daily_quota_exhausted = False
|
|
|
|
| 183 |
return False
|
| 184 |
|
| 185 |
|
| 186 |
+
# --- Regex Patterns (FIXED) ---
|
| 187 |
+
# Match "Invoice No: 2310763135" or similar patterns
|
| 188 |
INVOICE_NO_RE = re.compile(
|
| 189 |
+
r"(?:Invoice\s*No\.?|Tax\s*Invoice\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})",
|
| 190 |
+
re.IGNORECASE
|
| 191 |
)
|
| 192 |
+
|
| 193 |
+
# Match prefixed invoice numbers like "INV-2024/001"
|
| 194 |
+
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
|
| 195 |
+
|
| 196 |
+
# Match GST numbers
|
| 197 |
+
GST_LIKE_RE = re.compile(r"\b(?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15})\b", re.IGNORECASE)
|
| 198 |
+
|
| 199 |
+
# Match pure numeric invoice numbers (10 digits like 2310763135)
|
| 200 |
+
NUMERIC_INVOICE_RE = re. compile(r"\b(\d{10})\b")
|
| 201 |
|
| 202 |
|
| 203 |
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
|
|
|
|
| 223 |
return None
|
| 224 |
text_norm = normalize_text_for_search(text)
|
| 225 |
|
| 226 |
+
# Priority 1: Standard invoice number patterns (Invoice No: XXX)
|
| 227 |
m = INVOICE_NO_RE. search(text_norm)
|
| 228 |
if m:
|
| 229 |
inv = (m.group(1) or "").strip()
|
| 230 |
+
# Filter out common false positives
|
| 231 |
+
if inv and len(inv) > 2 and inv. lower() not in ("invoice", "bill", "order", "no"):
|
| 232 |
return inv
|
| 233 |
+
|
| 234 |
+
# Priority 2: Look for 10-digit numeric invoice numbers (like 2310763135)
|
| 235 |
+
# Search in first 1000 chars to find it near the top
|
| 236 |
+
lines = text_norm[: 1000].split('\n')
|
| 237 |
+
for line in lines:
|
| 238 |
+
if 'invoice' in line.lower() and 'no' in line.lower():
|
| 239 |
+
# Look for 10-digit numbers in this line
|
| 240 |
+
m = NUMERIC_INVOICE_RE.search(line)
|
| 241 |
+
if m:
|
| 242 |
+
return m.group(1)
|
| 243 |
+
|
| 244 |
+
# Priority 3: Prefixed invoice numbers
|
| 245 |
m = PREFIXED_INVOICE_RE.search(text_norm[: 600])
|
| 246 |
if m:
|
| 247 |
inv = (m.group(1) or "").strip()
|
| 248 |
if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
|
| 249 |
return inv
|
| 250 |
+
|
| 251 |
+
# Priority 4: GST number as fallback
|
| 252 |
gm = GST_LIKE_RE.search(text_norm)
|
| 253 |
if gm:
|
| 254 |
+
gst_val = gm.group(1).replace(" ", "").strip().upper()
|
| 255 |
if len(gst_val) == 15:
|
| 256 |
return f"GST:{gst_val}"
|
| 257 |
+
|
| 258 |
return None
|
| 259 |
|
| 260 |
|
|
|
|
| 272 |
return extract_invoice_gemini(page, retry_count)
|
| 273 |
|
| 274 |
try:
|
| 275 |
+
# Reduced resolution from 2x to 1.5x to save memory
|
| 276 |
pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
|
| 277 |
img_bytes = pix.tobytes("png")
|
| 278 |
|
| 279 |
+
# Explicitly free pixmap memory
|
| 280 |
pix = None
|
| 281 |
|
| 282 |
img = Image.open(io.BytesIO(img_bytes))
|
|
|
|
| 298 |
if ocr_resp and ocr_resp.text:
|
| 299 |
result = try_extract_invoice_from_text(ocr_resp.text)
|
| 300 |
|
| 301 |
+
# Free image memory
|
| 302 |
img. close()
|
| 303 |
|
| 304 |
return result
|
|
|
|
| 317 |
return None
|
| 318 |
|
| 319 |
|
| 320 |
+
def extract_invoice_no_from_page(page: fitz. Page, is_image_pdf: bool) -> Optional[str]:
|
| 321 |
# 1. Try Text Extraction (Fastest)
|
| 322 |
text = page.get_text("text") or ""
|
| 323 |
inv = try_extract_invoice_from_text(text)
|
|
|
|
| 345 |
for i in page_indices:
|
| 346 |
out.insert_pdf(src_doc, from_page=i, to_page=i)
|
| 347 |
|
| 348 |
+
# Optimize and compress output PDF
|
| 349 |
pdf_bytes = out.tobytes(garbage=4, deflate=True)
|
| 350 |
return pdf_bytes
|
| 351 |
+
finally:
|
| 352 |
out.close()
|
| 353 |
|
| 354 |
|
|
|
|
| 370 |
async def root():
|
| 371 |
return {
|
| 372 |
"service": "Invoice Splitter API",
|
| 373 |
+
"version": "2.1",
|
| 374 |
"max_file_size_mb": 200,
|
| 375 |
"gemini_available": GEMINI_AVAILABLE,
|
| 376 |
"gemini_configured": bool(GEMINI_API_KEY)
|
|
|
|
| 393 |
@app.post("/split-invoices")
|
| 394 |
async def split_invoices(
|
| 395 |
background_tasks: BackgroundTasks,
|
| 396 |
+
file: UploadFile = File(... ),
|
| 397 |
include_pdf: bool = Form(True),
|
| 398 |
max_file_size_mb: int = Form(200)
|
| 399 |
):
|
|
|
|
| 408 |
Returns:
|
| 409 |
- JSON with split invoice parts
|
| 410 |
"""
|
| 411 |
+
if not file.filename.lower().endswith(".pdf"):
|
| 412 |
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
| 413 |
|
| 414 |
max_size_bytes = max_file_size_mb * 1024 * 1024
|
|
|
|
| 420 |
doc = None # Initialize for finally block
|
| 421 |
|
| 422 |
try:
|
| 423 |
+
# Stream upload with size tracking and validation
|
| 424 |
print(f"📥 Receiving file: {file.filename}")
|
| 425 |
total_size = 0
|
| 426 |
|
| 427 |
with open(temp_path, "wb") as buffer:
|
| 428 |
+
# Use 5MB chunks for faster processing
|
| 429 |
chunk_size = 5 * 1024 * 1024
|
| 430 |
|
| 431 |
while content := await file.read(chunk_size):
|
| 432 |
total_size += len(content)
|
| 433 |
|
| 434 |
+
# Check size limit during upload
|
| 435 |
if total_size > max_size_bytes:
|
| 436 |
raise HTTPException(
|
| 437 |
+
status_code=413,
|
| 438 |
+
detail=f"File too large. Maximum size: {max_file_size_mb}MB, received: {total_size / (1024*1024):.1f}MB"
|
| 439 |
)
|
| 440 |
|
| 441 |
buffer.write(content)
|
| 442 |
|
| 443 |
+
# Progress logging for large files
|
| 444 |
if total_size % (20 * 1024 * 1024) < chunk_size: # Every ~20MB
|
| 445 |
print(f" π Uploaded: {total_size / (1024*1024):.1f}MB")
|
| 446 |
|
| 447 |
file_size_mb = total_size / (1024 * 1024)
|
| 448 |
print(f"💾 Saved {file_size_mb:.2f}MB to: {temp_path}")
|
| 449 |
|
| 450 |
+
# Open PDF from disk (memory-mapped)
|
| 451 |
doc = fitz.open(temp_path)
|
| 452 |
|
| 453 |
+
if doc. page_count == 0:
|
| 454 |
raise HTTPException(status_code=400, detail="PDF file is empty")
|
| 455 |
|
| 456 |
print(f"π Processing {doc.page_count} pages...")
|
|
|
|
| 463 |
# Step 2: Extract invoice numbers from all pages
|
| 464 |
page_invoice_nos = []
|
| 465 |
|
| 466 |
+
for i in range(doc. page_count):
|
| 467 |
+
# Progress logging for large documents
|
| 468 |
if i > 0 and i % 50 == 0:
|
| 469 |
+
print(f" π Processed {i}/{doc.page_count} pages")
|
| 470 |
|
| 471 |
page = doc. load_page(i)
|
| 472 |
|
|
|
|
| 476 |
|
| 477 |
if inv:
|
| 478 |
print(f" Page {i+1}: Found invoice '{inv}'")
|
| 479 |
+
else:
|
| 480 |
+
print(f" Page {i+1}: No invoice number found")
|
| 481 |
finally:
|
| 482 |
+
# Explicitly free page resources
|
| 483 |
page = None
|
| 484 |
|
| 485 |
+
# Force garbage collection every 100 pages
|
| 486 |
if i > 0 and i % 100 == 0:
|
| 487 |
gc.collect()
|
| 488 |
|
|
|
|
| 490 |
|
| 491 |
# Step 3: Filter GST-only entries and group pages
|
| 492 |
clean_invs = [
|
| 493 |
+
None if (v and v.upper().startswith("GST:")) else v
|
| 494 |
for v in page_invoice_nos
|
| 495 |
]
|
| 496 |
|
|
|
|
| 515 |
if current_group:
|
| 516 |
groups. append({"invoice_no": current_inv, "pages": current_group})
|
| 517 |
|
| 518 |
+
# Smart merging: if the leading group has no invoice number but the next one does, prepend its pages to that next group
|
| 519 |
if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
|
| 520 |
print(f" π Merging first {len(groups[0]['pages'])} pages with invoice '{groups[1]['invoice_no']}'")
|
| 521 |
groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
|
| 522 |
+
groups. pop(0)
|
| 523 |
|
| 524 |
print(f"📦 Created {len(groups)} invoice groups")
|
| 525 |
|
|
|
|
| 538 |
"pages": [p + 1 for p in g["pages"]], # 1-based page numbers
|
| 539 |
"page_count": len(g["pages"]),
|
| 540 |
"size_bytes": len(part_bytes),
|
| 541 |
+
"size_mb": round(len(part_bytes) / (1024 * 1024), 2)
|
| 542 |
}
|
| 543 |
|
| 544 |
+
# Handle large responses - skip base64 if total response too large
|
| 545 |
if include_pdf:
|
| 546 |
base64_size = len(part_bytes) * 4 / 3 # Base64 encoding overhead
|
| 547 |
total_response_size += base64_size
|
|
|
|
| 557 |
|
| 558 |
parts.append(info)
|
| 559 |
|
| 560 |
+
# Free memory immediately
|
| 561 |
del part_bytes
|
| 562 |
|
| 563 |
+
# Garbage collect every 5 parts
|
| 564 |
if idx % 5 == 0:
|
| 565 |
gc.collect()
|
| 566 |
|
|
|
|
| 592 |
raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
|
| 593 |
|
| 594 |
finally:
|
| 595 |
+
# Critical cleanup in correct order
|
| 596 |
if doc:
|
| 597 |
try:
|
| 598 |
doc.close()
|
| 599 |
print("π Closed PDF document")
|
| 600 |
except Exception as e:
|
| 601 |
+
print(f"⚠️ Error closing document: {e}")
|
| 602 |
|
| 603 |
# Delete temp file
|
| 604 |
remove_file(temp_path)
|
| 605 |
|
| 606 |
+
# Final garbage collection
|
| 607 |
gc.collect()
|
| 608 |
|
| 609 |
|
|
|
|
| 617 |
Streaming version for extremely large files.
|
| 618 |
Returns NDJSON (newline-delimited JSON) with each part as a separate line.
|
| 619 |
|
| 620 |
+
This avoids building a large JSON response in memory.
|
| 621 |
"""
|
| 622 |
import json
|
| 623 |
|
|
|
|
| 725 |
"error": str(e)
|
| 726 |
}) + "\n"
|
| 727 |
finally:
|
| 728 |
+
if doc:
|
| 729 |
doc.close()
|
| 730 |
remove_file(temp_path)
|
| 731 |
gc.collect()
|
|
|
|
| 734 |
generate_parts(),
|
| 735 |
media_type="application/x-ndjson",
|
| 736 |
headers={
|
| 737 |
+
"Content-Disposition": f"attachment; filename=invoices-split.ndjson"
|
| 738 |
}
|
| 739 |
)
|
| 740 |
|
|
|
|
| 746 |
print(f" Gemini available: {GEMINI_AVAILABLE}")
|
| 747 |
print(f" Gemini configured: {bool(GEMINI_API_KEY)}")
|
| 748 |
|
| 749 |
+
# Configure uvicorn for large files
|
| 750 |
uvicorn.run(
|
| 751 |
app,
|
| 752 |
host="0.0.0.0",
|