Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed on Jan 13

Commit

8354cbf

verified ·

1 Parent(s): 58e0ce8

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -55

app.py CHANGED Viewed

@@ -22,13 +22,13 @@ try:
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
-except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
-# ⭐ Increase max request body size (default is 1MB-2MB)
 Request.max_body_size = 200 * 1024 * 1024  # 200MB limit
 app.add_middleware(
@@ -45,20 +45,20 @@ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 # Model fallback list (in priority order)
 GEMINI_MODELS = [
     {
-        "name": "gemini-1.5-flash",  # UPDATED: Current standard fast model
         "max_requests_per_minute": 15,
-        "timeout":  300,
         "description": "Primary fast model"
     },
     {
-        "name": "gemini-2.0-flash-exp",  # Fallback experimental
         "max_requests_per_minute": 10,
         "timeout": 300,
         "description": "Experimental fallback"
     },
     {
-        "name": "gemini-1.5-pro",  # Slower fallback
-        "max_requests_per_minute": 2,
         "timeout": 300,
         "description": "Pro fallback (slower)"
     }
@@ -113,12 +113,12 @@ def check_daily_quota():
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
-    if last_quota_reset is None:
         last_quota_reset = now
         daily_quota_exhausted = False
         return True
-    if now. date() > last_quota_reset.date():
         print("🔄 Daily quota reset detected")
         last_quota_reset = now
         daily_quota_exhausted = False
@@ -183,13 +183,21 @@ def reset_to_primary_model():
     return False
-# --- Regex Patterns ---
 INVOICE_NO_RE = re.compile(
-    r"""(?: Invoice\s*No\.?|Inv\.\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
-    re.IGNORECASE | re.VERBOSE
 )
-PREFIXED_INVOICE_RE = re. compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
-GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
@@ -215,24 +223,38 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
         return None
     text_norm = normalize_text_for_search(text)
     m = INVOICE_NO_RE. search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
-        if inv and len(inv) > 2 and inv. lower() not in ("invoice", "bill"):
             return inv
     m = PREFIXED_INVOICE_RE.search(text_norm[: 600])
     if m:
         inv = (m.group(1) or "").strip()
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
-        gst_val = gm.group(2).replace(" ", "").strip().upper()
         if len(gst_val) == 15:
             return f"GST:{gst_val}"
     return None
@@ -250,11 +272,11 @@ def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
         return extract_invoice_gemini(page, retry_count)
     try:
-        # ⭐ Reduced resolution from 2x to 1.5x to save memory
         pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
         img_bytes = pix.tobytes("png")
-        # ⭐ Explicitly free pixmap memory
         pix = None
         img = Image.open(io.BytesIO(img_bytes))
@@ -276,7 +298,7 @@ def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
             if ocr_resp and ocr_resp.text:
                 result = try_extract_invoice_from_text(ocr_resp.text)
-        # ⭐ Free image memory
         img. close()
         return result
@@ -295,7 +317,7 @@ def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
         return None
-def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
     # 1. Try Text Extraction (Fastest)
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
@@ -323,10 +345,10 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
         for i in page_indices:
             out.insert_pdf(src_doc, from_page=i, to_page=i)
-        # ⭐ Optimize and compress output PDF
         pdf_bytes = out.tobytes(garbage=4, deflate=True)
         return pdf_bytes
-    finally:
         out.close()
@@ -348,7 +370,7 @@ def remove_file(path: str):
 async def root():
     return {
         "service": "Invoice Splitter API",
-        "version": "2.0",
         "max_file_size_mb": 200,
         "gemini_available": GEMINI_AVAILABLE,
         "gemini_configured": bool(GEMINI_API_KEY)
@@ -371,7 +393,7 @@ async def health():
 @app.post("/split-invoices")
 async def split_invoices(
     background_tasks: BackgroundTasks,
-    file: UploadFile = File(...),
     include_pdf: bool = Form(True),
     max_file_size_mb: int = Form(200)
 ):
@@ -386,7 +408,7 @@ async def split_invoices(
     Returns:
     - JSON with split invoice parts
     """
-    if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are supported")
     max_size_bytes = max_file_size_mb * 1024 * 1024
@@ -398,37 +420,37 @@ async def split_invoices(
     doc = None  # Initialize for finally block
     try:
-        # ⭐ Stream upload with size tracking and validation
         print(f"📥 Receiving file: {file.filename}")
         total_size = 0
         with open(temp_path, "wb") as buffer:
-            # ⭐ Use 5MB chunks for faster processing
             chunk_size = 5 * 1024 * 1024
             while content := await file.read(chunk_size):
                 total_size += len(content)
-                # ⭐ Check size limit during upload
                 if total_size > max_size_bytes:
                     raise HTTPException(
-                        status_code=413,
-                        detail=f"File too large. Maximum size: {max_file_size_mb}MB, received: {total_size / (1024*1024):.1f}MB"
                     )
                 buffer.write(content)
-                # ⭐ Progress logging for large files
                 if total_size % (20 * 1024 * 1024) < chunk_size:  # Every ~20MB
                     print(f"   📊 Uploaded:  {total_size / (1024*1024):.1f}MB")
         file_size_mb = total_size / (1024 * 1024)
         print(f"💾 Saved {file_size_mb:.2f}MB to:  {temp_path}")
-        # ⭐ Open PDF from disk (memory-mapped)
         doc = fitz.open(temp_path)
-        if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="PDF file is empty")
         print(f"📄 Processing {doc.page_count} pages...")
@@ -441,10 +463,10 @@ async def split_invoices(
         # Step 2: Extract invoice numbers from all pages
         page_invoice_nos = []
-        for i in range(doc.page_count):
-            # ⭐ Progress logging for large documents
             if i > 0 and i % 50 == 0:
-                print(f"   ��� Processed {i}/{doc.page_count} pages")
             page = doc. load_page(i)
@@ -454,11 +476,13 @@ async def split_invoices(
                 if inv:
                     print(f"   Page {i+1}: Found invoice '{inv}'")
             finally:
-                # ⭐ Explicitly free page resources
                 page = None
-            # ⭐ Force garbage collection every 100 pages
             if i > 0 and i % 100 == 0:
                 gc.collect()
@@ -466,7 +490,7 @@ async def split_invoices(
         # Step 3: Filter GST-only entries and group pages
         clean_invs = [
-            None if (v and v.upper().startswith("GST: ")) else v
             for v in page_invoice_nos
         ]
@@ -491,11 +515,11 @@ async def split_invoices(
         if current_group:
             groups. append({"invoice_no": current_inv, "pages": current_group})
-        # ⭐ Smart merging:  If first page has no invoice, merge with second group
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             print(f"   🔗 Merging first {len(groups[0]['pages'])} pages with invoice '{groups[1]['invoice_no']}'")
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
-            groups.pop(0)
         print(f"📦 Created {len(groups)} invoice groups")
@@ -514,10 +538,10 @@ async def split_invoices(
                 "pages": [p + 1 for p in g["pages"]],  # 1-based page numbers
                 "page_count": len(g["pages"]),
                 "size_bytes": len(part_bytes),
-                "size_mb": round(len(part_bytes) / (1024 * 1024), 2)
             }
-            # ⭐ Handle large responses - skip base64 if total response too large
             if include_pdf:
                 base64_size = len(part_bytes) * 4 / 3  # Base64 encoding overhead
                 total_response_size += base64_size
@@ -533,10 +557,10 @@ async def split_invoices(
             parts.append(info)
-            # ⭐ Free memory immediately
             del part_bytes
-            # ⭐ Garbage collect after each part
             if idx % 5 == 0:
                 gc.collect()
@@ -568,18 +592,18 @@ async def split_invoices(
         raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
     finally:
-        # ⭐ Critical cleanup in correct order
         if doc:
             try:
                 doc.close()
                 print("📕 Closed PDF document")
             except Exception as e:
-                print(f"⚠️ Error closing document:  {e}")
         # Delete temp file
         remove_file(temp_path)
-        # ⭐ Final garbage collection
         gc.collect()
@@ -593,7 +617,7 @@ async def split_invoices_stream(
     Streaming version for extremely large files.
     Returns NDJSON (newline-delimited JSON) with each part as a separate line.
-    This avoids building a large JSON response in memory.
     """
     import json
@@ -701,7 +725,7 @@ async def split_invoices_stream(
                 "error": str(e)
             }) + "\n"
         finally:
-            if doc:
                 doc.close()
             remove_file(temp_path)
             gc.collect()
@@ -710,7 +734,7 @@ async def split_invoices_stream(
         generate_parts(),
         media_type="application/x-ndjson",
         headers={
-            "Content-Disposition":  f"attachment; filename=invoices-split. ndjson"
         }
     )
@@ -722,7 +746,7 @@ if __name__ == "__main__":
     print(f"   Gemini available: {GEMINI_AVAILABLE}")
     print(f"   Gemini configured: {bool(GEMINI_API_KEY)}")
-    # ⭐ Configure uvicorn for large files
     uvicorn.run(
         app,
         host="0.0.0.0",

     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
+except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
+# Increase max request body size (default is 1MB-2MB)
 Request.max_body_size = 200 * 1024 * 1024  # 200MB limit
 app.add_middleware(
 # Model fallback list (in priority order)
 GEMINI_MODELS = [
     {
+        "name": "gemini-1.5-flash",
         "max_requests_per_minute": 15,
+        "timeout": 300,
         "description": "Primary fast model"
     },
     {
+        "name": "gemini-2.0-flash-exp",
         "max_requests_per_minute": 10,
         "timeout": 300,
         "description": "Experimental fallback"
     },
     {
+        "name": "gemini-1.5-pro",
+        "max_requests_per_minute":  2,
         "timeout": 300,
         "description": "Pro fallback (slower)"
     }
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
+    if last_quota_reset is None:
         last_quota_reset = now
         daily_quota_exhausted = False
         return True
+    if now. date() > last_quota_reset. date():
         print("🔄 Daily quota reset detected")
         last_quota_reset = now
         daily_quota_exhausted = False
     return False
# --- Regex patterns ---
# None of these patterns use re.VERBOSE, so every space inside a pattern is a
# literal character that must appear in the input. Keep the patterns free of
# stray whitespace.

# Labelled invoice numbers such as "Invoice No: 2310763135" or
# "Tax Invoice No. INV-2024/001". The trailing dot after "No" is optional and
# the label may be followed by ":" or "-". Group 1 is the invoice number
# (at least four characters drawn from letters, digits, "-" and "/").
INVOICE_NO_RE = re.compile(
    r"(?:Invoice\s*No\.?|Tax\s*Invoice\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})",
    re.IGNORECASE
)

# Prefixed invoice numbers such as "INV-2024/001": 2-4 upper-case letters, a
# "-" or "/" separator, then at least four digits. Case-sensitive on purpose.
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")

# GST registration numbers: a GST/GSTIN label followed by 15 alphanumerics.
# Group 1 is the 15-character value only (the label is not captured).
GST_LIKE_RE = re.compile(r"\b(?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15})\b", re.IGNORECASE)

# Bare numeric invoice numbers: exactly 10 digits (e.g. 2310763135).
NUMERIC_INVOICE_RE = re.compile(r"\b(\d{10})\b")
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
         return None
     text_norm = normalize_text_for_search(text)
+    # Priority 1: Standard invoice number patterns (Invoice No:  XXX)
     m = INVOICE_NO_RE. search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
+        # Filter out common false positives
+        if inv and len(inv) > 2 and inv. lower() not in ("invoice", "bill", "order", "no"):
             return inv
+    # Priority 2: Look for 10-digit numeric invoice numbers (like 2310763135)
+    # Search in first 1000 chars to find it near the top
+    lines = text_norm[: 1000].split('\n')
+    for line in lines:
+        if 'invoice' in line.lower() and 'no' in line.lower():
+            # Look for 10-digit numbers in this line
+            m = NUMERIC_INVOICE_RE.search(line)
+            if m:
+                return m.group(1)
+    # Priority 3: Prefixed invoice numbers
     m = PREFIXED_INVOICE_RE.search(text_norm[: 600])
     if m:
         inv = (m.group(1) or "").strip()
         if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
+    # Priority 4: GST number as fallback
     gm = GST_LIKE_RE.search(text_norm)
     if gm:
+        gst_val = gm.group(1).replace(" ", "").strip().upper()
         if len(gst_val) == 15:
             return f"GST:{gst_val}"
     return None
         return extract_invoice_gemini(page, retry_count)
     try:
+        # Reduced resolution from 2x to 1.5x to save memory
         pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
         img_bytes = pix.tobytes("png")
+        # Explicitly free pixmap memory
         pix = None
         img = Image.open(io.BytesIO(img_bytes))
             if ocr_resp and ocr_resp.text:
                 result = try_extract_invoice_from_text(ocr_resp.text)
+        # Free image memory
         img. close()
         return result
         return None
+def extract_invoice_no_from_page(page: fitz. Page, is_image_pdf: bool) -> Optional[str]:
     # 1. Try Text Extraction (Fastest)
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
         for i in page_indices:
             out.insert_pdf(src_doc, from_page=i, to_page=i)
+        # Optimize and compress output PDF
         pdf_bytes = out.tobytes(garbage=4, deflate=True)
         return pdf_bytes
+    finally:
         out.close()
 async def root():
     return {
         "service": "Invoice Splitter API",
+        "version": "2.1",
         "max_file_size_mb": 200,
         "gemini_available": GEMINI_AVAILABLE,
         "gemini_configured": bool(GEMINI_API_KEY)
 @app.post("/split-invoices")
 async def split_invoices(
     background_tasks: BackgroundTasks,
+    file: UploadFile = File(... ),
     include_pdf: bool = Form(True),
     max_file_size_mb: int = Form(200)
 ):
     Returns:
     - JSON with split invoice parts
     """
+    if not file.filename.lower().endswith(". pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are supported")
     max_size_bytes = max_file_size_mb * 1024 * 1024
     doc = None  # Initialize for finally block
     try:
+        # Stream upload with size tracking and validation
         print(f"📥 Receiving file: {file.filename}")
         total_size = 0
         with open(temp_path, "wb") as buffer:
+            # Use 5MB chunks for faster processing
             chunk_size = 5 * 1024 * 1024
             while content := await file.read(chunk_size):
                 total_size += len(content)
+                # Check size limit during upload
                 if total_size > max_size_bytes:
                     raise HTTPException(
+                        status_code=413,
+                        detail=f"File too large.  Maximum size: {max_file_size_mb}MB, received: {total_size / (1024*1024):.1f}MB"
                     )
                 buffer.write(content)
+                # Progress logging for large files
                 if total_size % (20 * 1024 * 1024) < chunk_size:  # Every ~20MB
                     print(f"   📊 Uploaded:  {total_size / (1024*1024):.1f}MB")
         file_size_mb = total_size / (1024 * 1024)
         print(f"💾 Saved {file_size_mb:.2f}MB to:  {temp_path}")
+        # Open PDF from disk (memory-mapped)
         doc = fitz.open(temp_path)
+        if doc. page_count == 0:
             raise HTTPException(status_code=400, detail="PDF file is empty")
         print(f"📄 Processing {doc.page_count} pages...")
         # Step 2: Extract invoice numbers from all pages
         page_invoice_nos = []
+        for i in range(doc. page_count):
+            # Progress logging for large documents
             if i > 0 and i % 50 == 0:
+                print(f"   📄 Processed {i}/{doc.page_count} pages")
             page = doc. load_page(i)
                 if inv:
                     print(f"   Page {i+1}: Found invoice '{inv}'")
+                else:
+                    print(f"   Page {i+1}: No invoice number found")
             finally:
+                # Explicitly free page resources
                 page = None
+            # Force garbage collection every 100 pages
             if i > 0 and i % 100 == 0:
                 gc.collect()
         # Step 3: Filter GST-only entries and group pages
         clean_invs = [
+            None if (v and v.upper().startswith("GST: ")) else v
             for v in page_invoice_nos
         ]
         if current_group:
             groups. append({"invoice_no": current_inv, "pages": current_group})
+        # Smart merging:  If first page has no invoice, merge with second group
         if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
             print(f"   🔗 Merging first {len(groups[0]['pages'])} pages with invoice '{groups[1]['invoice_no']}'")
             groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
+            groups. pop(0)
         print(f"📦 Created {len(groups)} invoice groups")
                 "pages": [p + 1 for p in g["pages"]],  # 1-based page numbers
                 "page_count": len(g["pages"]),
                 "size_bytes": len(part_bytes),
+                "size_mb":  round(len(part_bytes) / (1024 * 1024), 2)
             }
+            # Handle large responses - skip base64 if total response too large
             if include_pdf:
                 base64_size = len(part_bytes) * 4 / 3  # Base64 encoding overhead
                 total_response_size += base64_size
             parts.append(info)
+            # Free memory immediately
             del part_bytes
+            # Garbage collect after each part
             if idx % 5 == 0:
                 gc.collect()
         raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
     finally:
+        # Critical cleanup in correct order
         if doc:
             try:
                 doc.close()
                 print("📕 Closed PDF document")
             except Exception as e:
+                print(f"⚠️ Error closing document: {e}")
         # Delete temp file
         remove_file(temp_path)
+        # Final garbage collection
         gc.collect()
     Streaming version for extremely large files.
     Returns NDJSON (newline-delimited JSON) with each part as a separate line.
+    This avoids building a large JSON response in memory.
     """
     import json
                 "error": str(e)
             }) + "\n"
         finally:
+            if doc:
                 doc.close()
             remove_file(temp_path)
             gc.collect()
         generate_parts(),
         media_type="application/x-ndjson",
         headers={
+            "Content-Disposition": f"attachment; filename=invoices-split. ndjson"
         }
     )
     print(f"   Gemini available: {GEMINI_AVAILABLE}")
     print(f"   Gemini configured: {bool(GEMINI_API_KEY)}")
+    # Configure uvicorn for large files
     uvicorn.run(
         app,
         host="0.0.0.0",