Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Community

anujakkulkarni committed on Dec 6, 2025

Commit

6ba329f

verified ·

1 Parent(s): b7cdb70

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -57

app.py CHANGED Viewed

@@ -5,15 +5,14 @@ import base64
 from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi. middleware.cors import CORSMiddleware
-from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
 # Azure Document Intelligence (Form Recognizer) - optional import
 try:
     from azure.ai.formrecognizer import DocumentAnalysisClient
-    from azure. core.credentials import AzureKeyCredential
     AZURE_AVAILABLE = True
 except ImportError:
     AZURE_AVAILABLE = False
@@ -21,9 +20,6 @@ except ImportError:
 app = FastAPI(title="Invoice Splitter API")
-# ✅ ADD GZIP COMPRESSION MIDDLEWARE (BEFORE CORS)
-app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=6)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -79,7 +75,7 @@ def get_azure_client() -> Optional[DocumentAnalysisClient]:
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
-    r"(?:Inv(?:oice)?\s*No\. ?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
@@ -93,11 +89,11 @@ GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
     """
     Detect if PDF is image-based or text-based by sampling pages.
-    Returns (is_image_based, avg_text_length).
     Strategy:
     - Sample first few pages
-    - If average extractable text < 30 chars per page, it's likely image-based
     - If text > 200 chars per page, it's text-based
     """
     total_text_length = 0
@@ -105,11 +101,10 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
     for i in range(pages_to_check):
         text = doc.load_page(i).get_text("text") or ""
-        total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
-    # ✅ CHANGED: Lower threshold from 50 to 30
-    is_image_based = avg_text_length < 30
     print(
         f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
@@ -125,14 +120,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
-    Extract invoice number from text using regex patterns.
-    Works for text-based PDFs.
     """
     if not text:
         return None
     # Pattern 1: Labeled invoice (Invoice No, Bill No, etc.)
-    m = INVOICE_NO_RE. search(text)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and inv.lower() != "invoice" and len(inv) > 2:
@@ -160,7 +155,7 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
     Uses the original fast text extraction method.
     """
     # Try full-page text
-    text = page. get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
     if inv:
         return inv
@@ -208,10 +203,10 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
                 if hasattr(document, 'fields') and document.fields:
                     # Try InvoiceId field
                     if 'InvoiceId' in document.fields and document.fields['InvoiceId']:
-                        invoice_id = document.fields['InvoiceId']. value
                         if invoice_id:
                             print(f"    ✓ Azure found InvoiceId: {invoice_id}")
-                            return str(invoice_id). strip()
                     # Try PurchaseOrder field
                     if 'PurchaseOrder' in document.fields and document.fields['PurchaseOrder']:
@@ -221,7 +216,7 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
                             return str(po).strip()
         # Fallback: try regex on Azure-extracted text
-        if result. content:
             print(
                 f"    Azure extracted {len(result.content)} chars, trying regex...")
             inv = try_extract_invoice_from_text(result.content)
@@ -242,9 +237,8 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    """
-    ✅ HYBRID EXTRACTION: Try text extraction first, then Azure as fallback
-    """
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
     if text_result:
@@ -282,20 +276,18 @@ async def split_invoices(
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     """
-    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
     Automatically detects PDF type:
-    - Text-based PDFs: Uses fast text extraction (hybrid approach)
     - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
     Parameters:
     - file: PDF file to split
     - include_pdf: Whether to include base64 PDF in response
     - initial_dpi: DPI setting (kept for compatibility)
-    Response is automatically compressed with GZip for better network reliability.
     """
-    if not file.filename.lower().endswith(". pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
     file_bytes = await file.read()
@@ -303,7 +295,7 @@ async def split_invoices(
         raise HTTPException(status_code=400, detail="empty file")
     try:
-        doc = fitz. open(stream=file_bytes, filetype="pdf")
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
@@ -318,7 +310,7 @@ async def split_invoices(
         if is_image_pdf and not get_azure_client():
             raise HTTPException(
                 status_code=500,
-                detail="Image-based PDF detected but Azure Document Intelligence is not configured.  "
                        "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
             )
@@ -378,8 +370,6 @@ async def split_invoices(
         # Step 4: Build response parts
         parts = []
-        total_base64_size = 0  # ✅ NEW: Track total size
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
@@ -389,43 +379,25 @@ async def split_invoices(
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
-                pdf_base64 = base64.b64encode(part_bytes).decode("ascii")
-                info["pdf_base64"] = pdf_base64
-                total_base64_size += len(pdf_base64)  # ✅ NEW: Track size
             parts.append(info)
             print(f"\nPart {idx+1}:")
             print(f"  Invoice: {g['invoice_no']}")
             print(f"  Pages: {info['pages']}")
             print(f"  Size: {len(part_bytes):,} bytes")
-            if include_pdf:
-                print(f"  Base64 size: {len(info. get('pdf_base64', '')):,} chars")
-        doc. close()
         print(f"\n{'='*60}")
         print(f"✓ Successfully split into {len(parts)} part(s)")
-        if include_pdf:
-            print(f"Total base64 size: {total_base64_size:,} chars ({total_base64_size/1024/1024:.2f} MB)")
         print(f"{'='*60}\n")
-        # ✅ NEW: Build response with size metadata
-        response_data = {
             "count": len(parts),
             "pdf_type": "image-based" if is_image_pdf else "text-based",
-            "parts": parts,
-            "total_size_bytes": total_base64_size if include_pdf else 0,  # For validation
-            "compression": "gzip",  # Hint that response is compressed
-        }
-        # ✅ NEW: Return with compression headers
-        return JSONResponse(
-            content=response_data,
-            headers={
-                "X-Total-Parts": str(len(parts)),
-                "X-Uncompressed-Size": str(total_base64_size),
-            }
-        )
     except HTTPException:
         raise
@@ -444,8 +416,7 @@ async def health_check():
         "status": "healthy",
         "azure_document_intelligence": azure_status,
         "azure_available": AZURE_AVAILABLE,
-        "endpoint": AZURE_FORM_RECOGNIZER_ENDPOINT if azure_status == "configured" else "not set",
-        "compression": "gzip enabled",
     }
 if __name__ == "__main__":

 from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
 # Azure Document Intelligence (Form Recognizer) - optional import
 try:
     from azure.ai.formrecognizer import DocumentAnalysisClient
+    from azure.core.credentials import AzureKeyCredential
     AZURE_AVAILABLE = True
 except ImportError:
     AZURE_AVAILABLE = False
 app = FastAPI(title="Invoice Splitter API")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
+    r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
     """
     Detect if PDF is image-based or text-based by sampling pages.
+    Returns (is_image_based, avg_text_length).
     Strategy:
     - Sample first few pages
+    - If average extractable text < 50 chars per page, it's likely image-based
     - If text > 200 chars per page, it's text-based
     """
     total_text_length = 0
     for i in range(pages_to_check):
         text = doc.load_page(i).get_text("text") or ""
+        total_text_length += len(text.strip())
     avg_text_length = total_text_length / pages_to_check
+    is_image_based = avg_text_length < 50
     print(
         f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
+    Extract invoice number from text using regex patterns.
+    Works for text-based PDFs.
     """
     if not text:
         return None
     # Pattern 1: Labeled invoice (Invoice No, Bill No, etc.)
+    m = INVOICE_NO_RE.search(text)
     if m:
         inv = (m.group(1) or "").strip()
         if inv and inv.lower() != "invoice" and len(inv) > 2:
     Uses the original fast text extraction method.
     """
     # Try full-page text
+    text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
     if inv:
         return inv
                 if hasattr(document, 'fields') and document.fields:
                     # Try InvoiceId field
                     if 'InvoiceId' in document.fields and document.fields['InvoiceId']:
+                        invoice_id = document.fields['InvoiceId'].value
                         if invoice_id:
                             print(f"    ✓ Azure found InvoiceId: {invoice_id}")
+                            return str(invoice_id).strip()
                     # Try PurchaseOrder field
                     if 'PurchaseOrder' in document.fields and document.fields['PurchaseOrder']:
                             return str(po).strip()
         # Fallback: try regex on Azure-extracted text
+        if result.content:
             print(
                 f"    Azure extracted {len(result.content)} chars, trying regex...")
             inv = try_extract_invoice_from_text(result.content)
 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
+    """Try text extraction first, then Azure as fallback"""
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
     if text_result:
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     """
+    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
     Automatically detects PDF type:
+    - Text-based PDFs: Uses fast text extraction (original method)
     - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
     Parameters:
     - file: PDF file to split
     - include_pdf: Whether to include base64 PDF in response
     - initial_dpi: DPI setting (kept for compatibility)
     """
+    if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
     file_bytes = await file.read()
         raise HTTPException(status_code=400, detail="empty file")
     try:
+        doc = fitz.open(stream=file_bytes, filetype="pdf")
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
         if is_image_pdf and not get_azure_client():
             raise HTTPException(
                 status_code=500,
+                detail="Image-based PDF detected but Azure Document Intelligence is not configured. "
                        "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
             )
         # Step 4: Build response parts
         parts = []
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
+                info["pdf_base64"] = base64.b64encode(
+                    part_bytes).decode("ascii")
             parts.append(info)
             print(f"\nPart {idx+1}:")
             print(f"  Invoice: {g['invoice_no']}")
             print(f"  Pages: {info['pages']}")
             print(f"  Size: {len(part_bytes):,} bytes")
+        doc.close()
         print(f"\n{'='*60}")
         print(f"✓ Successfully split into {len(parts)} part(s)")
         print(f"{'='*60}\n")
+        return JSONResponse({
             "count": len(parts),
             "pdf_type": "image-based" if is_image_pdf else "text-based",
+            "parts": parts
+        })
     except HTTPException:
         raise
         "status": "healthy",
         "azure_document_intelligence": azure_status,
         "azure_available": AZURE_AVAILABLE,
+        "endpoint": AZURE_FORM_RECOGNIZER_ENDPOINT if azure_status == "configured" else "not set"
     }
 if __name__ == "__main__":