Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed on Nov 22, 2025

Commit

60a66d0

verified ·

1 Parent(s): df84667

Update app.py

Browse files

Files changed (1) hide show

app.py +306 -40

app.py CHANGED Viewed

@@ -1,13 +1,22 @@
 import io
 import re
 import base64
-from typing import List, Dict, Optional
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
 app = FastAPI(title="Invoice Splitter API")
 app.add_middleware(
@@ -18,70 +27,260 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# --- improved invoice detection (replace the old INVOICE_NO_RE + function) ---
 INVOICE_NO_RE = re.compile(
     r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
-# fallback pattern to capture common GST-like invoice ids (GST-12345 etc)
 GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
-def extract_invoice_no_from_page(page: fitz.Page) -> Optional[str]:
     """
-    Attempt several methods to get an invoice id from the page.
-    1) full-page text search for labeled invoice (Inv No / Invoice No / Bill No)
-    2) block-level search (useful when label and id are on different lines)
-    3) fallback: search for GST-* patterns (many of your PDFs use 'BILL NO. : GST-12345')
-    Returns a stripped string or None.
     """
-    text = page.get_text("text") or ""
-    # 1) try labeled pattern on whole page
     m = INVOICE_NO_RE.search(text)
     if m:
         inv = (m.group(1) or "").strip()
-        if inv and inv.lower() != "invoice":
             return inv
-    # 2) block-level fallback
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
-        if not block_text:
-            continue
-        m = INVOICE_NO_RE.search(block_text)
-        if m:
-            inv = (m.group(1) or "").strip()
-            if inv and inv.lower() != "invoice":
                 return inv
-    # 3) GST-like fallback (common in your PDF: "BILL NO. : GST-25507")
-    m = GST_LIKE_RE.search(text)
-    if m:
-        return m.group(1).replace(" ", "").strip()
-    # if nothing found
-    return None
-# -------------------------------------------------------------------------
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
     """Create a new PDF with the given pages (0-based indices)."""
     out = fitz.open()
     for i in page_indices:
-        # Note: insert_pdf uses from_page/to_page, not "pages" kwarg.
         out.insert_pdf(src_doc, from_page=i, to_page=i)
     pdf_bytes = out.tobytes()
     out.close()
     return pdf_bytes
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
-    initial_dpi: int = Form(300),  # kept for compatibility; not used here
 ):
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
@@ -94,23 +293,49 @@ async def split_invoices(
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
-        # Extract invoice number per page (0-based)
         page_invoice_nos: List[Optional[str]] = []
         for i in range(doc.page_count):
-            inv = extract_invoice_no_from_page(doc.load_page(i))
             page_invoice_nos.append(inv)
-        # Group pages: start a new group when a NEW non-None invoice number appears
         groups: List[Dict] = []
         current_group_pages: List[int] = []
         current_invoice: Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos):
             if current_invoice is None:
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
                 if inv is not None and inv != current_invoice:
                     groups.append({
                         "invoice_no": current_invoice,
                         "pages": current_group_pages[:],
@@ -118,19 +343,28 @@ async def split_invoices(
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
                     current_group_pages.append(idx)
         if current_group_pages:
-            groups.append({"invoice_no": current_invoice,
-                          "pages": current_group_pages[:]})
-        # If we never found any invoice numbers, return the whole doc as one part
         if all(g["invoice_no"] is None for g in groups):
-            groups = [{"invoice_no": None,
-                       "pages": list(range(doc.page_count))}]
         parts = []
-        for g in groups:
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
@@ -139,14 +373,46 @@ async def split_invoices(
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
-                info["pdf_base64"] = base64.b64encode(
-                    part_bytes).decode("ascii")
             parts.append(info)
         doc.close()
-        return JSONResponse({"count": len(parts), "parts": parts})
     except HTTPException:
         raise
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)

 import io
 import re
 import base64
+from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
+# Azure Document Intelligence (Form Recognizer) - optional import
+try:
+    from azure.ai.formrecognizer import DocumentAnalysisClient
+    from azure.core.credentials import AzureKeyCredential
+    AZURE_AVAILABLE = True
+except ImportError:
+    AZURE_AVAILABLE = False
+    print("Warning: azure-ai-formrecognizer not installed. Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
 app.add_middleware(
     allow_headers=["*"],
 )
# --- Azure Document Intelligence settings ---
# Each setting is read from the environment variable of the same name; the
# placeholder literal is used only when that variable is unset.
import os

AZURE_FORM_RECOGNIZER_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://your-resource-name.cognitiveservices.azure.com/",
)
AZURE_FORM_RECOGNIZER_KEY = os.environ.get(
    "AZURE_FORM_RECOGNIZER_KEY",
    "your-actual-key-here",
)

# Module-level slot for the shared client instance; starts unset.
azure_client = None
def get_azure_client() -> Optional["DocumentAnalysisClient"]:
    """Return the shared Azure Document Intelligence client, creating it on first use.

    The return annotation is a string forward reference on purpose: the Azure
    SDK import is optional, so ``DocumentAnalysisClient`` may not exist as a
    name when this function is defined. An unquoted annotation would raise
    ``NameError`` at module load and defeat the optional import.

    Returns:
        The cached client, or ``None`` when the SDK is not installed, the
        endpoint/key are empty or still the placeholder values, or client
        construction raises.
    """
    global azure_client

    if not AZURE_AVAILABLE:
        print("Azure SDK not available")
        return None

    # Reuse the client built by an earlier call.
    if azure_client is not None:
        return azure_client

    # Refuse to build a client from empty or placeholder credentials.
    credentials_missing = (
        not AZURE_FORM_RECOGNIZER_ENDPOINT
        or not AZURE_FORM_RECOGNIZER_KEY
        or AZURE_FORM_RECOGNIZER_ENDPOINT == "https://your-resource-name.cognitiveservices.azure.com/"
        or AZURE_FORM_RECOGNIZER_KEY == "your-actual-key-here"
    )
    if credentials_missing:
        print("Warning: Azure credentials are not properly configured in the code.")
        return None

    try:
        azure_client = DocumentAnalysisClient(
            endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT,
            credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY),
        )
        print("✓ Azure Document Intelligence client initialized")
        print(f"  Endpoint: {AZURE_FORM_RECOGNIZER_ENDPOINT}")
    except Exception as e:
        # Best effort: callers treat None as "Azure unavailable".
        print(f"Failed to initialize Azure client: {e}")
        return None

    return azure_client
+# --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
     r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
+PREFIXED_INVOICE_RE = re.compile(
+    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
+)
 GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
    """Classify a PDF as image-based or text-based by sampling its first pages.

    The stripped extractable text length is averaged over the sampled pages.
    An average below 50 characters per page is classified as image-based;
    anything else is text-based.

    Args:
        doc: Document to inspect.
        sample_pages: Maximum number of leading pages to sample. Values below
            1 are treated as 1 so the average is always well defined.

    Returns:
        ``(is_image_based, avg_text_length)``. A document with no pages
        returns ``(False, 0.0)`` instead of dividing by zero.
    """
    pages_to_check = min(max(sample_pages, 1), doc.page_count)
    if pages_to_check <= 0:
        # No pages at all: nothing to sample and nothing to OCR.
        return False, 0.0

    total_text_length = sum(
        len((doc.load_page(i).get_text("text") or "").strip())
        for i in range(pages_to_check)
    )
    avg_text_length = total_text_length / pages_to_check
    is_image_based = avg_text_length < 50

    print(f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
    print(f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
    return is_image_based, avg_text_length
+# ============================================================================
+# TEXT-BASED PDF EXTRACTION (Original Code)
+# ============================================================================
def try_extract_invoice_from_text(text: str) -> Optional[str]:
    """Extract an invoice number from plain text using the module's regex patterns.

    Patterns are tried in order:
      1. A labeled id (``INVOICE_NO_RE``). Every labeled occurrence is
         examined, so a rejected early hit such as "Invoice No. of copies"
         does not hide a valid labeled id later in the text.
      2. A prefixed id (``PREFIXED_INVOICE_RE``) within the first 500
         characters, accepted when at least 7 characters long.
      3. A GST-style id (``GST_LIKE_RE``), returned with spaces removed.

    Args:
        text: Text to search; empty or ``None`` yields ``None``.

    Returns:
        The first acceptable id, or ``None`` when no pattern matches.
    """
    if not text:
        return None

    # Pattern 1: labeled invoice (Invoice No, Bill No, etc.).
    for m in INVOICE_NO_RE.finditer(text):
        inv = (m.group(1) or "").strip()
        # Reject the bare word "invoice" and ids of two characters or fewer.
        if inv and inv.lower() != "invoice" and len(inv) > 2:
            return inv

    # Pattern 2: prefixed invoice (WN-12345/25), top portion of the text only.
    m = PREFIXED_INVOICE_RE.search(text[:500])
    if m:
        inv = (m.group(1) or "").strip()
        if inv and len(inv) >= 7:
            return inv

    # Pattern 3: GST format.
    m = GST_LIKE_RE.search(text)
    if m:
        return m.group(1).replace(" ", "").strip()

    return None
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
    """Find an invoice number on a page using its extractable text.

    The whole-page text is searched first. If that yields nothing, each text
    block is searched on its own, in the order the page returns them.

    Returns:
        The first invoice number found, or ``None``.
    """
    # Whole-page pass.
    found = try_extract_invoice_from_text(page.get_text("text") or "")
    if found:
        return found

    # Per-block pass: index 4 of a block entry holds its text.
    block_texts = (
        entry[4]
        for entry in (page.get_text("blocks") or [])
        if len(entry) > 4 and entry[4]
    )
    candidates = (try_extract_invoice_from_text(chunk) for chunk in block_texts)
    return next((hit for hit in candidates if hit), None)
+# ============================================================================
+# IMAGE-BASED PDF EXTRACTION (Azure Document Intelligence)
+# ============================================================================
def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
    """Find an invoice number on a page via Azure Document Intelligence.

    The page is rendered to a PNG and analyzed with the "prebuilt-invoice"
    model. For each returned document the ``InvoiceId`` field is tried first,
    then ``PurchaseOrder``. If neither yields a value, the regex extractor is
    run over the text Azure recognized.

    Returns:
        The extracted id as a stripped string, or ``None`` when the client is
        unavailable, nothing is found, or any step raises.
    """
    client = get_azure_client()
    if not client:
        print("    Azure client not available")
        return None

    try:
        # Render the page at 2x zoom and encode it as PNG.
        zoom = fitz.Matrix(2, 2)
        png_bytes = page.get_pixmap(matrix=zoom).tobytes("png")

        print("    Calling Azure Document Intelligence API...")
        result = client.begin_analyze_document(
            "prebuilt-invoice",
            document=png_bytes,
        ).result()

        # Structured fields take priority over the raw recognized text.
        for document in (result.documents or ()):
            fields = getattr(document, "fields", None)
            if not fields:
                continue

            invoice_field = fields['InvoiceId'] if 'InvoiceId' in fields else None
            invoice_id = invoice_field.value if invoice_field else None
            if invoice_id:
                print(f"    ✓ Azure found InvoiceId: {invoice_id}")
                return str(invoice_id).strip()

            po_field = fields['PurchaseOrder'] if 'PurchaseOrder' in fields else None
            po = po_field.value if po_field else None
            if po:
                print(f"    ✓ Azure found PurchaseOrder: {po}")
                return str(po).strip()

        # Fallback: run the regex extractor over the recognized text.
        if result.content:
            print(f"    Azure extracted {len(result.content)} chars, trying regex...")
            inv = try_extract_invoice_from_text(result.content)
            if inv:
                print(f"    ✓ Found via regex on Azure text: {inv}")
                return inv

        print("    ✗ Azure: No invoice found")
        return None
    except Exception as e:
        print(f"    ✗ Azure extraction failed: {e}")
        return None
+# ============================================================================
+# UNIFIED EXTRACTION LOGIC
+# ============================================================================
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """Dispatch invoice-number extraction according to the PDF type.

    Args:
        page: PDF page to extract from.
        is_image_pdf: True routes to the Azure extractor; False routes to the
            text-based extractor.

    Returns:
        Whatever the selected extractor returns (an id string or ``None``).
    """
    if not is_image_pdf:
        # Text-based PDFs: use the page's own extractable text.
        print(f"  Method: Text extraction (text-based)")
        return extract_invoice_text_based(page)

    # Image-based PDFs: use Azure.
    print(f"  Method: Azure Document Intelligence (image-based)")
    return extract_invoice_azure(page)
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
     """Create a new PDF with the given pages (0-based indices)."""
     out = fitz.open()
     for i in page_indices:
         out.insert_pdf(src_doc, from_page=i, to_page=i)
     pdf_bytes = out.tobytes()
     out.close()
     return pdf_bytes
+# ============================================================================
+# API ENDPOINT
+# ============================================================================
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
+    initial_dpi: int = Form(300),  # Kept for compatibility
 ):
+    """
+    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
+    Automatically detects PDF type:
+    - Text-based PDFs: Uses fast text extraction (original method)
+    - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
+    Parameters:
+    - file: PDF file to split
+    - include_pdf: Whether to include base64 PDF in response
+    - initial_dpi: DPI setting (kept for compatibility)
+    """
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")
+        print(f"\n{'='*60}")
+        print(f"Processing PDF: {file.filename}")
+        print(f"Total pages: {doc.page_count}")
+        print(f"{'='*60}")
+        # Step 1: Detect PDF type (text-based vs image-based)
+        is_image_pdf, avg_text_len = is_image_based_pdf(doc)
+        if is_image_pdf and not get_azure_client():
+            raise HTTPException(
+                status_code=500,
+                detail="Image-based PDF detected but Azure Document Intelligence is not configured. "
+                       "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
+            )
+        # Step 2: Extract invoice numbers from each page
         page_invoice_nos: List[Optional[str]] = []
         for i in range(doc.page_count):
+            print(f"\n--- Page {i+1}/{doc.page_count} ---")
+            inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
+            if inv:
+                print(f"  ✓ Invoice found: {inv}")
+            else:
+                print(f"  ✗ No invoice found")
             page_invoice_nos.append(inv)
+        print(f"\n{'='*60}")
+        print(f"Extraction Results: {page_invoice_nos}")
+        print(f"{'='*60}")
+        # Step 3: Group pages by invoice number
         groups: List[Dict] = []
         current_group_pages: List[int] = []
         current_invoice: Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos):
             if current_invoice is None:
+                # Start first group
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
                 if inv is not None and inv != current_invoice:
+                    # New invoice detected - save current group
                     groups.append({
                         "invoice_no": current_invoice,
                         "pages": current_group_pages[:],
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
+                    # Continue current group (same invoice or no invoice)
                     current_group_pages.append(idx)
+        # Save last group
         if current_group_pages:
+            groups.append({
+                "invoice_no": current_invoice,
+                "pages": current_group_pages[:]
+            })
+        # If no invoices found, return whole document as one part
         if all(g["invoice_no"] is None for g in groups):
+            print("\n⚠ Warning: No invoices detected in any page!")
+            print("  Returning entire PDF as single part")
+            groups = [{
+                "invoice_no": None,
+                "pages": list(range(doc.page_count))
+            }]
+        # Step 4: Build response parts
         parts = []
+        for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
+                info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
             parts.append(info)
+            print(f"\nPart {idx+1}:")
+            print(f"  Invoice: {g['invoice_no']}")
+            print(f"  Pages: {info['pages']}")
+            print(f"  Size: {len(part_bytes):,} bytes")
         doc.close()
+        print(f"\n{'='*60}")
+        print(f"✓ Successfully split into {len(parts)} part(s)")
+        print(f"{'='*60}\n")
+        return JSONResponse({
+            "count": len(parts),
+            "pdf_type": "image-based" if is_image_pdf else "text-based",
+            "parts": parts
+        })
     except HTTPException:
         raise
     except Exception as e:
+        print(f"\n✗ Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return JSONResponse({"error": str(e)}, status_code=500)
@app.get("/health")
async def health_check():
    """Report service health and whether an Azure client can be obtained.

    The configured endpoint is included only when a client is available;
    otherwise the ``endpoint`` entry reads "not set".
    """
    if get_azure_client():
        azure_status = "configured"
        endpoint = AZURE_FORM_RECOGNIZER_ENDPOINT
    else:
        azure_status = "not configured"
        endpoint = "not set"

    return {
        "status": "healthy",
        "azure_document_intelligence": azure_status,
        "azure_available": AZURE_AVAILABLE,
        "endpoint": endpoint,
    }
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)