Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed on Oct 12, 2025

Commit

c07129b

verified ·

1 Parent(s): a072bcb

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -50

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
-from fastapi import FastAPI, File, UploadFile, Form
-from fastapi.responses import JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-import fitz  # PyMuPDF
 import io
 import re
 import base64
 app = FastAPI(title="Invoice Splitter API")
-# Allow CORS (optional but helpful for Flutter/JS frontend)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -17,59 +18,118 @@ app.add_middleware(
     allow_headers=["*"],
 )
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
-    initial_dpi: int = Form(300)  # accepted from the form but never read in this body
 ):
     try:
-        pdf_data = await file.read()
-        pdf = fitz.open(stream=pdf_data, filetype="pdf")
-        invoice_pattern = re.compile(r"\b[A-Z0-9]{3,10}\b")  # Example pattern; compiled but not referenced again below
-        splits = []
-        current_invoice = None
-        current_pages = []
-        for page_num, page in enumerate(pdf, start=1):  # page_num is 1-based
-            text = page.get_text("text")
-            match = re.search(r"Invoice\s*No[:\s\-]*([A-Z0-9]+)", text, re.I)
-            if match:
-                invoice_no = match.group(1)
-                if current_invoice:  # an invoice is already open: flush its pages first
-                    splits.append({
                         "invoice_no": current_invoice,
-                        "pages": current_pages.copy()
                     })
-                    current_pages.clear()
-                current_invoice = invoice_no
-            current_pages.append(page_num)  # runs for every page, matched or not
-        if current_invoice and current_pages:  # nothing is emitted when no page ever matched
-            splits.append({"invoice_no": current_invoice, "pages": current_pages})
-        results = []
-        for split in splits:
-            doc = fitz.open()
-            for pno in split["pages"]:
-                doc.insert_pdf(pdf, from_page=pno-1, to_page=pno-1)  # stored page numbers are 1-based, hence pno-1
-            pdf_bytes = doc.tobytes()
-            base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8') if include_pdf else None  # None when include_pdf is false
-            results.append({
-                "invoice_no": split["invoice_no"],
-                "num_pages": len(split["pages"]),
-                "pages": split["pages"],
-                "pdf_base64": base64_pdf
-            })
-        return JSONResponse({
-            "count": len(results),
-            "parts": results
-        })
     except Exception as e:  # any failure is reported as a 500 with the exception text
         return JSONResponse({"error": str(e)}, status_code=500)

 import io
 import re
 import base64
+from typing import List, Dict, Optional
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+import fitz  # PyMuPDF
 app = FastAPI(title="Invoice Splitter API")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
# Matches an invoice-number label followed by its value; group 1 is the value.
#
# Label:  "Inv" or "Invoice", optional whitespace, then one of
#         "No", "No.", "Num", "Num.", "Number" or "#".
# Value:  after an optional ":" or "-" separator, a run of letters, digits,
#         "-" and "/".
#
# A bare "No" / "Num" / "Number" must not be followed directly by a letter.
# Without that guard the label matches the prefix of a longer word, so
# "Invoice Number: 123" would capture "mber" and "Inv November" would
# capture "vember". A dotted label ("No.") is unambiguous, so a letter may
# follow it immediately ("Invoice No.ABC123").
INVOICE_NO_RE = re.compile(
    r"Inv(?:oice)?\s*"
    r"(?:Number(?![A-Za-z])|Num(?:\.|(?![A-Za-z]))|No(?:\.|(?![A-Za-z]))|#)"
    r"\s*[:\-]?\s*([A-Za-z0-9\-\/]+)",
    re.IGNORECASE,
)
def _first_valid_invoice_no(text: str) -> Optional[str]:
    """Return the first acceptable invoice number found in *text*, or None."""
    for match in INVOICE_NO_RE.finditer(text or ""):
        candidate = (match.group(1) or "").strip()
        # The captured "value" can be the word "Invoice" itself (a label that
        # is followed by another label). Skip it and keep scanning instead of
        # giving up on the whole text.
        if candidate and candidate.lower() != "invoice":
            return candidate
    return None


def extract_invoice_no_from_page(page: fitz.Page) -> Optional[str]:
    """
    Extract the invoice number printed on a page.

    The full page text is scanned first. If it yields no acceptable number,
    each text block is scanned on its own, which copes better with layouts
    where the label and value are separated in the flattened page text.

    Args:
        page: the PyMuPDF page to inspect.

    Returns:
        The first acceptable invoice number, or None when the page has none.
    """
    # 1) Full page text
    inv = _first_valid_invoice_no(page.get_text("text") or "")
    if inv is not None:
        return inv

    # 2) Block-level fallback (handles layout/line breaks better)
    for block in page.get_text("blocks") or []:
        # PyMuPDF "blocks" entries are tuples; the 5th item is the text
        block_text = block[4] if len(block) > 4 else ""
        inv = _first_valid_invoice_no(block_text)
        if inv is not None:
            return inv
    return None
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """
    Create a new PDF containing the given pages of *src_doc*.

    Args:
        src_doc: the open source document to copy pages from.
        page_indices: 0-based page indices, copied in the order given.

    Returns:
        The serialized bytes of the new PDF.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            # Note: insert_pdf uses from_page/to_page, not "pages" kwarg.
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Release the temporary document even when copying or saving fails.
        out.close()
 def _group_pages_by_invoice(page_invoice_nos: List[Optional[str]]) -> List[Dict]:
     """Partition 0-based page indices into consecutive runs, one per invoice.

     A new run starts on the first page and whenever a page carries a
     non-None invoice number that differs from the current run's number.
     Pages without a number stay in the current run. Pages that precede the
     first detected number therefore form their own run (invoice_no None)
     instead of being discarded, and a document with no numbers at all
     becomes a single run covering every page.
     """
     groups: List[Dict] = []
     for idx, inv in enumerate(page_invoice_nos):
         if not groups or (inv is not None and inv != groups[-1]["invoice_no"]):
             groups.append({"invoice_no": inv, "pages": [idx]})
         else:
             groups[-1]["pages"].append(idx)
     return groups


 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
     include_pdf: bool = Form(True),
     initial_dpi: int = Form(300),  # kept for compatibility; not used here
 ):
     """
     Split an uploaded PDF into one part per detected invoice number.

     Form fields:
         file: the PDF upload; its filename must end with ".pdf".
         include_pdf: when true, each part carries its PDF as base64.
         initial_dpi: accepted for compatibility; ignored.

     Returns:
         JSON ``{"count": n, "parts": [...]}`` where each part has
         ``invoice_no`` (None for pages with no detected number), 1-based
         ``pages``, ``num_pages``, ``size_bytes`` and, if requested,
         ``pdf_base64``. Unexpected failures return
         ``{"error": "..."}`` with status 500.

     Raises:
         HTTPException(400): wrong extension, empty upload, or a PDF with
         no pages.
     """
     # filename may be missing on the upload; treat that as "not a PDF"
     # rather than failing with an AttributeError.
     if not (file.filename or "").lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
     file_bytes = await file.read()
     if not file_bytes:
         raise HTTPException(status_code=400, detail="empty file")

     doc = None
     try:
         doc = fitz.open(stream=file_bytes, filetype="pdf")
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="no pages found")

         # Extract invoice number per page (0-based)
         page_invoice_nos: List[Optional[str]] = [
             extract_invoice_no_from_page(doc.load_page(i))
             for i in range(doc.page_count)
         ]
         groups = _group_pages_by_invoice(page_invoice_nos)

         parts = []
         for g in groups:
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
                 "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
                 "num_pages": len(g["pages"]),
                 "size_bytes": len(part_bytes),
             }
             if include_pdf:
                 info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
             parts.append(info)
         return JSONResponse({"count": len(parts), "parts": parts})
     except HTTPException:
         # Deliberate 4xx responses must not be converted into a 500.
         raise
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
     finally:
         # Close the source document on every path, including errors.
         if doc is not None:
             doc.close()