Spaces:

anujakkulkarni
/

splitpdffile

Paused

App Files Community

anujakkulkarni committed on Dec 12, 2025

Commit

b92224e

verified ·

1 Parent(s): 5034d81

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -55

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ try:
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
-except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
@@ -54,7 +54,7 @@ def get_gemini_model():
             genai.configure(api_key=GEMINI_API_KEY)
             gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
             print("✓ Google Gemini Flash 2.0 initialized")
-        except Exception as e:
             print(f"Failed to initialize Gemini model: {e}")
             return None
@@ -71,7 +71,8 @@ PREFIXED_INVOICE_RE = re.compile(
     r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
-GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
@@ -106,33 +107,54 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
 # TEXT-BASED PDF EXTRACTION (Original Code)
 # ============================================================================
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
     Extract invoice number from text using regex patterns.
-    Works for text-based PDFs.
     """
     if not text:
         return None
-    # Pattern 1: Labeled invoice (Invoice No, Bill No, etc.)
-    m = INVOICE_NO_RE. search(text)
     if m:
         inv = (m.group(1) or "").strip()
-        if inv and inv.lower() != "invoice" and len(inv) > 2:
             return inv
-    # Pattern 2: Prefixed invoice (WN-12345/25) - search top portion
-    top_text = text[:500]
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
-        if inv and len(inv) >= 7:
             return inv
-    # Pattern 3: GST format
-    m = GST_LIKE_RE.search(text)
-    if m:
-        return m.group(1).replace(" ", "").strip()
     return None
@@ -151,9 +173,9 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
     # Try block-level text
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
-        if block_text:
             inv = try_extract_invoice_from_text(block_text)
-            if inv:
                 return inv
     return None
@@ -176,7 +198,7 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
         # Convert to PIL Image for Gemini
         img = Image.open(io.BytesIO(img_bytes))
@@ -193,24 +215,26 @@ def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
         print("    Calling Google Gemini API...")
         response = model.generate_content([prompt, img])
         if response and response.text:
             extracted_text = response.text.strip()
             print(f"    Gemini response: {extracted_text}")
             if extracted_text and extracted_text != "NOT_FOUND":
                 # Clean up the response
-                invoice_no = extracted_text.replace("*", "").replace("#", "").strip()
                 if invoice_no and len(invoice_no) > 2:
                     print(f"    ✓ Gemini found invoice: {invoice_no}")
                     return invoice_no
             # Fallback:  Get full OCR text and try regex
             ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
             ocr_response = model.generate_content([ocr_prompt, img])
             if ocr_response and ocr_response.text:
-                print(f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
                 inv = try_extract_invoice_from_text(ocr_response.text)
                 if inv:
                     print(f"    ✓ Found via regex on Gemini text: {inv}")
@@ -263,21 +287,18 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
 @app.post("/split-invoices")
 async def split_invoices(
-    file: UploadFile = File(... ),
     include_pdf: bool = Form(True),
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     """
-    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
-    Automatically detects PDF type:
-    - Text-based PDFs:  Uses fast text extraction (original method)
-    - Image-based PDFs: Uses Google Gemini Flash 2.0 for accurate OCR
-    Parameters:
-    - file: PDF file to split
-    - include_pdf:  Whether to include base64 PDF in response
-    - initial_dpi: DPI setting (kept for compatibility)
     """
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
@@ -307,41 +328,61 @@ async def split_invoices(
             )
         # Step 2: Extract invoice numbers from each page
-        page_invoice_nos:  List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
             inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
             if inv:
-                print(f"  ✓ Invoice found: {inv}")
             else:
-                print(f"  ✗ No invoice found")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
-        print(f"Extraction Results:  {page_invoice_nos}")
         print(f"{'='*60}")
-        # Step 3: Group pages by invoice number
-        groups:  List[Dict] = []
-        current_group_pages:  List[int] = []
-        current_invoice:  Optional[str] = None
-        for idx, inv in enumerate(page_invoice_nos):
-            if current_invoice is None:
-                # Start first group
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
                 if inv is not None and inv != current_invoice:
-                    # New invoice detected - save current group
                     groups.append({
-                        "invoice_no":  current_invoice,
-                        "pages": current_group_pages[: ],
                     })
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
-                    # Continue current group (same invoice or no invoice)
                     current_group_pages.append(idx)
         # Save last group
@@ -351,9 +392,15 @@ async def split_invoices(
                 "pages": current_group_pages[:]
             })
-        # If no invoices found, return whole document as one part
         if all(g["invoice_no"] is None for g in groups):
-            print("\n⚠ Warning: No invoices detected in any page!")
             print("  Returning entire PDF as single part")
             groups = [{
                 "invoice_no": None,
@@ -365,12 +412,13 @@ async def split_invoices(
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
                 "invoice_no": g["invoice_no"],
-                "pages":  [p + 1 for p in g["pages"]],  # 1-based for humans
                 "num_pages": len(g["pages"]),
-                "size_bytes":  len(part_bytes),
             }
-            if include_pdf:
                 info["pdf_base64"] = base64.b64encode(
                     part_bytes).decode("ascii")
             parts.append(info)
@@ -387,17 +435,17 @@ async def split_invoices(
         return JSONResponse({
             "count": len(parts),
-            "pdf_type":  "image-based" if is_image_pdf else "text-based",
             "parts": parts
         })
     except HTTPException:
         raise
-    except Exception as e:
         print(f"\n✗ Error: {str(e)}")
         import traceback
         traceback.print_exc()
-        return JSONResponse({"error":  str(e)}, status_code=500)
 @app.get("/health")

     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
+except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
             genai.configure(api_key=GEMINI_API_KEY)
             gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
             print("✓ Google Gemini Flash 2.0 initialized")
+        except Exception as e:
             print(f"Failed to initialize Gemini model: {e}")
             return None
     r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
+GST_LIKE_RE = re.compile(
+    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
 # TEXT-BASED PDF EXTRACTION (Original Code)
 # ============================================================================
+def normalize_text_for_search(s: str) -> str:
+    """Light normalization: collapse whitespace and normalize common separators."""
+    if not s:
+        return s
+    s = s.replace("\u00A0", " ")  # non-breaking space
+    s = re.sub(r"[\r\n\t]+", " ", s)
+    s = re.sub(r"[ ]{2,}", " ", s).strip()
+    return s
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
     Extract invoice number from text using regex patterns.
+    - Prefer explicit labeled Invoice/Bill patterns.
+    - Prefer prefixed invoice formats found in the top of the page.
+    - Use GST only as a last resort and tag it so it won't be mistaken for an invoice id.
     """
     if not text:
         return None
+    text_norm = normalize_text_for_search(text)
+    # 1) Labeled invoice like "Invoice No", "Inv No."
+    m = INVOICE_NO_RE.search(text_norm)
     if m:
         inv = (m.group(1) or "").strip()
+        if inv and inv.lower() not in ("invoice", "inv", "bill") and len(inv) > 2:
             return inv
+    # 2) Search top portion for prefixed invoice codes (WN-1234, 5EN19710, etc.)
+    top_text = text_norm[:600]  # bigger top area to be robust
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
+        # extra length check so tiny numeric matches don't pass
+        if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
             return inv
+    # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
+    gm = GST_LIKE_RE.search(text_norm)
+    if gm:
+        gst_val = gm.group(2) or ""
+        gst_val = gst_val.replace(" ", "").strip().upper()
+        # Only accept if 15 alnum chars (typical Indian GSTIN length)
+        if len(gst_val) == 15 and re.match(r"^[0-9A-Z]{15}$", gst_val):
+            # tag it so grouping won't treat GST same as invoice ID
+            return f"GST:{gst_val}"
     return None
     # Try block-level text
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
+        if block_text:
             inv = try_extract_invoice_from_text(block_text)
+            if inv:
                 return inv
     return None
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
         # Convert to PIL Image for Gemini
         img = Image.open(io.BytesIO(img_bytes))
         print("    Calling Google Gemini API...")
         response = model.generate_content([prompt, img])
         if response and response.text:
             extracted_text = response.text.strip()
             print(f"    Gemini response: {extracted_text}")
             if extracted_text and extracted_text != "NOT_FOUND":
                 # Clean up the response
+                invoice_no = extracted_text.replace(
+                    "*", "").replace("#", "").strip()
                 if invoice_no and len(invoice_no) > 2:
                     print(f"    ✓ Gemini found invoice: {invoice_no}")
                     return invoice_no
             # Fallback:  Get full OCR text and try regex
             ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
             ocr_response = model.generate_content([ocr_prompt, img])
             if ocr_response and ocr_response.text:
+                print(
+                    f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
                 inv = try_extract_invoice_from_text(ocr_response.text)
                 if inv:
                     print(f"    ✓ Found via regex on Gemini text: {inv}")
 @app.post("/split-invoices")
 async def split_invoices(
+    file: UploadFile = File(...),
     include_pdf: bool = Form(True),
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     """
+    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
+    - Text-based PDFs: Uses fast text extraction
+    - Image-based PDFs: Uses Google Gemini Flash 2.0 (if configured)
+    Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
+    are ignored for splitting by default (so repeated company GST won't prevent splits).
     """
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="only PDF is supported")
             )
         # Step 2: Extract invoice numbers from each page
+        page_invoice_nos: List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
             inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
+            # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
             if inv:
+                print(f"  ✓ Raw extracted id: {inv}")
             else:
+                print(f"  ✗ No invoice found (raw)")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
+        print(f"Raw Extraction Results:  {page_invoice_nos}")
         print(f"{'='*60}")
+        # ---------------------------------------------------------
+        # Post-process extracted ids before grouping
+        # - Treat GST:<value> as a LAST-RESORT marker and ignore it for splitting
+        #   (convert to None) so repeated company GST doesn't group pages together.
+        # - Keep actual invoice ids like '5EN19710' intact.
+        # ---------------------------------------------------------
+        page_invoice_nos_filtered: List[Optional[str]] = []
+        for v in page_invoice_nos:
+            if v is None:
+                page_invoice_nos_filtered.append(None)
+            else:
+                # If GST-tagged value (we returned "GST:..."), ignore it for splitting
+                if isinstance(v, str) and v.upper().startswith("GST:"):
+                    page_invoice_nos_filtered.append(None)
+                else:
+                    page_invoice_nos_filtered.append(v)
+        print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
+        # Step 3: Group pages by invoice number (use filtered ids)
+        groups: List[Dict] = []
+        current_group_pages: List[int] = []
+        current_invoice: Optional[str] = None
+        for idx, inv in enumerate(page_invoice_nos_filtered):
+            if current_invoice is None:
+                # Start a new group (even if inv is None)
                 current_invoice = inv
                 current_group_pages = [idx]
             else:
+                # If a new non-empty invoice appears and differs -> close current group
                 if inv is not None and inv != current_invoice:
                     groups.append({
+                        "invoice_no": current_invoice,
+                        "pages": current_group_pages[:],
                     })
                     current_invoice = inv
                     current_group_pages = [idx]
                 else:
+                    # Continue current group (same invoice or both None)
                     current_group_pages.append(idx)
         # Save last group
                 "pages": current_group_pages[:]
             })
+        # Post-process groups:
+        # If first group has invoice_no None and next group has non-None -> merge leading None
+        if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
+            groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
+            groups.pop(0)
+        # If, after filtering, all groups are None (no invoice detected), return whole doc as one part
         if all(g["invoice_no"] is None for g in groups):
+            print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
             print("  Returning entire PDF as single part")
             groups = [{
                 "invoice_no": None,
         for idx, g in enumerate(groups):
             part_bytes = build_pdf_from_pages(doc, g["pages"])
             info = {
+                # Keep invoice_no as detected in filtered set (None or actual invoice id)
                 "invoice_no": g["invoice_no"],
+                "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
                 "num_pages": len(g["pages"]),
+                "size_bytes": len(part_bytes),
             }
+            if include_pdf:
                 info["pdf_base64"] = base64.b64encode(
                     part_bytes).decode("ascii")
             parts.append(info)
         return JSONResponse({
             "count": len(parts),
+            "pdf_type": "image-based" if is_image_pdf else "text-based",
             "parts": parts
         })
     except HTTPException:
         raise
+    except Exception as e:
         print(f"\n✗ Error: {str(e)}")
         import traceback
         traceback.print_exc()
+        return JSONResponse({"error": str(e)}, status_code=500)
 @app.get("/health")