Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed on Jan 6

Commit

63e2ea5

verified ·

1 Parent(s): 8b3c611

Update app.py

Browse files

Files changed (1) hide show

app.py +376 -162

app.py CHANGED Viewed

@@ -2,192 +2,295 @@ import os
 import io
 import re
 import base64
-import time
-import threading
 from typing import List, Dict, Optional, Tuple
-from concurrent.futures import ThreadPoolExecutor
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
-# Gemini
-import google.generativeai as genai
-from PIL import Image
-# ============================================================================
-# CONFIG
-# ============================================================================
-MAX_GEMINI_CONCURRENT_CALLS = 2     # HARD LIMIT
-GEMINI_MIN_INTERVAL_SEC = 1.2      # RATE LIMIT (seconds)
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
-# ============================================================================
-# THREAD & RATE LIMIT MANAGEMENT
-# ============================================================================
-gemini_lock = threading.Lock()
-gemini_semaphore = threading.Semaphore(MAX_GEMINI_CONCURRENT_CALLS)
-last_gemini_call_time = 0.0
 gemini_model = None
 def get_gemini_model():
     global gemini_model
-    if not GEMINI_API_KEY:
         return None
     if gemini_model is None:
-        genai.configure(api_key=GEMINI_API_KEY)
-        gemini_model = genai.GenerativeModel(
-            model_name="models/gemini-2.5-flash-image"
-        )
-        print("✓ Gemini 2.5 Flash Image initialized")
     return gemini_model
-def rate_limited_gemini_call(prompt, img):
-    """
-    Thread-safe + rate-limited Gemini call
-    """
-    global last_gemini_call_time
-    with gemini_semaphore:
-        with gemini_lock:
-            elapsed = time.time() - last_gemini_call_time
-            if elapsed < GEMINI_MIN_INTERVAL_SEC:
-                time.sleep(GEMINI_MIN_INTERVAL_SEC - elapsed)
-            model = get_gemini_model()
-            response = model.generate_content([prompt, img])
-            last_gemini_call_time = time.time()
-            return response
-# ============================================================================
-# FASTAPI
-# ============================================================================
-app = FastAPI(title="Invoice Splitter API")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# ============================================================================
-# REGEX
-# ============================================================================
-INVOICE_NO_RE = re.compile(
-    r"(Invoice\s*No\.?|Inv\.?\s*No\.?|Bill\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9\-\/]{4,})",
-    re.IGNORECASE
-)
-PREFIXED_RE = re.compile(r"\b[A-Z]{2,5}[-/]\d{4,}\b")
 # ============================================================================
-# PDF TYPE DETECTION
 # ============================================================================
-def is_image_based_pdf(doc: fitz.Document, sample=3):
-    total = 0
-    for i in range(min(sample, doc.page_count)):
-        total += len(doc.load_page(i).get_text("text") or "")
-    avg = total / max(1, sample)
-    return avg < 50
-# ============================================================================
-# TEXT EXTRACTION
-# ============================================================================
-def extract_text_invoice(page: fitz.Page) -> Optional[str]:
-    text = page.get_text("text") or ""
-    m = INVOICE_NO_RE.search(text)
     if m:
-        return m.group(2).strip()
-    top = text[:500]
-    m = PREFIXED_RE.search(top)
     if m:
-        return m.group(0)
     return None
 # ============================================================================
-# GEMINI IMAGE EXTRACTION
 # ============================================================================
 def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
     try:
-        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-        img = Image.open(io.BytesIO(pix.tobytes("png")))
         prompt = """
-        Extract the invoice number from this invoice image,
-         also consider for reading an Indian GST e-Invoice.
-        Look for:
-        - Invoice No
-        - Bill No
-        - Tax Invoice No
-        - Document No
-        - Purchase Order No (only if invoice not present)
-        Return ONLY the identifier.
-        If nothing is found, return NOT_FOUND.
         """
-        response = rate_limited_gemini_call(prompt, img)
         if response and response.text:
-            val = response.text.strip()
-            if val != "NOT_FOUND" and len(val) > 2:
-                return val
     except Exception as e:
-        print("Gemini failed:", e)
-    return None
 # ============================================================================
-# UNIFIED PAGE EXTRACTION
 # ============================================================================
-def extract_invoice(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    text_inv = extract_text_invoice(page)
-    if text_inv:
-        return text_inv
     if is_image_pdf:
-        return extract_invoice_gemini(page)
     return None
-# ============================================================================
-# PDF BUILDER
-# ============================================================================
-def build_pdf(doc, pages):
     out = fitz.open()
-    for p in pages:
-        out.insert_pdf(doc, from_page=p, to_page=p)
-    data = out.tobytes()
     out.close()
-    return data
 # ============================================================================
@@ -197,65 +300,176 @@ def build_pdf(doc, pages):
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
-    include_pdf: bool = Form(True)
 ):
-    if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(400, "Only PDF allowed")
-    data = await file.read()
-    doc = fitz.open(stream=data, filetype="pdf")
-    is_image_pdf = is_image_based_pdf(doc)
-    print("PDF Type:", "IMAGE" if is_image_pdf else "TEXT")
-    invoice_ids = []
-    with ThreadPoolExecutor(max_workers=MAX_GEMINI_CONCURRENT_CALLS) as executor:
-        futures = []
         for i in range(doc.page_count):
-            page = doc.load_page(i)
-            futures.append(executor.submit(extract_invoice, page, is_image_pdf))
-        for f in futures:
-            invoice_ids.append(f.result())
-    # Group pages
-    groups = []
-    current_inv = invoice_ids[0]
-    current_pages = [0]
-    for i in range(1, len(invoice_ids)):
-        if invoice_ids[i] != current_inv and invoice_ids[i] is not None:
-            groups.append((current_inv, current_pages))
-            current_inv = invoice_ids[i]
-            current_pages = [i]
-        else:
-            current_pages.append(i)
-    groups.append((current_inv, current_pages))
-    parts = []
-    for inv, pages in groups:
-        pdf_bytes = build_pdf(doc, pages)
-        part = {
-            "invoice_no": inv,
-            "pages": [p + 1 for p in pages],
-            "num_pages": len(pages),
-        }
-        if include_pdf:
-            part["pdf_base64"] = base64.b64encode(pdf_bytes).decode()
-        parts.append(part)
-    return {
-        "count": len(parts),
-        "pdf_type": "image-based" if is_image_pdf else "text-based",
-        "parts": parts
-    }
 @app.get("/health")
-def health():
     return {
-        "status": "ok",
-        "gemini": "configured" if GEMINI_API_KEY else "missing"
     }

 import io
 import re
 import base64
 from typing import List, Dict, Optional, Tuple
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
+# Google Gemini - optional import
+try:
+    import google.generativeai as genai
+    from PIL import Image
+    GEMINI_AVAILABLE = True
+except ImportError:
+    GEMINI_AVAILABLE = False
+    print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
app = FastAPI(title="Invoice Splitter API")

# CORS: the API is open to every origin. A wildcard origin must not be
# combined with credentialed requests (browsers reject
# "Access-Control-Allow-Origin: *" on such requests, and reflecting arbitrary
# origins with credentials enabled is a security risk), so credentials stay
# disabled. To support cookies/authorization headers, replace the wildcard
# with an explicit list of trusted origins and set allow_credentials=True.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- Google Gemini Configuration ---
# The API key is read once, at import time, from the GEMINI_API_KEY
# environment variable; an empty string means "not configured".
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
# Cached model instance; stays None until the first successful initialization.
gemini_model = None
 def get_gemini_model():
+    """Get or create Gemini model instance."""
     global gemini_model
+    if not GEMINI_AVAILABLE:
+        print("Gemini SDK not available")
         return None
     if gemini_model is None:
+        # Check if API key is configured via environment variables
+        if not GEMINI_API_KEY:
+            print("Warning:  Gemini API key not found in environment variables.")
+            print("Please configure GEMINI_API_KEY in your environment variables.")
+            return None
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
+            print("✓ Google Gemini Flash 2.0 initialized")
+        except Exception as e:
+            print(f"Failed to initialize Gemini model: {e}")
+            return None
     return gemini_model
# --- Regex patterns for text-based PDF extraction ---

# Labelled invoice id: "<label> No/No./Number/#" followed by the id itself.
# Group 1 is the id.
#
# Two guards keep ordinary words from being captured as ids (the pattern is
# case-insensitive, so the id character class also matches lowercase letters):
#   * "No"/"Num"/"Number" must not be followed by another letter, so
#     "Invoice Number: X" is read as the label "Number" rather than "No" plus
#     the id "mber", and "Invoice Notes" is not a label at all.
#   * the id must contain at least one digit, so a neighbouring column header
#     such as "Invoice No.  Date" is skipped and the search moves on.
INVOICE_NO_RE = re.compile(
    r"""
    (?:
        Tax\s*Invoice |
        Invoice |
        Inv\.? |
        Bill |
        Document |
        Doc\.?
    )
    \s*
    (?:
        (?:Number|Num|No)(?![A-Z])\.? |   # No / No. / Num. / Number
        \#                                # "Invoice # 123"
    )
    \s*[:\-]?\s*
    (
        (?=[A-Z0-9\-\/]*\d)               # id must contain a digit
        [A-Z0-9][A-Z0-9\-\/]{3,}
    )
    """,
    re.IGNORECASE | re.VERBOSE
)

# Unlabelled id of the form "<2-4 capital letters><- or /><4+ digits>", with an
# optional "/<digits>" part and trailing capital letters (e.g. "WN-12345").
PREFIXED_INVOICE_RE = re.compile(
    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
)

# GST label followed by a 15-character alphanumeric value.
# Group 1 is label + value, group 2 is the value alone.
GST_LIKE_RE = re.compile(
    r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
    """
    Detect whether a PDF is image-based or text-based by sampling pages.

    The first ``sample_pages`` pages (or fewer, if the document is shorter)
    are read, and the stripped length of their extractable text is averaged.
    The document is classified as image-based when that average is below
    50 characters per page.

    Args:
        doc: Open document to inspect.
        sample_pages: Maximum number of leading pages to sample.

    Returns:
        ``(is_image_based, avg_text_length)``. When no page can be sampled
        (empty document or ``sample_pages <= 0``) the average is ``0.0`` and
        the document is therefore reported as image-based.
    """
    # Clamp at zero so a non-positive sample size cannot produce a negative
    # page count or a division by zero below.
    pages_to_check = max(0, min(sample_pages, doc.page_count))

    total_text_length = sum(
        len((doc.load_page(i).get_text("text") or "").strip())
        for i in range(pages_to_check)
    )

    avg_text_length = total_text_length / pages_to_check if pages_to_check else 0.0
    is_image_based = avg_text_length < 50

    print(
        f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
    print(
        f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
    return is_image_based, avg_text_length
 # ============================================================================
+# TEXT-BASED PDF EXTRACTION (Original Code)
 # ============================================================================
def normalize_text_for_search(s: str) -> str:
    """
    Collapse every run of whitespace into a single space and trim the ends.

    All characters Python treats as whitespace are handled, including
    non-breaking spaces, tabs, line breaks, form feeds and other Unicode
    spaces, so the regex searches run on one clean line of text.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not s:
        return s
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, which is exactly the normalization needed.
    return " ".join(s.split())
def try_extract_invoice_from_text(text: str) -> Optional[str]:
    """
    Extract an invoice identifier from raw page text using regex patterns.

    Strategies, in priority order:
      1. A labelled id ("Invoice No", "Bill No", ...). Labelled matches are
         scanned in order and the first one whose value contains a digit is
         used; purely alphabetic captures are skipped because they are almost
         always an adjacent word or column header, not an id.
      2. A prefixed code such as "WN-1234" within the first 600 characters
         of the normalized text.
      3. Last resort: a 15-character GST value, returned tagged as
         "GST:<value>" so callers can tell it apart from a real invoice id.

    Returns:
        The identifier string, or None when the text is empty or nothing
        matches.
    """
    if not text:
        return None
    text_norm = normalize_text_for_search(text)

    # 1) Labeled invoice like "Invoice No", "Inv No."
    for m in INVOICE_NO_RE.finditer(text_norm):
        inv = (m.group(1) or "").strip()
        if (
            len(inv) > 2
            and inv.lower() not in ("invoice", "inv", "bill")
            and any(ch.isdigit() for ch in inv)
        ):
            return inv

    # 2) Search top portion for prefixed invoice codes (e.g. WN-1234)
    top_text = text_norm[:600]  # bigger top area to be robust
    m = PREFIXED_INVOICE_RE.search(top_text)
    if m:
        inv = (m.group(1) or "").strip()
        # extra length check so tiny matches don't pass
        if inv and len(re.sub(r"[^A-Za-z0-9]", "", inv)) >= 5:
            return inv

    # 3) As absolute last-resort: strict GST detection (only accept 15-char GSTIN)
    gm = GST_LIKE_RE.search(text_norm)
    if gm:
        gst_val = (gm.group(2) or "").replace(" ", "").strip().upper()
        if re.fullmatch(r"[0-9A-Z]{15}", gst_val):
            # tag it so grouping won't treat GST same as invoice ID
            return f"GST:{gst_val}"

    return None
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
    """
    Find an invoice id in the page's embedded text layer.

    The whole-page text is searched first. If that yields nothing, each text
    block reported by the page is searched on its own, in the order the page
    returns them, and the first hit wins.

    Returns:
        The extracted identifier, or None if neither pass finds one.
    """
    # Pass 1: the page as a single string.
    whole_page = page.get_text("text") or ""
    found = try_extract_invoice_from_text(whole_page)
    if found:
        return found

    # Pass 2: block by block. Index 4 of a block entry holds its text; entries
    # that are too short or have empty text are skipped.
    per_block_hits = (
        try_extract_invoice_from_text(entry[4])
        for entry in (page.get_text("blocks") or [])
        if len(entry) > 4 and entry[4]
    )
    return next((hit for hit in per_block_hits if hit), None)
 # ============================================================================
+# IMAGE-BASED PDF EXTRACTION (Google Gemini)
 # ============================================================================
 def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
+    """
+    Extract invoice number from IMAGE-BASED PDF using Google Gemini Flash 2.0.
+    """
+    model = get_gemini_model()
+    if not model:
+        print("    Gemini model not available")
+        return None
     try:
+        # Convert page to image
+        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
+        img_bytes = pix.tobytes("png")
+        # Convert to PIL Image for Gemini
+        img = Image.open(io.BytesIO(img_bytes))
+        # Prompt for Gemini to extract invoice number
         prompt = """
+        Extract the invoice number from this image.  Look for:
+        - Invoice No, Invoice Number, Bill No, Bill Number
+        - Any alphanumeric code that appears to be an invoice identifier
+        - Purchase Order numbers if no invoice number is found
+        Return ONLY the invoice number/identifier itself, nothing else.
+        If no invoice number is found, return "NOT_FOUND".
         """
+        print("    Calling Google Gemini API...")
+        response = model.generate_content([prompt, img])
         if response and response.text:
+            extracted_text = response.text.strip()
+            print(f"    Gemini response: {extracted_text}")
+            if extracted_text and extracted_text != "NOT_FOUND":
+                # Clean up the response
+                invoice_no = extracted_text.replace(
+                    "*", "").replace("#", "").strip()
+                if invoice_no and len(invoice_no) > 2:
+                    print(f"    ✓ Gemini found invoice: {invoice_no}")
+                    return invoice_no
+            # Fallback:  Get full OCR text and try regex
+            ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
+            ocr_response = model.generate_content([ocr_prompt, img])
+            if ocr_response and ocr_response.text:
+                print(
+                    f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
+                inv = try_extract_invoice_from_text(ocr_response.text)
+                if inv:
+                    print(f"    ✓ Found via regex on Gemini text: {inv}")
+                    return inv
+        print("    ✗ Gemini:  No invoice found")
+        return None
     except Exception as e:
+        print(f"    ✗ Gemini extraction failed: {e}")
+        return None
 # ============================================================================
+# UNIFIED EXTRACTION LOGIC
 # ============================================================================
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """
    Resolve the invoice id for one page.

    The embedded text layer is always consulted first because it is fast and
    costs no API call. Gemini is used only when that fails and the document
    was classified as image-based.

    Returns:
        The identifier found by either method, or None.
    """
    from_text = extract_invoice_text_based(page)
    if from_text:
        print(f"  ✓ Found via text extraction: {from_text}")
        return from_text

    # Text-based documents never fall through to the vision model.
    if not is_image_pdf:
        return None

    from_gemini = extract_invoice_gemini(page)
    if not from_gemini:
        return None

    print(f"  ✓ Found via Gemini: {from_gemini}")
    return from_gemini
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """
    Create a new PDF containing the given pages of ``src_doc``.

    Args:
        src_doc: Open source document.
        page_indices: 0-based page indices, copied in the order given.

    Returns:
        The serialized bytes of the new PDF.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Release the temporary document even if copying or serializing fails.
        out.close()
 # ============================================================================
 @app.post("/split-invoices")
 async def split_invoices(
     file: UploadFile = File(...),
+    include_pdf: bool = Form(True),
+    initial_dpi: int = Form(300),  # Kept for compatibility
 ):
+    """
+    Split a multi-invoice PDF into separate PDFs based on invoice numbers.
+    - Text-based PDFs: Uses fast text extraction
+    - Image-based PDFs: Uses Google Gemini Flash 2.0 (if configured)
+    Note: GST values (tagged as "GST:...") are treated as a last-resort identifier and
+    are ignored for splitting by default (so repeated company GST won't prevent splits).
+    """
+    if not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="only PDF is supported")
+    file_bytes = await file.read()
+    if not file_bytes:
+        raise HTTPException(status_code=400, detail="empty file")
+    try:
+        doc = fitz.open(stream=file_bytes, filetype="pdf")
+        if doc.page_count == 0:
+            raise HTTPException(status_code=400, detail="no pages found")
+        print(f"\n{'='*60}")
+        print(f"Processing PDF: {file.filename}")
+        print(f"Total pages: {doc.page_count}")
+        print(f"{'='*60}")
+        # Step 1: Detect PDF type (text-based vs image-based)
+        is_image_pdf, avg_text_len = is_image_based_pdf(doc)
+        if is_image_pdf and not get_gemini_model():
+            raise HTTPException(
+                status_code=500,
+                detail="Image-based PDF detected but Google Gemini is not configured.  "
+                       "Please add GEMINI_API_KEY to your environment variables."
+            )
+        # Step 2: Extract invoice numbers from each page
+        page_invoice_nos: List[Optional[str]] = []
         for i in range(doc.page_count):
+            print(f"\n--- Page {i+1}/{doc.page_count} ---")
+            inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
+            # inv may be something like "5EN19710" or "GST:12ABCDE..." or None
+            if inv:
+                print(f"  ✓ Raw extracted id: {inv}")
+            else:
+                print(f"  ✗ No invoice found (raw)")
+            page_invoice_nos.append(inv)
+        print(f"\n{'='*60}")
+        print(f"Raw Extraction Results:  {page_invoice_nos}")
+        print(f"{'='*60}")
+        # ---------------------------------------------------------
+        # Post-process extracted ids before grouping
+        # - Treat GST:<value> as a LAST-RESORT marker and ignore it for splitting
+        #   (convert to None) so repeated company GST doesn't group pages together.
+        # - Keep actual invoice ids like '5EN19710' intact.
+        # ---------------------------------------------------------
+        page_invoice_nos_filtered: List[Optional[str]] = []
+        for v in page_invoice_nos:
+            if v is None:
+                page_invoice_nos_filtered.append(None)
+            else:
+                # If GST-tagged value (we returned "GST:..."), ignore it for splitting
+                if isinstance(v, str) and v.upper().startswith("GST:"):
+                    page_invoice_nos_filtered.append(None)
+                else:
+                    page_invoice_nos_filtered.append(v)
+        print(f"Filtered (GST ignored) Results: {page_invoice_nos_filtered}")
+        # Step 3: Group pages by invoice number (use filtered ids)
+        groups: List[Dict] = []
+        current_group_pages: List[int] = []
+        current_invoice: Optional[str] = None
+        for idx, inv in enumerate(page_invoice_nos_filtered):
+            if current_invoice is None:
+                # Start a new group (even if inv is None)
+                current_invoice = inv
+                current_group_pages = [idx]
+            else:
+                # If a new non-empty invoice appears and differs -> close current group
+                if inv is not None and inv != current_invoice:
+                    groups.append({
+                        "invoice_no": current_invoice,
+                        "pages": current_group_pages[:],
+                    })
+                    current_invoice = inv
+                    current_group_pages = [idx]
+                else:
+                    # Continue current group (same invoice or both None)
+                    current_group_pages.append(idx)
+        # Save last group
+        if current_group_pages:
+            groups.append({
+                "invoice_no": current_invoice,
+                "pages": current_group_pages[:]
+            })
+        # Post-process groups:
+        # If first group has invoice_no None and next group has non-None -> merge leading None
+        if len(groups) > 1 and groups[0]["invoice_no"] is None and groups[1]["invoice_no"] is not None:
+            groups[1]["pages"] = groups[0]["pages"] + groups[1]["pages"]
+            groups.pop(0)
+        # If, after filtering, all groups are None (no invoice detected), return whole doc as one part
+        if all(g["invoice_no"] is None for g in groups):
+            print("\n⚠ Warning: No invoices detected in any page (after GST ignored)!")
+            print("  Returning entire PDF as single part")
+            groups = [{
+                "invoice_no": None,
+                "pages": list(range(doc.page_count))
+            }]
+        # Step 4: Build response parts
+        parts = []
+        for idx, g in enumerate(groups):
+            part_bytes = build_pdf_from_pages(doc, g["pages"])
+            info = {
+                # Keep invoice_no as detected in filtered set (None or actual invoice id)
+                "invoice_no": g["invoice_no"],
+                "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
+                "num_pages": len(g["pages"]),
+                "size_bytes": len(part_bytes),
+            }
+            if include_pdf:
+                info["pdf_base64"] = base64.b64encode(
+                    part_bytes).decode("ascii")
+            parts.append(info)
+            print(f"\nPart {idx+1}:")
+            print(f"  Invoice: {g['invoice_no']}")
+            print(f"  Pages: {info['pages']}")
+            print(f"  Size: {len(part_bytes):,} bytes")
+        doc.close()
+        print(f"\n{'='*60}")
+        print(f"✓ Successfully split into {len(parts)} part(s)")
+        print(f"{'='*60}\n")
+        return JSONResponse({
+            "count": len(parts),
+            "pdf_type": "image-based" if is_image_pdf else "text-based",
+            "parts": parts
+        })
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"\n✗ Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return JSONResponse({"error": str(e)}, status_code=500)
 @app.get("/health")
+async def health_check():
+    """Health check endpoint to verify Gemini configuration."""
+    gemini_status = "configured" if get_gemini_model() else "not configured"
     return {
+        "status": "healthy",
+        "gemini_flash": gemini_status,
+        "gemini_available": GEMINI_AVAILABLE,
     }
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8001.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=8001)