Spaces:

anujakkulkarni
/

splitpdffile

Paused

App Files Files Community

anujakkulkarni committed on Dec 11, 2025

Commit

b6190f0

verified ·

1 Parent(s): 796d6ee

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -120

app.py CHANGED Viewed

@@ -9,14 +9,14 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
-# Azure Document Intelligence (Form Recognizer) - optional import
 try:
-    from azure. ai.formrecognizer import DocumentAnalysisClient
-    from azure.core.credentials import AzureKeyCredential
-    AZURE_AVAILABLE = True
-except ImportError:
-    AZURE_AVAILABLE = False
-    print("Warning: azure-ai-formrecognizer not installed. Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
@@ -28,52 +28,47 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# --- Azure Document Intelligence Configuration (FROM HUGGING FACE SECRETS) ---
-# These will be automatically loaded from Hugging Face Spaces secrets
-AZURE_FORM_RECOGNIZER_ENDPOINT = os. getenv("AZURE_FORM_RECOGNIZER_ENDPOINT", "")
-AZURE_FORM_RECOGNIZER_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY", "")
-azure_client = None
-def get_azure_client() -> Optional[DocumentAnalysisClient]:
-    """Get or create Azure Document Intelligence client."""
-    global azure_client
-    if not AZURE_AVAILABLE:
-        print("Azure SDK not available")
         return None
-    if azure_client is None:
-        # Check if credentials are configured via environment variables
-        if not AZURE_FORM_RECOGNIZER_ENDPOINT or not AZURE_FORM_RECOGNIZER_KEY:
-            print("Warning: Azure credentials not found in environment variables.")
-            print("Please configure AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY")
-            print("in your Hugging Face Space secrets.")
             return None
         try:
-            azure_client = DocumentAnalysisClient(
-                endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT,
-                credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY)
-            )
-            print("✓ Azure Document Intelligence client initialized")
-            print(f"  Endpoint: {AZURE_FORM_RECOGNIZER_ENDPOINT}")
-        except Exception as e:
-            print(f"Failed to initialize Azure client: {e}")
             return None
-    return azure_client
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
-    r"(?: Inv(?:Inoice)?\s*No\. ?|Invoice\s*No\. ?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
 PREFIXED_INVOICE_RE = re.compile(
-    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
 GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
@@ -93,14 +88,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
-        text = doc.load_page(i).get_text("text") or ""
         total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
     is_image_based = avg_text_length < 50
     print(
-        f"  PDF Type Detection:  avg_text_length={avg_text_length:.1f} chars/page")
     print(
         f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
@@ -114,7 +109,7 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
     Extract invoice number from text using regex patterns.
-    Works for text-based PDFs.
     """
     if not text:
         return None
@@ -127,7 +122,7 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
             return inv
     # Pattern 2: Prefixed invoice (WN-12345/25) - search top portion
-    top_text = text[: 500]
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
@@ -142,9 +137,9 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
     return None
-def extract_invoice_text_based(page:  fitz.Page) -> Optional[str]:
     """
-    Extract invoice number from TEXT-BASED PDF.
     Uses the original fast text extraction method.
     """
     # Try full-page text
@@ -156,72 +151,76 @@ def extract_invoice_text_based(page:  fitz.Page) -> Optional[str]:
     # Try block-level text
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
-        if block_text:
             inv = try_extract_invoice_from_text(block_text)
-            if inv:
                 return inv
     return None
 # ============================================================================
-# IMAGE-BASED PDF EXTRACTION (Azure Document Intelligence)
 # ============================================================================
-def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
     """
-    Extract invoice number from IMAGE-BASED PDF using Azure Document Intelligence.
     """
-    client = get_azure_client()
-    if not client:
-        print("    Azure client not available")
         return None
     try:
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
-        # Analyze with Azure prebuilt invoice model
-        print("    Calling Azure Document Intelligence API...")
-        poller = client.begin_analyze_document(
-            "prebuilt-invoice",
-            document=img_bytes
-        )
-        result = poller.result()
-        # Extract invoice ID from structured fields
-        if result.documents:
-            for document in result.documents:
-                if hasattr(document, 'fields') and document.fields:
-                    # Try InvoiceId field
-                    if 'InvoiceId' in document.fields and document.fields['InvoiceId']:
-                        invoice_id = document.fields['InvoiceId'].value
-                        if invoice_id:
-                            print(f"    ✓ Azure found InvoiceId: {invoice_id}")
-                            return str(invoice_id).strip()
-                    # Try PurchaseOrder field
-                    if 'PurchaseOrder' in document.fields and document.fields['PurchaseOrder']:
-                        po = document.fields['PurchaseOrder'].value
-                        if po:
-                            print(f"    ✓ Azure found PurchaseOrder: {po}")
-                            return str(po).strip()
-        # Fallback:  try regex on Azure-extracted text
-        if result.content:
-            print(
-                f"    Azure extracted {len(result.content)} chars, trying regex...")
-            inv = try_extract_invoice_from_text(result.content)
-            if inv:
-                print(f"    ✓ Found via regex on Azure text:  {inv}")
-                return inv
-        print("    ✗ Azure:  No invoice found")
         return None
     except Exception as e:
-        print(f"    ✗ Azure extraction failed: {e}")
         return None
@@ -230,7 +229,7 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    """Try text extraction first, then Azure as fallback"""
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
@@ -238,12 +237,12 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
         print(f"  ✓ Found via text extraction: {text_result}")
         return text_result
-    # If text fails AND PDF seems image-based, try Azure
-    if is_image_pdf:
-        azure_result = extract_invoice_azure(page)
-        if azure_result:
-            print(f"  ✓ Found via Azure:  {azure_result}")
-            return azure_result
     return None
@@ -264,7 +263,7 @@ def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> byt
 @app.post("/split-invoices")
 async def split_invoices(
-    file: UploadFile = File(...),
     include_pdf: bool = Form(True),
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
@@ -272,12 +271,12 @@ async def split_invoices(
     Split a multi-invoice PDF into separate PDFs based on invoice numbers.
     Automatically detects PDF type:
-    - Text-based PDFs: Uses fast text extraction (original method)
-    - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
     Parameters:
     - file: PDF file to split
-    - include_pdf: Whether to include base64 PDF in response
     - initial_dpi: DPI setting (kept for compatibility)
     """
     if not file.filename.lower().endswith(".pdf"):
@@ -300,40 +299,39 @@ async def split_invoices(
         # Step 1: Detect PDF type (text-based vs image-based)
         is_image_pdf, avg_text_len = is_image_based_pdf(doc)
-        if is_image_pdf and not get_azure_client():
             raise HTTPException(
                 status_code=500,
-                detail="Image-based PDF detected but Azure Document Intelligence is not configured.  "
-                       "Please add AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY "
-                       "to your Hugging Face Space secrets."
             )
         # Step 2: Extract invoice numbers from each page
         page_invoice_nos:  List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
-            inv = extract_invoice_no_from_page(doc. load_page(i), is_image_pdf)
-            if inv:
                 print(f"  ✓ Invoice found: {inv}")
             else:
                 print(f"  ✗ No invoice found")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
-        print(f"Extraction Results: {page_invoice_nos}")
         print(f"{'='*60}")
         # Step 3: Group pages by invoice number
-        groups: List[Dict] = []
-        current_group_pages: List[int] = []
-        current_invoice: Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos):
-            if current_invoice is None:
                 # Start first group
                 current_invoice = inv
                 current_group_pages = [idx]
-            else:
                 if inv is not None and inv != current_invoice:
                     # New invoice detected - save current group
                     groups.append({
@@ -349,7 +347,7 @@ async def split_invoices(
         # Save last group
         if current_group_pages:
             groups.append({
-                "invoice_no":  current_invoice,
                 "pages": current_group_pages[:]
             })
@@ -369,8 +367,8 @@ async def split_invoices(
             info = {
                 "invoice_no": g["invoice_no"],
                 "pages":  [p + 1 for p in g["pages"]],  # 1-based for humans
-                "num_pages":  len(g["pages"]),
-                "size_bytes": len(part_bytes),
             }
             if include_pdf:
                 info["pdf_base64"] = base64.b64encode(
@@ -389,28 +387,27 @@ async def split_invoices(
         return JSONResponse({
             "count": len(parts),
-            "pdf_type": "image-based" if is_image_pdf else "text-based",
             "parts": parts
         })
     except HTTPException:
         raise
-    except Exception as e:
         print(f"\n✗ Error: {str(e)}")
         import traceback
         traceback.print_exc()
-        return JSONResponse({"error": str(e)}, status_code=500)
 @app.get("/health")
 async def health_check():
-    """Health check endpoint to verify Azure configuration."""
-    azure_status = "configured" if get_azure_client() else "not configured"
     return {
         "status": "healthy",
-        "azure_document_intelligence": azure_status,
-        "azure_available": AZURE_AVAILABLE,
-        "endpoint":  AZURE_FORM_RECOGNIZER_ENDPOINT if azure_status == "configured" else "not set"
     }
 if __name__ == "__main__":

 from fastapi.responses import JSONResponse
 import fitz  # PyMuPDF
+# Google Gemini - optional import
 try:
+    import google.generativeai as genai
+    from PIL import Image
+    GEMINI_AVAILABLE = True
+except ImportError:
+    GEMINI_AVAILABLE = False
+    print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
 app = FastAPI(title="Invoice Splitter API")
     allow_headers=["*"],
 )
+# --- Google Gemini Configuration ---
+# This will be automatically loaded from environment variables
+GEMINI_API_KEY = os. getenv("GEMINI_API_KEY", "")
+gemini_model = None
+def get_gemini_model():
+    """Get or create Gemini model instance."""
+    global gemini_model
+    if not GEMINI_AVAILABLE:
+        print("Gemini SDK not available")
         return None
+    if gemini_model is None:
+        # Check if API key is configured via environment variables
+        if not GEMINI_API_KEY:
+            print("Warning:  Gemini API key not found in environment variables.")
+            print("Please configure GEMINI_API_KEY in your environment variables.")
             return None
         try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-2.0-flash-exp')
+            print("✓ Google Gemini Flash 2.0 initialized")
+        except Exception as e:
+            print(f"Failed to initialize Gemini model: {e}")
             return None
+    return gemini_model
 # --- Regex patterns for text-based PDF extraction ---
 INVOICE_NO_RE = re.compile(
+    r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
     re.IGNORECASE,
 )
 PREFIXED_INVOICE_RE = re.compile(
+    r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b"
 )
 GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
     pages_to_check = min(sample_pages, doc.page_count)
     for i in range(pages_to_check):
+        text = doc. load_page(i).get_text("text") or ""
         total_text_length += len(text. strip())
     avg_text_length = total_text_length / pages_to_check
     is_image_based = avg_text_length < 50
     print(
+        f"  PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
     print(
         f"  Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
     """
     Extract invoice number from text using regex patterns.
+    Works for text-based PDFs.
     """
     if not text:
         return None
             return inv
     # Pattern 2: Prefixed invoice (WN-12345/25) - search top portion
+    top_text = text[:500]
     m = PREFIXED_INVOICE_RE.search(top_text)
     if m:
         inv = (m.group(1) or "").strip()
     return None
+def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
     """
+    Extract invoice number from TEXT-BASED PDF.
     Uses the original fast text extraction method.
     """
     # Try full-page text
     # Try block-level text
     for block in (page.get_text("blocks") or []):
         block_text = block[4] if len(block) > 4 else ""
+        if block_text:
             inv = try_extract_invoice_from_text(block_text)
+            if inv:
                 return inv
     return None
 # ============================================================================
+# IMAGE-BASED PDF EXTRACTION (Google Gemini)
 # ============================================================================
+def extract_invoice_gemini(page: fitz.Page) -> Optional[str]:
     """
+    Extract invoice number from IMAGE-BASED PDF using Google Gemini Flash 2.0.
     """
+    model = get_gemini_model()
+    if not model:
+        print("    Gemini model not available")
         return None
     try:
         # Convert page to image
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
         img_bytes = pix.tobytes("png")
+        # Convert to PIL Image for Gemini
+        img = Image.open(io.BytesIO(img_bytes))
+        # Prompt for Gemini to extract invoice number
+        prompt = """
+        Extract the invoice number from this image.  Look for:
+        - Invoice No, Invoice Number, Bill No, Bill Number
+        - Any alphanumeric code that appears to be an invoice identifier
+        - Purchase Order numbers if no invoice number is found
+        Return ONLY the invoice number/identifier itself, nothing else.
+        If no invoice number is found, return "NOT_FOUND".
+        """
+        print("    Calling Google Gemini API...")
+        response = model.generate_content([prompt, img])
+        if response and response.text:
+            extracted_text = response.text.strip()
+            print(f"    Gemini response: {extracted_text}")
+            if extracted_text and extracted_text != "NOT_FOUND":
+                # Clean up the response
+                invoice_no = extracted_text.replace("*", "").replace("#", "").strip()
+                if invoice_no and len(invoice_no) > 2:
+                    print(f"    ✓ Gemini found invoice: {invoice_no}")
+                    return invoice_no
+            # Fallback:  Get full OCR text and try regex
+            ocr_prompt = "Extract all text from this invoice image. Return the complete text content."
+            ocr_response = model.generate_content([ocr_prompt, img])
+            if ocr_response and ocr_response.text:
+                print(f"    Gemini extracted {len(ocr_response.text)} chars, trying regex...")
+                inv = try_extract_invoice_from_text(ocr_response.text)
+                if inv:
+                    print(f"    ✓ Found via regex on Gemini text: {inv}")
+                    return inv
+        print("    ✗ Gemini:  No invoice found")
         return None
     except Exception as e:
+        print(f"    ✗ Gemini extraction failed: {e}")
         return None
 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
+    """Try text extraction first, then Gemini as fallback"""
     # ALWAYS try text extraction first (fast, no API cost)
     text_result = extract_invoice_text_based(page)
         print(f"  ✓ Found via text extraction: {text_result}")
         return text_result
+    # If text fails AND PDF seems image-based, try Gemini
+    if is_image_pdf:
+        gemini_result = extract_invoice_gemini(page)
+        if gemini_result:
+            print(f"  ✓ Found via Gemini: {gemini_result}")
+            return gemini_result
     return None
 @app.post("/split-invoices")
 async def split_invoices(
+    file: UploadFile = File(... ),
     include_pdf: bool = Form(True),
     initial_dpi: int = Form(300),  # Kept for compatibility
 ):
     Split a multi-invoice PDF into separate PDFs based on invoice numbers.
     Automatically detects PDF type:
+    - Text-based PDFs:  Uses fast text extraction (original method)
+    - Image-based PDFs: Uses Google Gemini Flash 2.0 for accurate OCR
     Parameters:
     - file: PDF file to split
+    - include_pdf:  Whether to include base64 PDF in response
     - initial_dpi: DPI setting (kept for compatibility)
     """
     if not file.filename.lower().endswith(".pdf"):
         # Step 1: Detect PDF type (text-based vs image-based)
         is_image_pdf, avg_text_len = is_image_based_pdf(doc)
+        if is_image_pdf and not get_gemini_model():
             raise HTTPException(
                 status_code=500,
+                detail="Image-based PDF detected but Google Gemini is not configured.  "
+                       "Please add GEMINI_API_KEY to your environment variables."
             )
         # Step 2: Extract invoice numbers from each page
         page_invoice_nos:  List[Optional[str]] = []
         for i in range(doc.page_count):
             print(f"\n--- Page {i+1}/{doc.page_count} ---")
+            inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
+            if inv:
                 print(f"  ✓ Invoice found: {inv}")
             else:
                 print(f"  ✗ No invoice found")
             page_invoice_nos.append(inv)
         print(f"\n{'='*60}")
+        print(f"Extraction Results:  {page_invoice_nos}")
         print(f"{'='*60}")
         # Step 3: Group pages by invoice number
+        groups:  List[Dict] = []
+        current_group_pages:  List[int] = []
+        current_invoice:  Optional[str] = None
         for idx, inv in enumerate(page_invoice_nos):
+            if current_invoice is None:
                 # Start first group
                 current_invoice = inv
                 current_group_pages = [idx]
+            else:
                 if inv is not None and inv != current_invoice:
                     # New invoice detected - save current group
                     groups.append({
         # Save last group
         if current_group_pages:
             groups.append({
+                "invoice_no": current_invoice,
                 "pages": current_group_pages[:]
             })
             info = {
                 "invoice_no": g["invoice_no"],
                 "pages":  [p + 1 for p in g["pages"]],  # 1-based for humans
+                "num_pages": len(g["pages"]),
+                "size_bytes":  len(part_bytes),
             }
             if include_pdf:
                 info["pdf_base64"] = base64.b64encode(
         return JSONResponse({
             "count": len(parts),
+            "pdf_type":  "image-based" if is_image_pdf else "text-based",
             "parts": parts
         })
     except HTTPException:
         raise
+    except Exception as e:
         print(f"\n✗ Error: {str(e)}")
         import traceback
         traceback.print_exc()
+        return JSONResponse({"error":  str(e)}, status_code=500)
 @app.get("/health")
 async def health_check():
+    """Health check endpoint to verify Gemini configuration."""
+    gemini_status = "configured" if get_gemini_model() else "not configured"
     return {
         "status": "healthy",
+        "gemini_flash": gemini_status,
+        "gemini_available": GEMINI_AVAILABLE,
     }
 if __name__ == "__main__":