Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

anujakkulkarni committed on Dec 6, 2025

Commit

ceafaef

verified ·

1 Parent(s): 0daf4cc

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -232,21 +232,22 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
-    """
-    Extract invoice number using appropriate method based on PDF type.
-    Args:
-        page: PDF page to extract from
-        is_image_pdf: True if PDF is image-based, False if text-based
-    """
     if is_image_pdf:
-        # Use Azure for image-based PDFs
-        print(f"  Method: Azure Document Intelligence (image-based)")
-        return extract_invoice_azure(page)
-    else:
-        # Use text extraction for text-based PDFs
-        print(f"  Method: Text extraction (text-based)")
-        return extract_invoice_text_based(page)
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:

 # ============================================================================
 def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
+    """Try text extraction first, then Azure as fallback"""
+    # ALWAYS try text extraction first (fast, no API cost)
+    text_result = extract_invoice_text_based(page)
+    if text_result:
+        print(f"  ✓ Found via text extraction: {text_result}")
+        return text_result
+    # If text fails AND PDF seems image-based, try Azure
     if is_image_pdf:
+        azure_result = extract_invoice_azure(page)
+        if azure_result:
+            print(f"  ✓ Found via Azure: {azure_result}")
+            return azure_result
+    return None
 def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes: