Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -232,21 +232,22 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
|
|
| 232 |
# ============================================================================
|
| 233 |
|
| 234 |
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
|
| 235 |
-
"""
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
|
|
|
|
|
|
| 242 |
if is_image_pdf:
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
return extract_invoice_text_based(page)
|
| 250 |
|
| 251 |
|
| 252 |
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
|
|
|
|
| 232 |
# ============================================================================
|
| 233 |
|
| 234 |
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """Return the invoice number for *page*, or None when none is found.

    The text-based extractor is always consulted first (fast, no API cost).
    The Azure extractor is used only as a fallback, and only when the caller
    flags the PDF as image-based via ``is_image_pdf``.
    """
    # Cheap path: run the text-based extractor before anything else.
    text_result = extract_invoice_text_based(page)
    if text_result:
        print(f" ✓ Found via text extraction: {text_result}")
        return text_result

    # Text extraction found nothing; Azure is only tried for image PDFs.
    if not is_image_pdf:
        return None

    azure_result = extract_invoice_azure(page)
    if not azure_result:
        return None

    print(f" ✓ Found via Azure: {azure_result}")
    return azure_result
|
|
|
|
| 251 |
|
| 252 |
|
| 253 |
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
|