anujakkulkarni committed on
Commit
ceafaef
·
verified ·
1 Parent(s): 0daf4cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -14
app.py CHANGED
@@ -232,21 +232,22 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
232
  # ============================================================================
233
 
234
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
235
- """
236
- Extract invoice number using appropriate method based on PDF type.
237
-
238
- Args:
239
- page: PDF page to extract from
240
- is_image_pdf: True if PDF is image-based, False if text-based
241
- """
 
 
242
  if is_image_pdf:
243
- # Use Azure for image-based PDFs
244
- print(f" Method: Azure Document Intelligence (image-based)")
245
- return extract_invoice_azure(page)
246
- else:
247
- # Use text extraction for text-based PDFs
248
- print(f" Method: Text extraction (text-based)")
249
- return extract_invoice_text_based(page)
250
 
251
 
252
  def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
 
232
  # ============================================================================
233
 
234
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
235
+ """Try text extraction first, then Azure as fallback"""
236
+
237
+ # ALWAYS try text extraction first (fast, no API cost)
238
+ text_result = extract_invoice_text_based(page)
239
+ if text_result:
240
+ print(f" ✓ Found via text extraction: {text_result}")
241
+ return text_result
242
+
243
+ # If text fails AND PDF seems image-based, try Azure
244
  if is_image_pdf:
245
+ azure_result = extract_invoice_azure(page)
246
+ if azure_result:
247
+ print(f" ✓ Found via Azure: {azure_result}")
248
+ return azure_result
249
+
250
+ return None
 
251
 
252
 
253
  def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes: