anujakkulkarni committed on
Commit
6ba329f
·
verified ·
1 Parent(s): b7cdb70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -57
app.py CHANGED
@@ -5,15 +5,14 @@ import base64
5
  from typing import List, Dict, Optional, Tuple
6
 
7
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
- from fastapi. middleware.cors import CORSMiddleware
9
- from fastapi.middleware.gzip import GZipMiddleware
10
  from fastapi.responses import JSONResponse
11
  import fitz # PyMuPDF
12
 
13
  # Azure Document Intelligence (Form Recognizer) - optional import
14
  try:
15
  from azure.ai.formrecognizer import DocumentAnalysisClient
16
- from azure. core.credentials import AzureKeyCredential
17
  AZURE_AVAILABLE = True
18
  except ImportError:
19
  AZURE_AVAILABLE = False
@@ -21,9 +20,6 @@ except ImportError:
21
 
22
  app = FastAPI(title="Invoice Splitter API")
23
 
24
- # ✅ ADD GZIP COMPRESSION MIDDLEWARE (BEFORE CORS)
25
- app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=6)
26
-
27
  app.add_middleware(
28
  CORSMiddleware,
29
  allow_origins=["*"],
@@ -79,7 +75,7 @@ def get_azure_client() -> Optional[DocumentAnalysisClient]:
79
 
80
  # --- Regex patterns for text-based PDF extraction ---
81
  INVOICE_NO_RE = re.compile(
82
- r"(?:Inv(?:oice)?\s*No\. ?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
83
  re.IGNORECASE,
84
  )
85
 
@@ -93,11 +89,11 @@ GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
93
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
94
  """
95
  Detect if PDF is image-based or text-based by sampling pages.
96
- Returns (is_image_based, avg_text_length).
97
 
98
  Strategy:
99
  - Sample first few pages
100
- - If average extractable text < 30 chars per page, it's likely image-based
101
  - If text > 200 chars per page, it's text-based
102
  """
103
  total_text_length = 0
@@ -105,11 +101,10 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
105
 
106
  for i in range(pages_to_check):
107
  text = doc.load_page(i).get_text("text") or ""
108
- total_text_length += len(text. strip())
109
 
110
  avg_text_length = total_text_length / pages_to_check
111
- # CHANGED: Lower threshold from 50 to 30
112
- is_image_based = avg_text_length < 30
113
 
114
  print(
115
  f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
@@ -125,14 +120,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
125
 
126
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
127
  """
128
- Extract invoice number from text using regex patterns.
129
- Works for text-based PDFs.
130
  """
131
  if not text:
132
  return None
133
 
134
  # Pattern 1: Labeled invoice (Invoice No, Bill No, etc.)
135
- m = INVOICE_NO_RE. search(text)
136
  if m:
137
  inv = (m.group(1) or "").strip()
138
  if inv and inv.lower() != "invoice" and len(inv) > 2:
@@ -160,7 +155,7 @@ def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
160
  Uses the original fast text extraction method.
161
  """
162
  # Try full-page text
163
- text = page. get_text("text") or ""
164
  inv = try_extract_invoice_from_text(text)
165
  if inv:
166
  return inv
@@ -208,10 +203,10 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
208
  if hasattr(document, 'fields') and document.fields:
209
  # Try InvoiceId field
210
  if 'InvoiceId' in document.fields and document.fields['InvoiceId']:
211
- invoice_id = document.fields['InvoiceId']. value
212
  if invoice_id:
213
  print(f" ✓ Azure found InvoiceId: {invoice_id}")
214
- return str(invoice_id). strip()
215
 
216
  # Try PurchaseOrder field
217
  if 'PurchaseOrder' in document.fields and document.fields['PurchaseOrder']:
@@ -221,7 +216,7 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
221
  return str(po).strip()
222
 
223
  # Fallback: try regex on Azure-extracted text
224
- if result. content:
225
  print(
226
  f" Azure extracted {len(result.content)} chars, trying regex...")
227
  inv = try_extract_invoice_from_text(result.content)
@@ -242,9 +237,8 @@ def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
242
  # ============================================================================
243
 
244
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
245
- """
246
- ✅ HYBRID EXTRACTION: Try text extraction first, then Azure as fallback
247
- """
248
  # ALWAYS try text extraction first (fast, no API cost)
249
  text_result = extract_invoice_text_based(page)
250
  if text_result:
@@ -282,20 +276,18 @@ async def split_invoices(
282
  initial_dpi: int = Form(300), # Kept for compatibility
283
  ):
284
  """
285
- Split a multi-invoice PDF into separate PDFs based on invoice numbers.
286
 
287
  Automatically detects PDF type:
288
- - Text-based PDFs: Uses fast text extraction (hybrid approach)
289
  - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
290
 
291
  Parameters:
292
  - file: PDF file to split
293
  - include_pdf: Whether to include base64 PDF in response
294
  - initial_dpi: DPI setting (kept for compatibility)
295
-
296
- Response is automatically compressed with GZip for better network reliability.
297
  """
298
- if not file.filename.lower().endswith(". pdf"):
299
  raise HTTPException(status_code=400, detail="only PDF is supported")
300
 
301
  file_bytes = await file.read()
@@ -303,7 +295,7 @@ async def split_invoices(
303
  raise HTTPException(status_code=400, detail="empty file")
304
 
305
  try:
306
- doc = fitz. open(stream=file_bytes, filetype="pdf")
307
  if doc.page_count == 0:
308
  raise HTTPException(status_code=400, detail="no pages found")
309
 
@@ -318,7 +310,7 @@ async def split_invoices(
318
  if is_image_pdf and not get_azure_client():
319
  raise HTTPException(
320
  status_code=500,
321
- detail="Image-based PDF detected but Azure Document Intelligence is not configured. "
322
  "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
323
  )
324
 
@@ -378,8 +370,6 @@ async def split_invoices(
378
 
379
  # Step 4: Build response parts
380
  parts = []
381
- total_base64_size = 0 # ✅ NEW: Track total size
382
-
383
  for idx, g in enumerate(groups):
384
  part_bytes = build_pdf_from_pages(doc, g["pages"])
385
  info = {
@@ -389,43 +379,25 @@ async def split_invoices(
389
  "size_bytes": len(part_bytes),
390
  }
391
  if include_pdf:
392
- pdf_base64 = base64.b64encode(part_bytes).decode("ascii")
393
- info["pdf_base64"] = pdf_base64
394
- total_base64_size += len(pdf_base64) # ✅ NEW: Track size
395
-
396
  parts.append(info)
397
  print(f"\nPart {idx+1}:")
398
  print(f" Invoice: {g['invoice_no']}")
399
  print(f" Pages: {info['pages']}")
400
  print(f" Size: {len(part_bytes):,} bytes")
401
- if include_pdf:
402
- print(f" Base64 size: {len(info. get('pdf_base64', '')):,} chars")
403
 
404
- doc. close()
405
 
406
  print(f"\n{'='*60}")
407
  print(f"✓ Successfully split into {len(parts)} part(s)")
408
- if include_pdf:
409
- print(f"Total base64 size: {total_base64_size:,} chars ({total_base64_size/1024/1024:.2f} MB)")
410
  print(f"{'='*60}\n")
411
 
412
- # ✅ NEW: Build response with size metadata
413
- response_data = {
414
  "count": len(parts),
415
  "pdf_type": "image-based" if is_image_pdf else "text-based",
416
- "parts": parts,
417
- "total_size_bytes": total_base64_size if include_pdf else 0, # For validation
418
- "compression": "gzip", # Hint that response is compressed
419
- }
420
-
421
- # ✅ NEW: Return with compression headers
422
- return JSONResponse(
423
- content=response_data,
424
- headers={
425
- "X-Total-Parts": str(len(parts)),
426
- "X-Uncompressed-Size": str(total_base64_size),
427
- }
428
- )
429
 
430
  except HTTPException:
431
  raise
@@ -444,8 +416,7 @@ async def health_check():
444
  "status": "healthy",
445
  "azure_document_intelligence": azure_status,
446
  "azure_available": AZURE_AVAILABLE,
447
- "endpoint": AZURE_FORM_RECOGNIZER_ENDPOINT if azure_status == "configured" else "not set",
448
- "compression": "gzip enabled",
449
  }
450
 
451
  if __name__ == "__main__":
 
5
  from typing import List, Dict, Optional, Tuple
6
 
7
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
+ from fastapi.middleware.cors import CORSMiddleware
 
9
  from fastapi.responses import JSONResponse
10
  import fitz # PyMuPDF
11
 
12
  # Azure Document Intelligence (Form Recognizer) - optional import
13
  try:
14
  from azure.ai.formrecognizer import DocumentAnalysisClient
15
+ from azure.core.credentials import AzureKeyCredential
16
  AZURE_AVAILABLE = True
17
  except ImportError:
18
  AZURE_AVAILABLE = False
 
20
 
21
  app = FastAPI(title="Invoice Splitter API")
22
 
 
 
 
23
  app.add_middleware(
24
  CORSMiddleware,
25
  allow_origins=["*"],
 
75
 
76
  # --- Regex patterns for text-based PDF extraction ---
77
  INVOICE_NO_RE = re.compile(
78
+ r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
79
  re.IGNORECASE,
80
  )
81
 
 
89
  def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
90
  """
91
  Detect if PDF is image-based or text-based by sampling pages.
92
+ Returns (is_image_based, avg_text_length).
93
 
94
  Strategy:
95
  - Sample first few pages
96
+ - If average extractable text < 50 chars per page, it's likely image-based
97
  - If text > 200 chars per page, it's text-based
98
  """
99
  total_text_length = 0
 
101
 
102
  for i in range(pages_to_check):
103
  text = doc.load_page(i).get_text("text") or ""
104
+ total_text_length += len(text.strip())
105
 
106
  avg_text_length = total_text_length / pages_to_check
107
+ is_image_based = avg_text_length < 50
 
108
 
109
  print(
110
  f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
 
120
 
121
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
122
  """
123
+ Extract invoice number from text using regex patterns.
124
+ Works for text-based PDFs.
125
  """
126
  if not text:
127
  return None
128
 
129
  # Pattern 1: Labeled invoice (Invoice No, Bill No, etc.)
130
+ m = INVOICE_NO_RE.search(text)
131
  if m:
132
  inv = (m.group(1) or "").strip()
133
  if inv and inv.lower() != "invoice" and len(inv) > 2:
 
155
  Uses the original fast text extraction method.
156
  """
157
  # Try full-page text
158
+ text = page.get_text("text") or ""
159
  inv = try_extract_invoice_from_text(text)
160
  if inv:
161
  return inv
 
203
  if hasattr(document, 'fields') and document.fields:
204
  # Try InvoiceId field
205
  if 'InvoiceId' in document.fields and document.fields['InvoiceId']:
206
+ invoice_id = document.fields['InvoiceId'].value
207
  if invoice_id:
208
  print(f" ✓ Azure found InvoiceId: {invoice_id}")
209
+ return str(invoice_id).strip()
210
 
211
  # Try PurchaseOrder field
212
  if 'PurchaseOrder' in document.fields and document.fields['PurchaseOrder']:
 
216
  return str(po).strip()
217
 
218
  # Fallback: try regex on Azure-extracted text
219
+ if result.content:
220
  print(
221
  f" Azure extracted {len(result.content)} chars, trying regex...")
222
  inv = try_extract_invoice_from_text(result.content)
 
237
  # ============================================================================
238
 
239
  def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
240
+ """Try text extraction first, then Azure as fallback"""
241
+
 
242
  # ALWAYS try text extraction first (fast, no API cost)
243
  text_result = extract_invoice_text_based(page)
244
  if text_result:
 
276
  initial_dpi: int = Form(300), # Kept for compatibility
277
  ):
278
  """
279
+ Split a multi-invoice PDF into separate PDFs based on invoice numbers.
280
 
281
  Automatically detects PDF type:
282
+ - Text-based PDFs: Uses fast text extraction (original method)
283
  - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
284
 
285
  Parameters:
286
  - file: PDF file to split
287
  - include_pdf: Whether to include base64 PDF in response
288
  - initial_dpi: DPI setting (kept for compatibility)
 
 
289
  """
290
+ if not file.filename.lower().endswith(".pdf"):
291
  raise HTTPException(status_code=400, detail="only PDF is supported")
292
 
293
  file_bytes = await file.read()
 
295
  raise HTTPException(status_code=400, detail="empty file")
296
 
297
  try:
298
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
299
  if doc.page_count == 0:
300
  raise HTTPException(status_code=400, detail="no pages found")
301
 
 
310
  if is_image_pdf and not get_azure_client():
311
  raise HTTPException(
312
  status_code=500,
313
+ detail="Image-based PDF detected but Azure Document Intelligence is not configured. "
314
  "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
315
  )
316
 
 
370
 
371
  # Step 4: Build response parts
372
  parts = []
 
 
373
  for idx, g in enumerate(groups):
374
  part_bytes = build_pdf_from_pages(doc, g["pages"])
375
  info = {
 
379
  "size_bytes": len(part_bytes),
380
  }
381
  if include_pdf:
382
+ info["pdf_base64"] = base64.b64encode(
383
+ part_bytes).decode("ascii")
 
 
384
  parts.append(info)
385
  print(f"\nPart {idx+1}:")
386
  print(f" Invoice: {g['invoice_no']}")
387
  print(f" Pages: {info['pages']}")
388
  print(f" Size: {len(part_bytes):,} bytes")
 
 
389
 
390
+ doc.close()
391
 
392
  print(f"\n{'='*60}")
393
  print(f"✓ Successfully split into {len(parts)} part(s)")
 
 
394
  print(f"{'='*60}\n")
395
 
396
+ return JSONResponse({
 
397
  "count": len(parts),
398
  "pdf_type": "image-based" if is_image_pdf else "text-based",
399
+ "parts": parts
400
+ })
 
 
 
 
 
 
 
 
 
 
 
401
 
402
  except HTTPException:
403
  raise
 
416
  "status": "healthy",
417
  "azure_document_intelligence": azure_status,
418
  "azure_available": AZURE_AVAILABLE,
419
+ "endpoint": AZURE_FORM_RECOGNIZER_ENDPOINT if azure_status == "configured" else "not set"
 
420
  }
421
 
422
  if __name__ == "__main__":