anujakkulkarni committed on
Commit
60a66d0
·
verified ·
1 Parent(s): df84667

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -40
app.py CHANGED
@@ -1,13 +1,22 @@
1
  import io
2
  import re
3
  import base64
4
- from typing import List, Dict, Optional
5
 
6
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.responses import JSONResponse
9
  import fitz # PyMuPDF
10
 
 
 
 
 
 
 
 
 
 
11
  app = FastAPI(title="Invoice Splitter API")
12
 
13
  app.add_middleware(
@@ -18,70 +27,260 @@ app.add_middleware(
18
  allow_headers=["*"],
19
  )
20
 
21
- # --- improved invoice detection (replace the old INVOICE_NO_RE + function) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  INVOICE_NO_RE = re.compile(
23
  r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
24
  re.IGNORECASE,
25
  )
26
 
27
- # fallback pattern to capture common GST-like invoice ids (GST-12345 etc)
 
 
 
28
  GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", re.IGNORECASE)
29
 
30
 
31
def extract_invoice_no_from_page(page: fitz.Page) -> Optional[str]:
    """
    Find an invoice identifier on a single PDF page.

    Lookup order:
      1. Labeled id (Inv No / Invoice No / Bill No) in the full-page text.
      2. The same labeled search, one text block at a time (helps when the
         label and the id sit on different lines).
      3. A GST-style id (e.g. "GST-25507") anywhere in the full-page text.

    Returns the stripped identifier, or None when nothing matched.
    """

    def labeled_id(chunk: str) -> Optional[str]:
        # A captured value that is just the word "invoice" is a heading,
        # not an identifier, so it is discarded.
        hit = INVOICE_NO_RE.search(chunk)
        if hit is None:
            return None
        value = (hit.group(1) or "").strip()
        return value if value and value.lower() != "invoice" else None

    page_text = page.get_text("text") or ""

    # 1) labeled pattern on the whole page
    found = labeled_id(page_text)
    if found:
        return found

    # 2) labeled pattern per block; the text payload is at index 4
    for blk in page.get_text("blocks") or []:
        blk_text = blk[4] if len(blk) > 4 else ""
        if blk_text:
            found = labeled_id(blk_text)
            if found:
                return found

    # 3) GST-like fallback, with inner spaces removed ("GST 123" -> "GST123")
    gst_hit = GST_LIKE_RE.search(page_text)
    return gst_hit.group(1).replace(" ", "").strip() if gst_hit else None
65
- # -------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """Create a new PDF with the given pages (0-based indices).

    Args:
        src_doc: Open source document the pages are copied from.
        page_indices: 0-based page numbers, copied in the order given.

    Returns:
        The serialized bytes of the newly assembled PDF.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            # insert_pdf selects pages with from_page/to_page (inclusive),
            # so a single page is copied by passing the same index twice.
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Always release the temporary document, even when a page index is
        # invalid and insert_pdf/tobytes raises.
        out.close()
77
 
78
 
 
 
 
 
79
  @app.post("/split-invoices")
80
  async def split_invoices(
81
  file: UploadFile = File(...),
82
  include_pdf: bool = Form(True),
83
- initial_dpi: int = Form(300), # kept for compatibility; not used here
84
  ):
 
 
 
 
 
 
 
 
 
 
 
 
85
  if not file.filename.lower().endswith(".pdf"):
86
  raise HTTPException(status_code=400, detail="only PDF is supported")
87
 
@@ -94,23 +293,49 @@ async def split_invoices(
94
  if doc.page_count == 0:
95
  raise HTTPException(status_code=400, detail="no pages found")
96
 
97
- # Extract invoice number per page (0-based)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  page_invoice_nos: List[Optional[str]] = []
99
  for i in range(doc.page_count):
100
- inv = extract_invoice_no_from_page(doc.load_page(i))
 
 
 
 
 
101
  page_invoice_nos.append(inv)
102
 
103
- # Group pages: start a new group when a NEW non-None invoice number appears
 
 
 
 
104
  groups: List[Dict] = []
105
  current_group_pages: List[int] = []
106
  current_invoice: Optional[str] = None
107
 
108
  for idx, inv in enumerate(page_invoice_nos):
109
  if current_invoice is None:
 
110
  current_invoice = inv
111
  current_group_pages = [idx]
112
  else:
113
  if inv is not None and inv != current_invoice:
 
114
  groups.append({
115
  "invoice_no": current_invoice,
116
  "pages": current_group_pages[:],
@@ -118,19 +343,28 @@ async def split_invoices(
118
  current_invoice = inv
119
  current_group_pages = [idx]
120
  else:
 
121
  current_group_pages.append(idx)
122
 
 
123
  if current_group_pages:
124
- groups.append({"invoice_no": current_invoice,
125
- "pages": current_group_pages[:]})
 
 
126
 
127
- # If we never found any invoice numbers, return the whole doc as one part
128
  if all(g["invoice_no"] is None for g in groups):
129
- groups = [{"invoice_no": None,
130
- "pages": list(range(doc.page_count))}]
 
 
 
 
131
 
 
132
  parts = []
133
- for g in groups:
134
  part_bytes = build_pdf_from_pages(doc, g["pages"])
135
  info = {
136
  "invoice_no": g["invoice_no"],
@@ -139,14 +373,46 @@ async def split_invoices(
139
  "size_bytes": len(part_bytes),
140
  }
141
  if include_pdf:
142
- info["pdf_base64"] = base64.b64encode(
143
- part_bytes).decode("ascii")
144
  parts.append(info)
 
 
 
 
145
 
146
  doc.close()
147
- return JSONResponse({"count": len(parts), "parts": parts})
 
 
 
 
 
 
 
 
 
148
 
149
  except HTTPException:
150
  raise
151
  except Exception as e:
 
 
 
152
  return JSONResponse({"error": str(e)}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import io
2
  import re
3
  import base64
4
+ from typing import List, Dict, Optional, Tuple
5
 
6
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.responses import JSONResponse
9
  import fitz # PyMuPDF
10
 
11
+ # Azure Document Intelligence (Form Recognizer) - optional import
12
+ try:
13
+ from azure.ai.formrecognizer import DocumentAnalysisClient
14
+ from azure.core.credentials import AzureKeyCredential
15
+ AZURE_AVAILABLE = True
16
+ except ImportError:
17
+ AZURE_AVAILABLE = False
18
+ print("Warning: azure-ai-formrecognizer not installed. Image-based PDFs won't be supported.")
19
+
20
  app = FastAPI(title="Invoice Splitter API")
21
 
22
  app.add_middleware(
 
27
  allow_headers=["*"],
28
  )
29
 
30
# --- Azure Document Intelligence configuration ---
# The literal defaults below are placeholders: replace them with real
# credentials, or set the environment variables of the same name, which take
# precedence over the literals.
import os

AZURE_FORM_RECOGNIZER_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://your-resource-name.cognitiveservices.azure.com/",
)
AZURE_FORM_RECOGNIZER_KEY = os.environ.get(
    "AZURE_FORM_RECOGNIZER_KEY",
    "your-actual-key-here",
)

# Shared client instance; starts unset.
azure_client = None
41
+
42
+
43
def get_azure_client() -> "Optional[DocumentAnalysisClient]":
    """Get or create the shared Azure Document Intelligence client.

    The client is cached in the module-level ``azure_client`` and reused on
    later calls.

    Returns:
        The cached client, or ``None`` when the Azure SDK is not installed,
        the endpoint/key are empty or still the placeholder values, or
        constructing the client raised an exception.
    """
    # NOTE: the return annotation is deliberately a string. The name
    # DocumentAnalysisClient is only bound when the optional azure import
    # succeeded; an unquoted annotation is evaluated when the function is
    # defined and would raise NameError exactly when the SDK is missing.
    global azure_client

    if not AZURE_AVAILABLE:
        print("Azure SDK not available")
        return None

    if azure_client is not None:
        return azure_client

    # Refuse to build a client from empty or placeholder credentials.
    if (not AZURE_FORM_RECOGNIZER_ENDPOINT or
            not AZURE_FORM_RECOGNIZER_KEY or
            AZURE_FORM_RECOGNIZER_ENDPOINT == "https://your-resource-name.cognitiveservices.azure.com/" or
            AZURE_FORM_RECOGNIZER_KEY == "your-actual-key-here"):
        print("Warning: Azure credentials are not properly configured in the code.")
        return None

    try:
        azure_client = DocumentAnalysisClient(
            endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT,
            credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY)
        )
        print("✓ Azure Document Intelligence client initialized")
        print(f" Endpoint: {AZURE_FORM_RECOGNIZER_ENDPOINT}")
    except Exception as e:
        # Best effort: callers treat None as "Azure unavailable".
        print(f"Failed to initialize Azure client: {e}")
        return None

    return azure_client
72
+
73
+
74
# --- Regex patterns for text-based PDF extraction ---

# Labeled identifier: "Inv No", "Invoice No", "Bill No" (optional trailing
# dot), an optional ':' or '-' separator, then the id itself in group 1.
INVOICE_NO_RE = re.compile(
    r"(?:Inv(?:oice)?\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|BILL\s*NO\.?|BILL\s*NO)\s*[:\-]?\s*([A-Za-z0-9\-/]+)",
    flags=re.I,
)

# Unlabeled id with a 2-4 upper-case letter prefix, e.g. "WN-12345/25".
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")

# GST-style id, e.g. "GST-25507" or "GST 25507".
GST_LIKE_RE = re.compile(r"\b(GST[-\s]?\d+[A-Za-z0-9-]*)\b", flags=re.I)
85
 
86
 
87
def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool, float]:
    """
    Detect if PDF is image-based or text-based by sampling pages.

    Strategy:
    - Sample up to ``sample_pages`` pages from the start of the document
    - If the average extractable text is below 50 chars per page, classify
      the PDF as image-based; otherwise as text-based

    Args:
        doc: Open document to inspect.
        sample_pages: Maximum number of leading pages to sample.

    Returns:
        (is_image_based, avg_text_length). When no page can be sampled
        (empty document or ``sample_pages`` <= 0) the average is 0.0 and the
        document is therefore classified as image-based.
    """
    pages_to_check = max(0, min(sample_pages, doc.page_count))

    if pages_to_check == 0:
        # Nothing to sample: avoid dividing by zero below.
        avg_text_length = 0.0
    else:
        total_text_length = sum(
            len((doc.load_page(i).get_text("text") or "").strip())
            for i in range(pages_to_check)
        )
        avg_text_length = total_text_length / pages_to_check

    is_image_based = avg_text_length < 50

    print(f" PDF Type Detection: avg_text_length={avg_text_length:.1f} chars/page")
    print(f" Classification: {'IMAGE-BASED' if is_image_based else 'TEXT-BASED'} PDF")

    return is_image_based, avg_text_length
111
+
112
+
113
+ # ============================================================================
114
+ # TEXT-BASED PDF EXTRACTION (Original Code)
115
+ # ============================================================================
116
+
117
def try_extract_invoice_from_text(text: str) -> Optional[str]:
    """
    Extract invoice number from text using regex patterns.

    Patterns are tried in priority order:
      1. Labeled id (Invoice No, Bill No, ...) anywhere in the text.
      2. Prefixed id (e.g. WN-12345/25) within the first 500 characters.
      3. GST-style id anywhere in the text.

    Args:
        text: Plain text of a page or text block; may be empty.

    Returns:
        The identifier string, or None when no pattern yields a usable id.
    """
    if not text:
        return None

    # Pattern 1: labeled invoice. Every labeled occurrence is examined, not
    # only the first one: a label directly followed by a heading word
    # (captured as "Invoice") or by a 1-2 character token must not hide a
    # valid labeled id further down the text.
    for m in INVOICE_NO_RE.finditer(text):
        inv = (m.group(1) or "").strip()
        if inv and inv.lower() != "invoice" and len(inv) > 2:
            return inv

    # Pattern 2: prefixed invoice (WN-12345/25) - search top portion only
    m = PREFIXED_INVOICE_RE.search(text[:500])
    if m:
        inv = (m.group(1) or "").strip()
        if inv and len(inv) >= 7:
            return inv

    # Pattern 3: GST format, with inner spaces removed
    m = GST_LIKE_RE.search(text)
    if m:
        return m.group(1).replace(" ", "").strip()

    return None
146
+
147
 
148
def extract_invoice_text_based(page: fitz.Page) -> Optional[str]:
    """
    Pull an invoice number out of a page that carries extractable text.

    The full-page text is searched first; if that yields nothing, each text
    block is searched on its own. Returns None when neither pass finds an id.
    """
    # Pass 1: the page as a single string
    whole_page_hit = try_extract_invoice_from_text(page.get_text("text") or "")
    if whole_page_hit:
        return whole_page_hit

    # Pass 2: block by block; the text payload is at index 4
    for blk in page.get_text("blocks") or []:
        if len(blk) <= 4:
            continue
        blk_text = blk[4]
        if not blk_text:
            continue
        block_hit = try_extract_invoice_from_text(blk_text)
        if block_hit:
            return block_hit

    return None
168
 
 
 
 
 
169
 
170
+ # ============================================================================
171
+ # IMAGE-BASED PDF EXTRACTION (Azure Document Intelligence)
172
+ # ============================================================================
173
+
174
def extract_invoice_azure(page: fitz.Page) -> Optional[str]:
    """
    Read the invoice number from an image-only page via Azure Document
    Intelligence.

    The page is rendered to PNG and analyzed with the "prebuilt-invoice"
    model. The InvoiceId field is preferred, then PurchaseOrder; if neither
    is present the regex extractor is run over the text Azure returned.
    Returns None when no client is available, nothing is found, or any step
    raises.
    """
    client = get_azure_client()
    if not client:
        print(" Azure client not available")
        return None

    try:
        # Render the page at 2x zoom and encode it as PNG
        png_bytes = page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png")

        print(" Calling Azure Document Intelligence API...")
        result = client.begin_analyze_document(
            "prebuilt-invoice",
            document=png_bytes,
        ).result()

        # Structured fields first, in priority order per analyzed document
        for document in result.documents or ():
            fields = getattr(document, "fields", None)
            if not fields:
                continue
            for field_name in ("InvoiceId", "PurchaseOrder"):
                if field_name in fields and fields[field_name]:
                    field_value = fields[field_name].value
                    if field_value:
                        print(f" ✓ Azure found {field_name}: {field_value}")
                        return str(field_value).strip()

        # Fallback: regex over the raw text Azure extracted
        ocr_text = result.content
        if ocr_text:
            print(f" Azure extracted {len(ocr_text)} chars, trying regex...")
            regex_hit = try_extract_invoice_from_text(ocr_text)
            if regex_hit:
                print(f" ✓ Found via regex on Azure text: {regex_hit}")
                return regex_hit

        print(" ✗ Azure: No invoice found")
        return None

    except Exception as e:
        # Best effort: a failed analysis is reported as "no invoice"
        print(f" ✗ Azure extraction failed: {e}")
        return None
228
+
229
+
230
+ # ============================================================================
231
+ # UNIFIED EXTRACTION LOGIC
232
+ # ============================================================================
233
+
234
def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optional[str]:
    """
    Dispatch invoice-number extraction according to the PDF type.

    Args:
        page: PDF page to extract from
        is_image_pdf: True routes to the Azure extractor, False to the
            text extractor

    Returns:
        Whatever the selected extractor returns (an id string or None).
    """
    if not is_image_pdf:
        # Text-based PDFs: plain text extraction
        print(" Method: Text extraction (text-based)")
        return extract_invoice_text_based(page)

    # Image-based PDFs: Azure Document Intelligence
    print(" Method: Azure Document Intelligence (image-based)")
    return extract_invoice_azure(page)
250
 
251
 
252
def build_pdf_from_pages(src_doc: fitz.Document, page_indices: List[int]) -> bytes:
    """Create a new PDF with the given pages (0-based indices).

    Args:
        src_doc: Open source document the pages are copied from.
        page_indices: 0-based page numbers, copied in the order given.

    Returns:
        The serialized bytes of the newly assembled PDF.
    """
    out = fitz.open()
    try:
        for i in page_indices:
            # from_page/to_page are inclusive, so equal values copy one page.
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Always release the temporary document, even when a page index is
        # invalid and insert_pdf/tobytes raises.
        out.close()
260
 
261
 
262
+ # ============================================================================
263
+ # API ENDPOINT
264
+ # ============================================================================
265
+
266
  @app.post("/split-invoices")
267
  async def split_invoices(
268
  file: UploadFile = File(...),
269
  include_pdf: bool = Form(True),
270
+ initial_dpi: int = Form(300), # Kept for compatibility
271
  ):
272
+ """
273
+ Split a multi-invoice PDF into separate PDFs based on invoice numbers.
274
+
275
+ Automatically detects PDF type:
276
+ - Text-based PDFs: Uses fast text extraction (original method)
277
+ - Image-based PDFs: Uses Azure Document Intelligence for accurate OCR
278
+
279
+ Parameters:
280
+ - file: PDF file to split
281
+ - include_pdf: Whether to include base64 PDF in response
282
+ - initial_dpi: DPI setting (kept for compatibility)
283
+ """
284
  if not file.filename.lower().endswith(".pdf"):
285
  raise HTTPException(status_code=400, detail="only PDF is supported")
286
 
 
293
  if doc.page_count == 0:
294
  raise HTTPException(status_code=400, detail="no pages found")
295
 
296
+ print(f"\n{'='*60}")
297
+ print(f"Processing PDF: {file.filename}")
298
+ print(f"Total pages: {doc.page_count}")
299
+ print(f"{'='*60}")
300
+
301
+ # Step 1: Detect PDF type (text-based vs image-based)
302
+ is_image_pdf, avg_text_len = is_image_based_pdf(doc)
303
+
304
+ if is_image_pdf and not get_azure_client():
305
+ raise HTTPException(
306
+ status_code=500,
307
+ detail="Image-based PDF detected but Azure Document Intelligence is not configured. "
308
+ "Please update AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY in the code."
309
+ )
310
+
311
+ # Step 2: Extract invoice numbers from each page
312
  page_invoice_nos: List[Optional[str]] = []
313
  for i in range(doc.page_count):
314
+ print(f"\n--- Page {i+1}/{doc.page_count} ---")
315
+ inv = extract_invoice_no_from_page(doc.load_page(i), is_image_pdf)
316
+ if inv:
317
+ print(f" ✓ Invoice found: {inv}")
318
+ else:
319
+ print(f" ✗ No invoice found")
320
  page_invoice_nos.append(inv)
321
 
322
+ print(f"\n{'='*60}")
323
+ print(f"Extraction Results: {page_invoice_nos}")
324
+ print(f"{'='*60}")
325
+
326
+ # Step 3: Group pages by invoice number
327
  groups: List[Dict] = []
328
  current_group_pages: List[int] = []
329
  current_invoice: Optional[str] = None
330
 
331
  for idx, inv in enumerate(page_invoice_nos):
332
  if current_invoice is None:
333
+ # Start first group
334
  current_invoice = inv
335
  current_group_pages = [idx]
336
  else:
337
  if inv is not None and inv != current_invoice:
338
+ # New invoice detected - save current group
339
  groups.append({
340
  "invoice_no": current_invoice,
341
  "pages": current_group_pages[:],
 
343
  current_invoice = inv
344
  current_group_pages = [idx]
345
  else:
346
+ # Continue current group (same invoice or no invoice)
347
  current_group_pages.append(idx)
348
 
349
+ # Save last group
350
  if current_group_pages:
351
+ groups.append({
352
+ "invoice_no": current_invoice,
353
+ "pages": current_group_pages[:]
354
+ })
355
 
356
+ # If no invoices found, return whole document as one part
357
  if all(g["invoice_no"] is None for g in groups):
358
+ print("\n⚠ Warning: No invoices detected in any page!")
359
+ print(" Returning entire PDF as single part")
360
+ groups = [{
361
+ "invoice_no": None,
362
+ "pages": list(range(doc.page_count))
363
+ }]
364
 
365
+ # Step 4: Build response parts
366
  parts = []
367
+ for idx, g in enumerate(groups):
368
  part_bytes = build_pdf_from_pages(doc, g["pages"])
369
  info = {
370
  "invoice_no": g["invoice_no"],
 
373
  "size_bytes": len(part_bytes),
374
  }
375
  if include_pdf:
376
+ info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
 
377
  parts.append(info)
378
+ print(f"\nPart {idx+1}:")
379
+ print(f" Invoice: {g['invoice_no']}")
380
+ print(f" Pages: {info['pages']}")
381
+ print(f" Size: {len(part_bytes):,} bytes")
382
 
383
  doc.close()
384
+
385
+ print(f"\n{'='*60}")
386
+ print(f"✓ Successfully split into {len(parts)} part(s)")
387
+ print(f"{'='*60}\n")
388
+
389
+ return JSONResponse({
390
+ "count": len(parts),
391
+ "pdf_type": "image-based" if is_image_pdf else "text-based",
392
+ "parts": parts
393
+ })
394
 
395
  except HTTPException:
396
  raise
397
  except Exception as e:
398
+ print(f"\n✗ Error: {str(e)}")
399
+ import traceback
400
+ traceback.print_exc()
401
  return JSONResponse({"error": str(e)}, status_code=500)
402
+
403
+
404
@app.get("/health")
async def health_check():
    """Report service liveness together with the Azure configuration state."""
    # A usable client is what counts as "configured"
    if get_azure_client():
        azure_status = "configured"
        endpoint = AZURE_FORM_RECOGNIZER_ENDPOINT
    else:
        azure_status = "not configured"
        endpoint = "not set"

    payload = {"status": "healthy"}
    payload["azure_document_intelligence"] = azure_status
    payload["azure_available"] = AZURE_AVAILABLE
    payload["endpoint"] = endpoint
    return payload
414
+
415
+
416
if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)