anujakkulkarni committed on
Commit
ea68370
·
verified ·
1 Parent(s): b8cd992

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -19
app.py CHANGED
@@ -12,7 +12,7 @@ from collections import deque
12
  from pathlib import Path
13
 
14
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
15
- from fastapi.middleware.cors import CORSMiddleware
16
  from fastapi.responses import JSONResponse, StreamingResponse
17
  from starlette.requests import Request
18
  import fitz # PyMuPDF
@@ -22,7 +22,7 @@ try:
22
  import google.generativeai as genai
23
  from PIL import Image
24
  GEMINI_AVAILABLE = True
25
- except ImportError:
26
  GEMINI_AVAILABLE = False
27
  print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
28
 
@@ -113,7 +113,7 @@ def check_daily_quota():
113
  global last_quota_reset, daily_quota_exhausted
114
  now = datetime.datetime.now()
115
 
116
- if last_quota_reset is None:
117
  last_quota_reset = now
118
  daily_quota_exhausted = False
119
  return True
@@ -185,10 +185,10 @@ def reset_to_primary_model():
185
 
186
  # --- Regex Patterns ---
187
  INVOICE_NO_RE = re.compile(
188
- r"""(?: Invoice\s*No\. ?|Inv\. ?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
189
  re.IGNORECASE | re.VERBOSE
190
  )
191
- PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
192
  GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
193
 
194
 
@@ -204,14 +204,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
204
 
205
  # --- Extraction Logic ---
206
  def normalize_text_for_search(s: str) -> str:
207
- if not s:
208
  return s
209
  s = s.replace("\u00A0", " ")
210
  return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
211
 
212
 
213
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
214
- if not text:
215
  return None
216
  text_norm = normalize_text_for_search(text)
217
 
@@ -237,10 +237,10 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
237
 
238
 
239
  def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
240
- if not check_daily_quota():
241
  return None
242
  model = get_gemini_model()
243
- if not model:
244
  return None
245
 
246
  if not gemini_rate_limiter.allow_request():
@@ -251,7 +251,7 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
251
 
252
  try:
253
  # ⭐ Reduced resolution from 2x to 1.5x to save memory
254
- pix = page.get_pixmap(matrix=fitz.Matrix(1. 5, 1.5), dpi=150)
255
  img_bytes = pix.tobytes("png")
256
 
257
  # ⭐ Explicitly free pixmap memory
@@ -299,14 +299,14 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
299
  # 1. Try Text Extraction (Fastest)
300
  text = page.get_text("text") or ""
301
  inv = try_extract_invoice_from_text(text)
302
- if inv:
303
  return inv
304
 
305
  # 2. Try Block Extraction
306
  for block in (page.get_text("blocks") or []):
307
  if len(block) > 4 and block[4]:
308
  inv = try_extract_invoice_from_text(block[4])
309
- if inv:
310
  return inv
311
 
312
  # 3. Gemini Fallback (Only if enabled and seemingly image-based)
@@ -444,7 +444,7 @@ async def split_invoices(
444
  for i in range(doc. page_count):
445
  # ⭐ Progress logging for large documents
446
  if i > 0 and i % 50 == 0:
447
- print(f" 📄 Processed {i}/{doc.page_count} pages")
448
 
449
  page = doc. load_page(i)
450
 
@@ -558,7 +558,7 @@ async def split_invoices(
558
  }
559
  })
560
 
561
- except HTTPException:
562
  raise # Re-raise HTTP exceptions as-is
563
 
564
  except Exception as e:
@@ -574,7 +574,7 @@ async def split_invoices(
574
  doc.close()
575
  print("📕 Closed PDF document")
576
  except Exception as e:
577
- print(f"⚠️ Error closing document: {e}")
578
 
579
  # Delete temp file
580
  remove_file(temp_path)
@@ -590,7 +590,7 @@ async def split_invoices_stream(
590
  max_file_size_mb: int = Form(200)
591
  ):
592
  """
593
- Streaming version for extremely large files.
594
  Returns NDJSON (newline-delimited JSON) with each part as a separate line.
595
 
596
  This avoids building a large JSON response in memory.
@@ -638,7 +638,7 @@ async def split_invoices_stream(
638
  # Extract invoice numbers
639
  page_invoice_nos = []
640
  for i in range(doc.page_count):
641
- page = doc. load_page(i)
642
  inv = extract_invoice_no_from_page(page, is_image_pdf)
643
  page_invoice_nos.append(inv)
644
  page = None
@@ -701,7 +701,7 @@ async def split_invoices_stream(
701
  "error": str(e)
702
  }) + "\n"
703
  finally:
704
- if doc:
705
  doc.close()
706
  remove_file(temp_path)
707
  gc.collect()
@@ -710,7 +710,7 @@ async def split_invoices_stream(
710
  generate_parts(),
711
  media_type="application/x-ndjson",
712
  headers={
713
- "Content-Disposition": f"attachment; filename=invoices-split. ndjson"
714
  }
715
  )
716
 
 
12
  from pathlib import Path
13
 
14
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
15
+ from fastapi. middleware.cors import CORSMiddleware
16
  from fastapi.responses import JSONResponse, StreamingResponse
17
  from starlette.requests import Request
18
  import fitz # PyMuPDF
 
22
  import google.generativeai as genai
23
  from PIL import Image
24
  GEMINI_AVAILABLE = True
25
+ except ImportError:
26
  GEMINI_AVAILABLE = False
27
  print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
28
 
 
113
  global last_quota_reset, daily_quota_exhausted
114
  now = datetime.datetime.now()
115
 
116
+ if last_quota_reset is None:
117
  last_quota_reset = now
118
  daily_quota_exhausted = False
119
  return True
 
185
 
186
  # --- Regex Patterns ---
187
  INVOICE_NO_RE = re.compile(
188
+ r"""(?: Invoice\s*No\.?|Inv\.\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
189
  re.IGNORECASE | re.VERBOSE
190
  )
191
+ PREFIXED_INVOICE_RE = re. compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
192
  GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
193
 
194
 
 
204
 
205
  # --- Extraction Logic ---
206
  def normalize_text_for_search(s: str) -> str:
207
+ if not s:
208
  return s
209
  s = s.replace("\u00A0", " ")
210
  return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
211
 
212
 
213
  def try_extract_invoice_from_text(text: str) -> Optional[str]:
214
+ if not text:
215
  return None
216
  text_norm = normalize_text_for_search(text)
217
 
 
237
 
238
 
239
  def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
240
+ if not check_daily_quota():
241
  return None
242
  model = get_gemini_model()
243
+ if not model:
244
  return None
245
 
246
  if not gemini_rate_limiter.allow_request():
 
251
 
252
  try:
253
  # ⭐ Reduced resolution from 2x to 1.5x to save memory
254
+ pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
255
  img_bytes = pix.tobytes("png")
256
 
257
  # ⭐ Explicitly free pixmap memory
 
299
  # 1. Try Text Extraction (Fastest)
300
  text = page.get_text("text") or ""
301
  inv = try_extract_invoice_from_text(text)
302
+ if inv:
303
  return inv
304
 
305
  # 2. Try Block Extraction
306
  for block in (page.get_text("blocks") or []):
307
  if len(block) > 4 and block[4]:
308
  inv = try_extract_invoice_from_text(block[4])
309
+ if inv:
310
  return inv
311
 
312
  # 3. Gemini Fallback (Only if enabled and seemingly image-based)
 
444
  for i in range(doc. page_count):
445
  # ⭐ Progress logging for large documents
446
  if i > 0 and i % 50 == 0:
447
+ print(f" 📄 Processed {i}/{doc.page_count} pages")
448
 
449
  page = doc. load_page(i)
450
 
 
558
  }
559
  })
560
 
561
+ except HTTPException:
562
  raise # Re-raise HTTP exceptions as-is
563
 
564
  except Exception as e:
 
574
  doc.close()
575
  print("📕 Closed PDF document")
576
  except Exception as e:
577
+ print(f"⚠️ Error closing document: {e}")
578
 
579
  # Delete temp file
580
  remove_file(temp_path)
 
590
  max_file_size_mb: int = Form(200)
591
  ):
592
  """
593
+ Streaming version for extremely large files.
594
  Returns NDJSON (newline-delimited JSON) with each part as a separate line.
595
 
596
  This avoids building a large JSON response in memory.
 
638
  # Extract invoice numbers
639
  page_invoice_nos = []
640
  for i in range(doc.page_count):
641
+ page = doc.load_page(i)
642
  inv = extract_invoice_no_from_page(page, is_image_pdf)
643
  page_invoice_nos.append(inv)
644
  page = None
 
701
  "error": str(e)
702
  }) + "\n"
703
  finally:
704
+ if doc:
705
  doc.close()
706
  remove_file(temp_path)
707
  gc.collect()
 
710
  generate_parts(),
711
  media_type="application/x-ndjson",
712
  headers={
713
+ "Content-Disposition": f"attachment; filename=invoices-split. ndjson"
714
  }
715
  )
716