Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ from collections import deque
|
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
|
| 15 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 16 |
from fastapi.responses import JSONResponse, StreamingResponse
|
| 17 |
from starlette.requests import Request
|
| 18 |
import fitz # PyMuPDF
|
|
@@ -22,7 +22,7 @@ try:
|
|
| 22 |
import google.generativeai as genai
|
| 23 |
from PIL import Image
|
| 24 |
GEMINI_AVAILABLE = True
|
| 25 |
-
except ImportError:
|
| 26 |
GEMINI_AVAILABLE = False
|
| 27 |
print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
|
| 28 |
|
|
@@ -113,7 +113,7 @@ def check_daily_quota():
|
|
| 113 |
global last_quota_reset, daily_quota_exhausted
|
| 114 |
now = datetime.datetime.now()
|
| 115 |
|
| 116 |
-
if last_quota_reset is None:
|
| 117 |
last_quota_reset = now
|
| 118 |
daily_quota_exhausted = False
|
| 119 |
return True
|
|
@@ -185,10 +185,10 @@ def reset_to_primary_model():
|
|
| 185 |
|
| 186 |
# --- Regex Patterns ---
|
| 187 |
INVOICE_NO_RE = re.compile(
|
| 188 |
-
r"""(?: Invoice\s*No\.
|
| 189 |
re.IGNORECASE | re.VERBOSE
|
| 190 |
)
|
| 191 |
-
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
|
| 192 |
GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
|
| 193 |
|
| 194 |
|
|
@@ -204,14 +204,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
|
|
| 204 |
|
| 205 |
# --- Extraction Logic ---
|
| 206 |
def normalize_text_for_search(s: str) -> str:
|
| 207 |
-
if not s:
|
| 208 |
return s
|
| 209 |
s = s.replace("\u00A0", " ")
|
| 210 |
return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
|
| 211 |
|
| 212 |
|
| 213 |
def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
| 214 |
-
if not text:
|
| 215 |
return None
|
| 216 |
text_norm = normalize_text_for_search(text)
|
| 217 |
|
|
@@ -237,10 +237,10 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
|
| 237 |
|
| 238 |
|
| 239 |
def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
| 240 |
-
if not check_daily_quota():
|
| 241 |
return None
|
| 242 |
model = get_gemini_model()
|
| 243 |
-
if not model:
|
| 244 |
return None
|
| 245 |
|
| 246 |
if not gemini_rate_limiter.allow_request():
|
|
@@ -251,7 +251,7 @@ def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
|
| 251 |
|
| 252 |
try:
|
| 253 |
# ⭐ Reduced resolution from 2x to 1.5x to save memory
|
| 254 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(1.
|
| 255 |
img_bytes = pix.tobytes("png")
|
| 256 |
|
| 257 |
# ⭐ Explicitly free pixmap memory
|
|
@@ -299,14 +299,14 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
|
|
| 299 |
# 1. Try Text Extraction (Fastest)
|
| 300 |
text = page.get_text("text") or ""
|
| 301 |
inv = try_extract_invoice_from_text(text)
|
| 302 |
-
if inv:
|
| 303 |
return inv
|
| 304 |
|
| 305 |
# 2. Try Block Extraction
|
| 306 |
for block in (page.get_text("blocks") or []):
|
| 307 |
if len(block) > 4 and block[4]:
|
| 308 |
inv = try_extract_invoice_from_text(block[4])
|
| 309 |
-
if inv:
|
| 310 |
return inv
|
| 311 |
|
| 312 |
# 3. Gemini Fallback (Only if enabled and seemingly image-based)
|
|
@@ -444,7 +444,7 @@ async def split_invoices(
|
|
| 444 |
for i in range(doc. page_count):
|
| 445 |
# ⭐ Progress logging for large documents
|
| 446 |
if i > 0 and i % 50 == 0:
|
| 447 |
-
print(f"
|
| 448 |
|
| 449 |
page = doc. load_page(i)
|
| 450 |
|
|
@@ -558,7 +558,7 @@ async def split_invoices(
|
|
| 558 |
}
|
| 559 |
})
|
| 560 |
|
| 561 |
-
except HTTPException:
|
| 562 |
raise # Re-raise HTTP exceptions as-is
|
| 563 |
|
| 564 |
except Exception as e:
|
|
@@ -574,7 +574,7 @@ async def split_invoices(
|
|
| 574 |
doc.close()
|
| 575 |
print("📕 Closed PDF document")
|
| 576 |
except Exception as e:
|
| 577 |
-
print(f"⚠️ Error closing document:
|
| 578 |
|
| 579 |
# Delete temp file
|
| 580 |
remove_file(temp_path)
|
|
@@ -590,7 +590,7 @@ async def split_invoices_stream(
|
|
| 590 |
max_file_size_mb: int = Form(200)
|
| 591 |
):
|
| 592 |
"""
|
| 593 |
-
Streaming version for extremely large files.
|
| 594 |
Returns NDJSON (newline-delimited JSON) with each part as a separate line.
|
| 595 |
|
| 596 |
This avoids building a large JSON response in memory.
|
|
@@ -638,7 +638,7 @@ async def split_invoices_stream(
|
|
| 638 |
# Extract invoice numbers
|
| 639 |
page_invoice_nos = []
|
| 640 |
for i in range(doc.page_count):
|
| 641 |
-
page = doc.
|
| 642 |
inv = extract_invoice_no_from_page(page, is_image_pdf)
|
| 643 |
page_invoice_nos.append(inv)
|
| 644 |
page = None
|
|
@@ -701,7 +701,7 @@ async def split_invoices_stream(
|
|
| 701 |
"error": str(e)
|
| 702 |
}) + "\n"
|
| 703 |
finally:
|
| 704 |
-
if doc:
|
| 705 |
doc.close()
|
| 706 |
remove_file(temp_path)
|
| 707 |
gc.collect()
|
|
@@ -710,7 +710,7 @@ async def split_invoices_stream(
|
|
| 710 |
generate_parts(),
|
| 711 |
media_type="application/x-ndjson",
|
| 712 |
headers={
|
| 713 |
-
"Content-Disposition":
|
| 714 |
}
|
| 715 |
)
|
| 716 |
|
|
|
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
|
| 15 |
+
from fastapi. middleware.cors import CORSMiddleware
|
| 16 |
from fastapi.responses import JSONResponse, StreamingResponse
|
| 17 |
from starlette.requests import Request
|
| 18 |
import fitz # PyMuPDF
|
|
|
|
| 22 |
import google.generativeai as genai
|
| 23 |
from PIL import Image
|
| 24 |
GEMINI_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
GEMINI_AVAILABLE = False
|
| 27 |
print("Warning: google-generativeai not installed. Image-based PDFs won't be supported.")
|
| 28 |
|
|
|
|
| 113 |
global last_quota_reset, daily_quota_exhausted
|
| 114 |
now = datetime.datetime.now()
|
| 115 |
|
| 116 |
+
if last_quota_reset is None:
|
| 117 |
last_quota_reset = now
|
| 118 |
daily_quota_exhausted = False
|
| 119 |
return True
|
|
|
|
| 185 |
|
| 186 |
# --- Regex Patterns ---
|
| 187 |
# Labelled invoice number: one of the labels below, an optional ":" or "-"
# separator, then the value in group 1 (an alphanumeric followed by at least
# three alphanumerics, hyphens or slashes). re.VERBOSE makes the engine ignore
# the unescaped space after "(?:"; re.IGNORECASE applies to labels and value.
INVOICE_NO_RE = re.compile(
    r"""(?: Invoice\s*No\.?|Inv\.\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
    re.IGNORECASE | re.VERBOSE
)

# Unlabelled invoice-like token: 2-4 uppercase letters, "-" or "/", 4+ digits,
# an optional "/digits" part and optional trailing uppercase letters.
# Case-sensitive on purpose (no IGNORECASE flag is passed).
PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")

# GST registration: group 1 is the label plus number, group 2 is the
# 15-character number alone. This pattern is NOT compiled with re.VERBOSE, so
# every space in it is literal; the label alternation must therefore not begin
# with a space, otherwise group 1 picks up a leading blank.
GST_LIKE_RE = re.compile(r"\b((?:GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
|
| 193 |
|
| 194 |
|
|
|
|
| 204 |
|
| 205 |
# --- Extraction Logic ---
|
| 206 |
def normalize_text_for_search(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    A falsy input (empty string or None) is returned unchanged, so callers
    can keep testing the result for truthiness.

    A single ``\\s+`` pass is used: on ``str`` patterns it matches Unicode
    whitespace, which covers the non-breaking space, CR/LF/tab and ASCII
    space handled before, plus form feed, vertical tab and other Unicode
    spaces (thin space, narrow no-break space) that previously survived
    normalisation.
    """
    if not s:
        return s
    return re.sub(r"\s+", " ", s).strip()
|
| 211 |
|
| 212 |
|
| 213 |
def try_extract_invoice_from_text(text: str) -> Optional[str]:
|
| 214 |
+
if not text:
|
| 215 |
return None
|
| 216 |
text_norm = normalize_text_for_search(text)
|
| 217 |
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
def extract_invoice_gemini(page: fitz.Page, retry_count=0) -> Optional[str]:
|
| 240 |
+
if not check_daily_quota():
|
| 241 |
return None
|
| 242 |
model = get_gemini_model()
|
| 243 |
+
if not model:
|
| 244 |
return None
|
| 245 |
|
| 246 |
if not gemini_rate_limiter.allow_request():
|
|
|
|
| 251 |
|
| 252 |
try:
|
| 253 |
# ⭐ Reduced resolution from 2x to 1.5x to save memory
|
| 254 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
|
| 255 |
img_bytes = pix.tobytes("png")
|
| 256 |
|
| 257 |
# ⭐ Explicitly free pixmap memory
|
|
|
|
| 299 |
# 1. Try Text Extraction (Fastest)
|
| 300 |
text = page.get_text("text") or ""
|
| 301 |
inv = try_extract_invoice_from_text(text)
|
| 302 |
+
if inv:
|
| 303 |
return inv
|
| 304 |
|
| 305 |
# 2. Try Block Extraction
|
| 306 |
for block in (page.get_text("blocks") or []):
|
| 307 |
if len(block) > 4 and block[4]:
|
| 308 |
inv = try_extract_invoice_from_text(block[4])
|
| 309 |
+
if inv:
|
| 310 |
return inv
|
| 311 |
|
| 312 |
# 3. Gemini Fallback (Only if enabled and seemingly image-based)
|
|
|
|
| 444 |
for i in range(doc. page_count):
|
| 445 |
# ⭐ Progress logging for large documents
|
| 446 |
if i > 0 and i % 50 == 0:
|
| 447 |
+
print(f" ��� Processed {i}/{doc.page_count} pages")
|
| 448 |
|
| 449 |
page = doc. load_page(i)
|
| 450 |
|
|
|
|
| 558 |
}
|
| 559 |
})
|
| 560 |
|
| 561 |
+
except HTTPException:
|
| 562 |
raise # Re-raise HTTP exceptions as-is
|
| 563 |
|
| 564 |
except Exception as e:
|
|
|
|
| 574 |
doc.close()
|
| 575 |
print("📕 Closed PDF document")
|
| 576 |
except Exception as e:
|
| 577 |
+
print(f"⚠️ Error closing document: {e}")
|
| 578 |
|
| 579 |
# Delete temp file
|
| 580 |
remove_file(temp_path)
|
|
|
|
| 590 |
max_file_size_mb: int = Form(200)
|
| 591 |
):
|
| 592 |
"""
|
| 593 |
+
Streaming version for extremely large files.
|
| 594 |
Returns NDJSON (newline-delimited JSON) with each part as a separate line.
|
| 595 |
|
| 596 |
This avoids building a large JSON response in memory.
|
|
|
|
| 638 |
# Extract invoice numbers
|
| 639 |
page_invoice_nos = []
|
| 640 |
for i in range(doc.page_count):
|
| 641 |
+
page = doc.load_page(i)
|
| 642 |
inv = extract_invoice_no_from_page(page, is_image_pdf)
|
| 643 |
page_invoice_nos.append(inv)
|
| 644 |
page = None
|
|
|
|
| 701 |
"error": str(e)
|
| 702 |
}) + "\n"
|
| 703 |
finally:
|
| 704 |
+
if doc:
|
| 705 |
doc.close()
|
| 706 |
remove_file(temp_path)
|
| 707 |
gc.collect()
|
|
|
|
| 710 |
generate_parts(),
|
| 711 |
media_type="application/x-ndjson",
|
| 712 |
headers={
|
| 713 |
+
"Content-Disposition": f"attachment; filename=invoices-split. ndjson"
|
| 714 |
}
|
| 715 |
)
|
| 716 |
|