Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ import google.generativeai as genai
|
|
| 15 |
# ---------------- LLM CONFIG (Gemini) ----------------
|
| 16 |
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 18 |
-
GEMINI_MODEL_NAME = "gemini-
|
| 19 |
|
| 20 |
if GEMINI_API_KEY:
|
| 21 |
genai.configure(api_key=GEMINI_API_KEY)
|
|
@@ -250,24 +250,26 @@ async def extract_bill_data(payload: BillRequest):
|
|
| 250 |
|
| 251 |
# ---- Step 2: OCR (PDF + images) ----
|
| 252 |
pagewise_ocr = [] # list of {page_no, page_type, text}
|
| 253 |
-
|
|
|
|
|
|
|
| 254 |
|
| 255 |
try:
|
| 256 |
# PDF case
|
| 257 |
-
if
|
| 258 |
pages = convert_from_bytes(file_bytes)
|
| 259 |
for idx, page_img in enumerate(pages, start=1):
|
| 260 |
text = pytesseract.image_to_string(page_img)
|
| 261 |
pagewise_ocr.append(
|
| 262 |
{
|
| 263 |
"page_no": str(idx),
|
| 264 |
-
"page_type": "Bill Detail",
|
| 265 |
"text": text,
|
| 266 |
}
|
| 267 |
)
|
| 268 |
|
| 269 |
# Image case
|
| 270 |
-
elif any(
|
| 271 |
image = Image.open(BytesIO(file_bytes))
|
| 272 |
text = pytesseract.image_to_string(image)
|
| 273 |
pagewise_ocr.append(
|
|
|
|
| 15 |
# ---------------- LLM CONFIG (Gemini) ----------------
|
| 16 |
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 18 |
+
GEMINI_MODEL_NAME = "gemini-2.5-flash"
|
| 19 |
|
| 20 |
if GEMINI_API_KEY:
|
| 21 |
genai.configure(api_key=GEMINI_API_KEY)
|
|
|
|
| 250 |
|
| 251 |
# ---- Step 2: OCR (PDF + images) ----
|
| 252 |
pagewise_ocr = [] # list of {page_no, page_type, text}
|
| 253 |
+
|
| 254 |
+
# IMPORTANT: strip query (?sv=...) only for extension detection
|
| 255 |
+
clean_url = doc_url.split("?", 1)[0].lower()
|
| 256 |
|
| 257 |
try:
|
| 258 |
# PDF case
|
| 259 |
+
if clean_url.endswith(".pdf"):
|
| 260 |
pages = convert_from_bytes(file_bytes)
|
| 261 |
for idx, page_img in enumerate(pages, start=1):
|
| 262 |
text = pytesseract.image_to_string(page_img)
|
| 263 |
pagewise_ocr.append(
|
| 264 |
{
|
| 265 |
"page_no": str(idx),
|
| 266 |
+
"page_type": "Bill Detail",
|
| 267 |
"text": text,
|
| 268 |
}
|
| 269 |
)
|
| 270 |
|
| 271 |
# Image case
|
| 272 |
+
elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
|
| 273 |
image = Image.open(BytesIO(file_bytes))
|
| 274 |
text = pytesseract.image_to_string(image)
|
| 275 |
pagewise_ocr.append(
|