Upload folder using huggingface_hub
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py
import os
import re
import json
@@ -24,10 +24,11 @@ if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# ---------------- FASTAPI APP ----------------
-app = FastAPI(title="Bajaj Datathon - Bill Extractor")

class BillRequest(BaseModel):
    document: str
# ---------------- Helpers: number normalization & detection ----------------
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")  # matches numbers with commas, decimals
TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
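
NUM_RE pulls numeric tokens (with thousands separators and decimals) out of a row, and TOTAL_KEYWORDS is used to drop total and sub-total rows. The comma-stripping parser used alongside it (normalize_num_str) is outside this diff; the snippet below is a minimal sketch of how the pair could behave, with normalize_num_str written here only as an illustrative stand-in.

import re

NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")

def normalize_num_str(token):
    # Illustrative only: strip everything except digits, sign and dot, then parse.
    cleaned = re.sub(r"[^\d.+-]", "", token)
    try:
        return float(cleaned)
    except ValueError:
        return None

row = "PARACETAMOL 500MG TAB 2 1,250.00"
tokens = [m.group(0) for m in NUM_RE.finditer(row)]
# tokens -> ['500', '2', '1,250.00']; the last numeric token is the line amount
print(normalize_num_str(tokens[-1]))  # 1250.0
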
@@ -244,6 +245,7 @@ def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any
        "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
        "item_quantity": float(qty_val)
    }
# ---------------- Duplicate suppression & subtotal detection ----------------
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
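
The dedupe_items body falls outside this hunk, so its exact rule is not visible here. Below is a minimal sketch of one plausible approach (keying on a normalised name plus the amount), shown purely for orientation; it is not necessarily what the app does.

from typing import Any, Dict, List

def dedupe_items_sketch(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Keep the first occurrence of each (normalised name, amount) pair.
    seen = set()
    unique = []
    for it in items:
        key = (it["item_name"].strip().lower(), round(float(it["item_amount"]), 2))
        if key in seen:
            continue
        seen.add(key)
        unique.append(it)
    return unique
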
@@ -351,322 +353,153 @@ def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]
    except Exception:
        return page_items, zero_usage

-
-# ---------------- FALLBACK REGEX EXTRACTOR ----------------
-
-def extract_items_from_text(text: str):
-    """
-    Very simple rule-based extractor used as a fallback
-    when LLM is not available or fails.
-
-    Logic:
-    - Split OCR text into lines
-    - For each line, if it has at least one numeric token,
-      treat the last numeric token as item_amount
-    - Everything before that is item_name
-    - Skip lines that look like totals
-    """
-    lines = [line.strip() for line in text.splitlines() if line.strip()]
-    bill_items = []
-
-    for line in lines:
-        # Skip obvious total lines
-        if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
-            continue
-
-        tokens = line.split()
-        if not tokens:
-            continue
-
-        # Numeric tokens like 123 or 45.67
-        numeric_indices = [
-            i for i, tok in enumerate(tokens)
-            if re.fullmatch(r"\d+(\.\d+)?", tok)
-        ]
-
-        if not numeric_indices:
-            continue
-
-        last_idx = numeric_indices[-1]
-        amount_str = tokens[last_idx]
-        name_tokens = tokens[:last_idx]
-
-        if not name_tokens:
-            continue
-
-        try:
-            amount_val = float(amount_str)
-        except ValueError:
-            continue
-
-        item_name = " ".join(name_tokens)
-
-        bill_items.append(
-            {
-                "item_name": item_name,
-                "item_amount": amount_val,
-                "item_rate": 0.0,      # to be improved later
-                "item_quantity": 0.0,  # to be improved later
-            }
-        )
-
-    return bill_items
-
-
-# ---------------- LLM CALL (GEMINI) ----------------
-
-def call_gemini_for_items(pages_ocr):
-    """
-    pages_ocr: list of dicts:
-        { "page_no": "1", "page_type": "Bill Detail", "text": "<ocr_text>" }
-
-    Returns:
-        (pagewise_line_items, token_usage_dict)
-        or (None, zero_token_usage) if LLM is unavailable / fails.
-    """
-    zero_usage = {
-        "total_tokens": 0,
-        "input_tokens": 0,
-        "output_tokens": 0
-    }
-
-    if not GEMINI_API_KEY:
-        # No key configured → skip LLM and let caller fallback
-        return None, zero_usage
-
-    # Build a concise representation of pages for the prompt
-    pages_repr = [
-        {
-            "page_no": p["page_no"],
-            "page_type": p["page_type"],
-            "text": p["text"],
-        }
-        for p in pages_ocr
-    ]
-
-    system_instruction = (
-        "You are a medical bill extraction engine. "
-        "Given OCR text from each page of a bill, extract individual line items.\n\n"
-        "For each page, you must return bill_items with fields:\n"
-        "- item_name (string, as close as possible to bill text)\n"
-        "- item_rate (float; 0.0 if not clearly present)\n"
-        "- item_quantity (float; 1.0 if implicit; 0.0 if unknown)\n"
-        "- item_amount (float; net amount for that line)\n\n"
-        "Do NOT include grand totals, sub-totals, or net payable rows as separate items.\n"
-        "Only include the per-service / per-medicine lines.\n\n"
-        "Return ONLY valid JSON in this exact shape (no comments, no extra keys):\n"
-        "{\n"
-        "  \"pagewise_line_items\": [\n"
-        "    {\n"
-        "      \"page_no\": \"1\",\n"
-        "      \"page_type\": \"Bill Detail\",\n"
-        "      \"bill_items\": [\n"
-        "        {\n"
-        "          \"item_name\": \"...\",\n"
-        "          \"item_amount\": 123.45,\n"
-        "          \"item_rate\": 61.72,\n"
-        "          \"item_quantity\": 2.0\n"
-        "        }\n"
-        "      ]\n"
-        "    }\n"
-        "  ]\n"
-        "}\n"
-    )
-
-    user_prompt = (
-        "Use the following OCR text per page to extract line items into the required schema.\n"
-        "The data is provided as a JSON array under the key 'pages_ocr'.\n\n"
-        f"pages_ocr = {json.dumps(pages_repr, ensure_ascii=False)}"
-    )
-
-    try:
-        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
-        response = model.generate_content(
-            [
-                {"role": "system", "parts": [system_instruction]},
-                {"role": "user", "parts": [user_prompt]},
-            ]
-        )
-
-        raw_text = response.text.strip()
-
-        # Strip possible ```json ... ``` wrappers
-        if raw_text.startswith("```"):
-            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
-            raw_text = re.sub(r"```$", "", raw_text)
-            raw_text = raw_text.strip()
-
-        parsed = json.loads(raw_text)
-
-        pagewise = parsed.get("pagewise_line_items", [])
-        if not isinstance(pagewise, list):
-            return None, zero_usage
-
-        # We are on free tier, so we keep token_usage as zeros (schema only)
-        token_usage = zero_usage
-
-        return pagewise, token_usage
-
-    except Exception:
-        # Any LLM error → caller will fallback to regex
-        return None, zero_usage
-
-
-# ---------------- MAIN ENDPOINT ----------------
-
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
-    """
-    Main Datathon endpoint.
-
-    Flow:
-    - Download document from URL
-    - If PDF: convert each page to an image and run OCR
-    - If image: run OCR directly
-    - Build page-wise OCR text
-    - Try LLM (Gemini) to extract structured line items
-    - If LLM fails or key missing → fallback to regex-only extraction
-    - Return JSON in the exact schema expected by the evaluators
-    """
    doc_url = payload.document
-
-    # ---- Step 1: Download file ----
    try:
-        headers = {
-
-
-
-
-
-        return {
-            "is_success": False,
-            "token_usage": {
-                "total_tokens": 0,
-                "input_tokens": 0,
-                "output_tokens": 0
-            },
-            "data": {
-                "pagewise_line_items": [],
-                "total_item_count": 0
-            }
-        }
-
-        file_bytes = response.content
-
-    except Exception:
        return {
            "is_success": False,
-            "token_usage": {
-
-                "input_tokens": 0,
-                "output_tokens": 0
-            },
-            "data": {
-                "pagewise_line_items": [],
-                "total_item_count": 0
-            }
        }

-    #
-
-
-    # IMPORTANT: strip query (?sv=...) only for extension detection
    clean_url = doc_url.split("?", 1)[0].lower()
-
    try:
-        # PDF case
        if clean_url.endswith(".pdf"):
            pages = convert_from_bytes(file_bytes)
-
-
-
-
-                    "page_no": str(idx),
-                    "page_type": "Bill Detail",
-                    "text": text,
-                }
-            )
-
-        # Image case
-        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
-            image = Image.open(BytesIO(file_bytes))
-            text = pytesseract.image_to_string(image)
-            pagewise_ocr.append(
-                {
-                    "page_no": "1",
-                    "page_type": "Bill Detail",
-                    "text": text,
-                }
-            )
-
-        # Other file types → currently not handled
        else:
-
-
    except Exception:
-
-        return {
-            "is_success": False,
-            "token_usage": {
-                "total_tokens": 0,
-                "input_tokens": 0,
-                "output_tokens": 0
-            },
-            "data": {
-                "pagewise_line_items": [],
-                "total_item_count": 0
-            }
-        }

-    # ---- Step 3: LLM extraction + fallback ----
    pagewise_line_items = []
-
-        "total_tokens": 0,
-        "input_tokens": 0,
-        "output_tokens": 0
-    }

-
-
-

-    if pagewise_llm:
-        pagewise_line_items = pagewise_llm
-    else:
-        # Fallback: regex-based extraction
-        for p in pagewise_ocr:
-            items = extract_items_from_text(p["text"])
-            if items:
-                pagewise_line_items.append(
-                    {
-                        "page_no": p["page_no"],
-                        "page_type": p["page_type"],
-                        "bill_items": items,
-                    }
-                )
-
-    total_item_count = sum(
-        len(p.get("bill_items", [])) for p in pagewise_line_items
-    )
-
-    # ---- Step 4: Final response ----
    return {
        "is_success": True,
-        "token_usage":
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }

-
@app.get("/")
def health_check():
-    """
-    Simple health endpoint to verify that the API is running.
-    """
    return {
        "status": "ok",
-        "message": "Bajaj Datathon bill extraction API is live.",
-        "hint": "
    }
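
The removed extract_items_from_text fallback treats the last purely numeric token on a line as the amount and everything before it as the name. A small usage sketch (assuming the function exactly as defined in the removed code above) illustrates the behaviour:

ocr_text = """CONSULTATION CHARGES 500
CBC TEST 2 300.00
GRAND TOTAL 800"""

items = extract_items_from_text(ocr_text)
# -> [{'item_name': 'CONSULTATION CHARGES', 'item_amount': 500.0, 'item_rate': 0.0, 'item_quantity': 0.0},
#     {'item_name': 'CBC TEST 2', 'item_amount': 300.0, 'item_rate': 0.0, 'item_quantity': 0.0}]
# The "GRAND TOTAL" line is skipped by the total-keyword check; "CBC TEST 2" shows why the
# newer TSV-based parser also tries to recover separate rate and quantity columns.
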

The same hunks on the new side of the diff (added lines marked with +):

@@ -1,4 +1,4 @@
+# app.py (HIGH ACCURACY TSV + preprocessing + optional Gemini refinement)
import os
import re
import json
@@ -24,10 +24,11 @@ if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# ---------------- FASTAPI APP ----------------
+app = FastAPI(title="Bajaj Datathon - Bill Extractor (High Accuracy)")

class BillRequest(BaseModel):
    document: str
+
# ---------------- Helpers: number normalization & detection ----------------
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")  # matches numbers with commas, decimals
TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
@@ -244,6 +245,7 @@ def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any
        "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
        "item_quantity": float(qty_val)
    }
+
# ---------------- Duplicate suppression & subtotal detection ----------------
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
@@ -351,322 +353,153 @@ def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]
    except Exception:
        return page_items, zero_usage

+# ---------------- Main endpoint logic ----------------
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    doc_url = payload.document
+    # Step 1: download
    try:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        resp = requests.get(doc_url, headers=headers, timeout=30)
+        if resp.status_code != 200:
+            raise RuntimeError(f"download failed status={resp.status_code}")
+        file_bytes = resp.content
+    except Exception as e:
        return {
            "is_success": False,
+            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
+            "data": {"pagewise_line_items": [], "total_item_count": 0}
        }

+    # Step 2: convert PDF->images or handle single image
+    images = []
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            pages = convert_from_bytes(file_bytes)
+            images = pages
+        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
+            img = Image.open(BytesIO(file_bytes))
+            images = [img]
        else:
+            # try PDF conversion as fallback
+            try:
+                pages = convert_from_bytes(file_bytes)
+                images = pages
+            except Exception:
+                images = []
    except Exception:
+        images = []

    pagewise_line_items = []
+    cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

+    # Process each page
+    for idx, page_img in enumerate(images, start=1):
+        try:
+            # preprocess
+            processed_cv = preprocess_image(page_img)
+
+            # get TSV / word cells
+            cells = image_to_tsv_cells(processed_cv)
+
+            # reconstruct rows
+            rows = group_cells_into_rows(cells, y_tolerance=12)
+            rows_texts = [" ".join([c["text"] for c in r]) for r in rows]
+
+            # detect subtotal/final totals in page text
+            subtotals = detect_subtotals_and_totals(rows_texts)
+
+            # parse each row to items
+            parsed_items = []
+            for r in rows:
+                parsed = parse_row_to_item(r)
+                if parsed is None:
+                    continue
+                # filter out obvious total-like names
+                if TOTAL_KEYWORDS.search(parsed["item_name"]):
+                    continue
+                parsed_items.append(parsed)
+
+            # dedupe
+            parsed_items = dedupe_items(parsed_items)
+
+            # if no items found via TSV (e.g., OCR failed), fallback to plain OCR text + simple extractor
+            if not parsed_items:
+                try:
+                    raw_text = pytesseract.image_to_string(processed_cv)
+                    parsed_items = []
+                    # reuse your simpler extractor logic (very small and safe)
+                    for line in [ln.strip() for ln in raw_text.splitlines() if ln.strip()]:
+                        if TOTAL_KEYWORDS.search(line):
+                            continue
+                        toks = line.split()
+                        numeric_idxs = [i for i,t in enumerate(toks) if NUM_RE.search(t)]
+                        if numeric_idxs:
+                            last = numeric_idxs[-1]
+                            amt = normalize_num_str(toks[last])
+                            if amt is None:
+                                continue
+                            name = " ".join(toks[:last]).strip()
+                            if name == "":
+                                continue
+                            parsed_items.append({
+                                "item_name": name,
+                                "item_amount": float(round(amt, 2)),
+                                "item_rate": 0.0,
+                                "item_quantity": 1.0
+                            })
+                    parsed_items = dedupe_items(parsed_items)
+                except Exception:
+                    parsed_items = []
+
+            # optional Gemini refinement (page-level)
+            if GEMINI_API_KEY and parsed_items:
+                refined, token_u = refine_with_gemini(parsed_items)
+                parsed_items = refined
+                # accumulate token usage (placeholder zeros kept)
+                for k in cumulative_token_usage:
+                    cumulative_token_usage[k] += token_u.get(k, 0)
+
+            # Page type heuristics
+            page_type = "Bill Detail"
+            page_text_join = " ".join(rows_texts).lower()
+            if "pharmacy" in page_text_join or "medicine" in page_text_join or "tablet" in page_text_join:
+                page_type = "Pharmacy"
+            if "final bill" in page_text_join or "grand total" in page_text_join:
+                page_type = "Final Bill"
+
+            # attach to pagewise output
+            pagewise_line_items.append({
+                "page_no": str(idx),
+                "page_type": page_type,
+                "bill_items": parsed_items
+            })
+
+        except Exception:
+            # on per-page failure continue with empty list
+            pagewise_line_items.append({
+                "page_no": str(idx),
+                "page_type": "Bill Detail",
+                "bill_items": []
+            })
+            continue
+
+    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise_line_items)

    return {
        "is_success": True,
+        "token_usage": cumulative_token_usage,
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }

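The per-page loop above relies on helpers (preprocess_image, image_to_tsv_cells, group_cells_into_rows, detect_subtotals_and_totals, normalize_num_str, parse_row_to_item) that are defined earlier in app.py and do not appear in this diff. The sketch below shows one way the TSV-cell and row-grouping steps could be built on pytesseract.image_to_data; the cell keys and the y_tolerance heuristic mirror the calls above, but the bodies and the _sketch names are assumptions, not the app's actual implementation.

import pytesseract

def image_to_tsv_cells_sketch(img):
    # Word-level boxes from Tesseract's TSV output.
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    cells = []
    for i, word in enumerate(data["text"]):
        word = word.strip()
        if not word:
            continue
        cells.append({
            "text": word,
            "left": data["left"][i],
            "top": data["top"][i],
            "conf": float(data["conf"][i]),
        })
    return cells

def group_cells_into_rows_sketch(cells, y_tolerance=12):
    # Sort by vertical position, then start a new row whenever the gap exceeds y_tolerance.
    rows = []
    for cell in sorted(cells, key=lambda c: (c["top"], c["left"])):
        if rows and abs(cell["top"] - rows[-1][-1]["top"]) <= y_tolerance:
            rows[-1].append(cell)
        else:
            rows.append([cell])
    # Restore left-to-right reading order inside each row.
    return [sorted(row, key=lambda c: c["left"]) for row in rows]
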
@app.get("/")
def health_check():
    return {
        "status": "ok",
+        "message": "Bajaj Datathon bill extraction API (high-accuracy) is live.",
+        "hint": "POST /extract-bill-data with { 'document': '<url>' }"
    }
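
Taken together, the new endpoint accepts a JSON body matching BillRequest and always returns the is_success / token_usage / data envelope shown in the diff. A quick client-side check (host and document URL are placeholders, purely for illustration):

import requests

api = "http://localhost:7860"  # placeholder host
payload = {"document": "https://example.com/sample-bill.pdf"}  # placeholder URL

resp = requests.post(f"{api}/extract-bill-data", json=payload, timeout=120)
body = resp.json()

# Expected envelope (values depend on the bill):
# {
#   "is_success": true,
#   "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
#   "data": {
#     "pagewise_line_items": [
#       {"page_no": "1", "page_type": "Bill Detail", "bill_items": [...]}
#     ],
#     "total_item_count": 3
#   }
# }
print(body["is_success"], body["data"]["total_item_count"])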