Sathvik-kota committed on
Commit
4de63d1
·
verified ·
1 Parent(s): b64719f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +123 -290
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py
2
  import os
3
  import re
4
  import json
@@ -24,10 +24,11 @@ if GEMINI_API_KEY:
24
  genai.configure(api_key=GEMINI_API_KEY)
25
 
26
  # ---------------- FASTAPI APP ----------------
27
- app = FastAPI(title="Bajaj Datathon - Bill Extractor")
28
 
29
class BillRequest(BaseModel):
    """Request body for POST /extract-bill-data."""
    document: str  # URL of the bill document (PDF or image) to download and process
 
31
  # ---------------- Helpers: number normalization & detection ----------------
32
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?") # matches numbers with commas, decimals
33
  TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
@@ -244,6 +245,7 @@ def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any
244
  "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
245
  "item_quantity": float(qty_val)
246
  }
 
247
  # ---------------- Duplicate suppression & subtotal detection ----------------
248
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
249
  """
@@ -351,322 +353,153 @@ def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]
351
  except Exception:
352
  return page_items, zero_usage
353
 
354
-
355
# ---------------- FALLBACK REGEX EXTRACTOR ----------------

# Amount tokens: "123", "45.67", and comma-grouped forms like "1,234.50".
# The previous pattern (\d+(\.\d+)?) silently dropped comma-grouped amounts,
# which are common on bills; this keeps it consistent with NUM_RE above.
_FALLBACK_AMOUNT_RE = re.compile(r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")


def extract_items_from_text(text: str):
    """
    Very simple rule-based extractor used as a fallback
    when the LLM is not available or fails.

    Logic:
    - Split OCR text into lines
    - Skip lines that look like totals
    - For each remaining line, treat the last numeric token as item_amount
    - Everything before that numeric token is item_name

    Returns:
        list of dicts with keys item_name (str), item_amount (float),
        item_rate (0.0 placeholder), item_quantity (0.0 placeholder).
    """
    bill_items = []

    for line in (ln.strip() for ln in text.splitlines()):
        if not line:
            continue

        # Skip obvious total lines so they are not reported as items.
        if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
            continue

        tokens = line.split()
        if not tokens:
            continue

        # Indices of tokens that are purely numeric (commas/decimals allowed).
        numeric_indices = [
            i for i, tok in enumerate(tokens)
            if _FALLBACK_AMOUNT_RE.fullmatch(tok)
        ]
        if not numeric_indices:
            continue

        last_idx = numeric_indices[-1]
        name_tokens = tokens[:last_idx]
        if not name_tokens:
            continue

        try:
            # Strip grouping commas before converting ("1,234.50" -> 1234.50).
            amount_val = float(tokens[last_idx].replace(",", ""))
        except ValueError:
            continue

        bill_items.append(
            {
                "item_name": " ".join(name_tokens),
                "item_amount": amount_val,
                "item_rate": 0.0,       # not derivable from a single text line
                "item_quantity": 0.0,   # not derivable from a single text line
            }
        )

    return bill_items
414
-
415
-
416
# ---------------- LLM CALL (GEMINI) ----------------

def call_gemini_for_items(pages_ocr):
    """
    pages_ocr: list of dicts:
        { "page_no": "1", "page_type": "Bill Detail", "text": "<ocr_text>" }

    Returns:
        (pagewise_line_items, token_usage_dict)
        or (None, zero_token_usage) if LLM is unavailable / fails.
    """
    zero_usage = {
        "total_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0
    }

    if not GEMINI_API_KEY:
        # No key configured -> skip LLM and let caller fallback
        return None, zero_usage

    # Build a concise representation of pages for the prompt
    pages_repr = [
        {
            "page_no": p["page_no"],
            "page_type": p["page_type"],
            "text": p["text"],
        }
        for p in pages_ocr
    ]

    system_instruction = (
        "You are a medical bill extraction engine. "
        "Given OCR text from each page of a bill, extract individual line items.\n\n"
        "For each page, you must return bill_items with fields:\n"
        "- item_name (string, as close as possible to bill text)\n"
        "- item_rate (float; 0.0 if not clearly present)\n"
        "- item_quantity (float; 1.0 if implicit; 0.0 if unknown)\n"
        "- item_amount (float; net amount for that line)\n\n"
        "Do NOT include grand totals, sub-totals, or net payable rows as separate items.\n"
        "Only include the per-service / per-medicine lines.\n\n"
        "Return ONLY valid JSON in this exact shape (no comments, no extra keys):\n"
        "{\n"
        "  \"pagewise_line_items\": [\n"
        "    {\n"
        "      \"page_no\": \"1\",\n"
        "      \"page_type\": \"Bill Detail\",\n"
        "      \"bill_items\": [\n"
        "        {\n"
        "          \"item_name\": \"...\",\n"
        "          \"item_amount\": 123.45,\n"
        "          \"item_rate\": 61.72,\n"
        "          \"item_quantity\": 2.0\n"
        "        }\n"
        "      ]\n"
        "    }\n"
        "  ]\n"
        "}\n"
    )

    user_prompt = (
        "Use the following OCR text per page to extract line items into the required schema.\n"
        "The data is provided as a JSON array under the key 'pages_ocr'.\n\n"
        f"pages_ocr = {json.dumps(pages_repr, ensure_ascii=False)}"
    )

    try:
        # BUGFIX: google-generativeai does not accept a "system" role inside
        # generate_content(); system prompts must be supplied via the
        # system_instruction constructor argument. The previous code raised
        # on every call, silently forcing the regex fallback.
        model = genai.GenerativeModel(
            GEMINI_MODEL_NAME,
            system_instruction=system_instruction,
        )
        response = model.generate_content(user_prompt)

        raw_text = response.text.strip()

        # Strip possible ```json ... ``` wrappers around the model output
        if raw_text.startswith("```"):
            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
            raw_text = re.sub(r"```$", "", raw_text)
            raw_text = raw_text.strip()

        parsed = json.loads(raw_text)

        pagewise = parsed.get("pagewise_line_items", [])
        if not isinstance(pagewise, list):
            return None, zero_usage

        # We are on free tier, so we keep token_usage as zeros (schema only)
        return pagewise, zero_usage

    except Exception:
        # Any LLM error -> caller will fallback to regex
        return None, zero_usage
513
-
514
-
515
# ---------------- MAIN ENDPOINT ----------------

def _failure_response():
    """Uniform failure payload: zero token usage and empty extraction data."""
    return {
        "is_success": False,
        "token_usage": {
            "total_tokens": 0,
            "input_tokens": 0,
            "output_tokens": 0
        },
        "data": {
            "pagewise_line_items": [],
            "total_item_count": 0
        }
    }


@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """
    Main Datathon endpoint.

    Flow:
    - Download document from URL
    - If PDF: convert each page to an image and run OCR
    - If image: run OCR directly
    - Build page-wise OCR text
    - Try LLM (Gemini) to extract structured line items
    - If LLM fails or key missing -> fallback to regex-only extraction
    - Return JSON in the exact schema expected by the evaluators
    """
    doc_url = payload.document

    # ---- Step 1: Download file ----
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        }
        response = requests.get(doc_url, headers=headers, timeout=20)
        if response.status_code != 200:
            return _failure_response()
        file_bytes = response.content
    except Exception:
        return _failure_response()

    # ---- Step 2: OCR (PDF + images) ----
    pagewise_ocr = []  # list of {page_no, page_type, text}

    # IMPORTANT: strip query (?sv=...) only for extension detection
    clean_url = doc_url.split("?", 1)[0].lower()

    try:
        # PDF case
        if clean_url.endswith(".pdf"):
            pages = convert_from_bytes(file_bytes)
            for idx, page_img in enumerate(pages, start=1):
                text = pytesseract.image_to_string(page_img)
                pagewise_ocr.append(
                    {
                        "page_no": str(idx),
                        "page_type": "Bill Detail",
                        "text": text,
                    }
                )

        # Image case
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
            pagewise_ocr.append(
                {
                    "page_no": "1",
                    "page_type": "Bill Detail",
                    "text": text,
                }
            )

        # Other file types -> currently not handled
        else:
            pagewise_ocr = []
    except Exception:
        # OCR failure
        return _failure_response()

    # ---- Step 3: LLM extraction + fallback ----
    pagewise_line_items = []
    token_usage = {
        "total_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0
    }

    if pagewise_ocr:
        # Try Gemini first (if key is set)
        pagewise_llm, token_usage = call_gemini_for_items(pagewise_ocr)

        if pagewise_llm:
            pagewise_line_items = pagewise_llm
        else:
            # Fallback: regex-based extraction
            for p in pagewise_ocr:
                items = extract_items_from_text(p["text"])
                if items:
                    pagewise_line_items.append(
                        {
                            "page_no": p["page_no"],
                            "page_type": p["page_type"],
                            "bill_items": items,
                        }
                    )

    # Computed unconditionally so an empty OCR result can never leave
    # total_item_count undefined (NameError) at the return below.
    total_item_count = sum(
        len(p.get("bill_items", [])) for p in pagewise_line_items
    )

    # ---- Step 4: Final response ----
    return {
        "is_success": True,
        "token_usage": token_usage,
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }
661
 
662
-
663
@app.get("/")
def health_check():
    """Liveness probe: confirms the API process is up and serving requests."""
    status_payload = {
        "status": "ok",
        "message": "Bajaj Datathon bill extraction API is live.",
        "hint": "Use POST /extract-bill-data with { 'document': '<url>' }",
    }
    return status_payload
 
1
+ # app.py (HIGH ACCURACY TSV + preprocessing + optional Gemini refinement)
2
  import os
3
  import re
4
  import json
 
24
  genai.configure(api_key=GEMINI_API_KEY)
25
 
26
  # ---------------- FASTAPI APP ----------------
27
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (High Accuracy)")
28
 
29
  class BillRequest(BaseModel):
30
  document: str
31
+
32
  # ---------------- Helpers: number normalization & detection ----------------
33
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?") # matches numbers with commas, decimals
34
  TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
 
245
  "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
246
  "item_quantity": float(qty_val)
247
  }
248
+
249
  # ---------------- Duplicate suppression & subtotal detection ----------------
250
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
251
  """
 
353
  except Exception:
354
  return page_items, zero_usage
355
 
356
+ # ---------------- Main endpoint logic ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
def _fallback_items_from_raw_text(raw_text):
    """Plain-OCR fallback: last numeric token per line = amount, rest = name."""
    items = []
    for line in [ln.strip() for ln in raw_text.splitlines() if ln.strip()]:
        if TOTAL_KEYWORDS.search(line):
            continue
        toks = line.split()
        numeric_idxs = [i for i, t in enumerate(toks) if NUM_RE.search(t)]
        if not numeric_idxs:
            continue
        last = numeric_idxs[-1]
        amt = normalize_num_str(toks[last])
        if amt is None:
            continue
        name = " ".join(toks[:last]).strip()
        if name == "":
            continue
        items.append({
            "item_name": name,
            "item_amount": float(round(amt, 2)),
            "item_rate": 0.0,
            "item_quantity": 1.0
        })
    return dedupe_items(items)


def _infer_page_type(page_text_lower):
    """Keyword heuristic for page classification; 'Final Bill' wins over 'Pharmacy'."""
    page_type = "Bill Detail"
    if "pharmacy" in page_text_lower or "medicine" in page_text_lower or "tablet" in page_text_lower:
        page_type = "Pharmacy"
    if "final bill" in page_text_lower or "grand total" in page_text_lower:
        page_type = "Final Bill"
    return page_type


@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """
    High-accuracy extraction endpoint.

    Flow:
    - Download the document from the given URL.
    - Convert PDF pages (or a single image) to PIL images.
    - Per page: preprocess -> Tesseract TSV cells -> row grouping ->
      row parsing -> dedupe; fall back to plain OCR text when TSV yields nothing.
    - Optionally refine items with Gemini when an API key is configured.
    - Return the evaluator schema with cumulative token usage.
    """
    doc_url = payload.document

    # Step 1: download
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(doc_url, headers=headers, timeout=30)
        if resp.status_code != 200:
            raise RuntimeError(f"download failed status={resp.status_code}")
        file_bytes = resp.content
    except Exception:
        return {
            "is_success": False,
            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
            "data": {"pagewise_line_items": [], "total_item_count": 0}
        }

    # Step 2: convert PDF->images or handle single image
    images = []
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            images = convert_from_bytes(file_bytes)
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
            images = [Image.open(BytesIO(file_bytes))]
        else:
            # Unknown extension: try PDF conversion as a last resort.
            try:
                images = convert_from_bytes(file_bytes)
            except Exception:
                images = []
    except Exception:
        images = []

    pagewise_line_items = []
    cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    # Step 3: process each page independently; one bad page never aborts the run.
    for idx, page_img in enumerate(images, start=1):
        try:
            # preprocess, then get TSV / word cells and reconstruct rows
            processed_cv = preprocess_image(page_img)
            cells = image_to_tsv_cells(processed_cv)
            rows = group_cells_into_rows(cells, y_tolerance=12)
            rows_texts = [" ".join(c["text"] for c in r) for r in rows]

            # NOTE(review): the return value was never used; the call is kept
            # in case detect_subtotals_and_totals has side effects — confirm.
            detect_subtotals_and_totals(rows_texts)

            # parse each row to items, dropping obvious total-like names
            parsed_items = []
            for r in rows:
                parsed = parse_row_to_item(r)
                if parsed is None:
                    continue
                if TOTAL_KEYWORDS.search(parsed["item_name"]):
                    continue
                parsed_items.append(parsed)
            parsed_items = dedupe_items(parsed_items)

            # If TSV parsing produced nothing (e.g. OCR failed), fall back to
            # plain OCR text with the simple line extractor.
            if not parsed_items:
                try:
                    raw_text = pytesseract.image_to_string(processed_cv)
                    parsed_items = _fallback_items_from_raw_text(raw_text)
                except Exception:
                    parsed_items = []

            # Optional Gemini refinement (page-level); accumulate token usage
            # (placeholder zeros kept on the free tier).
            if GEMINI_API_KEY and parsed_items:
                parsed_items, token_u = refine_with_gemini(parsed_items)
                for k in cumulative_token_usage:
                    cumulative_token_usage[k] += token_u.get(k, 0)

            pagewise_line_items.append({
                "page_no": str(idx),
                "page_type": _infer_page_type(" ".join(rows_texts).lower()),
                "bill_items": parsed_items
            })

        except Exception:
            # on per-page failure continue with empty list
            pagewise_line_items.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": []
            })
            continue

    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise_line_items)

    return {
        "is_success": True,
        "token_usage": cumulative_token_usage,
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }
498
 
 
499
@app.get("/")
def health_check():
    """Report service liveness plus a usage hint for the main endpoint."""
    status_payload = {
        "status": "ok",
        "message": "Bajaj Datathon bill extraction API (high-accuracy) is live.",
        "hint": "POST /extract-bill-data with { 'document': '<url>' }",
    }
    return status_payload