Sathvik-kota committed on
Commit
8803a3c
·
verified ·
1 Parent(s): 84d69f8

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +441 -224
app.py CHANGED
@@ -1,46 +1,16 @@
1
- # app_bill_extractor_final_v2.py
2
- # Humanized, high-accuracy bill extraction API.
3
- # Robust OCR preprocessing, TSV layout parsing, numeric-column inference,
4
- # header prefiltering, deterministic Gemini refinement (if configured).
5
-
6
  import os
7
  import re
8
  import json
9
  from io import BytesIO
10
  from typing import List, Dict, Any, Optional, Tuple
11
 
12
- from fastapi import FastAPI
13
- from pydantic import BaseModel
14
- import requests
15
  from PIL import Image
16
- from pdf2image import convert_from_bytes
17
- import pytesseract
18
- from pytesseract import Output
19
  import numpy as np
20
  import cv2
 
 
21
 
22
- # Optional: Google Gemini SDK (if available)
23
- try:
24
- import google.generativeai as genai
25
- except Exception:
26
- genai = None
27
-
28
- # ---------------- LLM CONFIG ----------------
29
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
- GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
31
- if GEMINI_API_KEY and genai is not None:
32
- try:
33
- genai.configure(api_key=GEMINI_API_KEY)
34
- except Exception:
35
- pass
36
-
37
- # ---------------- FastAPI app ----------------
38
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, humanized)")
39
-
40
- class BillRequest(BaseModel):
41
- document: str
42
-
43
- # ---------------- Regex and keywords ----------------
44
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
45
  TOTAL_KEYWORDS = re.compile(
46
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
@@ -48,8 +18,10 @@ TOTAL_KEYWORDS = re.compile(
48
  )
49
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
50
 
51
- # generalized header-related tokens & exact header phrase blacklist (common variants)
52
- HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
 
 
53
  HEADER_PHRASES = [
54
  "description qty / hrs consultation rate discount net amt",
55
  "description qty / hrs rate discount net amt",
@@ -59,7 +31,7 @@ HEADER_PHRASES = [
59
  ]
60
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
61
 
62
- # ---------------- small utilities ----------------
63
  def sanitize_ocr_text(s: str) -> str:
64
  if not s:
65
  return ""
@@ -96,13 +68,41 @@ def is_numeric_token(t: Optional[str]) -> bool:
96
  return bool(t and NUM_RE.search(str(t)))
97
 
98
  def clean_name_text(s: str) -> str:
99
- s = s.replace("", "-")
 
 
 
 
 
 
100
  s = re.sub(r"\s+", " ", s)
101
  s = s.strip(" -:,.")
 
102
  s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
103
  s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
104
- # fix common OCR mistakes for doctor prefixes
105
- s = re.sub(r"\bOR\b", "DR", s) # sometimes OCR turns 'DR' -> 'OR'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  return s.strip()
107
 
108
  # ---------------- image preprocessing ----------------
@@ -121,22 +121,27 @@ def preprocess_image(pil_img: Image.Image) -> Any:
121
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
122
  cv_img = pil_to_cv2(pil_img)
123
  gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
124
- gray = cv2.fastNlMeansDenoising(gray, h=10)
125
  try:
126
- bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 41, 15)
 
 
 
 
 
127
  except Exception:
128
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
129
  kernel = np.ones((1,1), np.uint8)
130
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
131
  return bw
132
 
133
- # ---------------- OCR TSV ----------------
134
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
135
  try:
136
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
137
  except Exception:
138
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
139
- cells = []
140
  n = len(o.get("text", []))
141
  for i in range(n):
142
  raw = o["text"][i]
@@ -155,15 +160,24 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
155
  height = int(o.get("height", [0])[i])
156
  center_y = top + height / 2.0
157
  center_x = left + width / 2.0
158
- cells.append({"text": txt, "conf": conf, "left": left, "top": top, "width": width, "height": height, "center_y": center_y, "center_x": center_x})
 
 
 
 
 
 
 
 
 
159
  return cells
160
 
161
- # ---------------- grouping & merge helpers ----------------
162
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
163
  if not cells:
164
  return []
165
  sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
166
- rows = []
167
  current = [sorted_cells[0]]
168
  last_y = sorted_cells[0]["center_y"]
169
  for c in sorted_cells[1:]:
@@ -178,63 +192,106 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
178
  rows.append(sorted(current, key=lambda cc: cc["left"]))
179
  return rows
180
 
 
181
  def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
 
 
 
 
 
182
  if not rows:
183
  return rows
184
- merged = []
185
  i = 0
186
  while i < len(rows):
187
  row = rows[i]
188
  tokens = [c["text"] for c in row]
 
189
  has_num = any(is_numeric_token(t) for t in tokens)
190
- # if row looks pure text and next row contains numbers but short left text tokens, merge
 
 
 
 
191
  if not has_num and i + 1 < len(rows):
192
  next_row = rows[i+1]
193
- next_tokens = [c["text"] for c in next_row]
 
 
 
194
  next_has_num = any(is_numeric_token(t) for t in next_tokens)
195
- if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
196
- merged_row = []
197
- min_left = min((c["left"] for c in next_row), default=0)
198
- offset = 10
199
- for c in row:
200
- newc = c.copy()
201
- newc["left"] = min_left - offset
202
- newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
203
- merged_row.append(newc)
204
- offset += 10
205
- merged_row.extend(next_row)
206
- merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
207
- i += 2
208
- continue
209
- # Additional merge: If a row ends with a trailing token like a doctor's name line with single token and next row also text, merge (helps names split across 2+ lines)
210
  if not has_num and i + 1 < len(rows):
211
  next_row = rows[i+1]
212
  next_tokens = [c["text"] for c in next_row]
213
  next_has_num = any(is_numeric_token(t) for t in next_tokens)
214
- if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
215
- # merge both textual lines into one (keeps relative left ordering by shifting)
216
- merged_row = []
217
- min_left = min((c["left"] for c in next_row + row), default=0)
218
- offset = 10
219
- for c in row + next_row:
220
- newc = c.copy()
221
- if newc["left"] > min_left:
222
- newc["left"] = newc["left"]
223
- else:
224
- newc["left"] = min_left - offset
225
- newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
226
- merged_row.append(newc)
227
- offset += 5
228
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
229
  i += 2
230
  continue
 
 
231
  merged.append(row)
232
  i += 1
 
233
  return merged
234
 
235
- # ---------------- numeric column detection ----------------
236
- # >>> CHANGE: adaptive clustering (restored to conservative adaptive threshold)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
 
 
 
238
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
239
  if not xs:
240
  return []
@@ -265,9 +322,29 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
265
  distances = [abs(token_x - cx) for cx in column_centers]
266
  return int(np.argmin(distances))
267
 
268
- # ---------------- parsing rows into items ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
270
- parsed_items = []
 
 
 
 
271
  rows = merge_multiline_names(rows)
272
  column_centers = detect_numeric_columns(page_cells, max_columns=4)
273
 
@@ -276,8 +353,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
276
  if not tokens:
277
  continue
278
  joined_lower = " ".join(tokens).lower()
 
279
  if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
280
  continue
 
281
  if all(not is_numeric_token(t) for t in tokens):
282
  continue
283
 
@@ -288,10 +367,14 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
288
  v = normalize_num_str(t)
289
  if v is not None:
290
  numeric_values.append(float(v))
291
- # de-duplicate and sort descending (larger candidates first)
292
- numeric_values = sorted(list({int(x) if float(x).is_integer() else x for x in numeric_values}), reverse=True)
 
 
 
293
 
294
  if column_centers:
 
295
  left_text_parts = []
296
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
297
  for c in row:
@@ -300,7 +383,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
300
  if is_numeric_token(t):
301
  col_idx = assign_token_to_column(cx, column_centers)
302
  if col_idx is None:
303
- numeric_bucket_map[len(column_centers) - 1].append(t)
304
  else:
305
  numeric_bucket_map[col_idx].append(t)
306
  else:
@@ -317,6 +400,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
317
  rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
318
  qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
319
 
 
320
  if amount is None:
321
  for t in reversed(tokens):
322
  if is_numeric_token(t):
@@ -324,70 +408,91 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
324
  if amount is not None:
325
  break
326
 
327
- # >>> CHANGE: safer inference skip tiny candidates like 1, enforce qty bounds, require close ratio
328
- if amount is not None and numeric_values:
329
- # Only accept candidate as rate if candidate >= 2 (or amount is tiny) and candidate < amount
330
- for cand in numeric_values:
331
- try:
332
- cand_float = float(cand)
333
- except:
334
- continue
335
- if cand_float <= 1.0:
 
 
 
 
 
 
 
 
336
  continue
337
- if amount <= 5 and cand_float < 1.0:
338
  continue
339
- if cand_float >= amount:
340
- continue
341
- ratio = amount / cand_float if cand_float else None
342
  if ratio is None:
343
  continue
344
  r = round(ratio)
345
  if r < 1 or r > 200:
346
  continue
347
- # require relative closeness threshold (adaptive)
 
 
348
  if abs(ratio - r) <= max(0.03 * r, 0.15):
349
- # Accept only if qty reasonable (<=100)
350
- if r <= 100:
351
- rate = cand_float
352
- qty = float(r)
353
- break
354
 
355
  # fallback compute rate if qty found but rate missing
356
- if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
357
  try:
358
- candidate_rate = amount / qty
359
- # require candidate_rate > 1 (avoid tiny rates) and reasonable
360
- if candidate_rate >= 2:
361
- rate = candidate_rate
362
  except Exception:
363
  pass
364
 
 
 
 
 
365
  # final defaults
366
- if qty is None:
367
- qty = 1.0
 
 
368
 
369
- # normalize and sanity-check
370
  try:
371
- amount = float(round(amount, 2))
372
  except Exception:
373
- continue
374
  try:
375
- rate = float(round(rate, 2)) if rate is not None else 0.0
376
  except Exception:
377
- rate = 0.0
378
  try:
379
- qty = float(qty)
380
  except Exception:
381
- qty = 1.0
 
 
 
 
 
 
 
 
 
382
 
383
  parsed_items.append({
384
  "item_name": name if name else "UNKNOWN",
385
- "item_amount": amount,
386
- "item_rate": rate if rate is not None else 0.0,
387
- "item_quantity": qty if qty is not None else 1.0,
388
  })
389
 
390
  else:
 
391
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
392
  if not numeric_idxs:
393
  continue
@@ -398,29 +503,28 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
398
  name = " ".join(tokens[:last]).strip()
399
  if not name:
400
  continue
401
- rate = None; qty = None
402
-
403
- # try to pick rate/qty from previous numeric tokens (right-to-left)
404
- # and use the safer inference logic (ignore candidate == 1)
405
  right_nums = []
406
  for i in numeric_idxs:
407
  v = normalize_num_str(tokens[i])
408
  if v is not None:
409
  right_nums.append(float(v))
410
- right_nums = sorted(list({int(x) if float(x).is_integer() else x for x in right_nums}), reverse=True)
411
 
412
- # attempt direct mapping: last numeric = amount, previous maybe rate / qty
 
 
 
413
  if len(right_nums) >= 2:
414
  cand = right_nums[1]
415
  if float(cand) > 1 and float(cand) < float(amt):
416
- # check ratio
417
  ratio = float(amt) / float(cand) if cand else None
418
  if ratio:
419
  r = round(ratio)
420
  if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
421
  rate = float(cand)
422
  qty = float(r)
423
- # fallback: conservative search like above
424
  if rate is None and right_nums:
425
  for cand in right_nums:
426
  if cand <= 1.0 or cand >= float(amt):
@@ -437,6 +541,17 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
437
  if rate is None:
438
  rate = 0.0
439
 
 
 
 
 
 
 
 
 
 
 
 
440
  parsed_items.append({
441
  "item_name": clean_name_text(name),
442
  "item_amount": float(round(amt, 2)),
@@ -449,10 +564,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
449
  # ---------------- dedupe & totals ----------------
450
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
451
  seen = set()
452
- out = []
453
  for it in items:
454
- nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
455
- key = (nm[:120], round(float(it["item_amount"]), 2))
456
  if key in seen:
457
  continue
458
  seen.add(key)
@@ -476,8 +591,11 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[flo
476
  if final is None: final = float(round(v, 2))
477
  return {"subtotal": subtotal, "final_total": final}
478
 
479
- # ---------------- Gemini refinement (deterministic) ----------------
480
  def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
 
 
 
481
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
482
  if not GEMINI_API_KEY or genai is None:
483
  return page_items, zero_usage
@@ -486,18 +604,36 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
486
  system_prompt = (
487
  "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
488
  "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
489
- "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
490
- )
491
- user_prompt = (
492
- f"page_text='''{safe_text}'''\n"
493
- f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
494
- "Example:\n"
495
- "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
496
- " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
497
- "=>\n"
498
- "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
499
- "Return only the cleaned JSON array of items."
500
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
502
  response = model.generate_content(
503
  [
@@ -524,79 +660,85 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
524
  })
525
  except Exception:
526
  continue
 
527
  return cleaned, zero_usage
528
  return page_items, zero_usage
529
  except Exception:
530
  return page_items, zero_usage
531
 
532
- # ---------------- header heuristics & final filter ----------------
533
- def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
534
- if not txt:
535
- return False
536
- t = re.sub(r"\s+", " ", txt.strip().lower())
537
- # exact phrase blacklist
538
- if any(h == t for h in HEADER_PHRASES):
539
- return True
540
- hits = sum(1 for k in HEADER_KEYWORDS if k in t)
541
- if hits >= 2:
542
- return True
543
- tokens = re.split(r"[\s\|,/:]+", t)
544
- key_hit_count = sum(1 for tok in tokens if tok in HEADER_KEYWORDS)
545
- if key_hit_count >= 3:
546
- return True
547
- if top_of_page and len(tokens) <= 10 and key_hit_count >= 2:
548
- return True
549
- if ("rate" in t or "net" in t) and "amt" in t and not any(ch.isdigit() for ch in t):
550
- return True
551
- if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
552
- return True
553
- return False
554
 
555
- def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [], other_item_names: List[str] = []) -> bool:
556
- name = (item.get("item_name") or "").strip()
557
- if not name:
558
- return False
559
- ln = name.lower()
560
- # header exact detection
561
- for h in known_page_headers:
562
- if h and h.strip() and h.strip().lower() in ln:
563
- return False
564
- if FOOTER_KEYWORDS.search(ln):
565
- return False
566
- if item.get("item_amount", 0) > 1_000_000:
567
- return False
568
- if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
569
- return False
570
- # avoid pure section headers (short & header words)
571
- words = ln.split()
572
- header_word_hits = sum(1 for k in HEADER_KEYWORDS if k in ln)
573
- if header_word_hits >= 1 and len(words) <= 3:
574
- # if page contains more detailed items with 'room'/'rent'/'nursing' etc, remove this generic header
575
- lower_other = " ".join(other_item_names).lower()
576
- if any(k in lower_other for k in ["room", "rent", "nursing", "ward", "surgeon", "anaes", "ot", "charges", "procedure", "radiology"]):
577
- return False
578
- # also if name is exactly one of the short header words, drop
579
- if ln in ("charge", "charges", "services", "consultation", "room", "radiology", "surgery"):
580
- return False
581
- # drop non-informative labels even if they have amount (summary rows)
582
- if len(words) <= 4 and re.search(r"\b(charges|services|room|radiolog|laborat|surgery|procedure|rent|nursing)\b", ln):
583
- # try to detect if it's a summary (presence of other more specific items)
584
- lower_other = " ".join(other_item_names).lower()
585
- if any(tok in lower_other for tok in ["rent", "room", "ward", "nursing", "surgeon", "anaes", "ot"]):
586
- return False
587
- if float(item.get("item_amount", 0)) <= 0.0:
588
- return False
589
- # sanity check rate vs amount
590
- rate = float(item.get("item_rate", 0) or 0)
591
- amt = float(item.get("item_amount", 0) or 0)
592
- if rate and rate > amt * 10 and amt < 10000:
593
- return False
594
- return True
595
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  # ---------------- main endpoint ----------------
597
  @app.post("/extract-bill-data")
598
  async def extract_bill_data(payload: BillRequest):
599
  doc_url = payload.document
 
 
600
  try:
601
  headers = {"User-Agent": "Mozilla/5.0"}
602
  resp = requests.get(doc_url, headers=headers, timeout=30)
@@ -604,8 +746,17 @@ async def extract_bill_data(payload: BillRequest):
604
  raise RuntimeError(f"download failed status={resp.status_code}")
605
  file_bytes = resp.content
606
  except Exception:
607
- return {"is_success": False, "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}, "data": {"pagewise_line_items": [], "total_item_count": 0}}
608
-
 
 
 
 
 
 
 
 
 
609
  images = []
610
  clean_url = doc_url.split("?", 1)[0].lower()
611
  try:
@@ -616,7 +767,7 @@ async def extract_bill_data(payload: BillRequest):
616
  else:
617
  try:
618
  images = convert_from_bytes(file_bytes)
619
- except Exception:
620
  images = []
621
  except Exception:
622
  images = []
@@ -624,49 +775,68 @@ async def extract_bill_data(payload: BillRequest):
624
  pagewise = []
625
  cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
626
 
 
627
  for idx, page_img in enumerate(images, start=1):
628
  try:
629
  proc = preprocess_image(page_img)
 
 
630
  cells = image_to_tsv_cells(proc)
631
  rows = group_cells_into_rows(cells, y_tolerance=12)
 
632
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
633
 
634
- # === HEADER PREFILTER: remove header-like rows anywhere on page ===
635
  rows_filtered = []
636
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
637
  top_flag = (i < 6)
638
  rt_norm = sanitize_ocr_text(rt).lower()
 
 
639
  if looks_like_header_text(rt_norm, top_of_page=top_flag):
640
  continue
 
 
641
  if any(h in rt_norm for h in HEADER_PHRASES):
642
  continue
 
643
  rows_filtered.append(r)
644
- # recompute row texts and a simple page_text
645
  rows = rows_filtered
646
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
647
  page_text = sanitize_ocr_text(" ".join(rows_texts))
648
 
649
- # detect page-level top headers (for final filtering)
650
  top_headers = []
651
  for i, rt in enumerate(rows_texts[:6]):
652
- if looks_like_header_text(rt, top_of_page=(i < 4)):
653
  top_headers.append(rt.strip().lower())
654
 
 
655
  parsed_items = parse_rows_with_columns(rows, cells)
656
 
657
- # ALWAYS attempt Gemini refinement if available (deterministic settings)
658
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
659
  for k in cumulative_token_usage:
660
  cumulative_token_usage[k] += token_u.get(k, 0)
661
 
662
- # Prepare other_item_names for contextual filtering (helps remove generic section headers)
663
- other_item_names = [it.get("item_name","") for it in refined_items]
 
 
 
 
 
664
 
665
- # final cleaning & dedupe
666
- cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names)]
667
  cleaned = dedupe_items(cleaned)
 
 
668
  cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
669
 
 
 
 
 
670
  page_type = "Bill Detail"
671
  page_txt = page_text.lower()
672
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
@@ -674,16 +844,55 @@ async def extract_bill_data(payload: BillRequest):
674
  if "final bill" in page_txt or "grand total" in page_txt:
675
  page_type = "Final Bill"
676
 
677
- pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
 
 
 
 
 
 
 
 
 
 
 
 
 
678
  except Exception:
679
- pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
 
 
 
 
 
 
680
  continue
681
 
 
682
  total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
 
 
 
 
 
 
 
 
 
 
683
  if not GEMINI_API_KEY or genai is None:
684
  cumulative_token_usage["warning_no_gemini"] = 1
685
 
686
- return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
 
 
 
 
 
 
 
 
 
687
 
688
  # ---------------- debug TSV ----------------
689
  @app.post("/debug-tsv")
@@ -696,19 +905,27 @@ async def debug_tsv(payload: BillRequest):
696
  file_bytes = resp.content
697
  except Exception:
698
  return {"error": "Download failed"}
 
699
  clean_url = doc_url.split("?", 1)[0].lower()
700
  if clean_url.endswith(".pdf"):
701
  imgs = convert_from_bytes(file_bytes)
702
  img = imgs[0]
703
  else:
704
  img = Image.open(BytesIO(file_bytes))
 
705
  proc = preprocess_image(img)
706
  cells = image_to_tsv_cells(proc)
707
  return {"cells": cells}
708
 
 
 
709
  @app.get("/")
710
  def health_check():
711
- msg = "Bill extraction API (final) live."
712
  if not GEMINI_API_KEY or genai is None:
713
- msg += " (No GEMINI_API_KEY/configured SDK LLM refinement skipped.)"
714
- return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
4
  from io import BytesIO
5
  from typing import List, Dict, Any, Optional, Tuple
6
 
 
 
 
7
  from PIL import Image
 
 
 
8
  import numpy as np
9
  import cv2
10
+ import pytesseract
11
+ from pytesseract import Output
12
 
13
+ # ---------------- Config / Keywords ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Numeric-token pattern: optional sign, 1-3 leading digits, then any run of
  # digits/commas (thousands separators), then an optional ".digits" fraction.
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
15
  TOTAL_KEYWORDS = re.compile(
16
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
 
18
  )
19
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
20
 
21
# Lowercase fragments that typically appear in a bill table's column header.
# They are matched as substrings, so the "qty..." variants overlap on purpose.
HEADER_KEYWORDS = [
    "description",
    "qty",
    "hrs",
    "rate",
    "discount",
    "net",
    "amt",
    "amount",
    "consultation",
    "qty/hrs",
    "qty / hrs",
    "qty /",
    "qty/",
]
25
  HEADER_PHRASES = [
26
  "description qty / hrs consultation rate discount net amt",
27
  "description qty / hrs rate discount net amt",
 
31
  ]
32
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
33
 
34
+ # ---------------- Small utilities ----------------
35
  def sanitize_ocr_text(s: str) -> str:
36
  if not s:
37
  return ""
 
68
  return bool(t and NUM_RE.search(str(t)))
69
 
70
  def clean_name_text(s: str) -> str:
71
+ """
72
+ Normalize OCR names: remove odd punctuation, normalize SG codes, RR-2, and
73
+ safely map OR->DR only when it looks like a doctor's name.
74
+ """
75
+ if not s:
76
+ return s
77
+ s = s.replace("—", "-").replace("–", "-")
78
  s = re.sub(r"\s+", " ", s)
79
  s = s.strip(" -:,.")
80
+ # SG code normalization
81
  s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
82
  s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
83
+
84
+ # Safer OR -> DR: only when pattern looks like a doctor name (e.g. "OR S SALIL KUMAR")
85
+ # Heuristic: 'OR' token followed by one or more tokens that are all alphabetic
86
+ # and at least one seems like a personal name (length > 2).
87
+ def safe_or_to_dr(text: str) -> str:
88
+ toks = text.split()
89
+ out = []
90
+ i = 0
91
+ while i < len(toks):
92
+ tok = toks[i]
93
+ if tok.upper() == "OR" and i + 1 < len(toks):
94
+ lookahead = toks[i+1:i+5] # check up to 4 following tokens
95
+ # all lookahead tokens are alphabetic-ish and at least one token length>2
96
+ if all(re.match(r"^[A-Za-z\-\.\']+$", la) for la in lookahead if la) and any(len(la) > 2 for la in lookahead):
97
+ out.append("DR")
98
+ i += 1
99
+ continue
100
+ out.append(tok)
101
+ i += 1
102
+ return " ".join(out)
103
+
104
+ s = safe_or_to_dr(s)
105
+
106
  return s.strip()
107
 
108
  # ---------------- image preprocessing ----------------
 
121
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
122
  cv_img = pil_to_cv2(pil_img)
123
  gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
124
+ # denoise
125
  try:
126
+ gray = cv2.fastNlMeansDenoising(gray, h=10)
127
+ except Exception:
128
+ pass
129
+ try:
130
+ bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
131
+ cv2.THRESH_BINARY, 41, 15)
132
  except Exception:
133
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
134
  kernel = np.ones((1,1), np.uint8)
135
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
136
  return bw
137
 
138
+ # ---------------- OCR TSV helpers ----------------
139
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
140
  try:
141
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
142
  except Exception:
143
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
144
+ cells: List[Dict[str, Any]] = []
145
  n = len(o.get("text", []))
146
  for i in range(n):
147
  raw = o["text"][i]
 
160
  height = int(o.get("height", [0])[i])
161
  center_y = top + height / 2.0
162
  center_x = left + width / 2.0
163
+ cells.append({
164
+ "text": txt,
165
+ "conf": conf,
166
+ "left": left,
167
+ "top": top,
168
+ "width": width,
169
+ "height": height,
170
+ "center_y": center_y,
171
+ "center_x": center_x
172
+ })
173
  return cells
174
 
175
+ # ---------------- grouping into rows ----------------
176
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
177
  if not cells:
178
  return []
179
  sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
180
+ rows: List[List[Dict[str, Any]]] = []
181
  current = [sorted_cells[0]]
182
  last_y = sorted_cells[0]["center_y"]
183
  for c in sorted_cells[1:]:
 
192
  rows.append(sorted(current, key=lambda cc: cc["left"]))
193
  return rows
194
 
195
+ # ---------------- merge multiline names (doctor merge added) ----------------
196
  def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
197
+ """
198
+ Merge split item/name rows. Added robust doctor-name merger:
199
+ - If a row is text-only and next row is doctor-name-like, merge them.
200
+ - Also merge short textual lines when both are short and non-numeric.
201
+ """
202
  if not rows:
203
  return rows
204
+ merged: List[List[Dict[str, Any]]] = []
205
  i = 0
206
  while i < len(rows):
207
  row = rows[i]
208
  tokens = [c["text"] for c in row]
209
+ joined = " ".join(tokens)
210
  has_num = any(is_numeric_token(t) for t in tokens)
211
+
212
+ # Doctor-name merger:
213
+ # If current row contains a header-like token (e.g. 'Consultation', 'Charge', '|')
214
+ # and next row looks like a doctor's name (mostly alphabetic tokens, few tokens),
215
+ # merge them.
216
  if not has_num and i + 1 < len(rows):
217
  next_row = rows[i+1]
218
+ next_txt = " ".join([c["text"] for c in next_row]).strip()
219
+ # doctor-like heuristics: mostly alphabetic tokens, not numeric, token count <= 6
220
+ next_tokens = [t for t in re.split(r"\s+", next_txt) if t]
221
+ next_alpha = all(re.match(r"^[A-Za-z\-\.\']+$", t) for t in next_tokens if t)
222
  next_has_num = any(is_numeric_token(t) for t in next_tokens)
223
+ # current row contains 'consultation' or 'charge' or '|' or 'dr' hint
224
+ if next_alpha and not next_has_num and len(next_tokens) <= 6:
225
+ # also ensure current row contains words like 'consultation' or 'charge' or 'dr' or '|'
226
+ if re.search(r"\b(consultation|charge|charges|\|)\b", joined, re.I) or re.search(r"\bdr\b", joined, re.I):
227
+ merged_row = row + next_row
228
+ merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
229
+ i += 2
230
+ continue
231
+
232
+ # If both current and next are short pure-text lines (likely split names), merge them
 
 
 
 
 
233
  if not has_num and i + 1 < len(rows):
234
  next_row = rows[i+1]
235
  next_tokens = [c["text"] for c in next_row]
236
  next_has_num = any(is_numeric_token(t) for t in next_tokens)
237
+ if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 4:
238
+ merged_row = row + next_row
 
 
 
 
 
 
 
 
 
 
 
 
239
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
240
  i += 2
241
  continue
242
+
243
+ # Default
244
  merged.append(row)
245
  i += 1
246
+
247
  return merged
248
 
249
+ # ---------------- Strong header detection (PATCH 1) ----------------
250
def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
    """Return True when a line of text looks like a table header row.

    The text is lowercased and whitespace-collapsed, then checked against:

    1. a list of header regexes (e.g. "description ... qty", "net amt");
    2. the exact phrases in ``HEADER_PHRASES``;
    3. the number of ``HEADER_KEYWORDS`` found as substrings: three or more
       always count as a header, two or more when ``top_of_page`` is set.

    The number of numeric tokens is deliberately NOT used as a header signal:
    ordinary line items carry quantity, rate and amount, so a ">= 3 numbers"
    rule would classify them as headers and callers that filter header rows
    would drop them.

    Args:
        txt: Row text or item name to test; empty/None yields False.
        top_of_page: Use the looser two-keyword threshold for rows near the
            top of a page.

    Returns:
        True if the text is considered a header line, otherwise False.
    """
    if not txt:
        return False
    t = re.sub(r"\s+", " ", txt.strip().lower())

    # Universal blocklist patterns.
    header_patterns = (
        r"description.*qty",
        r"qty.*rate",
        r"rate.*amount",
        r"net\s*amt",
        r"discount",
        r"hrs\s*/\s*qty",
        r"qty\s*/\s*hrs",
    )
    if any(re.search(p, t) for p in header_patterns):
        return True

    # Blacklisted exact headers.
    if any(h == t for h in HEADER_PHRASES):
        return True

    # Generic keyword density (substring matches).
    hits = sum(1 for k in HEADER_KEYWORDS if k in t)
    if hits >= 3:
        return True

    # Rows near the top of the page get a slightly looser threshold.
    return bool(top_of_page) and hits >= 2
289
+ # ---------------- parsing rows into items (Part 2) ----------------
290
+
291
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
292
+ """
293
+ Adaptive clustering of numeric tokens into column centers (restores conservative adaptive threshold).
294
+ """
295
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
296
  if not xs:
297
  return []
 
322
  distances = [abs(token_x - cx) for cx in column_centers]
323
  return int(np.argmin(distances))
324
 
325
# Short lab/test codes as they tend to appear in bill item names.
# Every entry is lowercase, so compare against lowercased tokens.
LAB_TEST_KEYWORDS = {
    "ct",
    "et",
    "hiv",
    "hcv",
    "pt",
    "rbs",
    "rft",
    "ts",
    "tsh",
    "hb",
    "hbsag",
}
328
+
329
# Short test codes, matched as whole words in the lowercased item name.
_LAB_TEST_CODE_RE = re.compile(
    r"\b(?:ct|et|hiv|hcv|pt|rbs|rft|tsh|hbsag|hb|pus|group|rh)\b"
)
# Generic lab vocabulary. "laborat" is a stem, so it must be allowed to continue
# ("laboratory", "laboratories"); wrapped in plain \b...\b it could only ever
# match the literal word "laborat".
_LAB_TEST_WORD_RE = re.compile(r"\b(?:test|lab|laborat\w*|cmia|cima|cs)\b")


def looks_like_lab_test(name: str) -> bool:
    """
    Return True when an item name looks like a laboratory test.

    The name is lowercased and searched for either a known short test code
    (e.g. "hiv", "tsh", "hbsag") or a generic lab word ("test", "lab",
    "laboratory", "cmia", ...). All matches are whole-word, so "tablet" does
    not count as "lab". Empty or missing names yield False.
    """
    if not name:
        return False
    lowered = name.lower()
    return bool(
        _LAB_TEST_CODE_RE.search(lowered) or _LAB_TEST_WORD_RE.search(lowered)
    )
341
+
342
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
343
+ """
344
+ Conservative parse: prefer not to invent rate/qty. Uses numeric column mapping, safer inference,
345
+ and special handling for lab tests to avoid exploding qty.
346
+ """
347
+ parsed_items: List[Dict[str, Any]] = []
348
  rows = merge_multiline_names(rows)
349
  column_centers = detect_numeric_columns(page_cells, max_columns=4)
350
 
 
353
  if not tokens:
354
  continue
355
  joined_lower = " ".join(tokens).lower()
356
+ # skip footer-like lines unless numeric
357
  if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
358
  continue
359
+ # skip lines with no numeric tokens (likely headers or pure text)
360
  if all(not is_numeric_token(t) for t in tokens):
361
  continue
362
 
 
367
  v = normalize_num_str(t)
368
  if v is not None:
369
  numeric_values.append(float(v))
370
+ # de-duplicate
371
+ numeric_values = sorted(list({float(x) for x in numeric_values}), reverse=True)
372
+
373
+ # Heuristic: remove tiny tokens that cause qty explosion except when amount < 100
374
+ # We'll apply this later when we know amount. For now keep them but mark.
375
 
376
  if column_centers:
377
+ # map numeric tokens to nearest columns
378
  left_text_parts = []
379
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
380
  for c in row:
 
383
  if is_numeric_token(t):
384
  col_idx = assign_token_to_column(cx, column_centers)
385
  if col_idx is None:
386
+ numeric_bucket_map[len(column_centers)-1].append(t)
387
  else:
388
  numeric_bucket_map[col_idx].append(t)
389
  else:
 
400
  rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
401
  qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
402
 
403
+ # fallback: last numeric token as amount
404
  if amount is None:
405
  for t in reversed(tokens):
406
  if is_numeric_token(t):
 
408
  if amount is not None:
409
  break
410
 
411
+ # Clean numeric_values now that we may know amount
412
+ numeric_candidates = numeric_values.copy()
413
+ if amount is not None:
414
+ numeric_candidates = [v for v in numeric_candidates if (v >= 5 or amount <= 100)]
415
+ else:
416
+ numeric_candidates = [v for v in numeric_candidates if v >= 5]
417
+
418
+ # special handling for lab tests: avoid tiny rates / large qty
419
+ lab_like = looks_like_lab_test(name)
420
+
421
+ # Try to infer rate & qty from numeric_candidates conservatively
422
+ inferred_rate = rate
423
+ inferred_qty = qty
424
+ if amount is not None and numeric_candidates:
425
+ # try candidates as rate
426
+ for cand in numeric_candidates:
427
+ if cand <= 1:
428
  continue
429
+ if cand >= amount:
430
  continue
431
+ ratio = amount / cand if cand else None
 
 
432
  if ratio is None:
433
  continue
434
  r = round(ratio)
435
  if r < 1 or r > 200:
436
  continue
437
+ # stricter for lab tests: reject qty > 10 and candidate < 5
438
+ if lab_like and r > 10:
439
+ continue
440
  if abs(ratio - r) <= max(0.03 * r, 0.15):
441
+ inferred_rate = float(cand)
442
+ inferred_qty = float(r)
443
+ break
 
 
444
 
445
  # fallback compute rate if qty found but rate missing
446
+ if (inferred_rate is None or inferred_rate == 0) and inferred_qty and inferred_qty != 0 and amount is not None:
447
  try:
448
+ candidate_rate = amount / inferred_qty
449
+ if candidate_rate >= 1:
450
+ inferred_rate = candidate_rate
 
451
  except Exception:
452
  pass
453
 
454
+ # If amount is zero but rate exists and qty exists, compute amount
455
+ if (amount is None or amount == 0) and inferred_rate and inferred_qty:
456
+ amount = round(inferred_rate * inferred_qty, 2)
457
+
458
  # final defaults
459
+ if inferred_qty is None:
460
+ inferred_qty = 1.0
461
+ if inferred_rate is None:
462
+ inferred_rate = 0.0
463
 
464
+ # final sanity checks
465
  try:
466
+ amount = float(round(amount, 2)) if amount is not None else None
467
  except Exception:
468
+ amount = None
469
  try:
470
+ inferred_rate = float(round(inferred_rate, 2)) if inferred_rate is not None else 0.0
471
  except Exception:
472
+ inferred_rate = 0.0
473
  try:
474
+ inferred_qty = float(inferred_qty)
475
  except Exception:
476
+ inferred_qty = 1.0
477
+
478
+ if amount is None or amount == 0:
479
+ # if amount still zero but we have rate>0 and qty present, compute
480
+ if inferred_rate and inferred_qty:
481
+ amount = round(inferred_rate * inferred_qty, 2)
482
+
483
+ if amount is None or amount == 0:
484
+ # give up - skip this row (avoid inventing)
485
+ continue
486
 
487
  parsed_items.append({
488
  "item_name": name if name else "UNKNOWN",
489
+ "item_amount": float(round(amount, 2)),
490
+ "item_rate": float(round(inferred_rate, 2)) if inferred_rate else 0.0,
491
+ "item_quantity": float(inferred_qty) if inferred_qty else 1.0,
492
  })
493
 
494
  else:
495
+ # no clear numeric columns — conservative right-to-left parsing
496
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
497
  if not numeric_idxs:
498
  continue
 
503
  name = " ".join(tokens[:last]).strip()
504
  if not name:
505
  continue
506
+ # collect numeric tokens on RHS to attempt inference
 
 
 
507
  right_nums = []
508
  for i in numeric_idxs:
509
  v = normalize_num_str(tokens[i])
510
  if v is not None:
511
  right_nums.append(float(v))
512
+ right_nums = sorted(list({float(x) for x in right_nums}), reverse=True)
513
 
514
+ rate = None
515
+ qty = None
516
+
517
+ # conservative mapping
518
  if len(right_nums) >= 2:
519
  cand = right_nums[1]
520
  if float(cand) > 1 and float(cand) < float(amt):
 
521
  ratio = float(amt) / float(cand) if cand else None
522
  if ratio:
523
  r = round(ratio)
524
  if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
525
  rate = float(cand)
526
  qty = float(r)
527
+
528
  if rate is None and right_nums:
529
  for cand in right_nums:
530
  if cand <= 1.0 or cand >= float(amt):
 
541
  if rate is None:
542
  rate = 0.0
543
 
544
+ # special lab test protections
545
+ if looks_like_lab_test(name):
546
+ # if rate <5 and amt>100 -> treat rate as 0 (avoid cand like 12 causing qty 25)
547
+ if rate < 5 and amt > 100:
548
+ rate = 0.0
549
+ qty = 1.0
550
+
551
+ # if amount==0 but rate>0, update
552
+ if amt == 0 and rate and qty:
553
+ amt = round(rate * qty, 2)
554
+
555
  parsed_items.append({
556
  "item_name": clean_name_text(name),
557
  "item_amount": float(round(amt, 2)),
 
564
  # ---------------- dedupe & totals ----------------
565
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
566
  seen = set()
567
+ out: List[Dict[str, Any]] = []
568
  for it in items:
569
+ nm = re.sub(r"\s+", " ", (it.get("item_name") or "").lower()).strip()
570
+ key = (nm[:120], round(float(it.get("item_amount", 0.0)), 2))
571
  if key in seen:
572
  continue
573
  seen.add(key)
 
591
  if final is None: final = float(round(v, 2))
592
  return {"subtotal": subtotal, "final_total": final}
593
 
594
+ # ---------------- Gemini refinement (improved prompt per PATCH 7) ----------------
595
  def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
596
+ """
597
+ Attempt deterministic Gemini refinement. If Gemini not configured/available, return page_items as-is.
598
+ """
599
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
600
  if not GEMINI_API_KEY or genai is None:
601
  return page_items, zero_usage
 
604
  system_prompt = (
605
  "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
606
  "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
607
+ "Do NOT include subtotal or total lines as items. Do NOT invent items; only clean/fix/normalize the given items. "
608
+ "Prefer exact names from the bill. If names are broken across lines, merge them. Do not rename items unless it's obvious OCR noise."
 
 
 
 
 
 
 
 
 
609
  )
610
+ user_prompt = f"""
611
+ Extract ONLY line items from this hospital bill.
612
+
613
+ ### RULES (MUST FOLLOW)
614
+ - Do NOT invent items.
615
+ - Do NOT return section headers (Room Charges, Lab Services, Radiology).
616
+ - Merge broken multi-line names.
617
+ - Reconstruct missing rate/qty using amt=rate*qty if visible in text.
618
+ - Prefer exact names as shown in bill.
619
+ - If a doctor name appears across lines, merge to full name.
620
+ - Ignore totals / subtotals.
621
+ - Ignore page numbers.
622
+ - Avoid changing 'OR' unless it is clearly a doctor prefix.
623
+ - Ignore final bill summaries.
624
+
625
+ ### OCR TEXT:
626
+ {safe_text}
627
+
628
+ ### INITIAL ITEMS:
629
+ {json.dumps(page_items, ensure_ascii=False, indent=2)}
630
+
631
+ Return ONLY a JSON array of cleaned items, e.g.:
632
+ [
633
+ {{ "item_name": "Consultation Charge | DR PREETHI MARY JOSEPH", "item_amount": 300.0, "item_rate": 300.0, "item_quantity": 1.0 }},
634
+ ...
635
+ ]
636
+ """
637
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
638
  response = model.generate_content(
639
  [
 
660
  })
661
  except Exception:
662
  continue
663
+ # token usage not reliably available here; return zeros
664
  return cleaned, zero_usage
665
  return page_items, zero_usage
666
  except Exception:
667
  return page_items, zero_usage
668
 
669
+ # ---------------- Post-validation engine (PATCH 5) ----------------
670
def post_validate_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reconcile amount, rate and quantity of every item with arithmetic rules.

    For each item (the dicts are updated in place and also returned in a list):
    - a zero amount is filled from rate * qty when both are positive;
    - a zero rate is filled from amount / qty when that is positive;
    - if rate * qty is far from the amount, qty is re-derived as
      round(amount / rate) when that lies in 1..100 (1..10 for lab tests),
      otherwise qty falls back to 1;
    - lab-test quantities above 10 are reset to 1;
    - finally, if rate * qty is close to the amount the amount is replaced by
      that product; if it is off by more than 50% qty is reset to 1 and the
      rate is recomputed from the amount.
    """
    validated = []
    for item in items:
        item_name = item.get("item_name", "") or ""
        # Missing / falsy fields fall back to 0.0 (amount, rate) and 1.0 (qty),
        # so none of the three values is ever None below.
        amount = float(item.get("item_amount", 0.0) or 0.0)
        unit_rate = float(item.get("item_rate", 0.0) or 0.0)
        quantity = float(item.get("item_quantity", 1.0) or 1.0)

        is_lab = looks_like_lab_test(item_name)
        max_quantity = 10 if is_lab else 100

        # Amount missing but rate and qty known -> derive the amount.
        if amount == 0 and unit_rate > 0 and quantity > 0:
            amount = round(unit_rate * quantity, 2)

        # Rate missing -> derive it from amount and qty when that is positive.
        if unit_rate == 0 and quantity != 0:
            per_unit = amount / quantity
            if per_unit > 0:
                unit_rate = round(per_unit, 2)

        # Quantity inconsistent with amount and rate -> re-derive it.
        if unit_rate > 0:
            expected = unit_rate * quantity
            if abs(expected - amount) > max(2.0, 0.1 * expected):
                try:
                    whole_units = round(amount / unit_rate)
                except (OverflowError, ValueError):
                    # round() rejects infinite / NaN ratios.
                    whole_units = None
                if whole_units is not None and 1 <= whole_units <= max_quantity:
                    quantity = float(whole_units)
                else:
                    quantity = 1.0

        # Lab tests are not expected to carry large quantities.
        if is_lab and quantity > 10:
            quantity = 1.0

        # Final reconciliation of the amount against rate * qty.
        if unit_rate > 0:
            product = round(unit_rate * quantity, 2)
            if abs(product - amount) <= max(2.0, 0.05 * product):
                amount = product
            elif abs(amount - product) / max(1.0, product) > 0.5:
                # Keep the amount, collapse to a single unit at that price.
                quantity = 1.0
                unit_rate = round(amount / quantity, 2)

        item["item_amount"] = round(float(amount or 0.0), 2)
        item["item_rate"] = round(float(unit_rate or 0.0), 2)
        item["item_quantity"] = float(quantity or 1.0)
        validated.append(item)
    return validated
736
  # ---------------- main endpoint ----------------
737
  @app.post("/extract-bill-data")
738
  async def extract_bill_data(payload: BillRequest):
739
  doc_url = payload.document
740
+
741
+ # ---------- download ----------
742
  try:
743
  headers = {"User-Agent": "Mozilla/5.0"}
744
  resp = requests.get(doc_url, headers=headers, timeout=30)
 
746
  raise RuntimeError(f"download failed status={resp.status_code}")
747
  file_bytes = resp.content
748
  except Exception:
749
+ return {
750
+ "is_success": False,
751
+ "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
752
+ "data": {
753
+ "pagewise_line_items": [],
754
+ "total_item_count": 0,
755
+ "final_total": 0.0
756
+ }
757
+ }
758
+
759
+ # ---------- convert to images ----------
760
  images = []
761
  clean_url = doc_url.split("?", 1)[0].lower()
762
  try:
 
767
  else:
768
  try:
769
  images = convert_from_bytes(file_bytes)
770
+ except:
771
  images = []
772
  except Exception:
773
  images = []
 
775
  pagewise = []
776
  cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
777
 
778
+ # ---------- per page ----------
779
  for idx, page_img in enumerate(images, start=1):
780
  try:
781
  proc = preprocess_image(page_img)
782
+
783
+ # TSV
784
  cells = image_to_tsv_cells(proc)
785
  rows = group_cells_into_rows(cells, y_tolerance=12)
786
+
787
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
788
 
789
+ # ---------------- HEADER PREFILTER ----------------
790
  rows_filtered = []
791
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
792
  top_flag = (i < 6)
793
  rt_norm = sanitize_ocr_text(rt).lower()
794
+
795
+ # strong header detector (from patched Part 1)
796
  if looks_like_header_text(rt_norm, top_of_page=top_flag):
797
  continue
798
+
799
+ # legacy blacklist
800
  if any(h in rt_norm for h in HEADER_PHRASES):
801
  continue
802
+
803
  rows_filtered.append(r)
804
+
805
  rows = rows_filtered
806
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
807
  page_text = sanitize_ocr_text(" ".join(rows_texts))
808
 
809
+ # detect headers at top of page
810
  top_headers = []
811
  for i, rt in enumerate(rows_texts[:6]):
812
+ if looks_like_header_text(rt.lower(), top_of_page=(i < 4)):
813
  top_headers.append(rt.strip().lower())
814
 
815
+ # ---------------- PARSE ITEMS ----------------
816
  parsed_items = parse_rows_with_columns(rows, cells)
817
 
818
+ # ---------------- GEMINI REFINEMENT ----------------
819
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
820
  for k in cumulative_token_usage:
821
  cumulative_token_usage[k] += token_u.get(k, 0)
822
 
823
+ # ---------------- CONTEXT-AWARE SECTION FILTER ----------------
824
+ other_item_names = [it.get("item_name", "") for it in refined_items]
825
+
826
+ cleaned = []
827
+ for p in refined_items:
828
+ if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names):
829
+ cleaned.append(p)
830
 
 
 
831
  cleaned = dedupe_items(cleaned)
832
+
833
+ # drop any leftover header noise
834
  cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
835
 
836
+ # ---------------- RULE ENGINE POST-VALIDATION ----------------
837
+ cleaned = post_validate_items(cleaned)
838
+
839
+ # ---------------- PAGE TYPE ----------------
840
  page_type = "Bill Detail"
841
  page_txt = page_text.lower()
842
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
 
844
  if "final bill" in page_txt or "grand total" in page_txt:
845
  page_type = "Final Bill"
846
 
847
+ # ---------------- PER-PAGE SUBTOTAL/TOTAL ----------------
848
+ detected = detect_subtotals_and_totals(rows_texts)
849
+ page_subtotal = detected.get("subtotal")
850
+ page_final = detected.get("final_total")
851
+
852
+ # ---------------- STORE PAGE ----------------
853
+ pagewise.append({
854
+ "page_no": str(idx),
855
+ "page_type": page_type,
856
+ "bill_items": cleaned,
857
+ "subtotal": page_subtotal,
858
+ "final_page_total": page_final
859
+ })
860
+
861
  except Exception:
862
+ pagewise.append({
863
+ "page_no": str(idx),
864
+ "page_type": "Bill Detail",
865
+ "bill_items": [],
866
+ "subtotal": None,
867
+ "final_page_total": None
868
+ })
869
  continue
870
 
871
+ # ---------------- GLOBAL FINAL TOTAL ----------------
872
  total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
873
+
874
+ # Sum items across all pages (no double counting)
875
+ grand_total = 0.0
876
+ for p in pagewise:
877
+ for it in p.get("bill_items", []):
878
+ try:
879
+ grand_total += float(it.get("item_amount", 0.0) or 0.0)
880
+ except:
881
+ pass
882
+
883
  if not GEMINI_API_KEY or genai is None:
884
  cumulative_token_usage["warning_no_gemini"] = 1
885
 
886
+ return {
887
+ "is_success": True,
888
+ "token_usage": cumulative_token_usage,
889
+ "data": {
890
+ "pagewise_line_items": pagewise,
891
+ "total_item_count": total_item_count,
892
+ "final_total": round(grand_total, 2)
893
+ }
894
+ }
895
+
896
 
897
  # ---------------- debug TSV ----------------
898
  @app.post("/debug-tsv")
 
905
  file_bytes = resp.content
906
  except Exception:
907
  return {"error": "Download failed"}
908
+
909
  clean_url = doc_url.split("?", 1)[0].lower()
910
  if clean_url.endswith(".pdf"):
911
  imgs = convert_from_bytes(file_bytes)
912
  img = imgs[0]
913
  else:
914
  img = Image.open(BytesIO(file_bytes))
915
+
916
  proc = preprocess_image(img)
917
  cells = image_to_tsv_cells(proc)
918
  return {"cells": cells}
919
 
920
+
921
+ # ---------------- health check ----------------
922
@app.get("/")
def health_check():
    """
    Liveness endpoint.

    Returns a small JSON object with a status flag, a human-readable message
    and a usage hint. The message carries an extra note when no Gemini API key
    is set or the Gemini SDK could not be imported.
    """
    msg = "Bill extraction API (patched v3) live."
    if not GEMINI_API_KEY or genai is None:
        # Reworded: the previous text "(No Gemini LLM refinement disabled)"
        # was garbled and read as a double negative.
        msg += " (Gemini not configured; LLM refinement disabled)"
    return {
        "status": "ok",
        "message": msg,
        "hint": "POST /extract-bill-data with {'document':'<url>'}",
    }