Sathvik-kota commited on
Commit
310da4b
·
verified ·
1 Parent(s): 8803a3c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +514 -648
app.py CHANGED
@@ -1,27 +1,60 @@
 
 
 
 
 
1
  import os
2
  import re
3
  import json
4
  from io import BytesIO
5
  from typing import List, Dict, Any, Optional, Tuple
6
 
 
 
 
7
  from PIL import Image
8
- import numpy as np
9
- import cv2
10
  import pytesseract
11
  from pytesseract import Output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# ---------------- Config / Keywords ----------------
# Number token: optional sign, 1-3 leading digits, optional comma/digit run,
# optional decimal part (matches "1,234.50", "-12", "300").
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")

# Lines that carry a bill total/subtotal rather than a line item.
TOTAL_KEYWORDS = re.compile(
    r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
    re.I,
)

# Footer/boilerplate markers (page numbers, print timestamps, etc.).
FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)

# Individual words that commonly appear in table header rows.
HEADER_KEYWORDS = [
    "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
    "consultation", "qty/hrs", "qty / hrs", "qty /", "qty/",
]
 
25
  HEADER_PHRASES = [
26
  "description qty / hrs consultation rate discount net amt",
27
  "description qty / hrs rate discount net amt",
@@ -31,7 +64,14 @@ HEADER_PHRASES = [
31
  ]
32
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
33
 
34
- # ---------------- Small utilities ----------------
 
 
 
 
 
 
 
35
  def sanitize_ocr_text(s: str) -> str:
36
  if not s:
37
  return ""
@@ -40,220 +80,237 @@ def sanitize_ocr_text(s: str) -> str:
40
  s = s.replace("\r\n", "\n").replace("\r", "\n")
41
  s = re.sub(r"[ \t]+", " ", s)
42
  s = s.strip()
43
- return s[:4000]
 
44
 
45
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR numeric token into a float.

    Strips non-numeric garbage, drops thousands separators, and treats
    accountant-style parentheses ("(123)") as a negative value. Returns
    None when no usable number remains.
    """
    if s is None:
        return None
    text = str(s).strip()
    if not text:
        return None
    # Keep only characters that can legitimately be part of a number.
    text = re.sub(r"[^\d\-\+\,\.\(\)]", "", text)
    # "(123.45)" means -123.45 in accounting notation.
    negative = text.startswith("(") and text.endswith(")")
    if negative:
        text = text[1:-1]
    text = text.replace(",", "")
    if text in ("", "-", "+"):
        return None
    try:
        value = float(text)
    except Exception:
        # Last-ditch retry without embedded spaces; sign is not applied here,
        # matching the original fallback path.
        try:
            return float(text.replace(" ", ""))
        except Exception:
            return None
    return -value if negative else value
66
 
67
def is_numeric_token(t: Optional[str]) -> bool:
    """True when the (truthy) token contains at least one NUM_RE match."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
69
 
 
70
def clean_name_text(s: str) -> str:
    """Normalize an OCR'd item/doctor name.

    Collapses whitespace, strips stray punctuation, normalizes SG codes and
    "RR-2", and maps an OCR-misread "OR" token to "DR" only when the tokens
    that follow look like a person's name.
    """
    if not s:
        return s
    # Unify unicode dashes before any regex work.
    s = s.replace("—", "-").replace("–", "-")
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" -:,.")
    # SG code normalization: drop a single padding zero after "SG".
    s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
    s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)

    def _or_to_dr(text: str) -> str:
        # Replace an "OR" token with "DR" only when the next few tokens are
        # purely alphabetic-ish and at least one is longer than 2 chars
        # (e.g. "OR S SALIL KUMAR" -> "DR S SALIL KUMAR").
        words = text.split()
        fixed = []
        idx = 0
        total = len(words)
        while idx < total:
            word = words[idx]
            if word.upper() == "OR" and idx + 1 < total:
                window = words[idx + 1:idx + 5]  # look ahead up to 4 tokens
                alpha_ok = all(re.match(r"^[A-Za-z\-\.\']+$", w) for w in window if w)
                if alpha_ok and any(len(w) > 2 for w in window):
                    fixed.append("DR")
                    idx += 1
                    continue
            fixed.append(word)
            idx += 1
        return " ".join(fixed)

    return _or_to_dr(s).strip()
 
 
107
 
108
- # ---------------- image preprocessing ----------------
109
def pil_to_cv2(img: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV array (BGR; 2-D grayscale passes through)."""
    data = np.array(img)
    # A 2-D array is already single-channel; only RGB needs reordering to BGR.
    if data.ndim != 2:
        return cv2.cvtColor(data, cv2.COLOR_RGB2BGR)
    return data
114
 
115
def preprocess_image(pil_img: Image.Image) -> Any:
    """Upscale, denoise and binarize a page image for OCR.

    Narrow pages are upscaled to ~1500px width so glyphs survive
    thresholding; adaptive Gaussian thresholding is preferred, with an
    Otsu fallback when it fails.
    """
    pil_img = pil_img.convert("RGB")
    w, h = pil_img.size
    target_w = 1500
    if w < target_w:
        # LANCZOS keeps glyph edges sharp when upscaling.
        scale = target_w / float(w)
        pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
    gray = cv2.cvtColor(pil_to_cv2(pil_img), cv2.COLOR_BGR2GRAY)
    try:
        gray = cv2.fastNlMeansDenoising(gray, h=10)
    except Exception:
        # Denoising is best-effort; continue with the raw grayscale.
        pass
    try:
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 41, 15)
    except Exception:
        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a (1, 1) kernel makes this opening a no-op; kept for parity.
    kernel = np.ones((1, 1), np.uint8)
    return cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
137
 
138
# ---------------- OCR TSV helpers ----------------
def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
    """Run Tesseract and return one dict per non-empty word.

    Each cell carries text, confidence and its bounding box, plus the
    center_x/center_y coordinates used later for row/column clustering.
    """
    try:
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
    except Exception:
        # Some builds reject the config string; retry with defaults.
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells: List[Dict[str, Any]] = []
    texts = data.get("text", [])
    for i in range(len(texts)):
        raw = texts[i]
        if raw is None:
            continue
        txt = str(raw).strip()
        if not txt:
            continue
        # Tesseract reports -1 (or blanks) for non-word rows.
        try:
            conf = float(data["conf"][i]) if data["conf"][i] not in (None, "", "-1") else -1.0
        except Exception:
            conf = -1.0
        left = int(data.get("left", [0])[i])
        top = int(data.get("top", [0])[i])
        width = int(data.get("width", [0])[i])
        height = int(data.get("height", [0])[i])
        cells.append({
            "text": txt,
            "conf": conf,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "center_y": top + height / 2.0,
            "center_x": left + width / 2.0,
        })
    return cells
174
 
175
# ---------------- grouping into rows ----------------
def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
    """Cluster word cells into visual rows by vertical proximity.

    Cells are scanned in (center_y, center_x) order; a cell joins the
    current row when its center is within ``y_tolerance`` px of the row's
    running mean y. Each emitted row is sorted left-to-right.
    """
    if not cells:
        return []
    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
    rows: List[List[Dict[str, Any]]] = []
    bucket = [ordered[0]]
    mean_y = ordered[0]["center_y"]
    for cell in ordered[1:]:
        if abs(cell["center_y"] - mean_y) <= y_tolerance:
            bucket.append(cell)
            # Incrementally update the row's mean center_y.
            mean_y = (mean_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            mean_y = cell["center_y"]
    # The trailing bucket always holds at least one cell.
    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
194
 
195
# ---------------- merge multiline names (doctor merge added) ----------------
def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    """Merge rows whose item/doctor name was split across two OCR lines.

    Two merge rules, both only applied to rows without numeric tokens:
      1. A "consultation/charge/dr/|" row followed by a short, purely
         alphabetic row (a doctor's name on its own line).
      2. Two consecutive short text-only rows (a name wrapped mid-phrase).
    """
    if not rows:
        return rows
    merged: List[List[Dict[str, Any]]] = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        joined = " ".join(tokens)
        has_num = any(is_numeric_token(t) for t in tokens)

        # Rule 1: doctor-name continuation line.
        if not has_num and i + 1 < len(rows):
            nxt = rows[i + 1]
            nxt_text = " ".join(c["text"] for c in nxt).strip()
            nxt_tokens = [t for t in re.split(r"\s+", nxt_text) if t]
            nxt_alpha = all(re.match(r"^[A-Za-z\-\.\']+$", t) for t in nxt_tokens if t)
            nxt_has_num = any(is_numeric_token(t) for t in nxt_tokens)
            if nxt_alpha and not nxt_has_num and len(nxt_tokens) <= 6:
                # Current row must hint at a consultation/charge/doctor line.
                if re.search(r"\b(consultation|charge|charges|\|)\b", joined, re.I) or re.search(r"\bdr\b", joined, re.I):
                    merged.append(sorted(row + nxt, key=lambda c: c["left"]))
                    i += 2
                    continue

        # Rule 2: two short pure-text lines (likely a split name).
        if not has_num and i + 1 < len(rows):
            nxt = rows[i + 1]
            nxt_tokens = [c["text"] for c in nxt]
            if not any(is_numeric_token(t) for t in nxt_tokens) and len(tokens) <= 3 and len(nxt_tokens) <= 4:
                merged.append(sorted(row + nxt, key=lambda c: c["left"]))
                i += 2
                continue

        # Default: keep the row as-is.
        merged.append(row)
        i += 1
    return merged
248
 
249
- # ---------------- Strong header detection (PATCH 1) ----------------
250
- def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  if not txt:
252
  return False
 
253
  t = re.sub(r"\s+", " ", txt.strip().lower())
254
 
255
- # universal blocklist patterns
256
- header_patterns = [
257
  r"description.*qty",
258
  r"qty.*rate",
259
  r"rate.*amount",
@@ -262,603 +319,429 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
262
  r"hrs\s*/\s*qty",
263
  r"qty\s*/\s*hrs",
264
  ]
265
- for p in header_patterns:
266
  if re.search(p, t):
267
  return True
268
 
269
- # blacklisted exact headers
270
  if any(h == t for h in HEADER_PHRASES):
271
  return True
272
 
273
- # generic: if ≥3 header words → header
274
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
275
  if hits >= 3:
276
  return True
277
 
278
- # numeric structure: if line contains ≥3 numbers in tokenized order → header
279
  tokens = re.split(r"[ \|,/]+", t)
280
- numeric_count = sum(1 for tok in tokens if NUM_RE.search(tok))
281
- if numeric_count >= 3:
282
  return True
283
 
284
- # top-of-page slightly looser
285
  if top_of_page and hits >= 2:
286
  return True
287
 
288
  return False
289
- # ---------------- parsing rows into items (Part 2) ----------------
290
-
291
def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
    """Cluster x-centers of numeric tokens into up to ``max_columns`` columns.

    A gap larger than an adaptive threshold (mean gap + 0.6*std, floored at
    30px) starts a new cluster; each column center is the median x of its
    cluster. Returns centers sorted left-to-right.
    """
    xs = sorted(c["center_x"] for c in cells if is_numeric_token(c["text"]))
    if not xs:
        return []
    if len(xs) == 1:
        return [xs[0]]
    gaps = [b - a for a, b in zip(xs, xs[1:])]
    mean_gap = float(np.mean(gaps))
    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
    clusters: List[List[float]] = []
    current = [xs[0]]
    for i, gap in enumerate(gaps):
        # Only split while the final cluster count can stay <= max_columns.
        if gap > gap_thresh and len(clusters) < (max_columns - 1):
            clusters.append(current)
            current = [xs[i + 1]]
        else:
            current.append(xs[i + 1])
    clusters.append(current)
    centers = [float(np.median(c)) for c in clusters]
    if len(centers) > max_columns:
        # Keep the rightmost columns (amount-side of the table).
        centers = centers[-max_columns:]
    return sorted(centers)
318
 
319
def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
    """Return the index of the column center nearest ``token_x`` (None if empty)."""
    if not column_centers:
        return None
    offsets = [abs(token_x - center) for center in column_centers]
    return int(np.argmin(offsets))
324
-
325
# Short lab-test codes used to flag lab-like line items so candidate
# rate/qty rules can be tightened for them.
# NOTE(review): the original built this via "hbsaG".lower(); written literally.
LAB_TEST_KEYWORDS = {"ct", "et", "hiv", "hcv", "pt", "rbs", "rft", "ts", "tsh", "hb", "hbsag"}
328
 
329
def looks_like_lab_test(name: str) -> bool:
    """Heuristic: does this item name look like a laboratory test?

    Matches common short test codes as whole words, then falls back to
    generic lab vocabulary ("test", "lab", ...).
    """
    if not name:
        return False
    lowered = name.lower()
    codes = ["ct", "et", "hiv", "hcv", "pt", "rbs", "rft", "tsh", "hbsag", "hb", "pus", "group", "rh"]
    if any(re.search(r"\b" + re.escape(code) + r"\b", lowered) for code in codes):
        return True
    return bool(re.search(r"\b(test|lab|laborat|cmia|cima|cs)\b", lowered))
341
 
342
def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Parse merged OCR rows into bill line items.

    Conservative by design: prefers not to invent rate/qty. Uses the page's
    numeric column centers when available, otherwise falls back to
    right-to-left parsing of each row. Lab-test rows get tighter limits so
    tiny OCR numbers do not explode quantities.
    """
    parsed_items: List[Dict[str, Any]] = []
    rows = merge_multiline_names(rows)
    column_centers = detect_numeric_columns(page_cells, max_columns=4)

    for row in rows:
        tokens = [c["text"] for c in row]
        if not tokens:
            continue
        joined_lower = " ".join(tokens).lower()
        # Skip footer-like lines unless they also carry numbers.
        if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
            continue
        # Skip lines with no numeric tokens (headers / pure text).
        if all(not is_numeric_token(t) for t in tokens):
            continue

        # Unique numeric values in the row, largest first.
        numeric_values = []
        for t in tokens:
            if is_numeric_token(t):
                v = normalize_num_str(t)
                if v is not None:
                    numeric_values.append(float(v))
        numeric_values = sorted({float(x) for x in numeric_values}, reverse=True)

        if column_centers:
            # ---- column-mapped path ----
            left_text_parts = []
            numeric_bucket_map = {i: [] for i in range(len(column_centers))}
            for c in row:
                t = c["text"]
                if is_numeric_token(t):
                    col_idx = assign_token_to_column(c["center_x"], column_centers)
                    if col_idx is None:
                        numeric_bucket_map[len(column_centers) - 1].append(t)
                    else:
                        numeric_bucket_map[col_idx].append(t)
                else:
                    left_text_parts.append(t)
            raw_name = " ".join(left_text_parts).strip()
            name = clean_name_text(raw_name) if raw_name else ""

            num_cols = len(column_centers)

            def get_bucket(idx):
                # Last token assigned to a column wins.
                vals = numeric_bucket_map.get(idx, [])
                return vals[-1] if vals else None

            # Rightmost column = amount, then rate, then qty.
            amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
            rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
            qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None

            # Fallback: treat the last numeric token as the amount.
            if amount is None:
                for t in reversed(tokens):
                    if is_numeric_token(t):
                        amount = normalize_num_str(t)
                        if amount is not None:
                            break

            # Drop tiny tokens that cause qty explosion, except on small bills.
            numeric_candidates = numeric_values.copy()
            if amount is not None:
                numeric_candidates = [v for v in numeric_candidates if (v >= 5 or amount <= 100)]
            else:
                numeric_candidates = [v for v in numeric_candidates if v >= 5]

            lab_like = looks_like_lab_test(name)

            # Conservative rate/qty inference from amount = rate * qty.
            inferred_rate = rate
            inferred_qty = qty
            if amount is not None and numeric_candidates:
                for cand in numeric_candidates:
                    if cand <= 1:
                        continue
                    if cand >= amount:
                        continue
                    ratio = amount / cand if cand else None
                    if ratio is None:
                        continue
                    r = round(ratio)
                    if r < 1 or r > 200:
                        continue
                    # Stricter for lab tests: reject implied qty > 10.
                    if lab_like and r > 10:
                        continue
                    if abs(ratio - r) <= max(0.03 * r, 0.15):
                        inferred_rate = float(cand)
                        inferred_qty = float(r)
                        break

            # If qty known but rate missing, derive rate from the amount.
            if (inferred_rate is None or inferred_rate == 0) and inferred_qty and inferred_qty != 0 and amount is not None:
                try:
                    candidate_rate = amount / inferred_qty
                    if candidate_rate >= 1:
                        inferred_rate = candidate_rate
                except Exception:
                    pass

            # If amount missing/zero but rate & qty known, compute it.
            if (amount is None or amount == 0) and inferred_rate and inferred_qty:
                amount = round(inferred_rate * inferred_qty, 2)

            # Defaults.
            if inferred_qty is None:
                inferred_qty = 1.0
            if inferred_rate is None:
                inferred_rate = 0.0

            # Final sanity coercion.
            try:
                amount = float(round(amount, 2)) if amount is not None else None
            except Exception:
                amount = None
            try:
                inferred_rate = float(round(inferred_rate, 2)) if inferred_rate is not None else 0.0
            except Exception:
                inferred_rate = 0.0
            try:
                inferred_qty = float(inferred_qty)
            except Exception:
                inferred_qty = 1.0

            if amount is None or amount == 0:
                if inferred_rate and inferred_qty:
                    amount = round(inferred_rate * inferred_qty, 2)
            if amount is None or amount == 0:
                # Give up on this row rather than invent an amount.
                continue

            parsed_items.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": float(round(amount, 2)),
                "item_rate": float(round(inferred_rate, 2)) if inferred_rate else 0.0,
                "item_quantity": float(inferred_qty) if inferred_qty else 1.0,
            })

        else:
            # ---- no clear numeric columns: conservative right-to-left parse ----
            numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
            if not numeric_idxs:
                continue
            last = numeric_idxs[-1]
            amt = normalize_num_str(tokens[last])
            if amt is None:
                continue
            name = " ".join(tokens[:last]).strip()
            if not name:
                continue
            # Unique numeric values on the row, largest first.
            right_nums = []
            for i in numeric_idxs:
                v = normalize_num_str(tokens[i])
                if v is not None:
                    right_nums.append(float(v))
            right_nums = sorted({float(x) for x in right_nums}, reverse=True)

            rate = None
            qty = None

            # First try the second-largest value as the rate.
            if len(right_nums) >= 2:
                cand = right_nums[1]
                if float(cand) > 1 and float(cand) < float(amt):
                    ratio = float(amt) / float(cand) if cand else None
                    if ratio:
                        r = round(ratio)
                        if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
                            rate = float(cand)
                            qty = float(r)

            # Otherwise scan all candidates for a clean divisor.
            if rate is None and right_nums:
                for cand in right_nums:
                    if cand <= 1.0 or cand >= float(amt):
                        continue
                    ratio = float(amt) / float(cand)
                    r = round(ratio)
                    if 1 <= r <= 100 and abs(ratio - r) <= max(0.03 * r, 0.15):
                        rate = float(cand)
                        qty = float(r)
                        break

            if qty is None:
                qty = 1.0
            if rate is None:
                rate = 0.0

            # Lab-test protection: a tiny rate against a large amount is noise.
            if looks_like_lab_test(name):
                if rate < 5 and amt > 100:
                    rate = 0.0
                    qty = 1.0

            if amt == 0 and rate and qty:
                amt = round(rate * qty, 2)

            parsed_items.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty),
            })

    return parsed_items
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
# ---------------- dedupe & totals ----------------
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repeated items, keyed on normalized name (first 120 chars) + amount."""
    seen = set()
    unique: List[Dict[str, Any]] = []
    for item in items:
        norm_name = re.sub(r"\s+", " ", (item.get("item_name") or "").lower()).strip()
        key = (norm_name[:120], round(float(item.get("item_amount", 0.0)), 2))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
576
 
577
def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
    """Scan row texts bottom-up for subtotal / final-total lines.

    The bottom-most matching line wins for each slot; lines containing
    "sub" fill the subtotal, everything else the final total.
    """
    subtotal: Optional[float] = None
    final: Optional[float] = None
    for line in reversed(rows_texts):
        if not line or not line.strip():
            continue
        if not TOTAL_KEYWORDS.search(line):
            continue
        match = NUM_RE.search(line)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue
        if re.search(r"sub", line, re.I):
            if subtotal is None:
                subtotal = float(round(value, 2))
        elif final is None:
            final = float(round(value, 2))
    return {"subtotal": subtotal, "final_total": final}
593
-
594
# ---------------- Gemini refinement (improved prompt per PATCH 7) ----------------
def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
    """Attempt deterministic Gemini refinement of parsed bill items.

    Returns ``(items, token_usage)``. If Gemini is not configured/available,
    or any step fails, the input items are returned unchanged with zero
    token usage.
    """
    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    if not GEMINI_API_KEY or genai is None:
        return page_items, zero_usage
    try:
        safe_text = sanitize_ocr_text(page_text)
        system_prompt = (
            "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
            "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
            "Do NOT include subtotal or total lines as items. Do NOT invent items; only clean/fix/normalize the given items. "
            "Prefer exact names from the bill. If names are broken across lines, merge them. Do not rename items unless it's obvious OCR noise."
        )
        user_prompt = f"""
Extract ONLY line items from this hospital bill.

### RULES (MUST FOLLOW)
- Do NOT invent items.
- Do NOT return section headers (Room Charges, Lab Services, Radiology).
- Merge broken multi-line names.
- Reconstruct missing rate/qty using amt=rate*qty if visible in text.
- Prefer exact names as shown in bill.
- If a doctor name appears across lines, merge to full name.
- Ignore totals / subtotals.
- Ignore page numbers.
- Avoid changing 'OR' unless it is clearly a doctor prefix.
- Ignore final bill summaries.

### OCR TEXT:
{safe_text}

### INITIAL ITEMS:
{json.dumps(page_items, ensure_ascii=False, indent=2)}

Return ONLY a JSON array of cleaned items, e.g.:
[
  {{ "item_name": "Consultation Charge | DR PREETHI MARY JOSEPH", "item_amount": 300.0, "item_rate": 300.0, "item_quantity": 1.0 }},
  ...
]
"""
        # BUG FIX: generate_content() takes no temperature/max_output_tokens
        # kwargs, and "system" is not a valid content role — the old call
        # always raised, so refinement was silently disabled. The system
        # prompt belongs on the model and sampling params in generation_config.
        model = genai.GenerativeModel(GEMINI_MODEL_NAME, system_instruction=system_prompt)
        response = model.generate_content(
            user_prompt,
            generation_config={"temperature": 0.0, "max_output_tokens": 1000},
        )
        raw = response.text.strip()
        # Strip an optional ```json ... ``` fence around the reply.
        if raw.startswith("```"):
            raw = re.sub(r"^```[a-zA-Z]*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            cleaned = []
            for obj in parsed:
                try:
                    cleaned.append({
                        "item_name": str(obj.get("item_name", "")).strip(),
                        "item_amount": float(obj.get("item_amount", 0.0)),
                        "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                        "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                    })
                except Exception:
                    # Skip malformed entries rather than failing the page.
                    continue
            # Token usage not reliably available here; return zeros.
            return cleaned, zero_usage
        return page_items, zero_usage
    except Exception:
        # Any failure falls back to the locally parsed items.
        return page_items, zero_usage
668
-
669
# ---------------- Post-validation engine (PATCH 5) ----------------
def post_validate_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Rule engine to fix common LLM/OCR inference errors in items (mutates in place).

    - amount == 0 with rate & qty present  -> amount = rate * qty
    - rate == 0 with amount & qty present  -> rate = amount / qty
    - amount far from rate*qty             -> recompute qty (or reset to 1)
    - lab tests                            -> clamp qty to <= 10
    """
    validated = []
    for it in items:
        name = it.get("item_name", "") or ""
        amt = float(it.get("item_amount", 0.0) or 0.0)
        rate = float(it.get("item_rate", 0.0) or 0.0)
        qty = float(it.get("item_quantity", 1.0) or 1.0)

        lab_like = looks_like_lab_test(name)

        # Amount missing but rate & qty known -> derive it.
        if (amt == 0 or amt is None) and rate > 0 and qty > 0:
            amt = round(rate * qty, 2)

        # Rate missing but amount & qty known -> derive it.
        if (rate == 0 or rate is None) and qty and qty != 0:
            try:
                candidate_rate = amt / qty
                if candidate_rate > 0:
                    rate = round(candidate_rate, 2)
            except Exception:
                pass

        # Quantity looks wrong when amount is far from rate*qty.
        if rate > 0:
            ideal = rate * qty
            if abs(ideal - amt) > max(2.0, 0.1 * ideal):
                try:
                    q = amt / rate if rate else qty
                    if 1 <= round(q) <= (10 if lab_like else 100):
                        qty = float(round(q))
                    else:
                        qty = 1.0  # implausible -> fall back to a single unit
                except Exception:
                    qty = 1.0

        # Lab-test quantities above 10 are almost certainly OCR noise.
        if lab_like and qty > 10:
            qty = 1.0

        # Reconcile amount with the adjusted rate/qty.
        if rate > 0:
            recomputed = round(rate * qty, 2)
            if abs(recomputed - amt) <= max(2.0, 0.05 * recomputed):
                # Close enough: trust the recomputed value.
                amt = recomputed
            else:
                # Far off: keep the stated amount, reset qty, re-derive rate.
                if abs(amt - recomputed) / max(1.0, recomputed) > 0.5:
                    qty = 1.0
                    rate = round(amt / qty, 2) if qty else rate

        it["item_amount"] = round(float(amt or 0.0), 2)
        it["item_rate"] = round(float(rate or 0.0), 2)
        it["item_quantity"] = float(qty or 1.0)
        validated.append(it)
    return validated
736
- # ---------------- main endpoint ----------------
737
  @app.post("/extract-bill-data")
738
  async def extract_bill_data(payload: BillRequest):
739
- doc_url = payload.document
740
 
741
- # ---------- download ----------
 
 
742
  try:
743
- headers = {"User-Agent": "Mozilla/5.0"}
744
- resp = requests.get(doc_url, headers=headers, timeout=30)
745
- if resp.status_code != 200:
746
- raise RuntimeError(f"download failed status={resp.status_code}")
747
- file_bytes = resp.content
748
- except Exception:
749
  return {
750
  "is_success": False,
751
- "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
752
- "data": {
753
- "pagewise_line_items": [],
754
- "total_item_count": 0,
755
- "final_total": 0.0
756
- }
757
  }
758
 
759
- # ---------- convert to images ----------
760
- images = []
761
- clean_url = doc_url.split("?", 1)[0].lower()
762
  try:
763
- if clean_url.endswith(".pdf"):
764
- images = convert_from_bytes(file_bytes)
765
- elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
766
- images = [Image.open(BytesIO(file_bytes))]
767
  else:
768
- try:
769
- images = convert_from_bytes(file_bytes)
770
- except:
771
- images = []
772
- except Exception:
773
- images = []
774
 
775
  pagewise = []
776
- cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
777
 
778
- # ---------- per page ----------
779
- for idx, page_img in enumerate(images, start=1):
780
- try:
781
- proc = preprocess_image(page_img)
782
 
783
- # TSV
 
784
  cells = image_to_tsv_cells(proc)
785
- rows = group_cells_into_rows(cells, y_tolerance=12)
786
 
787
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
788
-
789
- # ---------------- HEADER PREFILTER ----------------
790
- rows_filtered = []
791
- for i, (r, rt) in enumerate(zip(rows, rows_texts)):
792
- top_flag = (i < 6)
793
- rt_norm = sanitize_ocr_text(rt).lower()
794
-
795
- # strong header detector (from patched Part 1)
796
- if looks_like_header_text(rt_norm, top_of_page=top_flag):
797
- continue
798
 
799
- # legacy blacklist
800
- if any(h in rt_norm for h in HEADER_PHRASES):
 
 
801
  continue
 
802
 
803
- rows_filtered.append(r)
 
 
804
 
805
- rows = rows_filtered
806
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
807
- page_text = sanitize_ocr_text(" ".join(rows_texts))
808
-
809
- # detect headers at top of page
810
  top_headers = []
811
- for i, rt in enumerate(rows_texts[:6]):
812
- if looks_like_header_text(rt.lower(), top_of_page=(i < 4)):
813
- top_headers.append(rt.strip().lower())
814
 
815
- # ---------------- PARSE ITEMS ----------------
816
  parsed_items = parse_rows_with_columns(rows, cells)
817
 
818
- # ---------------- GEMINI REFINEMENT ----------------
819
- refined_items, token_u = refine_with_gemini(parsed_items, page_text)
820
- for k in cumulative_token_usage:
821
- cumulative_token_usage[k] += token_u.get(k, 0)
822
-
823
- # ---------------- CONTEXT-AWARE SECTION FILTER ----------------
824
- other_item_names = [it.get("item_name", "") for it in refined_items]
825
 
826
- cleaned = []
827
- for p in refined_items:
828
- if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names):
829
- cleaned.append(p)
830
 
831
- cleaned = dedupe_items(cleaned)
832
 
833
- # drop any leftover header noise
834
- cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
 
 
835
 
836
- # ---------------- RULE ENGINE POST-VALIDATION ----------------
837
  cleaned = post_validate_items(cleaned)
838
 
839
- # ---------------- PAGE TYPE ----------------
 
840
  page_type = "Bill Detail"
841
- page_txt = page_text.lower()
842
- if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
843
  page_type = "Pharmacy"
844
- if "final bill" in page_txt or "grand total" in page_txt:
845
  page_type = "Final Bill"
846
 
847
- # ---------------- PER-PAGE SUBTOTAL/TOTAL ----------------
848
- detected = detect_subtotals_and_totals(rows_texts)
849
- page_subtotal = detected.get("subtotal")
850
- page_final = detected.get("final_total")
851
-
852
- # ---------------- STORE PAGE ----------------
853
  pagewise.append({
854
  "page_no": str(idx),
855
  "page_type": page_type,
856
  "bill_items": cleaned,
857
- "subtotal": page_subtotal,
858
- "final_page_total": page_final
859
  })
860
 
861
- except Exception:
862
  pagewise.append({
863
  "page_no": str(idx),
864
  "page_type": "Bill Detail",
@@ -866,66 +749,49 @@ async def extract_bill_data(payload: BillRequest):
866
  "subtotal": None,
867
  "final_page_total": None
868
  })
869
- continue
870
-
871
- # ---------------- GLOBAL FINAL TOTAL ----------------
872
- total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
873
 
874
- # Sum items across all pages (no double counting)
875
- grand_total = 0.0
876
  for p in pagewise:
877
- for it in p.get("bill_items", []):
878
- try:
879
- grand_total += float(it.get("item_amount", 0.0) or 0.0)
880
- except:
881
- pass
882
 
883
- if not GEMINI_API_KEY or genai is None:
884
- cumulative_token_usage["warning_no_gemini"] = 1
885
 
886
  return {
887
  "is_success": True,
888
- "token_usage": cumulative_token_usage,
889
  "data": {
890
  "pagewise_line_items": pagewise,
891
  "total_item_count": total_item_count,
892
- "final_total": round(grand_total, 2)
893
  }
894
  }
895
 
896
 
897
- # ---------------- debug TSV ----------------
 
 
 
898
  @app.post("/debug-tsv")
899
  async def debug_tsv(payload: BillRequest):
900
- doc_url = payload.document
901
  try:
902
- resp = requests.get(doc_url, timeout=20)
903
- if resp.status_code != 200:
904
- return {"error": "Download failed"}
905
- file_bytes = resp.content
906
- except Exception:
907
- return {"error": "Download failed"}
908
-
909
- clean_url = doc_url.split("?", 1)[0].lower()
910
- if clean_url.endswith(".pdf"):
911
- imgs = convert_from_bytes(file_bytes)
912
- img = imgs[0]
913
- else:
914
- img = Image.open(BytesIO(file_bytes))
915
-
916
- proc = preprocess_image(img)
917
- cells = image_to_tsv_cells(proc)
918
- return {"cells": cells}
919
-
920
-
921
- # ---------------- health check ----------------
922
  @app.get("/")
923
- def health_check():
924
- msg = "Bill extraction API (patched v3) live."
925
- if not GEMINI_API_KEY or genai is None:
926
- msg += " (No Gemini → LLM refinement disabled)"
927
- return {
928
- "status": "ok",
929
- "message": msg,
930
- "hint": "POST /extract-bill-data with {'document':'<url>'}"
931
- }
 
1
+ ###############################################
2
+ # Bajaj Datathon - FINAL PATCHED BILL EXTRACTOR
3
+ # High Accuracy | Robust OCR | Gemini Refinement
4
+ ###############################################
5
+
6
  import os
7
  import re
8
  import json
9
  from io import BytesIO
10
  from typing import List, Dict, Any, Optional, Tuple
11
 
12
+ from fastapi import FastAPI
13
+ from pydantic import BaseModel
14
+ import requests
15
  from PIL import Image
16
+ from pdf2image import convert_from_bytes
 
17
  import pytesseract
18
  from pytesseract import Output
19
+ import numpy as np
20
+ import cv2
21
+
22
+ # Optional Gemini SDK
23
+ try:
24
+ import google.generativeai as genai
25
+ except:
26
+ genai = None
27
+
28
+ # ---------------- LLM CONFIG ----------------
29
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
+ GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
31
+
32
+ if GEMINI_API_KEY and genai is not None:
33
+ try:
34
+ genai.configure(api_key=GEMINI_API_KEY)
35
+ except:
36
+ pass
37
+
38
+
39
+ # ---------------- FASTAPI APP ----------------
40
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (patched v3)")
41
+
42
class BillRequest(BaseModel):
    """Request body shared by /extract-bill-data and /debug-tsv."""
    # URL of the bill document (PDF or image) to download and parse.
    document: str
44
+
45
+
46
+ ###############################################
47
+ # COMMON REGEX AND UTILITY FUNCTIONS
48
+ ###############################################
49
 
 
50
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
 
 
 
 
 
51
 
52
  HEADER_KEYWORDS = [
53
+ "description", "qty", "hrs", "rate",
54
+ "discount", "net", "amt", "amount",
55
+ "qty/hrs", "qty / hrs"
56
  ]
57
+
58
  HEADER_PHRASES = [
59
  "description qty / hrs consultation rate discount net amt",
60
  "description qty / hrs rate discount net amt",
 
64
  ]
65
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
66
 
67
+ TOTAL_KEYWORDS = re.compile(
68
+ r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
69
+ re.I,
70
+ )
71
+
72
+ FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
73
+
74
+
75
  def sanitize_ocr_text(s: str) -> str:
76
  if not s:
77
  return ""
 
80
  s = s.replace("\r\n", "\n").replace("\r", "\n")
81
  s = re.sub(r"[ \t]+", " ", s)
82
  s = s.strip()
83
+ return s[:5000]
84
+
85
 
86
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR'd amount token into a float.

    Strips currency symbols and other junk characters, removes thousands
    commas, and treats an accounting-style "(123)" as negative.

    Args:
        s: Raw token text (may be None or empty).

    Returns:
        The parsed value, or None when nothing numeric remains.
    """
    if s is None:
        return None
    s = str(s).strip()
    # Keep only characters that can appear in a number; drops currency
    # symbols, stray letters, OCR noise, etc.
    s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
    if s == "":
        return None
    negative = False
    # Accounting convention: parentheses mean a negative amount.
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1]
    s = s.replace(",", "")  # thousands separators
    try:
        v = float(s)
    except ValueError:
        # Leftover punctuation like "--" or "." is not a valid number.
        # (Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.)
        return None
    return -v if negative else v
103
+
 
104
 
105
def is_numeric_token(t: Optional[str]) -> bool:
    """True when the token contains something that looks like a number."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
107
 
108
+
109
def clean_name_text(s: str) -> str:
    """Normalise an OCR'd item / doctor name for output.

    Normalises dashes, collapses whitespace, trims stray punctuation and
    fixes the common OCR confusion of "OR" for "DR " before a
    capitalised surname.
    """
    # OCR often renders dashes as en/em dashes; normalise them to ASCII "-".
    # (The original source had a mojibake-dropped character here, leaving
    # `s.replace("", "-")`, which inserts "-" between EVERY character.)
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" -:,.")
    # Fix doctor prefix only if followed by a capitalised name.
    s = re.sub(r"\bOR (?=[A-Z][a-z])", "DR ", s)
    return s.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
 
117
 
118
+ ###############################################
119
+ # IMAGE PREPROCESSING
120
+ ###############################################
121
 
122
def pil_to_cv2(img: Image.Image):
    """Convert a PIL image to an OpenCV array (BGR; 2-D if grayscale)."""
    data = np.array(img)
    # Single-channel arrays are already in a form cv2 accepts.
    if data.ndim == 2:
        return data
    return cv2.cvtColor(data, cv2.COLOR_RGB2BGR)
127
 
128
+
129
def preprocess_image(pil_img: Image.Image):
    """Upscale, denoise and binarise a page image for OCR.

    Returns a binary 2-D numpy array suitable for pytesseract.
    NOTE(review): the tuned constants (1500px minimum width, 41/15
    adaptive-threshold window) come from the original author — confirm
    before changing.
    """
    pil_img = pil_img.convert("RGB")
    w, h = pil_img.size

    # Upscale narrow scans so Tesseract sees enough pixels per glyph.
    if w < 1500:
        scale = 1500 / float(w)
        pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)

    img = pil_to_cv2(pil_img)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Non-local means denoising before thresholding.
    gray = cv2.fastNlMeansDenoising(gray, h=10)

    try:
        bw = cv2.adaptiveThreshold(gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 41, 15)
    except:
        # Fallback: global Otsu threshold if the adaptive call fails.
        _, bw = cv2.threshold(gray, 127, 255,
                              cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 1x1 opening kernel — effectively a no-op; kept from the original.
    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
    return bw
152
 
153
+
154
+ ###############################################
155
+ # OCR TSV EXTRACTION
156
+ ###############################################
157
+
158
def image_to_tsv_cells(cv_img):
    """Run Tesseract TSV OCR and return one dict per non-empty word.

    Each cell carries the text, confidence and bounding box, plus the
    box centre coordinates used later for row/column clustering.
    """
    try:
        ocr = pytesseract.image_to_data(
            cv_img,
            output_type=Output.DICT,
            # --psm 6: assume a single uniform block of text.
            config="--psm 6"
        )
    except:
        # Fall back to Tesseract's default page segmentation.
        ocr = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells = []
    n = len(ocr.get("text", []))

    for i in range(n):
        t = (ocr["text"][i] or "").strip()
        if not t:
            continue
        try:
            conf = float(ocr["conf"][i])
        except:
            # Tesseract emits "-1" / non-numeric conf for structural rows.
            conf = -1.0

        left = int(ocr.get("left", [0])[i])
        top = int(ocr.get("top", [0])[i])
        width = int(ocr.get("width", [0])[i])
        height = int(ocr.get("height", [0])[i])

        cells.append({
            "text": t,
            "conf": conf,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "center_x": left + width / 2,
            "center_y": top + height / 2,
        })
    return cells
196
 
197
+
198
+ ###############################################
199
+ # GROUPING INTO TEXT LINES
200
+ ###############################################
201
+
202
def group_cells_into_rows(cells, y_tol=12):
    """Cluster OCR word cells into visual text lines.

    Cells whose vertical centres lie within ``y_tol`` pixels of the
    running row centre join the current row; each returned row is
    sorted left-to-right by its cells' ``left`` coordinate.
    """
    if not cells:
        return []

    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))

    rows = []
    bucket = [ordered[0]]
    row_y = ordered[0]["center_y"]

    for cell in ordered[1:]:
        if abs(cell["center_y"] - row_y) <= y_tol:
            bucket.append(cell)
            # Running average of the row's vertical centre.
            row_y = (row_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            row_y = cell["center_y"]

    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
224
 
225
+
226
+ ###############################################
227
+ # DOCTOR-NAME MERGING (PATCH)
228
+ ###############################################
229
+
230
def merge_multiline_names(rows):
    """Join a text-only "DR ..." row with the row below it.

    Doctor names often wrap onto a second OCR line before the numeric
    columns start. When a row contains no numeric tokens, mentions
    "dr", and the following row is also number-free, the two rows are
    merged into one and re-sorted left-to-right.
    """
    if not rows:
        return rows

    merged = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        joined = " ".join(tokens)

        has_num = any(is_numeric_token(t) for t in tokens)

        # --- Doctor Name Merge Fix ---
        if (not has_num and
                re.search(r"\bdr\b", joined.lower()) and
                i + 1 < len(rows)):

            next_tokens = " ".join([c["text"] for c in rows[i + 1]])
            if not any(is_numeric_token(x) for x in next_tokens.split()):
                merged_row = row + rows[i + 1]
                merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                i += 2  # consumed two rows
                continue

        merged.append(row)
        i += 1

    return merged
259
 
260
+
261
+ ###############################################
262
+ # DETECT NUMERIC COLUMNS
263
+ ###############################################
264
+
265
def detect_numeric_columns(cells, max_cols=4):
    """Estimate the x-centres of the numeric columns on the page.

    Collects x positions of all numeric tokens, splits the sorted list
    at unusually large gaps (simple 1-D clustering), and returns up to
    ``max_cols`` cluster medians sorted left-to-right.
    """
    xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
    if not xs:
        return []

    xs = sorted(xs)
    if len(xs) == 1:
        return [xs[0]]

    gaps = [xs[i + 1] - xs[i] for i in range(len(xs) - 1)]
    mean_gap = float(np.mean(gaps))
    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    # A gap marks a column boundary when it clearly exceeds the typical gap.
    thresh = max(30.0, mean_gap + 0.6 * std_gap)

    clusters = []
    curr = [xs[0]]

    for i, g in enumerate(gaps):
        # Cap the number of splits so at most max_cols clusters form.
        if g > thresh and len(clusters) < (max_cols - 1):
            clusters.append(curr)
            curr = [xs[i + 1]]
        else:
            curr.append(xs[i + 1])

    clusters.append(curr)

    centers = [float(np.median(c)) for c in clusters]
    centers = centers[-max_cols:]  # keep the right-most columns
    return sorted(centers)
294
+
295
+
296
def assign_token_to_column(x, centers):
    """Index of the column centre nearest to x, or None if no centres."""
    if not centers:
        return None
    return min(range(len(centers)), key=lambda i: abs(x - centers[i]))
301
+
302
+
303
+ ###############################################
304
+ # STRONG HEADER DETECTION (PATCHED)
305
+ ###############################################
306
+
307
+ def looks_like_header_text(txt: str, top_of_page=False):
308
  if not txt:
309
  return False
310
+
311
  t = re.sub(r"\s+", " ", txt.strip().lower())
312
 
313
+ patterns = [
 
314
  r"description.*qty",
315
  r"qty.*rate",
316
  r"rate.*amount",
 
319
  r"hrs\s*/\s*qty",
320
  r"qty\s*/\s*hrs",
321
  ]
322
+ for p in patterns:
323
  if re.search(p, t):
324
  return True
325
 
 
326
  if any(h == t for h in HEADER_PHRASES):
327
  return True
328
 
 
329
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
330
  if hits >= 3:
331
  return True
332
 
 
333
  tokens = re.split(r"[ \|,/]+", t)
334
+ num = sum(1 for tok in tokens if NUM_RE.search(tok))
335
+ if num >= 3:
336
  return True
337
 
 
338
  if top_of_page and hits >= 2:
339
  return True
340
 
341
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
 
 
 
 
 
 
 
 
 
343
 
344
+ ###############################################
345
+ # PARSE ROWS INTO ITEMS
346
+ ###############################################
 
 
 
 
 
 
 
 
 
347
 
348
def parse_rows_with_columns(rows, cells):
    """Turn OCR rows into bill line-item dicts.

    Uses detected numeric column centres to split each row into a name
    (left text) and amount / rate / qty (right columns). Falls back to
    positional parsing when no columns were detected. Returns dicts with
    keys item_name, item_amount, item_rate, item_quantity.
    """
    rows = merge_multiline_names(rows)
    col_centers = detect_numeric_columns(cells)

    parsed = []

    for row in rows:
        texts = [c["text"] for c in row]
        joined = " ".join(texts).lower()

        # Skip pure footer rows and rows with no numbers at all.
        if FOOTER_KEYWORDS.search(joined) and not any(is_numeric_token(t) for t in texts):
            continue
        if all(not is_numeric_token(t) for t in texts):
            continue

        numeric_values = []
        for t in texts:
            if is_numeric_token(t):
                v = normalize_num_str(t)
                if v is not None:
                    numeric_values.append(float(v))

        # De-duplicate & sort largest first
        numeric_values = sorted(list({float(v) for v in numeric_values}), reverse=True)

        # Drop tiny noise (values < 5) unless it's the row's only number.
        numeric_values = [v for v in numeric_values if v >= 5 or (v < 5 and len(numeric_values) == 1)]

        if col_centers:
            left_text = []
            bucket = {i: [] for i in range(len(col_centers))}

            # Route numeric tokens to their nearest column; everything
            # else contributes to the item name.
            for c in row:
                t = c["text"]
                x = c["center_x"]
                if is_numeric_token(t):
                    idx = assign_token_to_column(x, col_centers)
                    if idx is not None:
                        bucket[idx].append(t)
                else:
                    left_text.append(t)

            name_raw = " ".join(left_text).strip()
            name = clean_name_text(name_raw)

            N = len(col_centers)

            def pick(k):
                # Last token assigned to column k (right-most wins).
                vals = bucket.get(k, [])
                return vals[-1] if vals else None

            # Column convention: right-most = amount, then rate, then qty.
            amount = normalize_num_str(pick(N - 1)) if N >= 1 else None
            rate = normalize_num_str(pick(N - 2)) if N >= 2 else None
            qty = normalize_num_str(pick(N - 3)) if N >= 3 else None

            # fallback amount: right-most numeric token in the row
            if amount is None:
                for t in reversed(texts):
                    if is_numeric_token(t):
                        amount = normalize_num_str(t)
                        if amount is not None:
                            break

            # strong qty/rate inference: qty = round(amount / rate)
            if amount is not None and rate is not None:
                ratio = amount / rate if rate else None
                if ratio and 1 <= round(ratio) <= 10:
                    qty = float(round(ratio))

            if qty is None:
                qty = 1.0

            if amount == 0 and rate and qty:
                amount = rate * qty

            # amount may still be None here; TypeError -> skip the row
            try: amount = float(round(amount, 2))
            except: continue

            try: rate = float(round(rate or 0.0, 2))
            except: rate = 0.0

            try: qty = float(qty)
            except: qty = 1.0

            parsed.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": amount,
                "item_rate": rate,
                "item_quantity": qty
            })

        else:
            # No column layout: last numeric token is the amount, text
            # before it is the name.
            idxs = [i for i, t in enumerate(texts) if is_numeric_token(t)]
            if not idxs:
                continue

            amt = normalize_num_str(texts[idxs[-1]])
            if amt is None:
                continue

            name = " ".join(texts[: idxs[-1]]).strip()
            if not name:
                continue

            rate = 0.0
            qty = 1.0

            possible = []
            for i in idxs:
                v = normalize_num_str(texts[i])
                if v is not None:
                    possible.append(float(v))

            possible = sorted(list({v for v in possible}), reverse=True)

            # Infer (rate, qty) from the first value that divides the
            # amount into a small integer multiple.
            for p in possible:
                if p <= 1 or p >= amt:
                    continue
                ratio = amt / p
                r = round(ratio)
                if 1 <= r <= 10:
                    rate = p
                    qty = r
                    break

            parsed.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty)
            })

    return parsed
481
+
482
+
483
+ ###############################################
484
+ # FINAL ITEM FILTER
485
+ ###############################################
486
+
487
def final_item_filter(item, headers, all_names):
    """Decide whether a parsed item is a genuine billable line.

    Rejects empty names, rows echoing a detected page header, footer
    noise, non-positive amounts, short section-heading-style names when
    concrete sub-items exist elsewhere on the page, and rows whose rate
    is implausibly larger than the amount.
    """
    name = item["item_name"].strip()
    ln = name.lower()

    if not name:
        return False

    # Anything containing a detected column-header phrase is not an item.
    if any(h in ln for h in headers):
        return False

    if FOOTER_KEYWORDS.search(ln):
        return False

    if item["item_amount"] <= 0:
        return False

    # Short "section heading" names (e.g. "Room Charges") are dropped
    # when the page also names concrete sub-items.
    is_short = len(ln.split()) <= 3
    if is_short and any(k in ln for k in ["charges", "services", "room", "radiology", "surgery"]):
        context = " ".join(all_names).lower()
        if any(z in context for z in [
            "rent", "ward", "nursing", "surgeon", "anaes", "ot", "procedure"
        ]):
            return False

    # OCR glitch guard: rate wildly larger than the amount.
    rate = item["item_rate"]
    amt = item["item_amount"]
    if rate and rate > amt * 10 and amt < 10000:
        return False

    return True
520
+
521
+
522
+ ###############################################
523
+ # POST VALIDATION (PATCH)
524
+ ###############################################
525
+
526
def post_validate_items(items):
    """Rule-engine pass reconciling amount = rate * quantity per item.

    Each item dict is updated in place (amount reconstructed from
    rate*qty when zero; quantity re-derived when rate*qty disagrees
    with the amount by more than 15%) and the same dicts are returned
    in a new list.
    """
    validated = []
    for entry in items:
        amount = entry["item_amount"]
        rate = entry["item_rate"]
        quantity = entry["item_quantity"]

        # Zero amount but a known rate: reconstruct the amount.
        if amount == 0 and rate > 0:
            amount = rate * quantity

        # If rate*qty is far from the amount, re-derive the quantity.
        if rate > 0:
            expected = rate * quantity
            if abs(expected - amount) > max(2, 0.15 * expected):
                inferred = amount / rate
                if 1 <= round(inferred) <= 10:
                    quantity = round(inferred)

        entry["item_amount"] = round(amount, 2)
        entry["item_rate"] = round(rate, 2)
        entry["item_quantity"] = float(quantity)

        validated.append(entry)
    return validated
549
 
550
+
551
+ ###############################################
552
+ # SUBTOTAL / FINAL TOTAL DETECTION
553
+ ###############################################
554
+
555
def detect_subtotals_and_totals(rows):
    """Scan row texts bottom-up for subtotal / final-total lines.

    For each slot the lowest matching line on the page wins: lines
    whose text contains "sub" fill the subtotal, all other
    total-keyword lines fill the final total.
    """
    subtotal = None
    final_total = None

    for line in reversed(rows):
        if not line.strip():
            continue
        if not TOTAL_KEYWORDS.search(line):
            continue
        match = NUM_RE.search(line)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue

        if "sub" in line.lower():
            if subtotal is None:
                subtotal = round(value, 2)
        elif final_total is None:
            final_total = round(value, 2)

    return {"subtotal": subtotal, "final_total": final_total}
578
+
579
+
580
+ ###############################################
581
+ # GEMINI REFINER (PATCHED PROMPT)
582
+ ###############################################
583
+
584
def refine_with_gemini(items, page_text=""):
    """Optionally ask Gemini to clean the parsed line items.

    Returns (items, token_usage). This is a no-op when no API key or
    SDK is available, and falls back to the input items on any failure.
    NOTE(review): token_usage is always zeros — actual usage is never
    read from the response; confirm before relying on it.
    """
    if not GEMINI_API_KEY or genai is None:
        return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    safe = sanitize_ocr_text(page_text)

    system_prompt = (
        "You are a strict hospital bill item cleaner.\n"
        "Return ONLY a JSON array of cleaned line items.\n"
        "Do NOT include section headers, totals, subtotals, page numbers.\n"
        "Do NOT invent items.\n"
    )

    user_prompt = f"""
Extract ONLY valid line items from the bill.

RULES YOU MUST FOLLOW:
- Do NOT create new items.
- Do NOT output section headers (Room Charges, Lab Services, Radiology).
- Merge broken names (doctor names on multiple lines).
- Use exact item names from OCR text.
- Recompute rate/qty if amount = rate×qty is clear.
- Ignore totals or summary lines.
- Ignore page numbers.
- Always output: item_name, item_amount, item_rate, item_quantity.

OCR TEXT:
{safe}

INITIAL ITEMS:
{json.dumps(items, ensure_ascii=False)}

Return ONLY a JSON array:
[
{{"item_name":"...","item_amount":float,"item_rate":float,"item_quantity":float}}
]
"""

    try:
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        # NOTE(review): passing temperature / max_output_tokens directly
        # (rather than via generation_config) may raise on some SDK
        # versions; the except below then returns the unrefined items.
        resp = model.generate_content(
            [
                {"role": "system", "parts": [system_prompt]},
                {"role": "user", "parts": [user_prompt]},
            ],
            temperature=0.0,
            max_output_tokens=1200,
        )

        raw = resp.text.strip()
        # Strip the markdown code fences the model often adds.
        raw = raw.replace("```json", "").replace("```", "").strip()

        parsed = json.loads(raw)

        cleaned = []
        for obj in parsed:
            cleaned.append({
                "item_name": str(obj.get("item_name", "")).strip(),
                "item_amount": float(obj.get("item_amount", 0.0)),
                "item_rate": float(obj.get("item_rate", 0.0)),
                "item_quantity": float(obj.get("item_quantity", 1.0)),
            })

        return cleaned, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    except:
        return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
650
+
651
+
652
+ ###############################################
653
+ # MAIN EXTRACTION ENDPOINT
654
+ ###############################################
655
+
656
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Main endpoint: download a bill (PDF or image), OCR each page and
    return structured line items, per-page totals and a grand total.
    """
    url = payload.document

    # download
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla"}, timeout=30)
        if r.status_code != 200:
            raise RuntimeError("Download failed")
        data = r.content
    except:
        return {
            "is_success": False,
            "token_usage": {},
            "data": {"pagewise_line_items": [], "total_item_count": 0}
        }

    # load image(s): rasterise PDFs, open everything else directly
    try:
        if url.lower().split("?")[0].endswith(".pdf"):
            imgs = convert_from_bytes(data)
        else:
            imgs = [Image.open(BytesIO(data))]
    except:
        imgs = []  # unparseable document -> empty result below

    pagewise = []
    total_tokens = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    for idx, img in enumerate(imgs, 1):

        try:
            # OCR pipeline: binarise -> word cells -> visual rows
            proc = preprocess_image(img)
            cells = image_to_tsv_cells(proc)
            rows = group_cells_into_rows(cells)

            row_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]

            # remove headers (more lenient within the first five rows)
            filtered = []
            for i, (r, t) in enumerate(zip(rows, row_texts)):
                if looks_like_header_text(t, top_of_page=(i < 5)):
                    continue
                filtered.append(r)

            rows = filtered
            row_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
            page_text = " ".join(row_texts)

            # header phrases still near the top feed the item filter
            top_headers = []
            for t in row_texts[:5]:
                if looks_like_header_text(t, top_of_page=True):
                    top_headers.append(t.lower())

            parsed_items = parse_rows_with_columns(rows, cells)

            # optional LLM clean-up (no-op without a Gemini key)
            refined, usage = refine_with_gemini(parsed_items, page_text)

            for k in total_tokens:
                total_tokens[k] += usage.get(k, 0)

            all_names = [x["item_name"] for x in refined]

            cleaned = [
                x for x in refined
                if final_item_filter(x, top_headers, all_names)
            ]

            cleaned = post_validate_items(cleaned)

            totals = detect_subtotals_and_totals(row_texts)

            # crude page classification from keywords
            page_type = "Bill Detail"
            low = page_text.lower()
            if "pharmacy" in low:
                page_type = "Pharmacy"
            if "final bill" in low or "grand total" in low:
                page_type = "Final Bill"

            pagewise.append({
                "page_no": str(idx),
                "page_type": page_type,
                "bill_items": cleaned,
                "subtotal": totals["subtotal"],
                "final_page_total": totals["final_total"]
            })

        except:
            # a failing page yields an empty entry rather than a 500
            pagewise.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": [],
                "subtotal": None,
                "final_page_total": None
            })

    # global final total = sum of all item amounts
    final_sum = 0.0
    for p in pagewise:
        for it in p["bill_items"]:
            final_sum += it["item_amount"]

    total_item_count = sum(len(p["bill_items"]) for p in pagewise)

    return {
        "is_success": True,
        "token_usage": total_tokens,
        "data": {
            "pagewise_line_items": pagewise,
            "total_item_count": total_item_count,
            "final_total": round(final_sum, 2)
        }
    }
770
 
771
 
772
+ ###############################################
773
+ # DEBUG ENDPOINT
774
+ ###############################################
775
+
776
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug endpoint: return raw Tesseract word cells for one page.

    Mirrors the document-loading logic of /extract-bill-data — in
    particular the PDF branch, which Image.open() alone cannot handle —
    and OCRs only the first page.
    """
    try:
        r = requests.get(payload.document, timeout=20)
        if r.status_code != 200:
            return {"error": "debug failed"}
        data = r.content
        # PDFs must be rasterised first; Image.open() cannot read them.
        if payload.document.lower().split("?")[0].endswith(".pdf"):
            img = convert_from_bytes(data)[0]
        else:
            img = Image.open(BytesIO(data))
        proc = preprocess_image(img)
        cells = image_to_tsv_cells(proc)
        return {"cells": cells}
    except Exception:
        # Narrowed from a bare `except:`; any parse/OCR failure reports
        # the same generic error payload.
        return {"error": "debug failed"}
786
+
787
+
788
+ ###############################################
789
+ # HEALTH CHECK
790
+ ###############################################
791
+
 
 
 
 
 
 
 
792
@app.get("/")
def ping():
    """Health check: reports liveness and whether Gemini is configured."""
    message = "Bill extractor live."
    if not GEMINI_API_KEY:
        message += " (Gemini missing)"
    return {"status": "ok", "message": message}