Sathvik-kota commited on
Commit
e568983
·
verified ·
1 Parent(s): 56ab53e

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +575 -609
app.py CHANGED
@@ -1,17 +1,22 @@
1
- # app.py
2
- # High-accuracy bill extraction API with optional Amazon Textract / Google Vision + robust Tesseract fallback.
3
- # Usage:
4
- # export OCR_ENGINE=textract # or "vision" or "tesseract"
5
- # export AWS_REGION=us-east-1 # required for Textract
6
- # export GEMINI_API_KEY=... # optional
7
- # uvicorn app:app --host 0.0.0.0 --port 8080
 
 
 
8
 
9
  import os
10
  import re
11
  import json
12
  import logging
13
  from io import BytesIO
14
- from typing import List, Dict, Any, Optional, Tuple
 
 
15
 
16
  from fastapi import FastAPI
17
  from pydantic import BaseModel
@@ -35,23 +40,22 @@ try:
35
  except Exception:
36
  vision = None
37
 
38
- # Optional: Google Gemini SDK (if available)
39
  try:
40
  import google.generativeai as genai
41
  except Exception:
42
  genai = None
43
 
44
  # -------------------------------------------------------------------------
45
- # Configuration and logging
46
  # -------------------------------------------------------------------------
47
- OCR_ENGINE = os.getenv("OCR_ENGINE", "textract").lower() # 'textract' | 'vision' | 'tesseract'
48
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
49
- GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
50
  AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
51
- TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6") # page segmentation mode default
52
 
53
  logging.basicConfig(level=logging.INFO)
54
- logger = logging.getLogger("bill-extractor")
55
 
56
  if GEMINI_API_KEY and genai is not None:
57
  try:
@@ -60,138 +64,312 @@ if GEMINI_API_KEY and genai is not None:
60
  except Exception as e:
61
  logger.warning("Gemini config failed: %s", e)
62
 
63
- # Boto3 textract client (lazy init)
64
  _textract_client = None
 
 
65
  def textract_client():
66
  global _textract_client
67
  if _textract_client is None:
68
  if boto3 is None:
69
- raise RuntimeError("boto3 not installed but OCR_ENGINE=textract requested")
70
  _textract_client = boto3.client("textract", region_name=AWS_REGION)
71
  return _textract_client
72
 
73
- # Google Vision client (lazy)
74
- _vision_client = None
75
  def vision_client():
76
  global _vision_client
77
  if _vision_client is None:
78
  if vision is None:
79
- raise RuntimeError("google-cloud-vision not installed but OCR_ENGINE=vision requested")
80
  _vision_client = vision.ImageAnnotatorClient()
81
  return _vision_client
82
 
83
  # -------------------------------------------------------------------------
84
- # Request model
85
  # -------------------------------------------------------------------------
86
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (high-accuracy)")
87
-
88
- class BillRequest(BaseModel):
89
- document: str # file://local_path or http(s) url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  # -------------------------------------------------------------------------
92
- # Helpers (numbers, cleaning, OCR preprocessing)
93
  # -------------------------------------------------------------------------
94
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
 
 
95
  TOTAL_KEYWORDS = re.compile(
96
- r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
97
- re.I,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
99
- FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
100
 
101
  HEADER_KEYWORDS = [
102
- "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
103
- "consultation", "address", "sex", "age", "mobile", "patient", "category",
104
- "doctor", "dr", "invoice", "bill", "subtotal", "total", "charges", "service"
105
  ]
106
- HEADER_PHRASES = [
107
- "description qty / hrs consultation rate discount net amt",
108
- "description qty / hrs rate discount net amt",
109
- ]
110
- HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
111
 
 
 
 
112
  def sanitize_ocr_text(s: Optional[str]) -> str:
 
113
  if not s:
114
  return ""
115
  s = s.replace("\u2014", "-").replace("\u2013", "-")
 
116
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
117
  s = s.replace("\r\n", "\n").replace("\r", "\n")
118
  s = re.sub(r"[ \t]+", " ", s)
119
- # common OCR corrections
120
- s = re.sub(r"\bqiy\b", "qty", s, flags=re.I)
121
- s = re.sub(r"\bdeseription\b", "description", s, flags=re.I)
122
  return s.strip()
123
 
124
- def normalize_num_str(s: Optional[str]) -> Optional[float]:
 
125
  if s is None:
126
  return None
127
  s = str(s).strip()
128
  if s == "":
129
  return None
130
- s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
 
131
  negative = False
132
  if s.startswith("(") and s.endswith(")"):
133
  negative = True
134
  s = s[1:-1]
 
 
 
135
  s = s.replace(",", "")
 
136
  if s in ("", "-", "+"):
137
  return None
 
138
  try:
139
  val = float(s)
140
- return -val if negative else val
141
- except Exception:
142
- try:
143
- return float(s.replace(" ", ""))
144
- except Exception:
145
  return None
 
 
 
146
 
147
  def is_numeric_token(t: Optional[str]) -> bool:
 
148
  return bool(t and NUM_RE.search(str(t)))
149
 
150
- def clean_name_text(s: str) -> str:
151
- s = s.replace("", "-")
 
152
  s = re.sub(r"\s+", " ", s)
153
- s = s.strip(" -:,.=")
154
- s = re.sub(r"\s+x$", "", s, flags=re.I)
155
- s = re.sub(r"[\)\}\]]+$", "", s)
156
  s = re.sub(r"\bOR\b", "DR", s) # OCR OR -> DR
157
  return s.strip()
158
 
159
  # -------------------------------------------------------------------------
160
- # Image preprocessing helpers (for Tesseract pipeline)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  # -------------------------------------------------------------------------
162
  def pil_to_cv2(img: Image.Image) -> Any:
 
163
  arr = np.array(img)
164
  if arr.ndim == 2:
165
  return arr
166
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
167
 
168
  def preprocess_image_for_tesseract(pil_img: Image.Image, target_w: int = 1500) -> Any:
 
169
  pil_img = pil_img.convert("RGB")
170
  w, h = pil_img.size
 
 
171
  if w < target_w:
172
  scale = target_w / float(w)
173
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
 
174
  cv_img = pil_to_cv2(pil_img)
 
 
175
  if cv_img.ndim == 3:
176
  gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
177
  else:
178
  gray = cv_img
 
 
179
  gray = cv2.fastNlMeansDenoising(gray, h=10)
 
 
180
  try:
181
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
182
  cv2.THRESH_BINARY, 41, 15)
183
  except Exception:
184
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
185
- kernel = np.ones((1,1), np.uint8)
 
 
 
186
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
 
187
  return bw
188
 
189
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
190
- # returns list of OCR 'cells' compatible with your parsing pipeline
191
  try:
192
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config=f"--psm {TESSERACT_PSM}")
193
  except Exception:
194
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
 
195
  cells = []
196
  n = len(o.get("text", []))
197
  for i in range(n):
@@ -201,28 +379,39 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
201
  txt = str(raw).strip()
202
  if not txt:
203
  continue
 
204
  try:
205
  conf_raw = o.get("conf", [])[i]
206
  conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
207
  except Exception:
208
  conf = -1.0
 
209
  left = int(o.get("left", [0])[i])
210
  top = int(o.get("top", [0])[i])
211
  width = int(o.get("width", [0])[i])
212
  height = int(o.get("height", [0])[i])
213
  center_y = top + height / 2.0
214
  center_x = left + width / 2.0
215
- cells.append({"text": txt, "conf": conf, "left": left, "top": top,
216
- "width": width, "height": height, "center_y": center_y, "center_x": center_x})
 
 
 
 
 
 
217
  return cells
218
 
219
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
 
220
  if not cells:
221
  return []
 
222
  sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
223
  rows = []
224
  current = [sorted_cells[0]]
225
  last_y = sorted_cells[0]["center_y"]
 
226
  for c in sorted_cells[1:]:
227
  if abs(c["center_y"] - last_y) <= y_tolerance:
228
  current.append(c)
@@ -231,72 +420,31 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
231
  rows.append(sorted(current, key=lambda cc: cc["left"]))
232
  current = [c]
233
  last_y = c["center_y"]
 
234
  if current:
235
  rows.append(sorted(current, key=lambda cc: cc["left"]))
 
236
  return rows
237
 
238
- def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
239
- if not rows:
240
- return rows
241
- merged = []
242
- i = 0
243
- while i < len(rows):
244
- row = rows[i]
245
- tokens = [c["text"] for c in row]
246
- has_num = any(is_numeric_token(t) for t in tokens)
247
- if not has_num and i + 1 < len(rows):
248
- next_row = rows[i+1]
249
- next_tokens = [c["text"] for c in next_row]
250
- next_has_num = any(is_numeric_token(t) for t in next_tokens)
251
- if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
252
- merged_row = []
253
- min_left = min((c["left"] for c in next_row), default=0)
254
- offset = 10
255
- for c in row:
256
- newc = c.copy()
257
- newc["left"] = min_left - offset
258
- newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
259
- merged_row.append(newc)
260
- offset += 10
261
- merged_row.extend(next_row)
262
- merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
263
- i += 2
264
- continue
265
- if not has_num and i + 1 < len(rows):
266
- next_row = rows[i+1]
267
- next_tokens = [c["text"] for c in next_row]
268
- next_has_num = any(is_numeric_token(t) for t in next_tokens)
269
- if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
270
- merged_row = []
271
- min_left = min((c["left"] for c in next_row + row), default=0)
272
- offset = 10
273
- for c in row + next_row:
274
- newc = c.copy()
275
- if newc["left"] > min_left:
276
- newc["left"] = newc["left"]
277
- else:
278
- newc["left"] = min_left - offset
279
- newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
280
- merged_row.append(newc)
281
- offset += 5
282
- merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
283
- i += 2
284
- continue
285
- merged.append(row)
286
- i += 1
287
- return merged
288
-
289
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
 
290
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
291
  if not xs:
292
  return []
293
- xs = sorted(xs)
 
294
  if len(xs) == 1:
295
- return [xs[0]]
 
 
296
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
297
  mean_gap = float(np.mean(gaps))
298
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
299
- gap_thresh = max(28.0, mean_gap + 0.6 * std_gap)
 
300
  clusters = []
301
  curr = [xs[0]]
302
  for i, g in enumerate(gaps):
@@ -306,596 +454,414 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) ->
306
  else:
307
  curr.append(xs[i+1])
308
  clusters.append(curr)
 
309
  centers = [float(np.median(c)) for c in clusters]
310
  if len(centers) > max_columns:
311
  centers = centers[-max_columns:]
 
312
  return sorted(centers)
313
 
314
  def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
 
315
  if not column_centers:
316
  return None
317
  distances = [abs(token_x - cx) for cx in column_centers]
318
  return int(np.argmin(distances))
319
 
320
  # -------------------------------------------------------------------------
321
- # Parsing pipeline (shared)
322
  # -------------------------------------------------------------------------
323
- def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
324
- parsed_items = []
325
- rows = merge_multiline_names(rows)
 
 
 
 
 
 
 
326
  column_centers = detect_numeric_columns(page_cells, max_columns=6)
327
-
328
- for row in rows:
329
  tokens = [c["text"] for c in row]
330
- if not tokens:
331
- continue
332
- joined_lower = " ".join(tokens).lower()
333
- if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
 
334
  continue
335
- # require some numeric token (date-only rows excluded later)
336
- if all(not is_numeric_token(t) for t in tokens):
 
337
  continue
338
-
 
339
  numeric_values = []
340
  for t in tokens:
341
  if is_numeric_token(t):
342
- v = normalize_num_str(t)
343
  if v is not None:
344
  numeric_values.append(float(v))
345
- numeric_values = sorted(list({int(x) if float(x).is_integer() else x for x in numeric_values}), reverse=True)
346
-
 
 
 
 
 
347
  if column_centers:
348
  left_text_parts = []
349
- numeric_bucket_map = {i: [] for i in range(len(column_centers))}
 
350
  for c in row:
351
  t = c["text"]
352
  cx = c["center_x"]
 
 
353
  if is_numeric_token(t):
354
  col_idx = assign_token_to_column(cx, column_centers)
355
  if col_idx is None:
356
- numeric_bucket_map[len(column_centers) - 1].append(t)
357
- else:
358
- numeric_bucket_map[col_idx].append(t)
359
  else:
360
  left_text_parts.append(t)
361
- raw_name = " ".join(left_text_parts).strip()
362
- name = clean_name_text(raw_name) if raw_name else ""
 
 
 
363
  num_cols = len(column_centers)
364
- def get_bucket(idx):
365
- vals = numeric_bucket_map.get(idx, [])
366
- return vals[-1] if vals else None
367
- amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
368
- rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
369
- qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
370
-
 
 
 
 
371
  if amount is None:
372
- for t in reversed(tokens):
373
- if is_numeric_token(t):
374
- amount = normalize_num_str(t)
375
- if amount is not None:
376
- break
377
-
378
- # infer rate and qty heuristics
379
- if amount is not None and numeric_values:
 
 
 
 
 
 
 
 
 
 
 
 
380
  for cand in numeric_values:
381
- try:
382
- cand_float = float(cand)
383
- except:
384
- continue
385
- if cand_float <= 1.0:
386
- continue
387
- if amount <= 5 and cand_float < 1.0:
388
- continue
389
- if cand_float >= amount:
390
- continue
391
- ratio = amount / cand_float if cand_float else None
392
- if ratio is None:
393
  continue
 
394
  r = round(ratio)
395
- if r < 1 or r > 200:
396
- continue
397
- if abs(ratio - r) <= max(0.03 * r, 0.15):
398
- if r <= 100:
399
- rate = cand_float
400
- qty = float(r)
401
- break
402
-
403
- if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
404
- try:
405
- candidate_rate = amount / qty
406
- if candidate_rate >= 1.0:
407
- rate = candidate_rate
408
- except Exception:
409
- pass
410
-
411
  if qty is None:
412
  qty = 1.0
413
-
414
- try:
415
- amount = float(round(amount, 2))
416
- except Exception:
417
- continue
418
- try:
419
- rate = float(round(rate, 2)) if rate is not None else 0.0
420
- except Exception:
421
  rate = 0.0
422
- try:
423
- qty = float(qty)
424
- except Exception:
425
- qty = 1.0
426
-
427
- parsed_items.append({
428
- "item_name": name if name else "UNKNOWN",
429
- "item_amount": amount,
430
- "item_rate": rate if rate is not None else 0.0,
431
- "item_quantity": qty if qty is not None else 1.0,
432
- })
 
 
 
433
  else:
 
434
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
435
  if not numeric_idxs:
436
  continue
 
437
  last = numeric_idxs[-1]
438
- amt = normalize_num_str(tokens[last])
439
- if amt is None:
440
  continue
 
441
  name = " ".join(tokens[:last]).strip()
442
- if not name:
443
- continue
444
- rate = None; qty = None
445
-
446
- right_nums = []
447
- for i in numeric_idxs:
448
- v = normalize_num_str(tokens[i])
449
- if v is not None:
450
- right_nums.append(float(v))
451
- right_nums = sorted(list({int(x) if float(x).is_integer() else x for x in right_nums}), reverse=True)
452
-
453
- if len(right_nums) >= 2:
454
- cand = right_nums[1]
455
- if float(cand) > 1 and float(cand) < float(amt):
456
- ratio = float(amt) / float(cand) if cand else None
457
- if ratio:
458
- r = round(ratio)
459
- if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
460
- rate = float(cand)
461
- qty = float(r)
462
- if rate is None and right_nums:
463
- for cand in right_nums:
464
- if cand <= 1.0 or cand >= float(amt):
465
- continue
466
- ratio = float(amt) / float(cand)
467
- r = round(ratio)
468
- if 1 <= r <= 100 and abs(ratio - r) <= max(0.03 * r, 0.15):
469
- rate = float(cand)
470
- qty = float(r)
471
- break
472
-
473
- if qty is None:
474
- qty = 1.0
475
- if rate is None:
476
- rate = 0.0
477
-
478
- parsed_items.append({
479
- "item_name": clean_name_text(name),
480
- "item_amount": float(round(amt, 2)),
481
- "item_rate": float(round(rate, 2)),
482
- "item_quantity": float(qty),
483
- })
484
- return parsed_items
485
-
486
- def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
487
- seen = set()
488
- out = []
489
- for it in items:
490
- nm = re.sub(r"\s+", " ", (it.get("item_name","") or "").lower()).strip()
491
- key = (nm[:120], round(float(it.get("item_amount", 0) or 0), 2))
492
- if key in seen:
493
- continue
494
- seen.add(key)
495
- out.append(it)
496
- return out
497
-
498
- def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
499
- if not txt:
500
- return False
501
- t = re.sub(r"\s+", " ", txt.strip().lower())
502
- if any(h == t for h in HEADER_PHRASES):
503
- return True
504
- hits = sum(1 for k in HEADER_KEYWORDS if k in t)
505
- if hits >= 2:
506
- return True
507
- tokens = re.split(r"[\s\|,/:]+", t)
508
- key_hit_count = sum(1 for tok in tokens if tok in HEADER_KEYWORDS)
509
- if key_hit_count >= 3:
510
- return True
511
- if top_of_page and len(tokens) <= 10 and key_hit_count >= 2:
512
- return True
513
- if ("rate" in t or "net" in t) and "amt" in t and not any(ch.isdigit() for ch in t):
514
- return True
515
- if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
516
- return True
517
- return False
518
-
519
- def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
520
- name = (item.get("item_name") or "").strip()
521
- if not name:
522
- return False
523
- ln = name.lower()
524
- for h in known_page_headers:
525
- if h and h.strip() and h.strip().lower() in ln:
526
- return False
527
- if FOOTER_KEYWORDS.search(ln):
528
- return False
529
- amt = float(item.get("item_amount", 0) or 0)
530
- if amt <= 0:
531
- return False
532
- # sanity: weird giant amounts are likely OCR garbage
533
- if amt > 10_000_000:
534
- return False
535
- rate = float(item.get("item_rate", 0) or 0)
536
- if rate and rate > amt * 20 and amt < 10000:
537
- return False
538
- return True
539
 
540
  # -------------------------------------------------------------------------
541
- # Gemini refinement (deterministic, optional)
542
  # -------------------------------------------------------------------------
543
- def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str,int]]:
544
- zero_usage = {"total_tokens":0, "input_tokens":0, "output_tokens":0}
545
- if not GEMINI_API_KEY or genai is None:
546
- return page_items, zero_usage
547
- try:
548
- safe_text = sanitize_ocr_text(page_text)[:3000]
549
- system_prompt = (
550
- "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
551
- "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
552
- "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
553
- )
554
- user_prompt = f"page_text='''{safe_text}'''\nitems={json.dumps(page_items, ensure_ascii=False)}\nReturn only the cleaned JSON array."
555
- model = genai.GenerativeModel(GEMINI_MODEL_NAME)
556
- response = model.generate_content(
557
- [
558
- {"role":"system","parts":[system_prompt]},
559
- {"role":"user","parts":[user_prompt]},
560
- ],
561
- temperature=0.0,
562
- max_output_tokens=1000,
563
- )
564
- raw = response.text.strip()
565
- if raw.startswith("```"):
566
- raw = re.sub(r"^```[a-zA-Z]*", "", raw)
567
- raw = re.sub(r"```$", "", raw).strip()
568
- parsed = json.loads(raw)
569
- out = []
570
- for obj in parsed:
571
- try:
572
- out.append({
573
- "item_name": str(obj.get("item_name","")).strip(),
574
- "item_amount": float(obj.get("item_amount",0.0)),
575
- "item_rate": float(obj.get("item_rate",0.0) or 0.0),
576
- "item_quantity": float(obj.get("item_quantity",1.0) or 1.0),
577
- })
578
- except Exception:
579
- continue
580
- return out, zero_usage
581
- except Exception as e:
582
- logger.warning("Gemini refine failed: %s", e)
583
- return page_items, zero_usage
584
-
585
- # -------------------------------------------------------------------------
586
- # OCR engine implementations
587
- # -------------------------------------------------------------------------
588
- def ocr_with_textract(file_bytes: bytes) -> List[Dict[str, Any]]:
589
- """
590
- Use Amazon Textract AnalyzeExpense on each page image. Returns list of pages:
591
- [{"page_no": "1", "page_type": "...", "bill_items": [...]}]
592
- Note: Textract AnalyzeExpense returns structured expense/line-item data; we map it to our output.
593
- """
594
- pages_out = []
595
- client = textract_client()
596
-
597
- # Convert bytes to images and call AnalyzeExpense for each page (synchronous).
598
- try:
599
- images = convert_from_bytes(file_bytes)
600
- except Exception as e:
601
- logger.warning("Textract fallback: PDF->image conversion failed: %s", e)
602
- return []
603
-
604
- for idx, pil_img in enumerate(images, start=1):
605
- bio = BytesIO()
606
- pil_img.save(bio, format="JPEG", quality=90)
607
- img_bytes = bio.getvalue()
608
- try:
609
- resp = client.analyze_expense(Document={'Bytes': img_bytes})
610
- except (BotoCoreError, ClientError) as e:
611
- logger.exception("Textract analyze_expense failed: %s", e)
612
- pages_out.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
613
- continue
614
- # Parse Textract response
615
- items = []
616
- line_item_groups = resp.get("ExpenseDocuments", [])
617
- if line_item_groups:
618
- for doc in line_item_groups:
619
- groups = doc.get("LineItemGroups", [])
620
- for g in groups:
621
- for li in g.get("LineItems", []):
622
- # Each line item has LineItemExpenseFields list
623
- name_parts = []
624
- amount = None
625
- rate = None
626
- qty = None
627
- for f in li.get("LineItemExpenseFields", []):
628
- tname = f.get("Type", {}).get("Text", "") or ""
629
- v = f.get("ValueDetection", {}).get("Text", "") or ""
630
- txt_l = tname.lower()
631
- if txt_l in ("item", "description", "item description", "service"):
632
- name_parts.append(v)
633
- elif txt_l in ("amount", "price", "total"):
634
- maybe = normalize_num_str(v)
635
- if maybe is not None:
636
- amount = maybe
637
- elif txt_l in ("quantity", "qty"):
638
- maybe = normalize_num_str(v)
639
- if maybe is not None:
640
- qty = maybe
641
- elif txt_l in ("rate", "unit price", "price per unit"):
642
- maybe = normalize_num_str(v)
643
- if maybe is not None:
644
- rate = maybe
645
- else:
646
- # Heuristic: if value looks numeric and field name is empty, try assign
647
- if is_numeric_token(v) and amount is None:
648
- maybe = normalize_num_str(v)
649
- if maybe is not None:
650
- amount = maybe
651
- elif v and not is_numeric_token(v):
652
- name_parts.append(v)
653
- name = " ".join(name_parts).strip() or "UNKNOWN"
654
- # Post-process amount/rate/qty
655
- if amount is None:
656
- # try to find from summary fields
657
- pass
658
- if qty is None and rate is not None and amount is not None and rate != 0:
659
- try:
660
- qty = round(amount / rate, 2)
661
- except Exception:
662
- qty = 1.0
663
- if qty is None:
664
- qty = 1.0
665
- if rate is None and qty and qty != 0 and amount is not None:
666
- try:
667
- rate = round(amount / qty, 2)
668
- except Exception:
669
- rate = 0.0
670
- if amount is None:
671
- amount = 0.0
672
- items.append({
673
- "item_name": clean_name_text(name),
674
- "item_amount": float(round(amount, 2)),
675
- "item_rate": float(round(rate or 0.0, 2)),
676
- "item_quantity": float(qty or 1.0),
677
- })
678
- # Fallback: if Textract returned no structured line items, attempt to extract lines from Blocks
679
- if not items:
680
- # try to extract lines from DocumentMetadata / Blocks
681
- blocks = resp.get("Blocks", [])
682
- lines = []
683
- for b in blocks:
684
- if b.get("BlockType") == "LINE":
685
- lines.append(b.get("Text", ""))
686
- # naive fallback: group lines that contain numbers
687
- for ln in lines:
688
- tokens = ln.split()
689
- numbers = [t for t in tokens if is_numeric_token(t)]
690
- if numbers:
691
- name = " ".join([t for t in tokens if not is_numeric_token(t)])
692
- amount = None
693
- for t in reversed(tokens):
694
- if is_numeric_token(t):
695
- v = normalize_num_str(t)
696
- if v is not None:
697
- amount = v
698
- break
699
- if amount:
700
- items.append({
701
- "item_name": clean_name_text(name or "UNKNOWN"),
702
- "item_amount": float(round(amount, 2)),
703
- "item_rate": 0.0,
704
- "item_quantity": 1.0,
705
- })
706
- # Filter & dedupe
707
- items = [it for it in items if final_item_filter(it, [])]
708
- items = dedupe_items(items)
709
- page_type = "Bill Detail"
710
- items_text = " ".join([it["item_name"] for it in items]).lower()
711
- if "pharmacy" in items_text or "tablet" in items_text or "medicine" in items_text:
712
- page_type = "Pharmacy"
713
- pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": items})
714
- return pages_out
715
-
716
- def ocr_with_google_vision(file_bytes: bytes) -> List[Dict[str,Any]]:
717
  """
718
- Google Vision Document OCR pipeline. Returns parsed pages (same format).
 
719
  """
720
- client = vision_client()
721
- pages_out = []
722
- try:
723
- images = convert_from_bytes(file_bytes)
724
- except Exception as e:
725
- logger.warning("Vision pipeline: PDF->image conversion failed: %s", e)
726
- return []
727
- for idx, pil_img in enumerate(images, start=1):
728
- bio = BytesIO()
729
- pil_img.save(bio, format="JPEG", quality=90)
730
- content = bio.getvalue()
731
- image = vision.Image(content=content)
732
- resp = client.document_text_detection(image=image)
733
- text = resp.full_text_annotation.text if resp.full_text_annotation else ""
734
- # Build pseudo-cells from words using bounding boxes if available
735
- cells = []
736
- for page in (resp.full_text_annotation.pages or []):
737
- for block in page.blocks:
738
- for para in block.paragraphs:
739
- for word in para.words:
740
- word_text = "".join([sym.text for sym in word.symbols])
741
- bbox = word.bounding_box
742
- # compute approximate left/top/width/height
743
- xs = [v.x for v in bbox.vertices]
744
- ys = [v.y for v in bbox.vertices]
745
- left = int(min(xs)) if xs else 0
746
- top = int(min(ys)) if ys else 0
747
- width = int(max(xs)-min(xs)) if xs else 0
748
- height = int(max(ys)-min(ys)) if ys else 0
749
- center_x = left + width/2.0
750
- center_y = top + height/2.0
751
- cells.append({"text": word_text, "conf": -1.0, "left": left, "top": top, "width": width, "height": height, "center_x": center_x, "center_y": center_y})
752
- # row grouping + parse using shared functions
753
- rows = group_cells_into_rows(cells, y_tolerance=14)
754
- parsed_items = parse_rows_with_columns(rows, cells)
755
- cleaned = [p for p in parsed_items if final_item_filter(p, [])]
756
- cleaned = dedupe_items(cleaned)
757
- page_type = "Bill Detail"
758
- page_txt = text.lower()
759
- if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
760
- page_type = "Pharmacy"
761
- pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
762
- return pages_out
763
 
764
- def ocr_with_tesseract(file_bytes: bytes) -> List[Dict[str,Any]]:
765
- """Tesseract pipeline using your preprocessing + TSV + parsing functions."""
 
 
 
766
  pages_out = []
 
767
  try:
768
  images = convert_from_bytes(file_bytes)
769
- except Exception as e:
770
- # maybe it's a single image format (jpg/png)
771
  try:
772
  im = Image.open(BytesIO(file_bytes))
773
  images = [im]
774
- except Exception:
775
- logger.exception("Tesseract pipeline can't open file: %s", e)
776
  return []
 
777
  for idx, pil_img in enumerate(images, start=1):
778
  try:
 
779
  proc = preprocess_image_for_tesseract(pil_img)
780
  cells = image_to_tsv_cells(proc)
781
  rows = group_cells_into_rows(cells, y_tolerance=12)
782
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
783
- # header prefilter
784
- rows_filtered = []
785
- for i, (r, rt) in enumerate(zip(rows, rows_texts)):
786
- top_flag = (i < 6)
787
- rt_norm = sanitize_ocr_text(rt).lower()
788
- if looks_like_header_text(rt_norm, top_of_page=top_flag):
789
- continue
790
- if any(h in rt_norm for h in HEADER_PHRASES):
 
 
 
 
 
 
 
 
 
 
 
791
  continue
792
- rows_filtered.append(r)
793
- rows = rows_filtered
794
- parsed_items = parse_rows_with_columns(rows, cells)
795
- refined_items, _ = refine_with_gemini(parsed_items, sanitize_ocr_text(" ".join(rows_texts)))
796
- cleaned = [p for p in refined_items if final_item_filter(p, [])]
797
- cleaned = dedupe_items(cleaned)
798
- page_type = "Bill Detail"
799
- page_txt = " ".join(rows_texts).lower()
800
- if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
801
- page_type = "Pharmacy"
802
- pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  except Exception as e:
804
- logger.exception("Tesseract parse page failed: %s", e)
805
- pages_out.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
 
 
 
 
 
 
 
806
  return pages_out
807
 
808
  # -------------------------------------------------------------------------
809
- # Main endpoint
810
  # -------------------------------------------------------------------------
811
- @app.post("/extract-bill-data")
 
 
 
 
 
 
 
 
 
 
 
 
 
812
  async def extract_bill_data(payload: BillRequest):
 
813
  doc_url = payload.document
814
  file_bytes = None
815
-
816
- # local file support
817
  if doc_url.startswith("file://"):
818
  local_path = doc_url.replace("file://", "")
819
  try:
820
  with open(local_path, "rb") as f:
821
  file_bytes = f.read()
822
  except Exception as e:
823
- return {"is_success": False, "error": f"Local file read error: {e}",
824
- "data": {"pagewise_line_items": [], "total_item_count": 0}, "token_usage": {"total_tokens":0,"input_tokens":0,"output_tokens":0}}
 
 
 
 
 
 
825
  else:
826
  try:
827
  headers = {"User-Agent": "Mozilla/5.0"}
828
  resp = requests.get(doc_url, headers=headers, timeout=30)
829
  if resp.status_code != 200:
830
- return {"is_success": False, "error": f"Download failed status={resp.status_code}",
831
- "data": {"pagewise_line_items": [], "total_item_count": 0}, "token_usage": {"total_tokens":0,"input_tokens":0,"output_tokens":0}}
 
 
 
 
 
 
832
  file_bytes = resp.content
833
  except Exception as e:
834
- return {"is_success": False, "error": f"HTTP error: {e}",
835
- "data": {"pagewise_line_items": [], "total_item_count": 0}, "token_usage": {"total_tokens":0,"input_tokens":0,"output_tokens":0}}
836
-
 
 
 
 
 
 
837
  if not file_bytes:
838
- return {"is_success": False, "error": "No file bytes found", "data": {"pagewise_line_items": [], "total_item_count": 0}, "token_usage": {"total_tokens":0,"input_tokens":0,"output_tokens":0}}
839
-
840
- pages = []
841
- token_usage = {"total_tokens":0,"input_tokens":0,"output_tokens":0}
842
- engine = OCR_ENGINE
843
- logger.info("Using OCR engine: %s", engine)
844
-
 
 
 
 
845
  try:
846
- if engine == "textract":
847
- pages = ocr_with_textract(file_bytes)
848
- elif engine == "vision":
849
- pages = ocr_with_google_vision(file_bytes)
850
- else:
851
  pages = ocr_with_tesseract(file_bytes)
852
- except Exception as e:
853
- logger.exception("OCR engine failed: %s", e)
854
- # fallback to tesseract pipeline
855
- try:
856
- pages = ocr_with_tesseract(file_bytes)
857
- except Exception as e:
858
- logger.exception("Tesseract fallback also failed: %s", e)
859
- pages = []
860
-
861
- total_item_count = sum(len(p.get("bill_items", [])) for p in pages)
862
- if not GEMINI_API_KEY or genai is None:
863
- token_usage["warning_no_gemini"] = 1
864
-
865
- return {"is_success": True, "token_usage": token_usage, "data": {"pagewise_line_items": pages, "total_item_count": total_item_count}}
866
-
867
- # -------------------------------------------------------------------------
868
- # Debug endpoint to return tsv cell info for inspection
869
- # -------------------------------------------------------------------------
870
- @app.post("/debug-tsv")
871
- async def debug_tsv(payload: BillRequest):
872
- doc_url = payload.document
873
- try:
874
- if doc_url.startswith("file://"):
875
- local_path = doc_url.replace("file://", "")
876
- with open(local_path, "rb") as f:
877
- file_bytes = f.read()
878
  else:
879
- resp = requests.get(doc_url, timeout=20)
880
- resp.raise_for_status()
881
- file_bytes = resp.content
882
  except Exception as e:
883
- return {"error": f"Download failed: {e}"}
884
- try:
885
- imgs = convert_from_bytes(file_bytes)
886
- img = imgs[0]
887
- except Exception:
888
- try:
889
- img = Image.open(BytesIO(file_bytes)).convert("RGB")
890
- except Exception as e:
891
- return {"error": f"Image conversion failed: {e}"}
892
- proc = preprocess_image_for_tesseract(img)
893
- cells = image_to_tsv_cells(proc)
894
- return {"cells": cells}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
 
896
  @app.get("/")
897
- def health_check():
898
- msg = f"Bill extraction API live. OCR_ENGINE={OCR_ENGINE}"
899
- if not GEMINI_API_KEY or genai is None:
900
- msg += " (Gemini not configured — LLM refinement skipped.)"
901
- return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url or file://path>'}"}
 
 
 
 
 
 
 
1
+ # Enhanced Bill Extraction API
2
+ # Designed for Bajaj Datathon: accurate line item + subtotal + total extraction
3
+ #
4
+ # Key improvements:
5
+ # 1. Explicit subtotal/total detection and preservation
6
+ # 2. Double-count prevention via fingerprinting
7
+ # 3. Item-sum vs bill-total validation
8
+ # 4. Confidence scoring and anomaly detection
9
+ # 5. Enhanced preprocessing for table structures
10
+ # 6. Gemini-powered structural validation
11
 
12
  import os
13
  import re
14
  import json
15
  import logging
16
  from io import BytesIO
17
+ from typing import List, Dict, Any, Optional, Tuple, Set
18
+ from dataclasses import dataclass, asdict
19
+ from collections import defaultdict
20
 
21
  from fastapi import FastAPI
22
  from pydantic import BaseModel
 
40
  except Exception:
41
  vision = None
42
 
 
43
  try:
44
  import google.generativeai as genai
45
  except Exception:
46
  genai = None
47
 
48
  # -------------------------------------------------------------------------
49
+ # Configuration
50
  # -------------------------------------------------------------------------
51
+ OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
52
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
53
+ GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.0-flash")
54
  AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
55
+ TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6")
56
 
57
  logging.basicConfig(level=logging.INFO)
58
+ logger = logging.getLogger("bill-extractor-v2")
59
 
60
  if GEMINI_API_KEY and genai is not None:
61
  try:
 
64
  except Exception as e:
65
  logger.warning("Gemini config failed: %s", e)
66
 
67
# Lazy clients: module-level singletons, created on first use by the
# textract_client() / vision_client() accessors below so that importing
# this module never requires cloud credentials.
_textract_client = None
_vision_client = None
71
def textract_client():
    """Return the shared Amazon Textract client, creating it on first use.

    Raises:
        RuntimeError: if boto3 is not installed.
    """
    global _textract_client
    if _textract_client is not None:
        return _textract_client
    if boto3 is None:
        raise RuntimeError("boto3 not installed")
    _textract_client = boto3.client("textract", region_name=AWS_REGION)
    return _textract_client
78
 
 
 
79
def vision_client():
    """Return the shared Google Vision client, creating it on first use.

    Raises:
        RuntimeError: if google-cloud-vision is not installed.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client
    if vision is None:
        raise RuntimeError("google-cloud-vision not installed")
    _vision_client = vision.ImageAnnotatorClient()
    return _vision_client
  return _vision_client
86
 
87
  # -------------------------------------------------------------------------
88
+ # Data Models
89
  # -------------------------------------------------------------------------
90
@dataclass
class BillLineItem:
    """One extracted bill row: description, quantity, rate, amount.

    `source_row` and `is_description_continuation` are OCR-debugging
    metadata and are excluded from the serialized form.
    """
    item_name: str
    item_quantity: float = 1.0
    item_rate: float = 0.0
    item_amount: float = 0.0
    confidence: float = 1.0                     # OCR confidence, 0-1
    source_row: str = ""                        # raw OCR text (debug only)
    is_description_continuation: bool = False   # wrapped-description marker

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the item, dropping the debug-only fields."""
        hidden = ("source_row", "is_description_continuation")
        return {k: v for k, v in asdict(self).items() if k not in hidden}
106
+
107
@dataclass
class BillTotal:
    """Summary amounts detected on a bill page; None means 'not detected'."""
    subtotal_amount: Optional[float] = None
    tax_amount: Optional[float] = None
    discount_amount: Optional[float] = None
    final_total_amount: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize, omitting fields that were never detected (None)."""
        out: Dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value is not None:
                out[key] = value
        return out
117
+
118
@dataclass
class ExtractedPage:
    """Everything extracted from a single page of the bill."""
    page_no: int
    page_type: str                   # e.g. "Bill Detail"
    line_items: List[BillLineItem]
    bill_totals: BillTotal
    page_confidence: float = 1.0     # mean item confidence for the page

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the page (and its nested items/totals) into plain dicts."""
        serialized_items = [li.to_dict() for li in self.line_items]
        return {
            "page_no": self.page_no,
            "page_type": self.page_type,
            "line_items": serialized_items,
            "bill_totals": self.bill_totals.to_dict(),
            "page_confidence": round(self.page_confidence, 3),
        }
135
 
136
# -------------------------------------------------------------------------
# Regular Expressions (Enhanced)
# -------------------------------------------------------------------------
# Loose number matcher: optional sign, digits with optional comma groups and
# an optional decimal part. Used only to *detect* numeric tokens; the actual
# value parsing goes through normalize_num_str().
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")

# Total/Subtotal keywords (improved detection). Item rows whose cleaned name
# matches these are excluded from line items in the OCR pipeline.
TOTAL_KEYWORDS = re.compile(
    r"\b(grand\s+total|net\s+payable|total\s+(?:amount|due)|amount\s+payable|bill\s+amount|"
    r"final\s+(?:amount|total)|balance\s+due|amount\s+due|total\s+payable|payable)\b",
    re.I
)
# Sub-total rows, covering common spacing/hyphenation OCR variants.
SUBTOTAL_KEYWORDS = re.compile(
    r"\b(sub\s*[\-\s]?total|subtotal|sub\s+total|items\s+total|line\s+items\s+total)\b",
    re.I
)
# Indian (GST/SGST/CGST/IGST) plus generic tax line markers.
TAX_KEYWORDS = re.compile(
    r"\b(tax|gst|vat|sgst|cgst|igst|sales\s+tax|service\s+tax)\b",
    re.I
)
DISCOUNT_KEYWORDS = re.compile(
    r"\b(discount|rebate|deduction)\b",
    re.I
)
# Page furniture (page numbers, print stamps, signatures, T&Cs): rows that
# match this and contain no numbers are skipped during row parsing.
FOOTER_KEYWORDS = re.compile(
    r"(page|printed\s+on|printed|date|time|signature|authorized|terms|conditions)",
    re.I
)

# Column-header vocabulary. NOTE(review): not referenced by the parsing code
# visible here — presumably kept for header-row detection; verify before
# removing.
HEADER_KEYWORDS = [
    "description", "qty", "qty/hrs", "hrs", "rate", "unit price", "discount",
    "net", "amt", "amount", "price", "total", "sl.no", "s.no", "item", "service",
    "consultation", "patient", "invoice", "bill", "charges"
]
 
 
 
 
169
 
170
+ # -------------------------------------------------------------------------
171
+ # Text Cleaning & Normalization
172
+ # -------------------------------------------------------------------------
173
def sanitize_ocr_text(s: Optional[str]) -> str:
    """Normalize raw OCR output: dashes, whitespace, non-ASCII, common typos.

    Returns "" for None/empty input.
    """
    if not s:
        return ""
    # Unicode dashes / non-breaking space -> ASCII equivalents.
    for src, dst in (("\u2014", "-"), ("\u2013", "-"), ("\u00A0", " ")):
        s = s.replace(src, dst)
    # Drop anything outside printable ASCII (tab/LF/CR are kept).
    s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
    # Normalize line endings, then collapse runs of spaces/tabs.
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    # Frequent OCR misreads of table headers.
    s = re.sub(r"\b(qiy|qty|oty|gty)\b", "qty", s, flags=re.I)
    s = re.sub(r"\b(deseription|descriptin|desription)\b", "description", s, flags=re.I)
    return s.strip()
186
 
187
def normalize_num_str(s: Optional[str], allow_zero: bool = False) -> Optional[float]:
    """Parse an OCR token into a float.

    Strips currency symbols and thousands commas, and treats
    accounting-style parentheses as a negative sign. Returns None when
    the token holds no usable number, or when the value is zero and
    `allow_zero` is False.
    """
    if s is None:
        return None
    text = str(s).strip()
    if not text:
        return None

    # "(123.45)" is accounting notation for -123.45.
    is_negative = text.startswith("(") and text.endswith(")")
    if is_negative:
        text = text[1:-1]

    # Keep only digits, sign, separators; then drop thousands commas.
    text = re.sub(r"[^\d\-\+\,\.\(\)]", "", text).replace(",", "")
    if text in ("", "-", "+"):
        return None

    try:
        value = float(text)
    except Exception:
        return None
    if is_negative:
        value = -value
    if value == 0 and not allow_zero:
        return None
    return value
216
 
217
def is_numeric_token(t: Optional[str]) -> bool:
    """True when the token contains something that looks like a number."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
220
 
221
def clean_item_name(s: str) -> str:
    """Tidy an item description.

    Normalizes unicode dashes and whitespace, strips leading/trailing
    punctuation, and undoes the common 'DR' -> 'OR' OCR misread.
    """
    normalized = s.replace("—", "-").replace("–", "-")
    normalized = re.sub(r"\s+", " ", normalized)
    normalized = normalized.strip(" -:,.=()[]{}|\\")
    # Tesseract frequently reads the honorific "DR" as "OR".
    normalized = re.sub(r"\bOR\b", "DR", normalized)
    return normalized.strip()
228
 
229
  # -------------------------------------------------------------------------
230
+ # Item Fingerprinting (for deduplication)
231
+ # -------------------------------------------------------------------------
232
def item_fingerprint(item: BillLineItem) -> Tuple[str, float]:
    """Dedup key: normalized lowercase name (capped at 100 chars) plus
    the amount rounded to 2 decimals."""
    normalized_name = re.sub(r"\s+", " ", item.item_name.lower()).strip()[:100]
    rounded_amount = round(float(item.item_amount), 2)
    return (normalized_name, rounded_amount)
237
+
238
def dedupe_items_advanced(items: List[BillLineItem]) -> List[BillLineItem]:
    """Remove duplicate rows and merge wrapped descriptions.

    Pass 1 keeps one item per fingerprint, preferring the copy with the
    higher OCR confidence. Pass 2 folds description-continuation rows
    into the preceding item when their amounts match within a cent.
    """
    if not items:
        return []

    # Pass 1: best item per fingerprint.
    best_by_fp: Dict[Tuple, BillLineItem] = {}
    for candidate in items:
        key = item_fingerprint(candidate)
        existing = best_by_fp.get(key)
        if existing is None or candidate.confidence > existing.confidence:
            best_by_fp[key] = candidate

    # Pass 2: merge likely description wrap-arounds into the prior row.
    merged: List[BillLineItem] = []
    for candidate in best_by_fp.values():
        if (candidate.is_description_continuation and merged
                and abs(float(merged[-1].item_amount) - float(candidate.item_amount)) < 0.01):
            merged[-1].item_name = (merged[-1].item_name + " " + candidate.item_name).strip()
            continue
        merged.append(candidate)

    return merged
265
+
266
+ # -------------------------------------------------------------------------
267
+ # Total/Subtotal Detection
268
+ # -------------------------------------------------------------------------
269
def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
    """Scan OCR rows for summary amounts.

    Returns (subtotal, tax, discount, final_total); each slot is None
    when no matching keyword row with a parseable number was found.
    Later matching rows overwrite earlier ones.
    """
    subtotal = tax = discount = final_total = None

    for row in rows:
        row_text = " ".join(cell["text"] for cell in row)
        row_lower = row_text.lower()

        # Every parseable number on the row (zeros allowed here).
        amounts = []
        for tok in row_text.split():
            if not is_numeric_token(tok):
                continue
            value = normalize_num_str(tok, allow_zero=True)
            if value is not None:
                amounts.append(value)
        if not amounts:
            continue

        # Heuristic: the summary figure is usually the largest number on
        # the row (beats percentages like "GST 18%").
        amount = max(amounts)

        # Keyword priority: final total wins over subtotal/tax/discount.
        if FINAL_TOTAL_KEYWORDS.search(row_lower):
            final_total = amount
        elif SUBTOTAL_KEYWORDS.search(row_lower):
            subtotal = amount
        elif TAX_KEYWORDS.search(row_lower):
            tax = amount
        elif DISCOUNT_KEYWORDS.search(row_lower):
            discount = amount

    return subtotal, tax, discount, final_total
314
+
315
# Keywords marking the bill's final/grand-total row. Defined after
# detect_totals_in_rows(), which is safe: the name is resolved at call
# time, not at function-definition time.
FINAL_TOTAL_KEYWORDS = re.compile(
    r"\b(grand\s+total|final\s+(?:total|amount)|total\s+(?:due|payable|amount)|"
    r"net\s+payable|amount\s+(?:due|payable)|balance\s+due|payable)\b",
    re.I
)
320
+
321
+ # -------------------------------------------------------------------------
322
+ # Image Preprocessing
323
  # -------------------------------------------------------------------------
324
def pil_to_cv2(img: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV ndarray.

    Grayscale (2-D) arrays are returned unchanged; RGB images are
    converted to OpenCV's BGR channel order.
    """
    arr = np.array(img)
    # Single-channel image: no channel reordering needed.
    if arr.ndim == 2:
        return arr
    return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
330
 
331
def preprocess_image_for_tesseract(pil_img: Image.Image, target_w: int = 1500) -> Any:
    """Binarize and clean a page image so Tesseract handles tables well.

    Pipeline: upscale small pages to `target_w`, grayscale, denoise,
    adaptive threshold (Otsu fallback), then a light morphological
    close/open pass to fill pinholes and drop speckles.
    """
    pil_img = pil_img.convert("RGB")
    width, height = pil_img.size

    # Small scans lose glyph detail; upscale to a working width.
    if width < target_w:
        factor = target_w / float(width)
        new_size = (int(width * factor), int(height * factor))
        pil_img = pil_img.resize(new_size, Image.LANCZOS)

    cv_img = pil_to_cv2(pil_img)
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY) if cv_img.ndim == 3 else cv_img

    gray = cv2.fastNlMeansDenoising(gray, h=10)

    # Adaptive threshold copes with uneven lighting/shadows; Otsu fallback.
    try:
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 41, 15)
    except Exception:
        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Tiny-kernel close then open: fill pinholes, remove speckles.
    kernel = np.ones((2, 2), np.uint8)
    bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)

    return bw
365
 
366
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
367
+ """Extract OCR cells from image"""
368
  try:
369
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config=f"--psm {TESSERACT_PSM}")
370
  except Exception:
371
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
372
+
373
  cells = []
374
  n = len(o.get("text", []))
375
  for i in range(n):
 
379
  txt = str(raw).strip()
380
  if not txt:
381
  continue
382
+
383
  try:
384
  conf_raw = o.get("conf", [])[i]
385
  conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
386
  except Exception:
387
  conf = -1.0
388
+
389
  left = int(o.get("left", [0])[i])
390
  top = int(o.get("top", [0])[i])
391
  width = int(o.get("width", [0])[i])
392
  height = int(o.get("height", [0])[i])
393
  center_y = top + height / 2.0
394
  center_x = left + width / 2.0
395
+
396
+ cells.append({
397
+ "text": txt,
398
+ "conf": max(0.0, conf) / 100.0, # normalize to 0-1
399
+ "left": left, "top": top, "width": width, "height": height,
400
+ "center_x": center_x, "center_y": center_y
401
+ })
402
+
403
  return cells
404
 
405
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
406
+ """Group cells by horizontal position (rows)"""
407
  if not cells:
408
  return []
409
+
410
  sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
411
  rows = []
412
  current = [sorted_cells[0]]
413
  last_y = sorted_cells[0]["center_y"]
414
+
415
  for c in sorted_cells[1:]:
416
  if abs(c["center_y"] - last_y) <= y_tolerance:
417
  current.append(c)
 
420
  rows.append(sorted(current, key=lambda cc: cc["left"]))
421
  current = [c]
422
  last_y = c["center_y"]
423
+
424
  if current:
425
  rows.append(sorted(current, key=lambda cc: cc["left"]))
426
+
427
  return rows
428
 
429
+ # -------------------------------------------------------------------------
430
+ # Column Detection (Enhanced)
431
+ # -------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
433
+ """Detect x-positions of numeric columns"""
434
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
435
  if not xs:
436
  return []
437
+
438
+ xs = sorted(set(xs))
439
  if len(xs) == 1:
440
+ return xs
441
+
442
+ # Cluster columns by gap analysis
443
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
444
  mean_gap = float(np.mean(gaps))
445
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
446
+ gap_thresh = max(35.0, mean_gap + 0.7 * std_gap)
447
+
448
  clusters = []
449
  curr = [xs[0]]
450
  for i, g in enumerate(gaps):
 
454
  else:
455
  curr.append(xs[i+1])
456
  clusters.append(curr)
457
+
458
  centers = [float(np.median(c)) for c in clusters]
459
  if len(centers) > max_columns:
460
  centers = centers[-max_columns:]
461
+
462
  return sorted(centers)
463
 
464
def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
    """Index of the column whose center is nearest to token_x.

    Returns None when no columns were detected.
    """
    if not column_centers:
        return None
    gaps = [abs(token_x - center) for center in column_centers]
    return int(np.argmin(gaps))
470
 
471
  # -------------------------------------------------------------------------
472
+ # Row Parsing (Enhanced for accuracy)
473
  # -------------------------------------------------------------------------
474
def parse_rows_with_columns(
    rows: List[List[Dict[str, Any]]],
    page_cells: List[Dict[str, Any]],
    page_text: str = ""
) -> List[BillLineItem]:
    """Turn OCR rows into BillLineItem records.

    Uses the page-wide numeric column layout (detect_numeric_columns) to
    map numbers to amount/rate/qty; when no columns are detected, falls
    back to "rightmost number on the row is the amount".

    Args:
        rows: cells grouped into visual rows (group_cells_into_rows).
        page_cells: all OCR cells on the page, used for column detection.
        page_text: full page text; currently unused, kept for caller
            compatibility.

    Returns:
        Parsed line items (only rows with a positive amount).
    """
    items: List[BillLineItem] = []
    column_centers = detect_numeric_columns(page_cells, max_columns=6)

    for row in rows:
        tokens = [c["text"] for c in row]
        row_text = " ".join(tokens)
        row_lower = row_text.lower()

        # Skip page furniture (footers/headers) that carries no numbers.
        if FOOTER_KEYWORDS.search(row_lower) and not any(is_numeric_token(t) for t in tokens):
            continue

        # A line item must contain at least one numeric token.
        if not any(is_numeric_token(t) for t in tokens):
            continue

        # Distinct positive numbers on the row, largest first.
        numeric_values = []
        for t in tokens:
            if is_numeric_token(t):
                v = normalize_num_str(t, allow_zero=False)
                if v is not None:
                    numeric_values.append(float(v))
        if not numeric_values:
            continue
        numeric_values = sorted(set(numeric_values), reverse=True)

        if column_centers:
            # --- Column-aware parsing ------------------------------------
            left_text_parts = []
            numeric_buckets: Dict[int, list] = {i: [] for i in range(len(column_centers))}

            for c in row:
                t = c["text"]
                cx = c["center_x"]
                conf = c.get("conf", 1.0)
                if is_numeric_token(t):
                    col_idx = assign_token_to_column(cx, column_centers)
                    if col_idx is None:
                        col_idx = len(column_centers) - 1
                    numeric_buckets[col_idx].append((t, conf))
                else:
                    left_text_parts.append(t)

            item_name = " ".join(left_text_parts).strip()
            item_name = clean_item_name(item_name) if item_name else "UNKNOWN"

            num_cols = len(column_centers)
            amount = None
            rate = None
            qty = None

            # Rightmost column is usually the line total.
            if num_cols >= 1:
                bucket = numeric_buckets.get(num_cols - 1, [])
                if bucket:
                    amount = normalize_num_str(bucket[-1][0], allow_zero=False)

            if amount is None:
                # Fallback: largest positive value on the row.
                for v in numeric_values:
                    if v > 0:
                        amount = v
                        break

            # Second-from-right: unit rate; third-from-right: quantity.
            if num_cols >= 2:
                bucket = numeric_buckets.get(num_cols - 2, [])
                if bucket:
                    rate = normalize_num_str(bucket[-1][0], allow_zero=False)
            if num_cols >= 3:
                bucket = numeric_buckets.get(num_cols - 3, [])
                if bucket:
                    qty = normalize_num_str(bucket[-1][0], allow_zero=False)

            # Only the amount known: look for a candidate rate that divides
            # it into a small integer count (within 15% rounding slack).
            if amount and not qty and not rate and numeric_values:
                for cand in numeric_values:
                    if cand <= 0.1 or cand >= amount:
                        continue
                    ratio = amount / cand
                    r = round(ratio)
                    if 1 <= r <= 100 and abs(ratio - r) <= 0.15 * r:
                        qty = float(r)
                        rate = cand
                        break

            # Derive whichever of qty/rate is still missing.
            # FIX: the previous version also had an
            # `elif amount and qty and rate is None` branch here; it was
            # unreachable (the first condition subsumes it) and is removed.
            if qty and rate is None and amount and amount != 0:
                rate = amount / qty
            elif rate and qty is None and amount and amount != 0:
                qty = amount / rate

            # Defaults for anything still unknown.
            if qty is None:
                qty = 1.0
            if rate is None:
                rate = 0.0
            if amount is None:
                amount = qty * rate if qty and rate else 0.0

            if amount > 0:
                confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
                items.append(BillLineItem(
                    item_name=item_name,
                    item_quantity=float(qty),
                    item_rate=float(round(rate, 2)),
                    item_amount=float(round(amount, 2)),
                    confidence=min(1.0, max(0.0, confidence)),
                    source_row=row_text,
                ))
        else:
            # --- Fallback: no column structure detected -------------------
            numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
            if not numeric_idxs:
                continue

            last = numeric_idxs[-1]
            amount = normalize_num_str(tokens[last], allow_zero=False)
            if amount is None:
                continue

            name = " ".join(tokens[:last]).strip()
            name = clean_item_name(name) if name else "UNKNOWN"

            confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
            items.append(BillLineItem(
                item_name=name,
                item_quantity=1.0,
                item_rate=0.0,
                item_amount=float(round(amount, 2)),
                confidence=min(1.0, max(0.0, confidence)),
                source_row=row_text,
            ))

    return items
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  # -------------------------------------------------------------------------
631
+ # Accuracy Validation
632
  # -------------------------------------------------------------------------
633
def validate_totals(
    line_items: List[BillLineItem],
    bill_totals: BillTotal,
    tolerance_pct: float = 2.0
) -> Tuple[float, str]:
    """Score extraction quality by comparing the item sum to the bill total.

    Returns (score 0-100, human-readable message). A detected final total
    within `tolerance_pct` of the item sum scores 100; larger gaps lose
    5 points per percent of deviation. With no detected total, a neutral
    85 is returned; with no items, 0.
    """
    if not line_items:
        return 0.0, "No line items extracted"

    items_sum = sum(li.item_amount for li in line_items)
    final_total = bill_totals.final_total_amount

    if final_total is None:
        return 85.0, f"No bill total detected; items_sum={items_sum:.2f}"

    diff = abs(items_sum - final_total)
    diff_pct = (diff / final_total * 100) if final_total != 0 else 0.0

    if diff_pct <= tolerance_pct:
        return 100.0, f"✓ Extracted total ({items_sum:.2f}) matches bill total ({final_total:.2f})"

    # Penalize proportionally to the mismatch, floored at zero.
    penalized = max(0.0, 100.0 - (diff_pct * 5))
    return penalized, f"⚠ Mismatch: items_sum={items_sum:.2f}, bill_total={final_total:.2f}, diff={diff_pct:.1f}%"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
+ # -------------------------------------------------------------------------
666
+ # Main OCR Pipelines (Tesseract)
667
+ # -------------------------------------------------------------------------
668
def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
    """Run the full Tesseract pipeline over a PDF or image byte blob.

    Each page is preprocessed, OCR'd into cells, grouped into rows,
    parsed into line items, deduplicated, and stripped of total/subtotal
    rows; summary amounts are detected separately and carried in the
    page's BillTotal. A page that fails is emitted empty rather than
    aborting the document.
    """
    extracted: List[ExtractedPage] = []

    # Treat the bytes as a PDF first; fall back to a single raster image.
    try:
        page_images = convert_from_bytes(file_bytes)
    except Exception:
        try:
            page_images = [Image.open(BytesIO(file_bytes))]
        except Exception as e:
            logger.exception("Tesseract: file open failed: %s", e)
            return []

    for page_no, pil_img in enumerate(page_images, start=1):
        try:
            processed = preprocess_image_for_tesseract(pil_img)
            cells = image_to_tsv_cells(processed)
            rows = group_cells_into_rows(cells, y_tolerance=12)

            full_text = " ".join(" ".join(c["text"] for c in r) for r in rows)

            # Detect summary amounts up front so their rows can be
            # excluded from the line items below.
            subtotal, tax, discount, final_total = detect_totals_in_rows(rows)

            parsed = parse_rows_with_columns(rows, cells, full_text)
            parsed = dedupe_items_advanced(parsed)

            # Keep genuine items only: positive amount, not a totals row.
            kept = []
            for candidate in parsed:
                lowered = candidate.item_name.lower()
                if TOTAL_KEYWORDS.search(lowered) or SUBTOTAL_KEYWORDS.search(lowered):
                    continue
                if candidate.item_amount > 0:
                    kept.append(candidate)

            totals = BillTotal(
                subtotal_amount=subtotal,
                tax_amount=tax,
                discount_amount=discount,
                final_total_amount=final_total,
            )

            accuracy, val_msg = validate_totals(kept, totals)
            logger.info(f"Page {page_no}: {val_msg}")

            mean_conf = np.mean([it.confidence for it in kept]) if kept else 0.8

            extracted.append(ExtractedPage(
                page_no=page_no,
                page_type="Bill Detail",
                line_items=kept,
                bill_totals=totals,
                page_confidence=mean_conf,
            ))

        except Exception as e:
            logger.exception(f"Tesseract page {page_no} failed: %s", e)
            extracted.append(ExtractedPage(
                page_no=page_no,
                page_type="Bill Detail",
                line_items=[],
                bill_totals=BillTotal(),
                page_confidence=0.0,
            ))

    return extracted
746
 
747
  # -------------------------------------------------------------------------
748
+ # FastAPI App
749
  # -------------------------------------------------------------------------
750
# -------------------------------------------------------------------------
# FastAPI App
# -------------------------------------------------------------------------
app = FastAPI(title="Enhanced Bill Extractor (Datathon v2)")

class BillRequest(BaseModel):
    # Request body for /extract-bill-data: the document location.
    document: str  # file://path or http(s) URL

class BillResponse(BaseModel):
    # Response envelope for /extract-bill-data. `data` carries
    # pagewise_line_items + total_item_count; `accuracy_score` compares
    # the item sum against the detected bill total (see validate_totals).
    is_success: bool
    error: Optional[str] = None
    data: Dict[str, Any]
    accuracy_score: float  # 0-100
    validation_message: str
    token_usage: Dict[str, int]
+
763
def _failure_response(error: str, validation_message: str) -> "BillResponse":
    """Build the uniform failure payload (empty data, zero accuracy/tokens)."""
    return BillResponse(
        is_success=False,
        error=error,
        data={"pagewise_line_items": [], "total_item_count": 0},
        accuracy_score=0.0,
        validation_message=validation_message,
        token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
    )

@app.post("/extract-bill-data", response_model=BillResponse)
async def extract_bill_data(payload: BillRequest):
    """Main extraction endpoint.

    Accepts {'document': <http(s) URL or file://path>}, runs the OCR
    pipeline, and returns page-wise line items plus an accuracy score
    derived from comparing the item sum against any detected bill total.
    """
    doc_url = payload.document
    file_bytes = None

    # --- Load the document bytes ------------------------------------------
    if doc_url.startswith("file://"):
        local_path = doc_url.replace("file://", "")
        try:
            with open(local_path, "rb") as f:
                file_bytes = f.read()
        except Exception as e:
            return _failure_response(f"Local file read failed: {e}", "File load failed")
    else:
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            resp = requests.get(doc_url, headers=headers, timeout=30)
            if resp.status_code != 200:
                return _failure_response(
                    f"Download failed (status={resp.status_code})", "HTTP error"
                )
            file_bytes = resp.content
        except Exception as e:
            return _failure_response(f"HTTP error: {e}", "Network error")

    if not file_bytes:
        return _failure_response("No file bytes", "Empty file")

    # --- OCR ----------------------------------------------------------------
    # FIX: only the Tesseract pipeline exists in this build; the previous
    # if/else had two byte-identical branches. Collapse them and warn when
    # a different engine is configured.
    logger.info(f"Processing with engine: {OCR_ENGINE}")
    if OCR_ENGINE != "tesseract":
        logger.warning("OCR_ENGINE=%s not implemented; falling back to tesseract", OCR_ENGINE)
    try:
        pages = ocr_with_tesseract(file_bytes)
    except Exception as e:
        logger.exception("OCR failed: %s", e)
        pages = []

    # --- Aggregate & validate ------------------------------------------------
    total_items = sum(len(p.line_items) for p in pages)
    pages_dict = [p.to_dict() for p in pages]

    # NOTE(review): summing per-page totals assumes each page carries only
    # its own portion; a grand total repeated on every page would be
    # double-counted here — confirm against multi-page samples.
    all_items = [item for p in pages for item in p.line_items]
    all_totals = BillTotal(
        subtotal_amount=sum(p.bill_totals.subtotal_amount or 0 for p in pages) or None,
        tax_amount=sum(p.bill_totals.tax_amount or 0 for p in pages) or None,
        discount_amount=sum(p.bill_totals.discount_amount or 0 for p in pages) or None,
        final_total_amount=sum(p.bill_totals.final_total_amount or 0 for p in pages) or None,
    )

    overall_acc, msg = validate_totals(all_items, all_totals)

    return BillResponse(
        is_success=True,
        data={
            "pagewise_line_items": pages_dict,
            "total_item_count": total_items,
        },
        accuracy_score=overall_acc,
        validation_message=msg,
        token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
    )
855
 
856
  @app.get("/")
857
+ def health():
858
+ return {
859
+ "status": "ok",
860
+ "engine": OCR_ENGINE,
861
+ "message": "Enhanced Bill Extractor (Datathon v2 - High Accuracy Mode)",
862
+ "hint": "POST /extract-bill-data with {'document': '<url or file://path>'}",
863
+ }
864
+
865
+ if __name__ == "__main__":
866
+ import uvicorn
867
+ uvicorn.run(app, host="0.0.0.0", port=8080)