Sathvik-kota committed on
Commit
b086ce8
·
verified ·
1 Parent(s): 80ab573

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +168 -275
app.py CHANGED
@@ -138,8 +138,6 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
138
  n = len(o.get("text", []))
139
  for i in range(n):
140
  raw = o["text"][i]
141
- if raw is None:
142
- continue
143
  txt = str(raw).strip()
144
  if not txt:
145
  continue
@@ -147,13 +145,22 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
147
  conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
148
  except Exception:
149
  conf = -1.0
150
- left = int(o.get("left", [0])[i])
151
- top = int(o.get("top", [0])[i])
152
- width = int(o.get("width", [0])[i])
153
- height = int(o.get("height", [0])[i])
154
  center_y = top + height / 2.0
155
  center_x = left + width / 2.0
156
- cells.append({"text": txt, "conf": conf, "left": left, "top": top, "width": width, "height": height, "center_y": center_y, "center_x": center_x})
 
 
 
 
 
 
 
 
 
157
  return cells
158
 
159
  # ---------------- grouping & merge helpers ----------------
@@ -207,7 +214,8 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
207
  i += 1
208
  return merged
209
 
210
- # ---------------- numeric column detection (conservative) ----------------
 
211
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
212
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
213
  if not xs:
@@ -215,25 +223,23 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) ->
215
  xs = sorted(xs)
216
  if len(xs) == 1:
217
  return [xs[0]]
218
-
219
- # Conservative min gap to avoid merging separate numeric columns
220
- min_gap_px = 50.0
221
- gaps = [xs[i+1] - xs[i] for i in range(len(xs) - 1)]
222
-
223
- clusters = []
224
- curr = [xs[0]]
225
  for i, g in enumerate(gaps):
226
- if g >= min_gap_px:
227
  clusters.append(curr)
228
  curr = [xs[i+1]]
229
  else:
230
  curr.append(xs[i+1])
231
  clusters.append(curr)
232
-
233
  centers = [float(np.median(c)) for c in clusters]
234
  if len(centers) > max_columns:
235
  centers = centers[-max_columns:]
236
  return sorted(centers)
 
237
 
238
  def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
239
  if not column_centers:
@@ -242,111 +248,134 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
242
  return int(np.argmin(distances))
243
 
244
  # ---------------- parsing rows into items ----------------
 
245
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
246
  parsed_items = []
247
  rows = merge_multiline_names(rows)
248
  column_centers = detect_numeric_columns(page_cells, max_columns=4)
 
249
  for row in rows:
250
  tokens = [c["text"] for c in row]
251
  if not tokens:
252
  continue
253
- joined_lower = " ".join(tokens).lower()
254
- if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
255
- continue
256
  if all(not is_numeric_token(t) for t in tokens):
257
  continue
 
 
 
 
 
 
 
 
 
 
258
  if column_centers:
259
  left_text_parts = []
260
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
 
261
  for c in row:
262
  t = c["text"]
263
- cx = c["center_x"]
264
  if is_numeric_token(t):
265
- col_idx = assign_token_to_column(cx, column_centers)
266
  if col_idx is None:
267
- numeric_bucket_map[len(column_centers) - 1].append(t)
268
  else:
269
  numeric_bucket_map[col_idx].append(t)
270
  else:
271
  left_text_parts.append(t)
 
272
  raw_name = " ".join(left_text_parts).strip()
273
- name = clean_name_text(raw_name) if raw_name else ""
 
274
  num_cols = len(column_centers)
275
  def get_bucket(idx):
276
  vals = numeric_bucket_map.get(idx, [])
277
  return vals[-1] if vals else None
278
- amount = None; rate = None; qty = None
279
- if num_cols >= 1:
280
- amount = normalize_num_str(get_bucket(num_cols - 1))
281
- if num_cols >= 2:
282
- rate = normalize_num_str(get_bucket(num_cols - 2))
283
- if num_cols >= 3:
284
- qty = normalize_num_str(get_bucket(num_cols - 3))
285
  if amount is None:
286
  for t in reversed(tokens):
287
  if is_numeric_token(t):
288
  amount = normalize_num_str(t)
289
  break
290
- if (qty is None or qty == 0) and amount is not None and rate:
291
- ratio = amount / rate if rate else None
292
- if ratio is not None:
293
- rounded = round(ratio)
294
- if rounded >= 1 and abs(ratio - rounded) <= max(0.04 * rounded, 0.2):
295
- qty = float(rounded)
296
- if qty is None:
297
- for pt in reversed(left_text_parts):
298
- m = re.match(r"^(\d+)(?:[xX])?$", pt)
299
- if m:
300
- qty = float(m.group(1))
 
301
  break
302
- if qty is None:
303
- qty = 1.0
304
- if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
305
- rate = round(amount / qty, 2)
306
- try:
307
- amount = float(round(amount, 2)) if amount is not None else None
308
- except Exception:
309
- amount = None
310
- try:
311
- rate = float(round(rate, 2)) if rate is not None else 0.0
312
- except Exception:
313
- rate = 0.0
314
- try:
315
- qty = float(qty) if qty is not None else 1.0
316
- except Exception:
317
  qty = 1.0
318
- if amount is None or amount == 0:
319
- continue
 
 
 
 
 
 
 
320
  parsed_items.append({
321
  "item_name": name if name else "UNKNOWN",
322
- "item_amount": float(round(amount, 2)),
323
- "item_rate": float(round(rate, 2)) if rate else 0.0,
324
- "item_quantity": float(qty) if qty else 1.0,
325
  })
 
326
  else:
327
- numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
328
  if not numeric_idxs:
329
  continue
 
330
  last = numeric_idxs[-1]
331
- amt = normalize_num_str(tokens[last])
332
- if amt is None:
333
- continue
334
- name = " ".join(tokens[:last]).strip()
335
- if not name:
336
  continue
337
- rate = 0.0; qty = 1.0
338
- if len(numeric_idxs) >= 2:
339
- r = normalize_num_str(tokens[numeric_idxs[-2]])
340
- rate = r if r is not None else 0.0
341
- if len(numeric_idxs) >= 3:
342
- q = normalize_num_str(tokens[numeric_idxs[-3]])
343
- qty = q if q is not None else 1.0
 
 
 
 
 
 
 
 
 
 
344
  parsed_items.append({
345
- "item_name": clean_name_text(name),
346
- "item_amount": float(round(amt, 2)),
347
- "item_rate": float(round(rate, 2)),
348
- "item_quantity": float(qty),
349
  })
 
350
  return parsed_items
351
 
352
  # ---------------- dedupe & totals ----------------
@@ -355,133 +384,69 @@ def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
355
  out = []
356
  for it in items:
357
  nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
358
- key = (nm[:120], round(float(it["item_amount"]), 2))
359
  if key in seen:
360
  continue
361
  seen.add(key)
362
  out.append(it)
363
  return out
364
 
365
- def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
366
- subtotal = None; final = None
367
- for rt in rows_texts[::-1]:
368
- if not rt or rt.strip() == "":
369
- continue
370
- if TOTAL_KEYWORDS.search(rt):
371
- m = NUM_RE.search(rt)
372
- if m:
373
- v = normalize_num_str(m.group(0))
374
- if v is None:
375
- continue
376
- if re.search(r"sub", rt, re.I):
377
- if subtotal is None: subtotal = float(round(v, 2))
378
- else:
379
- if final is None: final = float(round(v, 2))
380
- return {"subtotal": subtotal, "final_total": final}
381
-
382
- # ---------------- Gemini refinement (deterministic) ----------------
383
- def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
384
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
385
  if not GEMINI_API_KEY or genai is None:
386
  return page_items, zero_usage
 
387
  try:
388
  safe_text = sanitize_ocr_text(page_text)
389
  system_prompt = (
390
- "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
391
- "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
392
- "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
393
  )
394
  user_prompt = (
395
  f"page_text='''{safe_text}'''\n"
396
  f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
397
- "Example:\n"
398
- "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
399
- " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
400
- "=>\n"
401
- "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
402
- "Return only the cleaned JSON array of items."
403
  )
 
404
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
405
- response = model.generate_content(
406
- [
407
- {"role": "system", "parts": [system_prompt]},
408
- {"role": "user", "parts": [user_prompt]},
409
- ],
410
- temperature=0.0,
411
- max_output_tokens=1000,
412
- )
413
  raw = response.text.strip()
414
  if raw.startswith("```"):
415
- raw = re.sub(r"^```[a-zA-Z]*", "", raw)
416
- raw = re.sub(r"```$", "", raw).strip()
417
  parsed = json.loads(raw)
 
418
  if isinstance(parsed, list):
419
  cleaned = []
420
  for obj in parsed:
421
  try:
422
  cleaned.append({
423
- "item_name": str(obj.get("item_name", "")).strip(),
424
- "item_amount": float(obj.get("item_amount", 0.0)),
425
- "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
426
- "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
427
  })
428
- except Exception:
429
  continue
430
- # token usage info not reliably extracted here — return zeros
431
  return cleaned, zero_usage
 
432
  return page_items, zero_usage
 
433
  except Exception:
434
  return page_items, zero_usage
435
 
436
  # ---------------- header heuristics & final filter ----------------
437
- def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
438
- if not txt:
439
- return False
440
- t = re.sub(r"\s+", " ", txt.strip().lower())
441
- # exact phrase blacklist
442
- if any(h == t for h in HEADER_PHRASES):
443
- return True
444
- hits = sum(1 for k in HEADER_KEYWORDS if k in t)
445
- if hits >= 2:
446
- return True
447
- tokens = re.split(r"[\s\|,/:]+", t)
448
- key_hit_count = sum(1 for tok in tokens if tok in HEADER_KEYWORDS)
449
- if key_hit_count >= 3:
450
- return True
451
- if top_of_page and len(tokens) <= 10 and key_hit_count >= 2:
452
- return True
453
- if ("rate" in t or "net" in t) and "amt" in t and not any(ch.isdigit() for ch in t):
454
- return True
455
- if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
456
- return True
457
- return False
458
-
459
- def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
460
- name = (item.get("item_name") or "").strip()
461
- if not name:
462
  return False
463
- ln = name.lower()
464
- # exact match against detected headers
465
- for h in known_page_headers:
466
- if h and h.strip() and h.strip().lower() in ln:
467
- return False
468
- if FOOTER_KEYWORDS.search(ln):
469
  return False
470
- if item.get("item_amount", 0) > 1_000_000:
471
- return False
472
- if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
473
- return False
474
- if re.fullmatch(r"(charge|charges|services|laboratory|lab|consultation)", ln.strip(), re.I):
475
- return False
476
- # drop obvious section/subtotal labels (but allow items like 'ANAES. CHARGE' which contain a dot)
477
- if len(name.split()) <= 4 and re.search(r"\b(charges|services|room|radiology|laborat|surgery|procedure)\b", ln):
478
- if "." not in name and not re.search(r"\b[A-Z]{2,}\b", name):
479
- return False
480
- if float(item.get("item_amount", 0)) <= 0.0:
481
- return False
482
- rate = float(item.get("item_rate", 0) or 0)
483
- amt = float(item.get("item_amount", 0) or 0)
484
- if rate and rate > amt * 10 and amt < 10000:
485
  return False
486
  return True
487
 
@@ -490,114 +455,42 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [])
490
  async def extract_bill_data(payload: BillRequest):
491
  doc_url = payload.document
492
  try:
493
- headers = {"User-Agent": "Mozilla/5.0"}
494
- resp = requests.get(doc_url, headers=headers, timeout=30)
495
- if resp.status_code != 200:
496
- raise RuntimeError(f"download failed status={resp.status_code}")
497
  file_bytes = resp.content
498
- except Exception:
499
- return {"is_success": False, "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}, "data": {"pagewise_line_items": [], "total_item_count": 0}}
500
 
501
- images = []
502
- clean_url = doc_url.split("?", 1)[0].lower()
503
- try:
504
- if clean_url.endswith(".pdf"):
505
- images = convert_from_bytes(file_bytes)
506
- elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
507
- images = [Image.open(BytesIO(file_bytes))]
508
- else:
509
- try:
510
- images = convert_from_bytes(file_bytes)
511
- except Exception:
512
- images = []
513
- except Exception:
514
- images = []
515
 
516
  pagewise = []
517
- cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
518
-
519
- for idx, page_img in enumerate(images, start=1):
520
- try:
521
- proc = preprocess_image(page_img)
522
- cells = image_to_tsv_cells(proc)
523
- rows = group_cells_into_rows(cells, y_tolerance=12)
524
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
525
-
526
- # === HEADER PREFILTER: remove header-like rows anywhere on page ===
527
- rows_filtered = []
528
- for i, (r, rt) in enumerate(zip(rows, rows_texts)):
529
- top_flag = (i < 6)
530
- rt_norm = sanitize_ocr_text(rt).lower()
531
- if looks_like_header_text(rt_norm, top_of_page=top_flag):
532
- continue
533
- if any(h in rt_norm for h in HEADER_PHRASES):
534
- continue
535
- rows_filtered.append(r)
536
- # recompute row texts and a simple page_text
537
- rows = rows_filtered
538
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
539
- page_text = sanitize_ocr_text(" ".join(rows_texts))
540
-
541
- # detect page-level top headers (for final filtering)
542
- top_headers = []
543
- for i, rt in enumerate(rows_texts[:6]):
544
- if looks_like_header_text(rt, top_of_page=(i < 4)):
545
- top_headers.append(rt.strip().lower())
546
-
547
- parsed_items = parse_rows_with_columns(rows, cells)
548
-
549
- # ALWAYS attempt Gemini refinement if available (deterministic settings)
550
- refined_items, token_u = refine_with_gemini(parsed_items, page_text)
551
- for k in cumulative_token_usage:
552
- cumulative_token_usage[k] += token_u.get(k, 0)
553
-
554
- # final cleaning & dedupe
555
- cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers)]
556
- cleaned = dedupe_items(cleaned)
557
- cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
558
-
559
- page_type = "Bill Detail"
560
- page_txt = page_text.lower()
561
- if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
562
- page_type = "Pharmacy"
563
- if "final bill" in page_txt or "grand total" in page_txt:
564
- page_type = "Final Bill"
565
-
566
- pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
567
- except Exception:
568
- pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
569
- continue
570
-
571
- total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
572
- if not GEMINI_API_KEY or genai is None:
573
- cumulative_token_usage["warning_no_gemini"] = 1
574
-
575
- return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
576
-
577
- # ---------------- debug TSV ----------------
578
- @app.post("/debug-tsv")
579
- async def debug_tsv(payload: BillRequest):
580
- doc_url = payload.document
581
- try:
582
- resp = requests.get(doc_url, timeout=20)
583
- if resp.status_code != 200:
584
- return {"error": "Download failed"}
585
- file_bytes = resp.content
586
- except Exception:
587
- return {"error": "Download failed"}
588
- clean_url = doc_url.split("?", 1)[0].lower()
589
- if clean_url.endswith(".pdf"):
590
- imgs = convert_from_bytes(file_bytes)
591
- img = imgs[0]
592
- else:
593
- img = Image.open(BytesIO(file_bytes))
594
- proc = preprocess_image(img)
595
- cells = image_to_tsv_cells(proc)
596
- return {"cells": cells}
597
 
598
  @app.get("/")
599
- def health_check():
600
- msg = "Bill extraction API (final) live."
601
- if not GEMINI_API_KEY or genai is None:
602
- msg += " (No GEMINI_API_KEY/configured SDK — LLM refinement skipped.)"
603
- return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}
 
138
  n = len(o.get("text", []))
139
  for i in range(n):
140
  raw = o["text"][i]
 
 
141
  txt = str(raw).strip()
142
  if not txt:
143
  continue
 
145
  conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
146
  except Exception:
147
  conf = -1.0
148
+ left = int(o["left"][i])
149
+ top = int(o["top"][i])
150
+ width = int(o["width"][i])
151
+ height = int(o["height"][i])
152
  center_y = top + height / 2.0
153
  center_x = left + width / 2.0
154
+ cells.append({
155
+ "text": txt,
156
+ "conf": conf,
157
+ "left": left,
158
+ "top": top,
159
+ "width": width,
160
+ "height": height,
161
+ "center_y": center_y,
162
+ "center_x": center_x
163
+ })
164
  return cells
165
 
166
  # ---------------- grouping & merge helpers ----------------
 
214
  i += 1
215
  return merged
216
 
217
+ # ---------------- numeric column detection ----------------
218
+ # >>> FIX START — replaced rigid 50px with adaptive clustering
219
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
220
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
221
  if not xs:
 
223
  xs = sorted(xs)
224
  if len(xs) == 1:
225
  return [xs[0]]
226
+ gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
227
+ mean_gap = float(np.mean(gaps))
228
+ std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0
229
+ gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
230
+ clusters, curr = [], [xs[0]]
 
 
231
  for i, g in enumerate(gaps):
232
+ if g > gap_thresh and len(clusters) < (max_columns - 1):
233
  clusters.append(curr)
234
  curr = [xs[i+1]]
235
  else:
236
  curr.append(xs[i+1])
237
  clusters.append(curr)
 
238
  centers = [float(np.median(c)) for c in clusters]
239
  if len(centers) > max_columns:
240
  centers = centers[-max_columns:]
241
  return sorted(centers)
242
+ # >>> FIX END
243
 
244
  def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
245
  if not column_centers:
 
248
  return int(np.argmin(distances))
249
 
250
  # ---------------- parsing rows into items ----------------
251
+
252
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
253
  parsed_items = []
254
  rows = merge_multiline_names(rows)
255
  column_centers = detect_numeric_columns(page_cells, max_columns=4)
256
+
257
  for row in rows:
258
  tokens = [c["text"] for c in row]
259
  if not tokens:
260
  continue
 
 
 
261
  if all(not is_numeric_token(t) for t in tokens):
262
  continue
263
+
264
+ # >>> FIX START — build numeric token list for inference
265
+ numeric_values = []
266
+ for t in tokens:
267
+ if is_numeric_token(t):
268
+ v = normalize_num_str(t)
269
+ if v is not None:
270
+ numeric_values.append(float(v))
271
+ # >>> FIX END
272
+
273
  if column_centers:
274
  left_text_parts = []
275
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
276
+
277
  for c in row:
278
  t = c["text"]
 
279
  if is_numeric_token(t):
280
+ col_idx = assign_token_to_column(c["center_x"], column_centers)
281
  if col_idx is None:
282
+ numeric_bucket_map[len(column_centers)-1].append(t)
283
  else:
284
  numeric_bucket_map[col_idx].append(t)
285
  else:
286
  left_text_parts.append(t)
287
+
288
  raw_name = " ".join(left_text_parts).strip()
289
+ name = clean_name_text(raw_name)
290
+
291
  num_cols = len(column_centers)
292
  def get_bucket(idx):
293
  vals = numeric_bucket_map.get(idx, [])
294
  return vals[-1] if vals else None
295
+
296
+ # base extraction
297
+ amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
298
+ rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
299
+ qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
300
+
 
301
  if amount is None:
302
  for t in reversed(tokens):
303
  if is_numeric_token(t):
304
  amount = normalize_num_str(t)
305
  break
306
+
307
+ # >>> FIX START strong inference block
308
+ if amount is not None and numeric_values:
309
+ # Look for: amount / candidate_rate ≈ integer
310
+ for cand in numeric_values:
311
+ if cand == 0 or cand == amount:
312
+ continue
313
+ ratio = amount / cand
314
+ r = round(ratio)
315
+ if 1 <= r <= 200 and abs(ratio - r) <= max(0.04*r, 0.2):
316
+ rate = cand
317
+ qty = float(r)
318
  break
319
+ # >>> FIX END
320
+
321
+ # fallback inference
322
+ if (rate is None or rate == 0) and qty:
323
+ try:
324
+ rate = amount / qty
325
+ except:
326
+ pass
327
+
328
+ if qty is None:
 
 
 
 
 
329
  qty = 1.0
330
+
331
+ # cleanup
332
+ try: amount = float(round(amount,2))
333
+ except: continue
334
+ try: rate = float(round(rate,2)) if rate else 0.0
335
+ except: rate = 0.0
336
+ try: qty = float(qty)
337
+ except: qty = 1.0
338
+
339
  parsed_items.append({
340
  "item_name": name if name else "UNKNOWN",
341
+ "item_amount": amount,
342
+ "item_rate": rate,
343
+ "item_quantity": qty
344
  })
345
+
346
  else:
347
+ numeric_idxs = [i for i,t in enumerate(tokens) if is_numeric_token(t)]
348
  if not numeric_idxs:
349
  continue
350
+
351
  last = numeric_idxs[-1]
352
+ amount = normalize_num_str(tokens[last])
353
+ if amount is None:
 
 
 
354
  continue
355
+
356
+ name = clean_name_text(" ".join(tokens[:last]).strip())
357
+ rate = 0.0
358
+ qty = 1.0
359
+
360
+ # >>> FIX START — fallback inference also upgraded
361
+ for cand in numeric_values:
362
+ if cand == 0 or cand == amount:
363
+ continue
364
+ ratio = amount / cand
365
+ r = round(ratio)
366
+ if 1 <= r <= 200 and abs(ratio - r) <= max(0.04*r, 0.2):
367
+ rate = cand
368
+ qty = float(r)
369
+ break
370
+ # >>> FIX END
371
+
372
  parsed_items.append({
373
+ "item_name": name,
374
+ "item_amount": float(round(amount,2)),
375
+ "item_rate": float(round(rate,2)),
376
+ "item_quantity": float(qty)
377
  })
378
+
379
  return parsed_items
380
 
381
  # ---------------- dedupe & totals ----------------
 
384
  out = []
385
  for it in items:
386
  nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
387
+ key = (nm[:120], round(it["item_amount"], 2))
388
  if key in seen:
389
  continue
390
  seen.add(key)
391
  out.append(it)
392
  return out
393
 
394
+ # ---------------- Gemini refinement (unchanged) ----------------
395
+ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = ""):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
397
  if not GEMINI_API_KEY or genai is None:
398
  return page_items, zero_usage
399
+
400
  try:
401
  safe_text = sanitize_ocr_text(page_text)
402
  system_prompt = (
403
+ "You are a strict bill-extraction cleaner. Return ONLY a JSON array."
 
 
404
  )
405
  user_prompt = (
406
  f"page_text='''{safe_text}'''\n"
407
  f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
408
+ "Return only the cleaned JSON array."
 
 
 
 
 
409
  )
410
+
411
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
412
+ response = model.generate_content([
413
+ {"role": "system", "parts": [system_prompt]},
414
+ {"role": "user", "parts": [user_prompt]}
415
+ ], temperature=0.0)
416
+
 
 
 
417
  raw = response.text.strip()
418
  if raw.startswith("```"):
419
+ raw = raw.split("```")[1]
 
420
  parsed = json.loads(raw)
421
+
422
  if isinstance(parsed, list):
423
  cleaned = []
424
  for obj in parsed:
425
  try:
426
  cleaned.append({
427
+ "item_name": str(obj.get("item_name","")).strip(),
428
+ "item_amount": float(obj.get("item_amount",0)),
429
+ "item_rate": float(obj.get("item_rate",0)),
430
+ "item_quantity": float(obj.get("item_quantity",1)),
431
  })
432
+ except:
433
  continue
 
434
  return cleaned, zero_usage
435
+
436
  return page_items, zero_usage
437
+
438
  except Exception:
439
  return page_items, zero_usage
440
 
441
  # ---------------- header heuristics & final filter ----------------
442
+ def final_item_filter(item, known_page_headers):
443
+ name = item["item_name"].lower()
444
+ amt = item["item_amount"]
445
+ if amt <= 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  return False
447
+ if FOOTER_KEYWORDS.search(name):
 
 
 
 
 
448
  return False
449
+ if any(h in name for h in known_page_headers):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  return False
451
  return True
452
 
 
455
  async def extract_bill_data(payload: BillRequest):
456
  doc_url = payload.document
457
  try:
458
+ resp = requests.get(doc_url, timeout=30)
 
 
 
459
  file_bytes = resp.content
460
+ except:
461
+ return {"is_success": False, "data": {}}
462
 
463
+ if doc_url.lower().endswith(".pdf"):
464
+ images = convert_from_bytes(file_bytes)
465
+ else:
466
+ images = [Image.open(BytesIO(file_bytes))]
 
 
 
 
 
 
 
 
 
 
467
 
468
  pagewise = []
469
+ total_items = 0
470
+
471
+ for idx, img in enumerate(images, start=1):
472
+ proc = preprocess_image(img)
473
+ cells = image_to_tsv_cells(proc)
474
+ rows = group_cells_into_rows(cells)
475
+
476
+ rows_text = [" ".join([c["text"] for c in r]) for r in rows]
477
+ parsed = parse_rows_with_columns(rows, cells)
478
+
479
+ pagewise.append({
480
+ "page_no": str(idx),
481
+ "page_type": "Bill Detail",
482
+ "bill_items": parsed
483
+ })
484
+ total_items += len(parsed)
485
+
486
+ return {
487
+ "is_success": True,
488
+ "data": {
489
+ "pagewise_line_items": pagewise,
490
+ "total_item_count": total_items
491
+ }
492
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
494
  @app.get("/")
495
+ def health():
496
+ return {"status": "ok"}