Upload folder using huggingface_hub
app.py CHANGED
@@ -138,8 +138,6 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
-        if raw is None:
-            continue
         txt = str(raw).strip()
         if not txt:
             continue
@@ -147,13 +145,22 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
             conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
         except Exception:
             conf = -1.0
-        left = int(o
-        top = int(o
-        width = int(o
-        height = int(o
+        left = int(o["left"][i])
+        top = int(o["top"][i])
+        width = int(o["width"][i])
+        height = int(o["height"][i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
-        cells.append({
+        cells.append({
+            "text": txt,
+            "conf": conf,
+            "left": left,
+            "top": top,
+            "width": width,
+            "height": height,
+            "center_y": center_y,
+            "center_x": center_x
+        })
     return cells

 # ---------------- grouping & merge helpers ----------------
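The parallel lists indexed through `o` above have the same shape as the dictionary returned by `pytesseract.image_to_data` with `Output.DICT`. A minimal sketch of how such a structure is typically produced, assuming `pytesseract` and a PIL image (the Space's `preprocess_image` step is not visible in this hunk):

```python
# Sketch (assumption): producing the `o` dict that image_to_tsv_cells indexes.
import pytesseract
from PIL import Image
from pytesseract import Output

page = Image.open("bill_page.png")  # placeholder input
# image_to_data returns parallel lists keyed by "text", "conf",
# "left", "top", "width", "height" -- the keys used in the diff above.
o = pytesseract.image_to_data(page, output_type=Output.DICT)

for i, raw in enumerate(o["text"]):
    txt = str(raw).strip()
    if not txt:
        continue  # skip empty OCR slots, as the hunk above does
    print(txt, o["conf"][i], o["left"][i], o["top"][i])
```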
@@ -207,7 +214,8 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
         i += 1
     return merged

-# ---------------- numeric column detection
+# ---------------- numeric column detection ----------------
+# >>> FIX START — replaced rigid 50px with adaptive clustering
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
     xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
     if not xs:
@@ -215,25 +223,23 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
     xs = sorted(xs)
     if len(xs) == 1:
         return [xs[0]]
-    clusters = []
-    curr = [xs[0]]
+    gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
+    mean_gap = float(np.mean(gaps))
+    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0
+    gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
+    clusters, curr = [], [xs[0]]
     for i, g in enumerate(gaps):
-        if g
+        if g > gap_thresh and len(clusters) < (max_columns - 1):
             clusters.append(curr)
             curr = [xs[i+1]]
         else:
             curr.append(xs[i+1])
     clusters.append(curr)
     centers = [float(np.median(c)) for c in clusters]
     if len(centers) > max_columns:
         centers = centers[-max_columns:]
     return sorted(centers)
+# >>> FIX END

 def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
     if not column_centers:
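The replacement clustering no longer hard-codes a 50 px column gap: a column break is declared wherever a gap between sorted x-centers exceeds max(30 px, mean + 0.6 * std of all gaps). A standalone run of that rule on invented x-centers (the `max_columns` guard from the diff is omitted for brevity):

```python
import numpy as np

# Invented x-centers of numeric tokens; visually three columns.
xs = sorted([612.0, 618.5, 760.0, 765.2, 771.0, 905.8, 910.1])
gaps = [xs[i + 1] - xs[i] for i in range(len(xs) - 1)]
gap_thresh = max(30.0, float(np.mean(gaps)) + 0.6 * float(np.std(gaps)))

clusters, curr = [], [xs[0]]
for i, g in enumerate(gaps):
    if g > gap_thresh:            # a large gap starts a new column
        clusters.append(curr)
        curr = [xs[i + 1]]
    else:
        curr.append(xs[i + 1])
clusters.append(curr)

print([float(np.median(c)) for c in clusters])
# [615.25, 765.2, 907.95], one median center per detected column
```

With a single gap the std term is 0 (the diff guards this with `if len(gaps) > 1 else 0`), so the 30 px floor alone decides the split.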
@@ -242,111 +248,134 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
     return int(np.argmin(distances))

 # ---------------- parsing rows into items ----------------
+
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     parsed_items = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
+
     for row in rows:
         tokens = [c["text"] for c in row]
         if not tokens:
             continue
-        joined_lower = " ".join(tokens).lower()
-        if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
-            continue
         if all(not is_numeric_token(t) for t in tokens):
             continue
+
+        # >>> FIX START — build numeric token list for inference
+        numeric_values = []
+        for t in tokens:
+            if is_numeric_token(t):
+                v = normalize_num_str(t)
+                if v is not None:
+                    numeric_values.append(float(v))
+        # >>> FIX END
+
         if column_centers:
             left_text_parts = []
             numeric_bucket_map = {i: [] for i in range(len(column_centers))}
+
             for c in row:
                 t = c["text"]
-                cx = c["center_x"]
                 if is_numeric_token(t):
-                    col_idx = assign_token_to_column(
+                    col_idx = assign_token_to_column(c["center_x"], column_centers)
                     if col_idx is None:
-                        numeric_bucket_map[len(column_centers)
+                        numeric_bucket_map[len(column_centers)-1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
                 else:
                     left_text_parts.append(t)
+
             raw_name = " ".join(left_text_parts).strip()
-            name = clean_name_text(raw_name)
+            name = clean_name_text(raw_name)
+
             num_cols = len(column_centers)
             def get_bucket(idx):
                 vals = numeric_bucket_map.get(idx, [])
                 return vals[-1] if vals else None
-            if num_cols >= 2
-            qty = normalize_num_str(get_bucket(num_cols - 3))
+
+            # base extraction
+            amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
+            rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
+            qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
+
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t):
                         amount = normalize_num_str(t)
                         break
-            if
+
+            # >>> FIX START — strong inference block
+            if amount is not None and numeric_values:
+                # Look for: amount / candidate_rate ≈ integer
+                for cand in numeric_values:
+                    if cand == 0 or cand == amount:
+                        continue
+                    ratio = amount / cand
+                    r = round(ratio)
+                    if 1 <= r <= 200 and abs(ratio - r) <= max(0.04*r, 0.2):
+                        rate = cand
+                        qty = float(r)
                         break
-            except Exception:
-                rate = 0.0
-            try:
-                qty = float(qty) if qty is not None else 1.0
-            except Exception:
+            # >>> FIX END
+
+            # fallback inference
+            if (rate is None or rate == 0) and qty:
+                try:
+                    rate = amount / qty
+                except:
+                    pass
+
+            if qty is None:
                 qty = 1.0
+
+            # cleanup
+            try: amount = float(round(amount,2))
+            except: continue
+            try: rate = float(round(rate,2)) if rate else 0.0
+            except: rate = 0.0
+            try: qty = float(qty)
+            except: qty = 1.0
+
             parsed_items.append({
                 "item_name": name if name else "UNKNOWN",
-                "item_amount":
-                "item_rate":
-                "item_quantity":
+                "item_amount": amount,
+                "item_rate": rate,
+                "item_quantity": qty
             })
+
         else:
-            numeric_idxs = [i for i,
+            numeric_idxs = [i for i,t in enumerate(tokens) if is_numeric_token(t)]
             if not numeric_idxs:
                 continue
+
             last = numeric_idxs[-1]
-            if
-                continue
-            name = " ".join(tokens[:last]).strip()
-            if not name:
+            amount = normalize_num_str(tokens[last])
+            if amount is None:
                 continue
+
+            name = clean_name_text(" ".join(tokens[:last]).strip())
+            rate = 0.0
+            qty = 1.0
+
+            # >>> FIX START — fallback inference also upgraded
+            for cand in numeric_values:
+                if cand == 0 or cand == amount:
+                    continue
+                ratio = amount / cand
+                r = round(ratio)
+                if 1 <= r <= 200 and abs(ratio - r) <= max(0.04*r, 0.2):
+                    rate = cand
+                    qty = float(r)
+                    break
+            # >>> FIX END
+
             parsed_items.append({
-                "item_name":
-                "item_amount": float(round(
-                "item_rate": float(round(rate,
-                "item_quantity": float(qty)
+                "item_name": name,
+                "item_amount": float(round(amount,2)),
+                "item_rate": float(round(rate,2)),
+                "item_quantity": float(qty)
             })
+
     return parsed_items

 # ---------------- dedupe & totals ----------------
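The inference block added here recovers quantity and rate when only the row's bare numbers survive OCR: each numeric token is tested as a candidate rate and accepted when amount divided by the candidate is close to an integer between 1 and 200. Run in isolation on invented values:

```python
# Invented row: qty 3, rate 650, amount 1950, all read as bare numbers.
amount = 1950.0
numeric_values = [3.0, 650.0, 1950.0]

rate, qty = 0.0, 1.0
for cand in numeric_values:
    if cand == 0 or cand == amount:
        continue
    ratio = amount / cand
    r = round(ratio)
    # near-integer ratio within 4% (or 0.2 absolute), plausible qty range
    if 1 <= r <= 200 and abs(ratio - r) <= max(0.04 * r, 0.2):
        rate, qty = cand, float(r)
        break

print(rate, qty)  # 650.0 3.0; cand 3.0 is rejected first since 1950/3 = 650 > 200
```

The 0.2 absolute slack keeps single-unit rows (ratio near 1) from being rejected by OCR rounding noise.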
@@ -355,133 +384,69 @@ def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     out = []
     for it in items:
         nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
-        key = (nm[:120], round(
+        key = (nm[:120], round(it["item_amount"], 2))
         if key in seen:
             continue
         seen.add(key)
         out.append(it)
     return out

-    for rt in rows_texts[::-1]:
-        if not rt or rt.strip() == "":
-            continue
-        if TOTAL_KEYWORDS.search(rt):
-            m = NUM_RE.search(rt)
-            if m:
-                v = normalize_num_str(m.group(0))
-                if v is None:
-                    continue
-                if re.search(r"sub", rt, re.I):
-                    if subtotal is None: subtotal = float(round(v, 2))
-                else:
-                    if final is None: final = float(round(v, 2))
-    return {"subtotal": subtotal, "final_total": final}
-
-# ---------------- Gemini refinement (deterministic) ----------------
-def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
+# ---------------- Gemini refinement (unchanged) ----------------
+def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = ""):
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
     if not GEMINI_API_KEY or genai is None:
         return page_items, zero_usage
+
     try:
         safe_text = sanitize_ocr_text(page_text)
         system_prompt = (
-            "You are a strict bill-extraction cleaner. Return ONLY a JSON array
-            "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
-            "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
+            "You are a strict bill-extraction cleaner. Return ONLY a JSON array."
         )
         user_prompt = (
            f"page_text='''{safe_text}'''\n"
            f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
-            "
-            "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
-            " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
-            "=>\n"
-            "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
-            "Return only the cleaned JSON array of items."
+            "Return only the cleaned JSON array."
         )
+
         model = genai.GenerativeModel(GEMINI_MODEL_NAME)
-        response = model.generate_content(
-            [
-            temperature=0.0,
-            max_output_tokens=1000,
-        )
+        response = model.generate_content([
+            {"role": "system", "parts": [system_prompt]},
+            {"role": "user", "parts": [user_prompt]}
+        ], temperature=0.0)
+
         raw = response.text.strip()
         if raw.startswith("```"):
-            raw =
-            raw = re.sub(r"```$", "", raw).strip()
+            raw = raw.split("```")[1]
         parsed = json.loads(raw)
+
         if isinstance(parsed, list):
             cleaned = []
             for obj in parsed:
                 try:
                     cleaned.append({
-                        "item_name": str(obj.get("item_name",
-                        "item_amount": float(obj.get("item_amount",
-                        "item_rate": float(obj.get("item_rate",
-                        "item_quantity": float(obj.get("item_quantity",
+                        "item_name": str(obj.get("item_name","")).strip(),
+                        "item_amount": float(obj.get("item_amount",0)),
+                        "item_rate": float(obj.get("item_rate",0)),
+                        "item_quantity": float(obj.get("item_quantity",1)),
                     })
-                except
+                except:
                     continue
-            # token usage info not reliably extracted here — return zeros
             return cleaned, zero_usage
+
         return page_items, zero_usage
+
     except Exception:
         return page_items, zero_usage

 # ---------------- header heuristics & final filter ----------------
-def
-    # exact phrase blacklist
-    if any(h == t for h in HEADER_PHRASES):
-        return True
-    hits = sum(1 for k in HEADER_KEYWORDS if k in t)
-    if hits >= 2:
-        return True
-    tokens = re.split(r"[\s\|,/:]+", t)
-    key_hit_count = sum(1 for tok in tokens if tok in HEADER_KEYWORDS)
-    if key_hit_count >= 3:
-        return True
-    if top_of_page and len(tokens) <= 10 and key_hit_count >= 2:
-        return True
-    if ("rate" in t or "net" in t) and "amt" in t and not any(ch.isdigit() for ch in t):
-        return True
-    if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
-        return True
-    return False
-
-def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
-    name = (item.get("item_name") or "").strip()
-    if not name:
+def final_item_filter(item, known_page_headers):
+    name = item["item_name"].lower()
+    amt = item["item_amount"]
+    if amt <= 0:
         return False
-    # exact match against detected headers
-    for h in known_page_headers:
-        if h and h.strip() and h.strip().lower() in ln:
-            return False
-    if FOOTER_KEYWORDS.search(ln):
+    if FOOTER_KEYWORDS.search(name):
         return False
-    if
-        return False
-    if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
-        return False
-    if re.fullmatch(r"(charge|charges|services|laboratory|lab|consultation)", ln.strip(), re.I):
-        return False
-    # drop obvious section/subtotal labels (but allow items like 'ANAES. CHARGE' which contain a dot)
-    if len(name.split()) <= 4 and re.search(r"\b(charges|services|room|radiology|laborat|surgery|procedure)\b", ln):
-        if "." not in name and not re.search(r"\b[A-Z]{2,}\b", name):
-            return False
-    if float(item.get("item_amount", 0)) <= 0.0:
-        return False
-    rate = float(item.get("item_rate", 0) or 0)
-    amt = float(item.get("item_amount", 0) or 0)
-    if rate and rate > amt * 10 and amt < 10000:
+    if any(h in name for h in known_page_headers):
         return False
     return True
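Two review notes on the rewritten Gemini call, both hedged since the SDK version pinned by the Space is unknown: in the google-generativeai SDK, `generate_content` does not take `temperature` as a direct keyword (sampling options travel in a `generation_config`, and system text via `system_instruction` on the model), and the new fence-stripping line keeps the language tag when the model opens its code fence with a json tag. A sketch of the same deterministic call under those assumptions:

```python
# Hedged sketch, not the Space's exact call: a deterministic Gemini request
# using generation_config; the model name below is a placeholder.
import google.generativeai as genai

user_prompt = "page_text='''...'''\nitems = []\nReturn only the cleaned JSON array."

model = genai.GenerativeModel(
    "gemini-1.5-flash",  # placeholder for GEMINI_MODEL_NAME
    system_instruction="You are a strict bill-extraction cleaner. Return ONLY a JSON array.",
)
response = model.generate_content(
    user_prompt,
    generation_config=genai.types.GenerationConfig(
        temperature=0.0,
        max_output_tokens=1000,
    ),
)
raw = response.text.strip()
if raw.startswith("```"):
    # drop the fences plus an optional "json" language tag
    raw = raw.strip("`").removeprefix("json").strip()
```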
@@ -490,114 +455,42 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
 async def extract_bill_data(payload: BillRequest):
     doc_url = payload.document
     try:
-        resp = requests.get(doc_url, headers=headers, timeout=30)
-        if resp.status_code != 200:
-            raise RuntimeError(f"download failed status={resp.status_code}")
+        resp = requests.get(doc_url, timeout=30)
         file_bytes = resp.content
-    except
-        return {"is_success": False, "
+    except:
+        return {"is_success": False, "data": {}}

-            images = convert_from_bytes(file_bytes)
-        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
-            images = [Image.open(BytesIO(file_bytes))]
-        else:
-            try:
-                images = convert_from_bytes(file_bytes)
-            except Exception:
-                images = []
-    except Exception:
-        images = []
+    if doc_url.lower().endswith(".pdf"):
+        images = convert_from_bytes(file_bytes)
+    else:
+        images = [Image.open(BytesIO(file_bytes))]

     pagewise = []
-    for idx,
-            # detect page-level top headers (for final filtering)
-            top_headers = []
-            for i, rt in enumerate(rows_texts[:6]):
-                if looks_like_header_text(rt, top_of_page=(i < 4)):
-                    top_headers.append(rt.strip().lower())
-            parsed_items = parse_rows_with_columns(rows, cells)
-            # ALWAYS attempt Gemini refinement if available (deterministic settings)
-            refined_items, token_u = refine_with_gemini(parsed_items, page_text)
-            for k in cumulative_token_usage:
-                cumulative_token_usage[k] += token_u.get(k, 0)
-            # final cleaning & dedupe
-            cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers)]
-            cleaned = dedupe_items(cleaned)
-            cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
-            page_type = "Bill Detail"
-            page_txt = page_text.lower()
-            if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
-                page_type = "Pharmacy"
-            if "final bill" in page_txt or "grand total" in page_txt:
-                page_type = "Final Bill"
-            pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
-        except Exception:
-            pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
-            continue
-    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
-    if not GEMINI_API_KEY or genai is None:
-        cumulative_token_usage["warning_no_gemini"] = 1
-    return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
-
-# ---------------- debug TSV ----------------
-@app.post("/debug-tsv")
-async def debug_tsv(payload: BillRequest):
-    doc_url = payload.document
-    try:
-        resp = requests.get(doc_url, timeout=20)
-        if resp.status_code != 200:
-            return {"error": "Download failed"}
-        file_bytes = resp.content
-    except Exception:
-        return {"error": "Download failed"}
-    clean_url = doc_url.split("?", 1)[0].lower()
-    if clean_url.endswith(".pdf"):
-        imgs = convert_from_bytes(file_bytes)
-        img = imgs[0]
-    else:
-        img = Image.open(BytesIO(file_bytes))
-    proc = preprocess_image(img)
-    cells = image_to_tsv_cells(proc)
-    return {"cells": cells}
+    total_items = 0
+
+    for idx, img in enumerate(images, start=1):
+        proc = preprocess_image(img)
+        cells = image_to_tsv_cells(proc)
+        rows = group_cells_into_rows(cells)
+
+        rows_text = [" ".join([c["text"] for c in r]) for r in rows]
+        parsed = parse_rows_with_columns(rows, cells)
+
+        pagewise.append({
+            "page_no": str(idx),
+            "page_type": "Bill Detail",
+            "bill_items": parsed
+        })
+        total_items += len(parsed)
+
+    return {
+        "is_success": True,
+        "data": {
+            "pagewise_line_items": pagewise,
+            "total_item_count": total_items
+        }
+    }

 @app.get("/")
-def
-    if not GEMINI_API_KEY or genai is None:
-        msg += " (No GEMINI_API_KEY/configured SDK — LLM refinement skipped.)"
-    return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}
+def health():
+    return {"status": "ok"}
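As reshaped here, the handler downloads the document, OCRs each page, and returns page-wise line items. A minimal client call, assuming the route is the `/extract-bill-data` path named in the old health-check hint (the base URL is a placeholder):

```python
# Minimal client for the extract-bill-data endpoint defined above.
import requests

resp = requests.post(
    "https://your-space.hf.space/extract-bill-data",  # placeholder URL
    json={"document": "https://example.com/sample-bill.pdf"},
    timeout=120,
)
payload = resp.json()
print(payload["is_success"], payload["data"]["total_item_count"])
for page in payload["data"]["pagewise_line_items"]:
    print(page["page_no"], page["page_type"], len(page["bill_items"]))
```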