Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 29, 2025

Commit

2eca474

verified ·

1 Parent(s): 901f760

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +67 -0

app.py CHANGED Viewed

@@ -179,6 +179,73 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
     return rows
 # ---------------- FALLBACK REGEX EXTRACTOR ----------------

     return rows
+# ---------------- Parse row into columns (name, qty, rate, amount) ----------------
def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Parse one table row into a bill line item.

    The row is a list of cell dicts, each with a "text" entry, assumed to be
    ordered left to right (the caller is expected to sort by x). Columns are
    recognised purely by position among the numeric-looking tokens:

    - right-most numeric token        -> item_amount
    - numeric token just before it    -> item_rate
    - numeric token before the rate   -> item_quantity
    - remaining tokens left of amount -> item_name

    When no quantity column is found, the quantity is inferred from a token
    such as "2x" or "2" that was not already used as amount/rate/quantity,
    and otherwise defaults to 1.0.

    Returns:
        A dict with keys "item_name", "item_amount", "item_rate" and
        "item_quantity", or None when the row has no numeric token, the
        amount cannot be normalised, or no name text remains.
    """
    tokens = [cell["text"] for cell in cells_row]
    numeric_indices = [
        i for i, tok in enumerate(tokens) if looks_like_amount_token(tok)
    ]
    if not numeric_indices:
        return None

    # Right-most numeric token is the candidate amount.
    amt_idx = numeric_indices[-1]
    amt_val = normalize_num_str(tokens[amt_idx])
    if amt_val is None:
        return None

    # Track which tokens have been assigned to a numeric column so they are
    # neither repeated in the item name nor re-read as an inferred quantity.
    consumed = {amt_idx}

    rate_val = 0.0
    if len(numeric_indices) >= 2:
        rate_idx = numeric_indices[-2]
        consumed.add(rate_idx)
        parsed_rate = normalize_num_str(tokens[rate_idx])
        rate_val = parsed_rate if parsed_rate is not None else 0.0

    qty_val = 0.0
    if len(numeric_indices) >= 3:
        qty_idx = numeric_indices[-3]
        consumed.add(qty_idx)
        parsed_qty = normalize_num_str(tokens[qty_idx])
        qty_val = parsed_qty if parsed_qty is not None else 0.0

    # Name = text left of the amount, minus the tokens used as rate/quantity.
    name = " ".join(
        tok for i, tok in enumerate(tokens[:amt_idx]) if i not in consumed
    ).strip()
    if not name:
        # No descriptive text left of the amount: treat the row as unparsable.
        return None

    # Plausibility fix: a missing/zero quantity is inferred from a token like
    # '2x' or '2'. Tokens already consumed are skipped, otherwise an integer
    # amount such as "500" would be misread as the quantity.
    if qty_val == 0.0:
        for i, tok in enumerate(tokens):
            if i in consumed:
                continue
            m = re.match(r"^(\d+)\s*[xX]?$", tok)
            if m:
                candidate = float(m.group(1))
                if candidate > 0:
                    qty_val = candidate
                    break
        if qty_val == 0.0:
            qty_val = 1.0

    return {
        "item_name": name,
        "item_amount": float(round(amt_val, 2)),
        "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
        "item_quantity": float(qty_val),
    }
 # ---------------- FALLBACK REGEX EXTRACTOR ----------------