Spaces:

Sathvik-kota
/

Datathon

Sleeping

Sathvik-kota committed on Nov 28, 2025

Commit

0bfaa94

verified ·

1 Parent(s): 5dc6960

Upload folder using huggingface_hub

Files changed (1) hide show

app.py CHANGED Viewed

@@ -29,31 +29,39 @@ def parse_text(text):
 def extract_items_from_text(text: str):
     """
-    Takes raw OCR text of a page and returns a list of bill_items
-    in the required schema format.
-    Current heuristic:
-    - Split into lines
-    - If a line ends with a number, treat that as item_amount
-    - Everything before that is item_name
-    - item_rate and item_quantity are set to 0.0 for now
     """
     lines = [line.strip() for line in text.splitlines() if line.strip()]
     bill_items = []
-    # Example pattern: "Room Rent 3500" or "Paracetamol 50.25"
-    pattern = re.compile(r"^(.*\D)?(\d+(?:\.\d+)?)$")
     for line in lines:
-        match = pattern.match(line)
-        if not match:
             continue
-        raw_name = (match.group(1) or "").strip()
-        amount_str = match.group(2)
-        # Skip lines that are just numbers
-        if not raw_name:
             continue
         try:
@@ -61,9 +69,11 @@ def extract_items_from_text(text: str):
         except ValueError:
             continue
         bill_items.append(
             {
-                "item_name": raw_name,
                 "item_amount": amount_val,
                 "item_rate": 0.0,
                 "item_quantity": 0.0,

 def extract_items_from_text(text: str):
     """
+    Looser heuristic:
+    - Take any line that has at least one numeric token
+    - Use the last numeric token as item_amount
+    - Everything before that token is item_name
+    - Skip obvious total/summary lines
     """
     lines = [line.strip() for line in text.splitlines() if line.strip()]
     bill_items = []
     for line in lines:
+        # Skip totals / summary lines
+        if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
+            continue
+        tokens = line.split()
+        if not tokens:
             continue
+        # Find all purely numeric tokens (e.g. 123, 45.67)
+        numeric_indices = [
+            i for i, tok in enumerate(tokens)
+            if re.fullmatch(r"\d+(\.\d+)?", tok)
+        ]
+        if not numeric_indices:
+            continue
+        last_idx = numeric_indices[-1]
+        amount_str = tokens[last_idx]
+        name_tokens = tokens[:last_idx]
+        # If there's no text before the amount, skip
+        if not name_tokens:
             continue
         try:
         except ValueError:
             continue
+        item_name = " ".join(name_tokens)
         bill_items.append(
             {
+                "item_name": item_name,
                 "item_amount": amount_val,
                 "item_rate": 0.0,
                 "item_quantity": 0.0,