Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -29,31 +29,39 @@ def parse_text(text):
|
|
| 29 |
|
| 30 |
def extract_items_from_text(text: str):
|
| 31 |
"""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
-
|
| 37 |
-
- If a line ends with a number, treat that as item_amount
|
| 38 |
-
- Everything before that is item_name
|
| 39 |
-
- item_rate and item_quantity are set to 0.0 for now
|
| 40 |
"""
|
| 41 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 42 |
bill_items = []
|
| 43 |
|
| 44 |
-
# Example pattern: "Room Rent 3500" or "Paracetamol 50.25"
|
| 45 |
-
pattern = re.compile(r"^(.*\D)?(\d+(?:\.\d+)?)$")
|
| 46 |
-
|
| 47 |
for line in lines:
|
| 48 |
-
|
| 49 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
continue
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
continue
|
| 58 |
|
| 59 |
try:
|
|
@@ -61,9 +69,11 @@ def extract_items_from_text(text: str):
|
|
| 61 |
except ValueError:
|
| 62 |
continue
|
| 63 |
|
|
|
|
|
|
|
| 64 |
bill_items.append(
|
| 65 |
{
|
| 66 |
-
"item_name":
|
| 67 |
"item_amount": amount_val,
|
| 68 |
"item_rate": 0.0,
|
| 69 |
"item_quantity": 0.0,
|
|
|
|
| 29 |
|
| 30 |
def extract_items_from_text(text: str):
|
| 31 |
"""
|
| 32 |
+
Looser heuristic:
|
| 33 |
+
- Take any line that has at least one numeric token
|
| 34 |
+
- Use the last numeric token as item_amount
|
| 35 |
+
- Everything before that token is item_name
|
| 36 |
+
- Skip obvious total/summary lines
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
| 38 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 39 |
bill_items = []
|
| 40 |
|
|
|
|
|
|
|
|
|
|
| 41 |
for line in lines:
|
| 42 |
+
# Skip totals / summary lines
|
| 43 |
+
if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
|
| 44 |
+
continue
|
| 45 |
+
|
| 46 |
+
tokens = line.split()
|
| 47 |
+
if not tokens:
|
| 48 |
continue
|
| 49 |
|
| 50 |
+
# Find all purely numeric tokens (e.g. 123, 45.67)
|
| 51 |
+
numeric_indices = [
|
| 52 |
+
i for i, tok in enumerate(tokens)
|
| 53 |
+
if re.fullmatch(r"\d+(\.\d+)?", tok)
|
| 54 |
+
]
|
| 55 |
|
| 56 |
+
if not numeric_indices:
|
| 57 |
+
continue
|
| 58 |
+
|
| 59 |
+
last_idx = numeric_indices[-1]
|
| 60 |
+
amount_str = tokens[last_idx]
|
| 61 |
+
name_tokens = tokens[:last_idx]
|
| 62 |
+
|
| 63 |
+
# If there's no text before the amount, skip
|
| 64 |
+
if not name_tokens:
|
| 65 |
continue
|
| 66 |
|
| 67 |
try:
|
|
|
|
| 69 |
except ValueError:
|
| 70 |
continue
|
| 71 |
|
| 72 |
+
item_name = " ".join(name_tokens)
|
| 73 |
+
|
| 74 |
bill_items.append(
|
| 75 |
{
|
| 76 |
+
"item_name": item_name,
|
| 77 |
"item_amount": amount_val,
|
| 78 |
"item_rate": 0.0,
|
| 79 |
"item_quantity": 0.0,
|