Sathvik-kota committed on
Commit
0bfaa94
·
verified ·
1 Parent(s): 5dc6960

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +28 -18
app.py CHANGED
@@ -29,31 +29,39 @@ def parse_text(text):
29
 
30
  def extract_items_from_text(text: str):
31
  """
32
- Takes raw OCR text of a page and returns a list of bill_items
33
- in the required schema format.
34
-
35
- Current heuristic:
36
- - Split into lines
37
- - If a line ends with a number, treat that as item_amount
38
- - Everything before that is item_name
39
- - item_rate and item_quantity are set to 0.0 for now
40
  """
41
  lines = [line.strip() for line in text.splitlines() if line.strip()]
42
  bill_items = []
43
 
44
- # Example pattern: "Room Rent 3500" or "Paracetamol 50.25"
45
- pattern = re.compile(r"^(.*\D)?(\d+(?:\.\d+)?)$")
46
-
47
  for line in lines:
48
- match = pattern.match(line)
49
- if not match:
 
 
 
 
50
  continue
51
 
52
- raw_name = (match.group(1) or "").strip()
53
- amount_str = match.group(2)
 
 
 
54
 
55
- # Skip lines that are just numbers
56
- if not raw_name:
 
 
 
 
 
 
 
57
  continue
58
 
59
  try:
@@ -61,9 +69,11 @@ def extract_items_from_text(text: str):
61
  except ValueError:
62
  continue
63
 
 
 
64
  bill_items.append(
65
  {
66
- "item_name": raw_name,
67
  "item_amount": amount_val,
68
  "item_rate": 0.0,
69
  "item_quantity": 0.0,
 
29
 
30
  def extract_items_from_text(text: str):
31
  """
32
+ Looser heuristic:
33
+ - Take any line that has at least one numeric token
34
+ - Use the last numeric token as item_amount
35
+ - Everything before that token is item_name
36
+ - Skip obvious total/summary lines
 
 
 
37
  """
38
  lines = [line.strip() for line in text.splitlines() if line.strip()]
39
  bill_items = []
40
 
 
 
 
41
  for line in lines:
42
+ # Skip totals / summary lines
43
+ if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
44
+ continue
45
+
46
+ tokens = line.split()
47
+ if not tokens:
48
  continue
49
 
50
+ # Find all purely numeric tokens (e.g. 123, 45.67)
51
+ numeric_indices = [
52
+ i for i, tok in enumerate(tokens)
53
+ if re.fullmatch(r"\d+(\.\d+)?", tok)
54
+ ]
55
 
56
+ if not numeric_indices:
57
+ continue
58
+
59
+ last_idx = numeric_indices[-1]
60
+ amount_str = tokens[last_idx]
61
+ name_tokens = tokens[:last_idx]
62
+
63
+ # If there's no text before the amount, skip
64
+ if not name_tokens:
65
  continue
66
 
67
  try:
 
69
  except ValueError:
70
  continue
71
 
72
+ item_name = " ".join(name_tokens)
73
+
74
  bill_items.append(
75
  {
76
+ "item_name": item_name,
77
  "item_amount": amount_val,
78
  "item_rate": 0.0,
79
  "item_quantity": 0.0,