Sathvik-kota committed on
Commit
2eca474
·
verified ·
1 Parent(s): 901f760

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py CHANGED
@@ -179,6 +179,73 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
179
 
180
  return rows
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
  # ---------------- FALLBACK REGEX EXTRACTOR ----------------
184
 
 
179
 
180
  return rows
181
 
182
+ # ---------------- Parse row into columns (name, qty, rate, amount) ----------------
183
def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Parse one table row into a line item (name, quantity, rate, amount).

    Heuristic, working from the right-hand side of the row:
      - the right-most token accepted by ``looks_like_amount_token`` is the
        item amount;
      - the numeric token before it (if any) is the rate;
      - the numeric token before that (if any) is the quantity;
      - every other token to the left of the amount forms the item name.

    If no usable quantity is found among the numeric tokens, a token shaped
    like ``2`` or ``2x`` elsewhere in the row is used; otherwise the quantity
    defaults to 1.0.

    Args:
        cells_row: Cells of a single row. Each cell is read through its
            ``"text"`` key; the list is expected to already be ordered
            left-to-right by the caller.

    Returns:
        A dict with keys ``item_name``, ``item_amount``, ``item_rate`` and
        ``item_quantity``, or ``None`` when the row has no numeric token, the
        amount cannot be normalized, or no name text remains.
    """
    tokens = [cell["text"] for cell in cells_row]

    numeric_indices = [
        idx for idx, tok in enumerate(tokens) if looks_like_amount_token(tok)
    ]
    if not numeric_indices:
        return None

    # Right-most numeric token is the candidate amount.
    amt_idx = numeric_indices[-1]
    amt_val = normalize_num_str(tokens[amt_idx])
    if amt_val is None:
        return None

    rate_idx = numeric_indices[-2] if len(numeric_indices) >= 2 else None
    qty_idx = numeric_indices[-3] if len(numeric_indices) >= 3 else None

    rate_val = 0.0
    if rate_idx is not None:
        parsed_rate = normalize_num_str(tokens[rate_idx])
        rate_val = parsed_rate if parsed_rate is not None else 0.0

    qty_val = 0.0
    if qty_idx is not None:
        parsed_qty = normalize_num_str(tokens[qty_idx])
        qty_val = parsed_qty if parsed_qty is not None else 0.0

    # Indices that are numeric columns and therefore must not leak into the name.
    consumed = {amt_idx}
    if rate_idx is not None:
        consumed.add(rate_idx)
    if qty_idx is not None:
        consumed.add(qty_idx)

    if qty_val == 0.0:
        # Try to infer the quantity from a token like '2x' or '2'. The amount
        # and rate tokens are skipped: they are already assigned, and reusing
        # them would turn e.g. an amount of "100" into a quantity of 100.
        for idx, tok in enumerate(tokens):
            if idx == amt_idx or idx == rate_idx:
                continue
            match = re.match(r"^(\d+)\s*[xX]?$", tok)
            if match and int(match.group(1)) > 0:
                qty_val = float(match.group(1))
                consumed.add(idx)
                break
        if qty_val == 0.0:
            qty_val = 1.0

    # Name = text left of the amount, minus the tokens used as rate/quantity.
    name = " ".join(
        tok for idx, tok in enumerate(tokens[:amt_idx]) if idx not in consumed
    ).strip()
    if name == "":
        # Only numbers (or nothing) precede the amount: not a usable item row.
        return None

    return {
        "item_name": name,
        "item_amount": float(round(amt_val, 2)),
        "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
        "item_quantity": float(qty_val),
    }
247
+
248
+
249
 
250
  # ---------------- FALLBACK REGEX EXTRACTOR ----------------
251