Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -179,6 +179,73 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
|
|
| 179 |
|
| 180 |
return rows
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|
| 184 |
|
|
|
|
| 179 |
|
| 180 |
return rows
|
| 181 |
|
| 182 |
+
# ---------------- Parse row into columns (name, qty, rate, amount) ----------------
|
| 183 |
+
def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Parse one table row into an item dict (name, amount, rate, quantity).

    Given a row (list of cell dicts sorted by x, each holding its text under
    the "text" key), columns are assigned from the right:
      - right-most numeric token       -> item_amount
      - numeric token before it        -> item_rate
      - numeric token before the rate  -> item_quantity
      - all text left of the amount    -> item_name

    When no quantity column is found, a bare integer token such as "2" or
    "2x" elsewhere in the row is used as the quantity; otherwise it defaults
    to 1.0.

    Returns a dict with keys "item_name", "item_amount", "item_rate" and
    "item_quantity", or None when the row has no numeric token, the amount
    cannot be normalized, or there is no text left of the amount.
    """
    tokens = [cell["text"] for cell in cells_row]

    # Indices of every token that looks like a number/amount.
    numeric_indices = [
        i for i, tok in enumerate(tokens) if looks_like_amount_token(tok)
    ]
    if not numeric_indices:
        return None

    # Right-most numeric token is the candidate amount.
    amt_idx = numeric_indices[-1]
    amt_val = normalize_num_str(tokens[amt_idx])
    if amt_val is None:
        return None

    # Name = everything left of the amount.
    # NOTE(review): this still includes the rate/quantity tokens when they are
    # present (e.g. "Widget 2 50.00"); confirm whether callers want them
    # stripped before changing it.
    name = " ".join(tokens[:amt_idx]).strip()
    if name == "":
        # Nothing left of the amount (row shape may be reversed): discard.
        return None

    # Try to read rate and quantity from the numeric tokens preceding the amount.
    rate_val = 0.0
    qty_val = 0.0

    if len(numeric_indices) >= 2:
        r = normalize_num_str(tokens[numeric_indices[-2]])
        rate_val = r if r is not None else 0.0

    if len(numeric_indices) >= 3:
        q = normalize_num_str(tokens[numeric_indices[-3]])
        qty_val = q if q is not None else 0.0

    # Plausibility fix: if quantity is zero or missing, infer it or default to 1.0.
    if qty_val == 0.0:
        # Look for tokens like '2x' or '2' anywhere in the row, but never
        # reuse the amount cell itself: an integer amount such as "500" would
        # otherwise be reported as a quantity of 500.
        for idx, tok in enumerate(tokens):
            if idx == amt_idx:
                continue
            m = re.match(r"^(\d+)\s*[xX]?$", tok)
            if m:
                qty_val = float(m.group(1))
                break
        if qty_val == 0.0:
            qty_val = 1.0

    return {
        "item_name": name,
        "item_amount": float(round(amt_val, 2)),
        "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
        "item_quantity": float(qty_val),
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
|
| 250 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|
| 251 |
|