MakPr016 committed on
Commit
265e719
·
1 Parent(s): eac74fb

Updated output

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. rfq_parser.py +68 -3
.gitignore CHANGED
@@ -15,3 +15,4 @@ venv/
15
  build/
16
  dist/
17
  *.egg-info/
 
 
15
  build/
16
  dist/
17
  *.egg-info/
18
+ response.json
rfq_parser.py CHANGED
@@ -297,6 +297,64 @@ def _looks_like_item_continuation(table):
297
 
298
 
299
  def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  for row in rows:
301
  row_clean = [_clean(c) for c in row]
302
  row_clean = (row_clean + [""] * num_cols)[:num_cols]
@@ -342,13 +400,20 @@ def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
342
  continue
343
  seen_srs.add(key)
344
 
 
 
 
345
  # --- NEW: classify the item ---
346
- category = determine_item_category(desc, unit_val)
347
 
348
  items.append({
349
  "sr": sr_val if sr_val is not None else len(items) + 1,
350
- "description": desc,
351
- "unit": unit_val,
 
 
 
 
352
  "qty": qty_val,
353
  "unit_price": None,
354
  "total_price": None,
 
297
 
298
 
299
  def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
300
+ def _parse_description_parts(raw_desc):
301
+ text = raw_desc.strip()
302
+ if not text:
303
+ return "", "", ""
304
+
305
+ # Pull dosage-like fragments such as "156 Mg/5ml" or "500 mg".
306
+ dosage_match = re.search(
307
+ r"\b\d+(?:\.\d+)?\s*(?:mg|mcg|g|iu|ml|mg/ml|mcg/ml|g/ml)\b(?:\s*/\s*\d+(?:\.\d+)?\s*ml)?",
308
+ text,
309
+ flags=re.IGNORECASE,
310
+ )
311
+ dosage = dosage_match.group(0) if dosage_match else ""
312
+
313
+ # Common dosage forms that appear in descriptions.
314
+ form_match = re.search(
315
+ r"\b(tablet|tab|capsule|cap|suspension|syrup|injection|inj|vial|ampoule|amp|drops|inhaler|ointment|cream|gel|lotion|suppository|supp|solution|powder|elixir|serum)\b",
316
+ text,
317
+ flags=re.IGNORECASE,
318
+ )
319
+ form = form_match.group(0) if form_match else ""
320
+
321
+ cleaned = text
322
+ for fragment in [dosage, form]:
323
+ if fragment:
324
+ cleaned = re.sub(re.escape(fragment), "", cleaned, flags=re.IGNORECASE)
325
+ cleaned = re.sub(r"\s{2,}", " ", cleaned).strip(" ,.-")
326
+
327
+ return cleaned, dosage, form
328
+
329
+ def _parse_pack_from_unit(raw_unit):
330
+ text = raw_unit.strip()
331
+ if not text:
332
+ return "", 0, ""
333
+
334
+ # Match patterns like "Pack of 20 Tablet" or "Box of 100".
335
+ pack_match = re.search(r"\b(pack|box|bottle|bag|tube|vial|ampoule|amp|ea|each|single unit)\b", text, flags=re.IGNORECASE)
336
+ unit_type = pack_match.group(0) if pack_match else ""
337
+
338
+ qty_match = re.search(r"\b(\d+(?:\.\d+)?)\b", text)
339
+ pack_size = 0
340
+ if qty_match:
341
+ try:
342
+ pack_size_val = float(qty_match.group(1))
343
+ pack_size = int(pack_size_val) if pack_size_val.is_integer() else pack_size_val
344
+ except Exception:
345
+ pack_size = 0
346
+
347
+ pack_unit = ""
348
+ trailing = text
349
+ if qty_match:
350
+ trailing = text[qty_match.end():]
351
+ if trailing:
352
+ m = re.search(r"\b([a-zA-Z]+(?:\s+[a-zA-Z]+)?)\b", trailing)
353
+ if m:
354
+ pack_unit = m.group(1).strip()
355
+
356
+ return unit_type.title() if unit_type else "", pack_size, pack_unit.title() if pack_unit else ""
357
+
358
  for row in rows:
359
  row_clean = [_clean(c) for c in row]
360
  row_clean = (row_clean + [""] * num_cols)[:num_cols]
 
400
  continue
401
  seen_srs.add(key)
402
 
403
+ clean_desc, dosage, form = _parse_description_parts(desc)
404
+ unit_type, pack_size, pack_unit = _parse_pack_from_unit(unit_val)
405
+
406
  # --- NEW: classify the item ---
407
+ category = determine_item_category(clean_desc or desc, unit_val)
408
 
409
  items.append({
410
  "sr": sr_val if sr_val is not None else len(items) + 1,
411
+ "description": clean_desc or desc,
412
+ "dosage": dosage,
413
+ "form": form.title() if form else "",
414
+ "pack_size": pack_size,
415
+ "pack_unit": pack_unit,
416
+ "unit": unit_type,
417
  "qty": qty_val,
418
  "unit_price": None,
419
  "total_price": None,