Spaces:
Running
Running
MakPr016 committed on
Commit ·
eac74fb
1
Parent(s): a77318b
Added categories
Browse files- rfq_parser.py +204 -1
rfq_parser.py
CHANGED
|
@@ -24,6 +24,195 @@ SR_RE = re.compile(r'\b(sr|item\s*no|pos\.?)\b|^no\.?$')
|
|
| 24 |
UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def _get_genai_client():
|
| 28 |
global _client
|
| 29 |
if _client is None:
|
|
@@ -153,6 +342,9 @@ def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
|
|
| 153 |
continue
|
| 154 |
seen_srs.add(key)
|
| 155 |
|
|
|
|
|
|
|
|
|
|
| 156 |
items.append({
|
| 157 |
"sr": sr_val if sr_val is not None else len(items) + 1,
|
| 158 |
"description": desc,
|
|
@@ -163,6 +355,7 @@ def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
|
|
| 163 |
"brand": "",
|
| 164 |
"expiry_date": "",
|
| 165 |
"remarks": "",
|
|
|
|
| 166 |
})
|
| 167 |
|
| 168 |
|
|
@@ -215,7 +408,9 @@ def _extract_line_items_from_llm(full_text):
|
|
| 215 |
"You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
|
| 216 |
"Return a JSON array only. Each object must have exactly these keys: "
|
| 217 |
'{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
|
| 218 |
-
'"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": ""}. '
|
|
|
|
|
|
|
| 219 |
"If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
|
| 220 |
)
|
| 221 |
try:
|
|
@@ -231,6 +426,14 @@ def _extract_line_items_from_llm(full_text):
|
|
| 231 |
)
|
| 232 |
result = json.loads(response.text)
|
| 233 |
if isinstance(result, list):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
return result
|
| 235 |
return []
|
| 236 |
except Exception:
|
|
|
|
| 24 |
UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')
|
| 25 |
|
| 26 |
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# CATEGORY DEFINITIONS (ported from old parser)
|
| 29 |
+
# Ordered by specificity. Whole-word boundary matching is applied.
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
CATEGORY_DEFINITIONS = {
    "Pharmaceuticals & Biologics": [
        "tablet", "tab", "capsule", "cap", "syrup", "suspension", "susp", "injection", "inj", "vial", "ampoule", "amp",
        "drops", "gtt", "inhaler", "vaccine", "insulin", "dose", "drug", "medication", "ointment", "cream", "gel",
        "lotion", "suppository", "supp", "antibiotic", "antiviral", "analgesic", "anesthetic", "hormone", "steroid",
        "vitamin", "mineral", "supplement", "lozenge", "patch", "solution", "powder for suspension", "elixir", "serum",
        "antitoxin",
    ],
    "Surgical Products": [
        "scalpel", "forceps", "retractor", "clamp", "suture", "stapler", "surgical mesh", "hemostatic", "sealant",
        "surgical drape", "surgical gown", "laparoscopic", "robotic surgery", "electrosurgical", "surgical laser",
        "surgical blade", "trocar", "surgical clip", "surgical scissor", "needle holder",
    ],
    "Orthopedic & Spine": [
        "orthopedic", "spine", "joint replacement", "trauma fixation", "bone plate", "bone screw",
        "intramedullary rod", "bone nail", "spinal implant", "spinal fusion", "bone graft", "orthopedic brace",
        "cast", "arthroscopy", "fixator", "prosthesis", "bone drill", "bone saw",
    ],
    "Cardiovascular Products": [
        "cardiac stent", "pacemaker", "defibrillator", "icd", "heart valve", "vascular graft", "cardiac catheter",
        "guidewire", "cardiac balloon", "ablation", "coronary", "angioplasty", "introducer sheath",
    ],
    "Medical Imaging Equipment": [
        "mri", "ct scanner", "x-ray", "ultrasound", "mammography", "fluoroscopy", "pet scanner", "c-arm",
        "medical imaging", "transducer", "x-ray film", "contrast media", "lead apron",
    ],
    "Diagnostic Products": [
        "diagnostic", "test kit", "glucose test", "reagent", "immunoassay", "chemistry analyzer", "hematology",
        "microbiology", "culture media", "pregnancy test", "covid", "rapid test", "urinalysis", "penlight",
        "specula", "otoscope", "ophthalmoscope", "lancet", "glucometer strips", "test strip",
    ],
    "Patient Monitoring Equipment": [
        "vital signs", "ecg", "ekg", "pulse oximeter", "blood pressure monitor", "sphygmomanometer",
        "medical thermometer", "capnography", "fetal monitor", "telemetry", "spo2 sensor", "bp cuff",
        "temperature probe",
    ],
    "Respiratory & Anesthesia": [
        "ventilator", "anesthesia machine", "oxygen concentrator", "nebulizer", "cpap", "bipap", "respiratory",
        "endotracheal", "tracheostomy", "spirometer", "oxygen mask", "breathing circuit", "nasal cannula",
        "resuscitator", "laryngoscope",
    ],
    "Infusion & Vascular Access": [
        "infusion pump", "syringe pump", "iv set", "iv catheter", "venous", "picc", "iv port",
        "dialysis catheter", "administration set", "extension set", "stopcock", "giving set", "saline",
        "dextrose", "ringer", "sodium chloride", "water for injection",
    ],
    "Wound Care & Tissue Management": [
        "wound dressing", "bandage", "gauze", "medical tape", "plaster", "adhesive", "wound foam", "alginate",
        "hydrocolloid", "compression bandage", "ostomy", "skin substitute", "negative pressure",
    ],
    "Dialysis & Renal Care": [
        "hemodialysis", "peritoneal", "dialyzer", "blood line", "fistula needle", "dialysis concentrate",
        "bicarbonate",
    ],
    "Ophthalmic Products": [
        "intraocular", "intraocular lens", "phaco", "vitrectomy", "lasik", "contact lens", "viscoelastic",
        "ophthalmic solution", "eye drops",
    ],
    "Dental Products": [
        "dental implant", "orthodontic", "dental bracket", "dental wire", "dental drill", "dental handpiece",
        "dental cement", "dental composite", "amalgam", "impression material", "teeth whitening", "dental chair",
    ],
    "Neurology & Neurosurgery": [
        "neurostimulation", "spinal cord stimulator", "neuro coil", "flow diverter", "cranial", "shunt",
        "neuro electrode", "eeg", "emg",
    ],
    "Laboratory Equipment & Supplies": [
        "microscope", "lab centrifuge", "incubator", "autoclave", "pipette", "glassware", "test tube",
        "petri dish", "flask", "beaker", "microscope slide", "cover glass", "fume hood", "biosafety cabinet",
    ],
    "Personal Protective Equipment (PPE)": [
        "ppe", "n95", "face shield", "safety eyewear", "goggles", "protective apron", "shoe cover",
        "head cover", "coverall", "isolation gown", "hazmat", "surgical mask",
    ],
    "Sterilization & Disinfection": [
        "sterilization", "disinfectant", "antiseptic", "povidone", "iodine", "chlorhexidine", "alcohol swab",
        "hand sanitizer", "medical soap", "enzymatic cleaner", "detergent", "washer disinfector", "sterilizer",
        "sterilization indicator",
    ],
    "Hospital Furniture & Equipment": [
        "hospital bed", "examination table", "stretcher", "medical trolley", "medical cart", "medical cabinet",
        "bedside locker", "overbed table", "iv pole", "wheelchair",
    ],
    "Rehabilitation & Physical Therapy": [
        "rehabilitation", "physiotherapy", "walker", "walking cane", "crutch", "exercise band", "traction",
        "electrotherapy", "massage table", "orthosis",
    ],
    "Home Healthcare Products": [
        "home care", "blood glucose meter", "hearing aid", "mobility aid", "bathroom safety", "commode",
    ],
    "Emergency & Trauma Care": [
        "emergency kit", "trauma kit", "first aid", "aed", "defibrillator", "manual resuscitator",
        "suction unit", "immobilizer", "cervical collar", "splint", "tourniquet", "crash cart",
    ],
    "Maternal & Neonatal Care": [
        "maternal", "neonatal", "infant incubator", "infant warmer", "phototherapy", "breast pump",
        "obstetric", "birthing bed", "fetal doppler", "umbilical",
    ],
    "Urology Products": [
        "urology", "foley catheter", "urine bag", "urinary drainage", "ureteral stent", "stone basket",
    ],
    "Gastroenterology & Endoscopy": [
        "endoscope", "gastroscope", "colonoscope", "biopsy forceps", "polypectomy snare", "gastric balloon",
        "ercp",
    ],
    "Oncology Products": [
        "oncology", "chemotherapy", "radiotherapy", "brachytherapy", "port-a-cath", "cancer diagnostic",
    ],
    "Pain Management": [
        "pain management", "pca pump", "epidural", "nerve block", "tens unit",
    ],
    "Sleep Medicine": [
        "sleep apnea", "cpap mask", "bipap mask", "sleep tubing", "polysomnography",
    ],
    "Telemedicine & Digital Health": [
        "telemedicine", "telehealth", "remote monitor", "medical software", "health app",
    ],
    "Blood Management": [
        "blood bag", "blood transfusion", "blood bank", "blood warmer", "apheresis",
    ],
    "Mortuary & Pathology": [
        "mortuary", "autopsy", "body bag", "morgue fridge", "dissection table", "microtome",
        "tissue processor",
    ],
    "Environmental Control": [
        "medical gas", "medical vacuum", "medical air plant", "gas manifold", "gas outlet", "gas alarm",
    ],
    "Mobility & Accessibility": [
        "patient lift", "patient hoist", "wheelchair ramp", "stair lift", "transfer board",
    ],
    "Bariatric Products": [
        "bariatric bed", "bariatric wheelchair", "heavy duty scale",
    ],
    "Medical Textiles": [
        "hospital linen", "bed sheet", "pillow case", "medical blanket", "towel", "privacy curtain",
        "medical uniform", "scrub suit", "lab coat",
    ],
    "Infection Control Products": [
        "waste bin", "sharps container", "biohazard bag", "spill kit", "air purifier",
    ],
    "Medical Gases & Cryogenics": [
        "gas cylinder", "oxygen regulator", "flowmeter", "liquid oxygen", "nitrogen tank",
    ],
    "Nutrition & Feeding": [
        "enteral feeding", "clinical nutrition", "nasogastric tube", "feeding pump", "feeding set", "peg tube",
    ],
    "Specimen Collection & Transport": [
        "specimen container", "sample collection", "transport media", "transport swab", "urine container",
        "stool container", "cool box", "transport bag",
    ],
    "Medical Software & IT": [
        "emr", "ehr", "pacs", "ris", "lis", "his", "hospital information system",
    ],
    "Aesthetics & Dermatology": [
        "dermatology", "aesthetic laser", "ipl", "dermal filler", "botulinum", "botox", "chemical peel",
        "microdermabrasion",
    ],
    # Catch-all — must remain last
    "Medical Supplies & Consumables": [
        "syringe", "needle", "glove", "examination glove", "disposable", "consumable", "cotton wool",
        "alcohol prep", "urinal", "bedpan", "underpad", "tongue depressor", "applicator",
        "lubricant jelly", "cannula",
    ],
}


def determine_item_category(description: str, unit: str = "") -> str:
    """Return the first category whose keyword occurs as a whole word in the item.

    The description and unit are joined with a space, lower-cased, and tested
    against every keyword in CATEGORY_DEFINITIONS using ``\\b``-delimited
    regex matching, so "tablet" matches "Tablet 500mg" but not "tablets".

    Categories are tried in declaration order and the first hit wins. A
    generic keyword in an earlier category (e.g. "forceps" under
    "Surgical Products") therefore takes precedence over a more specific
    phrase in a later one (e.g. "biopsy forceps" under
    "Gastroenterology & Endoscopy").

    Args:
        description: Free-text item description. ``None`` is treated as empty
            and non-string values are converted with ``str()``.
        unit: Unit-of-measure text, handled the same way as ``description``.

    Returns:
        The matching category name, or 'Medical Supplies & Consumables' if
        nothing matches.
    """
    # Parsed rows can carry None (e.g. a JSON null for "unit") or numeric
    # cells; plain string concatenation would raise TypeError on those.
    parts = ("" if value is None else str(value) for value in (description, unit))
    text = " ".join(parts).lower()

    for category, keywords in CATEGORY_DEFINITIONS.items():
        for keyword in keywords:
            # re.escape keeps keywords such as "x-ray" or "c-arm" literal.
            if re.search(r'\b' + re.escape(keyword) + r'\b', text):
                return category
    return "Medical Supplies & Consumables"
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# ---------------------------------------------------------------------------
|
| 213 |
+
# Remaining helpers (unchanged from original)
|
| 214 |
+
# ---------------------------------------------------------------------------
|
| 215 |
+
|
| 216 |
def _get_genai_client():
|
| 217 |
global _client
|
| 218 |
if _client is None:
|
|
|
|
| 342 |
continue
|
| 343 |
seen_srs.add(key)
|
| 344 |
|
| 345 |
+
# --- NEW: classify the item ---
|
| 346 |
+
category = determine_item_category(desc, unit_val)
|
| 347 |
+
|
| 348 |
items.append({
|
| 349 |
"sr": sr_val if sr_val is not None else len(items) + 1,
|
| 350 |
"description": desc,
|
|
|
|
| 355 |
"brand": "",
|
| 356 |
"expiry_date": "",
|
| 357 |
"remarks": "",
|
| 358 |
+
"category": category, # ← new field
|
| 359 |
})
|
| 360 |
|
| 361 |
|
|
|
|
| 408 |
"You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
|
| 409 |
"Return a JSON array only. Each object must have exactly these keys: "
|
| 410 |
'{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
|
| 411 |
+
'"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": "", "category": "string"}. '
|
| 412 |
+
"For 'category', classify each item into the most appropriate medical supply category "
|
| 413 |
+
"(e.g. 'Pharmaceuticals & Biologics', 'Surgical Products', 'Diagnostic Products', etc.). "
|
| 414 |
"If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
|
| 415 |
)
|
| 416 |
try:
|
|
|
|
| 426 |
)
|
| 427 |
result = json.loads(response.text)
|
| 428 |
if isinstance(result, list):
|
| 429 |
+
# Apply local rule-based categorisation as a safety net in case
|
| 430 |
+
# the LLM returns an empty or generic category string.
|
| 431 |
+
for item in result:
|
| 432 |
+
if not item.get("category") or item["category"] in ("string", ""):
|
| 433 |
+
item["category"] = determine_item_category(
|
| 434 |
+
item.get("description", ""),
|
| 435 |
+
item.get("unit", ""),
|
| 436 |
+
)
|
| 437 |
return result
|
| 438 |
return []
|
| 439 |
except Exception:
|