MakPr016 committed on
Commit
eac74fb
·
1 Parent(s): a77318b

Added categories

Browse files
Files changed (1) hide show
  1. rfq_parser.py +204 -1
rfq_parser.py CHANGED
@@ -24,6 +24,195 @@ SR_RE = re.compile(r'\b(sr|item\s*no|pos\.?)\b|^no\.?$')
24
  UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def _get_genai_client():
28
  global _client
29
  if _client is None:
@@ -153,6 +342,9 @@ def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
153
  continue
154
  seen_srs.add(key)
155
 
 
 
 
156
  items.append({
157
  "sr": sr_val if sr_val is not None else len(items) + 1,
158
  "description": desc,
@@ -163,6 +355,7 @@ def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
163
  "brand": "",
164
  "expiry_date": "",
165
  "remarks": "",
 
166
  })
167
 
168
 
@@ -215,7 +408,9 @@ def _extract_line_items_from_llm(full_text):
215
  "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
216
  "Return a JSON array only. Each object must have exactly these keys: "
217
  '{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
218
- '"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": ""}. '
 
 
219
  "If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
220
  )
221
  try:
@@ -231,6 +426,14 @@ def _extract_line_items_from_llm(full_text):
231
  )
232
  result = json.loads(response.text)
233
  if isinstance(result, list):
 
 
 
 
 
 
 
 
234
  return result
235
  return []
236
  except Exception:
 
24
  UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')
25
 
26
 
27
# ---------------------------------------------------------------------------
# CATEGORY DEFINITIONS (ported from old parser)
# Maps category name -> list of keywords. determine_item_category() walks the
# categories in the order written here and returns on the first keyword hit,
# so more specific categories must be listed before more general ones.
# Keywords are matched on whole-word boundaries against lower-cased text, so
# every keyword must itself be lower case.
# NOTE(review): because the first hit wins, some keywords can never be the
# deciding match, e.g. "eye drops" (Ophthalmic Products) is shadowed by
# "drops" under Pharmaceuticals & Biologics, and "defibrillator" is listed
# under both Cardiovascular Products and Emergency & Trauma Care. Confirm
# whether this ordering is intended before relying on those entries.
# ---------------------------------------------------------------------------
CATEGORY_DEFINITIONS = {
    "Pharmaceuticals & Biologics": [
        "tablet", "tab", "capsule", "cap", "syrup", "suspension", "susp", "injection", "inj", "vial", "ampoule", "amp",
        "drops", "gtt", "inhaler", "vaccine", "insulin", "dose", "drug", "medication", "ointment", "cream", "gel",
        "lotion", "suppository", "supp", "antibiotic", "antiviral", "analgesic", "anesthetic", "hormone", "steroid",
        "vitamin", "mineral", "supplement", "lozenge", "patch", "solution", "powder for suspension", "elixir", "serum",
        "antitoxin",
    ],
    "Surgical Products": [
        "scalpel", "forceps", "retractor", "clamp", "suture", "stapler", "surgical mesh", "hemostatic", "sealant",
        "surgical drape", "surgical gown", "laparoscopic", "robotic surgery", "electrosurgical", "surgical laser",
        "surgical blade", "trocar", "surgical clip", "surgical scissor", "needle holder",
    ],
    "Orthopedic & Spine": [
        "orthopedic", "spine", "joint replacement", "trauma fixation", "bone plate", "bone screw",
        "intramedullary rod", "bone nail", "spinal implant", "spinal fusion", "bone graft", "orthopedic brace",
        "cast", "arthroscopy", "fixator", "prosthesis", "bone drill", "bone saw",
    ],
    "Cardiovascular Products": [
        "cardiac stent", "pacemaker", "defibrillator", "icd", "heart valve", "vascular graft", "cardiac catheter",
        "guidewire", "cardiac balloon", "ablation", "coronary", "angioplasty", "introducer sheath",
    ],
    "Medical Imaging Equipment": [
        "mri", "ct scanner", "x-ray", "ultrasound", "mammography", "fluoroscopy", "pet scanner", "c-arm",
        "medical imaging", "transducer", "x-ray film", "contrast media", "lead apron",
    ],
    "Diagnostic Products": [
        "diagnostic", "test kit", "glucose test", "reagent", "immunoassay", "chemistry analyzer", "hematology",
        "microbiology", "culture media", "pregnancy test", "covid", "rapid test", "urinalysis", "penlight",
        "specula", "otoscope", "ophthalmoscope", "lancet", "glucometer strips", "test strip",
    ],
    "Patient Monitoring Equipment": [
        "vital signs", "ecg", "ekg", "pulse oximeter", "blood pressure monitor", "sphygmomanometer",
        "medical thermometer", "capnography", "fetal monitor", "telemetry", "spo2 sensor", "bp cuff",
        "temperature probe",
    ],
    "Respiratory & Anesthesia": [
        "ventilator", "anesthesia machine", "oxygen concentrator", "nebulizer", "cpap", "bipap", "respiratory",
        "endotracheal", "tracheostomy", "spirometer", "oxygen mask", "breathing circuit", "nasal cannula",
        "resuscitator", "laryngoscope",
    ],
    "Infusion & Vascular Access": [
        "infusion pump", "syringe pump", "iv set", "iv catheter", "venous", "picc", "iv port",
        "dialysis catheter", "administration set", "extension set", "stopcock", "giving set", "saline",
        "dextrose", "ringer", "sodium chloride", "water for injection",
    ],
    "Wound Care & Tissue Management": [
        "wound dressing", "bandage", "gauze", "medical tape", "plaster", "adhesive", "wound foam", "alginate",
        "hydrocolloid", "compression bandage", "ostomy", "skin substitute", "negative pressure",
    ],
    "Dialysis & Renal Care": [
        "hemodialysis", "peritoneal", "dialyzer", "blood line", "fistula needle", "dialysis concentrate",
        "bicarbonate",
    ],
    "Ophthalmic Products": [
        "intraocular", "intraocular lens", "phaco", "vitrectomy", "lasik", "contact lens", "viscoelastic",
        "ophthalmic solution", "eye drops",
    ],
    "Dental Products": [
        "dental implant", "orthodontic", "dental bracket", "dental wire", "dental drill", "dental handpiece",
        "dental cement", "dental composite", "amalgam", "impression material", "teeth whitening", "dental chair",
    ],
    "Neurology & Neurosurgery": [
        "neurostimulation", "spinal cord stimulator", "neuro coil", "flow diverter", "cranial", "shunt",
        "neuro electrode", "eeg", "emg",
    ],
    "Laboratory Equipment & Supplies": [
        "microscope", "lab centrifuge", "incubator", "autoclave", "pipette", "glassware", "test tube",
        "petri dish", "flask", "beaker", "microscope slide", "cover glass", "fume hood", "biosafety cabinet",
    ],
    "Personal Protective Equipment (PPE)": [
        "ppe", "n95", "face shield", "safety eyewear", "goggles", "protective apron", "shoe cover",
        "head cover", "coverall", "isolation gown", "hazmat", "surgical mask",
    ],
    "Sterilization & Disinfection": [
        "sterilization", "disinfectant", "antiseptic", "povidone", "iodine", "chlorhexidine", "alcohol swab",
        "hand sanitizer", "medical soap", "enzymatic cleaner", "detergent", "washer disinfector", "sterilizer",
        "sterilization indicator",
    ],
    "Hospital Furniture & Equipment": [
        "hospital bed", "examination table", "stretcher", "medical trolley", "medical cart", "medical cabinet",
        "bedside locker", "overbed table", "iv pole", "wheelchair",
    ],
    "Rehabilitation & Physical Therapy": [
        "rehabilitation", "physiotherapy", "walker", "walking cane", "crutch", "exercise band", "traction",
        "electrotherapy", "massage table", "orthosis",
    ],
    "Home Healthcare Products": [
        "home care", "blood glucose meter", "hearing aid", "mobility aid", "bathroom safety", "commode",
    ],
    "Emergency & Trauma Care": [
        "emergency kit", "trauma kit", "first aid", "aed", "defibrillator", "manual resuscitator",
        "suction unit", "immobilizer", "cervical collar", "splint", "tourniquet", "crash cart",
    ],
    "Maternal & Neonatal Care": [
        "maternal", "neonatal", "infant incubator", "infant warmer", "phototherapy", "breast pump",
        "obstetric", "birthing bed", "fetal doppler", "umbilical",
    ],
    "Urology Products": [
        "urology", "foley catheter", "urine bag", "urinary drainage", "ureteral stent", "stone basket",
    ],
    "Gastroenterology & Endoscopy": [
        "endoscope", "gastroscope", "colonoscope", "biopsy forceps", "polypectomy snare", "gastric balloon",
        "ercp",
    ],
    "Oncology Products": [
        "oncology", "chemotherapy", "radiotherapy", "brachytherapy", "port-a-cath", "cancer diagnostic",
    ],
    "Pain Management": [
        "pain management", "pca pump", "epidural", "nerve block", "tens unit",
    ],
    "Sleep Medicine": [
        "sleep apnea", "cpap mask", "bipap mask", "sleep tubing", "polysomnography",
    ],
    "Telemedicine & Digital Health": [
        "telemedicine", "telehealth", "remote monitor", "medical software", "health app",
    ],
    "Blood Management": [
        "blood bag", "blood transfusion", "blood bank", "blood warmer", "apheresis",
    ],
    "Mortuary & Pathology": [
        "mortuary", "autopsy", "body bag", "morgue fridge", "dissection table", "microtome",
        "tissue processor",
    ],
    "Environmental Control": [
        "medical gas", "medical vacuum", "medical air plant", "gas manifold", "gas outlet", "gas alarm",
    ],
    "Mobility & Accessibility": [
        "patient lift", "patient hoist", "wheelchair ramp", "stair lift", "transfer board",
    ],
    "Bariatric Products": [
        "bariatric bed", "bariatric wheelchair", "heavy duty scale",
    ],
    "Medical Textiles": [
        "hospital linen", "bed sheet", "pillow case", "medical blanket", "towel", "privacy curtain",
        "medical uniform", "scrub suit", "lab coat",
    ],
    "Infection Control Products": [
        "waste bin", "sharps container", "biohazard bag", "spill kit", "air purifier",
    ],
    "Medical Gases & Cryogenics": [
        "gas cylinder", "oxygen regulator", "flowmeter", "liquid oxygen", "nitrogen tank",
    ],
    "Nutrition & Feeding": [
        "enteral feeding", "clinical nutrition", "nasogastric tube", "feeding pump", "feeding set", "peg tube",
    ],
    "Specimen Collection & Transport": [
        "specimen container", "sample collection", "transport media", "transport swab", "urine container",
        "stool container", "cool box", "transport bag",
    ],
    "Medical Software & IT": [
        "emr", "ehr", "pacs", "ris", "lis", "his", "hospital information system",
    ],
    "Aesthetics & Dermatology": [
        "dermatology", "aesthetic laser", "ipl", "dermal filler", "botulinum", "botox", "chemical peel",
        "microdermabrasion",
    ],
    # Catch-all — must remain last
    "Medical Supplies & Consumables": [
        "syringe", "needle", "glove", "examination glove", "disposable", "consumable", "cotton wool",
        "alcohol prep", "urinal", "bedpan", "underpad", "tongue depressor", "applicator",
        "lubricant jelly", "cannula",
    ],
}


def determine_item_category(description: str, unit: str = "") -> str:
    """Return the first category whose keyword list matches a line item.

    The description and unit are joined with a space, lower-cased, and tested
    against each keyword in ``CATEGORY_DEFINITIONS`` using whole-word regex
    matching. Categories are tried in definition order and the first one with
    a matching keyword is returned.

    Args:
        description: Free-text item description. ``None`` is treated as an
            empty string and other non-string values are converted with
            ``str()``, so values taken straight from parsed JSON cannot raise.
        unit: Optional unit text, handled the same way as ``description``.

    Returns:
        The matching category name, or 'Medical Supplies & Consumables' when
        no keyword matches.
    """
    # JSON ``null`` arrives here as None (e.g. via ``item.get("unit", "")``
    # when the key is present but null); coerce instead of letting the string
    # concatenation raise TypeError.
    desc_text = "" if description is None else str(description)
    unit_text = "" if unit is None else str(unit)
    text = (desc_text + " " + unit_text).lower()

    for category, keywords in CATEGORY_DEFINITIONS.items():
        for keyword in keywords:
            # \b on both sides keeps short keywords such as "tab" or "cap"
            # from matching inside longer words like "stable" or "scalpel".
            pattern = r'\b' + re.escape(keyword) + r'\b'
            if re.search(pattern, text):
                return category
    return "Medical Supplies & Consumables"
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Remaining helpers (unchanged from original)
214
+ # ---------------------------------------------------------------------------
215
+
216
  def _get_genai_client():
217
  global _client
218
  if _client is None:
 
342
  continue
343
  seen_srs.add(key)
344
 
345
+ # --- NEW: classify the item ---
346
+ category = determine_item_category(desc, unit_val)
347
+
348
  items.append({
349
  "sr": sr_val if sr_val is not None else len(items) + 1,
350
  "description": desc,
 
355
  "brand": "",
356
  "expiry_date": "",
357
  "remarks": "",
358
+ "category": category, # ← new field
359
  })
360
 
361
 
 
408
  "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
409
  "Return a JSON array only. Each object must have exactly these keys: "
410
  '{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
411
+ '"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": "", "category": "string"}. '
412
+ "For 'category', classify each item into the most appropriate medical supply category "
413
+ "(e.g. 'Pharmaceuticals & Biologics', 'Surgical Products', 'Diagnostic Products', etc.). "
414
  "If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
415
  )
416
  try:
 
426
  )
427
  result = json.loads(response.text)
428
  if isinstance(result, list):
429
+ # Apply local rule-based categorisation as a safety net in case
430
+ # the LLM returns an empty or generic category string.
431
+ for item in result:
432
+ if not item.get("category") or item["category"] in ("string", ""):
433
+ item["category"] = determine_item_category(
434
+ item.get("description", ""),
435
+ item.get("unit", ""),
436
+ )
437
  return result
438
  return []
439
  except Exception: