MakPr016 committed on
Commit
005ba6a
·
1 Parent(s): 1d09b8f
Files changed (1) hide show
  1. main.py +55 -2
main.py CHANGED
@@ -44,6 +44,49 @@ def is_garbage_row(row_text: str) -> bool:
44
  t = row_text.lower()
45
  return any(bad in t for bad in blacklist)
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  async def delete_file_safety_net(file_path: str, delay: int = 600):
48
  await asyncio.sleep(delay)
49
  try:
@@ -69,6 +112,7 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
69
  try:
70
  qty = 1
71
  qty_idx = -1
 
72
  for i in range(len(cleaned_row) - 1, -1, -1):
73
  val = cleaned_row[i].replace(',', '').replace('.', '')
74
  if val.isdigit() and int(val) < 1000000:
@@ -78,7 +122,9 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
78
 
79
  if qty_idx == -1: continue
80
 
 
81
  desc_idx = 0
 
82
  if re.match(r'^\d+\.?$', cleaned_row[0]) and len(cleaned_row) > 1:
83
  desc_idx = 1
84
 
@@ -86,17 +132,23 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
86
  if re.match(r'^\d+$', description): continue
87
  if is_garbage_row(description): continue
88
 
 
89
  unit = "Unit"
90
  if qty_idx > 0 and qty_idx > desc_idx:
 
91
  potential_unit = cleaned_row[qty_idx - 1]
92
  if len(potential_unit) < 20 and potential_unit != description:
93
  unit = potential_unit
94
 
 
 
 
95
  extracted_items.append({
96
  "inn_name": description,
97
  "quantity": qty,
98
  "form": unit,
99
- "dosage": ""
 
100
  })
101
  except Exception:
102
  continue
@@ -164,7 +216,8 @@ async def match_all(req: MatchRequest):
164
  matches = []
165
  for v in vendors:
166
  cats = [c.lower() for c in v.get('primary_categories', [])]
167
- if 'pharmaceuticals' in cats or 'medical devices' in cats:
 
168
  matches.append({
169
  'vendor_id': v.get('vendor_id'),
170
  'name': v.get('legal_name'),
 
44
  t = row_text.lower()
45
  return any(bad in t for bad in blacklist)
46
 
47
def _keyword_pattern(keywords):
    """Compile *keywords* into one regex that matches any of them as a whole word.

    A keyword only matches when it is not glued to other letters, so 'cap'
    no longer fires inside 'capsule' or 'captopril'. Adjacent digits are
    allowed so that '500mg' still hits 'mg', and an optional 's'/'es' suffix
    keeps simple plurals ('gloves', 'syringes') matching.
    """
    alternation = "|".join(re.escape(k) for k in keywords)
    return re.compile(rf"(?<![a-z])(?:{alternation})(?:e?s)?(?![a-z])")


# Ordered by priority: the first category whose pattern matches wins.
_ITEM_TYPE_RULES = (
    # Priority 1: Medical Supplies (Consumables).
    # Checked first so "Insulin Syringe" is a supply rather than a pharmaceutical.
    ('Medical Supplies', _keyword_pattern([
        'syringe', 'needle', 'cannula', 'catheter', 'glove', 'mask', 'gauze',
        'bandage', 'dressing', 'cotton', 'swab', 'lancet', 'strip', 'test kit',
        'blade', 'suture', 'plaster', 'gown', 'sheet', 'bag', 'alcohol',
        'disinfectant', 'sanitizer', 'tongue depressor', 'specula', 'paper',
        'wipes', 'apron', 'cap', 'shoe cover', 'tape',
    ])),
    # Priority 2: Medical Equipment (Devices/Durable).
    ('Medical Equipment', _keyword_pattern([
        'thermometer', 'sphygmomanometer', 'stethoscope', 'oximeter',
        'glucometer', 'nebulizer', 'otoscope', 'penlight', 'monitor',
        'scale', 'microscope', 'centrifuge', 'refrigerator', 'cool box',
        'freezer', 'lamp', 'bed', 'chair', 'pump', 'bp machine', 'device',
    ])),
    # Priority 3: Pharmaceuticals (Medicines/Drugs).
    # 'suppositories' and 'injectable' are listed explicitly: the plural
    # suffix rule cannot derive them, and they used to match only as a side
    # effect of the substring abbreviations 'supp' and 'inj'.
    ('Pharmaceuticals', _keyword_pattern([
        'tablet', 'capsule', 'cap', 'tab', 'syrup', 'suspension', 'susp',
        'injection', 'injectable', 'inj', 'ampoule', 'amp', 'vial', 'cream',
        'ointment', 'gel', 'suppository', 'suppositories', 'supp', 'drops',
        'inhaler', 'vaccine', 'sera', 'insulin', 'medicine', 'drug', 'mg',
        'ml', 'mcg', 'iu', 'dose', 'solution', 'infusion', 'spray', 'lozenge',
    ])),
)


def determine_item_type(description: str, form: str) -> str:
    """
    Determines the category of the item based on its description and form/unit.
    Categories: Pharmaceuticals, Medical Supplies, Medical Equipment.

    Keywords are matched case-insensitively as whole words (simple plurals
    and digit-adjacent units such as "500mg" included), in priority order
    Supplies -> Equipment -> Pharmaceuticals. Whole-word matching prevents
    short keywords from firing inside unrelated words, e.g. 'cap' inside
    "capsule", which previously classified every capsule as a supply.

    Args:
        description: Free-text item description; None is treated as empty.
        form: Unit/form text taken from the row; None is treated as empty.

    Returns:
        The matched category name, or 'Medical Supplies' when nothing matches.
    """
    text = f"{description or ''} {form or ''}".lower()

    for item_type, pattern in _ITEM_TYPE_RULES:
        if pattern.search(text):
            return item_type

    # Fallback
    return 'Medical Supplies'
89
+
90
  async def delete_file_safety_net(file_path: str, delay: int = 600):
91
  await asyncio.sleep(delay)
92
  try:
 
112
  try:
113
  qty = 1
114
  qty_idx = -1
115
+ # Attempt to find the Quantity column (usually a number towards the end)
116
  for i in range(len(cleaned_row) - 1, -1, -1):
117
  val = cleaned_row[i].replace(',', '').replace('.', '')
118
  if val.isdigit() and int(val) < 1000000:
 
122
 
123
  if qty_idx == -1: continue
124
 
125
+ # Attempt to find Description
126
  desc_idx = 0
127
+ # If first col is just a number (Item No), skip it
128
  if re.match(r'^\d+\.?$', cleaned_row[0]) and len(cleaned_row) > 1:
129
  desc_idx = 1
130
 
 
132
  if re.match(r'^\d+$', description): continue
133
  if is_garbage_row(description): continue
134
 
135
+ # Attempt to find Unit/Form
136
  unit = "Unit"
137
  if qty_idx > 0 and qty_idx > desc_idx:
138
+ # Usually the column before Qty is Unit
139
  potential_unit = cleaned_row[qty_idx - 1]
140
  if len(potential_unit) < 20 and potential_unit != description:
141
  unit = potential_unit
142
 
143
+ # Determine Category
144
+ item_type = determine_item_type(description, unit)
145
+
146
  extracted_items.append({
147
  "inn_name": description,
148
  "quantity": qty,
149
  "form": unit,
150
+ "dosage": "",
151
+ "type": item_type
152
  })
153
  except Exception:
154
  continue
 
216
  matches = []
217
  for v in vendors:
218
  cats = [c.lower() for c in v.get('primary_categories', [])]
219
+ # Simple matching logic - can be expanded to use item['type'] if needed
220
+ if 'pharmaceuticals' in cats or 'medical devices' in cats or 'medical supplies' in cats:
221
  matches.append({
222
  'vendor_id': v.get('vendor_id'),
223
  'name': v.get('legal_name'),