MakPr016
committed on
Commit
·
005ba6a
1
Parent(s):
1d09b8f
update
Browse files
main.py
CHANGED
|
@@ -44,6 +44,49 @@ def is_garbage_row(row_text: str) -> bool:
|
|
| 44 |
t = row_text.lower()
|
| 45 |
return any(bad in t for bad in blacklist)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
async def delete_file_safety_net(file_path: str, delay: int = 600):
|
| 48 |
await asyncio.sleep(delay)
|
| 49 |
try:
|
|
@@ -69,6 +112,7 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
|
|
| 69 |
try:
|
| 70 |
qty = 1
|
| 71 |
qty_idx = -1
|
|
|
|
| 72 |
for i in range(len(cleaned_row) - 1, -1, -1):
|
| 73 |
val = cleaned_row[i].replace(',', '').replace('.', '')
|
| 74 |
if val.isdigit() and int(val) < 1000000:
|
|
@@ -78,7 +122,9 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
|
|
| 78 |
|
| 79 |
if qty_idx == -1: continue
|
| 80 |
|
|
|
|
| 81 |
desc_idx = 0
|
|
|
|
| 82 |
if re.match(r'^\d+\.?$', cleaned_row[0]) and len(cleaned_row) > 1:
|
| 83 |
desc_idx = 1
|
| 84 |
|
|
@@ -86,17 +132,23 @@ def parse_pdf_file(file_path: str) -> List[Dict[str, Any]]:
|
|
| 86 |
if re.match(r'^\d+$', description): continue
|
| 87 |
if is_garbage_row(description): continue
|
| 88 |
|
|
|
|
| 89 |
unit = "Unit"
|
| 90 |
if qty_idx > 0 and qty_idx > desc_idx:
|
|
|
|
| 91 |
potential_unit = cleaned_row[qty_idx - 1]
|
| 92 |
if len(potential_unit) < 20 and potential_unit != description:
|
| 93 |
unit = potential_unit
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
extracted_items.append({
|
| 96 |
"inn_name": description,
|
| 97 |
"quantity": qty,
|
| 98 |
"form": unit,
|
| 99 |
-
"dosage": ""
|
|
|
|
| 100 |
})
|
| 101 |
except Exception:
|
| 102 |
continue
|
|
@@ -164,7 +216,8 @@ async def match_all(req: MatchRequest):
|
|
| 164 |
matches = []
|
| 165 |
for v in vendors:
|
| 166 |
cats = [c.lower() for c in v.get('primary_categories', [])]
|
| 167 |
-
|
|
|
|
| 168 |
matches.append({
|
| 169 |
'vendor_id': v.get('vendor_id'),
|
| 170 |
'name': v.get('legal_name'),
|
|
|
|
| 44 |
t = row_text.lower()
|
| 45 |
return any(bad in t for bad in blacklist)
|
| 46 |
|
| 47 |
+
# Keyword tables for determine_item_type, kept at module level so the tuples
# and the compiled pattern are built once rather than on every call.
_SUPPLIES_KEYWORDS = (
    'syringe', 'needle', 'cannula', 'catheter', 'glove', 'mask', 'gauze',
    'bandage', 'dressing', 'cotton', 'swab', 'lancet', 'strip', 'test kit',
    'blade', 'suture', 'plaster', 'gown', 'sheet', 'bag', 'alcohol',
    'disinfectant', 'sanitizer', 'tongue depressor', 'specula', 'paper',
    'wipes', 'apron', 'cap', 'shoe cover', 'tape',
)
_EQUIPMENT_KEYWORDS = (
    'thermometer', 'sphygmomanometer', 'stethoscope', 'oximeter',
    'glucometer', 'nebulizer', 'otoscope', 'penlight', 'monitor',
    'scale', 'microscope', 'centrifuge', 'refrigerator', 'cool box',
    'freezer', 'lamp', 'bed', 'chair', 'pump', 'bp machine', 'device',
)
_PHARMA_KEYWORDS = (
    'tablet', 'capsule', 'cap', 'tab', 'syrup', 'suspension', 'susp',
    'injection', 'inj', 'ampoule', 'amp', 'vial', 'cream', 'ointment',
    'gel', 'suppository', 'supp', 'drops', 'inhaler', 'vaccine', 'sera',
    'insulin', 'medicine', 'drug', 'mg', 'ml', 'mcg', 'iu', 'dose',
    'solution', 'infusion', 'spray', 'lozenge',
)

# A supplies keyword must END a word (optionally pluralised with "s"/"es").
# No leading boundary is required, so compounds such as "facemask" still hit,
# but a keyword that is merely the prefix of a longer word does not: "cap"
# no longer matches "capsule" or "captopril", and "tape" no longer matches
# "tapentadol".
_SUPPLIES_PATTERN = re.compile(
    r'(?:' + '|'.join(re.escape(k) for k in _SUPPLIES_KEYWORDS) + r')(?:e?s)?\b'
)


def determine_item_type(description: str, form: str) -> str:
    """
    Determines the category of the item based on its description and form/unit.

    The description and form are joined with a space, lower-cased, and tested
    against three keyword lists in priority order; the first list that matches
    decides the category.

    Args:
        description: Free-text item description.
        form: Unit / dosage form text for the item.

    Returns:
        One of 'Medical Supplies', 'Medical Equipment' or 'Pharmaceuticals'.
        'Medical Supplies' is also the fallback when nothing matches.
    """
    text = (description + " " + form).lower()

    # Priority 1: Medical Supplies (Consumables)
    # Checked first to handle cases like "Insulin Syringe" (Supply) vs
    # "Insulin" (Pharma). Because this check runs before the others, it uses
    # word-end matching instead of raw substring matching; otherwise the short
    # keyword 'cap' would claim every "capsule" before the pharma list is
    # ever consulted.
    if _SUPPLIES_PATTERN.search(text):
        return 'Medical Supplies'

    # Priority 2: Medical Equipment (Devices/Durable)
    # Plain substring match, so compounds such as "wheelchair" hit 'chair'.
    if any(k in text for k in _EQUIPMENT_KEYWORDS):
        return 'Medical Equipment'

    # Priority 3: Pharmaceuticals (Medicines/Drugs)
    # Plain substring match: units are usually glued to a number ("500mg")
    # and the short abbreviations ('inj', 'amp', 'supp') double as stems.
    if any(k in text for k in _PHARMA_KEYWORDS):
        return 'Pharmaceuticals'

    # Fallback
    return 'Medical Supplies'
|
| 89 |
+
|
| 90 |
async def delete_file_safety_net(file_path: str, delay: int = 600):
|
| 91 |
await asyncio.sleep(delay)
|
| 92 |
try:
|
|
|
|
| 112 |
try:
|
| 113 |
qty = 1
|
| 114 |
qty_idx = -1
|
| 115 |
+
# Attempt to find the Quantity column (usually a number towards the end)
|
| 116 |
for i in range(len(cleaned_row) - 1, -1, -1):
|
| 117 |
val = cleaned_row[i].replace(',', '').replace('.', '')
|
| 118 |
if val.isdigit() and int(val) < 1000000:
|
|
|
|
| 122 |
|
| 123 |
if qty_idx == -1: continue
|
| 124 |
|
| 125 |
+
# Attempt to find Description
|
| 126 |
desc_idx = 0
|
| 127 |
+
# If first col is just a number (Item No), skip it
|
| 128 |
if re.match(r'^\d+\.?$', cleaned_row[0]) and len(cleaned_row) > 1:
|
| 129 |
desc_idx = 1
|
| 130 |
|
|
|
|
| 132 |
if re.match(r'^\d+$', description): continue
|
| 133 |
if is_garbage_row(description): continue
|
| 134 |
|
| 135 |
+
# Attempt to find Unit/Form
|
| 136 |
unit = "Unit"
|
| 137 |
if qty_idx > 0 and qty_idx > desc_idx:
|
| 138 |
+
# Usually the column before Qty is Unit
|
| 139 |
potential_unit = cleaned_row[qty_idx - 1]
|
| 140 |
if len(potential_unit) < 20 and potential_unit != description:
|
| 141 |
unit = potential_unit
|
| 142 |
|
| 143 |
+
# Determine Category
|
| 144 |
+
item_type = determine_item_type(description, unit)
|
| 145 |
+
|
| 146 |
extracted_items.append({
|
| 147 |
"inn_name": description,
|
| 148 |
"quantity": qty,
|
| 149 |
"form": unit,
|
| 150 |
+
"dosage": "",
|
| 151 |
+
"type": item_type
|
| 152 |
})
|
| 153 |
except Exception:
|
| 154 |
continue
|
|
|
|
| 216 |
matches = []
|
| 217 |
for v in vendors:
|
| 218 |
cats = [c.lower() for c in v.get('primary_categories', [])]
|
| 219 |
+
# Simple matching logic - can be expanded to use item['type'] if needed
|
| 220 |
+
if 'pharmaceuticals' in cats or 'medical devices' in cats or 'medical supplies' in cats:
|
| 221 |
matches.append({
|
| 222 |
'vendor_id': v.get('vendor_id'),
|
| 223 |
'name': v.get('legal_name'),
|