Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| from typing import List, Dict | |
| from pathlib import Path | |
| from simulation.manufacturer_db import ManufacturerDB | |
class EntityExtractor:
    """Rule-based extractor that turns a (transcribed) order text into line items.

    The extractor splits the text into segments, fuzzy-matches each segment
    against the medicine names held by the database object, and pulls the
    dosage form, quantity and dosage out of the segment with keyword and
    regex rules.
    """

    # Dosage units recognised by `_extract_dosage`; `_extract_quantity` uses
    # the same list so that a dosage number is never mistaken for a quantity.
    _DOSAGE_UNITS = r'mg|ml|gm|mcg'

    # Minimum fuzzy-match score (exclusive) for a segment to count as a medicine.
    _MIN_CONFIDENCE = 75

    def __init__(self, db: "ManufacturerDB"):
        """Store the database handle and build the keyword tables.

        Args:
            db: Object exposing a ``medicines`` table with at least the
                ``medicine_name`` and ``dosage`` columns (both are read by
                `extract`).
        """
        self.db = db
        self.aliases = self._load_aliases()

        # Form keywords that indicate a medicine nearby
        self.form_keywords = {
            'tablet': ['tablet', 'tab', 'tabs', 'capsule', 'cap', 'caps'],
            'syrup': ['syrup', 'liquid', 'suspension'],
            'injection': ['injection', 'inj', 'vial', 'ampoule'],
            'cream': ['cream', 'gel', 'ointment', 'tube'],
            'spray': ['spray', 'inhaler', 'puff'],
            'drops': ['drops', 'eye drops', 'ear drops'],
            'sachet': ['sachet', 'powder', 'granules'],
        }

        # Unit keywords for quantity extraction
        self.unit_keywords = [
            'strips', 'strip', 'slips', 'slip', 'bottles', 'bottle',
            'tablets', 'tabs', 'pieces', 'pcs', 'boxes', 'box',
            'packs', 'pack', 'vials', 'vial', 'ampoules',
        ]

        # Spoken number mapping
        self.spoken_numbers = {
            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
            'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
            'eleven': 11, 'twelve': 12, 'fifteen': 15, 'twenty': 20,
            'twenty-five': 25, 'thirty': 30, 'forty': 40, 'fifty': 50,
            'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
            'hundred': 100, 'two hundred': 200, 'three hundred': 300,
            'five hundred': 500, 'thousand': 1000,
        }

    def _load_aliases(self) -> Dict:
        """Load pronunciation aliases from ``data/aliases.json``.

        The path is relative to the current working directory.

        Returns:
            The parsed JSON content, or an empty dict when the file does not
            exist. `_resolve_alias` treats it as a mapping from a canonical
            name to its aliases.
        """
        alias_path = Path("data/aliases.json")
        if alias_path.exists():
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(alias_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {}

    def _normalize_text(self, text: str) -> str:
        """Lower-case the text, strip ASR noise and convert number words to digits.

        Number phrases are replaced longest-first in a single pass, so that
        "two hundred" becomes "200" (not "2 100") and "twenty-five" becomes
        "25" (not "20-5"). Only the phrases listed in ``self.spoken_numbers``
        are known; other compound numbers are converted word by word.

        Note: every "." is replaced by a space, so a decimal such as "2.5"
        is split into "2 5".
        """
        text = text.lower()
        # Remove common ASR artifacts
        text = re.sub(r'</s>|<unk>|<s>', '', text)
        # Remove filler words
        text = re.sub(r'\b(uh|um|like|maybe|please|kindly)\b', '', text)
        # Normalize punctuation
        text = text.replace(",", " , ").replace(".", " ")

        # Convert spoken numbers to digits
        if self.spoken_numbers:
            # Longest phrase first so multi-word entries win over their parts.
            phrases = sorted(self.spoken_numbers, key=len, reverse=True)
            alternatives = '|'.join(
                r'\s+'.join(re.escape(part) for part in phrase.split())
                for phrase in phrases
            )

            def to_digits(found):
                # Collapse whitespace so "two   hundred" maps to the dict key.
                key = ' '.join(found.group(0).split())
                return str(self.spoken_numbers[key])

            text = re.sub(rf'\b(?:{alternatives})\b', to_digits, text)
        return text.strip()

    def _resolve_alias(self, word: str) -> str:
        """Return the canonical name for ``word`` if it is a known alias.

        Comparison is case-insensitive on both sides. When nothing matches,
        ``word`` is returned unchanged.
        """
        word_lower = word.lower()
        for canonical, aliases in self.aliases.items():
            if word_lower == str(canonical).lower():
                return canonical
            if isinstance(aliases, str):
                # A bare string is a single alias, not a container of characters.
                aliases = [aliases]
            if any(word_lower == str(alias).lower() for alias in aliases):
                return canonical
        return word

    def _extract_form(self, segment: str) -> str:
        """Return the dosage-form category mentioned in ``segment``.

        Keywords are matched as whole words (an optional plural "s" is
        allowed), so "cap" does not fire inside "capsaicin". Categories are
        tried in the order of ``self.form_keywords``; "tablet" is the default
        when no keyword is present.
        """
        segment_lower = segment.lower()
        for form_type, keywords in self.form_keywords.items():
            for kw in keywords:
                if re.search(rf'\b{re.escape(kw)}s?\b', segment_lower):
                    return form_type
        return "tablet"  # Default

    def _extract_quantity(self, segment: str) -> tuple:
        """Extract ``(number, unit)`` from ``segment``; both are strings.

        Resolution order:
          1. A number directly followed by a known unit word, e.g.
             "300 strips" or "20 bottles". This is preferred so a dosage
             that appears earlier ("500 mg 20 strips") is not mistaken for
             the quantity.
          2. The first number that is not followed by a dosage unit; the
             unit is then "units".
          3. The default ``("1", "units")``.
        """
        if self.unit_keywords:
            # Longest keyword first so "strips" is preferred over "strip".
            unit_alt = '|'.join(
                re.escape(u)
                for u in sorted(self.unit_keywords, key=len, reverse=True)
            )
            with_unit = re.search(
                rf'(\d+)\s*({unit_alt})\b', segment, re.IGNORECASE
            )
            if with_unit:
                num = with_unit.group(1)
                unit = with_unit.group(2).lower()
                # Normalize common typos
                if unit in ('slips', 'slip'):
                    unit = 'strips'
                return num, unit

        # Whole digit runs only; skip numbers that belong to a dosage.
        bare = re.search(
            rf'(?<!\d)(\d+)(?!\d)(?!\s*(?:{self._DOSAGE_UNITS}))',
            segment,
            re.IGNORECASE,
        )
        if bare:
            return bare.group(1), "units"
        return "1", "units"  # Default

    def _extract_dosage(self, segment: str) -> str:
        """Extract a dosage such as "500mg" from ``segment``.

        Returns:
            The number immediately followed by the lower-cased unit
            (mg / ml / gm / mcg), or "-" when no dosage is present.
        """
        dosage_match = re.search(
            rf'(\d+)\s*({self._DOSAGE_UNITS})', segment, re.IGNORECASE
        )
        if dosage_match:
            return f"{dosage_match.group(1)}{dosage_match.group(2).lower()}"
        return "-"

    def extract(self, text: str) -> List[Dict]:
        """Extract medicine order items from ``text``.

        The text is normalized, split into segments on order verbs, "and"
        and commas, and each segment is fuzzy-matched against the medicine
        names in ``self.db.medicines``.

        Returns:
            A list of dicts, one per matched segment, with the keys
            ``medicine``, ``form``, ``quantity``, ``dosage``, ``confidence``
            and ``original_segment``. Empty when ``text`` is empty or nothing
            matches.
        """
        if not text:
            return []

        # Third-party import kept local to this method, but done once per
        # call instead of once per segment.
        from rapidfuzz import process, fuzz

        # Normalize text
        text = self._normalize_text(text)

        # Get all known medicines from DB for matching
        medicines = self.db.medicines
        known_meds = medicines['medicine_name'].tolist()
        # The normalized text is lower-case, so compare against lower-cased
        # names and map back to the original spelling by index.
        lowered_meds = [str(name).lower() for name in known_meds]

        # Split by multiple delimiters for multi-item orders
        # Handles: "send", "order", "add", "also", "plus", "then", "and", comma
        delimiters = r'\b(?:send|add|want|need|order|also|plus|then)\b|,|\band\b'

        found_orders = []
        for segment in re.split(delimiters, text):
            segment = segment.strip()
            if len(segment) < 3:
                continue

            # First, replace any word that is a known alias
            resolved_segment = ' '.join(
                self._resolve_alias(w) for w in segment.split()
            )

            # Fuzzy match against known medicines
            match = process.extractOne(
                resolved_segment.lower(), lowered_meds, scorer=fuzz.partial_ratio
            )
            if not match or match[1] <= self._MIN_CONFIDENCE:
                continue

            # For a list of choices the third element is the list index.
            med_name = known_meds[match[2]]

            # Extract form, quantity, dosage
            form = self._extract_form(segment)
            num, unit = self._extract_quantity(segment)
            dosage = self._extract_dosage(segment)
            if dosage == "-":
                # Lookup default dosage from DB
                med_row = medicines[medicines['medicine_name'] == med_name].iloc[0]
                dosage = med_row['dosage']

            found_orders.append({
                "medicine": med_name,
                "form": form,
                "quantity": f"{num} {unit}",
                "dosage": dosage,
                "confidence": match[1],
                "original_segment": segment,
            })
        return found_orders