Spaces:

Khedhar
/

pharma-voice-orders

Sleeping

App Files Files Community

pharma-voice-orders / core /entity_extractor.py

Khedhar

project start

ee14b8f about 1 month ago

raw

history blame contribute delete

6.6 kB

	import re
	import json
	from typing import List, Dict
	from pathlib import Path
	from simulation.manufacturer_db import ManufacturerDB

	class EntityExtractor:
	    """Extract medicine order lines from transcribed (spoken) order text.

	    The text is normalized, split into segments on order verbs, conjunctions
	    and commas, and each segment is fuzzy-matched against the medicine names
	    held by the supplied database. For every accepted match the form,
	    quantity and dosage are parsed out of the same segment.
	    """

	    # A fuzzy match is accepted only when its score is strictly above this.
	    CONFIDENCE_THRESHOLD = 75

	    # Number (optionally decimal) followed by a dosage unit, e.g. "500 mg".
	    _DOSAGE_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(mg|ml|gm|mcg)', re.IGNORECASE)

	    def __init__(self, db: "ManufacturerDB"):
	        """Store the database handle and build the keyword tables.

	        Args:
	            db: Object exposing a ``medicines`` table with ``medicine_name``
	                and ``dosage`` columns (read by ``extract``).
	        """
	        self.db = db
	        self.aliases = self._load_aliases()

	        # Form keywords that indicate a medicine nearby
	        self.form_keywords = {
	            'tablet': ['tablet', 'tab', 'tabs', 'capsule', 'cap', 'caps'],
	            'syrup': ['syrup', 'liquid', 'suspension'],
	            'injection': ['injection', 'inj', 'vial', 'ampoule'],
	            'cream': ['cream', 'gel', 'ointment', 'tube'],
	            'spray': ['spray', 'inhaler', 'puff'],
	            'drops': ['drops', 'eye drops', 'ear drops'],
	            'sachet': ['sachet', 'powder', 'granules']
	        }

	        # Unit keywords for quantity extraction
	        self.unit_keywords = ['strips', 'strip', 'slips', 'slip', 'bottles', 'bottle',
	                              'tablets', 'tabs', 'pieces', 'pcs', 'boxes', 'box',
	                              'packs', 'pack', 'vials', 'vial', 'ampoules']

	        # Spoken number mapping
	        self.spoken_numbers = {
	            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
	            'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
	            'eleven': 11, 'twelve': 12, 'fifteen': 15, 'twenty': 20,
	            'twenty-five': 25, 'thirty': 30, 'forty': 40, 'fifty': 50,
	            'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
	            'hundred': 100, 'two hundred': 200, 'three hundred': 300,
	            'five hundred': 500, 'thousand': 1000
	        }

	    def _load_aliases(self) -> Dict:
	        """Load pronunciation aliases from ``data/aliases.json``.

	        The path is relative to the current working directory.

	        Returns:
	            The parsed JSON content, or an empty dict when the file is absent.
	        """
	        alias_path = Path("data/aliases.json")
	        try:
	            # Explicit encoding so the result does not depend on the locale.
	            with open(alias_path, 'r', encoding='utf-8') as f:
	                return json.load(f)
	        except FileNotFoundError:
	            return {}

	    def _normalize_text(self, text: str) -> str:
	        """Normalize input text for parsing.

	        Lowercases the text, strips ASR marker tokens and filler words, pads
	        commas with spaces, replaces sentence periods with spaces (decimal
	        points between digits are kept) and converts spoken numbers to digits.

	        Args:
	            text: Raw transcription.

	        Returns:
	            The normalized text with leading/trailing whitespace removed.
	        """
	        text = text.lower()
	        # Remove common ASR artifacts
	        text = re.sub(r'</s>|<unk>|<s>', '', text)
	        # Remove filler words
	        text = re.sub(r'\b(?:uh|um|like|maybe|please|kindly)\b', '', text)
	        # Normalize punctuation; a period between two digits is a decimal
	        # point (e.g. "0.5 mg") and must survive for dosage parsing.
	        text = text.replace(",", " , ")
	        text = re.sub(r'(?<!\d)\.|\.(?!\d)', ' ', text)
	        # Convert spoken numbers to digits in ONE pass, longest phrase first,
	        # so "two hundred" becomes 200 rather than "2 100".
	        phrases = sorted(self.spoken_numbers, key=len, reverse=True)
	        if phrases:
	            pattern = r'\b(?:' + '|'.join(re.escape(p) for p in phrases) + r')\b'
	            text = re.sub(
	                pattern,
	                lambda m: str(self.spoken_numbers[m.group(0)]),
	                text,
	            )
	        return text.strip()

	    def _resolve_alias(self, word: str) -> str:
	        """Map *word* to its canonical medicine name when it is a known alias.

	        Comparison is case-insensitive on both sides.
	        NOTE(review): assumes ``self.aliases`` maps a canonical name to a
	        list of alias strings (a single string is also accepted) - confirm
	        against data/aliases.json.

	        Args:
	            word: A single token from the segment.

	        Returns:
	            The canonical name (as spelled in the alias table) on a match,
	            otherwise *word* unchanged.
	        """
	        word_lower = word.lower()
	        for canonical, aliases in self.aliases.items():
	            if isinstance(aliases, str):
	                # A bare string would otherwise be iterated per character.
	                aliases = [aliases]
	            if word_lower == canonical.lower() or any(
	                word_lower == str(alias).lower() for alias in aliases
	            ):
	                return canonical
	        return word

	    def _extract_form(self, segment: str) -> str:
	        """Return the form type whose keyword occurs in *segment*.

	        Keywords are matched as whole words (with an optional plural "s") so
	        that e.g. "cap" does not fire inside "captopril". Form types are
	        tried in the insertion order of ``self.form_keywords``.

	        Returns:
	            The matching form type, or ``"tablet"`` when none matches.
	        """
	        segment_lower = segment.lower()
	        for form_type, keywords in self.form_keywords.items():
	            for kw in keywords:
	                if re.search(rf'\b{re.escape(kw)}s?\b', segment_lower):
	                    return form_type
	        return "tablet"  # Default

	    def _extract_quantity(self, segment: str) -> tuple:
	        """Extract quantity and unit from *segment*.

	        Preference order:
	          1. a number directly followed by a known unit ("300 strips");
	          2. the first bare number that is not part of a dosage ("dolo 650");
	          3. the default ``("1", "units")``.

	        Returns:
	            ``(number_as_string, unit)``; "slip"/"slips" (a common
	            mis-transcription) is reported as "strips".
	        """
	        # Longest unit first so "strips" is never cut short to "strip".
	        units = sorted(self.unit_keywords, key=len, reverse=True)
	        if units:
	            unit_alt = '|'.join(re.escape(u) for u in units)
	            with_unit = re.search(
	                rf'(\d+)\s*({unit_alt})\b', segment, re.IGNORECASE
	            )
	            if with_unit:
	                unit = with_unit.group(2).lower()
	                # Normalize common typos
	                if unit in ('slips', 'slip'):
	                    unit = 'strips'
	                return with_unit.group(1), unit

	        # Drop dosage expressions so "500 mg" is not mistaken for a quantity.
	        remainder = self._DOSAGE_RE.sub(' ', segment)
	        bare = re.search(r'\d+', remainder)
	        if bare:
	            return bare.group(0), "units"

	        return "1", "units"  # Default

	    def _extract_dosage(self, segment: str) -> str:
	        """Extract a dosage such as "500mg" or "0.5ml" from *segment*.

	        Returns:
	            Number and unit joined without a space, or ``"-"`` when the
	            segment contains no dosage.
	        """
	        dosage_match = self._DOSAGE_RE.search(segment)
	        if dosage_match:
	            return f"{dosage_match.group(1)}{dosage_match.group(2)}"
	        return "-"

	    def extract(self, text: str) -> List[Dict]:
	        """Extract medicine entities from text.

	        Args:
	            text: Raw order text; empty/None yields an empty list.

	        Returns:
	            One dict per recognized item with the keys ``medicine``,
	            ``form``, ``quantity`` ("<number> <unit>"), ``dosage``,
	            ``confidence`` (fuzzy-match score) and ``original_segment``.
	        """
	        if not text:
	            return []

	        # Imported once per call (not once per segment); kept local so the
	        # module can be imported without rapidfuzz being loaded.
	        from rapidfuzz import fuzz, process, utils

	        # Normalize text
	        text = self._normalize_text(text)

	        found_orders = []

	        # Get all known medicines from DB for matching
	        medicines = self.db.medicines
	        known_meds = medicines['medicine_name'].tolist()

	        # Split by multiple delimiters for multi-item orders
	        # Handles: "send", "order", "add", "also", "plus", "then", "and", comma
	        delimiters = r'\b(?:send|add|want|need|order|also|plus|then)\b|,|\band\b'

	        for segment in re.split(delimiters, text):
	            segment = segment.strip()
	            if len(segment) < 3:
	                continue

	            # First, replace any word that is a known alias
	            resolved_segment = ' '.join(
	                self._resolve_alias(w) for w in segment.split()
	            )

	            # Fuzzy match against known medicines. The query was lowercased
	            # during normalization, so both sides are preprocessed to keep
	            # the comparison case-insensitive on every rapidfuzz version.
	            match = process.extractOne(
	                resolved_segment,
	                known_meds,
	                scorer=fuzz.partial_ratio,
	                processor=utils.default_process,
	            )
	            if not match or match[1] <= self.CONFIDENCE_THRESHOLD:
	                continue

	            med_name = match[0]

	            # Extract form, quantity, dosage
	            form = self._extract_form(segment)
	            num, unit = self._extract_quantity(segment)
	            quantity = f"{num} {unit}"

	            dosage = self._extract_dosage(segment)
	            if dosage == "-":
	                # Lookup default dosage from DB
	                med_row = medicines[medicines['medicine_name'] == med_name].iloc[0]
	                dosage = med_row['dosage']

	            found_orders.append({
	                "medicine": med_name,
	                "form": form,
	                "quantity": quantity,
	                "dosage": dosage,
	                "confidence": match[1],
	                "original_segment": segment
	            })

	        return found_orders