import re
from typing import Optional


class LabDataExtractor:
    """Heuristically pull lab-test values and the patient's age out of report text.

    Each entry of ``self.targets`` maps a test code to its English aliases
    (``name_en``) and a Persian display name (``name_fa``). A value is found by
    locating the first alias that occurs in the text and taking the nearest
    number that does not look like an ID / reference / time field.
    """

    # A numeric token; "." or "," is accepted as the decimal separator
    # (so "1,234" is read as 1.234, not as one thousand two hundred ...).
    _NUMBER_RE = re.compile(r'\b\d+(?:[.,]\d+)?\b')

    # Words that mark the following number as an identifier, reference or
    # timestamp rather than a measurement. A keyword must *start* a word:
    # plain substring matching wrongly fired on e.g. the "id" inside
    # "Triglycerides" and discarded the triglyceride value itself.
    _ID_CONTEXT_RE = re.compile(
        r'\b(?:no|id|ref|reg|scl|lab|time|collection|admission)',
        re.IGNORECASE,
    )

    # How many characters before a number are inspected for ID keywords.
    _ID_CONTEXT_WIDTH = 20

    # A number this far (in characters) or farther from the test name is ignored.
    _MAX_NUMBER_DISTANCE = 300

    # Tried in order; group 1 is the age. The leading \b keeps "Page 3"
    # or "Dosage 5" from being read as an age.
    _AGE_PATTERNS = (
        re.compile(r'\bAge[:\s]*(\d+)', re.IGNORECASE),
        re.compile(r'(\d+)\s*Years', re.IGNORECASE),
    )

    def __init__(self):
        self.targets = {
            "AST": {"name_en": ["AST", "GOT"], "name_fa": "آنزیم کبدی AST"},
            "ALAT": {"name_en": ["ALAT", "GPT", "ALT"], "name_fa": "آنزیم کبدی ALAT"},
            "BILIRUBIN": {"name_en": ["Bilirubin", "T.Bil"], "name_fa": "بیلی‌روبین کل"},
            "WBC": {"name_en": ["WBC"], "name_fa": "گلبول سفید"},
            "RBC": {"name_en": ["RBC"], "name_fa": "گلبول قرمز"},
            "HGB": {
                "name_en": ["HGB", "Hemoglobin", "Hb", "Haemoglobin"],
                "name_fa": "هموگلوبین",
            },
            "HCT": {"name_en": ["HCT", "Hematocrit"], "name_fa": "هماتوکریت"},
            "PLT": {"name_en": ["PLT", "Platelets"], "name_fa": "پلاکت"},
            "MCV": {"name_en": ["MCV"], "name_fa": "حجم گلبول (MCV)"},
            "MCH": {"name_en": ["MCH"], "name_fa": "هموگلوبین گلبول (MCH)"},
            "MCHC": {"name_en": ["MCHC"], "name_fa": "غلظت هموگلوبین (MCHC)"},
            "FBS": {"name_en": ["FBS", "Glucose"], "name_fa": "قند ناشتا"},
            "CHOL": {"name_en": ["Cholesterol", "CHOL"], "name_fa": "کلسترول کل"},
            "HDL": {"name_en": ["HDL"], "name_fa": "کلسترول خوب"},
            "LDL": {"name_en": ["LDL"], "name_fa": "کلسترول بد"},
            "TG": {"name_en": ["Triglycerides", "TG"], "name_fa": "تری‌گلیسرید"},
            "UREA": {"name_en": ["Urea", "BUN"], "name_fa": "اوره خون"},
            "CREAT": {"name_en": ["Creatinine", "Crea"], "name_fa": "کراتینین"},
            "ALP": {"name_en": ["Alkaline Phosphatase", "ALP"], "name_fa": "فسفاتاز قلیایی"},
            "TSH": {"name_en": ["TSH"], "name_fa": "هورمون تیروئید"},
            "IRON": {"name_en": ["Iron", "Serum Iron"], "name_fa": "آهن"},
            "FERRITIN": {"name_en": ["Ferritin"], "name_fa": "فریتین"},
        }

    def get_patient_age(self, raw_text: str) -> Optional[int]:
        """Find the patient's age in the text.

        Looks for ``Age`` followed by digits first, then for digits followed
        by ``Years`` (both case-insensitive).

        Returns:
            The age as an int, or ``None`` when neither form is present or
            the text is empty.
        """
        if not raw_text:
            return None
        for pattern in self._AGE_PATTERNS:
            found = pattern.search(raw_text)
            if found:
                return int(found.group(1))
        return None

    def is_lab_id(self, text, num_obj):
        """Check whether a number is a lab/ID/reference number rather than a value.

        Args:
            text: The full text the number was found in.
            num_obj: Mapping with a ``'start'`` key holding the number's
                start offset in ``text``.

        Returns:
            ``True`` when one of the ID keywords begins a word inside the
            20 characters preceding the number, else ``False``.
        """
        start_index = num_obj['start']
        context_start = max(0, start_index - self._ID_CONTEXT_WIDTH)
        # Search the full string with pos/endpos instead of a slice so that
        # \b sees the real preceding character: a window that begins in the
        # middle of a word must not look like a word start.
        return self._ID_CONTEXT_RE.search(text, context_start, start_index) is not None

    def find_number_nearby(self, text, keyword):
        """Locate a test name and return the number closest to it.

        Args:
            text: Text to search.
            keyword: Iterable of alias names, tried in order; the first alias
                present in ``text`` (whole-word, case-insensitive) is used.
                A single string is also accepted.

        Returns:
            The nearest non-ID number as a float, or ``None`` when no alias
            is found or no acceptable number lies within the distance limit.
        """
        if not text:
            return None
        names = [keyword] if isinstance(keyword, str) else keyword

        name_match = None
        for name in names:
            if not name:
                continue
            # Whole-word match so a short alias is not found inside an
            # unrelated longer word (the original note cites "ALFALAH").
            name_match = re.search(r'\b' + re.escape(name) + r'\b', text, re.IGNORECASE)
            if name_match:
                break
        if name_match is None:
            return None
        name_start, name_end = name_match.span()

        best_match = None
        best_rank = None
        for number in self._NUMBER_RE.finditer(text):
            if self.is_lab_id(text, {'start': number.start()}):
                continue
            # Distance is the gap between the name and the number, not the
            # offset between their starts; otherwise a long name such as
            # "Alkaline Phosphatase" pushes its own value farther away than
            # the previous test's value. On a tie the number that follows
            # the name wins (second tuple element).
            if number.start() >= name_end:
                rank = (number.start() - name_end, 0)
            elif number.end() <= name_start:
                rank = (name_start - number.end(), 1)
            else:
                rank = (0, 0)
            if best_rank is None or rank < best_rank:
                best_rank = rank
                best_match = number

        if best_match is None or best_rank[0] >= self._MAX_NUMBER_DISTANCE:
            return None
        return float(best_match.group().replace(',', '.'))

    def extract_all(self, raw_text: str) -> dict:
        """Extract every known test that can be found in the text.

        Whitespace runs are collapsed to single spaces before searching.

        Returns:
            Dict mapping test code (a key of ``self.targets``) to the float
            value found; tests with no value are omitted. A reading of 0 is
            kept.
        """
        if not raw_text:
            return {}
        clean_raw = " ".join(raw_text.split())
        extracted_data = {}
        for test_code, info in self.targets.items():
            value = self.find_number_nearby(clean_raw, info["name_en"])
            # Compare against None: a plain truthiness test drops 0.0 results.
            if value is not None:
                extracted_data[test_code] = value
        return extracted_data