import re
from typing import Optional


class LabDataExtractor:
    """Pull numeric lab-test results out of the raw text of a lab report.

    Each target test is located by one of its English names; the value is
    taken from a number printed near that name, ignoring numbers that look
    like identifiers or timestamps (see ``is_lab_id``).
    """

    # Words that, when they appear just before a number, mark that number as
    # an identifier / timestamp rather than a result.  The two-letter words
    # must match as whole words; the longer ones match at the start of a word
    # (so "Reference" and "Laboratory" still count).  Plain substring
    # matching is deliberately avoided: "id" occurs inside "Triglycerides"
    # and "no" inside "Eosinophils".
    _ID_CONTEXT_PATTERN = re.compile(
        r'\b(?:(?:no|id)\b|ref|reg|scl|lab|time|collection|admission)',
        re.IGNORECASE,
    )

    # An integer or decimal number (dot or comma separator) that is not
    # directly preceded by "<" / ">" and is not the fractional tail of a
    # number that was itself skipped (e.g. the "05" of "<0.05").
    _NUMBER_PATTERN = re.compile(r'(?<![<>])(?<!\d[.,])\b\d+[.,]?\d*\b')

    # How many characters before a number are inspected for ID keywords.
    _ID_CONTEXT_CHARS = 20

    # A number must start fewer than this many characters away from the
    # start of the test name to be accepted as that test's value.
    _MAX_VALUE_DISTANCE = 300

    def __init__(self):
        # test code -> English aliases searched for in the report
        # ("name_en") and a Persian display label ("name_fa").
        self.targets = {
            "AST": {"name_en": ["AST", "GOT"], "name_fa": "آنزیم کبدی AST"},
            "ALAT": {"name_en": ["ALAT", "GPT", "ALT"], "name_fa": "آنزیم کبدی ALAT"},
            "BILIRUBIN": {"name_en": ["Bilirubin", "T.Bil"], "name_fa": "بیلیروبین کل"},
            "WBC": {"name_en": ["WBC"], "name_fa": "گلبول سفید"},
            "RBC": {"name_en": ["RBC"], "name_fa": "گلبول قرمز"},
            "HGB": {
                "name_en": ["HGB", "Hemoglobin", "Hb", "Haemoglobin"],
                "name_fa": "هموگلوبین"
            },
            "HCT": {"name_en": ["HCT", "Hematocrit"], "name_fa": "هماتوکریت"},
            "PLT": {"name_en": ["PLT", "Platelets"], "name_fa": "پلاکت"},
            "MCV": {"name_en": ["MCV"], "name_fa": "حجم گلبول (MCV)"},
            "MCH": {"name_en": ["MCH"], "name_fa": "هموگلوبین گلبول (MCH)"},
            "MCHC": {"name_en": ["MCHC"], "name_fa": "غلظت هموگلوبین (MCHC)"},
            "FBS": {"name_en": ["FBS", "Glucose"], "name_fa": "قند ناشتا"},
            "CHOL": {"name_en": ["Cholesterol", "CHOL"], "name_fa": "کلسترول کل"},
            "HDL": {"name_en": ["HDL"], "name_fa": "کلسترول خوب"},
            "LDL": {"name_en": ["LDL"], "name_fa": "کلسترول بد"},
            "TG": {"name_en": ["Triglycerides", "TG"], "name_fa": "تریگلیسرید"},
            "UREA": {"name_en": ["Urea", "BUN"], "name_fa": "اوره خون"},
            "CREAT": {"name_en": ["Creatinine", "Crea"], "name_fa": "کراتینین"},
            "ALP": {"name_en": ["Alkaline Phosphatase", "ALP"], "name_fa": "فسفاتاز قلیایی"},
            "TSH": {"name_en": ["TSH"], "name_fa": "هورمون تیروئید"},
            "IRON": {"name_en": ["Iron", "Serum Iron"], "name_fa": "آهن"},
            "FERRITIN": {"name_en": ["Ferritin"], "name_fa": "فریتین"}
        }

    def get_patient_age(self, raw_text: str) -> Optional[int]:
        """Find the patient's age in the report text.

        Tries an ``Age: 45`` style label first, then a ``45 Years`` style
        phrase.  Matching is case-insensitive.

        Returns:
            The age as an int, or ``None`` when neither form is present.
        """
        # The look-behind stops "Age" from matching inside a longer word
        # such as "Page 3" or "Dosage 5".
        age_match = re.search(
            r'(?<![A-Za-z])Age[:\s]*(\d+)', raw_text, re.IGNORECASE
        )
        if not age_match:
            age_match = re.search(r'(\d+)\s*Years', raw_text, re.IGNORECASE)
        return int(age_match.group(1)) if age_match else None

    def is_lab_id(self, text, num_obj):
        """Tell whether a number looks like an ID / timestamp, not a result.

        The characters immediately before the number (at most
        ``_ID_CONTEXT_CHARS`` of them) are searched for an identifier keyword
        such as "No", "ID", "Ref", "Lab" or "Time".

        Args:
            text: The full report text.
            num_obj: Mapping whose ``'start'`` entry is the index in ``text``
                where the number begins.

        Returns:
            ``True`` when an identifier keyword precedes the number.

        Note:
            The window is purely positional, so a genuine value that is
            printed within the window after an ID label is rejected as well.
        """
        start_index = num_obj['start']
        context_start = max(0, start_index - self._ID_CONTEXT_CHARS)
        # Searching with pos/endpos (instead of slicing) lets ``\b`` see the
        # character before the window, so a word cut in half by the window
        # edge is not mistaken for a keyword.
        hit = self._ID_CONTEXT_PATTERN.search(text, context_start, start_index)
        return hit is not None

    @staticmethod
    def _parse_number(number_match):
        """Convert a ``_NUMBER_PATTERN`` match to float (comma = decimal)."""
        return float(number_match.group().replace(',', '.'))

    def find_number_nearby(self, text, keyword):
        """Locate a test name in ``text`` and return the value printed by it.

        Args:
            text: The report text to search.
            keyword: Iterable of alternative names for one test.  They are
                tried in order and the first name found anywhere in ``text``
                (whole word, case-insensitive) is used.

        Returns:
            The value as a float, or ``None`` when no name is found or no
            acceptable number starts within ``_MAX_VALUE_DISTANCE``
            characters of the start of the name.

        The first acceptable number *after* the name is preferred, because
        in a "NAME value NAME value" layout the previous test's value is
        otherwise at least as close to the name as the test's own value.
        Only when nothing usable follows is the nearest acceptable number
        *before* the name returned.
        """
        name_match = None
        for name in keyword:
            pattern = r'\b' + re.escape(name) + r'\b'
            name_match = re.search(pattern, text, re.IGNORECASE)
            if name_match:
                break
        if name_match is None:
            return None

        name_start, name_end = name_match.span()

        # Preferred: the first non-ID number following the name.
        for candidate in self._NUMBER_PATTERN.finditer(text, name_end):
            if candidate.start() - name_start >= self._MAX_VALUE_DISTANCE:
                break  # every later number is even farther away
            if not self.is_lab_id(text, {'start': candidate.start()}):
                return self._parse_number(candidate)

        # Fallback: the closest non-ID number preceding the name.  Matches
        # arrive in ascending position, so the last one kept is the nearest.
        nearest_before = None
        for candidate in self._NUMBER_PATTERN.finditer(text, 0, name_start):
            if name_start - candidate.start() >= self._MAX_VALUE_DISTANCE:
                continue
            if not self.is_lab_id(text, {'start': candidate.start()}):
                nearest_before = candidate

        if nearest_before is None:
            return None
        return self._parse_number(nearest_before)

    def extract_all(self, raw_text: str) -> dict:
        """Extract every target test that has a value in ``raw_text``.

        Whitespace runs (including newlines) are collapsed to single spaces
        before searching.

        Returns:
            Dict mapping test code (a key of ``self.targets``) to its float
            value.  Tests with no value found are omitted.
        """
        clean_raw = " ".join(raw_text.split())
        extracted_data = {}
        for test_code, info in self.targets.items():
            value = self.find_number_nearby(clean_raw, info["name_en"])
            # Compare against None explicitly: 0.0 is a valid result and
            # must not be discarded as falsy.
            if value is not None:
                extracted_data[test_code] = value
        return extracted_data