File size: 4,943 Bytes
dc4feed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | import re
from typing import Dict, List, Optional


class LabDataExtractor:
    """Extract numeric lab-test results and the patient's age from report text.

    Every supported test is listed in ``self.targets`` under a short code,
    together with its English aliases (``name_en``) and a Persian display
    name (``name_fa``).  Extraction is heuristic: a test name is located as a
    whole word, and the number printed right after it is taken as its value.
    """

    # A numeric token: digits with an optional "." or "," decimal part, not
    # directly preceded by "<" or ">" (so "<5" style limits are ignored).
    # NOTE(review): "," is always read as a decimal separator, so "250,000"
    # becomes 250.0 - confirm that reports never use thousands separators.
    _NUMBER_RE = re.compile(r'(?<![<>])\b\d+[.,]?\d*\b')

    # Two-letter ID markers must stand alone as a word.  As raw substrings
    # they also fire inside ordinary words ("id" in "Triglycerides", "no" in
    # "Normal"), which made real results look like lab IDs.
    _SHORT_ID_RE = re.compile(r'(?<![a-z])(?:no|id)(?![a-z])')

    # Longer markers keep matching anywhere in the (lower-cased) context.
    _LONG_ID_KEYWORDS = (
        "ref", "reg", "scl", "lab", "time", "collection", "admission",
    )

    # How many characters before a number are inspected for ID markers.
    _ID_CONTEXT_CHARS = 20

    # A number whose start is this far (or farther) from the start of the
    # test name is never accepted as that test's value.
    _MAX_VALUE_DISTANCE = 300

    # "Age" must not be the tail of a longer word ("Page 3", "Dosage: 50").
    _AGE_PATTERNS = (
        re.compile(r'(?<![A-Za-z])Age[:\s]*(\d+)', re.IGNORECASE),
        re.compile(r'(\d+)\s*Years', re.IGNORECASE),
    )

    def __init__(self):
        # Test code -> English aliases (tried in list order) and Persian name.
        self.targets = {
            "AST": {"name_en": ["AST", "GOT"], "name_fa": "آنزیم کبدی AST"},
            "ALAT": {"name_en": ["ALAT", "GPT", "ALT"], "name_fa": "آنزیم کبدی ALAT"},
            "BILIRUBIN": {"name_en": ["Bilirubin", "T.Bil"], "name_fa": "بیلیروبین کل"},
            "WBC": {"name_en": ["WBC"], "name_fa": "گلبول سفید"},
            "RBC": {"name_en": ["RBC"], "name_fa": "گلبول قرمز"},
            "HGB": {
                "name_en": ["HGB", "Hemoglobin", "Hb", "Haemoglobin"],
                "name_fa": "هموگلوبین"
            },
            "HCT": {"name_en": ["HCT", "Hematocrit"], "name_fa": "هماتوکریت"},
            "PLT": {"name_en": ["PLT", "Platelets"], "name_fa": "پلاکت"},
            "MCV": {"name_en": ["MCV"], "name_fa": "حجم گلبول (MCV)"},
            "MCH": {"name_en": ["MCH"], "name_fa": "هموگلوبین گلبول (MCH)"},
            "MCHC": {"name_en": ["MCHC"], "name_fa": "غلظت هموگلوبین (MCHC)"},
            "FBS": {"name_en": ["FBS", "Glucose"], "name_fa": "قند ناشتا"},
            "CHOL": {"name_en": ["Cholesterol", "CHOL"], "name_fa": "کلسترول کل"},
            "HDL": {"name_en": ["HDL"], "name_fa": "کلسترول خوب"},
            "LDL": {"name_en": ["LDL"], "name_fa": "کلسترول بد"},
            "TG": {"name_en": ["Triglycerides", "TG"], "name_fa": "تریگلیسرید"},
            "UREA": {"name_en": ["Urea", "BUN"], "name_fa": "اوره خون"},
            "CREAT": {"name_en": ["Creatinine", "Crea"], "name_fa": "کراتینین"},
            "ALP": {"name_en": ["Alkaline Phosphatase", "ALP"], "name_fa": "فسفاتاز قلیایی"},
            "TSH": {"name_en": ["TSH"], "name_fa": "هورمون تیروئید"},
            "IRON": {"name_en": ["Iron", "Serum Iron"], "name_fa": "آهن"},
            "FERRITIN": {"name_en": ["Ferritin"], "name_fa": "فریتین"}
        }

    def get_patient_age(self, raw_text: str) -> Optional[int]:
        """Find the patient's age in the report text.

        Looks for ``Age`` followed by digits first, then for digits followed
        by ``Years`` (both case-insensitive).

        Args:
            raw_text: Report text; may be empty or ``None``.

        Returns:
            The age as an ``int``, or ``None`` when neither pattern matches.
        """
        if not raw_text:
            return None
        for pattern in self._AGE_PATTERNS:
            age_match = pattern.search(raw_text)
            if age_match:
                return int(age_match.group(1))
        return None

    def is_lab_id(self, text, num_obj):
        """Tell whether a number looks like an ID/reference rather than a result.

        The characters immediately before the number (up to 20) are searched
        for ID markers.  ``no`` and ``id`` only count as standalone words;
        the longer markers count wherever they occur.

        Args:
            text: The text the number was found in.
            num_obj: Mapping with a ``'start'`` key holding the number's index
                in ``text``.

        Returns:
            ``True`` if an ID marker precedes the number, otherwise ``False``.
        """
        start_index = num_obj['start']
        context_start = max(0, start_index - self._ID_CONTEXT_CHARS)
        context_text = text[context_start:start_index].lower()
        if self._SHORT_ID_RE.search(context_text):
            return True
        return any(keyword in context_text for keyword in self._LONG_ID_KEYWORDS)

    def find_number_nearby(self, text, keyword):
        """Return the value printed next to a test name.

        The first alias that occurs in ``text`` as a whole word locates the
        test.  The first number after the name is preferred, so that the
        previous test's result is not picked up; the closest number before
        the name is used only when no following number is in range.

        Args:
            text: Text to search.
            keyword: Iterable of alias strings, tried in order.  A single
                string is also accepted.

        Returns:
            The value as a ``float``, or ``None`` when no alias is found or
            no usable number lies within range of it.
        """
        name_match = self._find_test_name(text, keyword)
        if name_match is None:
            return None
        return self._nearest_value(name_match, self._collect_numbers(text))

    def extract_all(self, raw_text: str) -> Dict[str, float]:
        """Extract every known test that appears in the report.

        Args:
            raw_text: Report text; may be empty or ``None``.

        Returns:
            A dict mapping test codes (keys of ``self.targets``) to values.
            Tests that are absent, or that have no number nearby, are left
            out.  A result of ``0`` is kept.
        """
        if not raw_text:
            return {}
        # Collapse all whitespace runs so multi-word aliases match across
        # line breaks and distances are not inflated by padding.
        clean_raw = " ".join(raw_text.split())
        # The numbers do not depend on the test, so scan for them only once.
        numbers = self._collect_numbers(clean_raw)
        extracted_data = {}
        for test_code, info in self.targets.items():
            name_match = self._find_test_name(clean_raw, info["name_en"])
            if name_match is None:
                continue
            value = self._nearest_value(name_match, numbers)
            # Compare with None explicitly: 0.0 is a valid result.
            if value is not None:
                extracted_data[test_code] = value
        return extracted_data

    def _find_test_name(self, text, keyword):
        """Return the match for the first alias present in ``text``, else ``None``."""
        names = [keyword] if isinstance(keyword, str) else keyword
        for name in names:
            # Whole-word match so an alias is not found inside a longer word
            # (the original note cites the "ALFALAH" case).
            pattern = r'\b' + re.escape(name) + r'\b'
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match
        return None

    def _collect_numbers(self, text) -> List[dict]:
        """List the numbers in ``text`` that are not lab IDs, in text order."""
        numbers = []
        for match in self._NUMBER_RE.finditer(text):
            candidate = {
                'value': float(match.group().replace(',', '.')),
                'start': match.start()
            }
            if not self.is_lab_id(text, candidate):
                numbers.append(candidate)
        return numbers

    def _nearest_value(self, name_match, numbers) -> Optional[float]:
        """Pick the value belonging to ``name_match`` from ordered ``numbers``."""
        name_start = name_match.start()
        name_end = name_match.end()

        # Preferred: the first number after the test name.
        for num_obj in numbers:
            if num_obj['start'] >= name_end:
                if num_obj['start'] - name_start < self._MAX_VALUE_DISTANCE:
                    return num_obj['value']
                break

        # Fallback: the last number that starts before the test name.
        preceding = [n for n in numbers if n['start'] < name_start]
        if preceding:
            closest = preceding[-1]
            if name_start - closest['start'] < self._MAX_VALUE_DISTANCE:
                return closest['value']
        return None