File size: 4,943 Bytes
dc4feed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import re

class LabDataExtractor:
    """Extract numeric lab-test results from free-form report text.

    ``targets`` maps an internal test code to a dict holding the English
    aliases to search for (``name_en``, matched case-insensitively as whole
    words) and a Persian display name (``name_fa``).
    """

    # Integer or decimal token, with "." or "," accepted as the separator.
    # The first look-behind skips values written directly after "<" or ">";
    # the second stops the fractional tail of such a value (the "5" in
    # "<0.5") from being picked up as a number of its own.
    _NUMBER_RE = re.compile(r'(?<![<>])(?<!\d[.,])\b\d+[.,]?\d*\b')

    # Words that mark a nearby number as an identifier or timestamp rather
    # than a result.  Anchored at the start of a word so that "id" inside
    # "Triglycerides" or "no" inside "Eosinophils" no longer counts, while
    # "Reference", "Registration" or "Laboratory" still do.
    _ID_KEYWORD_RE = re.compile(
        r'\b(?:no|id|ref|reg|scl|lab|time|collection|admission)',
        re.IGNORECASE,
    )

    # How many characters before a number are inspected for an ID keyword.
    _ID_CONTEXT_CHARS = 20

    # "Age: 45" style first, then "45 Years" style.  The word boundary keeps
    # "Page 3" or "Dosage 50" from being read as an age.
    _AGE_PATTERNS = (
        re.compile(r'\bAge[:\s]*(\d+)', re.IGNORECASE),
        re.compile(r'(\d+)\s*Years', re.IGNORECASE),
    )

    def __init__(self):
        self.targets = {
            "AST": {"name_en": ["AST", "GOT"], "name_fa": "آنزیم کبدی AST"},
            "ALAT": {"name_en": ["ALAT", "GPT", "ALT"], "name_fa": "آنزیم کبدی ALAT"},
            "BILIRUBIN": {"name_en": ["Bilirubin", "T.Bil"], "name_fa": "بیلی‌روبین کل"},
            "WBC": {"name_en": ["WBC"], "name_fa": "گلبول سفید"},
            "RBC": {"name_en": ["RBC"], "name_fa": "گلبول قرمز"},
            "HGB": {
                "name_en": ["HGB", "Hemoglobin", "Hb", "Haemoglobin"],
                "name_fa": "هموگلوبین"
            },
            "HCT": {"name_en": ["HCT", "Hematocrit"], "name_fa": "هماتوکریت"},
            "PLT": {"name_en": ["PLT", "Platelets"], "name_fa": "پلاکت"},
            "MCV": {"name_en": ["MCV"], "name_fa": "حجم گلبول (MCV)"},
            "MCH": {"name_en": ["MCH"], "name_fa": "هموگلوبین گلبول (MCH)"},
            "MCHC": {"name_en": ["MCHC"], "name_fa": "غلظت هموگلوبین (MCHC)"},
            "FBS": {"name_en": ["FBS", "Glucose"], "name_fa": "قند ناشتا"},
            "CHOL": {"name_en": ["Cholesterol", "CHOL"], "name_fa": "کلسترول کل"},
            "HDL": {"name_en": ["HDL"], "name_fa": "کلسترول خوب"},
            "LDL": {"name_en": ["LDL"], "name_fa": "کلسترول بد"},
            "TG": {"name_en": ["Triglycerides", "TG"], "name_fa": "تری‌گلیسرید"},
            "UREA": {"name_en": ["Urea", "BUN"], "name_fa": "اوره خون"},
            "CREAT": {"name_en": ["Creatinine", "Crea"], "name_fa": "کراتینین"},
            "ALP": {"name_en": ["Alkaline Phosphatase", "ALP"], "name_fa": "فسفاتاز قلیایی"},
            "TSH": {"name_en": ["TSH"], "name_fa": "هورمون تیروئید"},
            "IRON": {"name_en": ["Iron", "Serum Iron"], "name_fa": "آهن"},
            "FERRITIN": {"name_en": ["Ferritin"], "name_fa": "فریتین"}
        }

    def get_patient_age(self, raw_text: str) -> "int | None":
        """Return the patient's age found in *raw_text*, or ``None``.

        Looks for ``Age`` followed by digits first, then for digits followed
        by ``Years``; both searches are case-insensitive.
        """
        if not raw_text:
            return None
        for pattern in self._AGE_PATTERNS:
            found = pattern.search(raw_text)
            if found:
                return int(found.group(1))
        return None

    def is_lab_id(self, text, num_obj):
        """Return ``True`` if the number looks like an ID/timestamp, not a result.

        *num_obj* is a dict whose ``'start'`` key is the number's offset in
        *text*.  The number is treated as an identifier when one of the ID
        keywords begins a word inside the 20 characters preceding it.
        """
        number_start = num_obj['start']
        window_start = max(0, number_start - self._ID_CONTEXT_CHARS)
        # Searching the full string with pos/endpos (instead of a slice) lets
        # the word-boundary test see the character just before the window.
        # NOTE(review): the window can still reach a keyword that belongs to
        # an earlier number (e.g. "Lab No: 88412 HGB 13.5"); unchanged here.
        hit = self._ID_KEYWORD_RE.search(text, window_start, number_start)
        return hit is not None

    def find_number_nearby(self, text, keyword, max_distance=300):
        """Return the result value that belongs to a test name, or ``None``.

        *keyword* is a list of aliases (a single string is accepted too); the
        first alias that occurs in *text* as a whole word, case-insensitively,
        is used, at its first occurrence.  The nearest number after the name
        is returned; only when no number follows within *max_distance*
        characters is the nearest preceding number considered.  Numbers
        flagged by ``is_lab_id`` are ignored.  The value is a ``float``; a
        decimal comma is read as a decimal point.
        """
        if not text:
            return None
        aliases = [keyword] if isinstance(keyword, str) else keyword

        name_match = None
        for alias in aliases:
            # \b keeps short aliases from matching inside longer words
            # (this was the fix for the "ALFALAH" false positive).
            name_match = re.search(
                r'\b' + re.escape(alias) + r'\b', text, re.IGNORECASE
            )
            if name_match:
                break
        if name_match is None:
            return None
        name_start, name_end = name_match.span()

        after = None   # (gap, value) of the closest number following the name
        before = None  # (gap, value) of the closest number preceding the name
        for number in self._NUMBER_RE.finditer(text):
            if self.is_lab_id(text, {'start': number.start()}):
                continue
            value = float(number.group().replace(',', '.'))
            if number.start() >= name_end:
                if after is None:
                    # Matches arrive in text order: the first one is nearest.
                    after = (number.start() - name_end, value)
            elif number.end() <= name_start:
                # Each later preceding number is closer than the previous.
                before = (name_start - number.end(), value)

        # Prefer the value written after the name: measuring start-to-start
        # distance let a short number in front of a long name win.
        for candidate in (after, before):
            if candidate is not None and candidate[0] < max_distance:
                return candidate[1]
        return None

    def extract_all(self, raw_text: str) -> dict:
        """Return ``{test_code: value}`` for every target found in *raw_text*.

        Whitespace (including newlines) is collapsed to single spaces before
        searching.  Tests with no usable number are omitted; a measured value
        of ``0.0`` is kept.
        """
        if not raw_text:
            return {}
        clean_raw = " ".join(raw_text.split())
        extracted_data = {}
        for test_code, info in self.targets.items():
            value = self.find_number_nearby(clean_raw, info["name_en"])
            # Compare against None so a genuine zero result is not dropped.
            if value is not None:
                extracted_data[test_code] = value
        return extracted_data