File size: 1,200 Bytes
25f8367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
Text preprocessing utilities for clinical notes.
"""
import re


def clean_clinical_text(text: str) -> str:
    """Clean raw clinical text: remove artifacts, normalize whitespace.

    Processing order:
      1. Replace the ``x000D`` carriage-return escape with a space. The
         escape is commonly seen as ``_x000D_`` in spreadsheet (OOXML)
         exports, so the surrounding underscores are removed as well.
      2. Replace runs of CR / LF / tab with a single space.
      3. Replace anything that looks like an HTML tag (``<...>``) with a space.
      4. Collapse runs of whitespace into one space.
      5. Lowercase and strip leading/trailing whitespace.

    Args:
        text: Raw note text. Any non-``str`` value (e.g. ``None`` or a NaN
            float) is treated as a missing note.

    Returns:
        The cleaned, lower-cased text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Underscores are optional so both "_x000D_" and a bare "x000D" are
    # removed; matching only the core would leave stray "_" characters behind.
    text = re.sub(r"_?x000D_?", " ", text)
    text = re.sub(r"[\r\n\t]+", " ", text)
    text = re.sub(r"<[^>]+>", " ", text)  # strip HTML tags
    # Earlier substitutions can leave adjacent spaces; collapse them last.
    text = re.sub(r"\s{2,}", " ", text)
    return text.lower().strip()


def build_input_text(dept_code: str, age: int, sex: str, clean_text: str) -> str:
    """Prepend structured demographic metadata to the clinical note.

    The result has the form
    ``[DEPT:<dept_code>] [AGE:<age>] [SEX:<sex>] <clean_text>``.

    Args:
        dept_code: Department code, interpolated as-is.
        age: Patient age. Converted with ``int()`` (so floats are truncated
            and numeric strings are accepted). Values that cannot be
            converted (``None``, NaN, infinity, non-numeric text) are
            rendered as ``unknown``.
        sex: Patient sex. Lower-cased when it is a string; any non-string
            value is rendered as ``unknown``.
        clean_text: The note text, appended unchanged after the tags.

    Returns:
        The note text prefixed with the three metadata tags.
    """
    sex_label = sex.lower() if isinstance(sex, str) else "unknown"
    # Mirror the missing-value handling used for ``sex``: a missing or
    # malformed age should not abort the whole record.
    try:
        age_label = str(int(age))
    except (TypeError, ValueError, OverflowError):
        age_label = "unknown"
    return f"[DEPT:{dept_code}] [AGE:{age_label}] [SEX:{sex_label}] {clean_text}"


def get_icd_chapter(code: str) -> str:
    """Extract the chapter letter from an ICD-10 code.

    The chapter letter is taken to be the first non-whitespace character of
    the code, upper-cased (e.g. ``'J'`` from ``'j18.9'``).

    Args:
        code: ICD-10 code. Non-string values (``None``, NaN floats, ...) and
            empty or whitespace-only strings are treated as missing.

    Returns:
        The upper-cased first character, or ``"Unknown"`` when the code is
        missing.
    """
    # A truthiness check alone lets NaN floats through (NaN is truthy) and
    # then fails on subscripting, so test the type explicitly.
    if not isinstance(code, str):
        return "Unknown"
    stripped = code.strip()
    if not stripped:
        return "Unknown"
    return stripped[0].upper()


def get_icd_category(code: str) -> str:
    """Extract the 3-character category from an ICD-10 code (e.g., 'J18' from 'J18.9').

    The code is stripped of surrounding whitespace and upper-cased, so the
    first character of the result matches what ``get_icd_chapter`` returns
    for the same code. Dots are ignored when picking the first three
    characters.

    Args:
        code: ICD-10 code, with or without the dot separator. Non-string
            values (``None``, NaN floats, ...) and empty or whitespace-only
            strings are treated as missing.

    Returns:
        The three-character category; the normalized code itself (dots kept)
        when it has fewer than three characters once dots are removed; or
        ``"Unknown"`` when the code is missing.
    """
    # NaN floats are truthy, so a plain ``if not code`` would let them reach
    # ``.replace`` and raise; check the type explicitly.
    if not isinstance(code, str):
        return "Unknown"
    normalized = code.strip().upper()
    if not normalized:
        return "Unknown"
    compact = normalized.replace(".", "")
    # Too short to hold a full category: hand back the code as given
    # (normalized) rather than a truncated fragment.
    return compact[:3] if len(compact) >= 3 else normalized