| """ |
| Text preprocessing utilities for clinical notes. |
| """ |
| import re |
|
|
|
|
def clean_clinical_text(text: str) -> str:
    """Clean raw clinical text: remove artifacts, normalize whitespace.

    Processing order:
      1. Replace spreadsheet carriage-return escapes (``_x000D_``, also the
         bare ``x000D`` form, any letter case) with a space.
      2. Replace anything that looks like a markup tag (``<...>``) with a space.
      3. Collapse every run of whitespace (including tabs, newlines and
         non-breaking spaces) into a single space.
      4. Lowercase and strip leading/trailing whitespace.

    Args:
        text: Raw note text. Any non-string value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        The cleaned, lowercased text, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    # The escape is written as "_x000D_"; the underscores are optional in the
    # pattern so they are consumed when present instead of being left behind
    # as stray "_ _" tokens.
    text = re.sub(r"_?x000D_?", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # ``\s+`` also normalizes single non-space whitespace characters.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()
|
|
|
|
def build_input_text(dept_code: str, age: int, sex: str, clean_text: str) -> str:
    """Prepend structured demographic metadata to the clinical note.

    Args:
        dept_code: Department code, inserted verbatim.
        age: Patient age; truncated to an integer via ``int()``. Missing or
            non-numeric values (``None``, NaN, infinity, unparsable strings)
            are rendered as ``"unknown"``.
        sex: Patient sex; lowercased when it is a string, otherwise rendered
            as ``"unknown"``.
        clean_text: Already-cleaned note text, appended after the tags.

    Returns:
        ``"[DEPT:<dept>] [AGE:<age>] [SEX:<sex>] <clean_text>"``.
    """
    sex_label = sex.lower() if isinstance(sex, str) else "unknown"
    # Treat an unusable age like a missing sex rather than raising:
    # int(None) -> TypeError, int(nan) / int("abc") -> ValueError,
    # int(inf) -> OverflowError.
    try:
        age_label = str(int(age))
    except (TypeError, ValueError, OverflowError):
        age_label = "unknown"
    return f"[DEPT:{dept_code}] [AGE:{age_label}] [SEX:{sex_label}] {clean_text}"
|
|
|
|
def get_icd_chapter(code: str) -> str:
    """Extract the chapter letter from an ICD-10 code.

    The result is the first non-whitespace character of *code*, uppercased.
    The character is not validated as a letter.

    Args:
        code: ICD-10 code such as ``"J18.9"``.

    Returns:
        The uppercased leading character, or ``"Unknown"`` when *code* is not
        a string or is empty/whitespace-only.
    """
    # Non-string values (None, NaN, ...) are treated as missing instead of
    # failing on indexing.
    if not isinstance(code, str):
        return "Unknown"
    trimmed = code.strip()
    if not trimmed:
        return "Unknown"
    return trimmed[0].upper()
|
|
|
|
def get_icd_category(code: str) -> str:
    """Extract the 3-character category from an ICD-10 code (e.g., 'J18' from 'J18.9').

    Surrounding whitespace and all dots are removed and the code is
    uppercased before the first three characters are taken. Codes shorter
    than three characters after normalization are returned in that
    normalized form.

    Args:
        code: ICD-10 code such as ``"J18.9"``.

    Returns:
        Up to three uppercased characters, or ``"Unknown"`` when *code* is not
        a string or contains nothing besides whitespace and dots.
    """
    if not isinstance(code, str):
        return "Unknown"
    normalized = code.strip().replace(".", "").upper()
    if not normalized:
        return "Unknown"
    # Slicing is safe for short inputs and never leaks the un-normalized code.
    return normalized[:3]
|
|