"""
Text preprocessing utilities for clinical notes.
"""
import re
def clean_clinical_text(text: str) -> str:
    """Clean raw clinical text: remove artifacts, normalize whitespace.

    Steps, in order: replace carriage-return escape artifacts
    (``_x000D_`` or a bare ``x000D``) with a space, replace runs of
    CR/LF/tab with a space, strip HTML tags, collapse runs of whitespace,
    then lowercase and trim.

    Args:
        text: Raw note text. Any non-string value (e.g. ``None``) is
            treated as missing.

    Returns:
        The cleaned, lowercased text, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    # The escape is normally written ``_x000D_``; consume the surrounding
    # underscores as well so they are not left behind as stray tokens.
    # A bare ``x000D`` is still accepted for backward compatibility.
    text = re.sub(r"_?x000D_?", " ", text)
    text = re.sub(r"[\r\n\t]+", " ", text)
    # Only strip spans shaped like markup (``<tag ...>``, ``</tag>``,
    # ``<!...>``). A looser ``<[^>]+>`` would also delete clinical
    # comparisons such as "BP <120 and HR >60".
    text = re.sub(r"<(?:/?[A-Za-z]|!)[^>]*>", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.lower().strip()
def build_input_text(dept_code: str, age: int, sex: str, clean_text: str) -> str:
    """Prefix a cleaned clinical note with bracketed demographic tags.

    Args:
        dept_code: Department identifier, inserted verbatim.
        age: Patient age; converted with ``int()`` before formatting.
        sex: Patient sex; lowercased when it is a string, otherwise the
            tag value is ``"unknown"``.
        clean_text: The note text appended after the tags.

    Returns:
        ``"[DEPT:...] [AGE:...] [SEX:...] <clean_text>"``.
    """
    if isinstance(sex, str):
        sex_tag = sex.lower()
    else:
        # Non-string values (e.g. None) get a fixed placeholder.
        sex_tag = "unknown"
    return f"[DEPT:{dept_code}] [AGE:{int(age)}] [SEX:{sex_tag}] {clean_text}"
def get_icd_chapter(code: str) -> str:
    """Extract the chapter letter from an ICD-10 code.

    The chapter letter is taken to be the first non-whitespace character
    of the code, uppercased.

    Args:
        code: ICD-10 code such as ``"J18.9"``. Surrounding whitespace is
            ignored.

    Returns:
        The uppercased leading character, or ``"Unknown"`` when *code* is
        empty, blank, or not a string (e.g. ``None`` or NaN).
    """
    # Non-string placeholders (None, float NaN) are truthy or unindexable;
    # route them to the same fallback as an empty code instead of raising.
    if not isinstance(code, str):
        return "Unknown"
    trimmed = code.strip()
    if not trimmed:
        return "Unknown"
    return trimmed[0].upper()
def get_icd_category(code: str) -> str:
    """Extract the 3-character category from an ICD-10 code (e.g., 'J18' from 'J18.9').

    The code is trimmed and uppercased first, so the category always
    begins with the same letter that ``code.strip()[0].upper()`` yields.

    Args:
        code: ICD-10 code, with or without the dot separator.

    Returns:
        The first three characters of the dot-free, uppercased code. When
        fewer than three characters remain, the trimmed, uppercased code
        is returned as given (dots kept). ``"Unknown"`` is returned when
        *code* is empty, blank, or not a string.
    """
    # Non-string placeholders (None, float NaN) fall back like an empty code.
    if not isinstance(code, str):
        return "Unknown"
    normalized = code.strip().upper()
    if not normalized:
        return "Unknown"
    compact = normalized.replace(".", "")
    # Too short to hold a full category: hand back the normalized input.
    return compact[:3] if len(compact) >= 3 else normalized
|