""" Text preprocessing utilities for clinical notes. """ import re def clean_clinical_text(text: str) -> str: """Clean raw clinical text: remove artifacts, normalize whitespace.""" if not isinstance(text, str): return "" text = re.sub(r"x000D", " ", text) text = re.sub(r"[\r\n\t]+", " ", text) text = re.sub(r"<[^>]+>", " ", text) # strip HTML tags text = re.sub(r"\s{2,}", " ", text) text = text.lower().strip() return text def build_input_text(dept_code: str, age: int, sex: str, clean_text: str) -> str: """Prepend structured demographic metadata to the clinical note.""" sex_label = sex.lower() if isinstance(sex, str) else "unknown" return f"[DEPT:{dept_code}] [AGE:{int(age)}] [SEX:{sex_label}] {clean_text}" def get_icd_chapter(code: str) -> str: """Extract the chapter letter from an ICD-10 code.""" if not code: return "Unknown" return code[0].upper() def get_icd_category(code: str) -> str: """Extract the 3-character category from an ICD-10 code (e.g., 'J18' from 'J18.9').""" if not code: return "Unknown" clean = code.replace(".", "") return clean[:3] if len(clean) >= 3 else code