# ICD10_Code_Prediction/src/utils/preprocessing.py
# Hosting-page residue: uploaded by Vaibhavi53, commit 25f8367 (verified),
# message "Uploading the required files", file size 1.2 kB.
"""
Text preprocessing utilities for clinical notes.
"""
import re
def clean_clinical_text(text: str) -> str:
    """Clean raw clinical text: remove artifacts, normalize whitespace.

    Args:
        text: Raw note text. Any non-string value (e.g. ``None`` or NaN) is
            treated as a missing note.

    Returns:
        The lower-cased, stripped note with ``x000D`` export artifacts and
        HTML tags replaced by spaces and runs of two or more whitespace
        characters collapsed to one space; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # The artifact normally arrives as the escaped carriage return "_x000D_".
    # Consume the optional surrounding underscores too, otherwise a stray
    # "_ _" token is left in the note.
    text = re.sub(r"_?x000D_?", " ", text)
    text = re.sub(r"[\r\n\t]+", " ", text)
    # Only treat "<" followed by a letter (or "</" + letter) as a tag, so
    # clinical comparisons such as "SpO2 <90% and HR >100" are not swallowed.
    text = re.sub(r"</?[A-Za-z][^>]*>", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.lower().strip()
def build_input_text(dept_code: str, age: int, sex: str, clean_text: str) -> str:
    """Prepend structured demographic metadata to the clinical note.

    Args:
        dept_code: Department identifier, inserted verbatim.
        age: Patient age; truncated to an integer with ``int()``. Values that
            cannot be converted (``None``, NaN, infinity, non-numeric strings)
            are rendered as ``unknown``.
        sex: Patient sex; lower-cased when it is a string, otherwise rendered
            as ``unknown``.
        clean_text: Already-cleaned note text, appended after the tags.

    Returns:
        A string of the form ``[DEPT:..] [AGE:..] [SEX:..] <note>``.
    """
    sex_label = sex.lower() if isinstance(sex, str) else "unknown"
    # Mirror the fallback used for `sex`: a missing age should not abort
    # the whole record.
    try:
        age_label = str(int(age))
    except (TypeError, ValueError, OverflowError):
        age_label = "unknown"
    return f"[DEPT:{dept_code}] [AGE:{age_label}] [SEX:{sex_label}] {clean_text}"
def get_icd_chapter(code: str) -> str:
    """Extract the chapter letter from an ICD-10 code.

    Args:
        code: ICD-10 code such as ``"J18.9"``. Surrounding whitespace is
            ignored.

    Returns:
        The upper-cased first character of the code, or ``"Unknown"`` when
        the code is missing, blank, or not a string (e.g. NaN).
    """
    # A NaN from a missing table cell is truthy but not subscriptable, so
    # check the type rather than just truthiness.
    if not isinstance(code, str):
        return "Unknown"
    stripped = code.strip()
    if not stripped:
        return "Unknown"
    return stripped[0].upper()
def get_icd_category(code: str) -> str:
    """Extract the 3-character category from an ICD-10 code (e.g., 'J18' from 'J18.9').

    Args:
        code: ICD-10 code, with or without the dot separator. Surrounding
            whitespace is ignored.

    Returns:
        The first three characters of the code with dots removed (fewer if
        the code is shorter), or ``"Unknown"`` when the code is missing, not
        a string, or contains nothing but dots/whitespace.
    """
    # A NaN from a missing table cell is truthy but has no `.replace`, so
    # check the type rather than just truthiness.
    if not isinstance(code, str):
        return "Unknown"
    clean = code.strip().replace(".", "")
    if not clean:
        return "Unknown"
    # Slicing already copes with codes shorter than three characters, and
    # returns the dot-free form in that case as well.
    return clean[:3]