""" data_loader.py — CognitivePulse Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279) and exposes a clean, preprocessed DataFrame for downstream modelling. The dataset covers 2,149 patients and 33 features spanning: - Demographics: Age, Gender, Ethnicity, EducationLevel - Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality - Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes, Depression, HeadInjury, Hypertension - Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal, CholesterolLDL, CholesterolHDL, CholesterolTriglycerides - Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints, BehavioralProblems, ADL - Symptoms: Confusion, Disorientation, PersonalityChanges, DifficultyCompletingTasks, Forgetfulness - Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's) Download strategy (tried in order): 1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json) 2. Local file at data/alzheimers.csv (for pre-downloaded environments) 3. Synthetic fallback — statistically matched to published feature distributions from the Kaggle dataset description; clearly flagged in the UI and README. Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279 """ from __future__ import annotations import os import json import hashlib from pathlib import Path import numpy as np import pandas as pd # Feature metadata: used for UI labels, binning, and intervention logic. 
# One row per model feature: (column name, UI label, value type, modifiable?).
# Row order is significant: it fixes the column order exposed by FEATURE_COLS.
_FEATURE_ROWS = (
    # Demographics
    ("Age", "Age (years)", "continuous", False),
    ("Gender", "Gender", "binary", False),
    ("Ethnicity", "Ethnicity", "categorical", False),
    ("EducationLevel", "Education Level", "ordinal", False),
    # Lifestyle
    ("BMI", "BMI", "continuous", True),
    ("Smoking", "Smoking", "binary", True),
    ("AlcoholConsumption", "Alcohol (units/week)", "continuous", True),
    ("PhysicalActivity", "Physical Activity (hrs/wk)", "continuous", True),
    ("DietQuality", "Diet Quality Score", "continuous", True),
    ("SleepQuality", "Sleep Quality Score", "continuous", True),
    # Medical history
    ("FamilyHistoryAlzheimers", "Family History of Alzheimer's", "binary", False),
    ("CardiovascularDisease", "Cardiovascular Disease", "binary", True),
    ("Diabetes", "Diabetes", "binary", True),
    ("Depression", "Depression", "binary", True),
    ("HeadInjury", "History of Head Injury", "binary", False),
    ("Hypertension", "Hypertension", "binary", True),
    # Clinical measurements
    ("SystolicBP", "Systolic BP (mmHg)", "continuous", True),
    ("DiastolicBP", "Diastolic BP (mmHg)", "continuous", True),
    ("CholesterolTotal", "Total Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolLDL", "LDL Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolHDL", "HDL Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolTriglycerides", "Triglycerides (mg/dL)", "continuous", True),
    # Cognitive assessments
    ("MMSE", "MMSE Score", "continuous", False),
    ("FunctionalAssessment", "Functional Assessment", "continuous", False),
    ("MemoryComplaints", "Memory Complaints", "binary", False),
    ("BehavioralProblems", "Behavioral Problems", "binary", False),
    ("ADL", "Activities of Daily Living", "continuous", False),
    # Symptoms
    ("Confusion", "Confusion", "binary", False),
    ("Disorientation", "Disorientation", "binary", False),
    ("PersonalityChanges", "Personality Changes", "binary", False),
    ("DifficultyCompletingTasks", "Difficulty Completing Tasks", "binary", False),
    ("Forgetfulness", "Forgetfulness", "binary", False),
)

# Column name -> {"label", "type", "modifiable"}, expanded from the rows above.
FEATURE_META = {
    name: {"label": label, "type": kind, "modifiable": modifiable}
    for name, label, kind, modifiable in _FEATURE_ROWS
}

# Ordered list of feature column names (same order as the rows above).
FEATURE_COLS = list(FEATURE_META)

# Name of the label column.
TARGET_COL = "Diagnosis"

# Published reference ranges / population norms (approximate midpoints from
# dataset description and AD prevention literature); used for dashboard banding.
REFERENCE_RANGES = {
    "BMI": {"optimal": (18.5, 25), "caution": (25, 30), "flag": (30, 40)},
    "PhysicalActivity": {"optimal": (5, 10), "caution": (2, 5), "flag": (0, 2)},
    "DietQuality": {"optimal": (7, 10), "caution": (4, 7), "flag": (0, 4)},
    "SleepQuality": {"optimal": (7, 10), "caution": (5, 7), "flag": (4, 5)},
    "SystolicBP": {"optimal": (90, 120), "caution": (120, 140), "flag": (140, 180)},
    "DiastolicBP": {"optimal": (60, 80), "caution": (80, 90), "flag": (90, 120)},
    "CholesterolLDL": {"optimal": (0, 100), "caution": (100, 160), "flag": (160, 300)},
    "CholesterolHDL": {"optimal": (60, 300), "caution": (40, 60), "flag": (0, 40)},
    "CholesterolTriglycerides": {"optimal": (0, 150), "caution": (150, 200), "flag": (200, 500)},
    "MMSE": {"optimal": (24, 30), "caution": (18, 24), "flag": (0, 18)},
    "AlcoholConsumption": {"optimal": (0, 7), "caution": (7, 14), "flag": (14, 20)},
}

# Location of the cached / pre-downloaded CSV, resolved relative to this module.
DATA_PATH = Path(__file__).parent / "data" / "alzheimers.csv"

# Default seed for the synthetic fallback so repeated runs produce the same frame.
SYNTHETIC_SEED = 42


def _generate_synthetic(
    n: int = 500,
    seed: int = SYNTHETIC_SEED,
    pos_rate: float = 0.354,
) -> pd.DataFrame:
    """
    Generates a synthetic dataset that matches the approximate feature
    distributions described in El Kharoua (2024). Used as a fallback when the
    Kaggle dataset is not available. Clearly flagged as synthetic in the UI
    and README.

    Args:
        n: Total number of rows to generate.
        seed: Seed for both the NumPy generator and the final row shuffle, so
            the output is fully determined by the arguments.
        pos_rate: Fraction of rows labelled Diagnosis == 1. The default
            (~35.4%) matches the published class balance. The positive count
            is ``int(n * pos_rate)``, i.e. truncated.

    Returns:
        A shuffled DataFrame with the feature columns plus ``Diagnosis``.

    Raises:
        ValueError: If ``n`` is negative or ``pos_rate`` is outside [0, 1].
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if not 0.0 <= pos_rate <= 1.0:
        raise ValueError(f"pos_rate must be within [0, 1], got {pos_rate}")

    rng = np.random.default_rng(seed)
    n_pos = int(n * pos_rate)
    n_neg = n - n_pos

    def sample(n_samples, pos):
        # NOTE: the order of the rng draws below determines the generated
        # values for a given seed; do not reorder them.
        age = rng.integers(60, 91, n_samples)
        gender = rng.integers(0, 2, n_samples)
        ethnicity = rng.choice([0, 1, 2, 3], n_samples, p=[0.65, 0.15, 0.12, 0.08])
        edu = rng.choice([0, 1, 2, 3], n_samples, p=[0.10, 0.30, 0.40, 0.20])
        bmi_mu = 28.5 if pos else 27.2
        bmi = rng.normal(bmi_mu, 4.5, n_samples).clip(15, 40)
        smoking = rng.binomial(1, 0.35 if pos else 0.20, n_samples)
        alcohol = rng.uniform(0, 20, n_samples)
        pa_mu = 3.5 if pos else 5.5
        pa = rng.normal(pa_mu, 2, n_samples).clip(0, 10)
        diet_mu = 5.2 if pos else 6.8
        diet = rng.normal(diet_mu, 1.8, n_samples).clip(0, 10)
        sleep_mu = 5.8 if pos else 7.2
        sleep = rng.normal(sleep_mu, 1.5, n_samples).clip(4, 10)
        fam = rng.binomial(1, 0.55 if pos else 0.25, n_samples)
        cvd = rng.binomial(1, 0.42 if pos else 0.22, n_samples)
        diab = rng.binomial(1, 0.38 if pos else 0.20, n_samples)
        dep = rng.binomial(1, 0.45 if pos else 0.20, n_samples)
        head = rng.binomial(1, 0.30 if pos else 0.15, n_samples)
        htn = rng.binomial(1, 0.52 if pos else 0.30, n_samples)
        sbp_mu = 145 if pos else 128
        sbp = rng.normal(sbp_mu, 18, n_samples).clip(90, 180)
        dbp = rng.normal(82 if pos else 75, 12, n_samples).clip(60, 120)
        chol_t = rng.normal(220 if pos else 200, 35, n_samples).clip(150, 300)
        chol_ldl = rng.normal(145 if pos else 115, 28, n_samples).clip(50, 300)
        chol_hdl = rng.normal(48 if pos else 58, 12, n_samples).clip(20, 100)
        chol_trig = rng.normal(175 if pos else 140, 45, n_samples).clip(50, 500)
        mmse = rng.normal(20 if pos else 27, 4, n_samples).clip(0, 30)
        fa = rng.normal(6 if pos else 8, 2, n_samples).clip(0, 10)
        mc = rng.binomial(1, 0.70 if pos else 0.25, n_samples)
        bp = rng.binomial(1, 0.55 if pos else 0.15, n_samples)
        adl = rng.normal(5.5 if pos else 8, 2, n_samples).clip(0, 10)
        conf = rng.binomial(1, 0.60 if pos else 0.15, n_samples)
        dis = rng.binomial(1, 0.55 if pos else 0.10, n_samples)
        pc = rng.binomial(1, 0.50 if pos else 0.12, n_samples)
        dct = rng.binomial(1, 0.65 if pos else 0.18, n_samples)
        forget = rng.binomial(1, 0.75 if pos else 0.30, n_samples)
        diag = np.ones(n_samples, dtype=int) if pos else np.zeros(n_samples, dtype=int)
        return pd.DataFrame({
            "Age": age,
            "Gender": gender,
            "Ethnicity": ethnicity,
            "EducationLevel": edu,
            "BMI": bmi.round(1),
            "Smoking": smoking,
            "AlcoholConsumption": alcohol.round(1),
            "PhysicalActivity": pa.round(1),
            "DietQuality": diet.round(1),
            "SleepQuality": sleep.round(1),
            "FamilyHistoryAlzheimers": fam,
            "CardiovascularDisease": cvd,
            "Diabetes": diab,
            "Depression": dep,
            "HeadInjury": head,
            "Hypertension": htn,
            "SystolicBP": sbp.round(0).astype(int),
            "DiastolicBP": dbp.round(0).astype(int),
            "CholesterolTotal": chol_t.round(0).astype(int),
            "CholesterolLDL": chol_ldl.round(0).astype(int),
            "CholesterolHDL": chol_hdl.round(0).astype(int),
            "CholesterolTriglycerides": chol_trig.round(0).astype(int),
            "MMSE": mmse.round(1),
            "FunctionalAssessment": fa.round(1),
            "MemoryComplaints": mc,
            "BehavioralProblems": bp,
            "ADL": adl.round(1),
            "Confusion": conf,
            "Disorientation": dis,
            "PersonalityChanges": pc,
            "DifficultyCompletingTasks": dct,
            "Forgetfulness": forget,
            "Diagnosis": diag,
        })

    # Negatives are drawn first, then positives; the shuffle interleaves them.
    df = pd.concat([sample(n_neg, False), sample(n_pos, True)], ignore_index=True)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    return df


def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
    """
    Returns (dataframe, source_label).
    source_label is one of: "kaggle", "local_file", "synthetic"

    Sources are tried in order: kagglehub download, the CSV at DATA_PATH, then
    the synthetic generator. Any failure of the Kaggle step is logged and the
    next source is tried. Errors while reading an existing local file are not
    suppressed.

    Args:
        allow_synthetic: When False, raise instead of returning synthetic data.

    Raises:
        FileNotFoundError: If neither Kaggle nor the local file is available
            and ``allow_synthetic`` is False.
    """
    # Function-scope import, like kagglehub below, keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)

    # 1. Try kagglehub
    df = None
    try:
        import kagglehub

        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
        # Sorted so the chosen file does not depend on directory listing order.
        csv_files = sorted(Path(path).glob("*.csv"))
        if csv_files:
            df = _clean(pd.read_csv(csv_files[0]))
        else:
            log.warning("Kaggle download at %s contains no CSV file.", path)
    except ImportError:
        log.info("kagglehub is not installed; skipping Kaggle download.")
    except Exception as exc:  # best-effort step: report, then fall through
        log.warning(
            "Kaggle download failed (%s: %s); trying other sources.",
            type(exc).__name__,
            exc,
        )
        df = None

    if df is not None:
        # Caching is an optimisation only: a failed write (missing directory,
        # read-only install location, ...) must not discard the downloaded data.
        try:
            DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(DATA_PATH, index=False)
        except OSError as exc:
            log.warning("Could not cache dataset at %s: %s", DATA_PATH, exc)
        return df, "kaggle"

    # 2. Try local pre-downloaded file
    if DATA_PATH.exists():
        return _clean(pd.read_csv(DATA_PATH)), "local_file"

    # 3. Synthetic fallback
    if allow_synthetic:
        return _generate_synthetic(), "synthetic"

    raise FileNotFoundError(
        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
        "variables, or place the CSV at data/alzheimers.csv."
    )


def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Restricts a raw frame to the known feature/target columns and imputes
    missing feature values.

    Rows without a target value are dropped. Missing values are filled with
    the column mode for binary/categorical/ordinal features and with the
    column median otherwise. A discrete column with no observed values at all
    is left as-is, since there is nothing to impute from.

    Raises:
        KeyError: If the target column is absent from ``df``.
    """
    if TARGET_COL not in df.columns:
        raise KeyError(f"Required target column {TARGET_COL!r} is missing from the dataset.")

    cols_present = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
    # Selecting the known columns already discards everything else; copy so
    # the caller's frame is never modified by the fills below.
    df = df[cols_present].dropna(subset=[TARGET_COL]).copy()

    for col in FEATURE_COLS:
        if col not in df.columns or not df[col].isna().any():
            continue
        if FEATURE_META[col]["type"] in ("binary", "categorical", "ordinal"):
            modes = df[col].mode()
            if modes.empty:
                # Every value is missing: no mode exists.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = df[col].median()
        df[col] = df[col].fillna(fill_value)
    return df


def get_population_stats(df: pd.DataFrame) -> dict:
    """
    Computes per-feature population statistics for dashboard comparison.

    Continuous features get ``mean``, ``std``, ``p25`` and ``p75`` (rounded to
    2 decimals). All other features get ``mode`` and ``rate``, where ``rate``
    is the column mean rounded to 3 decimals. Features missing from ``df`` are
    skipped. ``mode`` is None when the column has no observed values.
    """
    stats = {}
    for col in FEATURE_COLS:
        if col not in df.columns:
            continue
        series = df[col]
        if FEATURE_META[col]["type"] == "continuous":
            stats[col] = {
                "mean": round(float(series.mean()), 2),
                "std": round(float(series.std()), 2),
                "p25": round(float(series.quantile(0.25)), 2),
                "p75": round(float(series.quantile(0.75)), 2),
            }
        else:
            modes = series.mode()
            stats[col] = {
                "mode": int(modes.iloc[0]) if not modes.empty else None,
                "rate": round(float(series.mean()), 3),
            }
    return stats


if __name__ == "__main__":
    df, source = load_dataset()
    print(f"Source: {source} | Shape: {df.shape}")
    print(f"Diagnosis rate: {df['Diagnosis'].mean():.1%}")
    print(df[FEATURE_COLS[:6]].head(3))