Spaces:
Sleeping
Sleeping
| """ | |
| data_loader.py — CognitivePulse | |
| Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279) | |
| and exposes a clean, preprocessed DataFrame for downstream modelling. | |
| The dataset covers 2,149 patients; this module uses 33 columns (32 features plus the target) spanning: | |
| - Demographics: Age, Gender, Ethnicity, EducationLevel | |
| - Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality | |
| - Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes, | |
| Depression, HeadInjury, Hypertension | |
| - Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal, | |
| CholesterolLDL, CholesterolHDL, CholesterolTriglycerides | |
| - Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints, | |
| BehavioralProblems, ADL | |
| - Symptoms: Confusion, Disorientation, PersonalityChanges, | |
| DifficultyCompletingTasks, Forgetfulness | |
| - Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's) | |
| Download strategy (tried in order): | |
| 1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json) | |
| 2. Local file at data/alzheimers.csv (for pre-downloaded environments) | |
| 3. Synthetic fallback — statistically matched to published feature distributions from | |
| the Kaggle dataset description; clearly flagged in the UI and README. | |
| Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set]. | |
| Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279 | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import json | |
| import hashlib | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
# Per-feature metadata: used for UI labels, binning, and intervention logic.
#   label      - human-readable name
#   type       - one of "continuous", "binary", "categorical", "ordinal"
#   modifiable - boolean flag (presumably consumed by the intervention logic;
#                verify against the caller)
def _feature(label: str, kind: str, modifiable: bool) -> dict:
    """Build one FEATURE_META entry with the standard three keys."""
    return {"label": label, "type": kind, "modifiable": modifiable}


FEATURE_META = {
    # Demographics
    "Age": _feature("Age (years)", "continuous", False),
    "Gender": _feature("Gender", "binary", False),
    "Ethnicity": _feature("Ethnicity", "categorical", False),
    "EducationLevel": _feature("Education Level", "ordinal", False),
    # Lifestyle
    "BMI": _feature("BMI", "continuous", True),
    "Smoking": _feature("Smoking", "binary", True),
    "AlcoholConsumption": _feature("Alcohol (units/week)", "continuous", True),
    "PhysicalActivity": _feature("Physical Activity (hrs/wk)", "continuous", True),
    "DietQuality": _feature("Diet Quality Score", "continuous", True),
    "SleepQuality": _feature("Sleep Quality Score", "continuous", True),
    # Medical history
    "FamilyHistoryAlzheimers": _feature("Family History of Alzheimer's", "binary", False),
    "CardiovascularDisease": _feature("Cardiovascular Disease", "binary", True),
    "Diabetes": _feature("Diabetes", "binary", True),
    "Depression": _feature("Depression", "binary", True),
    "HeadInjury": _feature("History of Head Injury", "binary", False),
    "Hypertension": _feature("Hypertension", "binary", True),
    # Clinical measurements
    "SystolicBP": _feature("Systolic BP (mmHg)", "continuous", True),
    "DiastolicBP": _feature("Diastolic BP (mmHg)", "continuous", True),
    "CholesterolTotal": _feature("Total Cholesterol (mg/dL)", "continuous", True),
    "CholesterolLDL": _feature("LDL Cholesterol (mg/dL)", "continuous", True),
    "CholesterolHDL": _feature("HDL Cholesterol (mg/dL)", "continuous", True),
    "CholesterolTriglycerides": _feature("Triglycerides (mg/dL)", "continuous", True),
    # Cognitive assessments
    "MMSE": _feature("MMSE Score", "continuous", False),
    "FunctionalAssessment": _feature("Functional Assessment", "continuous", False),
    "MemoryComplaints": _feature("Memory Complaints", "binary", False),
    "BehavioralProblems": _feature("Behavioral Problems", "binary", False),
    "ADL": _feature("Activities of Daily Living", "continuous", False),
    # Symptoms
    "Confusion": _feature("Confusion", "binary", False),
    "Disorientation": _feature("Disorientation", "binary", False),
    "PersonalityChanges": _feature("Personality Changes", "binary", False),
    "DifficultyCompletingTasks": _feature("Difficulty Completing Tasks", "binary", False),
    "Forgetfulness": _feature("Forgetfulness", "binary", False),
}

# Feature column names, in FEATURE_META declaration order.
FEATURE_COLS = list(FEATURE_META)
# Name of the label column (0 = No Alzheimer's, 1 = Alzheimer's).
TARGET_COL = "Diagnosis"
# Published reference ranges / population norms (approximate, taken from the
# dataset description and AD prevention literature); used for dashboard banding.
# Each feature maps to three (low, high) bands: "optimal", "caution", "flag".
# NOTE(review): adjacent bands share their boundary value (e.g. BMI 25 is in
# both "optimal" and "caution"); which band wins depends on the consumer.
# NOTE(review): BMI below 18.5 is not covered by any band.
def _bands(optimal: tuple, caution: tuple, flag: tuple) -> dict:
    """Build one REFERENCE_RANGES entry from its three (low, high) bands."""
    return {"optimal": optimal, "caution": caution, "flag": flag}


REFERENCE_RANGES = {
    "BMI": _bands((18.5, 25), (25, 30), (30, 40)),
    "PhysicalActivity": _bands((5, 10), (2, 5), (0, 2)),
    "DietQuality": _bands((7, 10), (4, 7), (0, 4)),
    "SleepQuality": _bands((7, 10), (5, 7), (4, 5)),
    "SystolicBP": _bands((90, 120), (120, 140), (140, 180)),
    "DiastolicBP": _bands((60, 80), (80, 90), (90, 120)),
    "CholesterolLDL": _bands((0, 100), (100, 160), (160, 300)),
    "CholesterolHDL": _bands((60, 300), (40, 60), (0, 40)),
    "CholesterolTriglycerides": _bands((0, 150), (150, 200), (200, 500)),
    "MMSE": _bands((24, 30), (18, 24), (0, 18)),
    "AlcoholConsumption": _bands((0, 7), (7, 14), (14, 20)),
}
# Location of the local CSV copy: <directory of this module>/data/alzheimers.csv.
DATA_PATH = Path(__file__).parent.joinpath("data", "alzheimers.csv")
# Default seed for the synthetic fallback generator.
SYNTHETIC_SEED = 42
def _generate_synthetic(n: int = 500, seed: int = SYNTHETIC_SEED) -> pd.DataFrame:
    """
    Build a synthetic stand-in for the El Kharoua (2024) dataset.

    Used as a fallback when the Kaggle dataset is not available; clearly
    flagged as synthetic in the UI and README.

    Args:
        n: Total number of rows to generate.
        seed: Seed for both the NumPy generator and the final row shuffle.

    Returns:
        A shuffled DataFrame with every feature column plus "Diagnosis".
        int(n * 0.354) rows are positive; the remainder are negative.
    """
    rng = np.random.default_rng(seed)
    # ~35.4% positive rate matching the published class balance.
    positives = int(n * 0.354)
    negatives = n - positives

    def cohort(size: int, has_ad: bool) -> pd.DataFrame:
        """Draw `size` rows for one diagnosis class."""

        def pick(if_pos, if_neg):
            # Class-conditional parameter selection.
            return if_pos if has_ad else if_neg

        def gauss(mu_pos, mu_neg, sd, lo, hi):
            # Normal draw with a class-dependent mean, clipped to [lo, hi].
            return rng.normal(pick(mu_pos, mu_neg), sd, size).clip(lo, hi)

        def flag(p_pos, p_neg):
            # Bernoulli draw with a class-dependent probability.
            return rng.binomial(1, pick(p_pos, p_neg), size)

        def whole(values):
            # Round to the nearest integer and store as int.
            return values.round(0).astype(int)

        # Dict values are evaluated top to bottom, so the generator is consumed
        # in column order; reordering entries would change the output for a
        # given seed.
        columns = {
            "Age": rng.integers(60, 91, size),
            "Gender": rng.integers(0, 2, size),
            "Ethnicity": rng.choice([0, 1, 2, 3], size, p=[0.65, 0.15, 0.12, 0.08]),
            "EducationLevel": rng.choice([0, 1, 2, 3], size, p=[0.10, 0.30, 0.40, 0.20]),
            "BMI": gauss(28.5, 27.2, 4.5, 15, 40).round(1),
            "Smoking": flag(0.35, 0.20),
            "AlcoholConsumption": rng.uniform(0, 20, size).round(1),
            "PhysicalActivity": gauss(3.5, 5.5, 2, 0, 10).round(1),
            "DietQuality": gauss(5.2, 6.8, 1.8, 0, 10).round(1),
            "SleepQuality": gauss(5.8, 7.2, 1.5, 4, 10).round(1),
            "FamilyHistoryAlzheimers": flag(0.55, 0.25),
            "CardiovascularDisease": flag(0.42, 0.22),
            "Diabetes": flag(0.38, 0.20),
            "Depression": flag(0.45, 0.20),
            "HeadInjury": flag(0.30, 0.15),
            "Hypertension": flag(0.52, 0.30),
            "SystolicBP": whole(gauss(145, 128, 18, 90, 180)),
            "DiastolicBP": whole(gauss(82, 75, 12, 60, 120)),
            "CholesterolTotal": whole(gauss(220, 200, 35, 150, 300)),
            "CholesterolLDL": whole(gauss(145, 115, 28, 50, 300)),
            "CholesterolHDL": whole(gauss(48, 58, 12, 20, 100)),
            "CholesterolTriglycerides": whole(gauss(175, 140, 45, 50, 500)),
            "MMSE": gauss(20, 27, 4, 0, 30).round(1),
            "FunctionalAssessment": gauss(6, 8, 2, 0, 10).round(1),
            "MemoryComplaints": flag(0.70, 0.25),
            "BehavioralProblems": flag(0.55, 0.15),
            "ADL": gauss(5.5, 8, 2, 0, 10).round(1),
            "Confusion": flag(0.60, 0.15),
            "Disorientation": flag(0.55, 0.10),
            "PersonalityChanges": flag(0.50, 0.12),
            "DifficultyCompletingTasks": flag(0.65, 0.18),
            "Forgetfulness": flag(0.75, 0.30),
            "Diagnosis": np.full(size, int(has_ad), dtype=int),
        }
        return pd.DataFrame(columns)

    # Negatives are drawn before positives; the order matters for reproducibility.
    negative_rows = cohort(negatives, False)
    positive_rows = cohort(positives, True)
    combined = pd.concat([negative_rows, positive_rows], ignore_index=True)
    # Shuffle so the two classes are interleaved.
    shuffled = combined.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)
def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
    """
    Load the dataset from the first source that works.

    Sources are tried in order:
      1. kagglehub download (cleaned, then cached to DATA_PATH),
      2. the local CSV at DATA_PATH,
      3. the synthetic generator, if `allow_synthetic` is True.

    Args:
        allow_synthetic: Whether to fall back to `_generate_synthetic()` when
            neither the Kaggle download nor the local file is available.

    Returns:
        (dataframe, source_label), where source_label is one of
        "kaggle", "local_file", "synthetic".

    Raises:
        FileNotFoundError: No source was available and `allow_synthetic` is False.
    """
    # Function-scope import, matching the lazy `kagglehub` import below.
    import logging

    log = logging.getLogger(__name__)

    # 1. Try kagglehub
    df = None
    try:
        import kagglehub

        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
        # Sorted so the chosen file does not depend on filesystem listing order.
        csv_files = sorted(Path(path).glob("*.csv"))
        if csv_files:
            df = _clean(pd.read_csv(csv_files[0]))
    except ImportError:
        log.info("kagglehub is not installed; skipping Kaggle download.")
    except Exception as exc:
        # Deliberate best-effort: any download/parse failure falls through to
        # the next source, but the reason is recorded instead of discarded.
        log.warning(
            "Kaggle download failed (%s: %s); trying other sources.",
            type(exc).__name__,
            exc,
        )

    if df is not None:
        # Caching is optional. It is kept outside the download `try` so that a
        # missing or read-only data directory cannot discard a good download.
        try:
            DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(DATA_PATH, index=False)
        except OSError as exc:
            log.warning("Could not cache dataset at %s: %s", DATA_PATH, exc)
        return df, "kaggle"

    # 2. Try local pre-downloaded file
    if DATA_PATH.exists():
        return _clean(pd.read_csv(DATA_PATH)), "local_file"

    # 3. Synthetic fallback
    if allow_synthetic:
        return _generate_synthetic(), "synthetic"

    raise FileNotFoundError(
        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
        "variables, or place the CSV at data/alzheimers.csv."
    )
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce a raw frame to the known columns and impute missing feature values.

    Steps:
      * keep only the columns listed in FEATURE_COLS plus TARGET_COL
        (in that order), dropping everything else;
      * drop rows whose target is missing;
      * fill missing feature values with the column mode for binary /
        categorical / ordinal features, and the column median otherwise.

    Args:
        df: Raw dataset; not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: `df` has no TARGET_COL column.
    """
    if TARGET_COL not in df.columns:
        # Same exception type pandas would raise from dropna, with a clear message.
        raise KeyError(f"Required target column {TARGET_COL!r} is missing from the dataset.")

    keep = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
    cleaned = df[keep].dropna(subset=[TARGET_COL]).copy()

    discrete_types = ("binary", "categorical", "ordinal")
    for col in keep:
        if col == TARGET_COL:
            continue
        column = cleaned[col]
        if not column.isna().any():
            continue
        if FEATURE_META[col]["type"] in discrete_types:
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is nothing to impute from.
                # Leave it as-is, like the median branch does for all-NaN data.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()
        cleaned[col] = column.fillna(fill_value)
    return cleaned
def get_population_stats(df: pd.DataFrame) -> dict:
    """
    Compute per-feature population statistics for dashboard comparison.

    Args:
        df: Dataset containing some or all of FEATURE_COLS.

    Returns:
        A dict keyed by feature name. Continuous features map to
        {"mean", "std", "p25", "p75"} (rounded to 2 decimals); all other
        features map to {"mode", "rate"}, where "rate" is the column mean
        rounded to 3 decimals. Features that are absent from `df`, or have
        no non-missing values, are omitted.
    """
    stats = {}
    for col in FEATURE_COLS:
        if col not in df.columns:
            continue
        observed = df[col].dropna()
        if observed.empty:
            # No data to summarise; mode() would be empty and indexing it fails.
            continue
        if FEATURE_META[col]["type"] == "continuous":
            stats[col] = {
                "mean": round(float(observed.mean()), 2),
                "std": round(float(observed.std()), 2),
                "p25": round(float(observed.quantile(0.25)), 2),
                "p75": round(float(observed.quantile(0.75)), 2),
            }
        else:
            # NOTE(review): for categorical/ordinal features "rate" is the mean
            # of the integer codes, not a prevalence; it is only a proportion
            # for 0/1 columns.
            stats[col] = {
                "mode": int(observed.mode().iloc[0]),
                "rate": round(float(observed.mean()), 3),
            }
    return stats
if __name__ == "__main__":
    # Manual smoke test: load from whichever source is available and print a summary.
    df, source = load_dataset()
    print(f"Source: {source} | Shape: {df.shape}")
    print(f"Diagnosis rate: {df['Diagnosis'].mean():.1%}")
    # Preview the first three rows of the first six feature columns.
    preview_cols = FEATURE_COLS[:6]
    preview = df[preview_cols].head(3)
    print(preview)