"""
data_loader.py — CognitivePulse
Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279)
and exposes a clean, preprocessed DataFrame for downstream modelling.
The dataset covers 2,149 patients; this module uses 33 columns (32 features plus the
diagnosis target), spanning:
- Demographics: Age, Gender, Ethnicity, EducationLevel
- Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality
- Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes,
Depression, HeadInjury, Hypertension
- Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal,
CholesterolLDL, CholesterolHDL, CholesterolTriglycerides
- Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints,
BehavioralProblems, ADL
- Symptoms: Confusion, Disorientation, PersonalityChanges,
DifficultyCompletingTasks, Forgetfulness
- Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's)
Download strategy (tried in order):
1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json)
2. Local file at data/alzheimers.csv (for pre-downloaded environments)
3. Synthetic fallback — statistically matched to published feature distributions from
the Kaggle dataset description; clearly flagged in the UI and README.
Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set].
Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279
"""
from __future__ import annotations
import os
import json
import hashlib
from pathlib import Path
import numpy as np
import pandas as pd
# Feature metadata: used for UI labels, binning, and intervention logic.
# Each row is (column name, display label, value type, modifiable flag). The rows
# are expanded below into the {"label", "type", "modifiable"} dicts that make up
# FEATURE_META, and their order defines the feature column order.
_FEATURE_ROWS = (
    # Demographics
    ("Age", "Age (years)", "continuous", False),
    ("Gender", "Gender", "binary", False),
    ("Ethnicity", "Ethnicity", "categorical", False),
    ("EducationLevel", "Education Level", "ordinal", False),
    # Lifestyle
    ("BMI", "BMI", "continuous", True),
    ("Smoking", "Smoking", "binary", True),
    ("AlcoholConsumption", "Alcohol (units/week)", "continuous", True),
    ("PhysicalActivity", "Physical Activity (hrs/wk)", "continuous", True),
    ("DietQuality", "Diet Quality Score", "continuous", True),
    ("SleepQuality", "Sleep Quality Score", "continuous", True),
    # Medical history
    ("FamilyHistoryAlzheimers", "Family History of Alzheimer's", "binary", False),
    ("CardiovascularDisease", "Cardiovascular Disease", "binary", True),
    ("Diabetes", "Diabetes", "binary", True),
    ("Depression", "Depression", "binary", True),
    ("HeadInjury", "History of Head Injury", "binary", False),
    ("Hypertension", "Hypertension", "binary", True),
    # Clinical measurements
    ("SystolicBP", "Systolic BP (mmHg)", "continuous", True),
    ("DiastolicBP", "Diastolic BP (mmHg)", "continuous", True),
    ("CholesterolTotal", "Total Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolLDL", "LDL Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolHDL", "HDL Cholesterol (mg/dL)", "continuous", True),
    ("CholesterolTriglycerides", "Triglycerides (mg/dL)", "continuous", True),
    # Cognitive assessments
    ("MMSE", "MMSE Score", "continuous", False),
    ("FunctionalAssessment", "Functional Assessment", "continuous", False),
    ("MemoryComplaints", "Memory Complaints", "binary", False),
    ("BehavioralProblems", "Behavioral Problems", "binary", False),
    ("ADL", "Activities of Daily Living", "continuous", False),
    # Symptoms
    ("Confusion", "Confusion", "binary", False),
    ("Disorientation", "Disorientation", "binary", False),
    ("PersonalityChanges", "Personality Changes", "binary", False),
    ("DifficultyCompletingTasks", "Difficulty Completing Tasks", "binary", False),
    ("Forgetfulness", "Forgetfulness", "binary", False),
)

FEATURE_META = {
    name: {"label": label, "type": kind, "modifiable": modifiable}
    for name, label, kind, modifiable in _FEATURE_ROWS
}

# Model input columns, in the order the features are declared above.
FEATURE_COLS = list(FEATURE_META)

# Name of the label column (see the module docstring: 0 = No Alzheimer's, 1 = Alzheimer's).
TARGET_COL = "Diagnosis"
# Published reference ranges / population norms (approximate midpoints from
# dataset description and AD prevention literature); used for dashboard banding.
# Each feature maps the band names "optimal", "caution" and "flag" to a
# (low, high) tuple in that feature's own units.
# NOTE(review): whether the bounds are treated as inclusive is decided by the
# consumer, which is not part of this module — confirm there. BMI values below
# 18.5 fall into none of the three bands.
REFERENCE_RANGES = {
    "BMI": dict(
        optimal=(18.5, 25),
        caution=(25, 30),
        flag=(30, 40),
    ),
    "PhysicalActivity": dict(
        optimal=(5, 10),
        caution=(2, 5),
        flag=(0, 2),
    ),
    "DietQuality": dict(
        optimal=(7, 10),
        caution=(4, 7),
        flag=(0, 4),
    ),
    "SleepQuality": dict(
        optimal=(7, 10),
        caution=(5, 7),
        flag=(4, 5),
    ),
    "SystolicBP": dict(
        optimal=(90, 120),
        caution=(120, 140),
        flag=(140, 180),
    ),
    "DiastolicBP": dict(
        optimal=(60, 80),
        caution=(80, 90),
        flag=(90, 120),
    ),
    "CholesterolLDL": dict(
        optimal=(0, 100),
        caution=(100, 160),
        flag=(160, 300),
    ),
    # HDL is the one lipid where higher is better, so the bands run the other way.
    "CholesterolHDL": dict(
        optimal=(60, 300),
        caution=(40, 60),
        flag=(0, 40),
    ),
    "CholesterolTriglycerides": dict(
        optimal=(0, 150),
        caution=(150, 200),
        flag=(200, 500),
    ),
    "MMSE": dict(
        optimal=(24, 30),
        caution=(18, 24),
        flag=(0, 18),
    ),
    "AlcoholConsumption": dict(
        optimal=(0, 7),
        caution=(7, 14),
        flag=(14, 20),
    ),
}
# Location of the cached / pre-downloaded CSV: <directory of this module>/data/alzheimers.csv
DATA_PATH = Path(__file__).parent.joinpath("data", "alzheimers.csv")

# Default seed for the synthetic fallback, so repeated runs produce the same frame.
SYNTHETIC_SEED = 42
def _generate_synthetic(n: int = 500, seed: int = SYNTHETIC_SEED) -> pd.DataFrame:
    """
    Build a reproducible synthetic stand-in for the El Kharoua (2024) dataset.

    Used as the last-resort fallback when neither the Kaggle download nor the
    local CSV is available. The frame is clearly flagged as synthetic in the UI
    and README.

    Two cohorts are sampled from one seeded generator — negatives first, then
    positives — each with its own class-conditional parameters. They are then
    concatenated and shuffled with the same seed.

    Args:
        n: Total number of rows to generate.
        seed: Seed for both the NumPy generator and the final shuffle.

    Returns:
        A DataFrame with the 32 feature columns followed by "Diagnosis".
    """
    rng = np.random.default_rng(seed)
    positives = int(n * 0.354)  # ~35.4% positive rate matching the published class balance
    negatives = n - positives

    def cohort(size, has_ad):
        # Every draw below consumes the shared generator, and dict values are
        # evaluated top to bottom. Reordering the columns would therefore change
        # the data produced for a given seed.

        def pick(if_positive, if_negative):
            # Class-conditional parameter selection.
            return if_positive if has_ad else if_negative

        def yes_no(p_positive, p_negative):
            # 0/1 indicator with a class-dependent prevalence.
            return rng.binomial(1, pick(p_positive, p_negative), size)

        def bounded_normal(mu_positive, mu_negative, sd, low, high):
            # Normal draw with a class-dependent mean, clipped to [low, high].
            return rng.normal(pick(mu_positive, mu_negative), sd, size).clip(low, high)

        def as_whole(values):
            # Round to the nearest whole number and store as integers.
            return values.round(0).astype(int)

        columns = {
            "Age": rng.integers(60, 91, size),
            "Gender": rng.integers(0, 2, size),
            "Ethnicity": rng.choice([0, 1, 2, 3], size, p=[0.65, 0.15, 0.12, 0.08]),
            "EducationLevel": rng.choice([0, 1, 2, 3], size, p=[0.10, 0.30, 0.40, 0.20]),
            "BMI": bounded_normal(28.5, 27.2, 4.5, 15, 40).round(1),
            "Smoking": yes_no(0.35, 0.20),
            "AlcoholConsumption": rng.uniform(0, 20, size).round(1),
            "PhysicalActivity": bounded_normal(3.5, 5.5, 2, 0, 10).round(1),
            "DietQuality": bounded_normal(5.2, 6.8, 1.8, 0, 10).round(1),
            "SleepQuality": bounded_normal(5.8, 7.2, 1.5, 4, 10).round(1),
            "FamilyHistoryAlzheimers": yes_no(0.55, 0.25),
            "CardiovascularDisease": yes_no(0.42, 0.22),
            "Diabetes": yes_no(0.38, 0.20),
            "Depression": yes_no(0.45, 0.20),
            "HeadInjury": yes_no(0.30, 0.15),
            "Hypertension": yes_no(0.52, 0.30),
            "SystolicBP": as_whole(bounded_normal(145, 128, 18, 90, 180)),
            "DiastolicBP": as_whole(bounded_normal(82, 75, 12, 60, 120)),
            "CholesterolTotal": as_whole(bounded_normal(220, 200, 35, 150, 300)),
            "CholesterolLDL": as_whole(bounded_normal(145, 115, 28, 50, 300)),
            "CholesterolHDL": as_whole(bounded_normal(48, 58, 12, 20, 100)),
            "CholesterolTriglycerides": as_whole(bounded_normal(175, 140, 45, 50, 500)),
            "MMSE": bounded_normal(20, 27, 4, 0, 30).round(1),
            "FunctionalAssessment": bounded_normal(6, 8, 2, 0, 10).round(1),
            "MemoryComplaints": yes_no(0.70, 0.25),
            "BehavioralProblems": yes_no(0.55, 0.15),
            "ADL": bounded_normal(5.5, 8, 2, 0, 10).round(1),
            "Confusion": yes_no(0.60, 0.15),
            "Disorientation": yes_no(0.55, 0.10),
            "PersonalityChanges": yes_no(0.50, 0.12),
            "DifficultyCompletingTasks": yes_no(0.65, 0.18),
            "Forgetfulness": yes_no(0.75, 0.30),
            "Diagnosis": np.full(size, pick(1, 0), dtype=int),
        }
        return pd.DataFrame(columns)

    # Negatives are sampled before positives; list elements evaluate left to right.
    stacked = pd.concat([cohort(negatives, False), cohort(positives, True)], ignore_index=True)

    # Shuffle so the two classes are interleaved, then renumber the rows.
    return stacked.sample(frac=1, random_state=seed).reset_index(drop=True)
def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
    """
    Load the Alzheimer's dataset from the first source that works.

    Sources are tried in order:
      1. kagglehub download (cleaned, then cached to DATA_PATH on a best-effort basis)
      2. the local CSV at DATA_PATH
      3. the synthetic generator, if ``allow_synthetic`` is true

    Args:
        allow_synthetic: When False, raise instead of returning synthetic data.

    Returns:
        (dataframe, source_label), where source_label is one of
        "kaggle", "local_file", "synthetic".

    Raises:
        FileNotFoundError: if no real source is available and
            ``allow_synthetic`` is False.

    Errors while reading or cleaning an existing local file are not caught
    and propagate to the caller.
    """
    import logging  # function-scope import: the module does not import logging at the top

    log = logging.getLogger(__name__)

    # 1. Try kagglehub
    df = None
    try:
        import kagglehub

        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
        # Sorted so the chosen file is deterministic if the download holds several CSVs.
        csv_files = sorted(Path(path).glob("*.csv"))
        if csv_files:
            df = _clean(pd.read_csv(csv_files[0]))
    except ImportError:
        log.info("kagglehub is not installed; skipping the Kaggle download.")
    except Exception as exc:
        # Deliberate best-effort boundary: any download or parse failure (missing
        # credentials, no network, unexpected schema) falls through to the next
        # source, but is reported instead of being silently swallowed.
        log.warning(
            "Kaggle download failed (%s: %s); trying the local file.",
            type(exc).__name__,
            exc,
        )

    if df is not None:
        # Caching is a convenience only. It lives outside the download `try` so that
        # a missing data/ directory or a read-only filesystem cannot throw away a
        # dataset that was downloaded successfully.
        try:
            DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(DATA_PATH, index=False)
        except OSError as exc:
            log.warning("Could not cache the dataset at %s (%s).", DATA_PATH, exc)
        return df, "kaggle"

    # 2. Try local pre-downloaded file
    if DATA_PATH.exists():
        return _clean(pd.read_csv(DATA_PATH)), "local_file"

    # 3. Synthetic fallback
    if allow_synthetic:
        return _generate_synthetic(), "synthetic"

    raise FileNotFoundError(
        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
        "variables, or place the CSV at data/alzheimers.csv."
    )
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Restrict a raw frame to the modelled columns and impute missing feature values.

    Keeps the columns listed in FEATURE_COLS plus TARGET_COL (in that order,
    skipping any that are absent), drops rows whose target is missing, then
    fills the remaining gaps: the most frequent value for binary / categorical /
    ordinal features, the median for all other features.

    Args:
        df: Raw dataset. The caller's frame is not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if TARGET_COL is not a column of ``df``.
    """
    if TARGET_COL not in df.columns:
        # Same exception type pandas would raise from dropna(), with a clearer message.
        raise KeyError(f"Required target column {TARGET_COL!r} is missing from the dataset.")

    # Selecting the wanted columns already discards every other column, so no
    # separate drop step is needed.
    wanted = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
    df = df[wanted].copy()
    df = df.dropna(subset=[TARGET_COL])

    for col in FEATURE_COLS:
        if col not in df.columns:
            continue
        column = df[col]
        if not column.isna().any():
            continue

        if FEATURE_META[col]["type"] in ("binary", "categorical", "ordinal"):
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is no observed value to impute
                # from, so leave it untouched rather than crash on modes[0].
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()

        df[col] = column.fillna(fill_value)

    return df
def get_population_stats(df: pd.DataFrame) -> dict:
    """
    Computes per-feature population statistics for dashboard comparison.

    Features listed in FEATURE_COLS but absent from ``df`` are skipped.

    Returns:
        A dict keyed by feature name. Continuous features map to
        {"mean", "std", "p25", "p75"} rounded to 2 decimals; every other
        feature maps to {"mode", "rate"}, where "rate" is the column mean
        rounded to 3 decimals.
    """

    def summarise(values, kind):
        if kind != "continuous":
            most_common = values.mode()[0]
            return {"mode": int(most_common), "rate": round(float(values.mean()), 3)}

        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        return {
            "mean": round(float(values.mean()), 2),
            "std": round(float(values.std()), 2),
            "p25": round(float(lower_quartile), 2),
            "p75": round(float(upper_quartile), 2),
        }

    return {
        name: summarise(df[name], FEATURE_META[name]["type"])
        for name in FEATURE_COLS
        if name in df.columns
    }
if __name__ == "__main__":
    # Manual smoke test: load from whichever source is available and print a
    # short summary (source label, shape, positive-class share, small preview).
    df, source = load_dataset()
    print(f"Source: {source} | Shape: {df.shape}")
    print(f"Diagnosis rate: {df['Diagnosis'].mean():.1%}")

    # Preview only the first six feature columns to keep the output narrow.
    preview_columns = FEATURE_COLS[:6]
    print(df[preview_columns].head(3))