# CognitivePulse / data_loader.py
# Kshamaa S
# Initial deployment: CognitivePulse biomarker intelligence and coaching assistant
# 14a5ab4
# Raw
# History Blame Contribute Delete
# 13.5 kB
"""
data_loader.py — CognitivePulse
Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279)
and exposes a clean, preprocessed DataFrame for downstream modelling.
The dataset covers 2,149 patients; this module uses 33 columns (32 features plus the Diagnosis target) spanning:
- Demographics: Age, Gender, Ethnicity, EducationLevel
- Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality
- Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes,
Depression, HeadInjury, Hypertension
- Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal,
CholesterolLDL, CholesterolHDL, CholesterolTriglycerides
- Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints,
BehavioralProblems, ADL
- Symptoms: Confusion, Disorientation, PersonalityChanges,
DifficultyCompletingTasks, Forgetfulness
- Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's)
Download strategy (tried in order):
1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json)
2. Local file at data/alzheimers.csv (for pre-downloaded environments)
3. Synthetic fallback — statistically matched to published feature distributions from
the Kaggle dataset description; clearly flagged in the UI and README.
Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set].
Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279
"""
from __future__ import annotations
import os
import json
import hashlib
from pathlib import Path
import numpy as np
import pandas as pd
# Feature metadata: used for UI labels, binning, and intervention logic.
# Each key is a dataset column name; each value carries:
#   "label"      - human-readable display name
#   "type"       - one of "continuous", "binary", "categorical", "ordinal";
#                  _clean() uses it to choose mode vs. median imputation and
#                  get_population_stats() uses it to choose which statistics to report
#   "modifiable" - boolean flag; its consumers are outside this module
# Insertion order matters: FEATURE_COLS below is derived from it.
FEATURE_META = {
"Age": {"label": "Age (years)", "type": "continuous", "modifiable": False},
"Gender": {"label": "Gender", "type": "binary", "modifiable": False},
"Ethnicity": {"label": "Ethnicity", "type": "categorical","modifiable": False},
"EducationLevel": {"label": "Education Level", "type": "ordinal", "modifiable": False},
"BMI": {"label": "BMI", "type": "continuous", "modifiable": True},
"Smoking": {"label": "Smoking", "type": "binary", "modifiable": True},
"AlcoholConsumption": {"label": "Alcohol (units/week)", "type": "continuous", "modifiable": True},
"PhysicalActivity": {"label": "Physical Activity (hrs/wk)", "type": "continuous", "modifiable": True},
"DietQuality": {"label": "Diet Quality Score", "type": "continuous", "modifiable": True},
"SleepQuality": {"label": "Sleep Quality Score", "type": "continuous", "modifiable": True},
"FamilyHistoryAlzheimers": {"label": "Family History of Alzheimer's", "type": "binary", "modifiable": False},
"CardiovascularDisease": {"label": "Cardiovascular Disease", "type": "binary", "modifiable": True},
"Diabetes": {"label": "Diabetes", "type": "binary", "modifiable": True},
"Depression": {"label": "Depression", "type": "binary", "modifiable": True},
"HeadInjury": {"label": "History of Head Injury", "type": "binary", "modifiable": False},
"Hypertension": {"label": "Hypertension", "type": "binary", "modifiable": True},
"SystolicBP": {"label": "Systolic BP (mmHg)", "type": "continuous", "modifiable": True},
"DiastolicBP": {"label": "Diastolic BP (mmHg)", "type": "continuous", "modifiable": True},
"CholesterolTotal": {"label": "Total Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
"CholesterolLDL": {"label": "LDL Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
"CholesterolHDL": {"label": "HDL Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
"CholesterolTriglycerides": {"label": "Triglycerides (mg/dL)","type": "continuous", "modifiable": True},
"MMSE": {"label": "MMSE Score", "type": "continuous", "modifiable": False},
"FunctionalAssessment": {"label": "Functional Assessment", "type": "continuous", "modifiable": False},
"MemoryComplaints": {"label": "Memory Complaints", "type": "binary", "modifiable": False},
"BehavioralProblems": {"label": "Behavioral Problems", "type": "binary", "modifiable": False},
"ADL": {"label": "Activities of Daily Living", "type": "continuous", "modifiable": False},
"Confusion": {"label": "Confusion", "type": "binary", "modifiable": False},
"Disorientation": {"label": "Disorientation", "type": "binary", "modifiable": False},
"PersonalityChanges": {"label": "Personality Changes", "type": "binary", "modifiable": False},
"DifficultyCompletingTasks": {"label": "Difficulty Completing Tasks", "type": "binary", "modifiable": False},
"Forgetfulness": {"label": "Forgetfulness", "type": "binary", "modifiable": False},
}
# Ordered list of the 32 feature column names (the target is kept separately).
FEATURE_COLS = list(FEATURE_META.keys())
# Name of the label column; the module docstring defines 0 = No Alzheimer's, 1 = Alzheimer's.
TARGET_COL = "Diagnosis"
# Published reference ranges / population norms (approximate midpoints from
# dataset description and AD prevention literature); used for dashboard banding.
# Each feature maps three band names ("optimal", "caution", "flag") to a
# (low, high) tuple. Adjacent bands share their boundary value; which band a
# boundary value falls into is decided by the consumer, which is not visible
# in this module - TODO confirm inclusivity at the call site.
# Only a subset of the continuous features in FEATURE_META has an entry here.
REFERENCE_RANGES = {
"BMI": {"optimal": (18.5, 25), "caution": (25, 30), "flag": (30, 40)},
"PhysicalActivity": {"optimal": (5, 10), "caution": (2, 5), "flag": (0, 2)},
"DietQuality": {"optimal": (7, 10), "caution": (4, 7), "flag": (0, 4)},
"SleepQuality": {"optimal": (7, 10), "caution": (5, 7), "flag": (4, 5)},
"SystolicBP": {"optimal": (90, 120), "caution": (120, 140), "flag": (140, 180)},
"DiastolicBP": {"optimal": (60, 80), "caution": (80, 90), "flag": (90, 120)},
"CholesterolLDL": {"optimal": (0, 100), "caution": (100, 160), "flag": (160, 300)},
"CholesterolHDL": {"optimal": (60, 300), "caution": (40, 60), "flag": (0, 40)},
"CholesterolTriglycerides": {"optimal": (0, 150), "caution": (150, 200), "flag": (200, 500)},
"MMSE": {"optimal": (24, 30), "caution": (18, 24), "flag": (0, 18)},
"AlcoholConsumption": {"optimal": (0, 7), "caution": (7, 14), "flag": (14, 20)},
}
# Location of the cached / pre-downloaded CSV, resolved relative to this
# module's directory. load_dataset() writes the cleaned Kaggle download here
# and reads from here when the download is unavailable.
DATA_PATH = Path(__file__).parent / "data" / "alzheimers.csv"
# Default seed for _generate_synthetic(), so the synthetic fallback is reproducible.
SYNTHETIC_SEED = 42
def _generate_synthetic(n: int = 500, seed: int = SYNTHETIC_SEED) -> pd.DataFrame:
    """
    Build a reproducible synthetic stand-in for the El Kharoua (2024) dataset.

    Used as the last-resort fallback when neither the Kaggle download nor a
    local CSV is available. The distribution parameters below are hard-coded
    constants that the original author describes as approximating the
    published feature distributions; the result is synthetic and must be
    flagged as such wherever it is shown.

    Args:
        n: Total number of rows to generate.
        seed: Seed for both the NumPy generator and the final row shuffle.

    Returns:
        A shuffled DataFrame with the 32 feature columns plus "Diagnosis",
        containing int(n * 0.354) positive rows and the remainder negative.
    """
    rng = np.random.default_rng(seed)
    positives = int(n * 0.354)  # ~35.4% positive rate matching the published class balance
    negatives = n - positives

    def build_cohort(size, has_ad):
        # All draws share one generator and are made in column order; changing
        # the order of the statements below changes the generated values.

        def by_class(if_pos, if_neg):
            # Select the parameter for the cohort being generated.
            return if_pos if has_ad else if_neg

        def flag(p_pos, p_neg):
            # 0/1 indicator drawn with a class-dependent probability.
            return rng.binomial(1, by_class(p_pos, p_neg), size)

        def bounded(mu_pos, mu_neg, sd, lo, hi):
            # Normal draw with a class-dependent mean, clipped to [lo, hi].
            return rng.normal(by_class(mu_pos, mu_neg), sd, size).clip(lo, hi)

        def whole(values):
            # Round to the nearest integer and store as an integer dtype.
            return values.round(0).astype(int)

        cols = {}
        # Demographics (identical for both cohorts)
        cols["Age"] = rng.integers(60, 91, size)
        cols["Gender"] = rng.integers(0, 2, size)
        cols["Ethnicity"] = rng.choice([0, 1, 2, 3], size, p=[0.65, 0.15, 0.12, 0.08])
        cols["EducationLevel"] = rng.choice([0, 1, 2, 3], size, p=[0.10, 0.30, 0.40, 0.20])
        # Lifestyle
        cols["BMI"] = bounded(28.5, 27.2, 4.5, 15, 40).round(1)
        cols["Smoking"] = flag(0.35, 0.20)
        cols["AlcoholConsumption"] = rng.uniform(0, 20, size).round(1)
        cols["PhysicalActivity"] = bounded(3.5, 5.5, 2, 0, 10).round(1)
        cols["DietQuality"] = bounded(5.2, 6.8, 1.8, 0, 10).round(1)
        cols["SleepQuality"] = bounded(5.8, 7.2, 1.5, 4, 10).round(1)
        # Medical history
        cols["FamilyHistoryAlzheimers"] = flag(0.55, 0.25)
        cols["CardiovascularDisease"] = flag(0.42, 0.22)
        cols["Diabetes"] = flag(0.38, 0.20)
        cols["Depression"] = flag(0.45, 0.20)
        cols["HeadInjury"] = flag(0.30, 0.15)
        cols["Hypertension"] = flag(0.52, 0.30)
        # Clinical measurements (stored as whole numbers)
        cols["SystolicBP"] = whole(bounded(145, 128, 18, 90, 180))
        cols["DiastolicBP"] = whole(bounded(82, 75, 12, 60, 120))
        cols["CholesterolTotal"] = whole(bounded(220, 200, 35, 150, 300))
        cols["CholesterolLDL"] = whole(bounded(145, 115, 28, 50, 300))
        cols["CholesterolHDL"] = whole(bounded(48, 58, 12, 20, 100))
        cols["CholesterolTriglycerides"] = whole(bounded(175, 140, 45, 50, 500))
        # Cognitive assessments
        cols["MMSE"] = bounded(20, 27, 4, 0, 30).round(1)
        cols["FunctionalAssessment"] = bounded(6, 8, 2, 0, 10).round(1)
        cols["MemoryComplaints"] = flag(0.70, 0.25)
        cols["BehavioralProblems"] = flag(0.55, 0.15)
        cols["ADL"] = bounded(5.5, 8, 2, 0, 10).round(1)
        # Symptoms
        cols["Confusion"] = flag(0.60, 0.15)
        cols["Disorientation"] = flag(0.55, 0.10)
        cols["PersonalityChanges"] = flag(0.50, 0.12)
        cols["DifficultyCompletingTasks"] = flag(0.65, 0.18)
        cols["Forgetfulness"] = flag(0.75, 0.30)
        # Target: constant within a cohort
        cols["Diagnosis"] = np.full(size, 1 if has_ad else 0, dtype=int)
        return pd.DataFrame(cols)

    # Negative cohort is generated first, then positive, then rows are shuffled
    # with the same seed so class labels are not grouped together.
    cohorts = [build_cohort(negatives, False), build_cohort(positives, True)]
    combined = pd.concat(cohorts, ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)
def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
    """
    Load the Alzheimer's dataset, trying each source in order.

    Sources, in order of preference:
      1. Download via kagglehub, clean, and cache the result at DATA_PATH.
      2. A CSV already present at DATA_PATH.
      3. A synthetic dataset from _generate_synthetic() (only if allowed).

    Args:
        allow_synthetic: When False, do not fall back to synthetic data;
            raise instead.

    Returns:
        (dataframe, source_label) where source_label is one of
        "kaggle", "local_file", "synthetic".

    Raises:
        FileNotFoundError: No real data source is available and
            allow_synthetic is False.
    """
    import logging

    log = logging.getLogger(__name__)

    # 1. Try kagglehub. This is deliberately best-effort: a missing package,
    # missing credentials, or no network must all fall through to the next
    # source, so the broad except is kept but the reason is logged.
    try:
        import kagglehub

        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
        # sorted() makes the choice deterministic if several CSVs are present.
        csv_files = sorted(Path(path).glob("*.csv"))
        if csv_files:
            df = _clean(pd.read_csv(csv_files[0]))
            # Caching is optional. A failure to write (e.g. read-only file
            # system) must not discard data that was downloaded successfully.
            try:
                DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
                df.to_csv(DATA_PATH, index=False)
            except OSError as exc:
                log.warning("Could not cache dataset at %s: %s", DATA_PATH, exc)
            return df, "kaggle"
    except Exception as exc:
        log.info("Kaggle download unavailable (%s); trying other sources.", exc)

    # 2. Try local pre-downloaded file
    if DATA_PATH.exists():
        return _clean(pd.read_csv(DATA_PATH)), "local_file"

    # 3. Synthetic fallback
    if allow_synthetic:
        return _generate_synthetic(), "synthetic"

    raise FileNotFoundError(
        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
        "variables, or place the CSV at data/alzheimers.csv."
    )
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Restrict a raw frame to the known columns and impute missing features.

    Steps:
      * keep only the columns named in FEATURE_COLS plus TARGET_COL, in that
        order, dropping everything else;
      * drop rows whose target is missing;
      * fill missing feature values with the column mode for binary,
        categorical and ordinal features, and with the median otherwise.

    Args:
        df: Raw dataset; not modified (a copy is returned).

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: The target column is absent from ``df``.
    """
    if TARGET_COL not in df.columns:
        raise KeyError(
            f"Required target column {TARGET_COL!r} is missing from the dataset."
        )

    keep = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
    cleaned = df[keep].copy()
    cleaned = cleaned.dropna(subset=[TARGET_COL])

    discrete_types = ("binary", "categorical", "ordinal")
    for col in FEATURE_COLS:
        if col not in cleaned.columns:
            continue
        series = cleaned[col]
        if not series.isna().any():
            continue
        if FEATURE_META[col]["type"] in discrete_types:
            modes = series.mode()
            if modes.empty:
                # Column has no observed values at all, so there is nothing
                # to impute from; leave it as-is instead of crashing.
                continue
            fill_value = modes.iloc[0]
        else:
            # For an all-NaN column the median is NaN and the fill is a no-op.
            fill_value = series.median()
        cleaned[col] = series.fillna(fill_value)
    return cleaned
def get_population_stats(df: pd.DataFrame) -> dict:
    """
    Compute per-feature population statistics for dashboard comparison.

    Args:
        df: Dataset containing some or all of the columns in FEATURE_COLS.

    Returns:
        A dict keyed by feature name. Continuous features map to
        ``{"mean", "std", "p25", "p75"}`` (rounded to 2 decimals). All other
        features map to ``{"mode", "rate"}``, where "rate" is the column mean
        rounded to 3 decimals: the share of 1s for binary features, but only
        the mean of the integer codes for categorical/ordinal features.
        Features absent from ``df`` are omitted, as are non-continuous
        features with no observed values.
    """
    stats = {}
    for col in FEATURE_COLS:
        if col not in df.columns:
            continue
        series = df[col]
        if FEATURE_META[col]["type"] == "continuous":
            stats[col] = {
                "mean": round(float(series.mean()), 2),
                "std": round(float(series.std()), 2),
                "p25": round(float(series.quantile(0.25)), 2),
                "p75": round(float(series.quantile(0.75)), 2),
            }
            continue
        modes = series.mode()
        if modes.empty:
            # Empty frame or all-NaN column: there is no mode to report, so
            # skip the feature rather than fail on the missing first element.
            continue
        stats[col] = {
            "mode": int(modes.iloc[0]),
            "rate": round(float(series.mean()), 3),
        }
    return stats
if __name__ == "__main__":
    # Manual smoke test: load from whichever source is available and print
    # where the data came from, its shape, the positive rate, and a preview.
    df, source = load_dataset()
    print(f"Source: {source} | Shape: {df.shape}")
    print(f"Diagnosis rate: {df['Diagnosis'].mean():.1%}")
    preview_columns = FEATURE_COLS[:6]
    preview = df[preview_columns].head(3)
    print(preview)