Spaces:

kshamaasuresh
/

CognitivePulse

Sleeping

File size: 13,475 Bytes

14a5ab4

"""
data_loader.py — CognitivePulse

Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279)
and exposes a clean, preprocessed DataFrame for downstream modelling.

The dataset covers 2,149 patients; this module keeps 33 columns (32 features plus the target) spanning:
  - Demographics: Age, Gender, Ethnicity, EducationLevel
  - Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality
  - Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes,
                     Depression, HeadInjury, Hypertension
  - Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal,
                            CholesterolLDL, CholesterolHDL, CholesterolTriglycerides
  - Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints,
                            BehavioralProblems, ADL
  - Symptoms: Confusion, Disorientation, PersonalityChanges,
              DifficultyCompletingTasks, Forgetfulness
  - Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's)

Download strategy (tried in order):
  1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json)
  2. Local file at data/alzheimers.csv (for pre-downloaded environments)
  3. Synthetic fallback — statistically matched to published feature distributions from
     the Kaggle dataset description; clearly flagged in the UI and README.

Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set].
           Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279
"""

from __future__ import annotations

import hashlib
import json
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Feature metadata: used for UI labels, binning, and intervention logic.
# Each key is a dataset column name; each value is a dict with:
#   "label"      - human-readable display name for the column
#   "type"       - one of "continuous", "binary", "categorical", "ordinal";
#                  _clean() and get_population_stats() branch on this value
#   "modifiable" - boolean flag; not read by any code visible in this file
#                  (presumably consumed by the intervention logic - verify against callers)
FEATURE_META = {
    "Age":                   {"label": "Age (years)",             "type": "continuous", "modifiable": False},
    "Gender":                {"label": "Gender",                  "type": "binary",     "modifiable": False},
    "Ethnicity":             {"label": "Ethnicity",               "type": "categorical","modifiable": False},
    "EducationLevel":        {"label": "Education Level",         "type": "ordinal",    "modifiable": False},
    "BMI":                   {"label": "BMI",                     "type": "continuous", "modifiable": True},
    "Smoking":               {"label": "Smoking",                 "type": "binary",     "modifiable": True},
    "AlcoholConsumption":    {"label": "Alcohol (units/week)",    "type": "continuous", "modifiable": True},
    "PhysicalActivity":      {"label": "Physical Activity (hrs/wk)", "type": "continuous", "modifiable": True},
    "DietQuality":           {"label": "Diet Quality Score",      "type": "continuous", "modifiable": True},
    "SleepQuality":          {"label": "Sleep Quality Score",     "type": "continuous", "modifiable": True},
    "FamilyHistoryAlzheimers": {"label": "Family History of Alzheimer's", "type": "binary", "modifiable": False},
    "CardiovascularDisease": {"label": "Cardiovascular Disease",  "type": "binary",     "modifiable": True},
    "Diabetes":              {"label": "Diabetes",                "type": "binary",     "modifiable": True},
    "Depression":            {"label": "Depression",              "type": "binary",     "modifiable": True},
    "HeadInjury":            {"label": "History of Head Injury",  "type": "binary",     "modifiable": False},
    "Hypertension":          {"label": "Hypertension",            "type": "binary",     "modifiable": True},
    "SystolicBP":            {"label": "Systolic BP (mmHg)",      "type": "continuous", "modifiable": True},
    "DiastolicBP":           {"label": "Diastolic BP (mmHg)",     "type": "continuous", "modifiable": True},
    "CholesterolTotal":      {"label": "Total Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
    "CholesterolLDL":        {"label": "LDL Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
    "CholesterolHDL":        {"label": "HDL Cholesterol (mg/dL)", "type": "continuous", "modifiable": True},
    "CholesterolTriglycerides": {"label": "Triglycerides (mg/dL)","type": "continuous", "modifiable": True},
    "MMSE":                  {"label": "MMSE Score",              "type": "continuous", "modifiable": False},
    "FunctionalAssessment":  {"label": "Functional Assessment",   "type": "continuous", "modifiable": False},
    "MemoryComplaints":      {"label": "Memory Complaints",       "type": "binary",     "modifiable": False},
    "BehavioralProblems":    {"label": "Behavioral Problems",     "type": "binary",     "modifiable": False},
    "ADL":                   {"label": "Activities of Daily Living", "type": "continuous", "modifiable": False},
    "Confusion":             {"label": "Confusion",               "type": "binary",     "modifiable": False},
    "Disorientation":        {"label": "Disorientation",          "type": "binary",     "modifiable": False},
    "PersonalityChanges":    {"label": "Personality Changes",     "type": "binary",     "modifiable": False},
    "DifficultyCompletingTasks": {"label": "Difficulty Completing Tasks", "type": "binary", "modifiable": False},
    "Forgetfulness":         {"label": "Forgetfulness",           "type": "binary",     "modifiable": False},
}

# Feature column names, in FEATURE_META insertion order (the target is excluded).
FEATURE_COLS = list(FEATURE_META.keys())
# Name of the label column.
TARGET_COL = "Diagnosis"

# Published reference ranges / population norms (approximate midpoints from
# dataset description and AD prevention literature); used for dashboard banding.
# Each listed feature maps to three (low, high) tuples keyed "optimal",
# "caution" and "flag". Only a subset of the FEATURE_META columns has bands.
# NOTE(review): whether the bounds are inclusive is decided by the consumer,
# which is not visible here. BMI values below 18.5 fall in no band - confirm
# that this is intended.
REFERENCE_RANGES = {
    "BMI":                   {"optimal": (18.5, 25),  "caution": (25, 30),   "flag": (30, 40)},
    "PhysicalActivity":      {"optimal": (5, 10),     "caution": (2, 5),     "flag": (0, 2)},
    "DietQuality":           {"optimal": (7, 10),     "caution": (4, 7),     "flag": (0, 4)},
    "SleepQuality":          {"optimal": (7, 10),     "caution": (5, 7),     "flag": (4, 5)},
    "SystolicBP":            {"optimal": (90, 120),   "caution": (120, 140), "flag": (140, 180)},
    "DiastolicBP":           {"optimal": (60, 80),    "caution": (80, 90),   "flag": (90, 120)},
    "CholesterolLDL":        {"optimal": (0, 100),    "caution": (100, 160), "flag": (160, 300)},
    "CholesterolHDL":        {"optimal": (60, 300),   "caution": (40, 60),   "flag": (0, 40)},
    "CholesterolTriglycerides": {"optimal": (0, 150), "caution": (150, 200), "flag": (200, 500)},
    "MMSE":                  {"optimal": (24, 30),    "caution": (18, 24),   "flag": (0, 18)},
    "AlcoholConsumption":    {"optimal": (0, 7),      "caution": (7, 14),    "flag": (14, 20)},
}


# CSV location, resolved relative to this file. load_dataset() writes the
# cleaned Kaggle download here and reads it back as the local-file fallback.
DATA_PATH = Path(__file__).parent / "data" / "alzheimers.csv"
# Default seed for _generate_synthetic().
SYNTHETIC_SEED = 42


def _generate_synthetic(n: int = 500, seed: int = SYNTHETIC_SEED) -> pd.DataFrame:
    """
    Build a reproducible stand-in dataset for when the real CSV is unavailable.

    Rows are drawn separately for the negative and the positive class (most
    features use a class-specific mean or probability), concatenated with the
    negatives first, and finally shuffled. The distributions approximate those
    described in El Kharoua (2024); the result is flagged as synthetic in the
    UI and README.

    Args:
        n: Total number of rows. ``int(n * 0.354)`` of them are positives.
        seed: Seeds both the NumPy generator and the final row shuffle.

    Returns:
        A DataFrame holding the 32 feature columns followed by "Diagnosis".
    """
    rng = np.random.default_rng(seed)

    positives = int(n * 0.354)  # ~35.4% positive rate matching the published class balance
    negatives = n - positives

    def draw_cohort(size, is_case):
        # All draws share one generator, so the ORDER of the statements below
        # is part of the output. Do not reorder them.

        def bernoulli(p_case, p_control):
            return rng.binomial(1, p_case if is_case else p_control, size)

        def bounded_normal(mu_case, mu_control, sd, lo, hi):
            centre = mu_case if is_case else mu_control
            return rng.normal(centre, sd, size).clip(lo, hi)

        def whole(values):
            return values.round(0).astype(int)

        cols = {}
        # Demographics
        cols["Age"] = rng.integers(60, 91, size)
        cols["Gender"] = rng.integers(0, 2, size)
        cols["Ethnicity"] = rng.choice([0, 1, 2, 3], size, p=[0.65, 0.15, 0.12, 0.08])
        cols["EducationLevel"] = rng.choice([0, 1, 2, 3], size, p=[0.10, 0.30, 0.40, 0.20])
        # Lifestyle
        cols["BMI"] = bounded_normal(28.5, 27.2, 4.5, 15, 40).round(1)
        cols["Smoking"] = bernoulli(0.35, 0.20)
        cols["AlcoholConsumption"] = rng.uniform(0, 20, size).round(1)
        cols["PhysicalActivity"] = bounded_normal(3.5, 5.5, 2, 0, 10).round(1)
        cols["DietQuality"] = bounded_normal(5.2, 6.8, 1.8, 0, 10).round(1)
        cols["SleepQuality"] = bounded_normal(5.8, 7.2, 1.5, 4, 10).round(1)
        # Medical history
        cols["FamilyHistoryAlzheimers"] = bernoulli(0.55, 0.25)
        cols["CardiovascularDisease"] = bernoulli(0.42, 0.22)
        cols["Diabetes"] = bernoulli(0.38, 0.20)
        cols["Depression"] = bernoulli(0.45, 0.20)
        cols["HeadInjury"] = bernoulli(0.30, 0.15)
        cols["Hypertension"] = bernoulli(0.52, 0.30)
        # Clinical measurements
        cols["SystolicBP"] = whole(bounded_normal(145, 128, 18, 90, 180))
        cols["DiastolicBP"] = whole(bounded_normal(82, 75, 12, 60, 120))
        cols["CholesterolTotal"] = whole(bounded_normal(220, 200, 35, 150, 300))
        cols["CholesterolLDL"] = whole(bounded_normal(145, 115, 28, 50, 300))
        cols["CholesterolHDL"] = whole(bounded_normal(48, 58, 12, 20, 100))
        cols["CholesterolTriglycerides"] = whole(bounded_normal(175, 140, 45, 50, 500))
        # Cognitive assessments
        cols["MMSE"] = bounded_normal(20, 27, 4, 0, 30).round(1)
        cols["FunctionalAssessment"] = bounded_normal(6, 8, 2, 0, 10).round(1)
        cols["MemoryComplaints"] = bernoulli(0.70, 0.25)
        cols["BehavioralProblems"] = bernoulli(0.55, 0.15)
        cols["ADL"] = bounded_normal(5.5, 8, 2, 0, 10).round(1)
        # Symptoms
        cols["Confusion"] = bernoulli(0.60, 0.15)
        cols["Disorientation"] = bernoulli(0.55, 0.10)
        cols["PersonalityChanges"] = bernoulli(0.50, 0.12)
        cols["DifficultyCompletingTasks"] = bernoulli(0.65, 0.18)
        cols["Forgetfulness"] = bernoulli(0.75, 0.30)
        # Target: constant within a cohort
        if is_case:
            cols["Diagnosis"] = np.ones(size, dtype=int)
        else:
            cols["Diagnosis"] = np.zeros(size, dtype=int)
        return pd.DataFrame(cols)

    control_rows = draw_cohort(negatives, False)
    case_rows = draw_cohort(positives, True)
    combined = pd.concat([control_rows, case_rows], ignore_index=True)
    # Shuffle so the classes are interleaved, then renumber the rows.
    shuffled = combined.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)


def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
    """
    Load the Alzheimer's dataset, trying each source in turn.

    Sources, in order:
      1. kagglehub download, cleaned with _clean() and then cached at
         DATA_PATH on a best-effort basis
      2. the CSV at DATA_PATH, cleaned with _clean()
      3. _generate_synthetic(), only when ``allow_synthetic`` is true

    Any failure in step 1 is logged and treated as "source unavailable".
    Errors raised while reading or cleaning the local CSV propagate unchanged.

    Returns (dataframe, source_label).
    source_label is one of: "kaggle", "local_file", "synthetic"

    Raises:
        FileNotFoundError: no source produced data and ``allow_synthetic``
            is false.
    """
    log = logging.getLogger(__name__)

    # 1. Try kagglehub
    df = None
    try:
        import kagglehub
        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
        # Sort so the chosen file does not depend on directory listing order.
        csv_files = sorted(Path(path).glob("*.csv"))
        if csv_files:
            df = _clean(pd.read_csv(csv_files[0]))
        else:
            log.warning("Kaggle download at %s contains no CSV file", path)
    except Exception as exc:
        # Deliberately broad: a missing package, missing credentials, network
        # errors and malformed files all mean "fall through to the next source".
        log.warning("Kaggle source unavailable (%s: %s)", type(exc).__name__, exc)
        df = None

    if df is not None:
        # Caching is optional. A failed write (missing directory, read-only
        # filesystem) must not discard the data that was just downloaded.
        try:
            DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(DATA_PATH, index=False)
        except OSError as exc:
            log.warning("Could not cache dataset at %s (%s)", DATA_PATH, exc)
        return df, "kaggle"

    # 2. Try local pre-downloaded file
    if DATA_PATH.exists():
        df = pd.read_csv(DATA_PATH)
        df = _clean(df)
        return df, "local_file"

    # 3. Synthetic fallback
    if allow_synthetic:
        return _generate_synthetic(), "synthetic"

    raise FileNotFoundError(
        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
        "variables, or place the CSV at data/alzheimers.csv."
    )


def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce a raw frame to the known columns and impute missing feature values.

    Steps:
      1. Keep only the columns named in FEATURE_COLS plus TARGET_COL, in that
         order; any other column is discarded.
      2. Drop rows whose target is missing.
      3. Fill missing feature values: the mode for "binary", "categorical" and
         "ordinal" features, the median otherwise. A column with no observed
         values is left as-is because there is nothing to impute from.

    Args:
        df: Raw frame; it is not modified.

    Returns:
        A new, cleaned DataFrame. The original row index labels are kept.

    Raises:
        KeyError: ``df`` has no TARGET_COL column.
    """
    if TARGET_COL not in df.columns:
        raise KeyError(f"Required column {TARGET_COL!r} is missing from the dataset")

    keep = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
    # Selecting `keep` already discards every other column; no separate drop needed.
    df = df[keep].dropna(subset=[TARGET_COL]).copy()

    discrete_types = ("binary", "categorical", "ordinal")
    for col in keep:
        if col == TARGET_COL or not df[col].isna().any():
            continue
        if FEATURE_META[col]["type"] in discrete_types:
            modes = df[col].mode()
            if modes.empty:
                # All values are missing: mode() is empty and [0] would raise.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = df[col].median()
        df[col] = df[col].fillna(fill_value)
    return df


def get_population_stats(df: pd.DataFrame) -> dict:
    """
    Summarise every known feature column of ``df`` for dashboard comparison.

    Features listed in FEATURE_COLS but absent from ``df`` are skipped.
    Continuous features yield ``mean``, ``std``, ``p25`` and ``p75`` (rounded
    to 2 decimals). All other feature types yield ``mode`` (as int) and
    ``rate``, the column mean rounded to 3 decimals.

    Returns:
        Dict mapping column name to its statistics dict.
    """

    def rounded(value, ndigits):
        return round(float(value), ndigits)

    summary = {}
    for name in (c for c in FEATURE_COLS if c in df.columns):
        series = df[name]
        if FEATURE_META[name]["type"] != "continuous":
            summary[name] = {
                "mode": int(series.mode()[0]),
                "rate": rounded(series.mean(), 3),
            }
            continue
        summary[name] = {
            "mean": rounded(series.mean(), 2),
            "std": rounded(series.std(), 2),
            "p25": rounded(series.quantile(0.25), 2),
            "p75": rounded(series.quantile(0.75), 2),
        }
    return summary


if __name__ == "__main__":
    # Manual smoke test: load from whichever source is available and print
    # the source label, the frame shape, the positive rate and a short preview.
    frame, origin = load_dataset()
    print(f"Source: {origin} | Shape: {frame.shape}")
    positive_rate = frame["Diagnosis"].mean()
    print(f"Diagnosis rate: {positive_rate:.1%}")
    preview_cols = FEATURE_COLS[:6]
    print(frame[preview_cols].head(3))