Spaces:

kshamaasuresh
/

CognitivePulse

Sleeping

Kshamaa S

Initial deployment: CognitivePulse biomarker intelligence and coaching assistant

14a5ab4 8 days ago

13.5 kB

	"""
	data_loader.py — CognitivePulse

	Loads the El Kharoua Alzheimer's Disease Dataset (Kaggle, 2024; DOI 10.34740/KAGGLE/DSV/8668279)
	and exposes a clean, preprocessed DataFrame for downstream modelling.

	The dataset covers 2,149 patients and 33 variables (32 predictive features plus the diagnosis target) spanning:
	- Demographics: Age, Gender, Ethnicity, EducationLevel
	- Lifestyle: BMI, Smoking, AlcoholConsumption, PhysicalActivity, DietQuality, SleepQuality
	- Medical history: FamilyHistoryAlzheimers, CardiovascularDisease, Diabetes,
	Depression, HeadInjury, Hypertension
	- Clinical measurements: SystolicBP, DiastolicBP, CholesterolTotal,
	CholesterolLDL, CholesterolHDL, CholesterolTriglycerides
	- Cognitive assessments: MMSE, FunctionalAssessment, MemoryComplaints,
	BehavioralProblems, ADL
	- Symptoms: Confusion, Disorientation, PersonalityChanges,
	DifficultyCompletingTasks, Forgetfulness
	- Target: Diagnosis (0 = No Alzheimer's, 1 = Alzheimer's)

	Download strategy (tried in order):
	1. kagglehub (requires KAGGLE_USERNAME + KAGGLE_KEY env vars or ~/.kaggle/kaggle.json)
	2. Local file at data/alzheimers.csv (for pre-downloaded environments)
	3. Synthetic fallback — statistically matched to published feature distributions from
	the Kaggle dataset description; clearly flagged in the UI and README.

	Reference: El Kharoua, R. (2024). Alzheimer's Disease Dataset [Data set].
	Kaggle. https://doi.org/10.34740/KAGGLE/DSV/8668279
	"""

	from __future__ import annotations

	import os
	import json
	import hashlib
	from pathlib import Path

	import numpy as np
	import pandas as pd

	# Feature metadata: drives UI labels, binning, and intervention logic.
	# Each row below is (column name, display label, variable type, modifiable?);
	# the comprehension expands it into the per-feature dict the rest of the
	# code looks up by column name. Row order defines FEATURE_COLS order.
	FEATURE_META = {
	    name: {"label": label, "type": kind, "modifiable": modifiable}
	    for name, label, kind, modifiable in (
	        # Demographics
	        ("Age", "Age (years)", "continuous", False),
	        ("Gender", "Gender", "binary", False),
	        ("Ethnicity", "Ethnicity", "categorical", False),
	        ("EducationLevel", "Education Level", "ordinal", False),
	        # Lifestyle
	        ("BMI", "BMI", "continuous", True),
	        ("Smoking", "Smoking", "binary", True),
	        ("AlcoholConsumption", "Alcohol (units/week)", "continuous", True),
	        ("PhysicalActivity", "Physical Activity (hrs/wk)", "continuous", True),
	        ("DietQuality", "Diet Quality Score", "continuous", True),
	        ("SleepQuality", "Sleep Quality Score", "continuous", True),
	        # Medical history
	        ("FamilyHistoryAlzheimers", "Family History of Alzheimer's", "binary", False),
	        ("CardiovascularDisease", "Cardiovascular Disease", "binary", True),
	        ("Diabetes", "Diabetes", "binary", True),
	        ("Depression", "Depression", "binary", True),
	        ("HeadInjury", "History of Head Injury", "binary", False),
	        ("Hypertension", "Hypertension", "binary", True),
	        # Clinical measurements
	        ("SystolicBP", "Systolic BP (mmHg)", "continuous", True),
	        ("DiastolicBP", "Diastolic BP (mmHg)", "continuous", True),
	        ("CholesterolTotal", "Total Cholesterol (mg/dL)", "continuous", True),
	        ("CholesterolLDL", "LDL Cholesterol (mg/dL)", "continuous", True),
	        ("CholesterolHDL", "HDL Cholesterol (mg/dL)", "continuous", True),
	        ("CholesterolTriglycerides", "Triglycerides (mg/dL)", "continuous", True),
	        # Cognitive assessments
	        ("MMSE", "MMSE Score", "continuous", False),
	        ("FunctionalAssessment", "Functional Assessment", "continuous", False),
	        ("MemoryComplaints", "Memory Complaints", "binary", False),
	        ("BehavioralProblems", "Behavioral Problems", "binary", False),
	        ("ADL", "Activities of Daily Living", "continuous", False),
	        # Symptoms
	        ("Confusion", "Confusion", "binary", False),
	        ("Disorientation", "Disorientation", "binary", False),
	        ("PersonalityChanges", "Personality Changes", "binary", False),
	        ("DifficultyCompletingTasks", "Difficulty Completing Tasks", "binary", False),
	        ("Forgetfulness", "Forgetfulness", "binary", False),
	    )
	}

	# Ordered list of predictor column names, and the name of the label column.
	FEATURE_COLS = [*FEATURE_META]
	TARGET_COL = "Diagnosis"

	# Published reference ranges / population norms (approximate midpoints from
	# dataset description and AD prevention literature); used for dashboard banding.
	# Each row is (feature, optimal band, caution band, flag band); every band is a
	# (low, high) pair expressed in the same units as the feature itself.
	REFERENCE_RANGES = {
	    feature: {"optimal": optimal, "caution": caution, "flag": flag}
	    for feature, optimal, caution, flag in (
	        ("BMI", (18.5, 25), (25, 30), (30, 40)),
	        ("PhysicalActivity", (5, 10), (2, 5), (0, 2)),
	        ("DietQuality", (7, 10), (4, 7), (0, 4)),
	        ("SleepQuality", (7, 10), (5, 7), (4, 5)),
	        ("SystolicBP", (90, 120), (120, 140), (140, 180)),
	        ("DiastolicBP", (60, 80), (80, 90), (90, 120)),
	        ("CholesterolLDL", (0, 100), (100, 160), (160, 300)),
	        ("CholesterolHDL", (60, 300), (40, 60), (0, 40)),
	        ("CholesterolTriglycerides", (0, 150), (150, 200), (200, 500)),
	        ("MMSE", (24, 30), (18, 24), (0, 18)),
	        ("AlcoholConsumption", (0, 7), (7, 14), (14, 20)),
	    )
	}


	# Location of the cached / pre-downloaded CSV, relative to this module.
	DATA_PATH = Path(__file__).parent.joinpath("data", "alzheimers.csv")
	# Default seed so the synthetic fallback is reproducible across runs.
	SYNTHETIC_SEED = 42


	def _generate_synthetic(n: int = 500, seed: int = SYNTHETIC_SEED) -> pd.DataFrame:
	    """
	    Build a synthetic stand-in for the El Kharoua (2024) dataset.

	    Used as a fallback when the Kaggle dataset is not available; the result is
	    clearly flagged as synthetic in the UI and README.

	    Args:
	        n: Total number of rows to generate.
	        seed: Seed for both the random generator and the final row shuffle.

	    Returns:
	        A shuffled DataFrame with one column per feature plus "Diagnosis",
	        of which int(n * 0.354) rows are positive cases.
	    """
	    rng = np.random.default_rng(seed)

	    positives = int(n * 0.354)  # ~35.4% positive rate matching the published class balance
	    negatives = n - positives

	    def cohort(size, pos):
	        """Draw `size` rows for one diagnosis class (pos=True means Alzheimer's)."""

	        def pick(if_pos, if_neg):
	            return if_pos if pos else if_neg

	        def gauss(mu_pos, mu_neg, sd, lo, hi):
	            # Class-dependent normal draw, truncated to a plausible range.
	            return rng.normal(pick(mu_pos, mu_neg), sd, size).clip(lo, hi)

	        def flag(p_pos, p_neg):
	            # Class-dependent 0/1 indicator.
	            return rng.binomial(1, pick(p_pos, p_neg), size)

	        def whole(values):
	            return values.round(0).astype(int)

	        # The generator is consumed in insertion order, so the sequence of the
	        # assignments below determines the exact values produced for a seed.
	        columns = {}
	        columns["Age"] = rng.integers(60, 91, size)
	        columns["Gender"] = rng.integers(0, 2, size)
	        columns["Ethnicity"] = rng.choice([0, 1, 2, 3], size, p=[0.65, 0.15, 0.12, 0.08])
	        columns["EducationLevel"] = rng.choice([0, 1, 2, 3], size, p=[0.10, 0.30, 0.40, 0.20])
	        columns["BMI"] = gauss(28.5, 27.2, 4.5, 15, 40).round(1)
	        columns["Smoking"] = flag(0.35, 0.20)
	        columns["AlcoholConsumption"] = rng.uniform(0, 20, size).round(1)
	        columns["PhysicalActivity"] = gauss(3.5, 5.5, 2, 0, 10).round(1)
	        columns["DietQuality"] = gauss(5.2, 6.8, 1.8, 0, 10).round(1)
	        columns["SleepQuality"] = gauss(5.8, 7.2, 1.5, 4, 10).round(1)
	        columns["FamilyHistoryAlzheimers"] = flag(0.55, 0.25)
	        columns["CardiovascularDisease"] = flag(0.42, 0.22)
	        columns["Diabetes"] = flag(0.38, 0.20)
	        columns["Depression"] = flag(0.45, 0.20)
	        columns["HeadInjury"] = flag(0.30, 0.15)
	        columns["Hypertension"] = flag(0.52, 0.30)
	        columns["SystolicBP"] = whole(gauss(145, 128, 18, 90, 180))
	        columns["DiastolicBP"] = whole(gauss(82, 75, 12, 60, 120))
	        columns["CholesterolTotal"] = whole(gauss(220, 200, 35, 150, 300))
	        columns["CholesterolLDL"] = whole(gauss(145, 115, 28, 50, 300))
	        columns["CholesterolHDL"] = whole(gauss(48, 58, 12, 20, 100))
	        columns["CholesterolTriglycerides"] = whole(gauss(175, 140, 45, 50, 500))
	        columns["MMSE"] = gauss(20, 27, 4, 0, 30).round(1)
	        columns["FunctionalAssessment"] = gauss(6, 8, 2, 0, 10).round(1)
	        columns["MemoryComplaints"] = flag(0.70, 0.25)
	        columns["BehavioralProblems"] = flag(0.55, 0.15)
	        columns["ADL"] = gauss(5.5, 8, 2, 0, 10).round(1)
	        columns["Confusion"] = flag(0.60, 0.15)
	        columns["Disorientation"] = flag(0.55, 0.10)
	        columns["PersonalityChanges"] = flag(0.50, 0.12)
	        columns["DifficultyCompletingTasks"] = flag(0.65, 0.18)
	        columns["Forgetfulness"] = flag(0.75, 0.30)
	        columns["Diagnosis"] = np.full(size, int(pos), dtype=int)
	        return pd.DataFrame(columns)

	    # Negatives are drawn before positives; the combined frame is then shuffled
	    # so the two classes are interleaved.
	    frames = [cohort(negatives, False), cohort(positives, True)]
	    combined = pd.concat(frames, ignore_index=True)
	    shuffled = combined.sample(frac=1, random_state=seed)
	    return shuffled.reset_index(drop=True)


	def load_dataset(allow_synthetic: bool = True) -> tuple[pd.DataFrame, str]:
	    """
	    Load the Alzheimer's dataset, trying each source in turn.

	    Sources, in order:
	      1. kagglehub download (cleaned, then cached to DATA_PATH on a best-effort basis)
	      2. the local CSV at DATA_PATH
	      3. a synthetic dataset, only when `allow_synthetic` is true

	    Args:
	        allow_synthetic: Whether to fall back to generated data when neither
	            the Kaggle download nor the local file is available.

	    Returns:
	        (dataframe, source_label), where source_label is one of:
	        "kaggle", "local_file", "synthetic".

	    Raises:
	        FileNotFoundError: if no real source is available and
	            `allow_synthetic` is false.
	    """
	    import logging

	    logger = logging.getLogger(__name__)

	    # 1. Try kagglehub. Any failure here (package missing, no credentials,
	    # no network, unreadable CSV) is deliberately non-fatal: we fall through
	    # to the next source, but record why instead of swallowing it silently.
	    df = None
	    try:
	        import kagglehub

	        path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")
	        # Sorted so the chosen file is deterministic if several CSVs are present.
	        csv_files = sorted(Path(path).glob("*.csv"))
	        if csv_files:
	            df = _clean(pd.read_csv(csv_files[0]))
	    except Exception as exc:
	        logger.warning("Kaggle download unavailable (%s); trying other sources.", exc)

	    if df is not None:
	        # Caching is an optimisation only. It is kept outside the try-block
	        # above so that a failed write (e.g. missing or read-only data/
	        # directory) can no longer discard a successfully downloaded dataset
	        # and silently replace it with synthetic data.
	        try:
	            DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
	            df.to_csv(DATA_PATH, index=False)
	        except OSError as exc:
	            logger.warning("Could not cache dataset at %s (%s).", DATA_PATH, exc)
	        return df, "kaggle"

	    # 2. Try local pre-downloaded file
	    if DATA_PATH.exists():
	        return _clean(pd.read_csv(DATA_PATH)), "local_file"

	    # 3. Synthetic fallback
	    if allow_synthetic:
	        return _generate_synthetic(), "synthetic"

	    raise FileNotFoundError(
	        "Could not load the dataset. Set KAGGLE_USERNAME and KAGGLE_KEY environment "
	        "variables, or place the CSV at data/alzheimers.csv."
	    )


	def _clean(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Return a cleaned copy of a raw dataset frame.

	    Keeps only the known feature columns (in FEATURE_COLS order) followed by
	    the target column, drops rows whose target is missing, and imputes
	    missing feature values: the mode for binary/categorical/ordinal features
	    and the median for everything else. The input frame is not modified.

	    Args:
	        df: Raw frame as read from CSV; unknown columns are discarded and
	            absent feature columns are tolerated.

	    Returns:
	        A new DataFrame containing the retained columns.

	    Raises:
	        KeyError: if the target column is absent from `df`.
	    """
	    # Fail with an explicit message rather than an opaque KeyError from dropna.
	    if TARGET_COL not in df.columns:
	        raise KeyError(f"Required target column {TARGET_COL!r} is missing from the dataset.")

	    cols_present = [c for c in FEATURE_COLS + [TARGET_COL] if c in df.columns]
	    # Selecting the wanted columns already discards everything else.
	    cleaned = df[cols_present].dropna(subset=[TARGET_COL]).copy()

	    for col in cols_present:
	        if col == TARGET_COL:
	            continue
	        column = cleaned[col]
	        if not column.isna().any():
	            continue
	        if FEATURE_META[col]["type"] in ("binary", "categorical", "ordinal"):
	            modes = column.mode()
	            if modes.empty:
	                # Column is entirely missing: there is no value to impute
	                # from, so leave it as-is instead of crashing on modes[0].
	                continue
	            fill_value = modes.iloc[0]
	        else:
	            fill_value = column.median()
	        cleaned[col] = column.fillna(fill_value)
	    return cleaned


	def get_population_stats(df: pd.DataFrame) -> dict:
	    """
	    Summarise each known feature of `df` for dashboard comparison.

	    Features listed in FEATURE_COLS but absent from `df` are skipped.
	    Continuous features map to {"mean", "std", "p25", "p75"} (rounded to two
	    decimals); all other feature types map to {"mode", "rate"}, where "rate"
	    is the column mean rounded to three decimals.
	    """

	    def rounded(value, digits):
	        return round(float(value), digits)

	    summary = {}
	    for name in (c for c in FEATURE_COLS if c in df.columns):
	        series = df[name]

	        if FEATURE_META[name]["type"] != "continuous":
	            most_common = int(series.mode()[0])
	            summary[name] = {"mode": most_common, "rate": rounded(series.mean(), 3)}
	            continue

	        summary[name] = {
	            "mean": rounded(series.mean(), 2),
	            "std": rounded(series.std(), 2),
	            "p25": rounded(series.quantile(0.25), 2),
	            "p75": rounded(series.quantile(0.75), 2),
	        }
	    return summary


	if __name__ == "__main__":
	    # Manual smoke test: load whichever source is available and print a
	    # short summary (source label, shape, positive-diagnosis rate, preview).
	    df, source = load_dataset()
	    # Plain "|" separator: the previous "\|" was an invalid escape sequence
	    # that printed a stray backslash and warns on recent Python versions.
	    print(f"Source: {source} | Shape: {df.shape}")
	    print(f"Diagnosis rate: {df[TARGET_COL].mean():.1%}")
	    print(df[FEATURE_COLS[:6]].head(3))