# headache-predictor-xgboost / run_training.py
# (Hub upload metadata — uploader: emp-admin, "Upload 9 files", commit 56f192b verified)
"""
═══════════════════════════════════════════════════════════════════════════
PHOEBE HEADACHE PREDICTOR v3.0 β€” Production Training Pipeline
EmpedocLabs Β© 2025
═══════════════════════════════════════════════════════════════════════════
Clinical-grade synthetic data with user archetypes:
- Chronic migraineur (high baseline, medication dependent)
- Episodic tension-type (stress/sleep driven)
- Menstrual migraine (hormonal cycle dominant)
- Weather-sensitive (barometric pressure dominant)
- Mixed/general (moderate baseline)
Leak-free: predicts day T headache using day T weather + day T-1 health/diary.
38 features matching the iOS DailySnapshotDTO:
WeatherKit forecast (6) β€” pressure, Ξ”p, |Ξ”p|, humidity, temp, drop flag
HealthKit yesterday (7) β€” sleep h/deep/rem, rhr, hrv, workout, menstrual
Diary yesterday (6) β€” headache, severity, duration, mood, #symptoms, #triggers
Diary 2-days-ago (3) β€” headache, severity, duration
Temporal (7) β€” dow sin/cos, month sin/cos, doy sin/cos, weekend
User context (3) β€” age, is_europe, is_tropical
Interactions (6) β€” sleepΓ—pressure, low_hrv, sleep_deficit,
high_humidity, streak_2d, consecutive_days
"""
import os, sys, math, random, pickle, json, warnings
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
classification_report, roc_auc_score, f1_score,
precision_recall_curve, average_precision_score, confusion_matrix,
)
warnings.filterwarnings("ignore")
# ═══════════════════════════════════════════════════════════════════════
# FEATURE SCHEMA
# ═══════════════════════════════════════════════════════════════════════
# Ordered input-feature schema.  Positions matter: generate_user() appends
# feature values positionally, so this order is the serialization contract
# with the iOS DailySnapshotDTO — do not reorder or rename without updating
# both sides.
FEATURE_NAMES = [
    # WeatherKit forecast for the target day (6)
    "pressure_mb", "pressure_change_24h", "pressure_volatility",
    "humidity_pct", "temperature_c", "is_pressure_drop",
    # HealthKit, yesterday (7)
    "sleep_total_hours", "deep_sleep_min", "rem_sleep_min",
    "resting_hr", "hrv_avg_ms", "workout_min", "menstrual_flow_flag",
    # Diary, yesterday (6)
    "had_headache_1d", "severity_1d", "duration_1d",
    "mood_1d", "symptom_count_1d", "trigger_count_1d",
    # Diary, two days ago (3)
    "had_headache_2d", "severity_2d", "duration_2d",
    # Cyclic temporal encodings plus weekend flag (7)
    "dow_sin", "dow_cos", "month_sin", "month_cos",
    "doy_sin", "doy_cos", "is_weekend",
    # Static user context (3)
    "age_midpoint", "is_europe", "is_tropical",
    # Hand-crafted interaction / derived features (6)
    "sleep_x_pressure", "low_hrv_flag", "sleep_deficit",
    "high_humidity_flag", "headache_streak_2d", "consecutive_headache_days",
]
NUM_FEATURES = len(FEATURE_NAMES) # 38
# ═══════════════════════════════════════════════════════════════════════
# USER ARCHETYPES (based on migraine clinical literature)
# ═══════════════════════════════════════════════════════════════════════
# Synthetic-population parameter table.  Per archetype:
#   weight       — sampling probability when constructing a User (sums to 1.0)
#   base_rate    — unconditional daily headache probability (log-odds baseline
#                  in generate_user)
#   *_coeff      — multiplier applied to that trigger's graded log-odds
#                  contribution inside generate_user()
# NOTE(review): "sensitivity" is defined for every archetype but is never
# read anywhere in this file (generate_user uses every other key) — confirm
# whether it was meant to feed the log-odds, or remove it.
ARCHETYPES = {
    "chronic_migraine": {
        "weight": 0.20,
        "base_rate": 0.40, # ~12 headache days/month
        "sensitivity": 0.3,
        "pressure_coeff": 0.5,
        "sleep_coeff": 0.6,
        "hrv_coeff": 0.3,
        "menstrual_coeff": 0.4,
        "humidity_coeff": 0.2,
        "rebound_coeff": 0.7, # strong rebound/cluster effect
        "weekend_coeff": 0.1,
        "temp_coeff": 0.15,
    },
    "episodic_tension": {
        "weight": 0.25,
        "base_rate": 0.12,
        "sensitivity": 0.0,
        "pressure_coeff": 0.2,
        "sleep_coeff": 0.9, # very sleep-dependent
        "hrv_coeff": 0.7, # very stress-dependent
        "menstrual_coeff": 0.1,
        "humidity_coeff": 0.1,
        "rebound_coeff": 0.2,
        "weekend_coeff": 0.25, # "weekend headache" pattern
        "temp_coeff": 0.1,
    },
    "menstrual_migraine": {
        "weight": 0.20,
        "base_rate": 0.15,
        "sensitivity": 0.1,
        "pressure_coeff": 0.3,
        "sleep_coeff": 0.4,
        "hrv_coeff": 0.3,
        "menstrual_coeff": 1.2, # dominant factor
        "humidity_coeff": 0.15,
        "rebound_coeff": 0.4,
        "weekend_coeff": 0.05,
        "temp_coeff": 0.1,
    },
    "weather_sensitive": {
        "weight": 0.15,
        "base_rate": 0.15,
        "sensitivity": 0.1,
        "pressure_coeff": 1.0, # dominant factor
        "sleep_coeff": 0.3,
        "hrv_coeff": 0.2,
        "menstrual_coeff": 0.2,
        "humidity_coeff": 0.6, # also weather
        "rebound_coeff": 0.3,
        "weekend_coeff": 0.05,
        "temp_coeff": 0.4, # temperature sensitive too
    },
    "mixed_general": {
        "weight": 0.20,
        "base_rate": 0.18,
        "sensitivity": 0.0,
        "pressure_coeff": 0.4,
        "sleep_coeff": 0.5,
        "hrv_coeff": 0.4,
        "menstrual_coeff": 0.3,
        "humidity_coeff": 0.2,
        "rebound_coeff": 0.35,
        "weekend_coeff": 0.12,
        "temp_coeff": 0.15,
    },
}
def _logistic(x):
return 1.0 / (1.0 + math.exp(-max(-20, min(20, x))))
def _cyclic(val, period):
a = 2 * math.pi * val / period
return math.sin(a), math.cos(a)
# ═══════════════════════════════════════════════════════════════════════
# USER CLASS
# ═══════════════════════════════════════════════════════════════════════
class User:
    """A synthetic app user: archetype, demographics, and personal baselines.

    All attributes are sampled from the module-level `random` generator, so
    the draw ORDER in __init__ is part of the seeded-reproducibility
    contract — do not reorder the random calls.
    """

    def __init__(self, uid):
        self.uid = uid
        # Sample the archetype proportionally to its configured weight.
        pool = list(ARCHETYPES.keys())
        probs = [ARCHETYPES[name]["weight"] for name in pool]
        (self.archetype_name,) = random.choices(pool, weights=probs)
        self.arch = ARCHETYPES[self.archetype_name]
        # Age bucket: a lower bound is drawn, the midpoint is stored.
        lower = random.choice([18, 20, 25, 30, 35, 40, 45, 50, 55, 60])
        self.age_mid = lower + 4.5
        (self.region,) = random.choices(
            ["europe", "americas", "asia", "tropical"],
            weights=[40, 30, 20, 10],
        )
        # Menstrual-migraine archetype is female by definition; otherwise
        # skew female (~65%).  The conditional expression only touches the
        # RNG in the else branch, exactly like the original if/else.
        self.is_female = (
            True
            if self.archetype_name == "menstrual_migraine"
            else random.random() < 0.65
        )
        # Personal physiological baselines (Gaussian individual variation).
        self.base_hr = random.gauss(65, 8)
        self.base_hrv = random.gauss(45, 15)
        self.base_sleep = random.gauss(7.0, 0.8)
        self.personal_noise = random.gauss(0, 0.2)
        # Menstrual-cycle parameters (cycle_len == 0 disables the cycle).
        self.cycle_len = random.randint(26, 32) if self.is_female else 0
        self.cycle_off = random.randint(0, 30)
# ═══════════════════════════════════════════════════════════════════════
# DATA GENERATION β€” per user
# ═══════════════════════════════════════════════════════════════════════
def generate_user(user: User, n_days: int, start: datetime):
    """Generate n_days of raw data, return leak-free (features, labels).

    Day T's label is paired with day T weather (forecastable ahead of time)
    plus day T-1 and T-2 health/diary data only, so no same-day health or
    diary information leaks into the features.  Returns a tuple
    (X, y) where X has shape (n_days - 2, 38) float32 and y is int32 {0, 1}.
    """
    a = user.arch
    # Weather random walk — state carried day-to-day for autocorrelation.
    pressure = random.gauss(1013.25, 8)
    temp = random.gauss(15, 10)
    humidity = random.gauss(60, 15)
    raw = []
    for d in range(n_days):
        dt = start + timedelta(days=d)
        month = dt.month
        # ── Weather with realistic autocorrelation ───────────────────
        # Seasonal set-points the walk mean-reverts toward
        # (northern-hemisphere phase).
        seasonal_t = 15 + 14 * math.sin(2 * math.pi * (month - 4) / 12)
        seasonal_h = 55 + 20 * math.sin(2 * math.pi * (month - 7) / 12)
        # Pressure: occasional fronts (sudden drops)
        if random.random() < 0.08: # ~3x/month cold front
            p_change = random.gauss(-8, 3)
        elif random.random() < 0.05: # occasional rapid rise
            p_change = random.gauss(6, 2)
        else:
            # Small noise plus mean reversion toward standard sea-level
            # pressure (1013.25 mb).
            p_change = random.gauss(0, 2.5) + 0.12 * (1013.25 - pressure)
        pressure += p_change
        pressure = max(970, min(1050, pressure))
        temp += random.gauss(0, 1.8) + 0.15 * (seasonal_t - temp)
        humidity += random.gauss(0, 4) + 0.08 * (seasonal_h - humidity)
        humidity = max(15, min(98, humidity))
        # ── HealthKit ────────────────────────────────────────────────
        # Sleep varies: weekends slightly longer, bad days shorter
        base_s = user.base_sleep + (0.5 if dt.weekday() >= 5 else 0)
        sleep_h = max(2.5, random.gauss(base_s, 1.0))
        # If had headache yesterday β†’ worse sleep tonight
        if d > 0 and raw[d-1]["headache"]:
            sleep_h = max(2.5, sleep_h - random.gauss(0.8, 0.5))
        deep = max(0, random.gauss(75 + sleep_h * 3, 18))
        rem = max(0, random.gauss(80 + sleep_h * 5, 22))
        rhr = max(45, random.gauss(user.base_hr, 4))
        # HRV: stress lowers it, good sleep raises it
        hrv_base = user.base_hrv + (sleep_h - 7) * 3
        hrv = max(8, random.gauss(hrv_base, 8))
        workout = max(0, int(random.gauss(25, 18)))
        # Less workout on headache days
        if d > 0 and raw[d-1]["headache"]:
            workout = max(0, workout - 15)
        cycle_day = 0
        menstrual = False
        if user.is_female and user.cycle_len > 0:
            # cycle_day is 1-based; flow flagged on days 1-4.
            cycle_day = ((d + user.cycle_off) % user.cycle_len) + 1
            menstrual = cycle_day <= 4
        # ── Headache probability ─────────────────────────────────────
        # Accumulate log-odds: archetype baseline + per-trigger graded
        # contributions weighted by the archetype's coefficients.
        base_rate = a["base_rate"]
        lo = math.log(base_rate / (1 - base_rate))
        lo += user.personal_noise
        # Barometric pressure: graded response
        if p_change < -8:
            lo += a["pressure_coeff"] * 1.2
        elif p_change < -5:
            lo += a["pressure_coeff"] * 0.8
        elif p_change < -3:
            lo += a["pressure_coeff"] * 0.4
        elif p_change > 8:
            lo += a["pressure_coeff"] * 0.5 # rapid rise also triggers
        # Sleep: graded response
        if sleep_h < 4:
            lo += a["sleep_coeff"] * 1.2
        elif sleep_h < 5:
            lo += a["sleep_coeff"] * 0.8
        elif sleep_h < 6:
            lo += a["sleep_coeff"] * 0.4
        elif sleep_h > 9:
            lo += a["sleep_coeff"] * 0.3 # oversleep trigger
        # HRV (stress proxy): graded
        if hrv < 20:
            lo += a["hrv_coeff"] * 1.0
        elif hrv < 30:
            lo += a["hrv_coeff"] * 0.6
        elif hrv < 35:
            lo += a["hrv_coeff"] * 0.2
        # Menstrual: perimenstrual window (days 1-3, 26-28)
        if user.is_female and user.cycle_len > 0:
            if cycle_day <= 3:
                lo += a["menstrual_coeff"] * 1.0
            elif cycle_day <= 5:
                lo += a["menstrual_coeff"] * 0.4
            elif cycle_day >= user.cycle_len - 2:
                lo += a["menstrual_coeff"] * 0.7 # premenstrual
        # Humidity
        if humidity > 85:
            lo += a["humidity_coeff"] * 0.8
        elif humidity > 75:
            lo += a["humidity_coeff"] * 0.3
        # Temperature extremes
        if temp > 32 or temp < -8:
            lo += a["temp_coeff"] * 0.8
        elif temp > 28 or temp < -3:
            lo += a["temp_coeff"] * 0.3
        # Rebound / cluster effect
        if d > 0 and raw[d-1]["headache"]:
            lo += a["rebound_coeff"] * 0.6
        if d > 1 and raw[d-2]["headache"]:
            lo += a["rebound_coeff"] * 0.3 # 2-day streak
        # Weekend "let-down" headache
        if dt.weekday() == 5: # Saturday
            lo += a["weekend_coeff"]
        elif dt.weekday() == 6:
            lo += a["weekend_coeff"] * 0.6
        # Small random noise (less than before β€” let signal dominate)
        lo += random.gauss(0, 0.15)
        prob = _logistic(lo)
        headache = random.random() < prob
        # ── Diary details ────────────────────────────────────────────
        # Severity-conditioned diary fields; mood distribution shifts low
        # on headache days.
        if headache:
            severity = random.choices([1,2,3,4,5], weights=[8,22,38,22,10])[0]
            duration = round(max(0.5, random.gauss(2.5 + severity * 1.0, 1.5)), 1)
            n_symp = random.randint(1, min(5, 1 + severity))
            n_trig = random.randint(0, min(4, severity))
            mood = random.choices([1,2,3,4,5], weights=[30,35,25,8,2])[0]
        else:
            severity, duration, n_symp, n_trig = 0, 0.0, 0, 0
            mood = random.choices([1,2,3,4,5], weights=[3,8,25,38,26])[0]
        raw.append({
            "dt": dt, "pressure": pressure, "p_change": p_change,
            "humidity": humidity, "temp": temp,
            "sleep_h": round(sleep_h, 1), "deep": round(deep, 0),
            "rem": round(rem, 0), "rhr": round(rhr, 0),
            "hrv": round(hrv, 1), "workout": workout,
            "menstrual": menstrual,
            "headache": headache, "severity": severity,
            "duration": duration, "mood": mood,
            "n_symp": n_symp, "n_trig": n_trig,
        })
    # ── Build feature vectors (leak-free) ────────────────────────────
    # Start at i=2 so both 1-day and 2-day lags exist for every row.
    rows, labels = [], []
    consec = 0
    for i in range(2, n_days):
        t = raw[i] # target day
        y = raw[i - 1] # yesterday
        p = raw[i - 2] # 2 days ago
        dt = t["dt"]
        f = []
        # Weather target (6) — order must match FEATURE_NAMES.
        f.append(t["pressure"])
        f.append(t["p_change"])
        f.append(abs(t["p_change"]))
        f.append(t["humidity"])
        f.append(t["temp"])
        f.append(1.0 if t["p_change"] < -5 else 0.0)
        # HealthKit yesterday (7)
        f.append(y["sleep_h"])
        f.append(y["deep"])
        f.append(y["rem"])
        f.append(y["rhr"])
        f.append(y["hrv"])
        f.append(float(y["workout"]))
        f.append(1.0 if y["menstrual"] else 0.0)
        # Diary yesterday (6)
        f.append(1.0 if y["headache"] else 0.0)
        f.append(float(y["severity"]))
        f.append(float(y["duration"]))
        f.append(float(y["mood"]))
        f.append(float(y["n_symp"]))
        f.append(float(y["n_trig"]))
        # Diary 2d ago (3)
        f.append(1.0 if p["headache"] else 0.0)
        f.append(float(p["severity"]))
        f.append(float(p["duration"]))
        # Temporal (7)
        dw_s, dw_c = _cyclic(dt.weekday(), 7)
        mn_s, mn_c = _cyclic(dt.month - 1, 12)
        dy_s, dy_c = _cyclic(dt.timetuple().tm_yday, 365)
        f.extend([dw_s, dw_c, mn_s, mn_c, dy_s, dy_c])
        f.append(1.0 if dt.weekday() >= 5 else 0.0)
        # User context (3)
        f.append(user.age_mid)
        f.append(1.0 if "europe" in user.region else 0.0)
        f.append(1.0 if "tropical" in user.region else 0.0)
        # Interactions (6)
        f.append(y["sleep_h"] * abs(t["p_change"]))
        f.append(1.0 if y["hrv"] < 25 else 0.0)
        f.append(max(0.0, 6.0 - y["sleep_h"]))
        f.append(1.0 if t["humidity"] > 80 else 0.0)
        streak = (1.0 if y["headache"] else 0.0) + (1.0 if p["headache"] else 0.0)
        f.append(streak)
        # NOTE(review): consec starts at 0 at i == 2, so a headache run that
        # already included raw[0] is undercounted by one on the first feature
        # day — confirm this is the intended warm-up behavior.
        consec = (consec + 1) if y["headache"] else 0
        f.append(float(min(consec, 7)))
        rows.append(f)
        labels.append(1 if t["headache"] else 0)
    return np.array(rows, dtype=np.float32), np.array(labels, dtype=np.int32)
# ═══════════════════════════════════════════════════════════════════════
# DATASET ASSEMBLY
# ═══════════════════════════════════════════════════════════════════════
def generate_dataset(n_users=2000, days=365, seed=42):
    """Simulate a cohort of synthetic users and assemble one DataFrame.

    Seeds both `random` and `np.random`, generates `days` of data per user
    via generate_user(), and returns the stacked feature frame with
    `headache`, `user_id`, and `archetype` columns attached.
    """
    random.seed(seed)
    np.random.seed(seed)
    feature_blocks, label_blocks = [], []
    user_ids, archetype_col = [], []
    start = datetime(2023, 6, 1)
    arch_counts = {}
    for uid in range(n_users):
        user = User(uid)
        name = user.archetype_name
        arch_counts[name] = arch_counts.get(name, 0) + 1
        X_u, y_u = generate_user(user, days, start)
        feature_blocks.append(X_u)
        label_blocks.append(y_u)
        n_rows = len(y_u)
        user_ids.extend([uid] * n_rows)
        archetype_col.extend([name] * n_rows)
        if (uid + 1) % 200 == 0:
            print(f" {uid + 1}/{n_users} users generated")
    X = np.vstack(feature_blocks)
    y = np.concatenate(label_blocks)
    df = pd.DataFrame(X, columns=FEATURE_NAMES)
    df["headache"] = y
    df["user_id"] = user_ids
    df["archetype"] = archetype_col
    print(f"\nβœ… Dataset: {df.shape[0]:,} rows Γ— {NUM_FEATURES} features")
    print(f" Headache rate: {y.mean():.1%}")
    print(f" Archetypes: {arch_counts}")
    return df
# ═══════════════════════════════════════════════════════════════════════
# TRAINING
# ═══════════════════════════════════════════════════════════════════════
def group_split(df, test_f=0.12, val_f=0.12, seed=42):
    """Three-way split grouped by user_id, so no user spans partitions.

    Returns (train, val, test) DataFrames; val_f is rescaled relative to
    the rows remaining after the test split is removed.
    """
    outer = GroupShuffleSplit(n_splits=1, test_size=test_f, random_state=seed)
    trainval_idx, test_idx = next(outer.split(df, groups=df["user_id"]))
    trainval, test = df.iloc[trainval_idx], df.iloc[test_idx]
    # Rescale the validation fraction to the remaining (1 - test_f) rows.
    inner = GroupShuffleSplit(
        n_splits=1, test_size=val_f / (1 - test_f), random_state=seed
    )
    train_idx, val_idx = next(inner.split(trainval, groups=trainval["user_id"]))
    return trainval.iloc[train_idx], trainval.iloc[val_idx], test
def tune_threshold(y_true, y_prob):
    """Return (threshold, f1) maximizing F1 along the precision-recall curve."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
    best = np.argmax(f1_scores)
    # precision_recall_curve returns one fewer threshold than PR points,
    # so clamp the index in case the final (recall=0) point wins.
    idx = min(best, len(thresholds) - 1)
    return float(thresholds[idx]), float(f1_scores[best])
def evaluate(y_true, y_prob, thr, label):
    """Threshold y_prob at `thr`, print a labelled report, and return
    a summary dict with roc_auc / pr_auc / f1 rounded to 4 decimals."""
    y_pred = (y_prob >= thr).astype(int)
    rule = "═" * 60
    print(f"\n{rule}")
    print(f" {label} (threshold={thr:.3f})")
    print(rule)
    report = classification_report(
        y_true, y_pred,
        target_names=["No headache", "Headache"], zero_division=0,
    )
    print(report)
    metrics = {
        "roc_auc": roc_auc_score(y_true, y_prob),
        "pr_auc": average_precision_score(y_true, y_prob),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }
    print(f" ROC-AUC : {metrics['roc_auc']:.4f}")
    print(f" PR-AUC : {metrics['pr_auc']:.4f}")
    print(f" F1 : {metrics['f1']:.4f}")
    print(f" Confusion:\n{confusion_matrix(y_true, y_pred)}")
    return {name: round(value, 4) for name, value in metrics.items()}
def main():
    """End-to-end training pipeline.

    Steps: generate synthetic cohort β†’ grouped train/val/test split β†’
    fit HistGradientBoosting β†’ isotonic calibration β†’ tune decision
    threshold on validation β†’ evaluate (overall + per archetype +
    permutation importance) β†’ persist artifacts under model/ and data/.
    """
    print("=" * 62)
    print(" PHOEBE HEADACHE PREDICTOR v3.0 β€” Production Training")
    print(" EmpedocLabs | Beta Release Build")
    print("=" * 62)
    # ── Generate ─────────────────────────────────────────────────────
    print("\nπŸ“Š Generating clinical-grade synthetic data (2000 users Γ— 365 days)...")
    df = generate_dataset(n_users=2000, days=365, seed=42)
    # ── Split (grouped by user_id — no user leaks across partitions) ─
    df_train, df_val, df_test = group_split(df)
    print(f"\nπŸ“‚ Split: Train={len(df_train):,} Val={len(df_val):,} Test={len(df_test):,}")
    X_tr = df_train[FEATURE_NAMES].values.astype(np.float32)
    y_tr = df_train["headache"].values.astype(np.int32)
    X_va = df_val[FEATURE_NAMES].values.astype(np.float32)
    y_va = df_val["headache"].values.astype(np.int32)
    X_te = df_test[FEATURE_NAMES].values.astype(np.float32)
    y_te = df_test["headache"].values.astype(np.int32)
    neg, pos = np.bincount(y_tr)
    print(f" Class: neg={neg:,} pos={pos:,} ratio={neg/pos:.2f}")
    # ── Train ────────────────────────────────────────────────────────
    print("\nπŸš€ Training HistGradientBoosting (production config)...")
    model = HistGradientBoostingClassifier(
        max_iter=800,
        max_depth=6,
        learning_rate=0.03,
        min_samples_leaf=25,
        max_leaf_nodes=48,
        l2_regularization=0.8,
        max_features=0.85,
        early_stopping=True,
        validation_fraction=0.08,
        n_iter_no_change=50,
        scoring="loss",
        class_weight="balanced",
        random_state=42,
    )
    model.fit(X_tr, y_tr)
    print(f" Iterations: {model.n_iter_}")
    # ── Calibrate ────────────────────────────────────────────────────
    # NOTE: with cv=5, CalibratedClassifierCV clones and re-fits the base
    # estimator per fold; the directly-fitted `model` above is kept only
    # for the iteration report and the raw_model artifact.
    print("\nπŸ“ Probability calibration (isotonic, 5-fold)...")
    calibrated = CalibratedClassifierCV(model, method="isotonic", cv=5)
    calibrated.fit(X_tr, y_tr)
    # ── Threshold (tuned on validation, applied unchanged to test) ───
    vp = calibrated.predict_proba(X_va)[:, 1]
    opt_thr, vf1 = tune_threshold(y_va, vp)
    print(f"\n🎯 Optimal threshold: {opt_thr:.3f} (val F1={vf1:.4f})")
    # ── Evaluate ─────────────────────────────────────────────────────
    val_m = evaluate(y_va, vp, opt_thr, "VALIDATION")
    tp = calibrated.predict_proba(X_te)[:, 1]
    test_m = evaluate(y_te, tp, opt_thr, "TEST")
    # ── Per-archetype eval ───────────────────────────────────────────
    print("\nπŸ“Š Per-archetype performance:")
    for arch in ARCHETYPES:
        mask = df_test["archetype"] == arch
        if mask.sum() < 50:
            continue  # too few rows for a stable estimate
        a_y = y_te[mask.values]
        a_p = tp[mask.values]
        # roc_auc_score raises ValueError when a slice contains only one
        # class; skip that archetype rather than crash.  (Was a bare
        # `except:` which also swallowed SystemExit/KeyboardInterrupt.)
        try:
            a_auc = roc_auc_score(a_y, a_p)
            a_f1 = f1_score(a_y, (a_p >= opt_thr).astype(int), zero_division=0)
            rate = a_y.mean()
            print(f" {arch:25s} n={mask.sum():6,} rate={rate:.1%} AUC={a_auc:.3f} F1={a_f1:.3f}")
        except ValueError:
            pass
    # ── Feature importance (permutation, on the test set) ────────────
    print("\nπŸ“Š Permutation feature importance...")
    base_auc = roc_auc_score(y_te, tp)
    imps = np.zeros(NUM_FEATURES)
    rng = np.random.RandomState(42)
    for fi in range(NUM_FEATURES):
        Xp = X_te.copy()
        Xp[:, fi] = rng.permutation(Xp[:, fi])
        pp = calibrated.predict_proba(Xp)[:, 1]
        imps[fi] = base_auc - roc_auc_score(y_te, pp)  # AUC drop = importance
    top_idx = np.argsort(imps)[-15:][::-1]
    print("\n Top features:")
    mx = max(imps.max(), 1e-6)  # guard against all-zero importances
    for r, i in enumerate(top_idx, 1):
        bar = "β–ˆ" * max(1, int(imps[i] / mx * 35)) if imps[i] > 0 else "Β·"
        print(f" {r:2d}. {FEATURE_NAMES[i]:30s} Ξ”AUC={imps[i]:+.4f} {bar}")
    # ── Save ─────────────────────────────────────────────────────────
    os.makedirs("model", exist_ok=True)
    model_data = {
        "model": calibrated,
        "raw_model": model,
        "optimal_threshold": opt_thr,
        "feature_names": FEATURE_NAMES,
        "num_features": NUM_FEATURES,
        "model_version": "3.0.0",
        "trained_at": datetime.now().isoformat(),
        "test_metrics": test_m,
        "val_metrics": val_m,
        "training_rows": len(df_train),
        "total_users": 2000,
        "feature_importances": {
            FEATURE_NAMES[i]: round(float(imps[i]), 6)
            for i in top_idx if imps[i] > 0
        },
    }
    with open("model/model.pkl", "wb") as f:
        pickle.dump(model_data, f)
    sz = os.path.getsize("model/model.pkl") / 1024
    print(f"\nπŸ’Ύ model/model.pkl ({sz:.0f} KB)")
    # JSON metadata: drop the unpicklable-to-JSON estimators; default=str
    # stringifies anything json can't encode natively (e.g. numpy scalars).
    meta = {k: v for k, v in model_data.items() if k not in ("model", "raw_model")}
    with open("model/metadata.json", "w") as f:
        json.dump(meta, f, indent=2, default=str)
    print("πŸ“‹ model/metadata.json")
    os.makedirs("data", exist_ok=True)
    df.to_parquet("data/training_data.parquet", index=False)
    print("πŸ“ data/training_data.parquet")
    print(f"\n{'═'*62}")
    print(" βœ… PRODUCTION MODEL READY β€” v3.0.0")
    print(f" Test ROC-AUC: {test_m['roc_auc']}")
    print(f" Test F1: {test_m['f1']}")
    print(f" Threshold: {opt_thr:.3f}")
    print(f"{'═'*62}")
# Script entry guard: run the full pipeline only when executed directly,
# not when this module is imported (e.g. for its feature schema).
if __name__ == "__main__":
    main()