| """ |
| βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| PHOEBE HEADACHE PREDICTOR v3.0 β Production Training Pipeline |
| EmpedocLabs Β© 2025 |
| βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| Clinical-grade synthetic data with user archetypes: |
| - Chronic migraineur (high baseline, medication dependent) |
| - Episodic tension-type (stress/sleep driven) |
| - Menstrual migraine (hormonal cycle dominant) |
| - Weather-sensitive (barometric pressure dominant) |
| - Mixed/general (moderate baseline) |
| |
| Leak-free: predicts day T headache using day T weather + day T-1 health/diary. |
| |
| 38 features matching the iOS DailySnapshotDTO: |
| WeatherKit forecast (6) β pressure, Ξp, |Ξp|, humidity, temp, drop flag |
| HealthKit yesterday (7) β sleep h/deep/rem, rhr, hrv, workout, menstrual |
| Diary yesterday (6) β headache, severity, duration, mood, #symptoms, #triggers |
| Diary 2-days-ago (3) β headache, severity, duration |
| Temporal (7) β dow sin/cos, month sin/cos, doy sin/cos, weekend |
| User context (3) β age, is_europe, is_tropical |
| Interactions (6) β sleepΓpressure, low_hrv, sleep_deficit, |
| high_humidity, streak_2d, consecutive_days |
| """ |
|
|
| import os, sys, math, random, pickle, json, warnings |
| import numpy as np |
| import pandas as pd |
| from datetime import datetime, timedelta |
| from sklearn.model_selection import GroupShuffleSplit |
| from sklearn.ensemble import HistGradientBoostingClassifier |
| from sklearn.calibration import CalibratedClassifierCV |
| from sklearn.metrics import ( |
| classification_report, roc_auc_score, f1_score, |
| precision_recall_curve, average_precision_score, confusion_matrix, |
| ) |
|
|
# Silence library warnings (sklearn/pandas emit many during fit/calibration)
# so the training console output stays readable.
warnings.filterwarnings("ignore")
|
|
| |
| |
| |
|
|
# Feature vector layout. Order MUST match the iOS DailySnapshotDTO encoder
# and the row-assembly order in generate_user() — do not reorder.
FEATURE_NAMES = [
    # WeatherKit forecast for day T (6)
    "pressure_mb", "pressure_change_24h", "pressure_volatility",
    "humidity_pct", "temperature_c", "is_pressure_drop",
    # HealthKit, day T-1 (7)
    "sleep_total_hours", "deep_sleep_min", "rem_sleep_min",
    "resting_hr", "hrv_avg_ms", "workout_min", "menstrual_flow_flag",
    # Diary, day T-1 (6)
    "had_headache_1d", "severity_1d", "duration_1d",
    "mood_1d", "symptom_count_1d", "trigger_count_1d",
    # Diary, day T-2 (3)
    "had_headache_2d", "severity_2d", "duration_2d",
    # Cyclic temporal encodings + weekend flag (7)
    "dow_sin", "dow_cos", "month_sin", "month_cos",
    "doy_sin", "doy_cos", "is_weekend",
    # User context (3)
    "age_midpoint", "is_europe", "is_tropical",
    # Hand-crafted interaction features (6)
    "sleep_x_pressure", "low_hrv_flag", "sleep_deficit",
    "high_humidity_flag", "headache_streak_2d", "consecutive_headache_days",
]
NUM_FEATURES = len(FEATURE_NAMES)  # 38
|
|
| |
| |
| |
|
|
# Clinical archetype parameter table used by the simulator.
#   weight          — population sampling weight (see User.__init__)
#   base_rate       — baseline daily headache probability (converted to
#                     log-odds in generate_user)
#   sensitivity     — per-archetype offset (currently read nowhere in the
#                     visible code — NOTE(review): confirm it is intentional
#                     that generate_user ignores it)
#   *_coeff         — multipliers applied to each trigger's contribution on
#                     the log-odds scale in generate_user
ARCHETYPES = {
    # High baseline rate; strong rebound (next-day recurrence) term.
    "chronic_migraine": {
        "weight": 0.20,
        "base_rate": 0.40,
        "sensitivity": 0.3,
        "pressure_coeff": 0.5,
        "sleep_coeff": 0.6,
        "hrv_coeff": 0.3,
        "menstrual_coeff": 0.4,
        "humidity_coeff": 0.2,
        "rebound_coeff": 0.7,
        "weekend_coeff": 0.1,
        "temp_coeff": 0.15,
    },
    # Driven mainly by sleep and low HRV (stress proxy).
    "episodic_tension": {
        "weight": 0.25,
        "base_rate": 0.12,
        "sensitivity": 0.0,
        "pressure_coeff": 0.2,
        "sleep_coeff": 0.9,
        "hrv_coeff": 0.7,
        "menstrual_coeff": 0.1,
        "humidity_coeff": 0.1,
        "rebound_coeff": 0.2,
        "weekend_coeff": 0.25,
        "temp_coeff": 0.1,
    },
    # Hormonal-cycle dominant (largest menstrual_coeff).
    "menstrual_migraine": {
        "weight": 0.20,
        "base_rate": 0.15,
        "sensitivity": 0.1,
        "pressure_coeff": 0.3,
        "sleep_coeff": 0.4,
        "hrv_coeff": 0.3,
        "menstrual_coeff": 1.2,
        "humidity_coeff": 0.15,
        "rebound_coeff": 0.4,
        "weekend_coeff": 0.05,
        "temp_coeff": 0.1,
    },
    # Barometric-pressure dominant (largest pressure/humidity/temp coeffs).
    "weather_sensitive": {
        "weight": 0.15,
        "base_rate": 0.15,
        "sensitivity": 0.1,
        "pressure_coeff": 1.0,
        "sleep_coeff": 0.3,
        "hrv_coeff": 0.2,
        "menstrual_coeff": 0.2,
        "humidity_coeff": 0.6,
        "rebound_coeff": 0.3,
        "weekend_coeff": 0.05,
        "temp_coeff": 0.4,
    },
    # Moderate everything — catch-all profile.
    "mixed_general": {
        "weight": 0.20,
        "base_rate": 0.18,
        "sensitivity": 0.0,
        "pressure_coeff": 0.4,
        "sleep_coeff": 0.5,
        "hrv_coeff": 0.4,
        "menstrual_coeff": 0.3,
        "humidity_coeff": 0.2,
        "rebound_coeff": 0.35,
        "weekend_coeff": 0.12,
        "temp_coeff": 0.15,
    },
}
|
|
|
|
| def _logistic(x): |
| return 1.0 / (1.0 + math.exp(-max(-20, min(20, x)))) |
|
|
|
|
| def _cyclic(val, period): |
| a = 2 * math.pi * val / period |
| return math.sin(a), math.cos(a) |
|
|
|
|
| |
| |
| |
|
|
class User:
    """One synthetic app user: an archetype plus demographic and
    physiological baselines sampled at construction time.

    NOTE: the order of ``random`` calls here is part of the contract —
    downstream reproducibility depends on it.
    """

    def __init__(self, uid):
        self.uid = uid

        # Draw the clinical archetype according to its population weight.
        archetype_names = list(ARCHETYPES.keys())
        archetype_weights = [ARCHETYPES[n]["weight"] for n in archetype_names]
        self.archetype_name = random.choices(
            archetype_names, weights=archetype_weights
        )[0]
        self.arch = ARCHETYPES[self.archetype_name]

        # Age bucket lower bound -> midpoint used as a numeric feature.
        bucket_low = random.choice([18, 20, 25, 30, 35, 40, 45, 50, 55, 60])
        self.age_mid = bucket_low + 4.5
        self.region = random.choices(
            ["europe", "americas", "asia", "tropical"],
            weights=[40, 30, 20, 10],
        )[0]

        # Menstrual-migraine users are female by construction; otherwise 65%.
        # (The random.random() draw only happens in the non-menstrual branch,
        # exactly as in the original control flow.)
        self.is_female = (
            True
            if self.archetype_name == "menstrual_migraine"
            else random.random() < 0.65
        )

        # Physiological baselines for the day-by-day simulation.
        self.base_hr = random.gauss(65, 8)
        self.base_hrv = random.gauss(45, 15)
        self.base_sleep = random.gauss(7.0, 0.8)
        self.personal_noise = random.gauss(0, 0.2)

        # Menstrual-cycle parameters; cycle_len == 0 disables the cycle.
        self.cycle_len = random.randint(26, 32) if self.is_female else 0
        self.cycle_off = random.randint(0, 30)
|
|
|
|
| |
| |
| |
|
|
def generate_user(user: User, n_days: int, start: datetime):
    """Generate n_days of raw data, return leak-free (features, labels).

    Pass 1 simulates day-by-day weather, health and diary records into
    ``raw``.  Pass 2 assembles one feature row per day starting at index 2,
    so the row for day T uses only day-T weather (a forecast is available
    at prediction time) plus day T-1 and T-2 history — no same-day
    health/diary signals leak into the features.
    """
    a = user.arch  # archetype coefficient table

    # Initial weather state for the random walk.
    pressure = random.gauss(1013.25, 8)
    temp = random.gauss(15, 10)
    humidity = random.gauss(60, 15)

    raw = []

    # ---- Pass 1: simulate each day -----------------------------------
    for d in range(n_days):
        dt = start + timedelta(days=d)
        month = dt.month

        # Seasonal mean-reversion targets for temperature and humidity.
        seasonal_t = 15 + 14 * math.sin(2 * math.pi * (month - 4) / 12)
        seasonal_h = 55 + 20 * math.sin(2 * math.pi * (month - 7) / 12)

        # Pressure walk: occasional sharp drop, occasional rise, otherwise
        # mean-reverting drift toward 1013.25 mb.  Note the elif consumes a
        # second random draw, so the rise branch fires with p = 0.92 * 0.05.
        if random.random() < 0.08:
            p_change = random.gauss(-8, 3)
        elif random.random() < 0.05:
            p_change = random.gauss(6, 2)
        else:
            p_change = random.gauss(0, 2.5) + 0.12 * (1013.25 - pressure)
        pressure += p_change
        pressure = max(970, min(1050, pressure))

        temp += random.gauss(0, 1.8) + 0.15 * (seasonal_t - temp)
        humidity += random.gauss(0, 4) + 0.08 * (seasonal_h - humidity)
        humidity = max(15, min(98, humidity))

        # Sleep: slight weekend lie-in, floored at 2.5 h.
        base_s = user.base_sleep + (0.5 if dt.weekday() >= 5 else 0)
        sleep_h = max(2.5, random.gauss(base_s, 1.0))

        # A headache yesterday shortens tonight's sleep.
        if d > 0 and raw[d-1]["headache"]:
            sleep_h = max(2.5, sleep_h - random.gauss(0.8, 0.5))

        deep = max(0, random.gauss(75 + sleep_h * 3, 18))
        rem = max(0, random.gauss(80 + sleep_h * 5, 22))

        rhr = max(45, random.gauss(user.base_hr, 4))
        # HRV tracks sleep: +3 ms per hour above/below 7 h.
        hrv_base = user.base_hrv + (sleep_h - 7) * 3
        hrv = max(8, random.gauss(hrv_base, 8))

        workout = max(0, int(random.gauss(25, 18)))
        # Less exercise the day after a headache.
        if d > 0 and raw[d-1]["headache"]:
            workout = max(0, workout - 15)

        # Menstrual cycle day (1-based); flow flagged on days 1-4.
        cycle_day = 0
        menstrual = False
        if user.is_female and user.cycle_len > 0:
            cycle_day = ((d + user.cycle_off) % user.cycle_len) + 1
            menstrual = cycle_day <= 4

        # ---- Headache log-odds: archetype baseline + trigger terms ----
        base_rate = a["base_rate"]
        lo = math.log(base_rate / (1 - base_rate))
        lo += user.personal_noise

        # Barometric pressure change: drops are graded, big rises also hurt.
        if p_change < -8:
            lo += a["pressure_coeff"] * 1.2
        elif p_change < -5:
            lo += a["pressure_coeff"] * 0.8
        elif p_change < -3:
            lo += a["pressure_coeff"] * 0.4
        elif p_change > 8:
            lo += a["pressure_coeff"] * 0.5

        # Sleep: both deprivation (graded) and oversleep (>9 h) raise risk.
        if sleep_h < 4:
            lo += a["sleep_coeff"] * 1.2
        elif sleep_h < 5:
            lo += a["sleep_coeff"] * 0.8
        elif sleep_h < 6:
            lo += a["sleep_coeff"] * 0.4
        elif sleep_h > 9:
            lo += a["sleep_coeff"] * 0.3

        # Low HRV (stress proxy), graded bands.
        if hrv < 20:
            lo += a["hrv_coeff"] * 1.0
        elif hrv < 30:
            lo += a["hrv_coeff"] * 0.6
        elif hrv < 35:
            lo += a["hrv_coeff"] * 0.2

        # Hormonal windows: early flow days and the premenstrual tail.
        if user.is_female and user.cycle_len > 0:
            if cycle_day <= 3:
                lo += a["menstrual_coeff"] * 1.0
            elif cycle_day <= 5:
                lo += a["menstrual_coeff"] * 0.4
            elif cycle_day >= user.cycle_len - 2:
                lo += a["menstrual_coeff"] * 0.7

        # High humidity.
        if humidity > 85:
            lo += a["humidity_coeff"] * 0.8
        elif humidity > 75:
            lo += a["humidity_coeff"] * 0.3

        # Temperature extremes (hot or cold).
        if temp > 32 or temp < -8:
            lo += a["temp_coeff"] * 0.8
        elif temp > 28 or temp < -3:
            lo += a["temp_coeff"] * 0.3

        # Rebound: headaches yesterday / two days ago raise today's odds.
        # (Indentation was lost in this file; these are treated as two
        # independent terms — NOTE(review): confirm the 2-days-ago term was
        # not meant to be nested inside the 1-day-ago condition.)
        if d > 0 and raw[d-1]["headache"]:
            lo += a["rebound_coeff"] * 0.6
        if d > 1 and raw[d-2]["headache"]:
            lo += a["rebound_coeff"] * 0.3

        # "Weekend headache" effect, stronger on Saturday.
        if dt.weekday() == 5:
            lo += a["weekend_coeff"]
        elif dt.weekday() == 6:
            lo += a["weekend_coeff"] * 0.6

        # Unexplained day-to-day noise.
        lo += random.gauss(0, 0.15)

        prob = _logistic(lo)
        headache = random.random() < prob

        # Diary fields: severity-linked duration/symptoms and mood shift.
        if headache:
            severity = random.choices([1,2,3,4,5], weights=[8,22,38,22,10])[0]
            duration = round(max(0.5, random.gauss(2.5 + severity * 1.0, 1.5)), 1)
            n_symp = random.randint(1, min(5, 1 + severity))
            n_trig = random.randint(0, min(4, severity))
            mood = random.choices([1,2,3,4,5], weights=[30,35,25,8,2])[0]
        else:
            severity, duration, n_symp, n_trig = 0, 0.0, 0, 0
            mood = random.choices([1,2,3,4,5], weights=[3,8,25,38,26])[0]

        raw.append({
            "dt": dt, "pressure": pressure, "p_change": p_change,
            "humidity": humidity, "temp": temp,
            "sleep_h": round(sleep_h, 1), "deep": round(deep, 0),
            "rem": round(rem, 0), "rhr": round(rhr, 0),
            "hrv": round(hrv, 1), "workout": workout,
            "menstrual": menstrual,
            "headache": headache, "severity": severity,
            "duration": duration, "mood": mood,
            "n_symp": n_symp, "n_trig": n_trig,
        })

    # ---- Pass 2: assemble leak-free feature rows ----------------------
    rows, labels = [], []
    consec = 0  # consecutive headache days ending yesterday (capped at 7)

    for i in range(2, n_days):
        t = raw[i]      # day T: prediction target + forecast weather
        y = raw[i - 1]  # day T-1: health + diary
        p = raw[i - 2]  # day T-2: diary only
        dt = t["dt"]
        f = []

        # Weather forecast for day T (6) — order matches FEATURE_NAMES.
        f.append(t["pressure"])
        f.append(t["p_change"])
        f.append(abs(t["p_change"]))
        f.append(t["humidity"])
        f.append(t["temp"])
        f.append(1.0 if t["p_change"] < -5 else 0.0)

        # HealthKit, day T-1 (7).
        f.append(y["sleep_h"])
        f.append(y["deep"])
        f.append(y["rem"])
        f.append(y["rhr"])
        f.append(y["hrv"])
        f.append(float(y["workout"]))
        f.append(1.0 if y["menstrual"] else 0.0)

        # Diary, day T-1 (6).
        f.append(1.0 if y["headache"] else 0.0)
        f.append(float(y["severity"]))
        f.append(float(y["duration"]))
        f.append(float(y["mood"]))
        f.append(float(y["n_symp"]))
        f.append(float(y["n_trig"]))

        # Diary, day T-2 (3).
        f.append(1.0 if p["headache"] else 0.0)
        f.append(float(p["severity"]))
        f.append(float(p["duration"]))

        # Cyclic temporal encodings + weekend flag (7).
        dw_s, dw_c = _cyclic(dt.weekday(), 7)
        mn_s, mn_c = _cyclic(dt.month - 1, 12)
        dy_s, dy_c = _cyclic(dt.timetuple().tm_yday, 365)
        f.extend([dw_s, dw_c, mn_s, mn_c, dy_s, dy_c])
        f.append(1.0 if dt.weekday() >= 5 else 0.0)

        # User context (3).
        f.append(user.age_mid)
        f.append(1.0 if "europe" in user.region else 0.0)
        f.append(1.0 if "tropical" in user.region else 0.0)

        # Interaction features (6).
        f.append(y["sleep_h"] * abs(t["p_change"]))
        f.append(1.0 if y["hrv"] < 25 else 0.0)
        f.append(max(0.0, 6.0 - y["sleep_h"]))
        f.append(1.0 if t["humidity"] > 80 else 0.0)
        streak = (1.0 if y["headache"] else 0.0) + (1.0 if p["headache"] else 0.0)
        f.append(streak)
        # Running count of consecutive headache days up to yesterday.
        consec = (consec + 1) if y["headache"] else 0
        f.append(float(min(consec, 7)))

        rows.append(f)
        labels.append(1 if t["headache"] else 0)

    return np.array(rows, dtype=np.float32), np.array(labels, dtype=np.int32)
|
|
|
|
| |
| |
| |
|
|
def generate_dataset(n_users=2000, days=365, seed=42):
    """Build the pooled training DataFrame for a synthetic population.

    Args:
        n_users: number of synthetic users to simulate.
        days: days of history per user; the first 2 days are burn-in, so
            each user contributes (days - 2) rows.
        seed: seeds both ``random`` and NumPy for full reproducibility.

    Returns:
        DataFrame with the 38 feature columns plus ``headache`` (label),
        ``user_id`` (for grouped splitting) and ``archetype``.
    """
    random.seed(seed)
    np.random.seed(seed)

    all_X, all_y, all_uid, all_arch = [], [], [], []
    start = datetime(2023, 6, 1)

    arch_counts = {}  # archetype name -> number of sampled users

    for uid in range(n_users):
        user = User(uid)
        arch_counts[user.archetype_name] = arch_counts.get(user.archetype_name, 0) + 1
        X_u, y_u = generate_user(user, days, start)
        all_X.append(X_u)
        all_y.append(y_u)
        all_uid.extend([uid] * len(y_u))
        all_arch.extend([user.archetype_name] * len(y_u))
        if (uid + 1) % 200 == 0:
            print(f" {uid + 1}/{n_users} users generated")

    X = np.vstack(all_X)
    y = np.concatenate(all_y)

    df = pd.DataFrame(X, columns=FEATURE_NAMES)
    df["headache"] = y
    df["user_id"] = all_uid
    df["archetype"] = all_arch

    # FIX: this summary print was split across a physical line break inside
    # the f-string (mangled emoji -> invalid syntax); re-joined onto one line.
    print(f"\nDataset: {df.shape[0]:,} rows x {NUM_FEATURES} features")
    print(f" Headache rate: {y.mean():.1%}")
    print(f" Archetypes: {arch_counts}")
    return df
|
|
|
|
| |
| |
| |
|
|
def group_split(df, test_f=0.12, val_f=0.12, seed=42):
    """Split by ``user_id`` so no user appears in more than one partition.

    Returns (train, val, test) DataFrames.
    """
    outer = GroupShuffleSplit(n_splits=1, test_size=test_f, random_state=seed)
    trainval_idx, test_idx = next(outer.split(df, groups=df["user_id"]))
    df_trainval = df.iloc[trainval_idx]
    df_test = df.iloc[test_idx]

    # Validation fraction is expressed relative to what remains after the
    # test split, so the overall proportions come out as requested.
    inner = GroupShuffleSplit(
        n_splits=1, test_size=val_f / (1 - test_f), random_state=seed
    )
    train_idx, val_idx = next(inner.split(df_trainval, groups=df_trainval["user_id"]))
    return df_trainval.iloc[train_idx], df_trainval.iloc[val_idx], df_test
|
|
|
|
def tune_threshold(y_true, y_prob):
    """Pick the decision threshold that maximizes F1 along the PR curve.

    Returns (threshold, best_f1).
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
    best_idx = np.argmax(f1_scores)
    # precision/recall carry one more point than thresholds (the final
    # no-threshold point), so clamp the index before looking it up.
    thr_idx = min(best_idx, len(thresholds) - 1)
    return float(thresholds[thr_idx]), float(f1_scores[best_idx])
|
|
|
|
def evaluate(y_true, y_prob, thr, label):
    """Print a labelled evaluation report and return headline metrics.

    Returns {"roc_auc", "pr_auc", "f1"} rounded to 4 decimals.
    """
    y_pred = (y_prob >= thr).astype(int)

    rule = "β" * 60
    print(f"\n{rule}")
    print(f" {label} (threshold={thr:.3f})")
    print(rule)
    report = classification_report(
        y_true, y_pred, target_names=["No headache", "Headache"], zero_division=0
    )
    print(report)

    metrics = {
        "roc_auc": roc_auc_score(y_true, y_prob),
        "pr_auc": average_precision_score(y_true, y_prob),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }
    print(f" ROC-AUC : {metrics['roc_auc']:.4f}")
    print(f" PR-AUC : {metrics['pr_auc']:.4f}")
    print(f" F1 : {metrics['f1']:.4f}")
    print(f" Confusion:\n{confusion_matrix(y_true, y_pred)}")
    return {k: round(v, 4) for k, v in metrics.items()}
|
|
|
|
def main():
    """Run the full pipeline: generate data, split by user, train, calibrate,
    tune the threshold, evaluate, and persist model + metadata + dataset.
    """
    print("=" * 62)
    print(" PHOEBE HEADACHE PREDICTOR v3.0 β Production Training")
    print(" EmpedocLabs | Beta Release Build")
    print("=" * 62)

    # ---- Data (fixed seed for reproducibility) ------------------------
    print("\nGenerating clinical-grade synthetic data (2000 users x 365 days)...")
    df = generate_dataset(n_users=2000, days=365, seed=42)

    # Grouped split: no user contributes rows to more than one partition.
    df_train, df_val, df_test = group_split(df)
    print(f"\nSplit: Train={len(df_train):,} Val={len(df_val):,} Test={len(df_test):,}")

    X_tr = df_train[FEATURE_NAMES].values.astype(np.float32)
    y_tr = df_train["headache"].values.astype(np.int32)
    X_va = df_val[FEATURE_NAMES].values.astype(np.float32)
    y_va = df_val["headache"].values.astype(np.int32)
    X_te = df_test[FEATURE_NAMES].values.astype(np.float32)
    y_te = df_test["headache"].values.astype(np.int32)

    neg, pos = np.bincount(y_tr)
    print(f" Class: neg={neg:,} pos={pos:,} ratio={neg/pos:.2f}")

    # ---- Model: gradient-boosted trees with internal early stopping ---
    print("\nTraining HistGradientBoosting (production config)...")
    model = HistGradientBoostingClassifier(
        max_iter=800,
        max_depth=6,
        learning_rate=0.03,
        min_samples_leaf=25,
        max_leaf_nodes=48,
        l2_regularization=0.8,
        max_features=0.85,
        early_stopping=True,
        validation_fraction=0.08,
        n_iter_no_change=50,
        scoring="loss",
        class_weight="balanced",
        random_state=42,
    )
    model.fit(X_tr, y_tr)
    print(f" Iterations: {model.n_iter_}")

    # Isotonic calibration so predicted probabilities are usable as risk.
    print("\nProbability calibration (isotonic, 5-fold)...")
    calibrated = CalibratedClassifierCV(model, method="isotonic", cv=5)
    calibrated.fit(X_tr, y_tr)

    # Threshold tuned on the validation set only (test stays untouched).
    vp = calibrated.predict_proba(X_va)[:, 1]
    opt_thr, vf1 = tune_threshold(y_va, vp)
    print(f"\nOptimal threshold: {opt_thr:.3f} (val F1={vf1:.4f})")

    val_m = evaluate(y_va, vp, opt_thr, "VALIDATION")
    tp = calibrated.predict_proba(X_te)[:, 1]
    test_m = evaluate(y_te, tp, opt_thr, "TEST")

    # ---- Per-archetype breakdown on the test set ----------------------
    print(f"\nPer-archetype performance:")
    for arch in ARCHETYPES:
        mask = df_test["archetype"] == arch
        if mask.sum() < 50:
            continue
        a_y = y_te[mask.values]
        a_p = tp[mask.values]
        try:
            a_auc = roc_auc_score(a_y, a_p)
            a_f1 = f1_score(a_y, (a_p >= opt_thr).astype(int), zero_division=0)
            rate = a_y.mean()
            print(f" {arch:25s} n={mask.sum():6,} rate={rate:.1%} AUC={a_auc:.3f} F1={a_f1:.3f}")
        except ValueError:
            # FIX: was a bare `except:` that also swallowed SystemExit and
            # KeyboardInterrupt.  roc_auc_score raises ValueError when the
            # archetype slice contains a single class — skip just that case.
            pass

    # ---- Permutation importance (delta-AUC per shuffled column) -------
    print(f"\nPermutation feature importance...")
    base_auc = roc_auc_score(y_te, tp)
    imps = np.zeros(NUM_FEATURES)
    rng = np.random.RandomState(42)
    for fi in range(NUM_FEATURES):
        Xp = X_te.copy()
        Xp[:, fi] = rng.permutation(Xp[:, fi])
        pp = calibrated.predict_proba(Xp)[:, 1]
        imps[fi] = base_auc - roc_auc_score(y_te, pp)

    top_idx = np.argsort(imps)[-15:][::-1]
    print(f"\n Top features:")
    mx = max(imps.max(), 1e-6)  # avoid zero-division when nothing helps
    for r, i in enumerate(top_idx, 1):
        bar = "β" * max(1, int(imps[i] / mx * 35)) if imps[i] > 0 else "Β·"
        print(f" {r:2d}. {FEATURE_NAMES[i]:30s} dAUC={imps[i]:+.4f} {bar}")

    # ---- Persist model bundle, JSON metadata, and raw data ------------
    os.makedirs("model", exist_ok=True)

    model_data = {
        "model": calibrated,
        "raw_model": model,
        "optimal_threshold": opt_thr,
        "feature_names": FEATURE_NAMES,
        "num_features": NUM_FEATURES,
        "model_version": "3.0.0",
        "trained_at": datetime.now().isoformat(),
        "test_metrics": test_m,
        "val_metrics": val_m,
        "training_rows": len(df_train),
        "total_users": 2000,
        "feature_importances": {
            FEATURE_NAMES[i]: round(float(imps[i]), 6)
            for i in top_idx if imps[i] > 0
        },
    }

    with open("model/model.pkl", "wb") as f:
        pickle.dump(model_data, f)
    sz = os.path.getsize("model/model.pkl") / 1024
    print(f"\nmodel/model.pkl ({sz:.0f} KB)")

    # JSON metadata excludes the unpicklable-to-JSON estimator objects.
    meta = {k: v for k, v in model_data.items() if k not in ("model", "raw_model")}
    with open("model/metadata.json", "w") as f:
        json.dump(meta, f, indent=2, default=str)
    print(f"model/metadata.json")

    os.makedirs("data", exist_ok=True)
    df.to_parquet("data/training_data.parquet", index=False)
    print(f"data/training_data.parquet")

    print(f"\n{'β'*62}")
    # FIX: this banner was split mid-f-string by a mangled emoji (invalid
    # syntax); re-joined onto a single line.
    print(f" PRODUCTION MODEL READY - v3.0.0")
    print(f" Test ROC-AUC: {test_m['roc_auc']}")
    print(f" Test F1: {test_m['f1']}")
    print(f" Threshold: {opt_thr:.3f}")
    print(f"{'β'*62}")
|
|
|
|
# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    main()
|
|