#!/usr/bin/env python3
"""Train multiple models to predict k_state (parity of k) from x/y features.

Inputs: features.parquet
Output: results/ with metrics.json and model artifacts.
"""
import copy
import json
import os
import sys
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
import xgboost as xgb
import lightgbm as lgb

OUT = "results"
os.makedirs(OUT, exist_ok=True)


def split_by_k(df, frac_train=0.70, frac_val=0.15):
    """Sequential split by k so holdout is fully unseen k-range.

    When ``df`` has a numeric ``k`` column that is not already in
    non-decreasing order, the rows are stably sorted on it first; otherwise
    the existing row order is used as-is.

    Args:
        df: Frame to split.
        frac_train: Fraction of rows (from the start) used for training.
        frac_val: Fraction of rows following the training rows used for
            validation; whatever remains is the holdout.

    Returns:
        ``(train, val, holdout)`` as three contiguous slices of the
        (possibly re-ordered) frame.
    """
    if "k" in df.columns:
        k = df["k"]
        # Only numeric k can be ordered meaningfully; a string-typed k would
        # sort lexicographically, so it is left in file order.
        if pd.api.types.is_numeric_dtype(k) and not k.is_monotonic_increasing:
            # mergesort is stable: rows sharing a k keep their relative order.
            df = df.sort_values("k", kind="mergesort")
    n = len(df)
    i1 = int(n * frac_train)
    i2 = int(n * (frac_train + frac_val))
    return df.iloc[:i1], df.iloc[i1:i2], df.iloc[i2:]


def _binary_metrics(yva, pred_va, yho, pred_ho, t0):
    """Return accuracy/AUC on validation and holdout plus seconds since t0."""
    return {
        "val_acc": float(accuracy_score(yva, pred_va > 0.5)),
        "val_auc": float(roc_auc_score(yva, pred_va)),
        "ho_acc": float(accuracy_score(yho, pred_ho > 0.5)),
        "ho_auc": float(roc_auc_score(yho, pred_ho)),
        "train_s": round(time.time() - t0, 1),
    }


def _report(m):
    """Print the one-line validation/holdout summary for a metrics dict."""
    print(f" val_acc={m['val_acc']:.4f} val_auc={m['val_auc']:.4f} "
          f"ho_acc={m['ho_acc']:.4f} ho_auc={m['ho_auc']:.4f}")


def _fit_xgb(X, y, *, eval_set=None, **params):
    """Fit an XGBClassifier with device="cuda", retrying on CPU if that fails.

    ``params`` are passed to the ``XGBClassifier`` constructor. ``eval_set``,
    when given, is forwarded to ``fit`` together with ``verbose=False``.
    """
    fit_kwargs = {}
    if eval_set is not None:
        fit_kwargs = {"eval_set": eval_set, "verbose": False}
    try:
        model = xgb.XGBClassifier(device="cuda", **params)
        model.fit(X, y, **fit_kwargs)
    except Exception as e:
        # Deliberate best-effort: whatever the GPU attempt raises, retry the
        # same configuration without the device override.
        print(f" GPU XGBoost failed ({e}); falling back to CPU.")
        model = xgb.XGBClassifier(**params)
        model.fit(X, y, **fit_kwargs)
    return model


def _train_mlp(Xtr, ytr, Xva, yva, Xho, device, batch_size=8192, epochs=20):
    """Train the MLP and return (val, holdout) probabilities of the best epoch.

    The state dict with the highest validation AUC is written to
    ``OUT/mlp.pt`` and restored before the final predictions are made, so the
    returned probabilities correspond to the saved artifact.
    """
    import torch
    import torch.nn as nn

    scaler = StandardScaler().fit(Xtr)
    Xtr_t = torch.tensor(scaler.transform(Xtr), dtype=torch.float32, device=device)
    ytr_t = torch.tensor(ytr, dtype=torch.float32, device=device)
    Xva_t = torch.tensor(scaler.transform(Xva), dtype=torch.float32, device=device)
    Xho_t = torch.tensor(scaler.transform(Xho), dtype=torch.float32, device=device)

    n_features = Xtr.shape[1]
    model = nn.Sequential(
        nn.Linear(n_features, 512), nn.ReLU(),
        nn.Linear(512, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, 1),
    ).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    crit = nn.BCEWithLogitsLoss()

    best_val_auc = float("-inf")
    best_state = None
    for ep in range(epochs):
        model.train()
        order = torch.randperm(len(ytr_t), device=device)
        for start in range(0, len(order), batch_size):
            batch = order[start:start + batch_size]
            logits = model(Xtr_t[batch]).squeeze(1)
            loss = crit(logits, ytr_t[batch])
            opt.zero_grad()
            loss.backward()
            opt.step()

        model.eval()
        with torch.no_grad():
            pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        acc = float(accuracy_score(yva, pv > 0.5))
        auc = float(roc_auc_score(yva, pv))
        if auc > best_val_auc:
            best_val_auc = auc
            # Deep copy: state_dict() returns references to the live
            # parameters, which later optimizer steps keep mutating.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, os.path.join(OUT, "mlp.pt"))
        if (ep + 1) % 5 == 0:
            print(f" epoch {ep+1}/{epochs} val_acc={acc:.4f} val_auc={auc:.4f}")

    # Score the checkpoint that was saved, not the last-epoch weights.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        ph = torch.sigmoid(model(Xho_t)).squeeze(1).cpu().numpy()
    return pv, ph


def main(parquet="features.parquet"):
    """Train and evaluate every model on ``parquet``; write results to OUT.

    Fits logistic regression, XGBoost, LightGBM and an MLP on the training
    split, scores each on the validation and holdout splits, runs a
    shuffled-label XGBoost sanity check, and writes all metrics to
    ``OUT/metrics.json`` (model artifacts are saved alongside).
    """
    print(f"loading {parquet}...")
    df = pd.read_parquet(parquet)
    print(f" rows={len(df):,} cols={len(df.columns)}")

    # exclude target, traceability column k, and the giant abs_x_minus_y
    # (string), from features.
    drop = {"k", "k_state", "abs_x_minus_y"}
    feat_cols = [c for c in df.columns if c not in drop]
    X = df[feat_cols].astype(np.float32).to_numpy()
    y = df["k_state"].astype(np.int8).to_numpy()
    print(f" features: {len(feat_cols)} · label balance: mean={y.mean():.4f}")

    # Split a light index frame and use its positional column `_i` to slice
    # the dense arrays.
    df_idx = pd.DataFrame({"k": df["k"].values, "y": y})
    df_idx["_i"] = np.arange(len(df_idx))
    tr, va, ho = split_by_k(df_idx)
    i_tr, i_va, i_ho = (part["_i"].to_numpy() for part in (tr, va, ho))
    Xtr, ytr = X[i_tr], y[i_tr]
    Xva, yva = X[i_va], y[i_va]
    Xho, yho = X[i_ho], y[i_ho]
    print(f" splits: train={len(ytr)} val={len(yva)} holdout={len(yho)}")

    results = {}

    # ---- logistic regression (sanity baseline) ----
    print("\n[1] LogisticRegression ...")
    t = time.time()
    sc = StandardScaler().fit(Xtr)
    lr = LogisticRegression(max_iter=200, n_jobs=-1).fit(sc.transform(Xtr), ytr)
    pred_va = lr.predict_proba(sc.transform(Xva))[:, 1]
    pred_ho = lr.predict_proba(sc.transform(Xho))[:, 1]
    results["logreg"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _report(results["logreg"])

    # ---- XGBoost (tries device="cuda" first, falls back to CPU) ----
    print("\n[2] XGBoost ...")
    t = time.time()
    bst = _fit_xgb(
        Xtr, ytr, eval_set=[(Xva, yva)],
        n_estimators=400, max_depth=6, learning_rate=0.1,
        tree_method="hist", eval_metric="logloss", n_jobs=-1)
    pred_va = bst.predict_proba(Xva)[:, 1]
    pred_ho = bst.predict_proba(Xho)[:, 1]
    results["xgboost"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _report(results["xgboost"])
    bst.save_model(os.path.join(OUT, "xgb.json"))
    ranked = sorted(zip(feat_cols, bst.feature_importances_),
                    key=lambda pair: -pair[1])
    fi = dict(ranked[:10])
    results["xgboost"]["top_features"] = {k: float(v) for k, v in fi.items()}
    print(f" top features: {list(fi.items())[:5]}")

    # ---- LightGBM ----
    print("\n[3] LightGBM ...")
    t = time.time()
    lgbm = lgb.LGBMClassifier(n_estimators=400, max_depth=-1, num_leaves=63,
                              learning_rate=0.05, n_jobs=-1, verbose=-1)
    lgbm.fit(Xtr, ytr, eval_set=[(Xva, yva)])
    pred_va = lgbm.predict_proba(Xva)[:, 1]
    pred_ho = lgbm.predict_proba(Xho)[:, 1]
    results["lightgbm"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _report(results["lightgbm"])
    lgbm.booster_.save_model(os.path.join(OUT, "lgbm.txt"))

    # ---- MLP (PyTorch, GPU if available) ----
    print("\n[4] MLP (PyTorch) ...")
    # Imported here rather than at module level, as in the original script;
    # doing it before the timer keeps import time out of train_s.
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f" device: {device}")
    t = time.time()
    pv, ph = _train_mlp(Xtr, ytr, Xva, yva, Xho, device)
    results["mlp"] = _binary_metrics(yva, pv, yho, ph, t)
    _report(results["mlp"])

    # ---- permutation sanity check on XGBoost ----
    print("\n[5] permutation sanity check (XGBoost on shuffled labels)...")
    yshuf = np.random.RandomState(42).permutation(ytr)
    # Same GPU-then-CPU fallback as the main XGBoost stage, so a failed CUDA
    # fit does not abort the run before metrics are written.
    bst2 = _fit_xgb(Xtr, yshuf, n_estimators=200, max_depth=6,
                    learning_rate=0.1, tree_method="hist", n_jobs=-1)
    pred = bst2.predict_proba(Xho)[:, 1]
    results["permutation_xgb_ho"] = {
        "acc": float(accuracy_score(yho, pred > 0.5)),
        "auc": float(roc_auc_score(yho, pred)),
    }
    print(f" holdout acc on shuffled-label model = {results['permutation_xgb_ho']['acc']:.4f} "
          f"auc={results['permutation_xgb_ho']['auc']:.4f} (should be ~0.5/~0.5)")

    # ---- save ----
    with open(os.path.join(OUT, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nall metrics saved to {OUT}/metrics.json")


if __name__ == "__main__":
    main()