| |
| """Train multiple models to predict k_state (parity of k) from x/y features. |
| Inputs: features.parquet |
| Output: results/ with metrics.json and model artifacts. |
| """ |
| import os, json, time, sys |
| import numpy as np |
| import pandas as pd |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics import accuracy_score, roc_auc_score, log_loss |
| import xgboost as xgb |
| import lightgbm as lgb |
|
|
# Directory that receives metrics.json and the saved model artifacts.
OUT = "results"
# Created at import time so later save calls find the directory in place.
os.makedirs(name=OUT, exist_ok=True)
|
|
def split_by_k(df, frac_train=0.70, frac_val=0.15):
    """Split rows sequentially into train / validation / holdout parts.

    When ``df`` has a ``k`` column the rows are first ordered by ascending
    ``k`` (stable sort, so ties keep their incoming order). That way the
    holdout part really is the highest, unseen k-range even if the input
    was not stored in k order. Without a ``k`` column the incoming row
    order is used as-is.

    Args:
        df: DataFrame to split.
        frac_train: fraction of rows assigned to the training part.
        frac_val: fraction of rows assigned to the validation part; the
            remainder becomes the holdout part.

    Returns:
        Tuple ``(train, val, holdout)`` of DataFrames.

    Raises:
        ValueError: if a fraction is negative or the two sum to more than 1.
    """
    # Small tolerance so sums like 0.7 + 0.3 are not rejected by float noise.
    if frac_train < 0 or frac_val < 0 or frac_train + frac_val > 1 + 1e-9:
        raise ValueError(
            f"invalid split fractions: frac_train={frac_train}, frac_val={frac_val}"
        )
    if "k" in df.columns:
        # mergesort is stable; already-sorted input keeps its exact order.
        df = df.sort_values("k", kind="mergesort")
    n = len(df)
    i1 = int(n * frac_train)
    i2 = int(n * (frac_train + frac_val))
    return df.iloc[:i1], df.iloc[i1:i2], df.iloc[i2:]
|
|
def _binary_metrics(y_val, p_val, y_ho, p_ho, started):
    """Build the metric record stored for one model.

    Args:
        y_val: true labels of the validation split.
        p_val: predicted positive-class probabilities for the validation split.
        y_ho: true labels of the holdout split.
        p_ho: predicted positive-class probabilities for the holdout split.
        started: ``time.time()`` value taken when the model's stage began.

    Returns:
        Dict with accuracy (0.5 threshold) and ROC AUC for both splits, plus
        the seconds elapsed since ``started`` rounded to one decimal.
    """
    return {
        "val_acc": float(accuracy_score(y_val, p_val > 0.5)),
        "val_auc": float(roc_auc_score(y_val, p_val)),
        "ho_acc": float(accuracy_score(y_ho, p_ho > 0.5)),
        "ho_auc": float(roc_auc_score(y_ho, p_ho)),
        "train_s": round(time.time() - started, 1),
    }


def _print_metrics(name, results):
    """Print the one-line validation/holdout summary for ``results[name]``."""
    m = results[name]
    print(f" val_acc={m['val_acc']:.4f} val_auc={m['val_auc']:.4f} "
          f"ho_acc={m['ho_acc']:.4f} ho_auc={m['ho_auc']:.4f}")


def _fit_xgb(X_train, y_train, eval_set=None, **params):
    """Fit an XGBClassifier on GPU, refitting on CPU if the GPU attempt fails.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        eval_set: optional list of ``(X, y)`` pairs forwarded to ``fit``.
        **params: constructor arguments for ``xgb.XGBClassifier``
            (``device`` is supplied here and must not be passed).

    Returns:
        The fitted classifier.
    """
    try:
        model = xgb.XGBClassifier(device="cuda", **params)
        model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    except Exception as e:
        # Deliberately broad: any GPU-side failure should degrade to CPU
        # rather than abort the whole run.
        print(f" GPU XGBoost failed ({e}); falling back to CPU.")
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    return model


def main(parquet="features.parquet"):
    """Train and evaluate all models, then write the metrics to disk.

    Loads the feature table, builds train/val/holdout splits via
    ``split_by_k``, fits LogisticRegression, XGBoost, LightGBM and a PyTorch
    MLP, runs a shuffled-label sanity check with XGBoost, and writes every
    metric to ``OUT/metrics.json``. Model artifacts ``xgb.json``, ``lgbm.txt``
    and ``mlp.pt`` are saved in ``OUT`` as well.

    Args:
        parquet: path of the parquet file holding the features together with
            the ``k`` and ``k_state`` columns.
    """
    # Repeated here (idempotent) so main() also works when the working
    # directory changed after this module was imported.
    os.makedirs(OUT, exist_ok=True)

    print(f"loading {parquet}...")
    df = pd.read_parquet(parquet)
    print(f" rows={len(df):,} cols={len(df.columns)}")

    # Columns that are not used as model inputs (index, label, excluded feature).
    drop = {"k", "k_state", "abs_x_minus_y"}
    feat_cols = [c for c in df.columns if c not in drop]
    X = df[feat_cols].astype(np.float32).values
    y = df["k_state"].astype(np.int8).values
    print(f" features: {len(feat_cols)} · label balance: mean={y.mean():.4f}")

    # Split a light index frame and map back to X / y by row position, so the
    # big feature matrix is never copied through pandas.
    df_idx = pd.DataFrame({"k": df["k"].values, "y": y})
    df_idx["_i"] = np.arange(len(df_idx))
    tr, va, ho = split_by_k(df_idx)
    tr_i, va_i, ho_i = (part["_i"].to_numpy() for part in (tr, va, ho))
    Xtr, ytr = X[tr_i], y[tr_i]
    Xva, yva = X[va_i], y[va_i]
    Xho, yho = X[ho_i], y[ho_i]
    print(f" splits: train={len(ytr)} val={len(yva)} holdout={len(yho)}")

    results = {}

    # ---- [1] logistic regression on standardized features ----
    print("\n[1] LogisticRegression ...")
    t = time.time()
    sc = StandardScaler().fit(Xtr)
    lr = LogisticRegression(max_iter=200, n_jobs=-1).fit(sc.transform(Xtr), ytr)
    pred_va = lr.predict_proba(sc.transform(Xva))[:, 1]
    pred_ho = lr.predict_proba(sc.transform(Xho))[:, 1]
    results["logreg"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _print_metrics("logreg", results)

    # ---- [2] XGBoost (GPU with CPU fallback) ----
    print("\n[2] XGBoost ...")
    t = time.time()
    bst = _fit_xgb(
        Xtr, ytr, eval_set=[(Xva, yva)],
        n_estimators=400, max_depth=6, learning_rate=0.1,
        tree_method="hist", eval_metric="logloss", n_jobs=-1,
    )
    pred_va = bst.predict_proba(Xva)[:, 1]
    pred_ho = bst.predict_proba(Xho)[:, 1]
    results["xgboost"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _print_metrics("xgboost", results)
    bst.save_model(os.path.join(OUT, "xgb.json"))
    # Ten most important features, highest importance first.
    ranked = sorted(zip(feat_cols, bst.feature_importances_), key=lambda x: -x[1])
    fi = dict(ranked[:10])
    results["xgboost"]["top_features"] = {k: float(v) for k, v in fi.items()}
    print(f" top features: {list(fi.items())[:5]}")

    # ---- [3] LightGBM ----
    print("\n[3] LightGBM ...")
    t = time.time()
    lgbm = lgb.LGBMClassifier(n_estimators=400, max_depth=-1, num_leaves=63,
                              learning_rate=0.05, n_jobs=-1, verbose=-1)
    lgbm.fit(Xtr, ytr, eval_set=[(Xva, yva)])
    pred_va = lgbm.predict_proba(Xva)[:, 1]
    pred_ho = lgbm.predict_proba(Xho)[:, 1]
    results["lightgbm"] = _binary_metrics(yva, pred_va, yho, pred_ho, t)
    _print_metrics("lightgbm", results)
    lgbm.booster_.save_model(os.path.join(OUT, "lgbm.txt"))

    # ---- [4] MLP ----
    print("\n[4] MLP (PyTorch) ...")
    # Imported lazily so torch is only loaded once this stage is reached.
    import torch
    import torch.nn as nn
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f" device: {device}")
    t = time.time()
    Xs = StandardScaler().fit(Xtr)
    Xtr_t = torch.tensor(Xs.transform(Xtr), dtype=torch.float32, device=device)
    ytr_t = torch.tensor(ytr, dtype=torch.float32, device=device)
    Xva_t = torch.tensor(Xs.transform(Xva), dtype=torch.float32, device=device)
    Xho_t = torch.tensor(Xs.transform(Xho), dtype=torch.float32, device=device)
    D = Xtr.shape[1]
    model = nn.Sequential(
        nn.Linear(D, 512), nn.ReLU(),
        nn.Linear(512, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, 1),
    ).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    crit = nn.BCEWithLogitsLoss()
    BS = 8192
    EPOCHS = 20
    # -inf guarantees the first epoch always produces a checkpoint.
    best_val_auc = float("-inf")
    best_state = None
    for ep in range(EPOCHS):
        model.train()
        idx = torch.randperm(len(ytr_t), device=device)
        for i in range(0, len(idx), BS):
            b = idx[i:i + BS]
            logits = model(Xtr_t[b]).squeeze(1)
            loss = crit(logits, ytr_t[b])
            opt.zero_grad()
            loss.backward()
            opt.step()
        model.eval()
        with torch.no_grad():
            pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        acc = float(accuracy_score(yva, pv > 0.5))
        auc = float(roc_auc_score(yva, pv))
        if auc > best_val_auc:
            best_val_auc = auc
            # state_dict() tensors alias the live parameters, so clone them
            # to keep a snapshot that later optimizer steps cannot change.
            best_state = {name: w.detach().clone()
                          for name, w in model.state_dict().items()}
            torch.save(model.state_dict(), os.path.join(OUT, "mlp.pt"))
        if (ep + 1) % 5 == 0:
            print(f" epoch {ep+1}/{EPOCHS} val_acc={acc:.4f} val_auc={auc:.4f}")
    # Score the weights that were actually saved to mlp.pt (best val AUC),
    # not whatever the last epoch left behind.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        ph = torch.sigmoid(model(Xho_t)).squeeze(1).cpu().numpy()
    results["mlp"] = _binary_metrics(yva, pv, yho, ph, t)
    _print_metrics("mlp", results)

    # ---- [5] sanity check: a model fit on shuffled labels should be at chance ----
    print("\n[5] permutation sanity check (XGBoost on shuffled labels)...")
    yshuf = np.random.RandomState(42).permutation(ytr)
    # Same GPU-then-CPU fallback as stage 2; a missing GPU only surfaces at
    # fit time, so guarding set_params alone would not help.
    bst2 = _fit_xgb(
        Xtr, yshuf,
        n_estimators=200, max_depth=6, learning_rate=0.1,
        tree_method="hist", n_jobs=-1,
    )
    pred = bst2.predict_proba(Xho)[:, 1]
    results["permutation_xgb_ho"] = {
        "acc": float(accuracy_score(yho, pred > 0.5)),
        "auc": float(roc_auc_score(yho, pred)),
    }
    print(f" holdout acc on shuffled-label model = {results['permutation_xgb_ho']['acc']:.4f} "
          f"auc={results['permutation_xgb_ho']['auc']:.4f} (should be ~0.5/~0.5)")

    with open(os.path.join(OUT, "metrics.json"), "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nall metrics saved to {OUT}/metrics.json")
|
|
if __name__ == "__main__":
    # An optional first command-line argument overrides the default parquet
    # path; with no argument main() runs with its default.
    main(*sys.argv[1:2])
|
|