hackinet's picture
Initial upload: negative-result study on secp256k1 parity prediction.
6b93c3b verified
#!/usr/bin/env python3
"""Train multiple models to predict k_state (parity of k) from x/y features.
Inputs: features.parquet
Output: results/ with metrics.json and model artifacts.
"""
import os, json, time, sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
import xgboost as xgb
import lightgbm as lgb
# Directory that receives metrics.json and the saved model artifacts.
OUT = "results"
# Created eagerly at import time; an already-existing directory is not an error.
os.makedirs(name=OUT, exist_ok=True)
def split_by_k(df, frac_train=0.70, frac_val=0.15):
    """Split ``df`` into sequential train / validation / holdout slices by ``k``.

    When the frame has a ``k`` column that is not already in ascending order,
    the rows are first stably sorted by ``k`` so that the holdout slice really
    is a k-range the other two slices never see. A frame that is already
    ordered (or has no ``k`` column) is cut in its existing row order, exactly
    as a purely positional split would do.

    Args:
        df: DataFrame to split.
        frac_train: Fraction of rows assigned to the first (train) slice.
        frac_val: Fraction of rows assigned to the second (validation) slice;
            whatever remains becomes the holdout slice.

    Returns:
        A ``(train, val, holdout)`` tuple of DataFrames. Original index labels
        and all columns are preserved.
    """
    if "k" in df.columns and not df["k"].is_monotonic_increasing:
        # mergesort is stable, so rows with equal k keep their relative order.
        df = df.sort_values("k", kind="mergesort")
    n = len(df)
    # int() truncates, so any rounding remainder lands in the holdout slice.
    train_end = int(n * frac_train)
    val_end = int(n * (frac_train + frac_val))
    return df.iloc[:train_end], df.iloc[train_end:val_end], df.iloc[val_end:]
def _score_binary(y_val, p_val, y_ho, p_ho, t_start):
    """Build the val/holdout accuracy + AUC dict shared by every model section."""
    return {
        "val_acc": float(accuracy_score(y_val, p_val > 0.5)),
        "val_auc": float(roc_auc_score(y_val, p_val)),
        "ho_acc": float(accuracy_score(y_ho, p_ho > 0.5)),
        "ho_auc": float(roc_auc_score(y_ho, p_ho)),
        "train_s": round(time.time() - t_start, 1),
    }


def _print_metrics(m):
    """Print one summary line for a metrics dict produced by ``_score_binary``."""
    print(f" val_acc={m['val_acc']:.4f} val_auc={m['val_auc']:.4f} "
          f"ho_acc={m['ho_acc']:.4f} ho_auc={m['ho_auc']:.4f}")


def _fit_xgb(X_fit, y_fit, eval_set=None, **params):
    """Fit an XGBClassifier with device="cuda", retrying on CPU if that fails."""
    try:
        clf = xgb.XGBClassifier(device="cuda", **params)
        clf.fit(X_fit, y_fit, eval_set=eval_set, verbose=False)
    except Exception as e:
        # Deliberate best-effort: whatever the GPU attempt raised, retry on CPU.
        print(f" GPU XGBoost failed ({e}); falling back to CPU.")
        clf = xgb.XGBClassifier(**params)
        clf.fit(X_fit, y_fit, eval_set=eval_set, verbose=False)
    return clf


def main(parquet="features.parquet"):
    """Train several classifiers on ``k_state`` and write metrics and artifacts.

    Loads the feature table, splits it sequentially into train / validation /
    holdout via ``split_by_k``, then fits logistic regression, XGBoost,
    LightGBM and a PyTorch MLP, followed by an XGBoost model trained on
    shuffled labels as a sanity check. Validation and holdout accuracy / AUC
    for every model are collected into one dict.

    Args:
        parquet: Path of the parquet file to read. It must contain the columns
            ``k`` and ``k_state``; every other column except ``abs_x_minus_y``
            is cast to float32 and used as a feature.

    Side effects:
        Writes ``xgb.json``, ``lgbm.txt``, ``mlp.pt`` and ``metrics.json`` into
        the ``OUT`` directory and prints progress to stdout.
    """
    print(f"loading {parquet}...")
    df = pd.read_parquet(parquet)
    print(f" rows={len(df):,} cols={len(df.columns)}")

    # exclude target, traceability column k, and the giant abs_x_minus_y (string), from features.
    drop = {"k", "k_state", "abs_x_minus_y"}
    feat_cols = [c for c in df.columns if c not in drop]
    X = df[feat_cols].astype(np.float32).values
    y = df["k_state"].astype(np.int8).values
    print(f" features: {len(feat_cols)} · label balance: mean={y.mean():.4f}")

    # Split a slim frame carrying positional row ids, then use those ids to
    # slice the numpy arrays.
    df_idx = pd.DataFrame({"k": df["k"].values, "y": y})
    df_idx["_i"] = np.arange(len(df_idx))
    tr, va, ho = split_by_k(df_idx)
    i_tr, i_va, i_ho = (part["_i"].to_numpy() for part in (tr, va, ho))
    Xtr, ytr = X[i_tr], y[i_tr]
    Xva, yva = X[i_va], y[i_va]
    Xho, yho = X[i_ho], y[i_ho]
    print(f" splits: train={len(ytr)} val={len(yva)} holdout={len(yho)}")

    results = {}

    # ---- logistic regression (sanity baseline) ----
    print("\n[1] LogisticRegression ...")
    t = time.time()
    sc = StandardScaler().fit(Xtr)  # scaler statistics come from train only
    lr = LogisticRegression(max_iter=200, n_jobs=-1).fit(sc.transform(Xtr), ytr)
    pred_va = lr.predict_proba(sc.transform(Xva))[:, 1]
    pred_ho = lr.predict_proba(sc.transform(Xho))[:, 1]
    results["logreg"] = _score_binary(yva, pred_va, yho, pred_ho, t)
    _print_metrics(results["logreg"])

    # ---- XGBoost (tries CUDA first, falls back to CPU) ----
    print("\n[2] XGBoost ...")
    t = time.time()
    bst = _fit_xgb(
        Xtr, ytr, eval_set=[(Xva, yva)],
        n_estimators=400, max_depth=6, learning_rate=0.1,
        tree_method="hist", eval_metric="logloss", n_jobs=-1)
    pred_va = bst.predict_proba(Xva)[:, 1]
    pred_ho = bst.predict_proba(Xho)[:, 1]
    results["xgboost"] = _score_binary(yva, pred_va, yho, pred_ho, t)
    _print_metrics(results["xgboost"])
    bst.save_model(os.path.join(OUT, "xgb.json"))
    # Ten features with the largest importance, most important first.
    fi = dict(sorted(zip(feat_cols, bst.feature_importances_), key=lambda x: -x[1])[:10])
    results["xgboost"]["top_features"] = {name: float(v) for name, v in fi.items()}
    print(f" top features: {list(fi.items())[:5]}")

    # ---- LightGBM ----
    print("\n[3] LightGBM ...")
    t = time.time()
    lgbm = lgb.LGBMClassifier(n_estimators=400, max_depth=-1, num_leaves=63,
                              learning_rate=0.05, n_jobs=-1, verbose=-1)
    lgbm.fit(Xtr, ytr, eval_set=[(Xva, yva)])
    pred_va = lgbm.predict_proba(Xva)[:, 1]
    pred_ho = lgbm.predict_proba(Xho)[:, 1]
    results["lightgbm"] = _score_binary(yva, pred_va, yho, pred_ho, t)
    _print_metrics(results["lightgbm"])
    lgbm.booster_.save_model(os.path.join(OUT, "lgbm.txt"))

    # ---- MLP (PyTorch, GPU if available) ----
    print("\n[4] MLP (PyTorch) ...")
    # Imported lazily so the earlier sections run before torch is loaded.
    import torch
    import torch.nn as nn
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f" device: {device}")
    t = time.time()
    Xs = StandardScaler().fit(Xtr)
    Xtr_t = torch.tensor(Xs.transform(Xtr), dtype=torch.float32, device=device)
    ytr_t = torch.tensor(ytr, dtype=torch.float32, device=device)
    Xva_t = torch.tensor(Xs.transform(Xva), dtype=torch.float32, device=device)
    Xho_t = torch.tensor(Xs.transform(Xho), dtype=torch.float32, device=device)
    D = Xtr.shape[1]
    model = nn.Sequential(
        nn.Linear(D, 512), nn.ReLU(),
        nn.Linear(512, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, 1)
    ).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    crit = nn.BCEWithLogitsLoss()
    BS = 8192
    EPOCHS = 20
    mlp_path = os.path.join(OUT, "mlp.pt")
    best_val_auc = float("-inf")
    best_state = None
    for ep in range(EPOCHS):
        model.train()
        idx = torch.randperm(len(ytr_t), device=device)  # fresh shuffle each epoch
        for i in range(0, len(idx), BS):
            b = idx[i:i + BS]
            logits = model(Xtr_t[b]).squeeze(1)
            loss = crit(logits, ytr_t[b])
            opt.zero_grad()
            loss.backward()
            opt.step()
        model.eval()
        with torch.no_grad():
            pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        acc = float(accuracy_score(yva, pv > 0.5))
        auc = float(roc_auc_score(yva, pv))
        if auc > best_val_auc:
            best_val_auc = auc
            # Keep an in-memory copy too, so the reported metrics can be
            # computed from the same weights that are written to mlp.pt.
            best_state = {name: w.detach().clone()
                          for name, w in model.state_dict().items()}
            torch.save(model.state_dict(), mlp_path)
        if (ep + 1) % 5 == 0:
            print(f" epoch {ep+1}/{EPOCHS} val_acc={acc:.4f} val_auc={auc:.4f}")
    if best_state is not None:
        # Score the checkpointed (best val AUC) weights, not the last epoch's.
        # NOTE: val_auc is therefore selection-biased; ho_* is the unbiased number.
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        pv = torch.sigmoid(model(Xva_t)).squeeze(1).cpu().numpy()
        ph = torch.sigmoid(model(Xho_t)).squeeze(1).cpu().numpy()
    results["mlp"] = _score_binary(yva, pv, yho, ph, t)
    _print_metrics(results["mlp"])

    # ---- permutation sanity check on XGBoost ----
    print("\n[5] permutation sanity check (XGBoost on shuffled labels)...")
    yshuf = np.random.RandomState(42).permutation(ytr)
    # A GPU problem only surfaces in fit(), so this reuses the same
    # CUDA-then-CPU path as section [2].
    bst2 = _fit_xgb(Xtr, yshuf,
                    n_estimators=200, max_depth=6, learning_rate=0.1,
                    tree_method="hist", n_jobs=-1)
    pred = bst2.predict_proba(Xho)[:, 1]
    results["permutation_xgb_ho"] = {
        "acc": float(accuracy_score(yho, pred > 0.5)),
        "auc": float(roc_auc_score(yho, pred)),
    }
    print(f" holdout acc on shuffled-label model = {results['permutation_xgb_ho']['acc']:.4f} "
          f"auc={results['permutation_xgb_ho']['auc']:.4f} (should be ~0.5/~0.5)")

    # ---- save ----
    with open(os.path.join(OUT, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nall metrics saved to {OUT}/metrics.json")
if __name__ == "__main__":
    # An optional first command-line argument overrides the default parquet
    # path; with no argument main() runs with its own default.
    main(*sys.argv[1:2])