#!/usr/bin/env python3
"""Predict k_state (parity of k) from (x, y) using the trained models."""
import sys, os, json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import torch
import torch.nn as nn

# Prime modulus of the secp256k1 base field.
p = 2**256 - 2**32 - 977


def on_curve(x, y):
    """Return True if (x, y) is a canonical affine point on secp256k1.

    The point must satisfy y^2 = x^3 + 7 (mod p) and both coordinates must
    be reduced field elements, i.e. lie in [0, p).

    The range check matters because the congruence alone also holds for
    non-canonical representatives such as (x, -y) or (x + p, y). Those would
    otherwise be accepted and then produce meaningless decimal-digit
    features (or fail outright on the minus sign).
    """
    if not (0 <= x < p and 0 <= y < p):
        return False
    return (y * y - (x * x * x + 7)) % p == 0

def num_features(v, prefix):
    """Build the per-number feature dict for integer *v*.

    Every key is ``f"{prefix}_<feature>"``. The features are derived from the
    decimal string of *v* (digit counts and sums), its binary form (bit length
    and popcount), its parity, and its residues modulo a few small primes.
    """
    decimal = str(v)
    digits = list(map(int, decimal))
    total = sum(digits)
    odd_digits = sum(d & 1 for d in digits)

    features = {
        "num_digits": len(decimal),
        "first_digit": digits[0],
        "last_digit": digits[-1],
        "last2": v % 100,
        "last3": v % 1000,
        "digit_sum": total,
        "digit_sum_mod_9": total % 9,
        # Every digit is either even or odd, so the even count is the remainder.
        "even_digit_count": len(digits) - odd_digits,
        "odd_digit_count": odd_digits,
        "zero_count": decimal.count("0"),
        "unique_digit_count": len(set(decimal)),
        "bit_length": v.bit_length(),
        "popcount": bin(v).count("1"),
        "state": v % 2,
    }
    for modulus in (3, 5, 7, 11, 13, 17, 19):
        features[f"mod_{modulus}"] = v % modulus

    return {f"{prefix}_{name}": value for name, value in features.items()}

def featurize(x, y):
    """Return the full feature row for the point (x, y).

    The row holds the ``num_features`` output for x (keys prefixed ``x_``)
    and for y (keys prefixed ``y_``), followed by two cross features:
    ``x_gt_y`` (1 when x > y, else 0) and ``digit_sum_diff_xy`` (decimal
    digit sum of x minus that of y).
    """
    row = {**num_features(x, "x"), **num_features(y, "y")}
    row["x_gt_y"] = 1 if x > y else 0
    # The per-coordinate digit sums are already in the row; reuse them.
    row["digit_sum_diff_xy"] = row["x_digit_sum"] - row["y_digit_sum"]
    return row

def main():
    """Predict the parity of k for the point given on the command line.

    Reads x and y as decimal integers from ``sys.argv[1]`` and
    ``sys.argv[2]``, refuses points that fail ``on_curve``, builds the
    feature vector, and prints the probability each model (XGBoost,
    LightGBM, MLP) and their unweighted average assign to the positive class.

    Files read, relative to the current working directory:
    ``features.parquet``, ``results/xgb.json``, ``results/lgbm.txt`` and
    ``results/mlp.pt``.

    Exits with status 1 (message on stderr) when the arguments are missing
    or not integers, and with status 1 when the point is not on the curve.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} X Y   (point coordinates as decimal integers)")
    try:
        x = int(sys.argv[1])
        y = int(sys.argv[2])
    except ValueError:
        sys.exit(f"error: X and Y must be integers, got {sys.argv[1]!r} and {sys.argv[2]!r}")

    if not on_curve(x, y):
        print("⚠ (x, y) is NOT on secp256k1. Refusing to predict on an invalid point.")
        sys.exit(1)
    print(f"X = {x}")
    print(f"Y = {y}")

    # Same column order as training: must match df.drop({k, k_state, abs_x_minus_y}) order.
    # Reproduce by reading the training parquet header to be safe.
    df = pd.read_parquet("features.parquet")
    drop = {"k", "k_state", "abs_x_minus_y"}
    feat_cols = [c for c in df.columns if c not in drop]

    feat = featurize(x, y)
    Xv = np.array([[feat[c] for c in feat_cols]], dtype=np.float32)
    print(f"\nfeatures: {len(feat_cols)} columns")

    # XGBoost: column 1 of predict_proba is the positive-class probability.
    bst = xgb.XGBClassifier()
    bst.load_model("results/xgb.json")
    p_xgb = float(bst.predict_proba(Xv)[0, 1])

    # LightGBM
    # NOTE(review): assumes the booster was trained with a binary objective,
    # so predict() already returns a probability — confirm against training.
    lgbm = lgb.Booster(model_file="results/lgbm.txt")
    p_lgb = float(lgbm.predict(Xv)[0])

    # MLP — refit the scaler on the first 70% of the training rows.
    # NOTE(review): presumably this mirrors the split used at training time;
    # confirm against the training script.
    Xtrain = df[feat_cols].astype(np.float32).values
    sc = StandardScaler().fit(Xtrain[:int(0.7 * len(Xtrain))])
    Xs = sc.transform(Xv)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    D = Xv.shape[1]
    mlp = nn.Sequential(
        nn.Linear(D, 512), nn.ReLU(),
        nn.Linear(512, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, 1),
    ).to(device)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider weights_only=True where the installed torch
    # version supports it).
    mlp.load_state_dict(torch.load("results/mlp.pt", map_location=device))
    mlp.eval()
    with torch.no_grad():
        logit = mlp(torch.tensor(Xs, dtype=torch.float32, device=device)).squeeze().item()
    # The network outputs a raw logit; apply the sigmoid to get a probability.
    p_mlp = 1 / (1 + np.exp(-logit))

    avg = (p_xgb + p_lgb + p_mlp) / 3

    print("\n--- predictions (probability that k is ODD) ---")
    print(f"  XGBoost   : {p_xgb:.4f}  ->  parity = {int(p_xgb>0.5)}")
    print(f"  LightGBM  : {p_lgb:.4f}  ->  parity = {int(p_lgb>0.5)}")
    print(f"  MLP       : {p_mlp:.4f}  ->  parity = {int(p_mlp>0.5)}")
    print(f"  Ensemble  : {avg:.4f}    ->  parity = {int(avg>0.5)}")
    print("\nReminder: all models train-time landed at 50% accuracy on held-out k's,")
    print("so any specific prediction here is essentially a coin flip with a confidence number attached.")

# Run the prediction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()