| |
| """Generate 1M rows (k=1..N) of secp256k1 features for the parity-prediction task. |
| Output: features.parquet (k held aside for traceability; not used as input) |
| """ |
| import os, sys, time, math |
| import numpy as np |
| import pandas as pd |
|
|
| |
# Domain parameters of the secp256k1 curve named in the module docstring.
# Field prime: 2^256 - 2^32 - 977.
p = (1 << 256) - (1 << 32) - 977
# Order of the group generated by G (not referenced by the code visible here).
n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
# Affine coordinates of the generator point G, written in hex in 32-bit groups
# so they can be compared against the published parameters at a glance.
Gx = 0x79BE667E_F9DCBBAC_55A06295_CE870B07_029BFCDB_2DCE28D9_59F2815B_16F81798
Gy = 0x483ADA77_26A3C465_5DA4FBFC_0E1108A8_FD17B448_A6855419_9C47D08F_FB10D4B8
|
|
def inv(a):
    """Return the multiplicative inverse of ``a`` modulo the field prime ``p``.

    The inverse is computed as ``a**(p - 2) mod p``.  An ``a`` that is a
    multiple of ``p`` therefore yields 0 instead of raising.
    """
    exponent = p - 2
    return pow(a, exponent, p)
def add(P, Q):
    """Add two curve points given in affine coordinates modulo ``p``.

    A point is an ``(x, y)`` tuple; ``None`` stands for the point at infinity
    (the identity), so ``add(None, Q)`` is ``Q`` and ``add(P, None)`` is ``P``.
    The doubling branch uses the tangent slope ``3*x^2 / (2*y)``, i.e. a curve
    whose ``a`` coefficient is 0.

    Returns the sum as an ``(x, y)`` tuple, or ``None`` when the two points
    cancel each other out.
    """
    if P is None:
        return Q
    if Q is None:
        return P

    px, py = P
    qx, qy = Q

    # Same x with y-coordinates summing to 0 mod p: the points are inverses.
    if px == qx and (py + qy) % p == 0:
        return None

    if P == Q:
        # Doubling: slope of the tangent at P.
        slope = (3 * px * px) * inv(2 * py) % p
    else:
        # Distinct points: slope of the chord through P and Q.
        slope = (qy - py) * inv(qx - px) % p

    rx = (slope * slope - px - qx) % p
    ry = (slope * (px - rx) - py) % p
    return (rx, ry)
|
|
| |
def num_features(v, prefix):
    """Build a dict of small integer features describing the integer ``v``.

    The features cover the decimal representation (digit counts, digit sums,
    leading/trailing digits), the binary representation (bit length, number
    of set bits), the parity (``<prefix>_state``) and residues modulo a few
    small primes.  Every key is ``prefix`` followed by ``_`` and the feature
    name.  ``v`` is expected to be non-negative: a leading minus sign cannot
    be parsed as a digit.
    """
    decimal = str(v)
    digits = list(map(int, decimal))
    digit_total = sum(digits)
    # Each decimal digit is either even or odd, so the odd count is the rest.
    even_count = sum(d % 2 == 0 for d in digits)
    odd_count = len(digits) - even_count

    feats = {}
    feats[f"{prefix}_num_digits"] = len(digits)
    feats[f"{prefix}_first_digit"] = digits[0]
    feats[f"{prefix}_last_digit"] = digits[-1]
    feats[f"{prefix}_last2"] = v % 100
    feats[f"{prefix}_last3"] = v % 1000
    feats[f"{prefix}_digit_sum"] = digit_total
    feats[f"{prefix}_digit_sum_mod_9"] = digit_total % 9
    feats[f"{prefix}_even_digit_count"] = even_count
    feats[f"{prefix}_odd_digit_count"] = odd_count
    feats[f"{prefix}_zero_count"] = digits.count(0)
    feats[f"{prefix}_unique_digit_count"] = len(set(digits))
    feats[f"{prefix}_bit_length"] = v.bit_length()
    feats[f"{prefix}_popcount"] = bin(v).count("1")
    feats[f"{prefix}_state"] = v % 2
    feats[f"{prefix}_mod_3"] = v % 3
    feats[f"{prefix}_mod_5"] = v % 5
    feats[f"{prefix}_mod_7"] = v % 7
    feats[f"{prefix}_mod_11"] = v % 11
    feats[f"{prefix}_mod_13"] = v % 13
    feats[f"{prefix}_mod_17"] = v % 17
    feats[f"{prefix}_mod_19"] = v % 19
    return feats
|
|
def main(N=1_000_000, out="features.parquet"):
    """Compute features of k*G for k = 1..N and write them to a Parquet file.

    Starting from the point at infinity, the generator ``G = (Gx, Gy)`` is
    added once per step, so the running point after step ``k`` is ``k*G``.
    Each step produces one row holding ``k``, its parity ``k_state``, the
    ``num_features`` of both coordinates and three cross-coordinate features.
    Progress is printed roughly twenty times per run, followed by a one-line
    summary once the file has been written.

    Args:
        N: Number of rows to generate; must be at least 1.
        out: Path of the snappy-compressed Parquet file to write.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        # An empty frame has no "k" column, so the dtype handling below would
        # fail with an opaque KeyError; reject the input up front instead.
        raise ValueError(f"N must be >= 1, got {N}")

    G = (Gx, Gy)
    P = None  # point at infinity; the first add() turns it into G
    rows = []
    # perf_counter() is monotonic and high-resolution.  time.time() may return
    # the same value on consecutive calls, which made the rate computation
    # divide by zero on small runs.
    t0 = time.perf_counter()
    LOG = max(1, N // 20)
    for k in range(1, N + 1):
        P = add(P, G)
        x, y = P
        row = {"k": k, "k_state": k & 1}
        row.update(num_features(x, "x"))
        row.update(num_features(y, "y"))
        row["abs_x_minus_y"] = abs(x - y)
        row["x_gt_y"] = int(x > y)
        # num_features() already summed the decimal digits of each coordinate,
        # so reuse those values rather than re-stringifying x and y.
        row["digit_sum_diff_xy"] = row["x_digit_sum"] - row["y_digit_sum"]
        rows.append(row)
        if k % LOG == 0:
            elapsed = time.perf_counter() - t0
            # Guard against a zero interval so progress output can never crash.
            rate = k / elapsed if elapsed > 0 else float("inf")
            eta = (N - k) / rate
            print(f" k={k:>9}/{N} ({k/N*100:5.1f}%) rate={rate:>8.0f} k/s ETA={eta:5.0f}s", flush=True)
    df = pd.DataFrame(rows)

    # Shrink every numeric column whose values fit into 16 bits; k and the
    # big-integer difference are handled separately below.
    small_cols = [c for c in df.columns if c not in ("k", "abs_x_minus_y")]
    for c in small_cols:
        if df[c].dtype != object and df[c].abs().max() < 2**15:
            df[c] = df[c].astype("int16")
    df["k"] = df["k"].astype("int64")

    # |x - y| can need up to 256 bits, which no fixed-width integer column can
    # hold, so it is stored as a decimal string.
    df["abs_x_minus_y"] = df["abs_x_minus_y"].astype(str)
    df.to_parquet(out, index=False, compression="snappy")
    sz = os.path.getsize(out) / 1e6
    print(f"\nwrote {out} · {len(df)} rows · {len(df.columns)} cols · {sz:.1f} MB · {time.perf_counter()-t0:.1f}s total")
|
|
if __name__ == "__main__":
    # An optional first command-line argument overrides the default row count.
    cli_args = sys.argv[1:]
    if cli_args:
        N = int(cli_args[0])
    else:
        N = 1_000_000
    main(N=N)
|
|