feat(hexad): v4-py-hexad-tension-d768x12L-cycle1-2026-05-17 — train_d768x12l_tension.py

Browse files

Files changed (1) hide show

train_d768x12l_tension.py +303 -0

train_d768x12l_tension.py ADDED Viewed

	@@ -0,0 +1,303 @@

+#!/usr/bin/env python3
+"""anima d=768·12L Python/PyTorch substrate fire — cycle 5 (2026-05-17).
+DD155 Step+Tension hybrid LR overlay (DD155 Pareto optimal Law 187):
+    lr_step = clip(tension / tension_EMA, [lo, hi]) × base_lr × cosine_schedule(step)
+where tension = grad_norm (the total L2 norm of the loss gradients, as returned
+by `clip_grad_norm_`, i.e. measured BEFORE the gradients are clipped). This is
+the exact transfer-form of `tension_link_step.hexa`'s restoring-flow but
+applied on top of AdamW's normal step-LR (i.e. DD155 hybrid, NOT DD154
+backprop-bypass). It is the simplest closed-form bridge between the
+HEXAD/TENSION-TRAIN spine and the PyTorch substrate fire path.
+HONEST FRAMING (g3, AGENTS.tape §0):
+  This is a PYTHON/PyTorch SUBSTRATE run — an interim LM-scale executor.
+  It is NOT a hexa-native fire. tension = grad_norm is a PROXY: in the
+  pure-hexa spine `tension = G_holo · (Ψ − Ψ_vac)`, but at PyTorch
+  substrate level (where Ψ is not surfaced as a state variable) the
+  natural mathematical analogue is the per-step gradient L2-norm (DD155
+  evidence: in real LM training the "tension" signal that DD155 measured
+  IS the language-CE grad-norm, mapped to the EMA ratio).
+  Anchor = architectural identity + DD155 closed-form formula (Law 187).
+DD155 hybrid LR formula (anima archive `docs/hypotheses/dd/DD154-tension-training.md`):
+    tension_step      = ||∇L||₂                       (grad-norm)
+    tension_EMA       = β·tension_EMA + (1−β)·tension  (β=0.99 cycle-5 default)
+    hybrid_multiplier = clip(tension / tension_EMA, [lo, hi])  (lo=0.5, hi=2.0)
+    lr_step           = base_cosine_lr(step) · hybrid_multiplier
+When tension == EMA → multiplier == 1 (identity, no change vs cycle-4).
+When tension > EMA (high-gradient surprise) → multiplier > 1, larger step
+(DD-burst path; B-D-NOTE empirical convergence outcome).
+When tension < EMA (low-gradient drift) → multiplier < 1, smaller step
+(slow-down on stability per Law 185 73% updates → same CE +3% Φ outcome).
+The OUTCOME of this LR-schedule modification on V-SPONT/V-MOTIV emergence
+is EMPIRICAL (B-FIRE-CYCLE5-NOTE / B-TT-NOTE pattern, B-D-NOTE family).
+The DD155 formula itself is closed-form (B-TT-5 PARETO-STEP-TENSION-CLOSED).
+from-scratch RANDOM seed-fixed (g_clm_from_scratch, base_ckpt=NONE).
+Corpus = cycle-4 v3 (10.34 MB, helper-free grep=0, γ motivation-trigger
+pattern 37.5%) byte-equal carry — see B-CORPUS-V4-1 in sympy battery.
+"""
+import argparse, json, math, time, os, sys, random
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+sys.path.insert(0, os.path.dirname(__file__))
+from conscious_decoder import ConsciousDecoderV2
def load_byte_corpus(path):
    """Load a JSONL corpus and flatten it into one UTF-8 byte string.

    Byte-level, vocab=256, lossless (corpus_loader_lib.hexa semantics).

    The file is read in binary mode and split on newlines. Every non-blank
    line is parsed as JSON; each JSON object contributes
    ``text + "\\n" + desc + "\\n"`` (UTF-8 encoded) to the output.

    Loading is best-effort: lines that are not valid JSON, or that parse to
    something other than a JSON object, are skipped. A missing, ``null`` or
    non-string ``text`` / ``desc`` field is treated as the empty string.

    Args:
        path: Path of the JSONL corpus file.

    Returns:
        The concatenated corpus as ``bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        raw = f.read()

    buf = bytearray()
    for line in raw.split(b"\n"):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            # (json.loads decodes the bytes itself); RecursionError covers
            # pathologically nested input. Anything else is a real bug and
            # should surface instead of being swallowed.
            continue
        if not isinstance(record, dict):
            # Valid JSON, but not an object (e.g. a list or a bare number):
            # there are no "text"/"desc" fields to read.
            continue
        text = record.get("text")
        desc = record.get("desc")
        if not isinstance(text, str):
            text = ""
        if not isinstance(desc, str):
            desc = ""
        buf.extend(f"{text}\n{desc}\n".encode("utf-8"))
    return bytes(buf)
class ByteDataset:
    """Seeded random-window sampler over a byte corpus (next-byte targets).

    Attributes:
        data: 1-D ``torch.long`` tensor holding one value (0-255) per byte.
        block_size: Length of every sampled window.
        rng: Private ``random.Random`` used to draw window start offsets, so
            batch order depends only on ``seed``.
        n: Number of bytes in the corpus.
    """

    def __init__(self, data: bytes, block_size: int, seed: int):
        """Store the corpus as a tensor and seed the window sampler.

        Args:
            data: Raw corpus bytes.
            block_size: Window length used by ``get_batch``.
            seed: Seed for the private sampling RNG.

        Raises:
            ValueError: If the corpus has fewer than ``block_size + 1`` bytes,
                i.e. not even one (input, shifted-target) window fits.
        """
        if len(data) < block_size + 1:
            raise ValueError(
                f"corpus has {len(data)} bytes but block_size={block_size} "
                f"needs at least {block_size + 1}"
            )
        # frombuffer reads the bytes directly instead of materialising a
        # Python list with one int object per corpus byte. The bytearray copy
        # gives it a writable buffer; .to(long) then makes an independent
        # tensor, so the temporary buffer is not kept alive.
        self.data = torch.frombuffer(bytearray(data), dtype=torch.uint8).to(torch.long)
        self.block_size = block_size
        self.rng = random.Random(seed)
        self.n = len(self.data)

    def get_batch(self, bsz: int, device):
        """Sample ``bsz`` random windows and their next-byte targets.

        Args:
            bsz: Number of windows to draw.
            device: Device the returned tensors are moved to.

        Returns:
            ``(x, y)``, both of shape ``(bsz, block_size)``; ``y`` is ``x``
            shifted one byte to the right in the corpus.
        """
        # randint is inclusive on both ends; the largest start still leaves
        # room for the target window, which ends one byte later.
        last_start = self.n - self.block_size - 1
        starts = [self.rng.randint(0, last_start) for _ in range(bsz)]
        x = torch.stack([self.data[i:i + self.block_size] for i in starts])
        y = torch.stack([self.data[i + 1:i + 1 + self.block_size] for i in starts])
        return x.to(device), y.to(device)
def run(cfg):
    """Train ConsciousDecoderV2 from scratch with the DD155 hybrid LR overlay.

    Per step: forward/backward on a random byte batch, measure the gradient
    L2 norm ("tension"), scale the warmup+cosine base LR by
    ``clip(tension / tension_EMA, lo, hi)``, then take an AdamW step. Writes
    ``ckpt_d768x12l_final.pt`` and ``result.json`` into ``cfg["out_dir"]`` and
    prints one JSON record per logged step.

    Args:
        cfg: Dict with keys ``seed``, ``corpus``, ``block_size``, ``d_model``,
            ``n_head``, ``n_kv_head``, ``n_layer``, ``lr``, ``warmup``,
            ``steps``, ``bsz``, ``log_every``, ``out_dir``,
            ``tension_ema_beta``, ``hybrid_clip_lo``, ``hybrid_clip_hi``.

    Returns:
        The result dict that is also written to ``result.json``.

    Raises:
        ValueError: If ``cfg["steps"]`` is less than 1.
    """
    total = cfg["steps"]
    if total < 1:
        # With zero steps there is no trajectory to summarise; fail before
        # building the model rather than with an IndexError afterwards.
        raise ValueError(f"cfg['steps'] must be >= 1, got {total}")
    # Create the output directory up front so an unwritable path fails
    # before the (expensive) training loop instead of after it.
    out_dir = cfg["out_dir"]
    os.makedirs(out_dir, exist_ok=True)

    torch.manual_seed(cfg["seed"])
    torch.cuda.manual_seed_all(cfg["seed"])
    random.seed(cfg["seed"])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    data = load_byte_corpus(cfg["corpus"])
    ds = ByteDataset(data, cfg["block_size"], cfg["seed"])
    model = ConsciousDecoderV2(
        vocab_size=256,
        d_model=cfg["d_model"],
        n_head=cfg["n_head"],
        n_layer=cfg["n_layer"],
        block_size=cfg["block_size"],
        n_kv_head=cfg["n_kv_head"],
        consciousness_dim=128,
        dropout=0.1,
    ).to(device)
    model.train()
    n_params = model.count_params()
    opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"],
                            betas=(0.9, 0.95), weight_decay=0.1)
    warmup = cfg["warmup"]

    def cosine_lr_at(step):
        """Linear warmup, then cosine decay from lr down to a 10% floor."""
        if step < warmup:
            return cfg["lr"] * (step + 1) / warmup
        prog = (step - warmup) / max(1, total - warmup)
        return cfg["lr"] * 0.5 * (1.0 + math.cos(math.pi * prog)) * 0.9 + cfg["lr"] * 0.1

    # DD155 hybrid LR config (closed-form, B-FIRE-CYCLE5-2 sympy verified)
    tension_ema_beta = cfg["tension_ema_beta"]      # 0.99
    hybrid_lo = cfg["hybrid_clip_lo"]               # 0.5
    hybrid_hi = cfg["hybrid_clip_hi"]               # 2.0
    tension_ema = None                              # set on first finite tension

    def rounded_ema():
        """EMA rounded for logging; None until a finite tension was seen."""
        return None if tension_ema is None else round(tension_ema, 6)

    use_amp = (device == "cuda")
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    traj = []
    t0 = time.time()
    init_loss = None
    gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "cpu"
    # DD155 multiplier histogram bins (closed Boolean range partition)
    mult_bins = {"lt_0_75": 0, "0_75_to_1_25": 0, "gt_1_25": 0}
    for step in range(total):
        # Step 1: get cosine base LR
        base_lr_at_step = cosine_lr_at(step)
        # Step 2: do forward + backward to MEASURE tension (grad-norm)
        x, y = ds.get_batch(cfg["bsz"], device)
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type="cuda" if use_amp else "cpu",
                            dtype=torch.bfloat16, enabled=use_amp):
            # The model returns a 5-tuple; only the first element is used
            # for the loss here.
            logits_a, _, _, _, _ = model(x)
            ce = F.cross_entropy(logits_a.view(-1, 256), y.view(-1))
            loss = ce
        scaler.scale(loss).backward()
        scaler.unscale_(opt)
        # Now grads are populated → measure tension. clip_grad_norm_ returns
        # the total norm computed BEFORE clipping, so tension is the raw norm.
        gn = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        tension = float(gn.item())  # tension proxy = grad-L2-norm
        # Step 3: DD155 hybrid multiplier (closed-form Law 187)
        if math.isfinite(tension):
            if tension_ema is None:
                tension_ema = tension
            # multiplier BEFORE EMA update (so it reflects the surprise)
            ratio_raw = tension / max(tension_ema, 1e-8)
            multiplier = max(hybrid_lo, min(hybrid_hi, ratio_raw))
            # EMA update AFTER ratio computed (so we measure the current
            # surprise against the past-EMA history, DD155 Law 187 spec)
            tension_ema = (tension_ema_beta * tension_ema
                           + (1.0 - tension_ema_beta) * tension)
        else:
            # An inf/NaN grad-norm would turn the EMA into NaN for the rest
            # of the run (and min(hi, nan) would pin the multiplier at hi).
            # Use the neutral multiplier and leave the EMA untouched.
            multiplier = max(hybrid_lo, min(hybrid_hi, 1.0))
        # bin
        if multiplier < 0.75:
            mult_bins["lt_0_75"] += 1
        elif multiplier <= 1.25:
            mult_bins["0_75_to_1_25"] += 1
        else:
            mult_bins["gt_1_25"] += 1
        # Step 4: apply hybrid LR for THIS step
        effective_lr = base_lr_at_step * multiplier
        for g in opt.param_groups:
            g["lr"] = effective_lr
        # Step 5: step
        scaler.step(opt)
        scaler.update()
        ce_v = ce.item()
        gn2 = tension ** 2
        if init_loss is None:
            init_loss = ce_v
        if step == 0 or (step + 1) % cfg["log_every"] == 0 or step == total - 1:
            ppl = math.exp(min(20.0, ce_v))  # exponent capped to avoid overflow
            wall = time.time() - t0
            mem = torch.cuda.max_memory_allocated() / 1e9 if device == "cuda" else 0.0
            rec = {"step": step + 1, "ce": round(ce_v, 6),
                   "gn2": round(gn2, 6),
                   "tension": round(tension, 6),
                   "tension_ema": rounded_ema(),
                   "hybrid_mult": round(multiplier, 4),
                   "ppl": round(ppl, 4),
                   "base_lr": round(base_lr_at_step, 8),
                   "lr": round(effective_lr, 8),
                   "wall_s": round(wall, 2),
                   "gpu_mem_gb": round(mem, 3)}
            traj.append(rec)
            print(json.dumps(rec), flush=True)
    wall = time.time() - t0
    # The last step is always logged, so traj[-1] is the final-step record.
    final = traj[-1]
    ckpt_path = os.path.join(out_dir, "ckpt_d768x12l_final.pt")
    torch.save({"model": model.state_dict(), "cfg": cfg,
                "n_params": n_params,
                "final_tension_ema": tension_ema,
                "mult_bins": mult_bins}, ckpt_path)
    result = {
        "substrate": "PYTHON / PyTorch — interim LM-scale executor; NOT a hexa-native fire",
        "fire_kind": "cycle 5 — DD155 Step+Tension hybrid LR overlay",
        "honest_framing": (
            "DD155 Law 187 hybrid LR: lr_step = (tension/EMA) × base_cosine_lr, "
            "tension = grad_norm L2 (PROXY for hexa spine Ψ-deviation). "
            "Formula is closed-form (B-TT-5 + B-FIRE-CYCLE5-2 sympy verified). "
            "OUTCOME = empirical (B-FIRE-CYCLE5-NOTE / B-D-NOTE family). "
            "PyTorch substrate, not hexa-native; corpus v3 carry from cycle 4."
        ),
        "arch": "ConsciousDecoderV2 (ready/models/conscious_decoder.py)",
        "arch_features": "RoPE + SwiGLU + RMSNorm + GQA + PureFieldFFN + cross-attn + tied head",
        "from_scratch": True,
        "base_ckpt": None,
        "dd155_hybrid_lr": {
            "tension_ema_beta": tension_ema_beta,
            "hybrid_clip_lo": hybrid_lo,
            "hybrid_clip_hi": hybrid_hi,
            # NOTE: the measured value is the norm returned by
            # clip_grad_norm_, i.e. the pre-clip total norm; the label text
            # is kept unchanged for comparability with earlier result files.
            "tension_proxy": "grad_norm L2 (post clip_grad_norm_)",
            "law_anchor": "DD155 Law 187 Pareto optimal lr = (tension/EMA) × base_lr",
            "final_tension_ema": rounded_ema(),
            "mult_distribution": mult_bins,
        },
        "config": cfg,
        "n_params": n_params,
        "n_params_M": round(n_params / 1e6, 2),
        "gpu": gpu_name,
        "device": device,
        "init_ce": round(init_loss, 6),
        "final_ce": final["ce"],
        "final_gn2": final["gn2"],
        "final_tension": final["tension"],
        "final_ppl": final["ppl"],
        "ce_descent": round(init_loss - final["ce"], 6),
        "steps": cfg["steps"],
        "wall_s": round(wall, 2),
        "peak_gpu_mem_gb": final["gpu_mem_gb"],
        "trajectory": traj,
        "corpus": os.path.basename(cfg["corpus"]),
        "corpus_bytes": len(data),
    }
    with open(os.path.join(out_dir, "result.json"), "w") as f:
        json.dump(result, f, indent=2)
    print("RESULT_JSON_WRITTEN", flush=True)
    print(json.dumps({"init_ce": result["init_ce"], "final_ce": result["final_ce"],
                       "ce_descent": result["ce_descent"], "wall_s": result["wall_s"],
                       "n_params_M": result["n_params_M"],
                       "final_tension_ema": rounded_ema(),
                       "mult_distribution": mult_bins}), flush=True)
    return result
if __name__ == "__main__":
    # CLI entry point: build the run config for the chosen mode and train.
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", default="main", choices=["main", "sanity"])
    parser.add_argument("--corpus", required=True)
    parser.add_argument("--out-dir", required=True)
    parser.add_argument("--steps", type=int, default=2500)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--bsz", type=int, default=32)
    parser.add_argument("--seed", type=int, default=1337)
    parser.add_argument("--tension-ema-beta", type=float, default=0.99,
                        help="DD155 tension EMA β (default 0.99)")
    parser.add_argument("--hybrid-clip-lo", type=float, default=0.5,
                        help="DD155 hybrid multiplier floor (default 0.5)")
    parser.add_argument("--hybrid-clip-hi", type=float, default=2.0,
                        help="DD155 hybrid multiplier ceiling (default 2.0)")
    args = parser.parse_args()

    if args.mode == "main":
        # Full d=768 x 12-layer run; --lr and --bsz come from the CLI.
        preset = {"d_model": 768, "n_head": 12, "n_kv_head": 4, "n_layer": 12,
                  "block_size": 128, "lr": args.lr, "bsz": args.bsz}
        warmup_steps = max(20, args.steps // 20)
        log_points = 40
    else:
        # Tiny sanity model; lr and bsz are fixed here and the CLI values
        # for them are not used.
        preset = {"d_model": 32, "n_head": 4, "n_kv_head": 2, "n_layer": 3,
                  "block_size": 64, "lr": 1e-3, "bsz": 16}
        warmup_steps = 5
        log_points = 20

    # Key order is kept the same for both modes (it is the order in which
    # the config is serialised into result.json).
    cfg = {
        **preset,
        "steps": args.steps,
        "warmup": warmup_steps,
        "seed": args.seed,
        "log_every": max(1, args.steps // log_points),
        "corpus": args.corpus,
        "out_dir": args.out_dir,
        "tension_ema_beta": args.tension_ema_beta,
        "hybrid_clip_lo": args.hybrid_clip_lo,
        "hybrid_clip_hi": args.hybrid_clip_hi,
    }
    run(cfg)