dancinlife committed on
Commit
03d7dc3
·
verified ·
1 Parent(s): ce75607

feat(hexad): v4-py-hexad-tension-d768x12L-cycle1-2026-05-17 — train_d768x12l_tension.py

Browse files
Files changed (1) hide show
  1. train_d768x12l_tension.py +303 -0
train_d768x12l_tension.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """anima d=768·12L Python/PyTorch substrate fire — cycle 5 (2026-05-17).
3
+
4
+ DD155 Step+Tension hybrid LR overlay (DD155 Pareto optimal Law 187):
5
+
6
+ lr_step = (tension / tension_EMA) × base_lr × cosine_schedule(step)
7
+
8
+ where tension = grad_norm (the L2 norm of the loss-gradient flow). This is
9
+ the exact transfer-form of `tension_link_step.hexa`'s restoring-flow but
10
+ applied on top of AdamW's normal step-LR (i.e. DD155 hybrid, NOT DD154
11
+ backprop-bypass). It is the simplest closed-form bridge between the
12
+ HEXAD/TENSION-TRAIN spine and the PyTorch substrate fire path.
13
+
14
+ HONEST FRAMING (g3, AGENTS.tape §0):
15
+ This is a PYTHON/PyTorch SUBSTRATE run — an interim LM-scale executor.
16
+ It is NOT a hexa-native fire. tension = grad_norm is a PROXY: in the
17
+ pure-hexa spine `tension = G_holo · (Ψ − Ψ_vac)`, but at PyTorch
18
+ substrate level (where Ψ is not surfaced as a state variable) the
19
+ natural mathematical analogue is the per-step gradient L2-norm (DD155
20
+ evidence: in real LM training the "tension" signal that DD155 measured
21
+ IS the language-CE grad-norm, mapped to the EMA ratio).
22
+ Anchor = architectural identity + DD155 closed-form formula (Law 187).
23
+
24
+ DD155 hybrid LR formula (anima archive `docs/hypotheses/dd/DD154-tension-training.md`):
25
+
26
+ tension_step = ||∇L||₂ (grad-norm)
27
+ tension_EMA = β·tension_EMA + (1−β)·tension (β=0.99 cycle-5 default)
28
+ hybrid_multiplier = clip(tension / tension_EMA, [lo, hi]) (lo=0.5, hi=2.0)
29
+ lr_step = base_cosine_lr(step) · hybrid_multiplier
30
+
31
+ When tension == EMA → multiplier == 1 (identity, no change vs cycle-4).
32
+ When tension > EMA (high-gradient surprise) → multiplier > 1, larger step
33
+ (DD-burst path; B-D-NOTE empirical convergence outcome).
34
+ When tension < EMA (low-gradient drift) → multiplier < 1, smaller step
35
+ (slow-down on stability per Law 185 73% updates → same CE +3% Φ outcome).
36
+
37
+ The OUTCOME of this LR-schedule modification on V-SPONT/V-MOTIV emergence
38
+ is EMPIRICAL (B-FIRE-CYCLE5-NOTE / B-TT-NOTE pattern, B-D-NOTE family).
39
+ The DD155 formula itself is closed-form (B-TT-5 PARETO-STEP-TENSION-CLOSED).
40
+
41
+ from-scratch RANDOM seed-fixed (g_clm_from_scratch, base_ckpt=NONE).
42
+ Corpus = cycle-4 v3 (10.34 MB, helper-free grep=0, γ motivation-trigger
43
+ pattern 37.5%) byte-equal carry — see B-CORPUS-V4-1 in sympy battery.
44
+ """
45
+ import argparse, json, math, time, os, sys, random
46
+ import torch
47
+ import torch.nn as nn
48
+ import torch.nn.functional as F
49
+
50
+ sys.path.insert(0, os.path.dirname(__file__))
51
+ from conscious_decoder import ConsciousDecoderV2
52
+
53
+
54
def load_byte_corpus(path):
    """Byte-level, vocab=256, lossless (corpus_loader_lib.hexa semantics).

    Reads the JSON-lines file at ``path`` and flattens it into one byte
    string. For every line that parses to a JSON object, the ``"text"`` and
    ``"desc"`` fields (each defaulting to ``""``) are emitted as
    ``text + "\\n" + desc + "\\n"`` encoded as UTF-8.

    Blank lines, lines that are not valid JSON, and lines whose JSON value is
    not an object are skipped.

    Args:
        path: Filesystem path of the JSONL corpus.

    Returns:
        The concatenated corpus as ``bytes``.
    """
    buf = bytearray()
    with open(path, "rb") as f:
        # Stream line by line so the raw file and a split copy of it are
        # never held in memory at the same time.
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except ValueError:
                # Covers json.JSONDecodeError and UnicodeDecodeError (both
                # ValueError subclasses): malformed lines are best-effort
                # skipped rather than aborting the load.
                continue
            if not isinstance(record, dict):
                # Valid JSON but not an object (list, string, number, ...):
                # there are no "text"/"desc" fields to read.
                continue
            text = record.get("text", "")
            desc = record.get("desc", "")
            buf.extend((text + "\n" + desc + "\n").encode("utf-8"))
    return bytes(buf)
73
+
74
+
75
class ByteDataset:
    """Seeded random-window sampler over a byte corpus.

    The corpus is held as a 1-D ``torch.long`` tensor with one element per
    byte. Batches are drawn with a private ``random.Random(seed)`` so the
    sequence of windows is reproducible for a given seed.
    """

    def __init__(self, data: bytes, block_size: int, seed: int):
        """Store the corpus and sampling parameters.

        Args:
            data: Corpus bytes.
            block_size: Length of each sampled window.
            seed: Seed for the window-offset RNG.

        Raises:
            ValueError: If ``block_size`` is not positive, or the corpus is
                too short to hold one window plus its shifted target.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size!r}")
        if len(data) <= block_size:
            # get_batch needs block_size inputs plus one extra byte for the
            # shifted target; fail here with a clear message instead of an
            # "empty range" error from randint on the first batch.
            raise ValueError(
                f"corpus has {len(data)} bytes but block_size={block_size} "
                f"requires at least {block_size + 1}"
            )
        # frombuffer avoids materialising a Python list with one int per
        # byte; bytearray() provides the writable buffer it expects and
        # .to(torch.long) copies into an independent int64 tensor.
        self.data = torch.frombuffer(bytearray(data), dtype=torch.uint8).to(torch.long)
        self.block_size = block_size
        self.rng = random.Random(seed)
        self.n = len(self.data)

    def get_batch(self, bsz, device):
        """Sample ``bsz`` random windows and their next-byte targets.

        Args:
            bsz: Number of windows to draw.
            device: Device the returned tensors are moved to.

        Returns:
            ``(x, y)``, each of shape ``(bsz, block_size)``; ``y`` is ``x``
            shifted one position to the right in the corpus.
        """
        last_start = self.n - self.block_size - 1
        ix = [self.rng.randint(0, last_start) for _ in range(bsz)]
        x = torch.stack([self.data[i:i + self.block_size] for i in ix])
        y = torch.stack([self.data[i + 1:i + 1 + self.block_size] for i in ix])
        return x.to(device), y.to(device)
87
+
88
+
89
def dd155_hybrid_step(tension, tension_ema, beta, lo, hi):
    """Compute one DD155 hybrid-LR multiplier and the updated tension EMA.

    The multiplier is ``clip(tension / tension_ema, [lo, hi])`` measured
    against the EMA *before* it absorbs the current tension; the EMA is then
    advanced as ``beta * ema + (1 - beta) * tension``.

    Args:
        tension: Current tension value (a float).
        tension_ema: Running EMA, or ``None`` if not yet initialised.
        beta: EMA decay factor.
        lo: Lower clip bound for the multiplier.
        hi: Upper clip bound for the multiplier.

    Returns:
        ``(multiplier, new_tension_ema)``. For a non-finite ``tension`` the
        result is ``(1.0, tension_ema)``: the EMA is left untouched so a
        single NaN/inf cannot poison it (after which Python's ``max``/``min``
        ordering with NaN would pin the multiplier at ``hi`` on every step).
    """
    if not math.isfinite(tension):
        return 1.0, tension_ema
    if tension_ema is None:
        # First finite measurement seeds the EMA.
        tension_ema = tension
    ratio_raw = tension / max(tension_ema, 1e-8)
    multiplier = max(lo, min(hi, ratio_raw))
    new_ema = beta * tension_ema + (1.0 - beta) * tension
    return multiplier, new_ema


def run(cfg):
    """Train ConsciousDecoderV2 from scratch with the DD155 hybrid LR overlay.

    Each step runs forward/backward, takes the gradient L2 norm as the
    "tension", scales the warmup+cosine base LR by the clipped
    tension/EMA ratio, and applies an AdamW step. A checkpoint
    (``ckpt_d768x12l_final.pt``) and ``result.json`` are written to
    ``cfg["out_dir"]``.

    Args:
        cfg: Dict providing ``seed``, ``corpus``, ``block_size``, ``d_model``,
            ``n_head``, ``n_kv_head``, ``n_layer``, ``lr``, ``bsz``,
            ``steps``, ``warmup``, ``log_every``, ``out_dir``,
            ``tension_ema_beta``, ``hybrid_clip_lo`` and ``hybrid_clip_hi``.

    Returns:
        The result dict that was serialised to ``result.json``.

    Raises:
        ValueError: If ``cfg["steps"]`` is less than 1.
    """
    total = cfg["steps"]
    if total < 1:
        # With no steps there is no trajectory or initial loss to summarise;
        # fail before loading the corpus and building the model.
        raise ValueError(f"cfg['steps'] must be >= 1, got {total!r}")

    torch.manual_seed(cfg["seed"])
    torch.cuda.manual_seed_all(cfg["seed"])
    random.seed(cfg["seed"])
    device = "cuda" if torch.cuda.is_available() else "cpu"

    data = load_byte_corpus(cfg["corpus"])
    ds = ByteDataset(data, cfg["block_size"], cfg["seed"])

    model = ConsciousDecoderV2(
        vocab_size=256,
        d_model=cfg["d_model"],
        n_head=cfg["n_head"],
        n_layer=cfg["n_layer"],
        block_size=cfg["block_size"],
        n_kv_head=cfg["n_kv_head"],
        consciousness_dim=128,
        dropout=0.1,
    ).to(device)
    model.train()
    n_params = model.count_params()

    opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"],
                            betas=(0.9, 0.95), weight_decay=0.1)

    warmup = cfg["warmup"]

    def cosine_lr_at(step):
        """Linear warmup, then cosine decay from lr down to a 0.1*lr floor."""
        if step < warmup:
            return cfg["lr"] * (step + 1) / warmup
        prog = (step - warmup) / max(1, total - warmup)
        return cfg["lr"] * 0.5 * (1.0 + math.cos(math.pi * prog)) * 0.9 + cfg["lr"] * 0.1

    # DD155 hybrid LR config (closed-form, B-FIRE-CYCLE5-2 sympy verified)
    tension_ema_beta = cfg["tension_ema_beta"]  # 0.99
    hybrid_lo = cfg["hybrid_clip_lo"]           # 0.5
    hybrid_hi = cfg["hybrid_clip_hi"]           # 2.0
    tension_ema = None  # seeded by the first finite tension measurement

    def ema_rounded():
        """Rounded EMA for reporting; None if no finite tension seen yet."""
        return None if tension_ema is None else round(tension_ema, 6)

    use_amp = (device == "cuda")
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    traj = []
    t0 = time.time()
    init_loss = None
    gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "cpu"

    # DD155 multiplier histogram bins (closed Boolean range partition)
    mult_bins = {"lt_0_75": 0, "0_75_to_1_25": 0, "gt_1_25": 0}

    for step in range(total):
        # Step 1: get cosine base LR
        base_lr_at_step = cosine_lr_at(step)

        # Step 2: do forward + backward to MEASURE tension (grad-norm)
        x, y = ds.get_batch(cfg["bsz"], device)
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type="cuda" if use_amp else "cpu",
                            dtype=torch.bfloat16, enabled=use_amp):
            # The model returns a 5-tuple; only the first logits drive the loss.
            logits_a, logits_g, tensions, _, _ = model(x)
            ce = F.cross_entropy(logits_a.view(-1, 256), y.view(-1))
        scaler.scale(ce).backward()
        scaler.unscale_(opt)
        # Now grads are populated → measure tension.
        # NOTE: clip_grad_norm_ returns the total norm computed BEFORE the
        # gradients are rescaled, so `tension` is the pre-clip grad norm.
        gn = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        tension = float(gn.item())  # tension proxy = grad-L2-norm

        # Step 3: DD155 hybrid multiplier (closed-form Law 187). The ratio is
        # taken against the past EMA, then the EMA absorbs this step; a
        # non-finite tension yields a neutral multiplier and no EMA update.
        multiplier, tension_ema = dd155_hybrid_step(
            tension, tension_ema, tension_ema_beta, hybrid_lo, hybrid_hi)
        # bin
        if multiplier < 0.75:
            mult_bins["lt_0_75"] += 1
        elif multiplier <= 1.25:
            mult_bins["0_75_to_1_25"] += 1
        else:
            mult_bins["gt_1_25"] += 1

        # Step 4: apply hybrid LR for THIS step
        effective_lr = base_lr_at_step * multiplier
        for g in opt.param_groups:
            g["lr"] = effective_lr

        # Step 5: step
        scaler.step(opt)
        scaler.update()

        ce_v = ce.item()
        gn2 = tension ** 2  # squared grad-norm, reported alongside tension
        if init_loss is None:
            init_loss = ce_v

        if step == 0 or (step + 1) % cfg["log_every"] == 0 or step == total - 1:
            # Cap the exponent so an early large CE cannot overflow exp().
            ppl = math.exp(min(20.0, ce_v))
            wall = time.time() - t0
            mem = torch.cuda.max_memory_allocated() / 1e9 if device == "cuda" else 0.0
            rec = {"step": step + 1, "ce": round(ce_v, 6),
                   "gn2": round(gn2, 6),
                   "tension": round(tension, 6),
                   "tension_ema": ema_rounded(),
                   "hybrid_mult": round(multiplier, 4),
                   "ppl": round(ppl, 4),
                   "base_lr": round(base_lr_at_step, 8),
                   "lr": round(effective_lr, 8),
                   "wall_s": round(wall, 2),
                   "gpu_mem_gb": round(mem, 3)}
            traj.append(rec)
            print(json.dumps(rec), flush=True)

    wall = time.time() - t0
    final = traj[-1]  # the last step is always logged (step == total - 1)
    out_dir = cfg["out_dir"]
    os.makedirs(out_dir, exist_ok=True)
    ckpt_path = os.path.join(out_dir, "ckpt_d768x12l_final.pt")
    torch.save({"model": model.state_dict(), "cfg": cfg,
                "n_params": n_params,
                "final_tension_ema": tension_ema,
                "mult_bins": mult_bins}, ckpt_path)

    result = {
        "substrate": "PYTHON / PyTorch — interim LM-scale executor; NOT a hexa-native fire",
        "fire_kind": "cycle 5 — DD155 Step+Tension hybrid LR overlay",
        "honest_framing": (
            "DD155 Law 187 hybrid LR: lr_step = (tension/EMA) × base_cosine_lr, "
            "tension = grad_norm L2 (PROXY for hexa spine Ψ-deviation). "
            "Formula is closed-form (B-TT-5 + B-FIRE-CYCLE5-2 sympy verified). "
            "OUTCOME = empirical (B-FIRE-CYCLE5-NOTE / B-D-NOTE family). "
            "PyTorch substrate, not hexa-native; corpus v3 carry from cycle 4."
        ),
        "arch": "ConsciousDecoderV2 (ready/models/conscious_decoder.py)",
        "arch_features": "RoPE + SwiGLU + RMSNorm + GQA + PureFieldFFN + cross-attn + tied head",
        "from_scratch": True,
        "base_ckpt": None,
        "dd155_hybrid_lr": {
            "tension_ema_beta": tension_ema_beta,
            "hybrid_clip_lo": hybrid_lo,
            "hybrid_clip_hi": hybrid_hi,
            "tension_proxy": "grad_norm L2 (post clip_grad_norm_)",
            "law_anchor": "DD155 Law 187 Pareto optimal lr = (tension/EMA) × base_lr",
            "final_tension_ema": ema_rounded(),
            "mult_distribution": mult_bins,
        },
        "config": cfg,
        "n_params": n_params,
        "n_params_M": round(n_params / 1e6, 2),
        "gpu": gpu_name,
        "device": device,
        "init_ce": round(init_loss, 6),
        "final_ce": final["ce"],
        "final_gn2": final["gn2"],
        "final_tension": final["tension"],
        "final_ppl": final["ppl"],
        "ce_descent": round(init_loss - final["ce"], 6),
        "steps": cfg["steps"],
        "wall_s": round(wall, 2),
        "peak_gpu_mem_gb": final["gpu_mem_gb"],
        "trajectory": traj,
        "corpus": os.path.basename(cfg["corpus"]),
        "corpus_bytes": len(data),
    }
    with open(os.path.join(out_dir, "result.json"), "w") as f:
        json.dump(result, f, indent=2)
    print("RESULT_JSON_WRITTEN", flush=True)
    print(json.dumps({"init_ce": result["init_ce"], "final_ce": result["final_ce"],
                      "ce_descent": result["ce_descent"], "wall_s": result["wall_s"],
                      "n_params_M": result["n_params_M"],
                      "final_tension_ema": ema_rounded(),
                      "mult_distribution": mult_bins}), flush=True)
    return result
266
+
267
+
268
if __name__ == "__main__":
    # Command-line entry point: parse options, assemble the run config for
    # the selected mode, and launch training.
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", default="main", choices=["main", "sanity"])
    parser.add_argument("--corpus", required=True)
    parser.add_argument("--out-dir", required=True)
    parser.add_argument("--steps", type=int, default=2500)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--bsz", type=int, default=32)
    parser.add_argument("--seed", type=int, default=1337)
    parser.add_argument("--tension-ema-beta", type=float, default=0.99,
                        help="DD155 tension EMA β (default 0.99)")
    parser.add_argument("--hybrid-clip-lo", type=float, default=0.5,
                        help="DD155 hybrid multiplier floor (default 0.5)")
    parser.add_argument("--hybrid-clip-hi", type=float, default=2.0,
                        help="DD155 hybrid multiplier ceiling (default 2.0)")
    args = parser.parse_args()

    if args.mode == "main":
        # Full d=768 x 12-layer configuration; --lr and --bsz are honoured.
        model_and_optim = {"d_model": 768, "n_head": 12, "n_kv_head": 4,
                           "n_layer": 12, "block_size": 128,
                           "lr": args.lr, "bsz": args.bsz}
        warmup_steps = max(20, args.steps // 20)
        log_divisor = 40
    else:
        # Sanity mode: tiny model with fixed lr/bsz (CLI --lr/--bsz unused).
        model_and_optim = {"d_model": 32, "n_head": 4, "n_kv_head": 2,
                           "n_layer": 3, "block_size": 64,
                           "lr": 1e-3, "bsz": 16}
        warmup_steps = 5
        log_divisor = 20

    # Key order mirrors the serialised config layout: model/optimiser
    # settings first, then schedule, I/O paths and DD155 parameters.
    cfg = {
        **model_and_optim,
        "steps": args.steps,
        "warmup": warmup_steps,
        "seed": args.seed,
        "log_every": max(1, args.steps // log_divisor),
        "corpus": args.corpus,
        "out_dir": args.out_dir,
        "tension_ema_beta": args.tension_ema_beta,
        "hybrid_clip_lo": args.hybrid_clip_lo,
        "hybrid_clip_hi": args.hybrid_clip_hi,
    }
    run(cfg)