Upgrade VRAM-first DiffusionBlocks trainer

Files changed (5) hide show

README.md +8 -0
dblocks_train.py +226 -50
fused_ce.py +45 -19
nB300_agillm4_vram_dblock.py +15 -0
relaunch_agillm4_dblock.sh +3 -1

README.md CHANGED Viewed

@@ -44,6 +44,8 @@ whose released code is ViT/classification only.
 - `--tie_weights` now means AR, SAT, and NAT share the embedding projection tensor. This drops the live parameter count from 1,213,418,242 to 716,595,202.
 - Old untied checkpoint head matrices are intentionally skipped under tied mode; core weights still warm-start and the optimizer can rebuild.
 - SAT now uses fused vocab-streaming CE in the dblock path, and the dblock step releases AR/SAT activations before moving to the next objective.
 ## Honest findings
 - DiffusionBlocks and gradient-checkpointing are **substitutes** for activation
@@ -59,4 +61,10 @@ only: old untied AR/SAT/NAT head tensors are skipped when tied heads are active,
 optimizer state is allowed to reset. The priority is lower VRAM over preserving every
 old training assumption.
 License: Apache-2.0 (matching the upstream method).

 - `--tie_weights` now means AR, SAT, and NAT share the embedding projection tensor. This drops the live parameter count from 1,213,418,242 to 716,595,202.
 - Old untied checkpoint head matrices are intentionally skipped under tied mode; core weights still warm-start and the optimizer can rebuild.
 - SAT now uses fused vocab-streaming CE in the dblock path, and the dblock step releases AR/SAT activations before moving to the next objective.
+- DBlock now uses loss-balanced block scheduling after warmup, per-block EMA diagnostics, sigma-range curriculum, objective weights, and peak VRAM logging.
+- The folded-in DBlock path now builds the dense causal/SAT masks once per objective instead of once per layer, and NAT obeys `--nat_max_tokens` so long-context AR does not force full-context NAT memory.
 ## Honest findings
 - DiffusionBlocks and gradient-checkpointing are **substitutes** for activation
 optimizer state is allowed to reset. The priority is lower VRAM over preserving every
 old training assumption.
+Update (2026-05-29): DBlock is no longer just a random-block prototype. The live
+path now has loss-balanced scheduling, sigma curriculum, DBlock objective weights,
+per-block loss/VRAM logging, single-build masks per objective, and NAT token capping.
+These are meant to preserve the VRAM breakthrough while making block-wise training
+less brittle over long runs.
 License: Apache-2.0 (matching the upstream method).

dblocks_train.py CHANGED Viewed

@@ -5,63 +5,239 @@ Block-wise EDM denoising on the real Encoder blocks, supervising AR + SAT(fixed+
 CE. Reuses the live data stream / optimizer / checkpointing of nB300_agillm4.
 Lazy-imports nB300 inside functions to avoid a circular import.
 """
-import math, random, numpy as np, torch, torch.nn as nn, torch.nn.functional as F
 import torch.utils.checkpoint as _ck
 from fused_ce import fused_ce
-SD=0.5
-def _cdf(x): return 0.5*(1+math.erf(x/math.sqrt(2)))
-def _ppf(p): return float(torch.erfinv(torch.tensor(2*p-1.0))*math.sqrt(2))
-def _block_sigmas(B,smin=0.002,smax=80.0,pm=-1.2,ps=1.2):
-    a,b=_cdf((math.log(smin)-pm)/ps),_cdf((math.log(smax)-pm)/ps)
-    return [float(np.exp(pm+ps*_ppf(a+(b-a)*(i/B)))) for i in range(B+1)]
-def _edm_pre(s): s=s[:,None,None]; return SD**2/(s**2+SD**2), s*SD/(s**2+SD**2)**0.5, 1/(s**2+SD**2)**0.5
-def _edm_w(s,wmax=5.0): return float(((s**2+SD**2)/(s*SD)**2).clamp(max=wmax).mean())
 def _dblock_init(core, args):
-    B=int(getattr(args,"dblock_blocks",4)); L=len(core.blocks); sp=max(1,L//B)
-    asg=[list(range(i*sp,(i+1)*sp)) for i in range(B)]; asg[-1]=list(range((B-1)*sp,L))
     print(f"[dblock] DiffusionBlocks mode: {L} layers -> {B} blocks {asg}")
-    print(f"[dblock] equi-prob sigma boundaries: {[round(x,3) for x in _block_sigmas(B)]}")
-    return {"B":B,"assign":asg,"bsig":_block_sigmas(B)}
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
-    B=state["B"]; asg=state["assign"]; bs=state["bsig"]; T=ids.size(1)
-    bi=random.randrange(B); lo,hi=sorted([bs[bi],bs[bi+1]]); layers=asg[bi]
-    sig=torch.from_numpy(np.exp(np.random.uniform(math.log(max(lo,1e-4)),math.log(hi),ids.size(0))).astype("float32")).to(ids.device)
-    cs,co,ci=_edm_pre(sig); w=_edm_w(sig); SATB=M.SAT_BLOCK
-    # ---- AR: causal diffusion denoise ----
-    with M.amp(args.amp):
-        emb=core.emb(ids); zt=emb+sig[:,None,None]*torch.randn_like(emb); h=ci*zt
-        for li in layers: h=_ck.checkpoint(core.blocks[li], h, M.causal_mask(T), use_reentrant=False)
-        Dn=core.ln(cs*zt+co*h)
-    ar=w*fused_ce(Dn[:,:-1].contiguous(), ar_h.proj.weight, ids[:,1:].contiguous())
-    scaler.scale(ar).backward()
-    ar_val=float(ar.detach())
-    del emb, zt, h, Dn, ar
-    # ---- SAT: block-causal diffusion; fixed proj + variable gate ----
-    with M.amp(args.amp):
-        emb2=core.emb(ids); zt2=emb2+sig[:,None,None]*torch.randn_like(emb2); h2=ci*zt2
-        for li in layers: h2=_ck.checkpoint(core.blocks[li], h2, M.sat_mask(T), use_reentrant=False)
-        Ds=core.ln(cs*zt2+co*h2); last=Ds[:,-SATB:]
-        satf=fused_ce(last.contiguous(), sat_h.proj.weight, ids[:,1:SATB+1].contiguous())
-        satv=(M.EMIT_LAMBDA*F.cross_entropy(sat_h.gate(Ds[:,0].float()), torch.ones(ids.size(0),dtype=torch.long,device=ids.device))) if sat_h.gate is not None else 0.0
-        sat=w*(satf+satv)
-    scaler.scale(sat).backward()
-    sat_val=float(sat.detach())
-    del emb2, zt2, h2, Ds, last, satf, satv, sat
-    # ---- NAT: bidirectional mask-predict ----
-    nat_val=0.0
-    if nat_h is not None:
-        ratio=min(max(float(getattr(args,"nat_mask_ratio",0.5)),0.05),0.95)
         with M.amp(args.amp):
-            nat_ids=ids.clone(); m=torch.rand(ids.shape,device=ids.device)<ratio
-            if not bool(m.any()): m[...,-1]=True
-            nat_ids[m]=M.BLANK; hn=core.emb(nat_ids)
-            for li in layers: hn=_ck.checkpoint(core.blocks[li], hn, None, use_reentrant=False)
-            Dnat=core.ln(hn)
-        nat=fused_ce(Dnat[m], nat_h.proj.weight, ids[m]); scaler.scale(nat).backward(); nat_val=float(nat.detach()); del nat_ids, m, hn, Dnat, nat
     scaler.unscale_(opt)
-    nn.utils.clip_grad_norm_([p for g in opt.param_groups for p in g["params"]],1.0)
-    scaler.step(opt); scaler.update(); opt.zero_grad(set_to_none=True)
-    return ar_val+sat_val+nat_val

 CE. Reuses the live data stream / optimizer / checkpointing of nB300_agillm4.
 Lazy-imports nB300 inside functions to avoid a circular import.
 """
+import math
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import torch.utils.checkpoint as _ck
 from fused_ce import fused_ce
+SD = 0.5
+def _cdf(x):
+    return 0.5 * (1 + math.erf(x / math.sqrt(2)))
+def _ppf(p):
+    return float(torch.erfinv(torch.tensor(2 * p - 1.0)) * math.sqrt(2))
def _block_sigmas(B, smin=0.002, smax=80.0, pm=-1.2, ps=1.2):
    """Return ``B + 1`` sigma boundaries with equal probability mass between them.

    The boundaries are quantiles of a log-normal with log-mean ``pm`` and
    log-std ``ps``, restricted to ``[smin, smax]``: the CDF range between the
    two limits is cut into ``B`` equal slices and each cut is mapped back to a
    sigma value.
    """
    cdf_lo = _cdf((math.log(smin) - pm) / ps)
    cdf_hi = _cdf((math.log(smax) - pm) / ps)
    span = cdf_hi - cdf_lo
    edges = []
    for i in range(B + 1):
        prob = cdf_lo + span * (i / B)
        edges.append(float(np.exp(pm + ps * _ppf(prob))))
    return edges
def _edm_pre(s):
    """Return the three preconditioning factors for per-sample sigmas ``s``.

    ``s`` is indexed as a 1-D tensor and expanded with two trailing singleton
    axes so the factors broadcast against 3-D activations. The tuple is, in
    order: the factor applied to the noisy input on the skip path, the factor
    applied to the network output, and the factor applied to the network
    input (that is how ``_dblock_step`` consumes them).
    """
    s = s[:, None, None]
    variance = s**2 + SD**2
    skip_scale = SD**2 / variance
    out_scale = s * SD / variance ** 0.5
    in_scale = 1 / variance ** 0.5
    return skip_scale, out_scale, in_scale
def _edm_w(s, wmax=5.0):
    """Return the batch-mean loss weight for sigmas ``s``, capped at ``wmax``.

    The per-sample weight is ``(s^2 + SD^2) / (s * SD)^2``; each value is
    clamped from above before averaging, and the mean is returned as a float.
    """
    per_sample = (s**2 + SD**2) / (s * SD) ** 2
    capped = per_sample.clamp(max=wmax)
    return float(capped.mean())
 def _dblock_init(core, args):
+    B = int(getattr(args, "dblock_blocks", 4))
+    L = len(core.blocks)
+    sp = max(1, L // B)
+    asg = [list(range(i * sp, (i + 1) * sp)) for i in range(B)]
+    asg[-1] = list(range((B - 1) * sp, L))
+    bsig = _block_sigmas(B)
+    schedule = getattr(args, "dblock_schedule", "loss_balanced")
     print(f"[dblock] DiffusionBlocks mode: {L} layers -> {B} blocks {asg}")
+    print(f"[dblock] schedule={schedule} sigma boundaries: {[round(x, 3) for x in bsig]}")
+    return {
+        "B": B,
+        "assign": asg,
+        "bsig": bsig,
+        "step": 0,
+        "counts": [0 for _ in range(B)],
+        "loss_ema": [None for _ in range(B)],
+    }
+def _choose_block(state, args):
+    B = state["B"]
+    schedule = str(getattr(args, "dblock_schedule", "loss_balanced") or "loss_balanced").lower()
+    step = int(state.get("step", 0))
+    counts = state.setdefault("counts", [0 for _ in range(B)])
+    emas = state.setdefault("loss_ema", [None for _ in range(B)])
+    if schedule == "random":
+        return random.randrange(B)
+    if schedule == "roundrobin":
+        return step % B
+    explore = float(getattr(args, "dblock_explore", 0.05))
+    warmup = int(getattr(args, "dblock_warmup_steps", max(8, B * 2)))
+    if step < warmup or any(c == 0 for c in counts):
+        return min(range(B), key=lambda i: (counts[i], i))
+    if explore > 0.0 and random.random() < explore:
+        return min(range(B), key=lambda i: (counts[i], i))
+    return max(range(B), key=lambda i: (-1.0 if emas[i] is None else emas[i], -counts[i]))
+def _sample_sigma(ids, lo, hi, args, state):
+    cur_step = int(state.get("step", 0))
+    curriculum = int(getattr(args, "dblock_sigma_curriculum_steps", 0))
+    if curriculum > 0:
+        frac = min(1.0, max(0.05, (cur_step + 1) / float(curriculum)))
+        hi = lo * ((hi / max(lo, 1e-8)) ** frac)
+    sig_np = np.exp(
+        np.random.uniform(
+            math.log(max(lo, 1e-4)),
+            math.log(max(hi, lo + 1e-4)),
+            ids.size(0),
+        ).astype("float32")
+    )
+    return torch.from_numpy(sig_np).to(ids.device)
+def _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved):
+    log_every = int(getattr(args, "dblock_log_every", 50))
+    step = int(state.get("step", 0))
+    if log_every <= 0 or step % log_every != 0:
+        return
+    counts = ",".join(str(x) for x in state.get("counts", []))
+    emas = ",".join("nan" if x is None else f"{x:.2f}" for x in state.get("loss_ema", []))
+    mem = ""
+    if peak_alloc is not None:
+        mem = f" peak_alloc={peak_alloc:.2f}GB peak_reserved={peak_reserved:.2f}GB"
+    print(
+        f"[dblock] step={step} block={bi} layers={layers} "
+        f"loss={total_val:.3f} ar={ar_val:.3f} sat={sat_val:.3f} nat={nat_val:.3f} "
+        f"counts=[{counts}] ema=[{emas}]{mem}",
+        flush=True,
+    )
+def _update_stats(state, bi, loss_value):
+    B = state["B"]
+    counts = state.setdefault("counts", [0 for _ in range(B)])
+    emas = state.setdefault("loss_ema", [None for _ in range(B)])
+    counts[bi] += 1
+    prev = emas[bi]
+    beta = 0.96
+    emas[bi] = float(loss_value) if prev is None else beta * float(prev) + (1.0 - beta) * float(loss_value)
+    state["step"] = int(state.get("step", 0)) + 1
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    B = state["B"]
+    asg = state["assign"]
+    bs = state["bsig"]
+    T = ids.size(1)
+    bi = _choose_block(state, args)
+    lo, hi = sorted([bs[bi], bs[bi + 1]])
+    layers = asg[bi]
+    sig = _sample_sigma(ids, lo, hi, args, state)
+    cs, co, ci = _edm_pre(sig)
+    w = _edm_w(sig, float(getattr(args, "dblock_edm_wmax", 5.0)))
+    SATB = M.SAT_BLOCK
+    ar_weight = float(getattr(args, "dblock_ar_weight", 1.0))
+    sat_weight = float(getattr(args, "dblock_sat_weight", 1.0))
+    nat_weight = float(getattr(args, "dblock_nat_weight", 1.0)) * float(getattr(args, "nat_loss_weight", 1.0))
+    ar_val = 0.0
+    sat_val = 0.0
+    nat_val = 0.0
+    if ar_weight > 0.0:
+        causal = M.causal_mask(T)
+        with M.amp(args.amp):
+            emb = core.emb(ids)
+            zt = emb + sig[:, None, None] * torch.randn_like(emb)
+            h = ci * zt
+            for li in layers:
+                h = _ck.checkpoint(core.blocks[li], h, causal, use_reentrant=False)
+            Dn = core.ln(cs * zt + co * h)
+        ar = ar_weight * w * fused_ce(Dn[:, :-1].contiguous(), ar_h.proj.weight, ids[:, 1:].contiguous())
+        ar_val = float(ar.detach())
+        scaler.scale(ar).backward()
+        del causal, emb, zt, h, Dn, ar
+    do_sat = (not getattr(args, "ar_only", False)) and (
+        int(getattr(args, "sat_every", 1)) <= 1 or ((int(state.get("step", 0)) + 1) % int(getattr(args, "sat_every", 1)) == 0)
+    )
+    if sat_weight > 0.0 and do_sat:
+        smask = M.sat_mask(T)
+        with M.amp(args.amp):
+            emb2 = core.emb(ids)
+            zt2 = emb2 + sig[:, None, None] * torch.randn_like(emb2)
+            h2 = ci * zt2
+            for li in layers:
+                h2 = _ck.checkpoint(core.blocks[li], h2, smask, use_reentrant=False)
+            Ds = core.ln(cs * zt2 + co * h2)
+            last = Ds[:, -SATB:]
+            satf = fused_ce(last.contiguous(), sat_h.proj.weight, ids[:, 1 : SATB + 1].contiguous())
+            satv = (
+                M.EMIT_LAMBDA
+                * F.cross_entropy(
+                    sat_h.gate(Ds[:, 0].float()),
+                    torch.ones(ids.size(0), dtype=torch.long, device=ids.device),
+                )
+                if sat_h.gate is not None
+                else 0.0
+            )
+            sat = sat_weight * w * (satf + satv)
+        sat_val = float(sat.detach())
+        scaler.scale(sat).backward()
+        del smask, emb2, zt2, h2, Ds, last, satf, satv, sat
+    do_nat = (
+        nat_h is not None
+        and nat_weight > 0.0
+        and (not getattr(args, "ar_only", False))
+        and int(getattr(args, "nat_every", 1)) > 0
+        and (
+            int(getattr(args, "nat_every", 1)) <= 1
+            or ((int(state.get("step", 0)) + 1) % int(getattr(args, "nat_every", 1)) == 0)
+        )
+    )
+    if do_nat:
+        ratio = min(max(float(getattr(args, "nat_mask_ratio", 0.5)), 0.05), 0.95)
+        nat_ids = M._nat_ids_for_training(ids, int(getattr(args, "nat_max_tokens", 0)))
         with M.amp(args.amp):
+            nat_in = nat_ids.clone()
+            m = torch.rand(nat_ids.shape, device=nat_ids.device) < ratio
+            if not bool(m.any()):
+                m[..., -1] = True
+            nat_in[m] = M.BLANK
+            hn = core.emb(nat_in)
+            for li in layers:
+                hn = _ck.checkpoint(core.blocks[li], hn, None, use_reentrant=False)
+            Dnat = core.ln(hn)
+        nat = nat_weight * fused_ce(Dnat[m], nat_h.proj.weight, nat_ids[m])
+        nat_val = float(nat.detach())
+        scaler.scale(nat).backward()
+        del nat_ids, nat_in, m, hn, Dnat, nat
+    total_val = ar_val + sat_val + nat_val
+    if not math.isfinite(total_val):
+        opt.zero_grad(set_to_none=True)
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        print(f"[dblock] non-finite loss {total_val}; skipped optimizer step", flush=True)
+        _update_stats(state, bi, total_val)
+        return total_val
     scaler.unscale_(opt)
+    nn.utils.clip_grad_norm_([p for g in opt.param_groups for p in g["params"]], 1.0)
+    scaler.step(opt)
+    scaler.update()
+    opt.zero_grad(set_to_none=True)
+    peak_alloc = None
+    peak_reserved = None
+    if torch.cuda.is_available():
+        peak_alloc = torch.cuda.max_memory_allocated() / (1024**3)
+        peak_reserved = torch.cuda.max_memory_reserved() / (1024**3)
+    _update_stats(state, bi, total_val)
+    _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved)
+    return total_val

fused_ce.py CHANGED Viewed

@@ -4,28 +4,54 @@ recomputes softmax per vocab-chunk (grad = softmax - onehot). This is the
 DiffusionBlocks 'process in chunks, don't hold the whole thing' idea applied to
 the output head instead of network depth."""
 import torch
 class FusedCE(torch.autograd.Function):
     @staticmethod
     def forward(ctx, h, W, tgt, vchunk=16384):
-        N,d=h.shape; V=W.shape[0]; hf=h.float()
-        m=torch.full((N,),-1e30,device=h.device); s=torch.zeros(N,device=h.device); zt=torch.zeros(N,device=h.device)
-        for c in range(0,V,vchunk):
-            lg=hf@W[c:c+vchunk].float().T                    # [N,vchunk] transient only
-            cm=lg.max(1).values; nm=torch.maximum(m,cm)
-            s=s*torch.exp(m-nm)+torch.exp(lg-nm[:,None]).sum(1); m=nm
-            ic=(tgt>=c)&(tgt<c+vchunk)
-            if ic.any(): zt[ic]=lg[ic,tgt[ic]-c]
-        lse=m+torch.log(s); ctx.save_for_backward(h,W,tgt,lse); ctx.vchunk=vchunk
-        return (lse-zt).mean()
     @staticmethod
     def backward(ctx, go):
-        h,W,tgt,lse=ctx.saved_tensors; vc=ctx.vchunk; N,d=h.shape; V=W.shape[0]; hf=h.float()
-        gh=torch.zeros_like(hf); gW=torch.zeros(W.shape,device=W.device,dtype=torch.float32); sc=float(go)/N
-        for c in range(0,V,vc):
-            Wc=W[c:c+vc].float(); p=torch.exp(hf@Wc.T-lse[:,None])     # softmax chunk [N,vchunk]
-            ic=(tgt>=c)&(tgt<c+vc)
-            if ic.any(): p[ic,tgt[ic]-c]-=1.0
-            p*=sc; gh+=p@Wc; gW[c:c+vc]+=p.T@hf
-        return gh.to(h.dtype), gW.to(W.dtype), None, None
 def fused_ce(h, W, tgt, vchunk=16384):
-    return FusedCE.apply(h.reshape(-1,h.size(-1)), W, tgt.reshape(-1), vchunk)

 DiffusionBlocks 'process in chunks, don't hold the whole thing' idea applied to
 the output head instead of network depth."""
 import torch
 class FusedCE(torch.autograd.Function):
     @staticmethod
     def forward(ctx, h, W, tgt, vchunk=16384):
+        with torch.cuda.amp.autocast(enabled=False):
+            hf = h.float()
+            Wf = W.float()
+            N, d = h.shape
+            V = W.shape[0]
+            m = torch.full((N,), -1e30, device=h.device, dtype=torch.float32)
+            s = torch.zeros(N, device=h.device, dtype=torch.float32)
+            zt = torch.zeros(N, device=h.device, dtype=torch.float32)
+            for c in range(0, V, vchunk):
+                lg = hf @ Wf[c:c+vchunk].T                    # [N,vchunk] transient only
+                cm = lg.max(1).values
+                nm = torch.maximum(m, cm)
+                s = s * torch.exp(m - nm) + torch.exp(lg - nm[:, None]).sum(1)
+                m = nm
+                ic = (tgt >= c) & (tgt < c+vchunk)
+                if ic.any():
+                    zt[ic] = lg[ic, tgt[ic] - c].float()
+            lse = m + torch.log(s)
+            ctx.save_for_backward(h, W, tgt, lse)
+            ctx.vchunk = vchunk
+            return (lse - zt).mean()
     @staticmethod
     def backward(ctx, go):
+        h, W, tgt, lse = ctx.saved_tensors
+        vc = ctx.vchunk
+        N, d = h.shape
+        V = W.shape[0]
+        with torch.cuda.amp.autocast(enabled=False):
+            hf = h.float()
+            Wc_all = W.float()
+            gh = torch.zeros_like(hf)
+            gW = torch.zeros(W.shape, device=W.device, dtype=torch.float32)
+            sc = float(go) / N
+            for c in range(0, V, vc):
+                Wc = Wc_all[c:c+vc]
+                p = torch.exp(hf @ Wc.T - lse[:, None])     # softmax chunk [N,vchunk]
+                ic = (tgt >= c) & (tgt < c+vc)
+                if ic.any():
+                    p[ic, tgt[ic] - c] -= 1.0
+                p *= sc
+                gh += p @ Wc
+                gW[c:c+vc] += p.T @ hf
+            return gh.to(h.dtype), gW.to(W.dtype), None, None
 def fused_ce(h, W, tgt, vchunk=16384):
+    return FusedCE.apply(h.reshape(-1, h.size(-1)), W, tgt.reshape(-1), vchunk)

nB300_agillm4_vram_dblock.py CHANGED Viewed

@@ -2806,6 +2806,21 @@ def main():
                     help="Fraction of positions masked to BLANK for the NAT mask-predict (CMLM) objective.")
     tr.add_argument("--dblock", action="store_true", help="DiffusionBlocks block-wise denoising training (low VRAM).")
     tr.add_argument("--dblock_blocks", type=int, default=4, help="Partition layers into this many DiffusionBlocks blocks.")
     tr.add_argument("--reinit_nat", action="store_true",
                     help="Reinitialize NAT head weights after load (use once when switching to mask-predict).")
     tr.add_argument("--seed_nat_from_ar", action="store_true",

                     help="Fraction of positions masked to BLANK for the NAT mask-predict (CMLM) objective.")
     tr.add_argument("--dblock", action="store_true", help="DiffusionBlocks block-wise denoising training (low VRAM).")
     tr.add_argument("--dblock_blocks", type=int, default=4, help="Partition layers into this many DiffusionBlocks blocks.")
+    tr.add_argument("--dblock_schedule", choices=["random", "roundrobin", "loss_balanced"], default="loss_balanced",
+                    help="How --dblock chooses the next layer block. loss_balanced focuses blocks whose EMA loss is highest after warmup.")
+    tr.add_argument("--dblock_warmup_steps", type=int, default=16,
+                    help="Initial DBlock steps spent covering every block before loss-balanced scheduling.")
+    tr.add_argument("--dblock_explore", type=float, default=0.05,
+                    help="Exploration rate for loss-balanced DBlock scheduling.")
+    tr.add_argument("--dblock_log_every", type=int, default=25,
+                    help="Print DBlock block/loss/VRAM diagnostics every N DBlock steps; 0 disables.")
+    tr.add_argument("--dblock_sigma_curriculum_steps", type=int, default=2000,
+                    help="Warm sigma ranges from easy to full span over this many DBlock steps; 0 disables.")
+    tr.add_argument("--dblock_edm_wmax", type=float, default=5.0,
+                    help="Cap for EDM loss weighting in DBlock mode.")
+    tr.add_argument("--dblock_ar_weight", type=float, default=1.0)
+    tr.add_argument("--dblock_sat_weight", type=float, default=1.0)
+    tr.add_argument("--dblock_nat_weight", type=float, default=1.0)
     tr.add_argument("--reinit_nat", action="store_true",
                     help="Reinitialize NAT head weights after load (use once when switching to mask-predict).")
     tr.add_argument("--seed_nat_from_ar", action="store_true",

relaunch_agillm4_dblock.sh CHANGED Viewed

@@ -13,7 +13,9 @@ CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
 exec >> /workspace/agillm4_floor_train.log 2>&1
 echo "RELAUNCH_AGILLM4_DBLOCK $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock blocks=${AGILLM4_DBLOCKS:-4} tie_weights=1 attn=${AGILLM_ATTN_BACKEND}"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
-  --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --tie_weights \
   --batch_size 1 --block "${AGILLM4_BLOCK:-1280}" --amp --attn_backend "${AGILLM_ATTN_BACKEND}" --grad_checkpoint \
   --optimizer paged_adamw8bit --sat_every 1 --nat_every 1 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \

 exec >> /workspace/agillm4_floor_train.log 2>&1
 echo "RELAUNCH_AGILLM4_DBLOCK $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock blocks=${AGILLM4_DBLOCKS:-4} tie_weights=1 attn=${AGILLM_ATTN_BACKEND}"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
+  --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --dblock_schedule "${AGILLM4_DBLOCK_SCHEDULE:-loss_balanced}" \
+  --dblock_warmup_steps "${AGILLM4_DBLOCK_WARMUP:-16}" --dblock_sigma_curriculum_steps "${AGILLM4_DBLOCK_SIGMA_CURRICULUM:-2000}" \
+  --dblock_log_every "${AGILLM4_DBLOCK_LOG_EVERY:-25}" --tie_weights \
   --batch_size 1 --block "${AGILLM4_BLOCK:-1280}" --amp --attn_backend "${AGILLM_ATTN_BACKEND}" --grad_checkpoint \
   --optimizer paged_adamw8bit --sat_every 1 --nat_every 1 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \