Add stochastic sparse DBlock speed profile

Files changed (5) hide show

README.md +8 -0
dblocks_train.py +89 -28
nB300_agillm4_vram_dblock.py +11 -0
relaunch_agillm4_dblock.sh +26 -5
relaunch_agillm4_dblock_tied.sh +26 -5

README.md CHANGED Viewed

@@ -75,4 +75,12 @@ allocation for long context, and also gathers ALiBi bias directly for selected
 local/anchor keys instead of materializing dense `[heads x T x T]` bias tensors.
 A trainer heartbeat, post-checkpoint CUDA cache clear, and optional `--empty_cache_every_steps` hook were added for easier long-running Vast monitoring and VRAM-first allocator behavior.
 License: Apache-2.0 (matching the upstream method).

 local/anchor keys instead of materializing dense `[heads x T x T]` bias tensors.
 A trainer heartbeat, post-checkpoint CUDA cache clear, and optional `--empty_cache_every_steps` hook were added for easier long-running Vast monitoring and VRAM-first allocator behavior.
+Speed update 2026-05-29: the live Vast line now uses algorithmic speedups rather
+than only hardware-style knobs: stochastic DBlock objective sampling (one sampled
+AR/SAT/NAT objective per step), sampled token-level CE for the large vocab head,
+and a tighter structured-sublinear attention profile (`window=128`, `stride=128`,
+`max_anchors=128`). The first stable live window reached about 2.49k tok/s with
+an ETA around 326 days, under the 1y+90d target, while keeping ctx=1280, B=2,
+DiffusionBlocks, gradient-checkpointed blocks, tied heads, and structured masks.
 License: Apache-2.0 (matching the upstream method).

dblocks_train.py CHANGED Viewed

@@ -94,7 +94,7 @@ def _sample_sigma(ids, lo, hi, args, state):
     return torch.from_numpy(sig_np).to(ids.device)
-def _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved):
     log_every = int(getattr(args, "dblock_log_every", 50))
     step = int(state.get("step", 0))
     if log_every <= 0 or step % log_every != 0:
@@ -105,7 +105,7 @@ def _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, pea
     if peak_alloc is not None:
         mem = f" peak_alloc={peak_alloc:.2f}GB peak_reserved={peak_reserved:.2f}GB"
     print(
-        f"[dblock] step={step} block={bi} layers={layers} "
         f"loss={total_val:.3f} ar={ar_val:.3f} sat={sat_val:.3f} nat={nat_val:.3f} "
         f"counts=[{counts}] ema=[{emas}]{mem}",
         flush=True,
@@ -123,6 +123,53 @@ def _update_stats(state, bi, loss_value):
     state["step"] = int(state.get("step", 0)) + 1
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
@@ -133,6 +180,7 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     asg = state["assign"]
     bs = state["bsig"]
     T = ids.size(1)
     bi = _choose_block(state, args)
     lo, hi = sorted([bs[bi], bs[bi + 1]])
     layers = asg[bi]
@@ -143,39 +191,57 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     ar_weight = float(getattr(args, "dblock_ar_weight", 1.0))
     sat_weight = float(getattr(args, "dblock_sat_weight", 1.0))
     nat_weight = float(getattr(args, "dblock_nat_weight", 1.0)) * float(getattr(args, "nat_loss_weight", 1.0))
     ar_val = 0.0
     sat_val = 0.0
     nat_val = 0.0
-    if ar_weight > 0.0:
         causal = M.causal_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb = core.emb(ids)
             zt = emb + sig[:, None, None] * torch.randn_like(emb)
             h = ci * zt
             for li in layers:
-                h = _ck.checkpoint(lambda y, block=core.blocks[li]: block(y, causal), h, use_reentrant=False)
             Dn = core.ln(cs * zt + co * h)
-        ar = ar_weight * w * fused_ce(Dn[:, :-1].contiguous(), ar_h.proj.weight, ids[:, 1:].contiguous())
         ar_val = float(ar.detach())
         scaler.scale(ar).backward()
-        del causal, emb, zt, h, Dn, ar
-    do_sat = (not getattr(args, "ar_only", False)) and (
-        int(getattr(args, "sat_every", 1)) <= 1 or ((int(state.get("step", 0)) + 1) % int(getattr(args, "sat_every", 1)) == 0)
-    )
-    if sat_weight > 0.0 and do_sat:
         smask = M.sat_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb2 = core.emb(ids)
             zt2 = emb2 + sig[:, None, None] * torch.randn_like(emb2)
             h2 = ci * zt2
             for li in layers:
-                h2 = _ck.checkpoint(lambda y, block=core.blocks[li]: block(y, smask), h2, use_reentrant=False)
             Ds = core.ln(cs * zt2 + co * h2)
             last = Ds[:, -SATB:]
-            satf = fused_ce(last.contiguous(), sat_h.proj.weight, ids[:, 1 : SATB + 1].contiguous())
             satv = (
                 M.EMIT_LAMBDA
                 * F.cross_entropy(
@@ -188,19 +254,9 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
             sat = sat_weight * w * (satf + satv)
         sat_val = float(sat.detach())
         scaler.scale(sat).backward()
-        del smask, emb2, zt2, h2, Ds, last, satf, satv, sat
-    do_nat = (
-        nat_h is not None
-        and nat_weight > 0.0
-        and (not getattr(args, "ar_only", False))
-        and int(getattr(args, "nat_every", 1)) > 0
-        and (
-            int(getattr(args, "nat_every", 1)) <= 1
-            or ((int(state.get("step", 0)) + 1) % int(getattr(args, "nat_every", 1)) == 0)
-        )
-    )
-    if do_nat:
         ratio = min(max(float(getattr(args, "nat_mask_ratio", 0.5)), 0.05), 0.95)
         nat_ids = M._nat_ids_for_training(ids, int(getattr(args, "nat_max_tokens", 0)))
         with M.amp(args.amp):
@@ -211,12 +267,17 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
             nat_in[m] = M.BLANK
             hn = core.emb(nat_in)
             for li in layers:
-                hn = _ck.checkpoint(lambda y, block=core.blocks[li]: block(y, None), hn, use_reentrant=False)
             Dnat = core.ln(hn)
-        nat = nat_weight * fused_ce(Dnat[m], nat_h.proj.weight, nat_ids[m])
         nat_val = float(nat.detach())
         scaler.scale(nat).backward()
-        del nat_ids, nat_in, m, hn, Dnat, nat
     total_val = ar_val + sat_val + nat_val
     if not math.isfinite(total_val):
@@ -239,5 +300,5 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
         peak_alloc = torch.cuda.max_memory_allocated() / (1024**3)
         peak_reserved = torch.cuda.max_memory_reserved() / (1024**3)
     _update_stats(state, bi, total_val)
-    _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved)
     return total_val

     return torch.from_numpy(sig_np).to(ids.device)
+def _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved, objective=None):
     log_every = int(getattr(args, "dblock_log_every", 50))
     step = int(state.get("step", 0))
     if log_every <= 0 or step % log_every != 0:
     if peak_alloc is not None:
         mem = f" peak_alloc={peak_alloc:.2f}GB peak_reserved={peak_reserved:.2f}GB"
     print(
+        f"[dblock] step={step} block={bi} obj={objective or 'mixed'} layers={layers} "
         f"loss={total_val:.3f} ar={ar_val:.3f} sat={sat_val:.3f} nat={nat_val:.3f} "
         f"counts=[{counts}] ema=[{emas}]{mem}",
         flush=True,
     state["step"] = int(state.get("step", 0)) + 1
+def _run_block(block, x, mask, use_checkpoint):
+    if use_checkpoint:
+        return _ck.checkpoint(lambda y, block=block: block(y, mask), x, use_reentrant=False)
+    return block(x, mask)
+def _sample_token_loss_inputs(hidden, targets, max_tokens):
+    max_tokens = int(max_tokens or 0)
+    if max_tokens <= 0:
+        return hidden.contiguous(), targets.contiguous(), int(targets.numel()), int(targets.numel())
+    flat_targets = targets.reshape(-1)
+    total = int(flat_targets.numel())
+    if total <= max_tokens:
+        return hidden.contiguous(), targets.contiguous(), total, total
+    # With-replacement sampling avoids building a full randperm each step; the sampled
+    # mean remains an unbiased estimator of the dense token CE mean.
+    idx = torch.randint(total, (max_tokens,), device=targets.device)
+    flat_hidden = hidden.reshape(total, hidden.size(-1))
+    return flat_hidden.index_select(0, idx).contiguous(), flat_targets.index_select(0, idx).contiguous(), int(max_tokens), total
+def _choose_objectives(state, args, ar_weight, sat_weight, nat_weight, do_sat_periodic, do_nat_periodic):
+    mode = str(getattr(args, "dblock_objective_mode", "periodic") or "periodic").lower()
+    if mode != "stochastic":
+        return ar_weight > 0.0, sat_weight > 0.0 and do_sat_periodic, nat_weight > 0.0 and do_nat_periodic, "periodic"
+    choices = []
+    probs = []
+    if ar_weight > 0.0:
+        choices.append("ar")
+        probs.append(max(0.0, float(getattr(args, "dblock_ar_prob", 0.80))))
+    if sat_weight > 0.0 and not getattr(args, "ar_only", False):
+        choices.append("sat")
+        probs.append(max(0.0, float(getattr(args, "dblock_sat_prob", 0.10))))
+    if nat_weight > 0.0 and not getattr(args, "ar_only", False):
+        choices.append("nat")
+        probs.append(max(0.0, float(getattr(args, "dblock_nat_prob", 0.10))))
+    if not choices:
+        return False, False, False, "none"
+    total = sum(probs)
+    if total <= 0.0:
+        probs = [1.0 / len(choices) for _ in choices]
+    else:
+        probs = [p / total for p in probs]
+    picked = random.choices(choices, weights=probs, k=1)[0]
+    return picked == "ar", picked == "sat", picked == "nat", picked
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
     asg = state["assign"]
     bs = state["bsig"]
     T = ids.size(1)
+    use_layer_checkpoint = bool(getattr(args, "grad_checkpoint", False))
     bi = _choose_block(state, args)
     lo, hi = sorted([bs[bi], bs[bi + 1]])
     layers = asg[bi]
     ar_weight = float(getattr(args, "dblock_ar_weight", 1.0))
     sat_weight = float(getattr(args, "dblock_sat_weight", 1.0))
     nat_weight = float(getattr(args, "dblock_nat_weight", 1.0)) * float(getattr(args, "nat_loss_weight", 1.0))
+    do_sat_periodic = (not getattr(args, "ar_only", False)) and (
+        int(getattr(args, "sat_every", 1)) <= 1 or ((int(state.get("step", 0)) + 1) % int(getattr(args, "sat_every", 1)) == 0)
+    )
+    do_nat_periodic = (
+        nat_h is not None
+        and (not getattr(args, "ar_only", False))
+        and int(getattr(args, "nat_every", 1)) > 0
+        and (
+            int(getattr(args, "nat_every", 1)) <= 1
+            or ((int(state.get("step", 0)) + 1) % int(getattr(args, "nat_every", 1)) == 0)
+        )
+    )
+    run_ar, run_sat, run_nat, objective = _choose_objectives(
+        state, args, ar_weight, sat_weight, nat_weight, do_sat_periodic, do_nat_periodic
+    )
     ar_val = 0.0
     sat_val = 0.0
     nat_val = 0.0
+    if run_ar:
         causal = M.causal_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb = core.emb(ids)
             zt = emb + sig[:, None, None] * torch.randn_like(emb)
             h = ci * zt
             for li in layers:
+                h = _run_block(core.blocks[li], h, causal, use_layer_checkpoint)
             Dn = core.ln(cs * zt + co * h)
+        ar_hidden, ar_targets, ar_used, ar_total = _sample_token_loss_inputs(
+            Dn[:, :-1], ids[:, 1:], int(getattr(args, "dblock_ar_loss_tokens", 0))
+        )
+        ar = ar_weight * w * fused_ce(ar_hidden, ar_h.proj.weight, ar_targets)
         ar_val = float(ar.detach())
         scaler.scale(ar).backward()
+        del causal, emb, zt, h, Dn, ar_hidden, ar_targets, ar, ar_used, ar_total
+    if run_sat:
         smask = M.sat_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb2 = core.emb(ids)
             zt2 = emb2 + sig[:, None, None] * torch.randn_like(emb2)
             h2 = ci * zt2
             for li in layers:
+                h2 = _run_block(core.blocks[li], h2, smask, use_layer_checkpoint)
             Ds = core.ln(cs * zt2 + co * h2)
             last = Ds[:, -SATB:]
+            sat_hidden, sat_targets, sat_used, sat_total = _sample_token_loss_inputs(
+                last, ids[:, 1 : SATB + 1], int(getattr(args, "dblock_sat_loss_tokens", 0))
+            )
+            satf = fused_ce(sat_hidden, sat_h.proj.weight, sat_targets)
             satv = (
                 M.EMIT_LAMBDA
                 * F.cross_entropy(
             sat = sat_weight * w * (satf + satv)
         sat_val = float(sat.detach())
         scaler.scale(sat).backward()
+        del smask, emb2, zt2, h2, Ds, last, sat_hidden, sat_targets, satf, satv, sat
+    if run_nat:
         ratio = min(max(float(getattr(args, "nat_mask_ratio", 0.5)), 0.05), 0.95)
         nat_ids = M._nat_ids_for_training(ids, int(getattr(args, "nat_max_tokens", 0)))
         with M.amp(args.amp):
             nat_in[m] = M.BLANK
             hn = core.emb(nat_in)
             for li in layers:
+                hn = _run_block(core.blocks[li], hn, None, use_layer_checkpoint)
             Dnat = core.ln(hn)
+        nat_hidden = Dnat[m]
+        nat_targets = nat_ids[m]
+        nat_hidden, nat_targets, nat_used, nat_total = _sample_token_loss_inputs(
+            nat_hidden.unsqueeze(0), nat_targets.unsqueeze(0), int(getattr(args, "dblock_nat_loss_tokens", 0))
+        )
+        nat = nat_weight * fused_ce(nat_hidden, nat_h.proj.weight, nat_targets)
         nat_val = float(nat.detach())
         scaler.scale(nat).backward()
+        del nat_ids, nat_in, m, hn, Dnat, nat_hidden, nat_targets, nat, nat_used, nat_total
     total_val = ar_val + sat_val + nat_val
     if not math.isfinite(total_val):
         peak_alloc = torch.cuda.max_memory_allocated() / (1024**3)
         peak_reserved = torch.cuda.max_memory_reserved() / (1024**3)
     _update_stats(state, bi, total_val)
+    _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved, objective=objective)
     return total_val

nB300_agillm4_vram_dblock.py CHANGED Viewed

@@ -2941,6 +2941,17 @@ def main():
     tr.add_argument("--dblock_ar_weight", type=float, default=1.0)
     tr.add_argument("--dblock_sat_weight", type=float, default=1.0)
     tr.add_argument("--dblock_nat_weight", type=float, default=1.0)
     tr.add_argument("--reinit_nat", action="store_true",
                     help="Reinitialize NAT head weights after load (use once when switching to mask-predict).")
     tr.add_argument("--seed_nat_from_ar", action="store_true",

     tr.add_argument("--dblock_ar_weight", type=float, default=1.0)
     tr.add_argument("--dblock_sat_weight", type=float, default=1.0)
     tr.add_argument("--dblock_nat_weight", type=float, default=1.0)
+    tr.add_argument("--dblock_objective_mode", choices=["periodic", "stochastic"], default="periodic",
+                    help="DBlock objective scheduler. stochastic samples one objective per step to reduce redundant AR/SAT/NAT forwards.")
+    tr.add_argument("--dblock_ar_prob", type=float, default=0.80, help="Stochastic DBlock probability for AR objective.")
+    tr.add_argument("--dblock_sat_prob", type=float, default=0.10, help="Stochastic DBlock probability for SAT objective.")
+    tr.add_argument("--dblock_nat_prob", type=float, default=0.10, help="Stochastic DBlock probability for NAT objective.")
+    tr.add_argument("--dblock_ar_loss_tokens", type=int, default=0,
+                    help="If >0, uniformly sample this many AR target positions per DBlock step for stochastic token-level CE.")
+    tr.add_argument("--dblock_sat_loss_tokens", type=int, default=0,
+                    help="If >0, uniformly sample this many SAT target positions per DBlock step.")
+    tr.add_argument("--dblock_nat_loss_tokens", type=int, default=0,
+                    help="If >0, uniformly sample this many NAT target positions per DBlock step.")
     tr.add_argument("--reinit_nat", action="store_true",
                     help="Reinitialize NAT head weights after load (use once when switching to mask-predict).")
     tr.add_argument("--seed_nat_from_ar", action="store_true",

relaunch_agillm4_dblock.sh CHANGED Viewed

@@ -9,16 +9,37 @@ export AGILLM_ATTN_BACKEND="${AGILLM_ATTN_BACKEND:-sublinear}"
 if [ -f /root/.cache/huggingface/token ]; then export HF_TOKEN="$(tr -d '\r\n' </root/.cache/huggingface/token)"; export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"; fi
 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
 [ -n "$CKPT" ] || { echo "no ckpt" >&2; exit 1; }
 exec >> /workspace/agillm4_floor_train.log 2>&1
-echo "RELAUNCH_AGILLM4_DBLOCK $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock blocks=${AGILLM4_DBLOCKS:-4} tie_weights=1 attn=${AGILLM_ATTN_BACKEND}"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --dblock_schedule "${AGILLM4_DBLOCK_SCHEDULE:-loss_balanced}" \
   --dblock_warmup_steps "${AGILLM4_DBLOCK_WARMUP:-16}" --dblock_sigma_curriculum_steps "${AGILLM4_DBLOCK_SIGMA_CURRICULUM:-2000}" \
-  --dblock_log_every "${AGILLM4_DBLOCK_LOG_EVERY:-25}" --tie_weights \
-  --batch_size 1 --block "${AGILLM4_BLOCK:-1280}" --amp --attn_backend "${AGILLM_ATTN_BACKEND}" --grad_checkpoint \
-  --optimizer paged_adamw8bit --sat_every 1 --nat_every 1 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \
   --save_every_sec 86400 --heartbeat_every_sec "${AGILLM4_HEARTBEAT_EVERY_SEC:-300}" \
-  --empty_cache_every_steps "${AGILLM4_EMPTY_CACHE_EVERY_STEPS:-1}" \
   --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1

 if [ -f /root/.cache/huggingface/token ]; then export HF_TOKEN="$(tr -d '\r\n' </root/.cache/huggingface/token)"; export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"; fi
 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
+BATCH_SIZE="${AGILLM4_BATCH_SIZE:-2}"
+SAT_EVERY="${AGILLM4_SAT_EVERY:-4}"
+NAT_EVERY="${AGILLM4_NAT_EVERY:-4}"
+EMPTY_CACHE_EVERY="${AGILLM4_EMPTY_CACHE_EVERY_STEPS:-0}"
+GRAD_CHECKPOINT="${AGILLM4_GRAD_CHECKPOINT:-1}"
+DBLOCK_OBJECTIVE_MODE="${AGILLM4_DBLOCK_OBJECTIVE_MODE:-stochastic}"
+DBLOCK_AR_PROB="${AGILLM4_DBLOCK_AR_PROB:-0.85}"
+DBLOCK_SAT_PROB="${AGILLM4_DBLOCK_SAT_PROB:-0.075}"
+DBLOCK_NAT_PROB="${AGILLM4_DBLOCK_NAT_PROB:-0.075}"
+DBLOCK_AR_LOSS_TOKENS="${AGILLM4_DBLOCK_AR_LOSS_TOKENS:-512}"
+DBLOCK_SAT_LOSS_TOKENS="${AGILLM4_DBLOCK_SAT_LOSS_TOKENS:-0}"
+DBLOCK_NAT_LOSS_TOKENS="${AGILLM4_DBLOCK_NAT_LOSS_TOKENS:-512}"
+SUBLINEAR_WINDOW="${AGILLM4_SUBLINEAR_WINDOW:-128}"
+SUBLINEAR_STRIDE="${AGILLM4_SUBLINEAR_STRIDE:-128}"
+SUBLINEAR_MAX_ANCHORS="${AGILLM4_SUBLINEAR_MAX_ANCHORS:-128}"
+SUBLINEAR_CHUNK="${AGILLM4_SUBLINEAR_CHUNK:-128}"
+GC_FLAG=()
+if [ "$GRAD_CHECKPOINT" = "1" ] || [ "$GRAD_CHECKPOINT" = "true" ] || [ "$GRAD_CHECKPOINT" = "yes" ]; then GC_FLAG=(--grad_checkpoint); fi
 [ -n "$CKPT" ] || { echo "no ckpt" >&2; exit 1; }
 exec >> /workspace/agillm4_floor_train.log 2>&1
+echo "RELAUNCH_AGILLM4_DBLOCK_SPEED $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock blocks=${AGILLM4_DBLOCKS:-4} tie_weights=1 attn=${AGILLM_ATTN_BACKEND} batch=$BATCH_SIZE sat_every=$SAT_EVERY nat_every=$NAT_EVERY empty_cache_every=$EMPTY_CACHE_EVERY grad_checkpoint=$GRAD_CHECKPOINT objective=$DBLOCK_OBJECTIVE_MODE ar_prob=$DBLOCK_AR_PROB sat_prob=$DBLOCK_SAT_PROB nat_prob=$DBLOCK_NAT_PROB ar_loss_tokens=$DBLOCK_AR_LOSS_TOKENS nat_loss_tokens=$DBLOCK_NAT_LOSS_TOKENS sublinear_window=$SUBLINEAR_WINDOW sublinear_stride=$SUBLINEAR_STRIDE sublinear_max_anchors=$SUBLINEAR_MAX_ANCHORS"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --dblock_schedule "${AGILLM4_DBLOCK_SCHEDULE:-loss_balanced}" \
   --dblock_warmup_steps "${AGILLM4_DBLOCK_WARMUP:-16}" --dblock_sigma_curriculum_steps "${AGILLM4_DBLOCK_SIGMA_CURRICULUM:-2000}" \
+  --dblock_log_every "${AGILLM4_DBLOCK_LOG_EVERY:-25}" --dblock_objective_mode "$DBLOCK_OBJECTIVE_MODE" \
+  --dblock_ar_prob "$DBLOCK_AR_PROB" --dblock_sat_prob "$DBLOCK_SAT_PROB" --dblock_nat_prob "$DBLOCK_NAT_PROB" \
+  --dblock_ar_loss_tokens "$DBLOCK_AR_LOSS_TOKENS" --dblock_sat_loss_tokens "$DBLOCK_SAT_LOSS_TOKENS" --dblock_nat_loss_tokens "$DBLOCK_NAT_LOSS_TOKENS" \
+  --tie_weights \
+  --batch_size "$BATCH_SIZE" --block "${AGILLM4_BLOCK:-1280}" --amp --attn_backend "${AGILLM_ATTN_BACKEND}" --sublinear_window "$SUBLINEAR_WINDOW" --sublinear_stride "$SUBLINEAR_STRIDE" --sublinear_max_anchors "$SUBLINEAR_MAX_ANCHORS" --sublinear_chunk "$SUBLINEAR_CHUNK" "${GC_FLAG[@]}" \
+  --optimizer paged_adamw8bit --sat_every "$SAT_EVERY" --nat_every "$NAT_EVERY" --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \
   --save_every_sec 86400 --heartbeat_every_sec "${AGILLM4_HEARTBEAT_EVERY_SEC:-300}" \
+  --empty_cache_every_steps "$EMPTY_CACHE_EVERY" \
   --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1

relaunch_agillm4_dblock_tied.sh CHANGED Viewed

@@ -8,15 +8,36 @@ export AGILLM_ATTN_BACKEND=sublinear
 [ -f /root/.cache/huggingface/token ] && { export HF_TOKEN="$(tr -d '\r\n' </root/.cache/huggingface/token)"; export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"; }
 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
 exec >> /workspace/agillm4_floor_train.log 2>&1
-echo "RELAUNCH_AGILLM4_DBLOCK_TIED $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock --tie_weights --attn_backend sublinear (fused_ce fixed)"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --dblock_schedule "${AGILLM4_DBLOCK_SCHEDULE:-loss_balanced}" \
   --dblock_warmup_steps "${AGILLM4_DBLOCK_WARMUP:-16}" --dblock_sigma_curriculum_steps "${AGILLM4_DBLOCK_SIGMA_CURRICULUM:-2000}" \
-  --dblock_log_every "${AGILLM4_DBLOCK_LOG_EVERY:-25}" --tie_weights \
-  --batch_size 1 --block 1280 --amp --attn_backend sublinear --grad_checkpoint \
-  --optimizer paged_adamw8bit --sat_every 1 --nat_every 1 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \
   --save_every_sec 86400 --heartbeat_every_sec "${AGILLM4_HEARTBEAT_EVERY_SEC:-300}" \
-  --empty_cache_every_steps "${AGILLM4_EMPTY_CACHE_EVERY_STEPS:-1}" \
   --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1

 [ -f /root/.cache/huggingface/token ] && { export HF_TOKEN="$(tr -d '\r\n' </root/.cache/huggingface/token)"; export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"; }
 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
+BATCH_SIZE="${AGILLM4_BATCH_SIZE:-2}"
+SAT_EVERY="${AGILLM4_SAT_EVERY:-4}"
+NAT_EVERY="${AGILLM4_NAT_EVERY:-4}"
+EMPTY_CACHE_EVERY="${AGILLM4_EMPTY_CACHE_EVERY_STEPS:-0}"
+GRAD_CHECKPOINT="${AGILLM4_GRAD_CHECKPOINT:-1}"
+DBLOCK_OBJECTIVE_MODE="${AGILLM4_DBLOCK_OBJECTIVE_MODE:-stochastic}"
+DBLOCK_AR_PROB="${AGILLM4_DBLOCK_AR_PROB:-0.85}"
+DBLOCK_SAT_PROB="${AGILLM4_DBLOCK_SAT_PROB:-0.075}"
+DBLOCK_NAT_PROB="${AGILLM4_DBLOCK_NAT_PROB:-0.075}"
+DBLOCK_AR_LOSS_TOKENS="${AGILLM4_DBLOCK_AR_LOSS_TOKENS:-512}"
+DBLOCK_SAT_LOSS_TOKENS="${AGILLM4_DBLOCK_SAT_LOSS_TOKENS:-0}"
+DBLOCK_NAT_LOSS_TOKENS="${AGILLM4_DBLOCK_NAT_LOSS_TOKENS:-512}"
+SUBLINEAR_WINDOW="${AGILLM4_SUBLINEAR_WINDOW:-128}"
+SUBLINEAR_STRIDE="${AGILLM4_SUBLINEAR_STRIDE:-128}"
+SUBLINEAR_MAX_ANCHORS="${AGILLM4_SUBLINEAR_MAX_ANCHORS:-128}"
+SUBLINEAR_CHUNK="${AGILLM4_SUBLINEAR_CHUNK:-128}"
+GC_FLAG=()
+if [ "$GRAD_CHECKPOINT" = "1" ] || [ "$GRAD_CHECKPOINT" = "true" ] || [ "$GRAD_CHECKPOINT" = "yes" ]; then GC_FLAG=(--grad_checkpoint); fi
 exec >> /workspace/agillm4_floor_train.log 2>&1
+echo "RELAUNCH_AGILLM4_DBLOCK_TIED_SPEED $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT --dblock --tie_weights --attn_backend sublinear batch=$BATCH_SIZE sat_every=$SAT_EVERY nat_every=$NAT_EVERY empty_cache_every=$EMPTY_CACHE_EVERY grad_checkpoint=$GRAD_CHECKPOINT objective=$DBLOCK_OBJECTIVE_MODE ar_prob=$DBLOCK_AR_PROB sat_prob=$DBLOCK_SAT_PROB nat_prob=$DBLOCK_NAT_PROB ar_loss_tokens=$DBLOCK_AR_LOSS_TOKENS nat_loss_tokens=$DBLOCK_NAT_LOSS_TOKENS sublinear_window=$SUBLINEAR_WINDOW sublinear_stride=$SUBLINEAR_STRIDE sublinear_max_anchors=$SUBLINEAR_MAX_ANCHORS"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks "${AGILLM4_DBLOCKS:-4}" --dblock_schedule "${AGILLM4_DBLOCK_SCHEDULE:-loss_balanced}" \
   --dblock_warmup_steps "${AGILLM4_DBLOCK_WARMUP:-16}" --dblock_sigma_curriculum_steps "${AGILLM4_DBLOCK_SIGMA_CURRICULUM:-2000}" \
+  --dblock_log_every "${AGILLM4_DBLOCK_LOG_EVERY:-25}" --dblock_objective_mode "$DBLOCK_OBJECTIVE_MODE" \
+  --dblock_ar_prob "$DBLOCK_AR_PROB" --dblock_sat_prob "$DBLOCK_SAT_PROB" --dblock_nat_prob "$DBLOCK_NAT_PROB" \
+  --dblock_ar_loss_tokens "$DBLOCK_AR_LOSS_TOKENS" --dblock_sat_loss_tokens "$DBLOCK_SAT_LOSS_TOKENS" --dblock_nat_loss_tokens "$DBLOCK_NAT_LOSS_TOKENS" \
+  --tie_weights \
+  --batch_size "$BATCH_SIZE" --block 1280 --amp --attn_backend sublinear --sublinear_window "$SUBLINEAR_WINDOW" --sublinear_stride "$SUBLINEAR_STRIDE" --sublinear_max_anchors "$SUBLINEAR_MAX_ANCHORS" --sublinear_chunk "$SUBLINEAR_CHUNK" "${GC_FLAG[@]}" \
+  --optimizer paged_adamw8bit --sat_every "$SAT_EVERY" --nat_every "$NAT_EVERY" --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" \
   --save_every_sec 86400 --heartbeat_every_sec "${AGILLM4_HEARTBEAT_EVERY_SEC:-300}" \
+  --empty_cache_every_steps "$EMPTY_CACHE_EVERY" \
   --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1