Add DBlock profiler and speed-tuned batch config

Files changed (4) hide show

README.md +4 -0
dblocks_train.py +108 -9
nB300_agillm4_vram_dblock.py +26 -0
relaunch_agillm4_dblock_sg2.sh +4 -3

README.md CHANGED Viewed

@@ -85,4 +85,8 @@ DiffusionBlocks, gradient-checkpointed blocks, tied heads, and structured masks.
 Sublinear coverage update 2026-05-29: the saved AGILLM-4 trainer snapshot now matches the live v2 sparse global memory path. It fixes gathered ALiBi distance, suppresses duplicate local/anchor candidates before softmax, uses hybrid full-span + recent-tail anchors with explicit `--sublinear_sinks` and `--sublinear_recent_anchors`, and includes optional pooled K/V landmark summaries behind `--sublinear_pooled_landmarks`. At the live 128/128/128 profile it keeps deep-past coverage while preserving recent anchors and the same VRAM-first key budget. See `sublinear_improved_snippet.py` for the minimal blocks and `sublinear_improved.py` for the coverage demo/standalone selector.
 License: Apache-2.0 (matching the upstream method).

 Sublinear coverage update 2026-05-29: the saved AGILLM-4 trainer snapshot now matches the live v2 sparse global memory path. It fixes gathered ALiBi distance, suppresses duplicate local/anchor candidates before softmax, uses hybrid full-span + recent-tail anchors with explicit `--sublinear_sinks` and `--sublinear_recent_anchors`, and includes optional pooled K/V landmark summaries behind `--sublinear_pooled_landmarks`. At the live 128/128/128 profile it keeps deep-past coverage while preserving recent anchors and the same VRAM-first key budget. See `sublinear_improved_snippet.py` for the minimal blocks and `sublinear_improved.py` for the coverage demo/standalone selector.
+Profiling/speed update 2026-05-29: added in-process DBlock profiling (`--profile_steps`, `--profile_log_every`) after external ptrace profiling was blocked on Vast. The profile showed that the bottleneck is transformer recompute/backward, not fused CE or the optimizer: at B=2 with full checkpointing, AR backward averaged ~605 ms/step, AR forward ~184 ms, CE ~4.5 ms, and the optimizer ~17 ms. Speed levers tested live: disabling checkpointing OOMed at B=2 and fell back to B=1; selective checkpointing with stride=2 fit but hugged the VRAM limit and reached ~2.94k tok/s; B=5/6 hit a memory-pressure cliff; B=4 with full DBlock checkpointing was the best stable official setting (~3.0k tok/s warm window, ~13.2 GB tensor peak / ~17.6 GB reserved, ETA ~269-275 days). The live relaunch now uses `--batch_size 4 --grad_checkpoint --dblock_checkpoint_stride 1` and leaves selective checkpointing available for future context/batch tradeoffs.
 License: Apache-2.0 (matching the upstream method).

dblocks_train.py CHANGED Viewed

@@ -7,6 +7,8 @@ Lazy-imports nB300 inside functions to avoid a circular import.
 """
 import math
 import random
 import numpy as np
 import torch
 import torch.nn as nn
@@ -17,6 +19,63 @@ from fused_ce import fused_ce
 SD = 0.5
 def _cdf(x):
     return 0.5 * (1 + math.erf(x / math.sqrt(2)))
@@ -129,6 +188,17 @@ def _run_block(block, x, mask, use_checkpoint):
     return block(x, mask)
 def _sample_token_loss_inputs(hidden, targets, max_tokens):
     max_tokens = int(max_tokens or 0)
     if max_tokens <= 0:
@@ -173,9 +243,12 @@ def _choose_objectives(state, args, ar_weight, sat_weight, nat_weight, do_sat_pe
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
     B = state["B"]
     asg = state["assign"]
     bs = state["bsig"]
@@ -206,6 +279,7 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     run_ar, run_sat, run_nat, objective = _choose_objectives(
         state, args, ar_weight, sat_weight, nat_weight, do_sat_periodic, do_nat_periodic
     )
     ar_val = 0.0
     sat_val = 0.0
@@ -213,34 +287,44 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     if run_ar:
         causal = M.causal_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb = core.emb(ids)
             zt = emb + sig[:, None, None] * torch.randn_like(emb)
             h = ci * zt
-            for li in layers:
-                h = _run_block(core.blocks[li], h, causal, use_layer_checkpoint)
             Dn = core.ln(cs * zt + co * h)
         ar_hidden, ar_targets, ar_used, ar_total = _sample_token_loss_inputs(
             Dn[:, :-1], ids[:, 1:], int(getattr(args, "dblock_ar_loss_tokens", 0))
         )
         ar = ar_weight * w * fused_ce(ar_hidden, ar_h.proj.weight, ar_targets)
         ar_val = float(ar.detach())
         scaler.scale(ar).backward()
         del causal, emb, zt, h, Dn, ar_hidden, ar_targets, ar, ar_used, ar_total
     if run_sat:
         smask = M.sat_mask(T, structured=M.use_structured_masks(args))
         with M.amp(args.amp):
             emb2 = core.emb(ids)
             zt2 = emb2 + sig[:, None, None] * torch.randn_like(emb2)
             h2 = ci * zt2
-            for li in layers:
-                h2 = _run_block(core.blocks[li], h2, smask, use_layer_checkpoint)
             Ds = core.ln(cs * zt2 + co * h2)
             last = Ds[:, -SATB:]
-            sat_hidden, sat_targets, sat_used, sat_total = _sample_token_loss_inputs(
-                last, ids[:, 1 : SATB + 1], int(getattr(args, "dblock_sat_loss_tokens", 0))
-            )
             satf = fused_ce(sat_hidden, sat_h.proj.weight, sat_targets)
             satv = (
                 M.EMIT_LAMBDA
@@ -252,13 +336,17 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
                 else 0.0
             )
             sat = sat_weight * w * (satf + satv)
         sat_val = float(sat.detach())
         scaler.scale(sat).backward()
         del smask, emb2, zt2, h2, Ds, last, sat_hidden, sat_targets, satf, satv, sat
     if run_nat:
         ratio = min(max(float(getattr(args, "nat_mask_ratio", 0.5)), 0.05), 0.95)
         nat_ids = M._nat_ids_for_training(ids, int(getattr(args, "nat_max_tokens", 0)))
         with M.amp(args.amp):
             nat_in = nat_ids.clone()
             m = torch.rand(nat_ids.shape, device=nat_ids.device) < ratio
@@ -266,9 +354,11 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
                 m[..., -1] = True
             nat_in[m] = M.BLANK
             hn = core.emb(nat_in)
-            for li in layers:
-                hn = _run_block(core.blocks[li], hn, None, use_layer_checkpoint)
             Dnat = core.ln(hn)
         nat_hidden = Dnat[m]
         nat_targets = nat_ids[m]
         nat_hidden, nat_targets, nat_used, nat_total = _sample_token_loss_inputs(
@@ -276,7 +366,10 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
         )
         nat = nat_weight * fused_ce(nat_hidden, nat_h.proj.weight, nat_targets)
         nat_val = float(nat.detach())
         scaler.scale(nat).backward()
         del nat_ids, nat_in, m, hn, Dnat, nat_hidden, nat_targets, nat, nat_used, nat_total
     total_val = ar_val + sat_val + nat_val
@@ -285,20 +378,26 @@ def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         print(f"[dblock] non-finite loss {total_val}; skipped optimizer step", flush=True)
         _update_stats(state, bi, total_val)
         return total_val
     scaler.unscale_(opt)
     nn.utils.clip_grad_norm_([p for g in opt.param_groups for p in g["params"]], 1.0)
     scaler.step(opt)
     scaler.update()
     opt.zero_grad(set_to_none=True)
     peak_alloc = None
     peak_reserved = None
     if torch.cuda.is_available():
         peak_alloc = torch.cuda.max_memory_allocated() / (1024**3)
         peak_reserved = torch.cuda.max_memory_reserved() / (1024**3)
     _update_stats(state, bi, total_val)
     _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved, objective=objective)
     return total_val

 """
 import math
 import random
+import time
+from collections import defaultdict
 import numpy as np
 import torch
 import torch.nn as nn
 SD = 0.5
def _profile_active(state, args):
    """Return True while the in-process profiler still has steps to record.

    Profiling is on when ``args.profile_steps`` is a positive integer and the
    step count stored in ``state["profile_n"]`` (0 when absent) is still below
    it. A missing, ``None`` or zero ``profile_steps`` disables profiling.
    """
    budget = int(getattr(args, "profile_steps", 0) or 0)
    if budget <= 0:
        return False
    done = int(state.get("profile_n", 0))
    return done < budget
def _profile_add(state, name, seconds):
    """Accumulate ``seconds`` into the profiler bucket called ``name``.

    Buckets are kept in ``state["profile_times"]``, which is created as a
    ``defaultdict(float)`` the first time a value is recorded. A ``seconds``
    of ``None`` is ignored and leaves ``state`` untouched.
    """
    if seconds is not None:
        buckets = state.setdefault("profile_times", defaultdict(float))
        buckets[name] = buckets[name] + float(seconds)
def _profile_tic(enabled):
    """Start a profiler timer and return its start timestamp.

    Returns ``None`` when ``enabled`` is falsy. Otherwise, if CUDA is
    available, the device is synchronized first, and the current
    ``time.perf_counter()`` reading is returned.
    """
    if enabled:
        if torch.cuda.is_available():
            # Drain work already queued on the GPU before taking the start time.
            torch.cuda.synchronize()
        return time.perf_counter()
    return None
def _profile_toc(state, name, start):
    """Stop a profiler timer and add the elapsed time to bucket ``name``.

    ``start`` is a ``time.perf_counter()`` reading, or ``None`` for a timer
    that was never started; in the ``None`` case nothing is recorded. When
    CUDA is available the device is synchronized before the end time is read.
    """
    if start is not None:
        if torch.cuda.is_available():
            # Wait for queued GPU work before reading the end time.
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        _profile_add(state, name, elapsed)
def _profile_step_done(state, args):
    """Count one profiled step and periodically print averaged timings.

    Does nothing when ``args.profile_steps`` is missing or non-positive, or
    once ``state["profile_n"]`` has reached that limit. Otherwise the counter
    is incremented, and a ``[profile]`` line is printed every
    ``args.profile_log_every`` steps (default 25, minimum 1) and on the final
    profiled step. Each reported figure is the bucket total from
    ``state["profile_times"]`` divided by the number of profiled steps so far,
    in milliseconds; buckets averaging 0.01 ms or less are omitted.
    """
    budget = int(getattr(args, "profile_steps", 0) or 0)
    if budget <= 0:
        return
    counted = int(state.get("profile_n", 0))
    if counted >= budget:
        return
    counted += 1
    state["profile_n"] = counted

    interval = max(1, int(getattr(args, "profile_log_every", 25) or 25))
    due = counted % interval == 0 or counted == budget
    if not due:
        return

    totals = state.get("profile_times", {})
    # Fixed report order: input pipeline, setup, per-objective phases, optimizer, total.
    order = (
        "data_stream", "tensor", "setup",
        "ar_forward", "ar_ce", "ar_backward",
        "sat_forward", "sat_ce", "sat_backward",
        "nat_forward", "nat_ce", "nat_backward",
        "opt_step", "step_total",
    )
    divisor = max(1, counted)
    averages = (
        (label, float(totals.get(label, 0.0)) * 1000.0 / divisor)
        for label in order
    )
    shown = [f"{label}={ms:.2f}ms" for label, ms in averages if ms > 0.01]
    print(f"[profile] n={counted}/{budget} avg " + " ".join(shown), flush=True)
 def _cdf(x):
     return 0.5 * (1 + math.erf(x / math.sqrt(2)))
     return block(x, mask)
def _dblock_checkpoint_this_layer(args, base_enabled, layer_pos):
    """Decide whether the layer at ``layer_pos`` should be checkpointed.

    Args:
        args: Namespace-like object; ``args.dblock_checkpoint_stride`` is read
            (treated as 1 when missing or ``None``).
        base_enabled: Master switch; when falsy no layer is checkpointed.
        layer_pos: Zero-based position of the layer within the selected
            block's layer list.

    Returns:
        ``True`` if this layer should be run under gradient checkpointing.
        A stride of 1 selects every layer, a stride of N > 1 selects positions
        0, N, 2N, ..., and a stride of 0 or less selects none.
    """
    if not base_enabled:
        return False
    raw_stride = getattr(args, "dblock_checkpoint_stride", 1)
    # Only a missing/None value falls back to 1. The earlier `value or 1`
    # form also rewrote an explicit 0 to 1, so the "0=off" setting from the
    # --dblock_checkpoint_stride help text checkpointed every layer instead.
    stride = 1 if raw_stride is None else int(raw_stride)
    if stride <= 0:
        return False
    # For stride == 1 the remainder is always 0, so every layer is selected.
    return int(layer_pos) % stride == 0
 def _sample_token_loss_inputs(hidden, targets, max_tokens):
     max_tokens = int(max_tokens or 0)
     if max_tokens <= 0:
 def _dblock_step(core, ar_h, sat_h, nat_h, opt, scaler, args, ids, state):
     import nB300_agillm4 as M
+    prof = _profile_active(state, args)
+    _step_t = _profile_tic(prof)
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
+    _setup_t = _profile_tic(prof)
     B = state["B"]
     asg = state["assign"]
     bs = state["bsig"]
     run_ar, run_sat, run_nat, objective = _choose_objectives(
         state, args, ar_weight, sat_weight, nat_weight, do_sat_periodic, do_nat_periodic
     )
+    _profile_toc(state, "setup", _setup_t)
     ar_val = 0.0
     sat_val = 0.0
     if run_ar:
         causal = M.causal_mask(T, structured=M.use_structured_masks(args))
+        _t = _profile_tic(prof)
         with M.amp(args.amp):
             emb = core.emb(ids)
             zt = emb + sig[:, None, None] * torch.randn_like(emb)
             h = ci * zt
+            for lpos, li in enumerate(layers):
+                h = _run_block(core.blocks[li], h, causal, _dblock_checkpoint_this_layer(args, use_layer_checkpoint, lpos))
             Dn = core.ln(cs * zt + co * h)
+        _profile_toc(state, "ar_forward", _t)
+        _t = _profile_tic(prof)
         ar_hidden, ar_targets, ar_used, ar_total = _sample_token_loss_inputs(
             Dn[:, :-1], ids[:, 1:], int(getattr(args, "dblock_ar_loss_tokens", 0))
         )
         ar = ar_weight * w * fused_ce(ar_hidden, ar_h.proj.weight, ar_targets)
         ar_val = float(ar.detach())
+        _profile_toc(state, "ar_ce", _t)
+        _t = _profile_tic(prof)
         scaler.scale(ar).backward()
+        _profile_toc(state, "ar_backward", _t)
         del causal, emb, zt, h, Dn, ar_hidden, ar_targets, ar, ar_used, ar_total
     if run_sat:
         smask = M.sat_mask(T, structured=M.use_structured_masks(args))
+        _t = _profile_tic(prof)
         with M.amp(args.amp):
             emb2 = core.emb(ids)
             zt2 = emb2 + sig[:, None, None] * torch.randn_like(emb2)
             h2 = ci * zt2
+            for lpos, li in enumerate(layers):
+                h2 = _run_block(core.blocks[li], h2, smask, _dblock_checkpoint_this_layer(args, use_layer_checkpoint, lpos))
             Ds = core.ln(cs * zt2 + co * h2)
             last = Ds[:, -SATB:]
+        _profile_toc(state, "sat_forward", _t)
+        _t = _profile_tic(prof)
+        sat_hidden, sat_targets, sat_used, sat_total = _sample_token_loss_inputs(
+            last, ids[:, 1 : SATB + 1], int(getattr(args, "dblock_sat_loss_tokens", 0))
+        )
+        with M.amp(args.amp):
             satf = fused_ce(sat_hidden, sat_h.proj.weight, sat_targets)
             satv = (
                 M.EMIT_LAMBDA
                 else 0.0
             )
             sat = sat_weight * w * (satf + satv)
+        _profile_toc(state, "sat_ce", _t)
         sat_val = float(sat.detach())
+        _t = _profile_tic(prof)
         scaler.scale(sat).backward()
+        _profile_toc(state, "sat_backward", _t)
         del smask, emb2, zt2, h2, Ds, last, sat_hidden, sat_targets, satf, satv, sat
     if run_nat:
         ratio = min(max(float(getattr(args, "nat_mask_ratio", 0.5)), 0.05), 0.95)
         nat_ids = M._nat_ids_for_training(ids, int(getattr(args, "nat_max_tokens", 0)))
+        _t = _profile_tic(prof)
         with M.amp(args.amp):
             nat_in = nat_ids.clone()
             m = torch.rand(nat_ids.shape, device=nat_ids.device) < ratio
                 m[..., -1] = True
             nat_in[m] = M.BLANK
             hn = core.emb(nat_in)
+            for lpos, li in enumerate(layers):
+                hn = _run_block(core.blocks[li], hn, None, _dblock_checkpoint_this_layer(args, use_layer_checkpoint, lpos))
             Dnat = core.ln(hn)
+        _profile_toc(state, "nat_forward", _t)
+        _t = _profile_tic(prof)
         nat_hidden = Dnat[m]
         nat_targets = nat_ids[m]
         nat_hidden, nat_targets, nat_used, nat_total = _sample_token_loss_inputs(
         )
         nat = nat_weight * fused_ce(nat_hidden, nat_h.proj.weight, nat_targets)
         nat_val = float(nat.detach())
+        _profile_toc(state, "nat_ce", _t)
+        _t = _profile_tic(prof)
         scaler.scale(nat).backward()
+        _profile_toc(state, "nat_backward", _t)
         del nat_ids, nat_in, m, hn, Dnat, nat_hidden, nat_targets, nat, nat_used, nat_total
     total_val = ar_val + sat_val + nat_val
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         print(f"[dblock] non-finite loss {total_val}; skipped optimizer step", flush=True)
+        _profile_toc(state, "step_total", _step_t)
+        _profile_step_done(state, args)
         _update_stats(state, bi, total_val)
         return total_val
+    _t = _profile_tic(prof)
     scaler.unscale_(opt)
     nn.utils.clip_grad_norm_([p for g in opt.param_groups for p in g["params"]], 1.0)
     scaler.step(opt)
     scaler.update()
     opt.zero_grad(set_to_none=True)
+    _profile_toc(state, "opt_step", _t)
     peak_alloc = None
     peak_reserved = None
     if torch.cuda.is_available():
         peak_alloc = torch.cuda.max_memory_allocated() / (1024**3)
         peak_reserved = torch.cuda.max_memory_reserved() / (1024**3)
+    _profile_toc(state, "step_total", _step_t)
+    _profile_step_done(state, args)
     _update_stats(state, bi, total_val)
     _maybe_log(state, args, bi, layers, ar_val, sat_val, nat_val, total_val, peak_alloc, peak_reserved, objective=objective)
     return total_val

nB300_agillm4_vram_dblock.py CHANGED Viewed

@@ -2324,17 +2324,37 @@ def _train_phase(
         except Exception:
             pass
     while seen_tok < total_tokens_needed:
         try:
             while len(buf) < BLOCK:
                 buf.append(next(stream))
         except StopIteration:
             break
         seq = buf[:BLOCK]
         buf = buf[BLOCK:]
         batch_accum.append(seq)
         if len(batch_accum) < BATCH:
             continue
         ids = torch.tensor(batch_accum, device=DEV)
         batch_accum = []
         tgt_ar = ids.clone()
         try:
@@ -2980,6 +3000,10 @@ def main():
                     help="Print lightweight trainer heartbeat/status lines every N seconds; 0 disables.")
     tr.add_argument("--empty_cache_every_steps", type=int, default=0,
                     help="Call torch.cuda.empty_cache() every N train steps; useful for VRAM-first runs where lower reserved VRAM matters more than speed.")
     tr.add_argument("--delta_every_steps", type=int, default=DEFAULT_DELTA_STEPS, help="Weight-only delta save every N steps (0=off)")
     tr.add_argument("--delta_max_keep", type=int, default=DEFAULT_MAX_DELTAS, help="Max delta checkpoints to keep")
     tr.add_argument("--resume_delta", type=str, help="Resume from a delta (weight-only, no optimizer state)")
@@ -3013,6 +3037,8 @@ def main():
                     help="Exploration rate for loss-balanced DBlock scheduling.")
     tr.add_argument("--dblock_log_every", type=int, default=25,
                     help="Print DBlock block/loss/VRAM diagnostics every N DBlock steps; 0 disables.")
     tr.add_argument("--dblock_sigma_curriculum_steps", type=int, default=2000,
                     help="Warm sigma ranges from easy to full span over this many DBlock steps; 0 disables.")
     tr.add_argument("--dblock_edm_wmax", type=float, default=5.0,

         except Exception:
             pass
     while seen_tok < total_tokens_needed:
+        _profile_batch = _DBS is not None and int(getattr(args, "profile_steps", 0) or 0) > 0 and int(_DBS.get("profile_n", 0)) < int(getattr(args, "profile_steps", 0) or 0)
+        _data_t = time.perf_counter() if _profile_batch else None
         try:
             while len(buf) < BLOCK:
                 buf.append(next(stream))
         except StopIteration:
             break
+        if _profile_batch:
+            try:
+                import dblocks_train as _db_prof
+                _db_prof._profile_add(_DBS, "data_stream", time.perf_counter() - _data_t)
+            except Exception:
+                pass
         seq = buf[:BLOCK]
         buf = buf[BLOCK:]
         batch_accum.append(seq)
         if len(batch_accum) < BATCH:
             continue
+        _tensor_t = time.perf_counter() if _profile_batch else None
         ids = torch.tensor(batch_accum, device=DEV)
+        if _profile_batch:
+            if DEV.type == "cuda":
+                try:
+                    torch.cuda.synchronize()
+                except Exception:
+                    pass
+            try:
+                import dblocks_train as _db_prof
+                _db_prof._profile_add(_DBS, "tensor", time.perf_counter() - _tensor_t)
+            except Exception:
+                pass
         batch_accum = []
         tgt_ar = ids.clone()
         try:
                     help="Print lightweight trainer heartbeat/status lines every N seconds; 0 disables.")
     tr.add_argument("--empty_cache_every_steps", type=int, default=0,
                     help="Call torch.cuda.empty_cache() every N train steps; useful for VRAM-first runs where lower reserved VRAM matters more than speed.")
+    tr.add_argument("--profile_steps", type=int, default=0,
+                    help="Profile the first N DBlock training steps with in-process CUDA timers; 0 disables.")
+    tr.add_argument("--profile_log_every", type=int, default=25,
+                    help="Print averaged profiler timings every N profiled steps.")
     tr.add_argument("--delta_every_steps", type=int, default=DEFAULT_DELTA_STEPS, help="Weight-only delta save every N steps (0=off)")
     tr.add_argument("--delta_max_keep", type=int, default=DEFAULT_MAX_DELTAS, help="Max delta checkpoints to keep")
     tr.add_argument("--resume_delta", type=str, help="Resume from a delta (weight-only, no optimizer state)")
                     help="Exploration rate for loss-balanced DBlock scheduling.")
     tr.add_argument("--dblock_log_every", type=int, default=25,
                     help="Print DBlock block/loss/VRAM diagnostics every N DBlock steps; 0 disables.")
+    tr.add_argument("--dblock_checkpoint_stride", type=int, default=1,
+                    help="With --grad_checkpoint in --dblock mode, checkpoint one layer every N selected block layers; 1=all layers, 2=alternate, 0=off.")
     tr.add_argument("--dblock_sigma_curriculum_steps", type=int, default=2000,
                     help="Warm sigma ranges from easy to full span over this many DBlock steps; 0 disables.")
     tr.add_argument("--dblock_edm_wmax", type=float, default=5.0,

relaunch_agillm4_dblock_sg2.sh CHANGED Viewed

@@ -10,15 +10,16 @@ export AGILLM_ATTN_BACKEND=sublinear
 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
 exec >> /workspace/agillm4_floor_train.log 2>&1
-echo "RELAUNCH_AGILLM4_DBLOCK_SG2 $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT (improved sublinear v2: ALiBi distance + dedupe + hybrid anchors + sinks)"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks 4 --dblock_schedule loss_balanced --dblock_warmup_steps 16 \
   --dblock_sigma_curriculum_steps 2000 --dblock_log_every 25 --dblock_objective_mode stochastic \
   --dblock_ar_prob 0.85 --dblock_sat_prob 0.075 --dblock_nat_prob 0.075 \
   --dblock_ar_loss_tokens 512 --dblock_sat_loss_tokens 0 --dblock_nat_loss_tokens 512 \
-  --tie_weights --batch_size 2 --block 1280 --amp --attn_backend sublinear \
   --sublinear_window 128 --sublinear_stride 128 --sublinear_max_anchors 128 --sublinear_chunk 128 \
   --sublinear_sinks 4 --sublinear_recent_anchors 64 --no-sublinear_pooled_landmarks \
-  --grad_checkpoint --optimizer paged_adamw8bit --sat_every 4 --nat_every 4 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" --save_every_sec 86400 --heartbeat_every_sec 300 \
   --empty_cache_every_steps 0 --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1

 SAVE_DIR=/workspace/agillm4_4090_ckpts
 CKPT="$(ls -1t "$SAVE_DIR"/pretrain_step*.pt 2>/dev/null | head -1)"
 exec >> /workspace/agillm4_floor_train.log 2>&1
+echo "RELAUNCH_AGILLM4_DBLOCK_SG2 $(date -u +%Y-%m-%dT%H:%M:%SZ) resume=$CKPT (batch4 official speed-optimized + sublinear v2)"
 exec python -u nB300_agillm4.py train --preset agillm4_floor --resume "$CKPT" \
   --dblock --dblock_blocks 4 --dblock_schedule loss_balanced --dblock_warmup_steps 16 \
   --dblock_sigma_curriculum_steps 2000 --dblock_log_every 25 --dblock_objective_mode stochastic \
   --dblock_ar_prob 0.85 --dblock_sat_prob 0.075 --dblock_nat_prob 0.075 \
   --dblock_ar_loss_tokens 512 --dblock_sat_loss_tokens 0 --dblock_nat_loss_tokens 512 \
+  --tie_weights --batch_size 4 --block 1280 --amp --attn_backend sublinear \
   --sublinear_window 128 --sublinear_stride 128 --sublinear_max_anchors 128 --sublinear_chunk 128 \
   --sublinear_sinks 4 --sublinear_recent_anchors 64 --no-sublinear_pooled_landmarks \
+  --grad_checkpoint --dblock_checkpoint_stride 1 --optimizer paged_adamw8bit --sat_every 4 --nat_every 4 --nat_max_tokens 768 --nat_mask_ratio 0.5 \
   --token_param_ratio 100 --save_dir "$SAVE_DIR" --save_every_sec 86400 --heartbeat_every_sec 300 \
+  \
   --empty_cache_every_steps 0 --delta_every_steps 25000 --delta_max_keep 1 --max_ckpts 1