AGILLM4_add_v47_floor_tmux_launcher

Browse files

Files changed (4) hide show

AGILLM-4.md +2 -1
README.md +3 -0
launch_agillm4_4090_floor_from_v47.sh +19 -0
nB300_agillm4.py +22 -3

AGILLM-4.md CHANGED Viewed

@@ -39,7 +39,8 @@ AGILLM4_4090_WARMSTART_FROM=/workspace/agillm-4/agillm4_floor_seed_from_v3_v47.p
 AGILLM4_4090_PRESET=agillm4_floor \
 AGILLM4_4090_BLOCK=512 \
 AGILLM4_4090_TOKEN_PARAM_RATIO=100 \
-bash /workspace/agillm-4/run_agillm4_4090_longblock.sh
 ```
 Important: `--sat_every 1 --nat_every 4` keeps SAT trained every step and NAT active on a cadence that fits 24GB cards. On B200/B300 use `--nat_every 1` for full AR+SAT+NAT every step. The AGILLM-4 code now backprops AR, SAT, and NAT sequentially, so the objective remains joint while peak VRAM is lower than holding all activation graphs at once.

 AGILLM4_4090_PRESET=agillm4_floor \
 AGILLM4_4090_BLOCK=512 \
 AGILLM4_4090_TOKEN_PARAM_RATIO=100 \
+tmux new-session -d -s agillm4_floor \
+  /workspace/agillm-4/launch_agillm4_4090_floor_from_v47.sh
 ```
 Important: `--sat_every 1 --nat_every 4` keeps SAT trained every step and NAT active on a cadence that fits 24GB cards. On B200/B300 use `--nat_every 1` for full AR+SAT+NAT every step. The AGILLM-4 code now backprops AR, SAT, and NAT sequentially, so the objective remains joint while peak VRAM is lower than holding all activation graphs at once.

README.md CHANGED Viewed

@@ -31,5 +31,8 @@ against SDPA before using it for a real run.
 On RTX 4090-class 24GB cards, `run_agillm4_4090_longblock.sh` now defaults to
 `agillm4_floor` instead of the AGILLM-3-sized `large` preset. Override
 `AGILLM4_4090_BLOCK` upward only after the first floor run is stable.
 Current harvest status from n1.py is tracked in [N1_HARVEST.md](N1_HARVEST.md).

 On RTX 4090-class 24GB cards, `run_agillm4_4090_longblock.sh` now defaults to
 `agillm4_floor` instead of the AGILLM-3-sized `large` preset. Override
 `AGILLM4_4090_BLOCK` upward only after the first floor run is stable.
+For the current v47 seed, run
+`/workspace/agillm-4/launch_agillm4_4090_floor_from_v47.sh` inside a detached
+tmux session; it appends all output to `/workspace/agillm4_floor_train.log`
+(set `AGILLM4_FLOOR_LOG` to use a different path).
 Current harvest status from n1.py is tracked in [N1_HARVEST.md](N1_HARVEST.md).

launch_agillm4_4090_floor_from_v47.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+#!/usr/bin/env bash
+# Launch the AGILLM-4 RTX 4090 floor run warm-started from the v47 seed.
+# Every AGILLM4_4090_* value below is only a default: a non-empty value
+# already present in the environment is kept as-is.
+set -Eeuo pipefail
+
+# Log destination. AGILLM4_FLOOR_LOG overrides the default path.
+LOG="${AGILLM4_FLOOR_LOG:-/workspace/agillm4_floor_train.log}"
+mkdir -p "$(dirname "$LOG")"
+# From here on, stdout and stderr are appended to the log file.
+exec >>"$LOG" 2>&1
+
+# One banner line per launch, with a UTC timestamp and the host name.
+printf 'LAUNCH_AGILLM4_4090_FLOOR_FROM_V47 %s host=%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$(hostname)"
+
+# Fill in defaults for anything the caller left unset or empty.
+: "${AGILLM4_4090_WARMSTART_FROM:=/workspace/agillm-4/agillm4_floor_seed_from_v3_v47.pt}"
+: "${AGILLM4_4090_PRESET:=agillm4_floor}"
+: "${AGILLM4_4090_BLOCK:=512}"
+: "${AGILLM4_4090_TOKEN_PARAM_RATIO:=100}"
+: "${AGILLM4_4090_NAT_EVERY:=4}"
+: "${AGILLM4_4090_NAT_MAX_TOKENS:=512}"
+: "${AGILLM4_4090_SAVE_EVERY_SEC:=21600}"
+: "${AGILLM4_4090_SAVE_DIR:=/workspace/agillm4_4090_ckpts}"
+
+# Hand the whole set to the child process in one go.
+export AGILLM4_4090_WARMSTART_FROM AGILLM4_4090_PRESET AGILLM4_4090_BLOCK
+export AGILLM4_4090_TOKEN_PARAM_RATIO AGILLM4_4090_NAT_EVERY AGILLM4_4090_NAT_MAX_TOKENS
+export AGILLM4_4090_SAVE_EVERY_SEC AGILLM4_4090_SAVE_DIR
+
+# Replace this shell with the long-block run script.
+exec /workspace/agillm-4/run_agillm4_4090_longblock.sh

nB300_agillm4.py CHANGED Viewed

@@ -1561,8 +1561,12 @@ def sat_mask(n, block=SAT_BLOCK):
 def sat_mask_cached(new_len: int, cached_len: int, block=SAT_BLOCK):
     total_len = cached_len + new_len
-    mask = torch.zeros((1, 1, new_len, total_len), device=DEV)
-    return mask
 # ───────────────────────── Checkpoint helpers ─────────────────────────
@@ -2480,9 +2484,23 @@ def infer(args):
     else:
         cached_len = ids.size(1)
         h, kvs = core(ids, sat_mask(ids.size(1)), use_cache=True, total_seq_len=cached_len)
         added = 0
         while added < args.max_new:
-            logits_all, gate = sat_h(h[:, -SAT_BLOCK:])
             stride = SAT_BLOCK if (not args.var or gate is None) else (gate.softmax(-1).multinomial(1).item() + 1)
             stride = min(int(stride), logits_all.size(1))
             new_tokens = []
@@ -2499,6 +2517,7 @@ def infer(args):
             mask = sat_mask_cached(new_ids.size(1), cached_len)
             h, kvs = core(new_ids, mask, kv_caches=kvs, use_cache=True, total_seq_len=ids.size(1))
             cached_len = ids.size(1)
     elapsed = time.time() - start
     gen_tokens = len(ids[0]) - prompt_len
     tok_per_sec = gen_tokens / elapsed if elapsed > 0 else 0

 def sat_mask_cached(new_len: int, cached_len: int, block=SAT_BLOCK):
     total_len = cached_len + new_len
+    q_idx = torch.arange(cached_len, total_len, device=DEV).unsqueeze(1)
+    k_idx = torch.arange(total_len, device=DEV).unsqueeze(0)
+    q_grp = q_idx // block
+    k_grp = k_idx // block
+    allow = q_grp >= k_grp
+    return torch.where(allow, 0.0, float("-inf")).unsqueeze(0).unsqueeze(0)
 # ───────────────────────── Checkpoint helpers ─────────────────────────
     else:
         cached_len = ids.size(1)
         h, kvs = core(ids, sat_mask(ids.size(1)), use_cache=True, total_seq_len=cached_len)
+        h_buffer = h[:, -SAT_BLOCK:]
         added = 0
+        stop = False
+        # Align to block boundary if prompt is off-boundary
+        if ids.size(1) % SAT_BLOCK != 0:
+            logits = ar_h(h)[:, -1]
+            logits = _apply_penalties(logits, ids, args.penalty_last_n, args.repetition_penalty, args.presence_penalty, args.frequency_penalty)
+            nxt = _sample(logits, args.temperature, args.top_k, args.top_p, args.min_p, args.greedy)
+            ids = torch.cat([ids, nxt], 1)
+            added += 1
+            h, kvs = core(nxt, None, kv_caches=kvs, use_cache=True, total_seq_len=ids.size(1))
+            cached_len = ids.size(1)
+            h_buffer = torch.cat([h_buffer, h], dim=1)[:, -SAT_BLOCK:]
         while added < args.max_new:
+            logits_all, gate = sat_h(h_buffer)
             stride = SAT_BLOCK if (not args.var or gate is None) else (gate.softmax(-1).multinomial(1).item() + 1)
             stride = min(int(stride), logits_all.size(1))
             new_tokens = []
             mask = sat_mask_cached(new_ids.size(1), cached_len)
             h, kvs = core(new_ids, mask, kv_caches=kvs, use_cache=True, total_seq_len=ids.size(1))
             cached_len = ids.size(1)
+            h_buffer = torch.cat([h_buffer, h], dim=1)[:, -SAT_BLOCK:]
     elapsed = time.time() - start
     gen_tokens = len(ids[0]) - prompt_len
     tok_per_sec = gen_tokens / elapsed if elapsed > 0 else 0