OpenTransformer committed on
Commit
60f0b3c
·
verified ·
1 Parent(s): c2b5995

AGILLM4_include_NAT_in_param_estimates

Browse files
Files changed (4) hide show
  1. AGILLM-4.md +3 -3
  2. README.md +1 -1
  3. estimate_agillm4_params.py +3 -1
  4. nB300_agillm4.py +2 -2
AGILLM-4.md CHANGED
@@ -22,9 +22,9 @@ Implemented presets:
22
 
23
  | Preset | Shape | Approx params | Use |
24
  | --- | --- | ---: | --- |
25
- | `agillm4_floor` | d=1280, L=28, H=20, rank=160 | ~1.05B | minimum AGILLM-4 run |
26
- | `agillm4_main` | d=1536, L=32, H=24, rank=192 | ~1.5B | main target |
27
- | `agillm4_big` | d=1792, L=36, H=28, rank=224 | ~2.1B | stretch target after memory works |
28
 
29
  Default recommendation: train `agillm4_main` if B200/B300 availability is good. On a 24GB 4090, start with `agillm4_floor` so the run is still larger than AGILLM-3 while leaving enough VRAM for AR+SAT+NAT.
30
 
 
22
 
23
  | Preset | Shape | Approx params | Use |
24
  | --- | --- | ---: | --- |
25
+ | `agillm4_floor` | d=1280, L=28, H=20, rank=160 | ~1.21B | minimum AGILLM-4 run |
26
+ | `agillm4_main` | d=1536, L=32, H=24, rank=192 | ~1.70B | main target |
27
+ | `agillm4_big` | d=1792, L=36, H=28, rank=224 | ~2.40B | stretch target after memory works |
28
 
29
  Default recommendation: train `agillm4_main` if B200/B300 availability is good. On a 24GB 4090, start with `agillm4_floor` so the run is still larger than AGILLM-3 while leaving enough VRAM for AR+SAT+NAT.
30
 
README.md CHANGED
@@ -14,7 +14,7 @@ AGILLM-4 is the next training target after AGILLM-3. The current code is a
14
  production-oriented starting point, copied from the proven single-file trainer
15
  and extended for:
16
 
17
- - >1B parameter floor preset (`agillm4_floor`) and ~1.5B main preset (`agillm4_main`)
18
  - 100 tokens per parameter target ratio, above the AGILLM-3 training ratio
19
  - longer block-size work on 24GB, B200, and B300 class GPUs
20
  - AR+SAT+NAT training, with sequential backward to reduce peak VRAM
 
14
  production-oriented starting point, copied from the proven single-file trainer
15
  and extended for:
16
 
17
+ - >1B parameter floor preset (`agillm4_floor`) and ~1.7B main preset (`agillm4_main`) with AR+SAT+NAT heads
18
  - 100 tokens per parameter target ratio, above the AGILLM-3 training ratio
19
  - longer block-size work on 24GB, B200, and B300 class GPUs
20
  - AR+SAT+NAT training, with sequential backward to reduce peak VRAM
estimate_agillm4_params.py CHANGED
@@ -25,8 +25,9 @@ def estimate(vocab: int, d: int, layers: int, heads: int, rank: int, tie_weights
25
  block = attn + ff + norms
26
  core = embed + layers * block + 2 * d
27
  ar = 0 if tie_weights else vocab * d + vocab
 
28
  sat = vocab * d + vocab + 2 * d + 2
29
- total = core + ar + sat
30
  return {
31
  "vocab": vocab,
32
  "d_model": d,
@@ -38,6 +39,7 @@ def estimate(vocab: int, d: int, layers: int, heads: int, rank: int, tie_weights
38
  "block_params_each": block,
39
  "core_params": core,
40
  "ar_head_params": ar,
 
41
  "sat_head_params": sat,
42
  "total_params": total,
43
  "tokens_at_100_to_1": total * 100,
 
25
  block = attn + ff + norms
26
  core = embed + layers * block + 2 * d
27
  ar = 0 if tie_weights else vocab * d + vocab
28
+ nat = vocab * d + vocab
29
  sat = vocab * d + vocab + 2 * d + 2
30
+ total = core + ar + sat + nat
31
  return {
32
  "vocab": vocab,
33
  "d_model": d,
 
39
  "block_params_each": block,
40
  "core_params": core,
41
  "ar_head_params": ar,
42
+ "nat_head_params": nat,
43
  "sat_head_params": sat,
44
  "total_params": total,
45
  "tokens_at_100_to_1": total * 100,
nB300_agillm4.py CHANGED
@@ -832,8 +832,8 @@ PRESETS: Dict[str, Dict[str, int]] = {
832
  "base18": dict(d=768, layers=18, heads=24, rank=96),
833
  "large": dict(d=1024, layers=24, heads=16, rank=128),
834
  # AGILLM-4 tiers. These are intentionally above the ~700M AGILLM-3 size.
835
- # Approx dense parameter count with the current untied embedding+AR+SAT heads:
836
- # agillm4_floor ~= 1.05B, agillm4_main ~= 1.50B, agillm4_big ~= 2.1B.
837
  "agillm4_floor": dict(d=1280, layers=28, heads=20, rank=160),
838
  "agillm4_main": dict(d=1536, layers=32, heads=24, rank=192),
839
  "agillm4_big": dict(d=1792, layers=36, heads=28, rank=224),
 
832
  "base18": dict(d=768, layers=18, heads=24, rank=96),
833
  "large": dict(d=1024, layers=24, heads=16, rank=128),
834
  # AGILLM-4 tiers. These are intentionally above the ~700M AGILLM-3 size.
835
+ # Approx dense parameter count with the current untied embedding+AR+SAT+NAT heads:
836
+ # agillm4_floor ~= 1.21B, agillm4_main ~= 1.70B, agillm4_big ~= 2.40B.
837
  "agillm4_floor": dict(d=1280, layers=28, heads=20, rank=160),
838
  "agillm4_main": dict(d=1536, layers=32, heads=24, rank=192),
839
  "agillm4_big": dict(d=1792, layers=36, heads=28, rank=224),