AGILLM4_include_NAT_in_param_estimates
Browse files
- AGILLM-4.md +3 -3
- README.md +1 -1
- estimate_agillm4_params.py +3 -1
- nB300_agillm4.py +2 -2
AGILLM-4.md
CHANGED
|
@@ -22,9 +22,9 @@ Implemented presets:
|
|
| 22 |
|
| 23 |
| Preset | Shape | Approx params | Use |
|
| 24 |
| --- | --- | ---: | --- |
|
| 25 |
-
| `agillm4_floor` | d=1280, L=28, H=20, rank=160 | ~1.
|
| 26 |
-
| `agillm4_main` | d=1536, L=32, H=24, rank=192 | ~1.
|
| 27 |
-
| `agillm4_big` | d=1792, L=36, H=28, rank=224 | ~2.
|
| 28 |
|
| 29 |
Default recommendation: train `agillm4_main` if B200/B300 availability is good. On a 24GB 4090, start with `agillm4_floor` so the run is still larger than AGILLM-3 while leaving enough VRAM for AR+SAT+NAT.
|
| 30 |
|
|
|
|
| 22 |
|
| 23 |
| Preset | Shape | Approx params | Use |
|
| 24 |
| --- | --- | ---: | --- |
|
| 25 |
+
| `agillm4_floor` | d=1280, L=28, H=20, rank=160 | ~1.21B | minimum AGILLM-4 run |
|
| 26 |
+
| `agillm4_main` | d=1536, L=32, H=24, rank=192 | ~1.70B | main target |
|
| 27 |
+
| `agillm4_big` | d=1792, L=36, H=28, rank=224 | ~2.40B | stretch target after memory works |
|
| 28 |
|
| 29 |
Default recommendation: train `agillm4_main` if B200/B300 availability is good. On a 24GB 4090, start with `agillm4_floor` so the run is still larger than AGILLM-3 while leaving enough VRAM for AR+SAT+NAT.
|
| 30 |
|
README.md
CHANGED
|
@@ -14,7 +14,7 @@ AGILLM-4 is the next training target after AGILLM-3. The current code is a
|
|
| 14 |
production-oriented starting point, copied from the proven single-file trainer
|
| 15 |
and extended for:
|
| 16 |
|
| 17 |
-
- >1B parameter floor preset (`agillm4_floor`) and ~1.
|
| 18 |
- 100 tokens per parameter target ratio, above the AGILLM-3 training ratio
|
| 19 |
- longer block-size work on 24GB, B200, and B300 class GPUs
|
| 20 |
- AR+SAT+NAT training, with sequential backward to reduce peak VRAM
|
|
|
|
| 14 |
production-oriented starting point, copied from the proven single-file trainer
|
| 15 |
and extended for:
|
| 16 |
|
| 17 |
+
- >1B parameter floor preset (`agillm4_floor`) and ~1.7B main preset (`agillm4_main`) with AR+SAT+NAT heads
|
| 18 |
- 100 tokens per parameter target ratio, above the AGILLM-3 training ratio
|
| 19 |
- longer block-size work on 24GB, B200, and B300 class GPUs
|
| 20 |
- AR+SAT+NAT training, with sequential backward to reduce peak VRAM
|
estimate_agillm4_params.py
CHANGED
|
@@ -25,8 +25,9 @@ def estimate(vocab: int, d: int, layers: int, heads: int, rank: int, tie_weights
|
|
| 25 |
block = attn + ff + norms
|
| 26 |
core = embed + layers * block + 2 * d
|
| 27 |
ar = 0 if tie_weights else vocab * d + vocab
|
|
|
|
| 28 |
sat = vocab * d + vocab + 2 * d + 2
|
| 29 |
-
total = core + ar + sat
|
| 30 |
return {
|
| 31 |
"vocab": vocab,
|
| 32 |
"d_model": d,
|
|
@@ -38,6 +39,7 @@ def estimate(vocab: int, d: int, layers: int, heads: int, rank: int, tie_weights
|
|
| 38 |
"block_params_each": block,
|
| 39 |
"core_params": core,
|
| 40 |
"ar_head_params": ar,
|
|
|
|
| 41 |
"sat_head_params": sat,
|
| 42 |
"total_params": total,
|
| 43 |
"tokens_at_100_to_1": total * 100,
|
|
|
|
| 25 |
block = attn + ff + norms
|
| 26 |
core = embed + layers * block + 2 * d
|
| 27 |
ar = 0 if tie_weights else vocab * d + vocab
|
| 28 |
+
nat = vocab * d + vocab
|
| 29 |
sat = vocab * d + vocab + 2 * d + 2
|
| 30 |
+
total = core + ar + sat + nat
|
| 31 |
return {
|
| 32 |
"vocab": vocab,
|
| 33 |
"d_model": d,
|
|
|
|
| 39 |
"block_params_each": block,
|
| 40 |
"core_params": core,
|
| 41 |
"ar_head_params": ar,
|
| 42 |
+
"nat_head_params": nat,
|
| 43 |
"sat_head_params": sat,
|
| 44 |
"total_params": total,
|
| 45 |
"tokens_at_100_to_1": total * 100,
|
nB300_agillm4.py
CHANGED
|
@@ -832,8 +832,8 @@ PRESETS: Dict[str, Dict[str, int]] = {
|
|
| 832 |
"base18": dict(d=768, layers=18, heads=24, rank=96),
|
| 833 |
"large": dict(d=1024, layers=24, heads=16, rank=128),
|
| 834 |
# AGILLM-4 tiers. These are intentionally above the ~700M AGILLM-3 size.
|
| 835 |
-
# Approx dense parameter count with the current untied embedding+AR+SAT heads:
|
| 836 |
-
# agillm4_floor ~= 1.
|
| 837 |
"agillm4_floor": dict(d=1280, layers=28, heads=20, rank=160),
|
| 838 |
"agillm4_main": dict(d=1536, layers=32, heads=24, rank=192),
|
| 839 |
"agillm4_big": dict(d=1792, layers=36, heads=28, rank=224),
|
|
|
|
| 832 |
"base18": dict(d=768, layers=18, heads=24, rank=96),
|
| 833 |
"large": dict(d=1024, layers=24, heads=16, rank=128),
|
| 834 |
# AGILLM-4 tiers. These are intentionally above the ~700M AGILLM-3 size.
|
| 835 |
+
# Approx dense parameter count with the current untied embedding+AR+SAT+NAT heads:
|
| 836 |
+
# agillm4_floor ~= 1.21B, agillm4_main ~= 1.70B, agillm4_big ~= 2.40B.
|
| 837 |
"agillm4_floor": dict(d=1280, layers=28, heads=20, rank=160),
|
| 838 |
"agillm4_main": dict(d=1536, layers=32, heads=24, rank=192),
|
| 839 |
"agillm4_big": dict(d=1792, layers=36, heads=28, rank=224),
|