LoopLM-135M-naive / spec.json
harims95's picture
Initial release: LoopLM-135M-naive trained on FineWeb 4.6B tokens
12f0a98 verified
Raw
History Blame Contribute Delete
1.9 kB
{
"timestamp": "2026-06-28T11:27:05.869434+00:00",
"run_name": "real_naive_fineweb_5B_2gpu",
"git_commit": "unknown",
"cli_args": {
"preset": "135M",
"run_name": "real_naive_fineweb_5B_2gpu",
"data_dir": "/data/fineweb",
"train_pattern": "fineweb_train_*.bin",
"val_pattern": "fineweb_train_*.bin",
"max_steps": 20000,
"seq_len": 1024,
"batch_tokens": 262144,
"micro_batch_seqs": 32,
"val_every": 250,
"out_dir": "/data/runs",
"save_every": 2500,
"no_compile": false,
"holdout_last_for_val": true,
"set": [
"use_a_matrix=false",
"use_input_norm=false"
]
},
"train_config": {
"data_dir": "/data/fineweb_edu",
"train_pattern": "edu_fineweb_train_*.bin",
"val_pattern": "edu_fineweb_val_*.bin",
"seq_len": 1024,
"batch_tokens": 262144,
"micro_batch_seqs": 32,
"max_steps": 20000,
"warmup_steps": 100,
"cooldown_frac": 0.4,
"final_lr_frac": 0.1,
"muon_lr": 0.02,
"muon_momentum": 0.95,
"muon_wd": 0.1,
"muon_ns_steps": 5,
"adam_lr": 0.0003,
"adam_betas": [
0.9,
0.95
],
"adam_wd": 0.1,
"grad_clip": 1.0,
"val_every": 250,
"val_tokens": 10485760,
"log_every": 10,
"seed": 1337,
"compile": true,
"bf16": true,
"out_dir": "/data/runs",
"run_name": "real_naive_fineweb_5B_2gpu"
},
"model_config": {
"vocab_size": 50304,
"d_model": 1024,
"n_prelude": 4,
"n_coda": 2,
"mu_rec": 6,
"n_q_heads": 16,
"n_kv_heads": 8,
"head_dim": 64,
"qk_norm": true,
"rope_theta": 10000.0,
"dense_ffn": 2816,
"tie_embeddings": true,
"final_z_loss_coef": 0.0001,
"use_a_matrix": false,
"use_input_norm": false,
"init_std": 0.02
},
"hostname": "modal",
"gpu_count": 2,
"gpu_type": "NVIDIA H100 80GB HBM3",
"pytorch_version": "2.12.0+cu130"
}