{ "timestamp": "2026-06-28T11:27:05.869434+00:00", "run_name": "real_naive_fineweb_5B_2gpu", "git_commit": "unknown", "cli_args": { "preset": "135M", "run_name": "real_naive_fineweb_5B_2gpu", "data_dir": "/data/fineweb", "train_pattern": "fineweb_train_*.bin", "val_pattern": "fineweb_train_*.bin", "max_steps": 20000, "seq_len": 1024, "batch_tokens": 262144, "micro_batch_seqs": 32, "val_every": 250, "out_dir": "/data/runs", "save_every": 2500, "no_compile": false, "holdout_last_for_val": true, "set": [ "use_a_matrix=false", "use_input_norm=false" ] }, "train_config": { "data_dir": "/data/fineweb_edu", "train_pattern": "edu_fineweb_train_*.bin", "val_pattern": "edu_fineweb_val_*.bin", "seq_len": 1024, "batch_tokens": 262144, "micro_batch_seqs": 32, "max_steps": 20000, "warmup_steps": 100, "cooldown_frac": 0.4, "final_lr_frac": 0.1, "muon_lr": 0.02, "muon_momentum": 0.95, "muon_wd": 0.1, "muon_ns_steps": 5, "adam_lr": 0.0003, "adam_betas": [ 0.9, 0.95 ], "adam_wd": 0.1, "grad_clip": 1.0, "val_every": 250, "val_tokens": 10485760, "log_every": 10, "seed": 1337, "compile": true, "bf16": true, "out_dir": "/data/runs", "run_name": "real_naive_fineweb_5B_2gpu" }, "model_config": { "vocab_size": 50304, "d_model": 1024, "n_prelude": 4, "n_coda": 2, "mu_rec": 6, "n_q_heads": 16, "n_kv_heads": 8, "head_dim": 64, "qk_norm": true, "rope_theta": 10000.0, "dense_ffn": 2816, "tie_embeddings": true, "final_z_loss_coef": 0.0001, "use_a_matrix": false, "use_input_norm": false, "init_std": 0.02 }, "hostname": "modal", "gpu_count": 2, "gpu_type": "NVIDIA H100 80GB HBM3", "pytorch_version": "2.12.0+cu130" }