{
  "data_dir": "../data/fineweb-edu-10B",
  "batch_size": 80,
  "seq_len": 1024,
  "max_iters": 16000,
  "gradient_accumulation_steps": 1,
  "optimizer": "muon",
  "learning_rate": 0.0006,
  "min_lr": 6e-05,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "warmup_iters": 0,
  "lr_decay_iters": 16000,
  "muon_lr": 0.01,
  "muon_min_lr": 0.002,
  "adamw_lr": 0.01,
  "adamw_min_lr": 0.00036,
  "muon_momentum": 0.95,
  "muon_nesterov": true,
  "muon_ns_steps": 5,
  "muon_weight_decay": 0.0,
  "lr_schedule": "linear_warmdown",
  "warmdown_frac": 0.4,
  "device": "cuda",
  "dtype": "bfloat16",
  "compile": true,
  "sequential_data": false,
  "n_layer": 12,
  "n_head": 8,
  "n_embd": 1024,
  "block_size": 1024,
  "vocab_size": 50304,
  "sparsity_mode": "none",
  "k1": 8,
  "k2": 64,
  "m2": null,
  "basis_k": 128,
  "r": 32,
  "hash_seed": 42,
  "deeptopk_backend": "auto",
  "rmsnorm_affine": true,
  "wf_f_min_ratio": 0.0,
  "wf_quantile_lr": 0.1,
  "wf_ema_beta": 0.9,
  "weight_sparsify": false,
  "weight_sparsify_k": 32,
  "grad_sparsify_k": 4,
  "l1l1_grad_proj": false,
  "sgd_lr": 0.01,
  "sgd_min_lr": 0.001,
  "sgd_momentum": 0.9,
  "sgd_weight_decay": 0.0,
  "cayley_levels": [],
  "cayley_per_parent_budget": false,
  "cayley_wf_f_min_ratio": 0.0,
  "cayley_wf_ema_beta": 0.99,
  "cayley_wf_bisect_iters": 15,
  "cayley_wf_warmup_batches": 10,
  "cayley_wf_batch_size": 64,
  "cayley_seed": 42,
  "cayley_backend": "auto",
  "cayley_locations": [
    "resid_mid"
  ],
  "cayley_score_standardize": false,
  "cayley_richardson_refine": false,
  "cayley_matching_pursuit": false,
  "cayley_sparse_mlp": false,
  "sparse_mlp_fan_out": 64,
  "sparse_mlp_K_out": 0,
  "sparse_mlp_activation": "relu",
  "sparse_mlp_chunk_size": 1024,
  "sparse_mlp_norm_preserve": true,
  "sparse_mlp_backend": "auto",
  "sparse_mlp_target_hidden_size": 4,
  "tied_block_init": false,
  "dead_threshold_c": 0.1,
  "resume": false,
  "out_dir": "../out/vanilla-v5",
  "eval_interval": 500,
  "eval_iters": 100,
  "log_interval": 10,
  "wandb_project": "sparse-nanogpt",
  "wandb_run_name": "vanilla-v5"
}