{ "data_dir": "../data/fineweb-edu-10B", "batch_size": 80, "seq_len": 1024, "max_iters": 16000, "gradient_accumulation_steps": 1, "optimizer": "muon", "learning_rate": 0.0006, "min_lr": 6e-05, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "warmup_iters": 0, "lr_decay_iters": 16000, "muon_lr": 0.01, "muon_min_lr": 0.002, "adamw_lr": 0.01, "adamw_min_lr": 0.00036, "muon_momentum": 0.95, "muon_nesterov": true, "muon_ns_steps": 5, "muon_weight_decay": 0.0, "lr_schedule": "linear_warmdown", "warmdown_frac": 0.4, "device": "cuda", "dtype": "bfloat16", "compile": true, "sequential_data": false, "n_layer": 12, "n_head": 8, "n_embd": 1024, "block_size": 1024, "vocab_size": 50304, "sparsity_mode": "none", "k1": 8, "k2": 64, "m2": null, "basis_k": 128, "r": 32, "hash_seed": 42, "deeptopk_backend": "auto", "rmsnorm_affine": true, "wf_f_min_ratio": 0.0, "wf_quantile_lr": 0.1, "wf_ema_beta": 0.9, "weight_sparsify": false, "weight_sparsify_k": 32, "grad_sparsify_k": 4, "l1l1_grad_proj": false, "sgd_lr": 0.01, "sgd_min_lr": 0.001, "sgd_momentum": 0.9, "sgd_weight_decay": 0.0, "cayley_levels": [], "cayley_per_parent_budget": false, "cayley_wf_f_min_ratio": 0.0, "cayley_wf_ema_beta": 0.99, "cayley_wf_bisect_iters": 15, "cayley_wf_warmup_batches": 10, "cayley_wf_batch_size": 64, "cayley_seed": 42, "cayley_backend": "auto", "cayley_locations": [ "resid_mid" ], "cayley_score_standardize": false, "cayley_richardson_refine": false, "cayley_matching_pursuit": false, "cayley_sparse_mlp": false, "sparse_mlp_fan_out": 64, "sparse_mlp_K_out": 0, "sparse_mlp_activation": "relu", "sparse_mlp_chunk_size": 1024, "sparse_mlp_norm_preserve": true, "sparse_mlp_backend": "auto", "sparse_mlp_target_hidden_size": 4, "tied_block_init": false, "dead_threshold_c": 0.1, "resume": false, "out_dir": "../out/vanilla-v5", "eval_interval": 500, "eval_iters": 100, "log_interval": 10, "wandb_project": "sparse-nanogpt", "wandb_run_name": "vanilla-v5" }