{ "data_dir": "data/fineweb-edu-10B", "data_seed": 1337, "batch_size": 20, "seq_len": 1024, "max_iters": 15267, "gradient_accumulation_steps": 4, "optimizer": "muon", "learning_rate": 0.0006, "min_lr": 6e-05, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "warmup_iters": 200, "lr_decay_iters": 15267, "muon_lr": 0.004, "muon_min_lr": 0.002, "adamw_lr": 0.004, "adamw_min_lr": 0.00036, "muon_momentum": 0.95, "muon_nesterov": true, "muon_ns_steps": 5, "muon_weight_decay": 0.0, "sparse_mlp_ambient_muon_lr": 0.004, "sparse_mlp_ambient_muon_min_lr": 0.0004, "sparse_mlp_ambient_muon_momentum": 0.95, "sparse_mlp_ambient_muon_nesterov": true, "sparse_mlp_ambient_muon_ns_steps": 5, "sparse_mlp_ambient_muon_weight_decay": 0.0, "sparse_mlp_row_norm_lr": 0.004, "sparse_mlp_row_norm_min_lr": 0.0004, "sparse_mlp_row_norm_momentum": 0.9, "sparse_mlp_row_norm_weight_decay": 0.0, "sparse_mlp_active_time_lr": 0.004, "sparse_mlp_active_time_min_lr": 0.0004, "sparse_mlp_active_time_beta1": 0.9, "sparse_mlp_active_time_beta2": 0.95, "sparse_mlp_active_time_k_ref": 10240.0, "sparse_mlp_active_time_c_max": 4.0, "sparse_mlp_active_time_weight_decay": 0.0, "sparse_mlp_adamw_lr": 0.004, "sparse_mlp_adamw_min_lr": 0.00036, "sparse_mlp_beta1": 0.9, "sparse_mlp_beta2": 0.95, "sparse_mlp_equalize_lr": true, "lr_schedule": "cosine", "warmdown_frac": 0.1, "device": "cuda", "dtype": "bfloat16", "compile": true, "n_layer": 24, "n_head": 8, "n_embd": 2048, "block_size": 1024, "vocab_size": 50304, "pos_encoding": "rope", "rope_theta": 10000.0, "attn_alpha": 1.0, "attn_zero_init_proj": false, "zero_init_proj_targets": [], "tied_embedding": true, "sparsity_mode": "cayley_sparse_gather_dense", "rmsnorm_affine": true, "disable_mlp": false, "cayley_levels": [ [ 11, 16, 0 ], [ 15, 32, 240 ], [ 19, 64, 240 ], [ 23, 128, 240 ] ], "cayley_per_parent_budget": true, "cayley_leaf_runner_j": 0, "cayley_seed": 42, "cayley_balanced_generators": true, "cayley_backend": "auto", "cayley_locations": [ "mlp_in" ], "cayley_score_standardize": true, "cayley_score_std_scope": "per_vertex", "cayley_score_std_batch_size": 64, "cayley_activation_std_levels": [ "all" ], "cayley_activation_tanh": false, "cayley_matching_pursuit": false, "cayley_additive_parent_bias": false, "cayley_alpha_init_override": null, "cayley_alpha_weight_decay": 0.1, "sparse_mlp_K_in": 56, "sparse_mlp_H": 32, "sparse_mlp_K_out": 128, "sparse_mlp_F_up": 32, "sparse_mlp_F_down": 32, "sparse_mlp_Delta": 32, "sparse_mlp_rewire_every": 50, "sparse_mlp_rewire_components": [], "sparse_mlp_seed": 43, "sparse_mlp_init_mode": "supported", "sparse_mlp_tied_adjacency": false, "sparse_mlp_tied_rewire": false, "sparse_mlp_init_boost": 1.0, "sparse_mlp_up_init_scale": 1.0, "sparse_mlp_down_init_scale": 1.0, "sparse_mlp_up_norm_constraint": false, "sparse_mlp_up_per_level_init": true, "matryoshka_train": false, "matryoshka_cut_power": 0.0, "sparse_mlp_weight_decay": 0.0, "sparse_mlp_optimizer": "active_time_adam", "tied_block_init": false, "feature_parallel": true, "sparse_mlp_zero_l1plus": true, "profile_iter": -1, "profile_microbatches": 2, "dead_threshold_c": 0.1, "resume": false, "out_dir": "/workspace/sparse-nanogpt-private/run_logs/sweep_alt/4L_24L_d2048_K16326412_bal_f16_m23_seq1024_10B_fp_zero_l1plus_notanh", "eval_interval": 500, "eval_iters": 100, "log_interval": 10, "wandb_project": "sparse-mlp-mscaling", "wandb_run_name": "4L-24L-d2048-K16326412-bal-f16-m23-seq1024-10B-fp-zero-l1plus-notanh" }