markhenry committed on
Commit
2b5237a
·
verified ·
1 Parent(s): df5d926

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +87 -0
config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_dir": "../data/fineweb-edu-10B",
3
+ "batch_size": 80,
4
+ "seq_len": 1024,
5
+ "max_iters": 16000,
6
+ "gradient_accumulation_steps": 1,
7
+ "optimizer": "muon",
8
+ "learning_rate": 0.0006,
9
+ "min_lr": 6e-05,
10
+ "weight_decay": 0.1,
11
+ "beta1": 0.9,
12
+ "beta2": 0.95,
13
+ "grad_clip": 1.0,
14
+ "warmup_iters": 0,
15
+ "lr_decay_iters": 16000,
16
+ "muon_lr": 0.01,
17
+ "muon_min_lr": 0.002,
18
+ "adamw_lr": 0.01,
19
+ "adamw_min_lr": 0.00036,
20
+ "muon_momentum": 0.95,
21
+ "muon_nesterov": true,
22
+ "muon_ns_steps": 5,
23
+ "muon_weight_decay": 0.0,
24
+ "lr_schedule": "linear_warmdown",
25
+ "warmdown_frac": 0.4,
26
+ "device": "cuda",
27
+ "dtype": "bfloat16",
28
+ "compile": true,
29
+ "sequential_data": false,
30
+ "n_layer": 12,
31
+ "n_head": 8,
32
+ "n_embd": 1024,
33
+ "block_size": 1024,
34
+ "vocab_size": 50304,
35
+ "sparsity_mode": "none",
36
+ "k1": 8,
37
+ "k2": 64,
38
+ "m2": null,
39
+ "basis_k": 128,
40
+ "r": 32,
41
+ "hash_seed": 42,
42
+ "deeptopk_backend": "auto",
43
+ "rmsnorm_affine": true,
44
+ "wf_f_min_ratio": 0.0,
45
+ "wf_quantile_lr": 0.1,
46
+ "wf_ema_beta": 0.9,
47
+ "weight_sparsify": false,
48
+ "weight_sparsify_k": 32,
49
+ "grad_sparsify_k": 4,
50
+ "l1l1_grad_proj": false,
51
+ "sgd_lr": 0.01,
52
+ "sgd_min_lr": 0.001,
53
+ "sgd_momentum": 0.9,
54
+ "sgd_weight_decay": 0.0,
55
+ "cayley_levels": [],
56
+ "cayley_per_parent_budget": false,
57
+ "cayley_wf_f_min_ratio": 0.0,
58
+ "cayley_wf_ema_beta": 0.99,
59
+ "cayley_wf_bisect_iters": 15,
60
+ "cayley_wf_warmup_batches": 10,
61
+ "cayley_wf_batch_size": 64,
62
+ "cayley_seed": 42,
63
+ "cayley_backend": "auto",
64
+ "cayley_locations": [
65
+ "resid_mid"
66
+ ],
67
+ "cayley_score_standardize": false,
68
+ "cayley_richardson_refine": false,
69
+ "cayley_matching_pursuit": false,
70
+ "cayley_sparse_mlp": false,
71
+ "sparse_mlp_fan_out": 64,
72
+ "sparse_mlp_K_out": 0,
73
+ "sparse_mlp_activation": "relu",
74
+ "sparse_mlp_chunk_size": 1024,
75
+ "sparse_mlp_norm_preserve": true,
76
+ "sparse_mlp_backend": "auto",
77
+ "sparse_mlp_target_hidden_size": 4,
78
+ "tied_block_init": false,
79
+ "dead_threshold_c": 0.1,
80
+ "resume": false,
81
+ "out_dir": "../out/vanilla-v5",
82
+ "eval_interval": 500,
83
+ "eval_iters": 100,
84
+ "log_interval": 10,
85
+ "wandb_project": "sparse-nanogpt",
86
+ "wandb_run_name": "vanilla-v5"
87
+ }