markhenry committed on
Commit
274f995
·
verified ·
1 Parent(s): 2ac1677

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +86 -0
config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_dir": "../data/fineweb-edu-10B",
3
+ "batch_size": 80,
4
+ "seq_len": 1024,
5
+ "max_iters": 16000,
6
+ "gradient_accumulation_steps": 1,
7
+ "optimizer": "muon",
8
+ "learning_rate": 0.0006,
9
+ "min_lr": 6e-05,
10
+ "weight_decay": 0.1,
11
+ "beta1": 0.9,
12
+ "beta2": 0.95,
13
+ "grad_clip": 1.0,
14
+ "warmup_iters": 2000,
15
+ "lr_decay_iters": 16000,
16
+ "muon_lr": 0.006,
17
+ "muon_min_lr": 0.002,
18
+ "adamw_lr": 0.006,
19
+ "adamw_min_lr": 0.00036,
20
+ "muon_momentum": 0.95,
21
+ "muon_nesterov": true,
22
+ "muon_ns_steps": 5,
23
+ "muon_weight_decay": 0.0,
24
+ "lr_schedule": "linear_warmdown",
25
+ "warmdown_frac": 0.2,
26
+ "device": "cuda",
27
+ "dtype": "bfloat16",
28
+ "compile": true,
29
+ "n_layer": 12,
30
+ "n_head": 8,
31
+ "n_embd": 1024,
32
+ "block_size": 1024,
33
+ "vocab_size": 50304,
34
+ "sparsity_mode": "none",
35
+ "k1": 8,
36
+ "k2": 64,
37
+ "m2": null,
38
+ "basis_k": 128,
39
+ "r": 32,
40
+ "hash_seed": 42,
41
+ "deeptopk_backend": "auto",
42
+ "rmsnorm_affine": true,
43
+ "wf_f_min_ratio": 0.0,
44
+ "wf_quantile_lr": 0.1,
45
+ "wf_ema_beta": 0.9,
46
+ "weight_sparsify": false,
47
+ "weight_sparsify_k": 32,
48
+ "grad_sparsify_k": 4,
49
+ "l1l1_grad_proj": false,
50
+ "sgd_lr": 0.01,
51
+ "sgd_min_lr": 0.001,
52
+ "sgd_momentum": 0.9,
53
+ "sgd_weight_decay": 0.0,
54
+ "cayley_levels": [],
55
+ "cayley_per_parent_budget": false,
56
+ "cayley_wf_f_min_ratio": 0.0,
57
+ "cayley_wf_ema_beta": 0.99,
58
+ "cayley_wf_bisect_iters": 15,
59
+ "cayley_wf_warmup_batches": 10,
60
+ "cayley_wf_batch_size": 64,
61
+ "cayley_seed": 42,
62
+ "cayley_backend": "auto",
63
+ "cayley_locations": [
64
+ "resid_mid"
65
+ ],
66
+ "cayley_score_standardize": false,
67
+ "cayley_richardson_refine": false,
68
+ "cayley_matching_pursuit": false,
69
+ "cayley_sparse_mlp": false,
70
+ "sparse_mlp_fan_out": 64,
71
+ "sparse_mlp_K_out": 0,
72
+ "sparse_mlp_activation": "relu",
73
+ "sparse_mlp_chunk_size": 1024,
74
+ "sparse_mlp_norm_preserve": true,
75
+ "sparse_mlp_backend": "auto",
76
+ "sparse_mlp_target_hidden_size": 4,
77
+ "tied_block_init": false,
78
+ "dead_threshold_c": 0.1,
79
+ "resume": false,
80
+ "out_dir": "../out/vanilla-10b",
81
+ "eval_interval": 500,
82
+ "eval_iters": 100,
83
+ "log_interval": 10,
84
+ "wandb_project": "sparse-nanogpt",
85
+ "wandb_run_name": "vanilla-10b"
86
+ }