markhenry committed on
Commit
cd9aa08
·
verified ·
1 Parent(s): ef95895

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +71 -0
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_dir": "/dev/shm/data/fineweb-edu-25B",
3
+ "batch_size": 64,
4
+ "seq_len": 1024,
5
+ "max_iters": 5723,
6
+ "gradient_accumulation_steps": 8,
7
+ "seed": 1337,
8
+ "optimizer": "muon",
9
+ "learning_rate": 0.0006,
10
+ "min_lr": 6e-05,
11
+ "weight_decay": 0.1,
12
+ "beta1": 0.9,
13
+ "beta2": 0.95,
14
+ "grad_clip": 1.0,
15
+ "warmup_iters": 200,
16
+ "lr_decay_iters": 5723,
17
+ "muon_lr": 0.008,
18
+ "muon_min_lr": 0.0001,
19
+ "adamw_lr": 0.0003,
20
+ "adamw_min_lr": 1e-05,
21
+ "muon_momentum": 0.95,
22
+ "muon_nesterov": true,
23
+ "muon_ns_steps": 5,
24
+ "muon_weight_decay": 0.0,
25
+ "lr_schedule": "linear_warmdown",
26
+ "warmdown_frac": 0.5,
27
+ "parity_target": 0.0,
28
+ "parity_margin": 0.03,
29
+ "parity_warmdown_iters": 1500,
30
+ "parity_trigger_iter": -1,
31
+ "retain_from_iter": -1,
32
+ "device": "cuda",
33
+ "dtype": "bfloat16",
34
+ "compile": true,
35
+ "n_layer": 24,
36
+ "n_head": 16,
37
+ "n_embd": 2048,
38
+ "block_size": 1024,
39
+ "vocab_size": 50304,
40
+ "pos_encoding": "learned",
41
+ "rope_theta": 10000.0,
42
+ "sparsity_mode": "none",
43
+ "rmsnorm_affine": true,
44
+ "cayley_levels": [],
45
+ "cayley_per_parent_budget": false,
46
+ "cayley_seed": 42,
47
+ "cayley_backend": "auto",
48
+ "cayley_locations": [
49
+ "resid_mid"
50
+ ],
51
+ "cayley_score_standardize": false,
52
+ "cayley_forward_standardized": false,
53
+ "cayley_score_std_ema_beta": 0.99,
54
+ "cayley_replaces_pre_rmsnorm": false,
55
+ "cayley_richardson_refine": false,
56
+ "cayley_matching_pursuit": false,
57
+ "tied_block_init": false,
58
+ "dead_threshold_c": 0.1,
59
+ "resume": false,
60
+ "out_dir": "/dev/shm/out/probe-vanilla-vanilla-large-parity-3b",
61
+ "eval_interval": 100,
62
+ "eval_iters": 50,
63
+ "log_interval": 10,
64
+ "wandb_project": "sparse-nanogpt",
65
+ "wandb_run_name": "probe-vanilla-vanilla-large-parity-3b",
66
+ "wandb_step_offset": 0,
67
+ "quick_eval_interval": 0,
68
+ "quick_eval_timeout": 900,
69
+ "hierarchy_eval_enabled": false,
70
+ "hierarchy_eval_iters": null
71
+ }