ASzecsenyi commited on
Commit
e1c11b7
·
verified ·
1 Parent(s): 9eccf49

Upload rva_2x1024_e2huhg1r/meta_003333.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. rva_2x1024_e2huhg1r/meta_003333.json +157 -0
rva_2x1024_e2huhg1r/meta_003333.json ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 3333,
3
+ "val_bpb": 1.9419620347296425,
4
+ "model_config": {
5
+ "run": "rva_2x1024",
6
+ "wandb_group": null,
7
+ "seed": 42,
8
+ "device_type": "cuda",
9
+ "config": "rva20m",
10
+ "depth": 2,
11
+ "aspect_ratio": 64,
12
+ "model_dim": 1024,
13
+ "head_dim": 64,
14
+ "sequence_len": 1024,
15
+ "vocab_size": 265,
16
+ "clustering_init_num_samples": 0,
17
+ "rva_blocks": [
18
+ -1
19
+ ],
20
+ "recurrent_vocab_sizes": [
21
+ 265,
22
+ 16384
23
+ ],
24
+ "kla_blocks": [],
25
+ "mamba_blocks": [],
26
+ "gdn_blocks": [],
27
+ "gla_blocks": [],
28
+ "moe_blocks": [],
29
+ "d_state": 16,
30
+ "mamba_params": true,
31
+ "kla_kernel": true,
32
+ "mimo_rank": 1,
33
+ "skip_around_kla": true,
34
+ "decoder_mlp": false,
35
+ "use_reparametrisation_trick": true,
36
+ "num_experts": 4,
37
+ "moe_top_k": 2,
38
+ "moe_capacity_factor": 1.0,
39
+ "moe_aux_loss_weight": 0.01,
40
+ "moe_embedding_experts": false,
41
+ "num_iterations": -1,
42
+ "target_flops": -1.0,
43
+ "target_param_data_ratio": 20,
44
+ "data_dir": "base_data",
45
+ "device_batch_size": 2,
46
+ "total_batch_size": 524288,
47
+ "use_muon": true,
48
+ "embedding_lr": 0.3,
49
+ "unembedding_lr": 0.004,
50
+ "grad_clip": 1.0,
51
+ "weight_decay": 0.0,
52
+ "matrix_lr": 0.02,
53
+ "adam_beta1": 0.8,
54
+ "adam_beta2": 0.95,
55
+ "warmup_ratio": 0.05,
56
+ "warmdown_ratio": 0.4,
57
+ "final_lr_frac": 0.0,
58
+ "resume_from_step": -1,
59
+ "eval_every": 250,
60
+ "eval_tokens": 10485760,
61
+ "core_metric_every": -1,
62
+ "core_metric_max_per_task": 500,
63
+ "sample_every": 250,
64
+ "save_every": 1000,
65
+ "push_checkpoints_to_hub": true,
66
+ "use_profiler": false,
67
+ "profile_step": 2,
68
+ "profile_micro_step": 0,
69
+ "memory_history_max_entries": 10000,
70
+ "model_tag": "rva_2x1024",
71
+ "n_layer": 2,
72
+ "n_head": 16,
73
+ "n_kv_head": 16,
74
+ "n_embd": 1024
75
+ },
76
+ "user_config": {
77
+ "run": "rva_2x1024",
78
+ "wandb_group": null,
79
+ "seed": 42,
80
+ "device_type": "cuda",
81
+ "config": "rva20m",
82
+ "depth": 2,
83
+ "aspect_ratio": 64,
84
+ "model_dim": 1024,
85
+ "head_dim": 64,
86
+ "sequence_len": 1024,
87
+ "vocab_size": 265,
88
+ "clustering_init_num_samples": 0,
89
+ "rva_blocks": [
90
+ -1
91
+ ],
92
+ "recurrent_vocab_sizes": [
93
+ 265,
94
+ 16384
95
+ ],
96
+ "kla_blocks": [],
97
+ "mamba_blocks": [],
98
+ "gdn_blocks": [],
99
+ "gla_blocks": [],
100
+ "moe_blocks": [],
101
+ "d_state": 16,
102
+ "mamba_params": true,
103
+ "kla_kernel": true,
104
+ "mimo_rank": 1,
105
+ "skip_around_kla": true,
106
+ "decoder_mlp": false,
107
+ "use_reparametrisation_trick": true,
108
+ "num_experts": 4,
109
+ "moe_top_k": 2,
110
+ "moe_capacity_factor": 1.0,
111
+ "moe_aux_loss_weight": 0.01,
112
+ "moe_embedding_experts": false,
113
+ "num_iterations": -1,
114
+ "target_flops": -1.0,
115
+ "target_param_data_ratio": 20,
116
+ "data_dir": "base_data",
117
+ "device_batch_size": 2,
118
+ "total_batch_size": 524288,
119
+ "use_muon": true,
120
+ "embedding_lr": 0.3,
121
+ "unembedding_lr": 0.004,
122
+ "grad_clip": 1.0,
123
+ "weight_decay": 0.0,
124
+ "matrix_lr": 0.02,
125
+ "adam_beta1": 0.8,
126
+ "adam_beta2": 0.95,
127
+ "warmup_ratio": 0.05,
128
+ "warmdown_ratio": 0.4,
129
+ "final_lr_frac": 0.0,
130
+ "resume_from_step": -1,
131
+ "eval_every": 250,
132
+ "eval_tokens": 10485760,
133
+ "core_metric_every": -1,
134
+ "core_metric_max_per_task": 500,
135
+ "sample_every": 250,
136
+ "save_every": 1000,
137
+ "push_checkpoints_to_hub": true,
138
+ "use_profiler": false,
139
+ "profile_step": 2,
140
+ "profile_micro_step": 0,
141
+ "memory_history_max_entries": 10000,
142
+ "model_tag": "rva_2x1024",
143
+ "pod_name": "s2027538-infk8s-job-77k2f-6hdlr"
144
+ },
145
+ "device_batch_size": 2,
146
+ "sequence_len": 1024,
147
+ "dataloader_state_dict": {
148
+ "pq_idx": 6,
149
+ "rg_idx": 44
150
+ },
151
+ "loop_state": {
152
+ "min_val_bpb": 1.9419620347296425,
153
+ "smooth_train_loss": 2.025373429663995,
154
+ "smooth_aux_loss": 0.6445939797561,
155
+ "total_training_time": 19466.94282412529
156
+ }
157
+ }