ASzecsenyi commited on
Commit
f82e3bc
·
verified ·
1 Parent(s): 0973f62

Upload rva_ts_1x1024_09yiapl9/meta_000413.json with huggingface_hub

Browse files
rva_ts_1x1024_09yiapl9/meta_000413.json ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 413,
3
+ "val_bpb": 3.2953677197732074,
4
+ "model_config": {
5
+ "run": "rva_ts_1x1024",
6
+ "device_type": "cuda",
7
+ "config": "rva1m",
8
+ "depth": 1,
9
+ "aspect_ratio": 64,
10
+ "model_dim": 1024,
11
+ "head_dim": 128,
12
+ "sequence_len": 1024,
13
+ "vocab_size": 265,
14
+ "rva_blocks": [
15
+ -1
16
+ ],
17
+ "recurrent_vocab_sizes": [],
18
+ "kla_blocks": [],
19
+ "mamba_blocks": [],
20
+ "gdn_blocks": [],
21
+ "gla_blocks": [],
22
+ "d_state": 16,
23
+ "mamba_params": true,
24
+ "kla_kernel": true,
25
+ "mimo_rank": 1,
26
+ "skip_around_kla": true,
27
+ "decoder_mlp": false,
28
+ "use_reparametrisation_trick": true,
29
+ "num_iterations": -1,
30
+ "target_flops": -1.0,
31
+ "target_param_data_ratio": 20,
32
+ "data_dir": "tinystories_data",
33
+ "device_batch_size": 64,
34
+ "total_batch_size": 524288,
35
+ "use_muon": true,
36
+ "embedding_lr": 0.3,
37
+ "unembedding_lr": 0.004,
38
+ "grad_clip": 1.0,
39
+ "weight_decay": 0.0,
40
+ "matrix_lr": 0.02,
41
+ "adam_beta1": 0.8,
42
+ "adam_beta2": 0.95,
43
+ "warmup_ratio": 0.1,
44
+ "warmdown_ratio": 0.4,
45
+ "final_lr_frac": 0.0,
46
+ "resume_from_step": -1,
47
+ "eval_every": 250,
48
+ "eval_tokens": 10485760,
49
+ "core_metric_every": -1,
50
+ "core_metric_max_per_task": 500,
51
+ "sample_every": 250,
52
+ "save_every": 1000,
53
+ "push_checkpoints_to_hub": true,
54
+ "use_profiler": false,
55
+ "profile_step": 2,
56
+ "profile_micro_step": 0,
57
+ "memory_history_max_entries": 10000,
58
+ "model_tag": "rva_ts_1x1024",
59
+ "n_layer": 1,
60
+ "n_head": 8,
61
+ "n_kv_head": 8,
62
+ "n_embd": 1024
63
+ },
64
+ "user_config": {
65
+ "run": "rva_ts_1x1024",
66
+ "device_type": "",
67
+ "config": "rva1m",
68
+ "depth": 1,
69
+ "aspect_ratio": 64,
70
+ "model_dim": 1024,
71
+ "head_dim": 128,
72
+ "sequence_len": 1024,
73
+ "vocab_size": 265,
74
+ "rva_blocks": [
75
+ -1
76
+ ],
77
+ "recurrent_vocab_sizes": [],
78
+ "kla_blocks": [],
79
+ "mamba_blocks": [],
80
+ "gdn_blocks": [],
81
+ "gla_blocks": [],
82
+ "d_state": 16,
83
+ "mamba_params": true,
84
+ "kla_kernel": true,
85
+ "mimo_rank": 1,
86
+ "skip_around_kla": true,
87
+ "decoder_mlp": false,
88
+ "use_reparametrisation_trick": true,
89
+ "num_iterations": -1,
90
+ "target_flops": -1.0,
91
+ "target_param_data_ratio": 20,
92
+ "data_dir": "tinystories_data",
93
+ "device_batch_size": 64,
94
+ "total_batch_size": 524288,
95
+ "use_muon": true,
96
+ "embedding_lr": 0.3,
97
+ "unembedding_lr": 0.004,
98
+ "grad_clip": 1.0,
99
+ "weight_decay": 0.0,
100
+ "matrix_lr": 0.02,
101
+ "adam_beta1": 0.8,
102
+ "adam_beta2": 0.95,
103
+ "warmup_ratio": 0.1,
104
+ "warmdown_ratio": 0.4,
105
+ "final_lr_frac": 0.0,
106
+ "resume_from_step": -1,
107
+ "eval_every": 250,
108
+ "eval_tokens": 10485760,
109
+ "core_metric_every": -1,
110
+ "core_metric_max_per_task": 500,
111
+ "sample_every": 250,
112
+ "save_every": 1000,
113
+ "push_checkpoints_to_hub": true,
114
+ "use_profiler": false,
115
+ "profile_step": 2,
116
+ "profile_micro_step": 0,
117
+ "memory_history_max_entries": 10000,
118
+ "model_tag": "rva_ts_1x1024"
119
+ },
120
+ "device_batch_size": 64,
121
+ "sequence_len": 1024,
122
+ "dataloader_state_dict": {
123
+ "pq_idx": 0,
124
+ "rg_idx": 238
125
+ },
126
+ "loop_state": {
127
+ "min_val_bpb": 3.2953677197732074,
128
+ "smooth_train_loss": 2.293877559053924,
129
+ "total_training_time": 71.48586106300354
130
+ }
131
+ }