aixk commited on
Commit
07b4bb2
·
1 Parent(s): 7ec9523

Upload folder using huggingface_hub

Browse files
slots/12/checkpoint-496/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "TwinyForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "float32",
7
+ "hidden_dropout": 0.0,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "max_position_embeddings": 128,
12
+ "model_type": "twiny",
13
+ "neftune_alpha": 0.0,
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 1,
16
+ "num_key_value_heads": 3,
17
+ "qk_norm": true,
18
+ "rezero_init": 1.0,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_theta": 10000.0,
21
+ "transformers_version": "5.0.0",
22
+ "use_cache": false,
23
+ "vocab_size": 32000
24
+ }
slots/12/checkpoint-496/trainer_state.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.03708688500074772,
6
+ "eval_steps": 500,
7
+ "global_step": 496,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 2.617016627215141e-05,
14
+ "grad_norm": 122.33340454101562,
15
+ "learning_rate": 5e-05,
16
+ "loss": 419.9996032714844,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.0005234033254430282,
21
+ "grad_norm": 90.82410430908203,
22
+ "learning_rate": 4.999999511980259e-05,
23
+ "loss": 428.18197471217104,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0010468066508860564,
28
+ "grad_norm": 59.52641296386719,
29
+ "learning_rate": 4.999997943828395e-05,
30
+ "loss": 366.3947265625,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.0015702099763290846,
35
+ "grad_norm": 41.61277389526367,
36
+ "learning_rate": 4.999995294193233e-05,
37
+ "loss": 325.6545166015625,
38
+ "step": 60
39
+ },
40
+ {
41
+ "epoch": 0.0020936133017721128,
42
+ "grad_norm": 46.67556381225586,
43
+ "learning_rate": 4.999991563075919e-05,
44
+ "loss": 310.3648193359375,
45
+ "step": 80
46
+ },
47
+ {
48
+ "epoch": 0.002617016627215141,
49
+ "grad_norm": 47.65824890136719,
50
+ "learning_rate": 4.999986750478065e-05,
51
+ "loss": 303.49443359375,
52
+ "step": 100
53
+ },
54
+ {
55
+ "epoch": 0.003140419952658169,
56
+ "grad_norm": 62.748138427734375,
57
+ "learning_rate": 4.999980856401755e-05,
58
+ "loss": 322.389990234375,
59
+ "step": 120
60
+ },
61
+ {
62
+ "epoch": 0.007327646556202395,
63
+ "grad_norm": 48.20169448852539,
64
+ "learning_rate": 4.999895523943923e-05,
65
+ "loss": 288.82497336647725,
66
+ "step": 140
67
+ },
68
+ {
69
+ "epoch": 0.008374453207088451,
70
+ "grad_norm": 68.66603088378906,
71
+ "learning_rate": 4.99986329623214e-05,
72
+ "loss": 293.87509765625,
73
+ "step": 160
74
+ },
75
+ {
76
+ "epoch": 0.009421259857974507,
77
+ "grad_norm": 61.69401931762695,
78
+ "learning_rate": 4.999826742820622e-05,
79
+ "loss": 324.10361328125,
80
+ "step": 180
81
+ },
82
+ {
83
+ "epoch": 0.010468066508860565,
84
+ "grad_norm": 56.778385162353516,
85
+ "learning_rate": 4.999785863772618e-05,
86
+ "loss": 274.875537109375,
87
+ "step": 200
88
+ },
89
+ {
90
+ "epoch": 0.01151487315974662,
91
+ "grad_norm": 71.3816146850586,
92
+ "learning_rate": 4.999740659158865e-05,
93
+ "loss": 281.357080078125,
94
+ "step": 220
95
+ },
96
+ {
97
+ "epoch": 0.012561679810632677,
98
+ "grad_norm": 75.08697509765625,
99
+ "learning_rate": 4.9996911290575844e-05,
100
+ "loss": 263.852783203125,
101
+ "step": 240
102
+ },
103
+ {
104
+ "epoch": 0.013608486461518732,
105
+ "grad_norm": 53.91341781616211,
106
+ "learning_rate": 4.999637273554481e-05,
107
+ "loss": 258.546630859375,
108
+ "step": 260
109
+ },
110
+ {
111
+ "epoch": 0.01465529311240479,
112
+ "grad_norm": 56.39162063598633,
113
+ "learning_rate": 4.9995790927427446e-05,
114
+ "loss": 270.5470703125,
115
+ "step": 280
116
+ },
117
+ {
118
+ "epoch": 0.015702099763290844,
119
+ "grad_norm": 62.93777084350586,
120
+ "learning_rate": 4.99951658672305e-05,
121
+ "loss": 252.8383056640625,
122
+ "step": 300
123
+ },
124
+ {
125
+ "epoch": 0.016748906414176902,
126
+ "grad_norm": 61.51578903198242,
127
+ "learning_rate": 4.9994497556035567e-05,
128
+ "loss": 287.8167724609375,
129
+ "step": 320
130
+ },
131
+ {
132
+ "epoch": 0.01779571306506296,
133
+ "grad_norm": 62.75235366821289,
134
+ "learning_rate": 4.9993785994999074e-05,
135
+ "loss": 258.673779296875,
136
+ "step": 340
137
+ },
138
+ {
139
+ "epoch": 0.018842519715949014,
140
+ "grad_norm": 56.2258186340332,
141
+ "learning_rate": 4.999303118535229e-05,
142
+ "loss": 252.9609375,
143
+ "step": 360
144
+ },
145
+ {
146
+ "epoch": 0.01988932636683507,
147
+ "grad_norm": 54.998130798339844,
148
+ "learning_rate": 4.9992233128401314e-05,
149
+ "loss": 241.9411376953125,
150
+ "step": 380
151
+ },
152
+ {
153
+ "epoch": 0.02093613301772113,
154
+ "grad_norm": 63.37522888183594,
155
+ "learning_rate": 4.999139182552709e-05,
156
+ "loss": 249.7280517578125,
157
+ "step": 400
158
+ },
159
+ {
160
+ "epoch": 0.021982939668607183,
161
+ "grad_norm": 60.007144927978516,
162
+ "learning_rate": 4.9990507278185374e-05,
163
+ "loss": 236.941015625,
164
+ "step": 420
165
+ },
166
+ {
167
+ "epoch": 0.032899656049050395,
168
+ "grad_norm": 65.54226684570312,
169
+ "learning_rate": 4.99710597273532e-05,
170
+ "loss": 258.5776638454861,
171
+ "step": 440
172
+ },
173
+ {
174
+ "epoch": 0.03439509496037087,
175
+ "grad_norm": 73.97871398925781,
176
+ "learning_rate": 4.9968363302753075e-05,
177
+ "loss": 229.468310546875,
178
+ "step": 460
179
+ },
180
+ {
181
+ "epoch": 0.03589053387169134,
182
+ "grad_norm": 62.42031478881836,
183
+ "learning_rate": 4.9965546873726513e-05,
184
+ "loss": 227.5928955078125,
185
+ "step": 480
186
+ }
187
+ ],
188
+ "logging_steps": 20,
189
+ "max_steps": 28660,
190
+ "num_input_tokens_seen": 0,
191
+ "num_train_epochs": 3,
192
+ "save_steps": 1000000000,
193
+ "stateful_callbacks": {
194
+ "TrainerControl": {
195
+ "args": {
196
+ "should_epoch_stop": false,
197
+ "should_evaluate": false,
198
+ "should_log": false,
199
+ "should_save": true,
200
+ "should_training_stop": false
201
+ },
202
+ "attributes": {}
203
+ }
204
+ },
205
+ "total_flos": 513863019233280.0,
206
+ "train_batch_size": 1,
207
+ "trial_name": null,
208
+ "trial_params": null
209
+ }