aixk committed on
Commit
b55f4b0
·
1 Parent(s): cca673e

Delete folder shared/checkpoints/latest with huggingface_hub

Browse files
shared/checkpoints/latest/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "architectures": [
3
- "AxaiForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "dtype": "float32",
7
- "hidden_dropout": 0.0,
8
- "hidden_size": 768,
9
- "initializer_range": 0.02,
10
- "intermediate_size": 3072,
11
- "max_position_embeddings": 128,
12
- "model_type": "axai",
13
- "neftune_alpha": 0.0,
14
- "num_attention_heads": 12,
15
- "num_hidden_layers": 24,
16
- "num_key_value_heads": 6,
17
- "qk_norm": true,
18
- "rezero_init": 1.0,
19
- "rms_norm_eps": 1e-06,
20
- "rope_theta": 10000.0,
21
- "transformers_version": "5.0.0",
22
- "use_cache": false,
23
- "vocab_size": 32000
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
shared/checkpoints/latest/trainer_state.json DELETED
@@ -1,321 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.9432293815952367,
6
- "eval_steps": 500,
7
- "global_step": 800,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.001179036726994046,
14
- "grad_norm": 18.818986892700195,
15
- "learning_rate": 5e-05,
16
- "loss": 214.36204528808594,
17
- "step": 1
18
- },
19
- {
20
- "epoch": 0.023580734539880917,
21
- "grad_norm": 17.1884765625,
22
- "learning_rate": 4.993823781115963e-05,
23
- "loss": 199.185546875,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.047161469079761834,
28
- "grad_norm": 18.005523681640625,
29
- "learning_rate": 4.97401218720448e-05,
30
- "loss": 203.69400634765626,
31
- "step": 40
32
- },
33
- {
34
- "epoch": 0.07074220361964276,
35
- "grad_norm": 17.965116500854492,
36
- "learning_rate": 4.940656561583624e-05,
37
- "loss": 199.9952880859375,
38
- "step": 60
39
- },
40
- {
41
- "epoch": 0.09432293815952367,
42
- "grad_norm": 17.385486602783203,
43
- "learning_rate": 4.893939510326814e-05,
44
- "loss": 200.97593994140624,
45
- "step": 80
46
- },
47
- {
48
- "epoch": 0.11790367269940459,
49
- "grad_norm": 18.995485305786133,
50
- "learning_rate": 4.834116786912897e-05,
51
- "loss": 197.9327880859375,
52
- "step": 100
53
- },
54
- {
55
- "epoch": 0.14148440723928551,
56
- "grad_norm": 17.99912452697754,
57
- "learning_rate": 4.761515892098357e-05,
58
- "loss": 200.61953125,
59
- "step": 120
60
- },
61
- {
62
- "epoch": 0.16506514177916642,
63
- "grad_norm": 18.637039184570312,
64
- "learning_rate": 4.67653428100754e-05,
65
- "loss": 195.87960205078124,
66
- "step": 140
67
- },
68
- {
69
- "epoch": 0.18864587631904733,
70
- "grad_norm": 19.77713966369629,
71
- "learning_rate": 4.579637187256222e-05,
72
- "loss": 203.71815185546876,
73
- "step": 160
74
- },
75
- {
76
- "epoch": 0.21222661085892824,
77
- "grad_norm": 20.302921295166016,
78
- "learning_rate": 4.4713550760204035e-05,
79
- "loss": 201.46798095703124,
80
- "step": 180
81
- },
82
- {
83
- "epoch": 0.23580734539880918,
84
- "grad_norm": 20.111055374145508,
85
- "learning_rate": 4.352280739993557e-05,
86
- "loss": 191.88206787109374,
87
- "step": 200
88
- },
89
- {
90
- "epoch": 0.25938807993869006,
91
- "grad_norm": 20.618942260742188,
92
- "learning_rate": 4.223066054130568e-05,
93
- "loss": 189.92415771484374,
94
- "step": 220
95
- },
96
- {
97
- "epoch": 0.28296881447857103,
98
- "grad_norm": 20.38855743408203,
99
- "learning_rate": 4.0844184069445955e-05,
100
- "loss": 197.627880859375,
101
- "step": 240
102
- },
103
- {
104
- "epoch": 0.30654954901845194,
105
- "grad_norm": 22.40736198425293,
106
- "learning_rate": 3.937096827893793e-05,
107
- "loss": 192.96744384765626,
108
- "step": 260
109
- },
110
- {
111
- "epoch": 0.33013028355833285,
112
- "grad_norm": 26.542959213256836,
113
- "learning_rate": 3.781907832058587e-05,
114
- "loss": 192.7069580078125,
115
- "step": 280
116
- },
117
- {
118
- "epoch": 0.35371101809821376,
119
- "grad_norm": 24.460668563842773,
120
- "learning_rate": 3.619701004857919e-05,
121
- "loss": 193.03927001953124,
122
- "step": 300
123
- },
124
- {
125
- "epoch": 0.37729175263809467,
126
- "grad_norm": 22.90284538269043,
127
- "learning_rate": 3.451364350976022e-05,
128
- "loss": 185.04234619140624,
129
- "step": 320
130
- },
131
- {
132
- "epoch": 0.4008724871779756,
133
- "grad_norm": 23.723445892333984,
134
- "learning_rate": 3.2778194329621104e-05,
135
- "loss": 190.406298828125,
136
- "step": 340
137
- },
138
- {
139
- "epoch": 0.4244532217178565,
140
- "grad_norm": 25.815519332885742,
141
- "learning_rate": 3.1000163261168366e-05,
142
- "loss": 188.7949951171875,
143
- "step": 360
144
- },
145
- {
146
- "epoch": 0.44803395625773745,
147
- "grad_norm": 26.482738494873047,
148
- "learning_rate": 2.9189284172850983e-05,
149
- "loss": 187.61990966796876,
150
- "step": 380
151
- },
152
- {
153
- "epoch": 0.47161469079761836,
154
- "grad_norm": 25.560789108276367,
155
- "learning_rate": 2.7355470760292956e-05,
156
- "loss": 188.86490478515626,
157
- "step": 400
158
- },
159
- {
160
- "epoch": 0.4951954253374993,
161
- "grad_norm": 26.315908432006836,
162
- "learning_rate": 2.5508762273558284e-05,
163
- "loss": 190.40867919921874,
164
- "step": 420
165
- },
166
- {
167
- "epoch": 0.5187761598773801,
168
- "grad_norm": 23.34573745727539,
169
- "learning_rate": 2.3659268557065544e-05,
170
- "loss": 191.79708251953124,
171
- "step": 440
172
- },
173
- {
174
- "epoch": 0.5423568944172611,
175
- "grad_norm": 26.028112411499023,
176
- "learning_rate": 2.1817114703032176e-05,
177
- "loss": 188.81153564453126,
178
- "step": 460
179
- },
180
- {
181
- "epoch": 0.5659376289571421,
182
- "grad_norm": 26.664793014526367,
183
- "learning_rate": 1.99923856214443e-05,
184
- "loss": 182.7518798828125,
185
- "step": 480
186
- },
187
- {
188
- "epoch": 0.5895183634970229,
189
- "grad_norm": 25.86208152770996,
190
- "learning_rate": 1.819507083000514e-05,
191
- "loss": 189.74847412109375,
192
- "step": 500
193
- },
194
- {
195
- "epoch": 0.6130990980369039,
196
- "grad_norm": 26.640764236450195,
197
- "learning_rate": 1.643500976631037e-05,
198
- "loss": 186.2022216796875,
199
- "step": 520
200
- },
201
- {
202
- "epoch": 0.6366798325767847,
203
- "grad_norm": 29.260278701782227,
204
- "learning_rate": 1.47218379216403e-05,
205
- "loss": 186.85718994140626,
206
- "step": 540
207
- },
208
- {
209
- "epoch": 0.6602605671166657,
210
- "grad_norm": 25.371997833251953,
211
- "learning_rate": 1.3064934091260262e-05,
212
- "loss": 190.2005126953125,
213
- "step": 560
214
- },
215
- {
216
- "epoch": 0.6838413016565466,
217
- "grad_norm": 23.361217498779297,
218
- "learning_rate": 1.1473369030008974e-05,
219
- "loss": 192.95072021484376,
220
- "step": 580
221
- },
222
- {
223
- "epoch": 0.7074220361964275,
224
- "grad_norm": 26.195369720458984,
225
- "learning_rate": 9.955855794260422e-06,
226
- "loss": 187.2462646484375,
227
- "step": 600
228
- },
229
- {
230
- "epoch": 0.7310027707363085,
231
- "grad_norm": 27.924829483032227,
232
- "learning_rate": 8.520702042113765e-06,
233
- "loss": 187.06983642578126,
234
- "step": 620
235
- },
236
- {
237
- "epoch": 0.7545835052761893,
238
- "grad_norm": 25.70247459411621,
239
- "learning_rate": 7.1757645529443665e-06,
240
- "loss": 178.98265380859374,
241
- "step": 640
242
- },
243
- {
244
- "epoch": 0.7781642398160703,
245
- "grad_norm": 25.393587112426758,
246
- "learning_rate": 5.928406215300114e-06,
247
- "loss": 186.89532470703125,
248
- "step": 660
249
- },
250
- {
251
- "epoch": 0.8017449743559512,
252
- "grad_norm": 26.78150177001953,
253
- "learning_rate": 4.785455718613227e-06,
254
- "loss": 181.5225341796875,
255
- "step": 680
256
- },
257
- {
258
- "epoch": 0.8253257088958321,
259
- "grad_norm": 26.85440444946289,
260
- "learning_rate": 3.7531701693965554e-06,
261
- "loss": 180.88773193359376,
262
- "step": 700
263
- },
264
- {
265
- "epoch": 0.848906443435713,
266
- "grad_norm": 28.118528366088867,
267
- "learning_rate": 2.8372008365823904e-06,
268
- "loss": 183.96854248046876,
269
- "step": 720
270
- },
271
- {
272
- "epoch": 0.8724871779755939,
273
- "grad_norm": 26.2855167388916,
274
- "learning_rate": 2.0425622135320262e-06,
275
- "loss": 183.86905517578126,
276
- "step": 740
277
- },
278
- {
279
- "epoch": 0.8960679125154749,
280
- "grad_norm": 25.188119888305664,
281
- "learning_rate": 1.3736045660864034e-06,
282
- "loss": 186.562548828125,
283
- "step": 760
284
- },
285
- {
286
- "epoch": 0.9196486470553558,
287
- "grad_norm": 26.751577377319336,
288
- "learning_rate": 8.339901169443831e-07,
289
- "loss": 186.4565673828125,
290
- "step": 780
291
- },
292
- {
293
- "epoch": 0.9432293815952367,
294
- "grad_norm": 25.48533821105957,
295
- "learning_rate": 4.266729967476013e-07,
296
- "loss": 185.49075927734376,
297
- "step": 800
298
- }
299
- ],
300
- "logging_steps": 20,
301
- "max_steps": 849,
302
- "num_input_tokens_seen": 0,
303
- "num_train_epochs": 1,
304
- "save_steps": 400,
305
- "stateful_callbacks": {
306
- "TrainerControl": {
307
- "args": {
308
- "should_epoch_stop": false,
309
- "should_evaluate": false,
310
- "should_log": false,
311
- "should_save": true,
312
- "should_training_stop": false
313
- },
314
- "attributes": {}
315
- }
316
- },
317
- "total_flos": 1.234689785856e+16,
318
- "train_batch_size": 1,
319
- "trial_name": null,
320
- "trial_params": null
321
- }