aixk committed on
Commit
65288f4
·
1 Parent(s): b6b0dcb

Delete folder shared/checkpoints/latest with huggingface_hub

Browse files
shared/checkpoints/latest/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "architectures": [
3
- "AxaiForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "dtype": "float32",
7
- "hidden_dropout": 0.0,
8
- "hidden_size": 768,
9
- "initializer_range": 0.02,
10
- "intermediate_size": 3072,
11
- "max_position_embeddings": 128,
12
- "model_type": "axai",
13
- "neftune_alpha": 0.0,
14
- "num_attention_heads": 12,
15
- "num_hidden_layers": 16,
16
- "num_key_value_heads": 6,
17
- "qk_norm": true,
18
- "rezero_init": 1.0,
19
- "rms_norm_eps": 1e-06,
20
- "rope_theta": 10000.0,
21
- "transformers_version": "5.0.0",
22
- "use_cache": false,
23
- "vocab_size": 32000
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
shared/checkpoints/latest/trainer_state.json DELETED
@@ -1,419 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.5291516830749279,
6
- "eval_steps": 500,
7
- "global_step": 1081,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0014148440723928551,
14
- "grad_norm": 46.59520721435547,
15
- "learning_rate": 5e-05,
16
- "loss": 408.9247131347656,
17
- "step": 1
18
- },
19
- {
20
- "epoch": 0.0282968814478571,
21
- "grad_norm": 43.969398498535156,
22
- "learning_rate": 4.996525250695594e-05,
23
- "loss": 385.9318205180921,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.0565937628957142,
28
- "grad_norm": 82.97836303710938,
29
- "learning_rate": 4.985370748593809e-05,
30
- "loss": 385.9879638671875,
31
- "step": 40
32
- },
33
- {
34
- "epoch": 0.0848906443435713,
35
- "grad_norm": 145.0235595703125,
36
- "learning_rate": 4.966561222361462e-05,
37
- "loss": 368.023681640625,
38
- "step": 60
39
- },
40
- {
41
- "epoch": 0.1131875257914284,
42
- "grad_norm": 156.58584594726562,
43
- "learning_rate": 4.940154605961632e-05,
44
- "loss": 363.8541259765625,
45
- "step": 80
46
- },
47
- {
48
- "epoch": 0.14148440723928551,
49
- "grad_norm": 72.1839828491211,
50
- "learning_rate": 4.906232232646455e-05,
51
- "loss": 367.31640625,
52
- "step": 100
53
- },
54
- {
55
- "epoch": 0.1697812886871426,
56
- "grad_norm": 74.4588394165039,
57
- "learning_rate": 4.864898584448015e-05,
58
- "loss": 358.9984375,
59
- "step": 120
60
- },
61
- {
62
- "epoch": 0.1980781701349997,
63
- "grad_norm": 76.57646179199219,
64
- "learning_rate": 4.8162809703702284e-05,
65
- "loss": 359.79423828125,
66
- "step": 140
67
- },
68
- {
69
- "epoch": 0.2263750515828568,
70
- "grad_norm": 61.2109260559082,
71
- "learning_rate": 4.760529134272895e-05,
72
- "loss": 347.1311767578125,
73
- "step": 160
74
- },
75
- {
76
- "epoch": 0.2546719330307139,
77
- "grad_norm": 70.05961608886719,
78
- "learning_rate": 4.697814793655667e-05,
79
- "loss": 336.2667236328125,
80
- "step": 180
81
- },
82
- {
83
- "epoch": 0.28296881447857103,
84
- "grad_norm": 68.51240539550781,
85
- "learning_rate": 4.628331110762456e-05,
86
- "loss": 340.3522216796875,
87
- "step": 200
88
- },
89
- {
90
- "epoch": 0.3112656959264281,
91
- "grad_norm": 75.84900665283203,
92
- "learning_rate": 4.9999614897993864e-05,
93
- "loss": 339.859619140625,
94
- "step": 220
95
- },
96
- {
97
- "epoch": 0.3395625773742852,
98
- "grad_norm": 73.60946655273438,
99
- "learning_rate": 4.995341701131627e-05,
100
- "loss": 327.986181640625,
101
- "step": 240
102
- },
103
- {
104
- "epoch": 0.3678594588221423,
105
- "grad_norm": 75.10031127929688,
106
- "learning_rate": 4.983036177469612e-05,
107
- "loss": 327.8923828125,
108
- "step": 260
109
- },
110
- {
111
- "epoch": 0.3961563402699994,
112
- "grad_norm": 80.19892883300781,
113
- "learning_rate": 4.963082820233332e-05,
114
- "loss": 318.4505615234375,
115
- "step": 280
116
- },
117
- {
118
- "epoch": 0.4244532217178565,
119
- "grad_norm": 59.05309295654297,
120
- "learning_rate": 4.935543086423201e-05,
121
- "loss": 320.59248046875,
122
- "step": 300
123
- },
124
- {
125
- "epoch": 0.4527501031657136,
126
- "grad_norm": 46.86222457885742,
127
- "learning_rate": 4.9005017993304684e-05,
128
- "loss": 315.8417724609375,
129
- "step": 320
130
- },
131
- {
132
- "epoch": 0.4810469846135707,
133
- "grad_norm": 52.75422668457031,
134
- "learning_rate": 4.8580668872786686e-05,
135
- "loss": 315.3454833984375,
136
- "step": 340
137
- },
138
- {
139
- "epoch": 0.5093438660614278,
140
- "grad_norm": 53.963768005371094,
141
- "learning_rate": 4.80836905120082e-05,
142
- "loss": 312.086181640625,
143
- "step": 360
144
- },
145
- {
146
- "epoch": 0.5376407475092849,
147
- "grad_norm": 47.19234085083008,
148
- "learning_rate": 4.751561362076222e-05,
149
- "loss": 310.32763671875,
150
- "step": 380
151
- },
152
- {
153
- "epoch": 0.5659376289571421,
154
- "grad_norm": 54.37424850463867,
155
- "learning_rate": 4.68781878946678e-05,
156
- "loss": 298.73134765625,
157
- "step": 400
158
- },
159
- {
160
- "epoch": 0.5942345104049991,
161
- "grad_norm": 69.96826171875,
162
- "learning_rate": 4.6173376626049615e-05,
163
- "loss": 304.4895263671875,
164
- "step": 420
165
- },
166
- {
167
- "epoch": 0.6225313918528562,
168
- "grad_norm": 46.4427490234375,
169
- "learning_rate": 4.5403350656932655e-05,
170
- "loss": 296.0695068359375,
171
- "step": 440
172
- },
173
- {
174
- "epoch": 0.6508282733007134,
175
- "grad_norm": 60.224464416503906,
176
- "learning_rate": 4.457048169277701e-05,
177
- "loss": 303.4669921875,
178
- "step": 460
179
- },
180
- {
181
- "epoch": 0.6791251547485704,
182
- "grad_norm": 48.74308776855469,
183
- "learning_rate": 4.367733499754655e-05,
184
- "loss": 298.897607421875,
185
- "step": 480
186
- },
187
- {
188
- "epoch": 0.7074220361964275,
189
- "grad_norm": 42.0952033996582,
190
- "learning_rate": 4.996525250695594e-05,
191
- "loss": 296.8440673828125,
192
- "step": 500
193
- },
194
- {
195
- "epoch": 0.7357189176442847,
196
- "grad_norm": 59.525245666503906,
197
- "learning_rate": 4.985370748593809e-05,
198
- "loss": 289.252197265625,
199
- "step": 520
200
- },
201
- {
202
- "epoch": 0.7640157990921417,
203
- "grad_norm": 41.16178512573242,
204
- "learning_rate": 4.966561222361462e-05,
205
- "loss": 285.179833984375,
206
- "step": 540
207
- },
208
- {
209
- "epoch": 0.7923126805399988,
210
- "grad_norm": 46.047428131103516,
211
- "learning_rate": 4.940154605961632e-05,
212
- "loss": 283.002978515625,
213
- "step": 560
214
- },
215
- {
216
- "epoch": 0.820609561987856,
217
- "grad_norm": 50.66560363769531,
218
- "learning_rate": 4.906232232646455e-05,
219
- "loss": 277.0517333984375,
220
- "step": 580
221
- },
222
- {
223
- "epoch": 0.848906443435713,
224
- "grad_norm": 47.08381271362305,
225
- "learning_rate": 4.864898584448015e-05,
226
- "loss": 279.8613037109375,
227
- "step": 600
228
- },
229
- {
230
- "epoch": 0.8772033248835701,
231
- "grad_norm": 50.696502685546875,
232
- "learning_rate": 4.8162809703702284e-05,
233
- "loss": 277.827294921875,
234
- "step": 620
235
- },
236
- {
237
- "epoch": 0.9055002063314272,
238
- "grad_norm": 62.32334518432617,
239
- "learning_rate": 4.760529134272895e-05,
240
- "loss": 280.201611328125,
241
- "step": 640
242
- },
243
- {
244
- "epoch": 0.9337970877792843,
245
- "grad_norm": 48.96547317504883,
246
- "learning_rate": 4.697814793655667e-05,
247
- "loss": 275.6574462890625,
248
- "step": 660
249
- },
250
- {
251
- "epoch": 0.9620939692271414,
252
- "grad_norm": 50.21261215209961,
253
- "learning_rate": 4.628331110762456e-05,
254
- "loss": 275.423828125,
255
- "step": 680
256
- },
257
- {
258
- "epoch": 0.9903908506749985,
259
- "grad_norm": 53.1383056640625,
260
- "learning_rate": 4.5522920976353423e-05,
261
- "loss": 272.92900390625,
262
- "step": 700
263
- },
264
- {
265
- "epoch": 1.0183929729411072,
266
- "grad_norm": 53.41257095336914,
267
- "learning_rate": 4.4699319569503964e-05,
268
- "loss": 259.012353515625,
269
- "step": 720
270
- },
271
- {
272
- "epoch": 1.0466898543889642,
273
- "grad_norm": 43.35313034057617,
274
- "learning_rate": 4.3815043606656954e-05,
275
- "loss": 267.3876953125,
276
- "step": 740
277
- },
278
- {
279
- "epoch": 1.0749867358368212,
280
- "grad_norm": 40.72585678100586,
281
- "learning_rate": 4.2872816687032934e-05,
282
- "loss": 263.577294921875,
283
- "step": 760
284
- },
285
- {
286
- "epoch": 1.1032836172846785,
287
- "grad_norm": 39.94559860229492,
288
- "learning_rate": 4.1875540900716586e-05,
289
- "loss": 262.37470703125,
290
- "step": 780
291
- },
292
- {
293
- "epoch": 1.1315804987325355,
294
- "grad_norm": 45.3363151550293,
295
- "learning_rate": 4.082628789012317e-05,
296
- "loss": 257.188037109375,
297
- "step": 800
298
- },
299
- {
300
- "epoch": 1.1598773801803925,
301
- "grad_norm": 54.727962493896484,
302
- "learning_rate": 3.972828938923834e-05,
303
- "loss": 256.655029296875,
304
- "step": 820
305
- },
306
- {
307
- "epoch": 1.1881742616282498,
308
- "grad_norm": 46.29310607910156,
309
- "learning_rate": 3.858492726977051e-05,
310
- "loss": 264.1218994140625,
311
- "step": 840
312
- },
313
- {
314
- "epoch": 1.2164711430761068,
315
- "grad_norm": 57.223209381103516,
316
- "learning_rate": 3.739972312487412e-05,
317
- "loss": 250.47421875,
318
- "step": 860
319
- },
320
- {
321
- "epoch": 1.2447680245239638,
322
- "grad_norm": 55.62833023071289,
323
- "learning_rate": 3.6176327422526226e-05,
324
- "loss": 255.1604248046875,
325
- "step": 880
326
- },
327
- {
328
- "epoch": 1.273064905971821,
329
- "grad_norm": 56.45187759399414,
330
- "learning_rate": 3.491850826196432e-05,
331
- "loss": 254.080322265625,
332
- "step": 900
333
- },
334
- {
335
- "epoch": 1.3013617874196781,
336
- "grad_norm": 60.60747146606445,
337
- "learning_rate": 3.363013976781608e-05,
338
- "loss": 250.2458251953125,
339
- "step": 920
340
- },
341
- {
342
- "epoch": 1.3296586688675351,
343
- "grad_norm": 47.585941314697266,
344
- "learning_rate": 3.231519015766727e-05,
345
- "loss": 258.33154296875,
346
- "step": 940
347
- },
348
- {
349
- "epoch": 1.3579555503153924,
350
- "grad_norm": 72.65380096435547,
351
- "learning_rate": 3.097770951982037e-05,
352
- "loss": 253.3177734375,
353
- "step": 960
354
- },
355
- {
356
- "epoch": 1.3862524317632494,
357
- "grad_norm": 61.72402572631836,
358
- "learning_rate": 2.9621817338888546e-05,
359
- "loss": 246.7986572265625,
360
- "step": 980
361
- },
362
- {
363
- "epoch": 1.4145493132111064,
364
- "grad_norm": 46.793060302734375,
365
- "learning_rate": 2.825168980764682e-05,
366
- "loss": 247.830126953125,
367
- "step": 1000
368
- },
369
- {
370
- "epoch": 1.4428461946589637,
371
- "grad_norm": 36.52789306640625,
372
- "learning_rate": 2.6871546964220068e-05,
373
- "loss": 253.4376708984375,
374
- "step": 1020
375
- },
376
- {
377
- "epoch": 1.4711430761068207,
378
- "grad_norm": 50.205909729003906,
379
- "learning_rate": 2.548563969422611e-05,
380
- "loss": 245.1465576171875,
381
- "step": 1040
382
- },
383
- {
384
- "epoch": 1.499439957554678,
385
- "grad_norm": 41.533912658691406,
386
- "learning_rate": 2.409823663790725e-05,
387
- "loss": 245.4486328125,
388
- "step": 1060
389
- },
390
- {
391
- "epoch": 1.527736839002535,
392
- "grad_norm": 60.07231903076172,
393
- "learning_rate": 2.271361104257716e-05,
394
- "loss": 232.100341796875,
395
- "step": 1080
396
- }
397
- ],
398
- "logging_steps": 20,
399
- "max_steps": 1132,
400
- "num_input_tokens_seen": 0,
401
- "num_train_epochs": 2,
402
- "save_steps": 40,
403
- "stateful_callbacks": {
404
- "TrainerControl": {
405
- "args": {
406
- "should_epoch_stop": false,
407
- "should_evaluate": false,
408
- "should_log": false,
409
- "should_save": true,
410
- "should_training_stop": false
411
- },
412
- "attributes": {}
413
- }
414
- },
415
- "total_flos": 1.3997221022711808e+16,
416
- "train_batch_size": 1,
417
- "trial_name": null,
418
- "trial_params": null
419
- }