tranhuyHoang commited on
Commit
b3825e1
·
verified ·
1 Parent(s): 8f2f5df

Delete last-checkpoint

Browse files
last-checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e65a61da3bd59857bc61ebd07e726758d8f85f0c3bbf9a6bdfafa5218f0a6c3
3
- size 91951912
 
 
 
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:665e7c098b20618c3e078ad86acab2e0ca25d1467a8d0e2112c68af547b92299
3
- size 183993547
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:236304ae89e49aae8260113165ee63419b9b745f79120014328a7fa31ed79b42
3
- size 14645
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d352eeae2aa15ed4d011e01fe52a140d1d472b101b8457914fdee010588d6f43
3
- size 1465
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,16 +0,0 @@
1
- {
2
- "eos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "pad_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- }
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<|endoftext|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- }
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": "<|endoftext|>",
14
- "extra_special_tokens": {},
15
- "model_max_length": 2048,
16
- "pad_token": "<|endoftext|>",
17
- "padding_side": "right",
18
- "tokenizer_class": "PreTrainedTokenizerFast",
19
- "truncation_side": "right"
20
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,970 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.012,
6
- "eval_steps": 10,
7
- "global_step": 120,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0001,
14
- "grad_norm": 13.644583702087402,
15
- "learning_rate": 0.0,
16
- "loss": 73.3569,
17
- "step": 1
18
- },
19
- {
20
- "epoch": 0.0002,
21
- "grad_norm": 13.757925987243652,
22
- "learning_rate": 1e-05,
23
- "loss": 73.384,
24
- "step": 2
25
- },
26
- {
27
- "epoch": 0.0003,
28
- "grad_norm": 13.819925308227539,
29
- "learning_rate": 2e-05,
30
- "loss": 73.4764,
31
- "step": 3
32
- },
33
- {
34
- "epoch": 0.0004,
35
- "grad_norm": 13.885299682617188,
36
- "learning_rate": 3e-05,
37
- "loss": 73.2476,
38
- "step": 4
39
- },
40
- {
41
- "epoch": 0.0005,
42
- "grad_norm": 13.726334571838379,
43
- "learning_rate": 4e-05,
44
- "loss": 72.8724,
45
- "step": 5
46
- },
47
- {
48
- "epoch": 0.0006,
49
- "grad_norm": 13.66053581237793,
50
- "learning_rate": 5e-05,
51
- "loss": 72.2172,
52
- "step": 6
53
- },
54
- {
55
- "epoch": 0.0007,
56
- "grad_norm": 13.866334915161133,
57
- "learning_rate": 6e-05,
58
- "loss": 71.1639,
59
- "step": 7
60
- },
61
- {
62
- "epoch": 0.0008,
63
- "grad_norm": 13.102575302124023,
64
- "learning_rate": 7.000000000000001e-05,
65
- "loss": 70.3215,
66
- "step": 8
67
- },
68
- {
69
- "epoch": 0.0009,
70
- "grad_norm": 13.716519355773926,
71
- "learning_rate": 8e-05,
72
- "loss": 67.5932,
73
- "step": 9
74
- },
75
- {
76
- "epoch": 0.001,
77
- "grad_norm": 12.51129150390625,
78
- "learning_rate": 8.999999999999999e-05,
79
- "loss": 65.3889,
80
- "step": 10
81
- },
82
- {
83
- "epoch": 0.001,
84
- "eval_loss": 7.900570869445801,
85
- "eval_runtime": 680.8566,
86
- "eval_samples_per_second": 14.687,
87
- "eval_steps_per_second": 1.836,
88
- "step": 10
89
- },
90
- {
91
- "epoch": 0.0011,
92
- "grad_norm": 11.444281578063965,
93
- "learning_rate": 0.0001,
94
- "loss": 63.3063,
95
- "step": 11
96
- },
97
- {
98
- "epoch": 0.0012,
99
- "grad_norm": 9.952727317810059,
100
- "learning_rate": 0.00011,
101
- "loss": 60.919,
102
- "step": 12
103
- },
104
- {
105
- "epoch": 0.0013,
106
- "grad_norm": 8.779980659484863,
107
- "learning_rate": 0.00012,
108
- "loss": 59.1227,
109
- "step": 13
110
- },
111
- {
112
- "epoch": 0.0014,
113
- "grad_norm": 7.721734046936035,
114
- "learning_rate": 0.00013000000000000002,
115
- "loss": 57.2133,
116
- "step": 14
117
- },
118
- {
119
- "epoch": 0.0015,
120
- "grad_norm": 7.298959732055664,
121
- "learning_rate": 0.00014000000000000001,
122
- "loss": 55.2262,
123
- "step": 15
124
- },
125
- {
126
- "epoch": 0.0016,
127
- "grad_norm": 7.175290107727051,
128
- "learning_rate": 0.00015000000000000001,
129
- "loss": 52.2363,
130
- "step": 16
131
- },
132
- {
133
- "epoch": 0.0017,
134
- "grad_norm": 6.67252779006958,
135
- "learning_rate": 0.00016,
136
- "loss": 50.2609,
137
- "step": 17
138
- },
139
- {
140
- "epoch": 0.0018,
141
- "grad_norm": 6.430507183074951,
142
- "learning_rate": 0.00016999999999999999,
143
- "loss": 47.5378,
144
- "step": 18
145
- },
146
- {
147
- "epoch": 0.0019,
148
- "grad_norm": 5.900411128997803,
149
- "learning_rate": 0.00017999999999999998,
150
- "loss": 45.5708,
151
- "step": 19
152
- },
153
- {
154
- "epoch": 0.002,
155
- "grad_norm": 5.60172176361084,
156
- "learning_rate": 0.00019,
157
- "loss": 43.1171,
158
- "step": 20
159
- },
160
- {
161
- "epoch": 0.002,
162
- "eval_loss": 5.050407409667969,
163
- "eval_runtime": 679.6399,
164
- "eval_samples_per_second": 14.714,
165
- "eval_steps_per_second": 1.839,
166
- "step": 20
167
- },
168
- {
169
- "epoch": 0.0021,
170
- "grad_norm": 5.3365912437438965,
171
- "learning_rate": 0.0002,
172
- "loss": 39.7964,
173
- "step": 21
174
- },
175
- {
176
- "epoch": 0.0022,
177
- "grad_norm": 5.166492462158203,
178
- "learning_rate": 0.00021000000000000004,
179
- "loss": 37.9665,
180
- "step": 22
181
- },
182
- {
183
- "epoch": 0.0023,
184
- "grad_norm": 4.789157867431641,
185
- "learning_rate": 0.00022,
186
- "loss": 35.5756,
187
- "step": 23
188
- },
189
- {
190
- "epoch": 0.0024,
191
- "grad_norm": 4.50107479095459,
192
- "learning_rate": 0.00022999999999999998,
193
- "loss": 33.2697,
194
- "step": 24
195
- },
196
- {
197
- "epoch": 0.0025,
198
- "grad_norm": 4.265627384185791,
199
- "learning_rate": 0.00024,
200
- "loss": 31.4949,
201
- "step": 25
202
- },
203
- {
204
- "epoch": 0.0026,
205
- "grad_norm": 4.027210235595703,
206
- "learning_rate": 0.00025,
207
- "loss": 28.0505,
208
- "step": 26
209
- },
210
- {
211
- "epoch": 0.0027,
212
- "grad_norm": 3.928957223892212,
213
- "learning_rate": 0.00026000000000000003,
214
- "loss": 26.2355,
215
- "step": 27
216
- },
217
- {
218
- "epoch": 0.0028,
219
- "grad_norm": 3.5308196544647217,
220
- "learning_rate": 0.00027,
221
- "loss": 24.1822,
222
- "step": 28
223
- },
224
- {
225
- "epoch": 0.0029,
226
- "grad_norm": 3.3050384521484375,
227
- "learning_rate": 0.00028000000000000003,
228
- "loss": 23.0633,
229
- "step": 29
230
- },
231
- {
232
- "epoch": 0.003,
233
- "grad_norm": 3.0656802654266357,
234
- "learning_rate": 0.00029,
235
- "loss": 20.7796,
236
- "step": 30
237
- },
238
- {
239
- "epoch": 0.003,
240
- "eval_loss": 2.453228235244751,
241
- "eval_runtime": 680.3163,
242
- "eval_samples_per_second": 14.699,
243
- "eval_steps_per_second": 1.837,
244
- "step": 30
245
- },
246
- {
247
- "epoch": 0.0031,
248
- "grad_norm": 2.902653694152832,
249
- "learning_rate": 0.00030000000000000003,
250
- "loss": 19.6354,
251
- "step": 31
252
- },
253
- {
254
- "epoch": 0.0032,
255
- "grad_norm": 2.723034620285034,
256
- "learning_rate": 0.00031,
257
- "loss": 18.3838,
258
- "step": 32
259
- },
260
- {
261
- "epoch": 0.0033,
262
- "grad_norm": 2.559993267059326,
263
- "learning_rate": 0.00032,
264
- "loss": 16.345,
265
- "step": 33
266
- },
267
- {
268
- "epoch": 0.0034,
269
- "grad_norm": 2.283393144607544,
270
- "learning_rate": 0.00033,
271
- "loss": 14.2652,
272
- "step": 34
273
- },
274
- {
275
- "epoch": 0.0035,
276
- "grad_norm": 2.1815197467803955,
277
- "learning_rate": 0.00033999999999999997,
278
- "loss": 14.0831,
279
- "step": 35
280
- },
281
- {
282
- "epoch": 0.0036,
283
- "grad_norm": 2.018310308456421,
284
- "learning_rate": 0.00035,
285
- "loss": 12.7781,
286
- "step": 36
287
- },
288
- {
289
- "epoch": 0.0037,
290
- "grad_norm": 1.862168550491333,
291
- "learning_rate": 0.00035999999999999997,
292
- "loss": 11.4392,
293
- "step": 37
294
- },
295
- {
296
- "epoch": 0.0038,
297
- "grad_norm": 1.762819528579712,
298
- "learning_rate": 0.00037,
299
- "loss": 10.6711,
300
- "step": 38
301
- },
302
- {
303
- "epoch": 0.0039,
304
- "grad_norm": 1.6565489768981934,
305
- "learning_rate": 0.00038,
306
- "loss": 9.3263,
307
- "step": 39
308
- },
309
- {
310
- "epoch": 0.004,
311
- "grad_norm": 1.545681118965149,
312
- "learning_rate": 0.00039000000000000005,
313
- "loss": 9.0413,
314
- "step": 40
315
- },
316
- {
317
- "epoch": 0.004,
318
- "eval_loss": 0.9703376889228821,
319
- "eval_runtime": 681.7846,
320
- "eval_samples_per_second": 14.667,
321
- "eval_steps_per_second": 1.833,
322
- "step": 40
323
- },
324
- {
325
- "epoch": 0.0041,
326
- "grad_norm": 1.7677608728408813,
327
- "learning_rate": 0.0004,
328
- "loss": 9.2559,
329
- "step": 41
330
- },
331
- {
332
- "epoch": 0.0042,
333
- "grad_norm": 1.4745514392852783,
334
- "learning_rate": 0.00041,
335
- "loss": 7.3328,
336
- "step": 42
337
- },
338
- {
339
- "epoch": 0.0043,
340
- "grad_norm": 1.3988405466079712,
341
- "learning_rate": 0.00042000000000000007,
342
- "loss": 7.0773,
343
- "step": 43
344
- },
345
- {
346
- "epoch": 0.0044,
347
- "grad_norm": 1.1933400630950928,
348
- "learning_rate": 0.00043000000000000004,
349
- "loss": 5.9391,
350
- "step": 44
351
- },
352
- {
353
- "epoch": 0.0045,
354
- "grad_norm": 1.4306049346923828,
355
- "learning_rate": 0.00044,
356
- "loss": 6.1685,
357
- "step": 45
358
- },
359
- {
360
- "epoch": 0.0046,
361
- "grad_norm": 1.0445979833602905,
362
- "learning_rate": 0.00045,
363
- "loss": 4.2649,
364
- "step": 46
365
- },
366
- {
367
- "epoch": 0.0047,
368
- "grad_norm": 0.9933378100395203,
369
- "learning_rate": 0.00045999999999999996,
370
- "loss": 4.1511,
371
- "step": 47
372
- },
373
- {
374
- "epoch": 0.0048,
375
- "grad_norm": 1.1010223627090454,
376
- "learning_rate": 0.00047000000000000004,
377
- "loss": 4.194,
378
- "step": 48
379
- },
380
- {
381
- "epoch": 0.0049,
382
- "grad_norm": 3.0977938175201416,
383
- "learning_rate": 0.00048,
384
- "loss": 4.409,
385
- "step": 49
386
- },
387
- {
388
- "epoch": 0.005,
389
- "grad_norm": 0.9864963293075562,
390
- "learning_rate": 0.00049,
391
- "loss": 3.0262,
392
- "step": 50
393
- },
394
- {
395
- "epoch": 0.005,
396
- "eval_loss": 0.34309181571006775,
397
- "eval_runtime": 680.6754,
398
- "eval_samples_per_second": 14.691,
399
- "eval_steps_per_second": 1.836,
400
- "step": 50
401
- },
402
- {
403
- "epoch": 0.0051,
404
- "grad_norm": 1.5408843755722046,
405
- "learning_rate": 0.0005,
406
- "loss": 3.7493,
407
- "step": 51
408
- },
409
- {
410
- "epoch": 0.0052,
411
- "grad_norm": 0.7898032069206238,
412
- "learning_rate": 0.00051,
413
- "loss": 2.5679,
414
- "step": 52
415
- },
416
- {
417
- "epoch": 0.0053,
418
- "grad_norm": 0.7600494027137756,
419
- "learning_rate": 0.0005200000000000001,
420
- "loss": 2.116,
421
- "step": 53
422
- },
423
- {
424
- "epoch": 0.0054,
425
- "grad_norm": 0.6847724318504333,
426
- "learning_rate": 0.00053,
427
- "loss": 1.9753,
428
- "step": 54
429
- },
430
- {
431
- "epoch": 0.0055,
432
- "grad_norm": 0.9391168355941772,
433
- "learning_rate": 0.00054,
434
- "loss": 2.6345,
435
- "step": 55
436
- },
437
- {
438
- "epoch": 0.0056,
439
- "grad_norm": 0.614797830581665,
440
- "learning_rate": 0.0005499999999999999,
441
- "loss": 1.4403,
442
- "step": 56
443
- },
444
- {
445
- "epoch": 0.0057,
446
- "grad_norm": 0.6510812044143677,
447
- "learning_rate": 0.0005600000000000001,
448
- "loss": 1.4256,
449
- "step": 57
450
- },
451
- {
452
- "epoch": 0.0058,
453
- "grad_norm": 0.5823825001716614,
454
- "learning_rate": 0.00057,
455
- "loss": 1.2711,
456
- "step": 58
457
- },
458
- {
459
- "epoch": 0.0059,
460
- "grad_norm": 1.0760960578918457,
461
- "learning_rate": 0.00058,
462
- "loss": 2.4494,
463
- "step": 59
464
- },
465
- {
466
- "epoch": 0.006,
467
- "grad_norm": 0.6940433382987976,
468
- "learning_rate": 0.00059,
469
- "loss": 1.2022,
470
- "step": 60
471
- },
472
- {
473
- "epoch": 0.006,
474
- "eval_loss": 0.11387959867715836,
475
- "eval_runtime": 680.3956,
476
- "eval_samples_per_second": 14.697,
477
- "eval_steps_per_second": 1.837,
478
- "step": 60
479
- },
480
- {
481
- "epoch": 0.0061,
482
- "grad_norm": 0.47780829668045044,
483
- "learning_rate": 0.0006000000000000001,
484
- "loss": 0.8812,
485
- "step": 61
486
- },
487
- {
488
- "epoch": 0.0062,
489
- "grad_norm": 0.4161126911640167,
490
- "learning_rate": 0.00061,
491
- "loss": 0.6873,
492
- "step": 62
493
- },
494
- {
495
- "epoch": 0.0063,
496
- "grad_norm": 0.4566328525543213,
497
- "learning_rate": 0.00062,
498
- "loss": 0.7777,
499
- "step": 63
500
- },
501
- {
502
- "epoch": 0.0064,
503
- "grad_norm": 0.6500325202941895,
504
- "learning_rate": 0.00063,
505
- "loss": 0.8523,
506
- "step": 64
507
- },
508
- {
509
- "epoch": 0.0065,
510
- "grad_norm": 0.31760597229003906,
511
- "learning_rate": 0.00064,
512
- "loss": 0.4715,
513
- "step": 65
514
- },
515
- {
516
- "epoch": 0.0066,
517
- "grad_norm": 0.33907350897789,
518
- "learning_rate": 0.0006500000000000001,
519
- "loss": 0.4799,
520
- "step": 66
521
- },
522
- {
523
- "epoch": 0.0067,
524
- "grad_norm": 0.429651141166687,
525
- "learning_rate": 0.00066,
526
- "loss": 0.5399,
527
- "step": 67
528
- },
529
- {
530
- "epoch": 0.0068,
531
- "grad_norm": 0.26789650321006775,
532
- "learning_rate": 0.00067,
533
- "loss": 0.3205,
534
- "step": 68
535
- },
536
- {
537
- "epoch": 0.0069,
538
- "grad_norm": 0.5304676294326782,
539
- "learning_rate": 0.0006799999999999999,
540
- "loss": 0.6197,
541
- "step": 69
542
- },
543
- {
544
- "epoch": 0.007,
545
- "grad_norm": 0.22003565728664398,
546
- "learning_rate": 0.0006900000000000001,
547
- "loss": 0.2592,
548
- "step": 70
549
- },
550
- {
551
- "epoch": 0.007,
552
- "eval_loss": 0.04052043333649635,
553
- "eval_runtime": 682.0602,
554
- "eval_samples_per_second": 14.661,
555
- "eval_steps_per_second": 1.833,
556
- "step": 70
557
- },
558
- {
559
- "epoch": 0.0071,
560
- "grad_norm": 0.550254762172699,
561
- "learning_rate": 0.0007,
562
- "loss": 0.5598,
563
- "step": 71
564
- },
565
- {
566
- "epoch": 0.0072,
567
- "grad_norm": 0.21443326771259308,
568
- "learning_rate": 0.00071,
569
- "loss": 0.23,
570
- "step": 72
571
- },
572
- {
573
- "epoch": 0.0073,
574
- "grad_norm": 0.25100332498550415,
575
- "learning_rate": 0.0007199999999999999,
576
- "loss": 0.2287,
577
- "step": 73
578
- },
579
- {
580
- "epoch": 0.0074,
581
- "grad_norm": 0.8701838850975037,
582
- "learning_rate": 0.0007300000000000001,
583
- "loss": 0.8359,
584
- "step": 74
585
- },
586
- {
587
- "epoch": 0.0075,
588
- "grad_norm": 0.19172491133213043,
589
- "learning_rate": 0.00074,
590
- "loss": 0.1903,
591
- "step": 75
592
- },
593
- {
594
- "epoch": 0.0076,
595
- "grad_norm": 0.15641027688980103,
596
- "learning_rate": 0.00075,
597
- "loss": 0.13,
598
- "step": 76
599
- },
600
- {
601
- "epoch": 0.0077,
602
- "grad_norm": 0.1556449830532074,
603
- "learning_rate": 0.00076,
604
- "loss": 0.1248,
605
- "step": 77
606
- },
607
- {
608
- "epoch": 0.0078,
609
- "grad_norm": 0.2690158486366272,
610
- "learning_rate": 0.00077,
611
- "loss": 0.1378,
612
- "step": 78
613
- },
614
- {
615
- "epoch": 0.0079,
616
- "grad_norm": 0.17920906841754913,
617
- "learning_rate": 0.0007800000000000001,
618
- "loss": 0.1314,
619
- "step": 79
620
- },
621
- {
622
- "epoch": 0.008,
623
- "grad_norm": 0.13513772189617157,
624
- "learning_rate": 0.0007899999999999999,
625
- "loss": 0.0999,
626
- "step": 80
627
- },
628
- {
629
- "epoch": 0.008,
630
- "eval_loss": 0.019889511168003082,
631
- "eval_runtime": 681.178,
632
- "eval_samples_per_second": 14.68,
633
- "eval_steps_per_second": 1.835,
634
- "step": 80
635
- },
636
- {
637
- "epoch": 0.0081,
638
- "grad_norm": 0.10746733844280243,
639
- "learning_rate": 0.0008,
640
- "loss": 0.1001,
641
- "step": 81
642
- },
643
- {
644
- "epoch": 0.0082,
645
- "grad_norm": 0.0835120752453804,
646
- "learning_rate": 0.0008100000000000001,
647
- "loss": 0.0699,
648
- "step": 82
649
- },
650
- {
651
- "epoch": 0.0083,
652
- "grad_norm": 0.13253141939640045,
653
- "learning_rate": 0.00082,
654
- "loss": 0.0795,
655
- "step": 83
656
- },
657
- {
658
- "epoch": 0.0084,
659
- "grad_norm": 0.08485159277915955,
660
- "learning_rate": 0.00083,
661
- "loss": 0.0697,
662
- "step": 84
663
- },
664
- {
665
- "epoch": 0.0085,
666
- "grad_norm": 0.14905264973640442,
667
- "learning_rate": 0.0008400000000000001,
668
- "loss": 0.0928,
669
- "step": 85
670
- },
671
- {
672
- "epoch": 0.0086,
673
- "grad_norm": 0.9524427652359009,
674
- "learning_rate": 0.00085,
675
- "loss": 0.3045,
676
- "step": 86
677
- },
678
- {
679
- "epoch": 0.0087,
680
- "grad_norm": 0.5842136740684509,
681
- "learning_rate": 0.0008600000000000001,
682
- "loss": 0.1334,
683
- "step": 87
684
- },
685
- {
686
- "epoch": 0.0088,
687
- "grad_norm": 0.11962056159973145,
688
- "learning_rate": 0.00087,
689
- "loss": 0.0735,
690
- "step": 88
691
- },
692
- {
693
- "epoch": 0.0089,
694
- "grad_norm": 0.057546887546777725,
695
- "learning_rate": 0.00088,
696
- "loss": 0.0477,
697
- "step": 89
698
- },
699
- {
700
- "epoch": 0.009,
701
- "grad_norm": 0.14116229116916656,
702
- "learning_rate": 0.0008900000000000001,
703
- "loss": 0.0651,
704
- "step": 90
705
- },
706
- {
707
- "epoch": 0.009,
708
- "eval_loss": 0.013806294649839401,
709
- "eval_runtime": 680.6174,
710
- "eval_samples_per_second": 14.693,
711
- "eval_steps_per_second": 1.837,
712
- "step": 90
713
- },
714
- {
715
- "epoch": 0.0091,
716
- "grad_norm": 0.0823589563369751,
717
- "learning_rate": 0.0009,
718
- "loss": 0.062,
719
- "step": 91
720
- },
721
- {
722
- "epoch": 0.0092,
723
- "grad_norm": 0.06650274246931076,
724
- "learning_rate": 0.00091,
725
- "loss": 0.0455,
726
- "step": 92
727
- },
728
- {
729
- "epoch": 0.0093,
730
- "grad_norm": 0.49958088994026184,
731
- "learning_rate": 0.0009199999999999999,
732
- "loss": 0.5843,
733
- "step": 93
734
- },
735
- {
736
- "epoch": 0.0094,
737
- "grad_norm": 0.0794510543346405,
738
- "learning_rate": 0.00093,
739
- "loss": 0.0424,
740
- "step": 94
741
- },
742
- {
743
- "epoch": 0.0095,
744
- "grad_norm": 0.09296651929616928,
745
- "learning_rate": 0.0009400000000000001,
746
- "loss": 0.0569,
747
- "step": 95
748
- },
749
- {
750
- "epoch": 0.0096,
751
- "grad_norm": 0.06935586035251617,
752
- "learning_rate": 0.00095,
753
- "loss": 0.0461,
754
- "step": 96
755
- },
756
- {
757
- "epoch": 0.0097,
758
- "grad_norm": 0.06106742471456528,
759
- "learning_rate": 0.00096,
760
- "loss": 0.0388,
761
- "step": 97
762
- },
763
- {
764
- "epoch": 0.0098,
765
- "grad_norm": 0.07485494017601013,
766
- "learning_rate": 0.0009699999999999999,
767
- "loss": 0.0517,
768
- "step": 98
769
- },
770
- {
771
- "epoch": 0.0099,
772
- "grad_norm": 0.15889950096607208,
773
- "learning_rate": 0.00098,
774
- "loss": 0.0938,
775
- "step": 99
776
- },
777
- {
778
- "epoch": 0.01,
779
- "grad_norm": 0.11253108084201813,
780
- "learning_rate": 0.00099,
781
- "loss": 0.0644,
782
- "step": 100
783
- },
784
- {
785
- "epoch": 0.01,
786
- "eval_loss": 0.011293401941657066,
787
- "eval_runtime": 681.3771,
788
- "eval_samples_per_second": 14.676,
789
- "eval_steps_per_second": 1.835,
790
- "step": 100
791
- },
792
- {
793
- "epoch": 0.0101,
794
- "grad_norm": 0.06460646539926529,
795
- "learning_rate": 0.001,
796
- "loss": 0.0401,
797
- "step": 101
798
- },
799
- {
800
- "epoch": 0.0102,
801
- "grad_norm": 0.08616077899932861,
802
- "learning_rate": 0.00101,
803
- "loss": 0.0476,
804
- "step": 102
805
- },
806
- {
807
- "epoch": 0.0103,
808
- "grad_norm": 0.14644859731197357,
809
- "learning_rate": 0.00102,
810
- "loss": 0.0833,
811
- "step": 103
812
- },
813
- {
814
- "epoch": 0.0104,
815
- "grad_norm": 0.04703814536333084,
816
- "learning_rate": 0.00103,
817
- "loss": 0.0301,
818
- "step": 104
819
- },
820
- {
821
- "epoch": 0.0105,
822
- "grad_norm": 0.8709274530410767,
823
- "learning_rate": 0.0010400000000000001,
824
- "loss": 0.7189,
825
- "step": 105
826
- },
827
- {
828
- "epoch": 0.0106,
829
- "grad_norm": 0.08391022682189941,
830
- "learning_rate": 0.00105,
831
- "loss": 0.0517,
832
- "step": 106
833
- },
834
- {
835
- "epoch": 0.0107,
836
- "grad_norm": 0.1281561255455017,
837
- "learning_rate": 0.00106,
838
- "loss": 0.0558,
839
- "step": 107
840
- },
841
- {
842
- "epoch": 0.0108,
843
- "grad_norm": 0.06590331345796585,
844
- "learning_rate": 0.00107,
845
- "loss": 0.0425,
846
- "step": 108
847
- },
848
- {
849
- "epoch": 0.0109,
850
- "grad_norm": 0.024916600435972214,
851
- "learning_rate": 0.00108,
852
- "loss": 0.0243,
853
- "step": 109
854
- },
855
- {
856
- "epoch": 0.011,
857
- "grad_norm": 0.028951430693268776,
858
- "learning_rate": 0.00109,
859
- "loss": 0.0245,
860
- "step": 110
861
- },
862
- {
863
- "epoch": 0.011,
864
- "eval_loss": 0.005778464023023844,
865
- "eval_runtime": 680.0884,
866
- "eval_samples_per_second": 14.704,
867
- "eval_steps_per_second": 1.838,
868
- "step": 110
869
- },
870
- {
871
- "epoch": 0.0111,
872
- "grad_norm": 0.035101134330034256,
873
- "learning_rate": 0.0010999999999999998,
874
- "loss": 0.0292,
875
- "step": 111
876
- },
877
- {
878
- "epoch": 0.0112,
879
- "grad_norm": 0.03888938948512077,
880
- "learning_rate": 0.00111,
881
- "loss": 0.0291,
882
- "step": 112
883
- },
884
- {
885
- "epoch": 0.0113,
886
- "grad_norm": 0.027459941804409027,
887
- "learning_rate": 0.0011200000000000001,
888
- "loss": 0.0232,
889
- "step": 113
890
- },
891
- {
892
- "epoch": 0.0114,
893
- "grad_norm": 0.04769216105341911,
894
- "learning_rate": 0.00113,
895
- "loss": 0.0337,
896
- "step": 114
897
- },
898
- {
899
- "epoch": 0.0115,
900
- "grad_norm": 0.024265864863991737,
901
- "learning_rate": 0.00114,
902
- "loss": 0.0233,
903
- "step": 115
904
- },
905
- {
906
- "epoch": 0.0116,
907
- "grad_norm": 0.07471495121717453,
908
- "learning_rate": 0.0011500000000000002,
909
- "loss": 0.039,
910
- "step": 116
911
- },
912
- {
913
- "epoch": 0.0117,
914
- "grad_norm": 0.03743023797869682,
915
- "learning_rate": 0.00116,
916
- "loss": 0.0291,
917
- "step": 117
918
- },
919
- {
920
- "epoch": 0.0118,
921
- "grad_norm": 0.025691425427794456,
922
- "learning_rate": 0.00117,
923
- "loss": 0.024,
924
- "step": 118
925
- },
926
- {
927
- "epoch": 0.0119,
928
- "grad_norm": 0.02821824699640274,
929
- "learning_rate": 0.00118,
930
- "loss": 0.0245,
931
- "step": 119
932
- },
933
- {
934
- "epoch": 0.012,
935
- "grad_norm": 0.027008380740880966,
936
- "learning_rate": 0.00119,
937
- "loss": 0.0271,
938
- "step": 120
939
- },
940
- {
941
- "epoch": 0.012,
942
- "eval_loss": 0.0045553590171039104,
943
- "eval_runtime": 685.7922,
944
- "eval_samples_per_second": 14.582,
945
- "eval_steps_per_second": 1.823,
946
- "step": 120
947
- }
948
- ],
949
- "logging_steps": 1,
950
- "max_steps": 10000,
951
- "num_input_tokens_seen": 0,
952
- "num_train_epochs": 9223372036854775807,
953
- "save_steps": 10,
954
- "stateful_callbacks": {
955
- "TrainerControl": {
956
- "args": {
957
- "should_epoch_stop": false,
958
- "should_evaluate": false,
959
- "should_log": false,
960
- "should_save": true,
961
- "should_training_stop": false
962
- },
963
- "attributes": {}
964
- }
965
- },
966
- "total_flos": 0.0,
967
- "train_batch_size": 8,
968
- "trial_name": null,
969
- "trial_params": null
970
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af02e602b37732e5013cd321602a51e5ba92ee6a737c53590897eaa51eee7722
3
- size 5841