Ba2han commited on
Commit
d6c4e1f
·
verified ·
1 Parent(s): 3f726c1

Training in progress, step 202, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -74,7 +74,7 @@
74
  },
75
  "sliding_window": null,
76
  "tie_word_embeddings": true,
77
- "transformers_version": "5.5.0",
78
  "unsloth_version": "2026.5.5",
79
  "use_cache": false,
80
  "use_sliding_window": false,
 
74
  },
75
  "sliding_window": null,
76
  "tie_word_embeddings": true,
77
+ "transformers_version": "5.9.0",
78
  "unsloth_version": "2026.5.5",
79
  "use_cache": false,
80
  "use_sliding_window": false,
last-checkpoint/generation_config.json CHANGED
@@ -9,6 +9,6 @@
9
  "output_attentions": false,
10
  "output_hidden_states": false,
11
  "pad_token_id": 50034,
12
- "transformers_version": "5.5.0",
13
  "use_cache": false
14
  }
 
9
  "output_attentions": false,
10
  "output_hidden_states": false,
11
  "pad_token_id": 50034,
12
+ "transformers_version": "5.9.0",
13
  "use_cache": false
14
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddd73a9f4705d43af46441fdabae4f1fcdb54eba8592d8aa3b59157cb049a61d
3
  size 1049614696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e86ea7e76d56fce0e7cf1b5ead34055c12d9e1c046d85c8d5e52e91e6ee5f69d
3
  size 1049614696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75180711f91895ee76c5f92a1790222aed1a68917f97d9bddb9b8c90258b062e
3
- size 1372902609
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:987fe636d2e5772693b0825e190d31af506ea6d4eec9f928250ac37333c1578a
3
+ size 1372902161
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4956fb60be60c7de5858bdceb864c9f31c63309229a058e355df8cca31faf0ff
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5315fadac529441a6634fe188c8336bd2e818d51c6e2348dc0d1261d0463f5d4
3
  size 1465
last-checkpoint/tokenizer_config.json CHANGED
@@ -8,6 +8,7 @@
8
  "<|im_end|>"
9
  ],
10
  "is_local": true,
 
11
  "model_input_names": [
12
  "input_ids",
13
  "attention_mask"
 
8
  "<|im_end|>"
9
  ],
10
  "is_local": true,
11
+ "local_files_only": false,
12
  "model_input_names": [
13
  "input_ids",
14
  "attention_mask"
last-checkpoint/trainer_state.json CHANGED
@@ -2,1251 +2,742 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
- "eval_steps": 51,
7
- "global_step": 337,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.005943536404160475,
14
- "grad_norm": 1.8359375,
15
- "learning_rate": 7.142857142857143e-06,
16
- "loss": 1.5533452033996582,
17
  "step": 2
18
  },
19
  {
20
- "epoch": 0.01188707280832095,
21
- "grad_norm": 1.9609375,
22
- "learning_rate": 2.1428571428571428e-05,
23
- "loss": 1.618288278579712,
24
  "step": 4
25
  },
26
  {
27
- "epoch": 0.017830609212481426,
28
- "grad_norm": 1.8671875,
29
- "learning_rate": 3.571428571428572e-05,
30
- "loss": 1.5396310091018677,
31
  "step": 6
32
  },
33
  {
34
- "epoch": 0.0237741456166419,
35
- "grad_norm": 1.890625,
36
- "learning_rate": 5e-05,
37
- "loss": 1.579732894897461,
38
  "step": 8
39
  },
40
  {
41
- "epoch": 0.029717682020802376,
42
- "grad_norm": 1.1640625,
43
- "learning_rate": 5e-05,
44
- "loss": 1.5438868999481201,
45
  "step": 10
46
  },
47
  {
48
- "epoch": 0.03566121842496285,
49
- "grad_norm": 1.0,
50
  "learning_rate": 5e-05,
51
- "loss": 1.4871983528137207,
52
  "step": 12
53
  },
54
  {
55
- "epoch": 0.041604754829123326,
56
- "grad_norm": 0.8828125,
57
  "learning_rate": 5e-05,
58
- "loss": 1.529313325881958,
59
  "step": 14
60
  },
61
  {
62
- "epoch": 0.0475482912332838,
63
- "grad_norm": 1.234375,
64
  "learning_rate": 5e-05,
65
- "loss": 1.494563341140747,
66
  "step": 16
67
  },
68
  {
69
- "epoch": 0.05349182763744428,
70
- "grad_norm": 1.2109375,
71
  "learning_rate": 5e-05,
72
- "loss": 1.5288245677947998,
73
  "step": 18
74
  },
75
  {
76
- "epoch": 0.05943536404160475,
77
- "grad_norm": 1.109375,
78
  "learning_rate": 5e-05,
79
- "loss": 1.5613418817520142,
80
  "step": 20
81
  },
82
  {
83
- "epoch": 0.06537890044576523,
84
- "grad_norm": 0.88671875,
85
  "learning_rate": 5e-05,
86
- "loss": 1.4736829996109009,
87
  "step": 22
88
  },
89
  {
90
- "epoch": 0.0713224368499257,
91
- "grad_norm": 0.78515625,
92
  "learning_rate": 5e-05,
93
- "loss": 1.3815312385559082,
94
  "step": 24
95
  },
96
  {
97
- "epoch": 0.07726597325408618,
98
- "grad_norm": 0.8515625,
99
  "learning_rate": 5e-05,
100
- "loss": 1.5204863548278809,
101
  "step": 26
102
  },
103
  {
104
- "epoch": 0.08320950965824665,
105
- "grad_norm": 0.9375,
106
  "learning_rate": 5e-05,
107
- "loss": 1.4796512126922607,
108
  "step": 28
109
  },
110
  {
111
- "epoch": 0.08915304606240713,
112
- "grad_norm": 0.78125,
113
  "learning_rate": 5e-05,
114
- "loss": 1.4494001865386963,
115
  "step": 30
116
  },
117
  {
118
- "epoch": 0.0950965824665676,
119
- "grad_norm": 0.9375,
120
  "learning_rate": 5e-05,
121
- "loss": 1.4973952770233154,
122
  "step": 32
123
  },
124
  {
125
- "epoch": 0.10104011887072809,
126
- "grad_norm": 0.875,
127
  "learning_rate": 5e-05,
128
- "loss": 1.4398908615112305,
129
  "step": 34
130
  },
131
  {
132
- "epoch": 0.10698365527488855,
133
- "grad_norm": 0.9609375,
134
  "learning_rate": 5e-05,
135
- "loss": 1.3991800546646118,
136
  "step": 36
137
  },
138
  {
139
- "epoch": 0.11292719167904904,
140
- "grad_norm": 0.80859375,
141
  "learning_rate": 5e-05,
142
- "loss": 1.475541353225708,
143
  "step": 38
144
  },
145
  {
146
- "epoch": 0.1188707280832095,
147
- "grad_norm": 0.9375,
148
  "learning_rate": 5e-05,
149
- "loss": 1.4666297435760498,
150
  "step": 40
151
  },
152
  {
153
- "epoch": 0.12481426448736999,
154
- "grad_norm": 0.859375,
155
  "learning_rate": 5e-05,
156
- "loss": 1.4133172035217285,
157
  "step": 42
158
  },
159
  {
160
- "epoch": 0.13075780089153047,
161
- "grad_norm": 0.83203125,
162
  "learning_rate": 5e-05,
163
- "loss": 1.4405813217163086,
164
  "step": 44
165
  },
166
  {
167
- "epoch": 0.13670133729569092,
168
- "grad_norm": 0.7265625,
169
  "learning_rate": 5e-05,
170
- "loss": 1.5167930126190186,
171
  "step": 46
172
  },
173
  {
174
- "epoch": 0.1426448736998514,
175
- "grad_norm": 0.88671875,
176
  "learning_rate": 5e-05,
177
- "loss": 1.35231614112854,
178
  "step": 48
179
  },
180
  {
181
- "epoch": 0.1485884101040119,
182
- "grad_norm": 0.9296875,
183
  "learning_rate": 5e-05,
184
- "loss": 1.5603207349777222,
185
  "step": 50
186
  },
187
  {
188
- "epoch": 0.1515601783060921,
189
- "eval_loss": 1.49801504611969,
190
- "eval_runtime": 1.855,
191
- "eval_samples_per_second": 58.76,
192
- "eval_steps_per_second": 7.547,
193
- "step": 51
194
- },
195
- {
196
- "epoch": 0.15453194650817237,
197
- "grad_norm": 0.8359375,
198
  "learning_rate": 5e-05,
199
- "loss": 1.5468140840530396,
200
  "step": 52
201
  },
202
  {
203
- "epoch": 0.16047548291233285,
204
- "grad_norm": 0.8515625,
205
  "learning_rate": 5e-05,
206
- "loss": 1.483694076538086,
207
  "step": 54
208
  },
209
  {
210
- "epoch": 0.1664190193164933,
211
- "grad_norm": 0.859375,
212
  "learning_rate": 5e-05,
213
- "loss": 1.3892195224761963,
214
  "step": 56
215
  },
216
  {
217
- "epoch": 0.1723625557206538,
218
- "grad_norm": 0.8359375,
219
  "learning_rate": 5e-05,
220
- "loss": 1.4213433265686035,
221
  "step": 58
222
  },
223
  {
224
- "epoch": 0.17830609212481427,
225
- "grad_norm": 0.75,
226
  "learning_rate": 5e-05,
227
- "loss": 1.5514280796051025,
228
  "step": 60
229
  },
230
  {
231
- "epoch": 0.18424962852897475,
232
- "grad_norm": 0.7265625,
233
  "learning_rate": 5e-05,
234
- "loss": 1.415259599685669,
235
  "step": 62
236
  },
237
  {
238
- "epoch": 0.1901931649331352,
239
- "grad_norm": 0.83984375,
240
  "learning_rate": 5e-05,
241
- "loss": 1.4870233535766602,
242
  "step": 64
243
  },
244
  {
245
- "epoch": 0.1961367013372957,
246
- "grad_norm": 0.83203125,
247
  "learning_rate": 5e-05,
248
- "loss": 1.5115995407104492,
249
  "step": 66
250
  },
251
  {
252
- "epoch": 0.20208023774145617,
253
- "grad_norm": 0.83984375,
254
  "learning_rate": 5e-05,
255
- "loss": 1.5251541137695312,
256
  "step": 68
257
  },
258
  {
259
- "epoch": 0.20802377414561665,
260
- "grad_norm": 0.8515625,
261
  "learning_rate": 5e-05,
262
- "loss": 1.4827852249145508,
263
  "step": 70
264
  },
265
  {
266
- "epoch": 0.2139673105497771,
267
- "grad_norm": 1.0078125,
268
  "learning_rate": 5e-05,
269
- "loss": 1.563201904296875,
270
  "step": 72
271
  },
272
  {
273
- "epoch": 0.2199108469539376,
274
- "grad_norm": 0.73046875,
275
  "learning_rate": 5e-05,
276
- "loss": 1.488223910331726,
277
  "step": 74
278
  },
279
  {
280
- "epoch": 0.22585438335809807,
281
- "grad_norm": 0.8828125,
282
  "learning_rate": 5e-05,
283
- "loss": 1.4918928146362305,
284
  "step": 76
285
  },
286
  {
287
- "epoch": 0.23179791976225855,
288
- "grad_norm": 0.78515625,
 
 
 
 
 
 
 
 
289
  "learning_rate": 5e-05,
290
- "loss": 1.499556541442871,
291
  "step": 78
292
  },
293
  {
294
- "epoch": 0.237741456166419,
295
- "grad_norm": 0.68359375,
296
  "learning_rate": 5e-05,
297
- "loss": 1.398608684539795,
298
  "step": 80
299
  },
300
  {
301
- "epoch": 0.2436849925705795,
302
- "grad_norm": 0.76953125,
303
  "learning_rate": 5e-05,
304
- "loss": 1.491286277770996,
305
  "step": 82
306
  },
307
  {
308
- "epoch": 0.24962852897473997,
309
- "grad_norm": 0.93359375,
310
  "learning_rate": 5e-05,
311
- "loss": 1.3734058141708374,
312
  "step": 84
313
  },
314
  {
315
- "epoch": 0.2555720653789004,
316
- "grad_norm": 0.9453125,
317
  "learning_rate": 5e-05,
318
- "loss": 1.5038151741027832,
319
  "step": 86
320
  },
321
  {
322
- "epoch": 0.26151560178306094,
323
- "grad_norm": 0.73828125,
324
  "learning_rate": 5e-05,
325
- "loss": 1.5082918405532837,
326
  "step": 88
327
  },
328
  {
329
- "epoch": 0.2674591381872214,
330
- "grad_norm": 0.73828125,
331
  "learning_rate": 5e-05,
332
- "loss": 1.5045579671859741,
333
  "step": 90
334
  },
335
  {
336
- "epoch": 0.27340267459138184,
337
- "grad_norm": 0.765625,
338
  "learning_rate": 5e-05,
339
- "loss": 1.550144910812378,
340
  "step": 92
341
  },
342
  {
343
- "epoch": 0.27934621099554235,
344
  "grad_norm": 0.6875,
345
  "learning_rate": 5e-05,
346
- "loss": 1.459304690361023,
347
  "step": 94
348
  },
349
  {
350
- "epoch": 0.2852897473997028,
351
- "grad_norm": 0.84765625,
352
  "learning_rate": 5e-05,
353
- "loss": 1.4019618034362793,
354
  "step": 96
355
  },
356
  {
357
- "epoch": 0.2912332838038633,
358
- "grad_norm": 0.765625,
359
  "learning_rate": 5e-05,
360
- "loss": 1.4445351362228394,
361
  "step": 98
362
  },
363
  {
364
- "epoch": 0.2971768202080238,
365
- "grad_norm": 1.0078125,
366
  "learning_rate": 5e-05,
367
- "loss": 1.5843751430511475,
368
  "step": 100
369
  },
370
  {
371
- "epoch": 0.3031203566121842,
372
- "grad_norm": 0.8203125,
373
  "learning_rate": 5e-05,
374
- "loss": 1.4642349481582642,
375
- "step": 102
376
- },
377
- {
378
- "epoch": 0.3031203566121842,
379
- "eval_loss": 1.4828336238861084,
380
- "eval_runtime": 1.1636,
381
- "eval_samples_per_second": 93.674,
382
- "eval_steps_per_second": 12.032,
383
  "step": 102
384
  },
385
  {
386
- "epoch": 0.30906389301634474,
387
- "grad_norm": 0.99609375,
388
  "learning_rate": 5e-05,
389
- "loss": 1.4447834491729736,
390
  "step": 104
391
  },
392
  {
393
- "epoch": 0.3150074294205052,
394
- "grad_norm": 0.94921875,
395
  "learning_rate": 5e-05,
396
- "loss": 1.4195196628570557,
397
  "step": 106
398
  },
399
  {
400
- "epoch": 0.3209509658246657,
401
- "grad_norm": 0.8046875,
402
  "learning_rate": 5e-05,
403
- "loss": 1.4759737253189087,
404
  "step": 108
405
  },
406
  {
407
- "epoch": 0.32689450222882616,
408
- "grad_norm": 0.796875,
409
  "learning_rate": 5e-05,
410
- "loss": 1.5813639163970947,
411
  "step": 110
412
  },
413
  {
414
- "epoch": 0.3328380386329866,
415
- "grad_norm": 0.8828125,
416
  "learning_rate": 5e-05,
417
- "loss": 1.4312057495117188,
418
  "step": 112
419
  },
420
  {
421
- "epoch": 0.3387815750371471,
422
- "grad_norm": 0.8125,
423
  "learning_rate": 5e-05,
424
- "loss": 1.405687689781189,
425
  "step": 114
426
  },
427
  {
428
- "epoch": 0.3447251114413076,
429
- "grad_norm": 0.89453125,
430
  "learning_rate": 5e-05,
431
- "loss": 1.4332811832427979,
432
  "step": 116
433
  },
434
  {
435
- "epoch": 0.35066864784546803,
436
- "grad_norm": 0.72265625,
437
  "learning_rate": 5e-05,
438
- "loss": 1.4324063062667847,
439
  "step": 118
440
  },
441
  {
442
- "epoch": 0.35661218424962854,
443
- "grad_norm": 0.89453125,
444
  "learning_rate": 5e-05,
445
- "loss": 1.5157840251922607,
446
  "step": 120
447
  },
448
  {
449
- "epoch": 0.362555720653789,
450
- "grad_norm": 0.9375,
451
  "learning_rate": 5e-05,
452
- "loss": 1.4901947975158691,
453
  "step": 122
454
  },
455
  {
456
- "epoch": 0.3684992570579495,
457
- "grad_norm": 0.85546875,
458
  "learning_rate": 5e-05,
459
- "loss": 1.4857661724090576,
460
  "step": 124
461
  },
462
  {
463
- "epoch": 0.37444279346210996,
464
- "grad_norm": 0.75390625,
465
  "learning_rate": 5e-05,
466
- "loss": 1.4482135772705078,
467
  "step": 126
468
  },
469
  {
470
- "epoch": 0.3803863298662704,
471
- "grad_norm": 0.86328125,
472
  "learning_rate": 5e-05,
473
- "loss": 1.5095102787017822,
474
  "step": 128
475
  },
476
  {
477
- "epoch": 0.3863298662704309,
478
- "grad_norm": 0.8828125,
479
  "learning_rate": 5e-05,
480
- "loss": 1.4518234729766846,
481
  "step": 130
482
  },
483
  {
484
- "epoch": 0.3922734026745914,
485
- "grad_norm": 0.7265625,
486
  "learning_rate": 5e-05,
487
- "loss": 1.4330897331237793,
488
  "step": 132
489
  },
490
  {
491
- "epoch": 0.39821693907875183,
492
- "grad_norm": 0.796875,
493
  "learning_rate": 5e-05,
494
- "loss": 1.462794303894043,
495
  "step": 134
496
  },
497
  {
498
- "epoch": 0.40416047548291234,
499
- "grad_norm": 0.76171875,
500
  "learning_rate": 5e-05,
501
- "loss": 1.4570199251174927,
502
  "step": 136
503
  },
504
  {
505
- "epoch": 0.4101040118870728,
506
- "grad_norm": 0.8671875,
507
  "learning_rate": 5e-05,
508
- "loss": 1.50748610496521,
509
  "step": 138
510
  },
511
  {
512
- "epoch": 0.4160475482912333,
513
- "grad_norm": 0.86328125,
514
  "learning_rate": 5e-05,
515
- "loss": 1.478920578956604,
516
  "step": 140
517
  },
518
  {
519
- "epoch": 0.42199108469539376,
520
- "grad_norm": 0.8359375,
521
  "learning_rate": 5e-05,
522
- "loss": 1.4309303760528564,
523
  "step": 142
524
  },
525
  {
526
- "epoch": 0.4279346210995542,
527
- "grad_norm": 0.6875,
528
  "learning_rate": 5e-05,
529
- "loss": 1.5331854820251465,
530
  "step": 144
531
  },
532
  {
533
- "epoch": 0.4338781575037147,
534
- "grad_norm": 0.78125,
535
  "learning_rate": 5e-05,
536
- "loss": 1.426405668258667,
537
  "step": 146
538
  },
539
  {
540
- "epoch": 0.4398216939078752,
541
- "grad_norm": 0.87109375,
542
  "learning_rate": 5e-05,
543
- "loss": 1.4882712364196777,
544
  "step": 148
545
  },
546
  {
547
- "epoch": 0.4457652303120357,
548
- "grad_norm": 0.8359375,
549
  "learning_rate": 5e-05,
550
- "loss": 1.5059183835983276,
551
  "step": 150
552
  },
553
  {
554
- "epoch": 0.45170876671619614,
555
- "grad_norm": 0.8671875,
556
  "learning_rate": 5e-05,
557
- "loss": 1.3766722679138184,
558
  "step": 152
559
  },
560
  {
561
- "epoch": 0.45468053491827637,
562
- "eval_loss": 1.4696053266525269,
563
- "eval_runtime": 1.1246,
564
- "eval_samples_per_second": 96.922,
565
- "eval_steps_per_second": 12.449,
566
- "step": 153
567
  },
568
  {
569
- "epoch": 0.4576523031203566,
570
- "grad_norm": 0.81640625,
571
  "learning_rate": 5e-05,
572
- "loss": 1.4256658554077148,
573
  "step": 154
574
  },
575
  {
576
- "epoch": 0.4635958395245171,
577
- "grad_norm": 0.83984375,
578
  "learning_rate": 5e-05,
579
- "loss": 1.458268165588379,
580
  "step": 156
581
  },
582
  {
583
- "epoch": 0.46953937592867756,
584
- "grad_norm": 0.94921875,
585
  "learning_rate": 5e-05,
586
- "loss": 1.3582110404968262,
587
  "step": 158
588
  },
589
  {
590
- "epoch": 0.475482912332838,
591
- "grad_norm": 0.875,
592
  "learning_rate": 5e-05,
593
- "loss": 1.435016393661499,
594
  "step": 160
595
  },
596
  {
597
- "epoch": 0.4814264487369985,
598
- "grad_norm": 0.734375,
599
  "learning_rate": 5e-05,
600
- "loss": 1.4493205547332764,
601
  "step": 162
602
  },
603
  {
604
- "epoch": 0.487369985141159,
605
- "grad_norm": 0.734375,
606
  "learning_rate": 5e-05,
607
- "loss": 1.4993988275527954,
608
  "step": 164
609
  },
610
  {
611
- "epoch": 0.4933135215453195,
612
- "grad_norm": 0.72265625,
613
  "learning_rate": 5e-05,
614
- "loss": 1.5079882144927979,
615
  "step": 166
616
  },
617
  {
618
- "epoch": 0.49925705794947994,
619
- "grad_norm": 0.75390625,
620
  "learning_rate": 5e-05,
621
- "loss": 1.518232822418213,
622
  "step": 168
623
  },
624
  {
625
- "epoch": 0.5052005943536404,
626
- "grad_norm": 0.90234375,
627
  "learning_rate": 5e-05,
628
- "loss": 1.5173208713531494,
629
  "step": 170
630
  },
631
  {
632
- "epoch": 0.5111441307578009,
633
- "grad_norm": 0.8359375,
634
  "learning_rate": 5e-05,
635
- "loss": 1.4525389671325684,
636
  "step": 172
637
  },
638
  {
639
- "epoch": 0.5170876671619614,
640
- "grad_norm": 0.90234375,
641
  "learning_rate": 5e-05,
642
- "loss": 1.5169626474380493,
643
  "step": 174
644
  },
645
  {
646
- "epoch": 0.5230312035661219,
647
- "grad_norm": 0.7578125,
648
  "learning_rate": 5e-05,
649
- "loss": 1.3763575553894043,
650
  "step": 176
651
  },
652
  {
653
- "epoch": 0.5289747399702823,
654
- "grad_norm": 0.78125,
655
  "learning_rate": 5e-05,
656
- "loss": 1.500097393989563,
657
  "step": 178
658
  },
659
  {
660
- "epoch": 0.5349182763744428,
661
- "grad_norm": 0.93359375,
662
  "learning_rate": 5e-05,
663
- "loss": 1.4460171461105347,
664
  "step": 180
665
  },
666
  {
667
- "epoch": 0.5408618127786032,
668
- "grad_norm": 0.79296875,
669
  "learning_rate": 5e-05,
670
- "loss": 1.4529346227645874,
671
  "step": 182
672
  },
673
  {
674
- "epoch": 0.5468053491827637,
675
- "grad_norm": 0.86328125,
676
  "learning_rate": 5e-05,
677
- "loss": 1.4821476936340332,
678
  "step": 184
679
  },
680
  {
681
- "epoch": 0.5527488855869243,
682
- "grad_norm": 1.078125,
683
  "learning_rate": 5e-05,
684
- "loss": 1.4030323028564453,
685
  "step": 186
686
  },
687
  {
688
- "epoch": 0.5586924219910847,
689
- "grad_norm": 1.0,
690
  "learning_rate": 5e-05,
691
- "loss": 1.416299819946289,
692
  "step": 188
693
  },
694
  {
695
- "epoch": 0.5646359583952452,
696
- "grad_norm": 0.85546875,
697
  "learning_rate": 5e-05,
698
- "loss": 1.4422305822372437,
699
  "step": 190
700
  },
701
  {
702
- "epoch": 0.5705794947994056,
703
- "grad_norm": 0.83203125,
704
  "learning_rate": 5e-05,
705
- "loss": 1.4656535387039185,
706
  "step": 192
707
  },
708
  {
709
- "epoch": 0.5765230312035661,
710
- "grad_norm": 0.7421875,
711
  "learning_rate": 5e-05,
712
- "loss": 1.4635984897613525,
713
  "step": 194
714
  },
715
  {
716
- "epoch": 0.5824665676077266,
717
- "grad_norm": 0.8828125,
718
  "learning_rate": 5e-05,
719
- "loss": 1.4563398361206055,
720
  "step": 196
721
  },
722
  {
723
- "epoch": 0.5884101040118871,
724
- "grad_norm": 0.81640625,
725
  "learning_rate": 5e-05,
726
- "loss": 1.4318304061889648,
727
  "step": 198
728
  },
729
  {
730
- "epoch": 0.5943536404160475,
731
- "grad_norm": 0.90234375,
732
  "learning_rate": 5e-05,
733
- "loss": 1.389236569404602,
734
  "step": 200
735
  },
736
  {
737
- "epoch": 0.600297176820208,
738
- "grad_norm": 0.85546875,
739
  "learning_rate": 5e-05,
740
- "loss": 1.4266612529754639,
741
  "step": 202
742
- },
743
- {
744
- "epoch": 0.6062407132243685,
745
- "grad_norm": 0.734375,
746
- "learning_rate": 5e-05,
747
- "loss": 1.5085554122924805,
748
- "step": 204
749
- },
750
- {
751
- "epoch": 0.6062407132243685,
752
- "eval_loss": 1.4601037502288818,
753
- "eval_runtime": 1.128,
754
- "eval_samples_per_second": 96.628,
755
- "eval_steps_per_second": 12.411,
756
- "step": 204
757
- },
758
- {
759
- "epoch": 0.612184249628529,
760
- "grad_norm": 0.734375,
761
- "learning_rate": 5e-05,
762
- "loss": 1.4719808101654053,
763
- "step": 206
764
- },
765
- {
766
- "epoch": 0.6181277860326895,
767
- "grad_norm": 0.90625,
768
- "learning_rate": 5e-05,
769
- "loss": 1.4344429969787598,
770
- "step": 208
771
- },
772
- {
773
- "epoch": 0.6240713224368499,
774
- "grad_norm": 0.81640625,
775
- "learning_rate": 5e-05,
776
- "loss": 1.4264543056488037,
777
- "step": 210
778
- },
779
- {
780
- "epoch": 0.6300148588410104,
781
- "grad_norm": 0.7265625,
782
- "learning_rate": 5e-05,
783
- "loss": 1.4732258319854736,
784
- "step": 212
785
- },
786
- {
787
- "epoch": 0.6359583952451708,
788
- "grad_norm": 0.73828125,
789
- "learning_rate": 5e-05,
790
- "loss": 1.371578574180603,
791
- "step": 214
792
- },
793
- {
794
- "epoch": 0.6419019316493314,
795
- "grad_norm": 0.82421875,
796
- "learning_rate": 5e-05,
797
- "loss": 1.4412343502044678,
798
- "step": 216
799
- },
800
- {
801
- "epoch": 0.6478454680534919,
802
- "grad_norm": 0.71484375,
803
- "learning_rate": 5e-05,
804
- "loss": 1.51022207736969,
805
- "step": 218
806
- },
807
- {
808
- "epoch": 0.6537890044576523,
809
- "grad_norm": 0.84375,
810
- "learning_rate": 5e-05,
811
- "loss": 1.367915391921997,
812
- "step": 220
813
- },
814
- {
815
- "epoch": 0.6597325408618128,
816
- "grad_norm": 0.86328125,
817
- "learning_rate": 5e-05,
818
- "loss": 1.4306704998016357,
819
- "step": 222
820
- },
821
- {
822
- "epoch": 0.6656760772659732,
823
- "grad_norm": 0.76171875,
824
- "learning_rate": 5e-05,
825
- "loss": 1.432612419128418,
826
- "step": 224
827
- },
828
- {
829
- "epoch": 0.6716196136701337,
830
- "grad_norm": 0.67578125,
831
- "learning_rate": 5e-05,
832
- "loss": 1.4430606365203857,
833
- "step": 226
834
- },
835
- {
836
- "epoch": 0.6775631500742942,
837
- "grad_norm": 0.67578125,
838
- "learning_rate": 5e-05,
839
- "loss": 1.4083107709884644,
840
- "step": 228
841
- },
842
- {
843
- "epoch": 0.6835066864784547,
844
- "grad_norm": 0.87109375,
845
- "learning_rate": 5e-05,
846
- "loss": 1.4255032539367676,
847
- "step": 230
848
- },
849
- {
850
- "epoch": 0.6894502228826151,
851
- "grad_norm": 0.828125,
852
- "learning_rate": 5e-05,
853
- "loss": 1.4819388389587402,
854
- "step": 232
855
- },
856
- {
857
- "epoch": 0.6953937592867756,
858
- "grad_norm": 0.84375,
859
- "learning_rate": 5e-05,
860
- "loss": 1.541199803352356,
861
- "step": 234
862
- },
863
- {
864
- "epoch": 0.7013372956909361,
865
- "grad_norm": 0.75390625,
866
- "learning_rate": 5e-05,
867
- "loss": 1.4741461277008057,
868
- "step": 236
869
- },
870
- {
871
- "epoch": 0.7072808320950966,
872
- "grad_norm": 0.83203125,
873
- "learning_rate": 5e-05,
874
- "loss": 1.4825263023376465,
875
- "step": 238
876
- },
877
- {
878
- "epoch": 0.7132243684992571,
879
- "grad_norm": 0.95703125,
880
- "learning_rate": 5e-05,
881
- "loss": 1.4338710308074951,
882
- "step": 240
883
- },
884
- {
885
- "epoch": 0.7191679049034175,
886
- "grad_norm": 0.79296875,
887
- "learning_rate": 5e-05,
888
- "loss": 1.4071189165115356,
889
- "step": 242
890
- },
891
- {
892
- "epoch": 0.725111441307578,
893
- "grad_norm": 0.765625,
894
- "learning_rate": 5e-05,
895
- "loss": 1.4799857139587402,
896
- "step": 244
897
- },
898
- {
899
- "epoch": 0.7310549777117384,
900
- "grad_norm": 0.6953125,
901
- "learning_rate": 5e-05,
902
- "loss": 1.4438296556472778,
903
- "step": 246
904
- },
905
- {
906
- "epoch": 0.736998514115899,
907
- "grad_norm": 0.77734375,
908
- "learning_rate": 5e-05,
909
- "loss": 1.4408268928527832,
910
- "step": 248
911
- },
912
- {
913
- "epoch": 0.7429420505200595,
914
- "grad_norm": 0.7734375,
915
- "learning_rate": 5e-05,
916
- "loss": 1.3916218280792236,
917
- "step": 250
918
- },
919
- {
920
- "epoch": 0.7488855869242199,
921
- "grad_norm": 0.94140625,
922
- "learning_rate": 5e-05,
923
- "loss": 1.3853819370269775,
924
- "step": 252
925
- },
926
- {
927
- "epoch": 0.7548291233283804,
928
- "grad_norm": 0.83984375,
929
- "learning_rate": 4.998292650357558e-05,
930
- "loss": 1.3740458488464355,
931
- "step": 254
932
- },
933
- {
934
- "epoch": 0.7578008915304606,
935
- "eval_loss": 1.4529203176498413,
936
- "eval_runtime": 1.1255,
937
- "eval_samples_per_second": 96.842,
938
- "eval_steps_per_second": 12.438,
939
- "step": 255
940
- },
941
- {
942
- "epoch": 0.7607726597325408,
943
- "grad_norm": 0.8671875,
944
- "learning_rate": 4.984647842238185e-05,
945
- "loss": 1.5183303356170654,
946
- "step": 256
947
- },
948
- {
949
- "epoch": 0.7667161961367014,
950
- "grad_norm": 0.7265625,
951
- "learning_rate": 4.957432749209755e-05,
952
- "loss": 1.4994571208953857,
953
- "step": 258
954
- },
955
- {
956
- "epoch": 0.7726597325408618,
957
- "grad_norm": 0.84765625,
958
- "learning_rate": 4.916796010672969e-05,
959
- "loss": 1.494471549987793,
960
- "step": 260
961
- },
962
- {
963
- "epoch": 0.7786032689450223,
964
- "grad_norm": 0.8671875,
965
- "learning_rate": 4.862959570402049e-05,
966
- "loss": 1.4754960536956787,
967
- "step": 262
968
- },
969
- {
970
- "epoch": 0.7845468053491828,
971
- "grad_norm": 0.8046875,
972
- "learning_rate": 4.796217464364808e-05,
973
- "loss": 1.4098687171936035,
974
- "step": 264
975
- },
976
- {
977
- "epoch": 0.7904903417533432,
978
- "grad_norm": 0.8046875,
979
- "learning_rate": 4.716934214800155e-05,
980
- "loss": 1.445394515991211,
981
- "step": 266
982
- },
983
- {
984
- "epoch": 0.7964338781575037,
985
- "grad_norm": 0.7109375,
986
- "learning_rate": 4.625542839324036e-05,
987
- "loss": 1.4716103076934814,
988
- "step": 268
989
- },
990
- {
991
- "epoch": 0.8023774145616642,
992
- "grad_norm": 0.8359375,
993
- "learning_rate": 4.522542485937369e-05,
994
- "loss": 1.4395897388458252,
995
- "step": 270
996
- },
997
- {
998
- "epoch": 0.8083209509658247,
999
- "grad_norm": 0.75,
1000
- "learning_rate": 4.408495706852758e-05,
1001
- "loss": 1.4456340074539185,
1002
- "step": 272
1003
- },
1004
- {
1005
- "epoch": 0.8142644873699851,
1006
- "grad_norm": 0.74609375,
1007
- "learning_rate": 4.284025386029381e-05,
1008
- "loss": 1.455463171005249,
1009
- "step": 274
1010
- },
1011
- {
1012
- "epoch": 0.8202080237741456,
1013
- "grad_norm": 0.8828125,
1014
- "learning_rate": 4.149811337196807e-05,
1015
- "loss": 1.4609105587005615,
1016
- "step": 276
1017
- },
1018
- {
1019
- "epoch": 0.826151560178306,
1020
- "grad_norm": 0.83203125,
1021
- "learning_rate": 4.0065865909481417e-05,
1022
- "loss": 1.4417420625686646,
1023
- "step": 278
1024
- },
1025
- {
1026
- "epoch": 0.8320950965824666,
1027
- "grad_norm": 0.86328125,
1028
- "learning_rate": 3.855133391181124e-05,
1029
- "loss": 1.4518589973449707,
1030
- "step": 280
1031
- },
1032
- {
1033
- "epoch": 0.8380386329866271,
1034
- "grad_norm": 0.76171875,
1035
- "learning_rate": 3.696278922753216e-05,
1036
- "loss": 1.4845668077468872,
1037
- "step": 282
1038
- },
1039
- {
1040
- "epoch": 0.8439821693907875,
1041
- "grad_norm": 0.80078125,
1042
- "learning_rate": 3.5308907936847594e-05,
1043
- "loss": 1.4887086153030396,
1044
- "step": 284
1045
- },
1046
- {
1047
- "epoch": 0.849925705794948,
1048
- "grad_norm": 0.7578125,
1049
- "learning_rate": 3.3598722965848204e-05,
1050
- "loss": 1.5054309368133545,
1051
- "step": 286
1052
- },
1053
- {
1054
- "epoch": 0.8558692421991084,
1055
- "grad_norm": 0.82421875,
1056
- "learning_rate": 3.1841574751802076e-05,
1057
- "loss": 1.5620818138122559,
1058
- "step": 288
1059
- },
1060
- {
1061
- "epoch": 0.861812778603269,
1062
- "grad_norm": 0.86328125,
1063
- "learning_rate": 3.0047060228925256e-05,
1064
- "loss": 1.5021510124206543,
1065
- "step": 290
1066
- },
1067
- {
1068
- "epoch": 0.8677563150074294,
1069
- "grad_norm": 0.71875,
1070
- "learning_rate": 2.8224980413255086e-05,
1071
- "loss": 1.514552354812622,
1072
- "step": 292
1073
- },
1074
- {
1075
- "epoch": 0.8736998514115899,
1076
- "grad_norm": 0.73828125,
1077
- "learning_rate": 2.638528687289925e-05,
1078
- "loss": 1.460189700126648,
1079
- "step": 294
1080
- },
1081
- {
1082
- "epoch": 0.8796433878157504,
1083
- "grad_norm": 0.7421875,
1084
- "learning_rate": 2.453802737602176e-05,
1085
- "loss": 1.3503719568252563,
1086
- "step": 296
1087
- },
1088
- {
1089
- "epoch": 0.8855869242199108,
1090
- "grad_norm": 0.76171875,
1091
- "learning_rate": 2.2693291013417453e-05,
1092
- "loss": 1.5173900127410889,
1093
- "step": 298
1094
- },
1095
- {
1096
- "epoch": 0.8915304606240714,
1097
- "grad_norm": 0.72265625,
1098
- "learning_rate": 2.0861153095396748e-05,
1099
- "loss": 1.4111042022705078,
1100
- "step": 300
1101
- },
1102
- {
1103
- "epoch": 0.8974739970282318,
1104
- "grad_norm": 0.71484375,
1105
- "learning_rate": 1.9051620123934537e-05,
1106
- "loss": 1.4748804569244385,
1107
- "step": 302
1108
- },
1109
- {
1110
- "epoch": 0.9034175334323923,
1111
- "grad_norm": 0.859375,
1112
- "learning_rate": 1.7274575140626318e-05,
1113
- "loss": 1.4101568460464478,
1114
- "step": 304
1115
- },
1116
- {
1117
- "epoch": 0.9093610698365527,
1118
- "grad_norm": 0.98828125,
1119
- "learning_rate": 1.5539723748942245e-05,
1120
- "loss": 1.5212171077728271,
1121
- "step": 306
1122
- },
1123
- {
1124
- "epoch": 0.9093610698365527,
1125
- "eval_loss": 1.4502739906311035,
1126
- "eval_runtime": 1.126,
1127
- "eval_samples_per_second": 96.803,
1128
- "eval_steps_per_second": 12.433,
1129
- "step": 306
1130
- },
1131
- {
1132
- "epoch": 0.9153046062407132,
1133
- "grad_norm": 0.79296875,
1134
- "learning_rate": 1.3856541105586545e-05,
1135
- "loss": 1.4394731521606445,
1136
- "step": 308
1137
- },
1138
- {
1139
- "epoch": 0.9212481426448736,
1140
- "grad_norm": 0.828125,
1141
- "learning_rate": 1.223422017047733e-05,
1142
- "loss": 1.4315065145492554,
1143
- "step": 310
1144
- },
1145
- {
1146
- "epoch": 0.9271916790490342,
1147
- "grad_norm": 0.7109375,
1148
- "learning_rate": 1.068162149798737e-05,
1149
- "loss": 1.4175217151641846,
1150
- "step": 312
1151
- },
1152
- {
1153
- "epoch": 0.9331352154531947,
1154
- "grad_norm": 0.8203125,
1155
- "learning_rate": 9.207224843668732e-06,
1156
- "loss": 1.4227707386016846,
1157
- "step": 314
1158
- },
1159
- {
1160
- "epoch": 0.9390787518573551,
1161
- "grad_norm": 0.79296875,
1162
- "learning_rate": 7.819082850768434e-06,
1163
- "loss": 1.4493082761764526,
1164
- "step": 316
1165
- },
1166
- {
1167
- "epoch": 0.9450222882615156,
1168
- "grad_norm": 0.8125,
1169
- "learning_rate": 6.524777069483526e-06,
1170
- "loss": 1.5667023658752441,
1171
- "step": 318
1172
- },
1173
- {
1174
- "epoch": 0.950965824665676,
1175
- "grad_norm": 0.8125,
1176
- "learning_rate": 5.33137654916292e-06,
1177
- "loss": 1.4023852348327637,
1178
- "step": 320
1179
- },
1180
- {
1181
- "epoch": 0.9569093610698366,
1182
- "grad_norm": 0.79296875,
1183
- "learning_rate": 4.245399229611238e-06,
1184
- "loss": 1.4751367568969727,
1185
- "step": 322
1186
- },
1187
- {
1188
- "epoch": 0.962852897473997,
1189
- "grad_norm": 0.82421875,
1190
- "learning_rate": 3.2727763423617913e-06,
1191
- "loss": 1.474877953529358,
1192
- "step": 324
1193
- },
1194
- {
1195
- "epoch": 0.9687964338781575,
1196
- "grad_norm": 0.7890625,
1197
- "learning_rate": 2.418820016346779e-06,
1198
- "loss": 1.4048078060150146,
1199
- "step": 326
1200
- },
1201
- {
1202
- "epoch": 0.974739970282318,
1203
- "grad_norm": 0.71484375,
1204
- "learning_rate": 1.6881942648911076e-06,
1205
- "loss": 1.442025899887085,
1206
- "step": 328
1207
- },
1208
- {
1209
- "epoch": 0.9806835066864784,
1210
- "grad_norm": 0.78515625,
1211
- "learning_rate": 1.0848895124889818e-06,
1212
- "loss": 1.4316831827163696,
1213
- "step": 330
1214
- },
1215
- {
1216
- "epoch": 0.986627043090639,
1217
- "grad_norm": 0.77734375,
1218
- "learning_rate": 6.122008004890851e-07,
1219
- "loss": 1.4720109701156616,
1220
- "step": 332
1221
- },
1222
- {
1223
- "epoch": 0.9925705794947994,
1224
- "grad_norm": 0.765625,
1225
- "learning_rate": 2.7270979072135104e-07,
1226
- "loss": 1.3915910720825195,
1227
- "step": 334
1228
- },
1229
- {
1230
- "epoch": 0.9985141158989599,
1231
- "grad_norm": 0.72265625,
1232
- "learning_rate": 6.827066535529946e-08,
1233
- "loss": 1.4640827178955078,
1234
- "step": 336
1235
- },
1236
- {
1237
- "epoch": 1.0,
1238
- "eval_loss": 1.44913911819458,
1239
- "eval_runtime": 1.1162,
1240
- "eval_samples_per_second": 97.657,
1241
- "eval_steps_per_second": 12.543,
1242
- "step": 337
1243
  }
1244
  ],
1245
  "logging_steps": 2,
1246
- "max_steps": 337,
1247
  "num_input_tokens_seen": 0,
1248
- "num_train_epochs": 1,
1249
- "save_steps": 135,
1250
  "stateful_callbacks": {
1251
  "TrainerControl": {
1252
  "args": {
@@ -1254,12 +745,12 @@
1254
  "should_evaluate": false,
1255
  "should_log": false,
1256
  "should_save": true,
1257
- "should_training_stop": true
1258
  },
1259
  "attributes": {}
1260
  }
1261
  },
1262
- "total_flos": 5917994317250560.0,
1263
  "train_batch_size": 4,
1264
  "trial_name": null,
1265
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5024875621890548,
6
+ "eval_steps": 76,
7
+ "global_step": 202,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.004975124378109453,
14
+ "grad_norm": 1.3828125,
15
+ "learning_rate": 4.5454545454545455e-06,
16
+ "loss": 1.6639080047607422,
17
  "step": 2
18
  },
19
  {
20
+ "epoch": 0.009950248756218905,
21
+ "grad_norm": 2.140625,
22
+ "learning_rate": 1.3636363636363637e-05,
23
+ "loss": 1.7496397495269775,
24
  "step": 4
25
  },
26
  {
27
+ "epoch": 0.014925373134328358,
28
+ "grad_norm": 1.3828125,
29
+ "learning_rate": 2.272727272727273e-05,
30
+ "loss": 1.6321816444396973,
31
  "step": 6
32
  },
33
  {
34
+ "epoch": 0.01990049751243781,
35
+ "grad_norm": 1.6640625,
36
+ "learning_rate": 3.181818181818182e-05,
37
+ "loss": 1.6631131172180176,
38
  "step": 8
39
  },
40
  {
41
+ "epoch": 0.024875621890547265,
42
+ "grad_norm": 1.8359375,
43
+ "learning_rate": 4.0909090909090915e-05,
44
+ "loss": 1.5767230987548828,
45
  "step": 10
46
  },
47
  {
48
+ "epoch": 0.029850746268656716,
49
+ "grad_norm": 1.734375,
50
  "learning_rate": 5e-05,
51
+ "loss": 1.6631473302841187,
52
  "step": 12
53
  },
54
  {
55
+ "epoch": 0.03482587064676617,
56
+ "grad_norm": 1.3828125,
57
  "learning_rate": 5e-05,
58
+ "loss": 1.632326602935791,
59
  "step": 14
60
  },
61
  {
62
+ "epoch": 0.03980099502487562,
63
+ "grad_norm": 0.9296875,
64
  "learning_rate": 5e-05,
65
+ "loss": 1.6015336513519287,
66
  "step": 16
67
  },
68
  {
69
+ "epoch": 0.04477611940298507,
70
+ "grad_norm": 1.1484375,
71
  "learning_rate": 5e-05,
72
+ "loss": 1.542963981628418,
73
  "step": 18
74
  },
75
  {
76
+ "epoch": 0.04975124378109453,
77
+ "grad_norm": 1.125,
78
  "learning_rate": 5e-05,
79
+ "loss": 1.7240108251571655,
80
  "step": 20
81
  },
82
  {
83
+ "epoch": 0.05472636815920398,
84
+ "grad_norm": 0.89453125,
85
  "learning_rate": 5e-05,
86
+ "loss": 1.628758430480957,
87
  "step": 22
88
  },
89
  {
90
+ "epoch": 0.05970149253731343,
91
+ "grad_norm": 1.0859375,
92
  "learning_rate": 5e-05,
93
+ "loss": 1.5917885303497314,
94
  "step": 24
95
  },
96
  {
97
+ "epoch": 0.06467661691542288,
98
+ "grad_norm": 0.8984375,
99
  "learning_rate": 5e-05,
100
+ "loss": 1.5668330192565918,
101
  "step": 26
102
  },
103
  {
104
+ "epoch": 0.06965174129353234,
105
+ "grad_norm": 0.8125,
106
  "learning_rate": 5e-05,
107
+ "loss": 1.6025413274765015,
108
  "step": 28
109
  },
110
  {
111
+ "epoch": 0.07462686567164178,
112
+ "grad_norm": 0.7578125,
113
  "learning_rate": 5e-05,
114
+ "loss": 1.59916090965271,
115
  "step": 30
116
  },
117
  {
118
+ "epoch": 0.07960199004975124,
119
+ "grad_norm": 0.7890625,
120
  "learning_rate": 5e-05,
121
+ "loss": 1.6070656776428223,
122
  "step": 32
123
  },
124
  {
125
+ "epoch": 0.0845771144278607,
126
+ "grad_norm": 0.8046875,
127
  "learning_rate": 5e-05,
128
+ "loss": 1.5991092920303345,
129
  "step": 34
130
  },
131
  {
132
+ "epoch": 0.08955223880597014,
133
+ "grad_norm": 0.80859375,
134
  "learning_rate": 5e-05,
135
+ "loss": 1.517960548400879,
136
  "step": 36
137
  },
138
  {
139
+ "epoch": 0.0945273631840796,
140
+ "grad_norm": 0.74609375,
141
  "learning_rate": 5e-05,
142
+ "loss": 1.6418545246124268,
143
  "step": 38
144
  },
145
  {
146
+ "epoch": 0.09950248756218906,
147
+ "grad_norm": 0.765625,
148
  "learning_rate": 5e-05,
149
+ "loss": 1.5496408939361572,
150
  "step": 40
151
  },
152
  {
153
+ "epoch": 0.1044776119402985,
154
+ "grad_norm": 0.8046875,
155
  "learning_rate": 5e-05,
156
+ "loss": 1.591111183166504,
157
  "step": 42
158
  },
159
  {
160
+ "epoch": 0.10945273631840796,
161
+ "grad_norm": 0.79296875,
162
  "learning_rate": 5e-05,
163
+ "loss": 1.5569896697998047,
164
  "step": 44
165
  },
166
  {
167
+ "epoch": 0.11442786069651742,
168
+ "grad_norm": 0.71484375,
169
  "learning_rate": 5e-05,
170
+ "loss": 1.573204517364502,
171
  "step": 46
172
  },
173
  {
174
+ "epoch": 0.11940298507462686,
175
+ "grad_norm": 0.7421875,
176
  "learning_rate": 5e-05,
177
+ "loss": 1.5156298875808716,
178
  "step": 48
179
  },
180
  {
181
+ "epoch": 0.12437810945273632,
182
+ "grad_norm": 0.83984375,
183
  "learning_rate": 5e-05,
184
+ "loss": 1.4954731464385986,
185
  "step": 50
186
  },
187
  {
188
+ "epoch": 0.12935323383084577,
189
+ "grad_norm": 0.7734375,
 
 
 
 
 
 
 
 
190
  "learning_rate": 5e-05,
191
+ "loss": 1.6173453330993652,
192
  "step": 52
193
  },
194
  {
195
+ "epoch": 0.13432835820895522,
196
+ "grad_norm": 0.8671875,
197
  "learning_rate": 5e-05,
198
+ "loss": 1.579205870628357,
199
  "step": 54
200
  },
201
  {
202
+ "epoch": 0.13930348258706468,
203
+ "grad_norm": 0.84765625,
204
  "learning_rate": 5e-05,
205
+ "loss": 1.5793405771255493,
206
  "step": 56
207
  },
208
  {
209
+ "epoch": 0.14427860696517414,
210
+ "grad_norm": 0.84375,
211
  "learning_rate": 5e-05,
212
+ "loss": 1.5619373321533203,
213
  "step": 58
214
  },
215
  {
216
+ "epoch": 0.14925373134328357,
217
+ "grad_norm": 0.80859375,
218
  "learning_rate": 5e-05,
219
+ "loss": 1.5738036632537842,
220
  "step": 60
221
  },
222
  {
223
+ "epoch": 0.15422885572139303,
224
+ "grad_norm": 0.97265625,
225
  "learning_rate": 5e-05,
226
+ "loss": 1.528868317604065,
227
  "step": 62
228
  },
229
  {
230
+ "epoch": 0.15920398009950248,
231
+ "grad_norm": 0.77734375,
232
  "learning_rate": 5e-05,
233
+ "loss": 1.5742967128753662,
234
  "step": 64
235
  },
236
  {
237
+ "epoch": 0.16417910447761194,
238
+ "grad_norm": 0.765625,
239
  "learning_rate": 5e-05,
240
+ "loss": 1.5363436937332153,
241
  "step": 66
242
  },
243
  {
244
+ "epoch": 0.1691542288557214,
245
+ "grad_norm": 0.71484375,
246
  "learning_rate": 5e-05,
247
+ "loss": 1.5038269758224487,
248
  "step": 68
249
  },
250
  {
251
+ "epoch": 0.17412935323383086,
252
+ "grad_norm": 0.72265625,
253
  "learning_rate": 5e-05,
254
+ "loss": 1.5686390399932861,
255
  "step": 70
256
  },
257
  {
258
+ "epoch": 0.1791044776119403,
259
+ "grad_norm": 0.71875,
260
  "learning_rate": 5e-05,
261
+ "loss": 1.5683722496032715,
262
  "step": 72
263
  },
264
  {
265
+ "epoch": 0.18407960199004975,
266
+ "grad_norm": 0.8984375,
267
  "learning_rate": 5e-05,
268
+ "loss": 1.6040563583374023,
269
  "step": 74
270
  },
271
  {
272
+ "epoch": 0.1890547263681592,
273
+ "grad_norm": 0.7734375,
274
  "learning_rate": 5e-05,
275
+ "loss": 1.5213723182678223,
276
  "step": 76
277
  },
278
  {
279
+ "epoch": 0.1890547263681592,
280
+ "eval_loss": 1.5235487222671509,
281
+ "eval_runtime": 2.1832,
282
+ "eval_samples_per_second": 59.545,
283
+ "eval_steps_per_second": 7.787,
284
+ "step": 76
285
+ },
286
+ {
287
+ "epoch": 0.19402985074626866,
288
+ "grad_norm": 0.81640625,
289
  "learning_rate": 5e-05,
290
+ "loss": 1.548842191696167,
291
  "step": 78
292
  },
293
  {
294
+ "epoch": 0.19900497512437812,
295
+ "grad_norm": 0.71875,
296
  "learning_rate": 5e-05,
297
+ "loss": 1.4947301149368286,
298
  "step": 80
299
  },
300
  {
301
+ "epoch": 0.20398009950248755,
302
+ "grad_norm": 0.6953125,
303
  "learning_rate": 5e-05,
304
+ "loss": 1.5300225019454956,
305
  "step": 82
306
  },
307
  {
308
+ "epoch": 0.208955223880597,
309
+ "grad_norm": 0.6796875,
310
  "learning_rate": 5e-05,
311
+ "loss": 1.5121870040893555,
312
  "step": 84
313
  },
314
  {
315
+ "epoch": 0.21393034825870647,
316
+ "grad_norm": 0.79296875,
317
  "learning_rate": 5e-05,
318
+ "loss": 1.562124252319336,
319
  "step": 86
320
  },
321
  {
322
+ "epoch": 0.21890547263681592,
323
+ "grad_norm": 0.80078125,
324
  "learning_rate": 5e-05,
325
+ "loss": 1.5368881225585938,
326
  "step": 88
327
  },
328
  {
329
+ "epoch": 0.22388059701492538,
330
+ "grad_norm": 0.671875,
331
  "learning_rate": 5e-05,
332
+ "loss": 1.5035767555236816,
333
  "step": 90
334
  },
335
  {
336
+ "epoch": 0.22885572139303484,
337
+ "grad_norm": 0.6953125,
338
  "learning_rate": 5e-05,
339
+ "loss": 1.5528807640075684,
340
  "step": 92
341
  },
342
  {
343
+ "epoch": 0.23383084577114427,
344
  "grad_norm": 0.6875,
345
  "learning_rate": 5e-05,
346
+ "loss": 1.5195538997650146,
347
  "step": 94
348
  },
349
  {
350
+ "epoch": 0.23880597014925373,
351
+ "grad_norm": 0.6484375,
352
  "learning_rate": 5e-05,
353
+ "loss": 1.4883313179016113,
354
  "step": 96
355
  },
356
  {
357
+ "epoch": 0.24378109452736318,
358
+ "grad_norm": 0.7890625,
359
  "learning_rate": 5e-05,
360
+ "loss": 1.50142502784729,
361
  "step": 98
362
  },
363
  {
364
+ "epoch": 0.24875621890547264,
365
+ "grad_norm": 0.65234375,
366
  "learning_rate": 5e-05,
367
+ "loss": 1.5273784399032593,
368
  "step": 100
369
  },
370
  {
371
+ "epoch": 0.2537313432835821,
372
+ "grad_norm": 0.69140625,
373
  "learning_rate": 5e-05,
374
+ "loss": 1.5636398792266846,
 
 
 
 
 
 
 
 
375
  "step": 102
376
  },
377
  {
378
+ "epoch": 0.25870646766169153,
379
+ "grad_norm": 0.6171875,
380
  "learning_rate": 5e-05,
381
+ "loss": 1.5040175914764404,
382
  "step": 104
383
  },
384
  {
385
+ "epoch": 0.263681592039801,
386
+ "grad_norm": 0.7890625,
387
  "learning_rate": 5e-05,
388
+ "loss": 1.580956220626831,
389
  "step": 106
390
  },
391
  {
392
+ "epoch": 0.26865671641791045,
393
+ "grad_norm": 0.765625,
394
  "learning_rate": 5e-05,
395
+ "loss": 1.5606034994125366,
396
  "step": 108
397
  },
398
  {
399
+ "epoch": 0.2736318407960199,
400
+ "grad_norm": 0.7890625,
401
  "learning_rate": 5e-05,
402
+ "loss": 1.5479745864868164,
403
  "step": 110
404
  },
405
  {
406
+ "epoch": 0.27860696517412936,
407
+ "grad_norm": 0.796875,
408
  "learning_rate": 5e-05,
409
+ "loss": 1.4870773553848267,
410
  "step": 112
411
  },
412
  {
413
+ "epoch": 0.2835820895522388,
414
+ "grad_norm": 1.015625,
415
  "learning_rate": 5e-05,
416
+ "loss": 1.5059258937835693,
417
  "step": 114
418
  },
419
  {
420
+ "epoch": 0.2885572139303483,
421
+ "grad_norm": 0.68359375,
422
  "learning_rate": 5e-05,
423
+ "loss": 1.566910982131958,
424
  "step": 116
425
  },
426
  {
427
+ "epoch": 0.2935323383084577,
428
+ "grad_norm": 0.77734375,
429
  "learning_rate": 5e-05,
430
+ "loss": 1.5694658756256104,
431
  "step": 118
432
  },
433
  {
434
+ "epoch": 0.29850746268656714,
435
+ "grad_norm": 0.75390625,
436
  "learning_rate": 5e-05,
437
+ "loss": 1.6117546558380127,
438
  "step": 120
439
  },
440
  {
441
+ "epoch": 0.3034825870646766,
442
+ "grad_norm": 0.6953125,
443
  "learning_rate": 5e-05,
444
+ "loss": 1.5218111276626587,
445
  "step": 122
446
  },
447
  {
448
+ "epoch": 0.30845771144278605,
449
+ "grad_norm": 0.71484375,
450
  "learning_rate": 5e-05,
451
+ "loss": 1.4893097877502441,
452
  "step": 124
453
  },
454
  {
455
+ "epoch": 0.31343283582089554,
456
+ "grad_norm": 1.0078125,
457
  "learning_rate": 5e-05,
458
+ "loss": 1.5823085308074951,
459
  "step": 126
460
  },
461
  {
462
+ "epoch": 0.31840796019900497,
463
+ "grad_norm": 0.76171875,
464
  "learning_rate": 5e-05,
465
+ "loss": 1.5641398429870605,
466
  "step": 128
467
  },
468
  {
469
+ "epoch": 0.32338308457711445,
470
+ "grad_norm": 0.83984375,
471
  "learning_rate": 5e-05,
472
+ "loss": 1.573578119277954,
473
  "step": 130
474
  },
475
  {
476
+ "epoch": 0.3283582089552239,
477
+ "grad_norm": 0.67578125,
478
  "learning_rate": 5e-05,
479
+ "loss": 1.5401456356048584,
480
  "step": 132
481
  },
482
  {
483
+ "epoch": 0.3333333333333333,
484
+ "grad_norm": 0.80859375,
485
  "learning_rate": 5e-05,
486
+ "loss": 1.5921449661254883,
487
  "step": 134
488
  },
489
  {
490
+ "epoch": 0.3383084577114428,
491
+ "grad_norm": 0.796875,
492
  "learning_rate": 5e-05,
493
+ "loss": 1.4478504657745361,
494
  "step": 136
495
  },
496
  {
497
+ "epoch": 0.34328358208955223,
498
+ "grad_norm": 0.6953125,
499
  "learning_rate": 5e-05,
500
+ "loss": 1.5600370168685913,
501
  "step": 138
502
  },
503
  {
504
+ "epoch": 0.3482587064676617,
505
+ "grad_norm": 0.6796875,
506
  "learning_rate": 5e-05,
507
+ "loss": 1.5285460948944092,
508
  "step": 140
509
  },
510
  {
511
+ "epoch": 0.35323383084577115,
512
+ "grad_norm": 0.83203125,
513
  "learning_rate": 5e-05,
514
+ "loss": 1.542060375213623,
515
  "step": 142
516
  },
517
  {
518
+ "epoch": 0.3582089552238806,
519
+ "grad_norm": 0.73046875,
520
  "learning_rate": 5e-05,
521
+ "loss": 1.5423415899276733,
522
  "step": 144
523
  },
524
  {
525
+ "epoch": 0.36318407960199006,
526
+ "grad_norm": 0.7890625,
527
  "learning_rate": 5e-05,
528
+ "loss": 1.5532753467559814,
529
  "step": 146
530
  },
531
  {
532
+ "epoch": 0.3681592039800995,
533
+ "grad_norm": 0.703125,
534
  "learning_rate": 5e-05,
535
+ "loss": 1.498077392578125,
536
  "step": 148
537
  },
538
  {
539
+ "epoch": 0.373134328358209,
540
+ "grad_norm": 0.86328125,
541
  "learning_rate": 5e-05,
542
+ "loss": 1.4719334840774536,
543
  "step": 150
544
  },
545
  {
546
+ "epoch": 0.3781094527363184,
547
+ "grad_norm": 0.75390625,
548
  "learning_rate": 5e-05,
549
+ "loss": 1.6089732646942139,
550
  "step": 152
551
  },
552
  {
553
+ "epoch": 0.3781094527363184,
554
+ "eval_loss": 1.4965729713439941,
555
+ "eval_runtime": 1.4549,
556
+ "eval_samples_per_second": 89.354,
557
+ "eval_steps_per_second": 11.685,
558
+ "step": 152
559
  },
560
  {
561
+ "epoch": 0.38308457711442784,
562
+ "grad_norm": 0.765625,
563
  "learning_rate": 5e-05,
564
+ "loss": 1.4956027269363403,
565
  "step": 154
566
  },
567
  {
568
+ "epoch": 0.3880597014925373,
569
+ "grad_norm": 0.85546875,
570
  "learning_rate": 5e-05,
571
+ "loss": 1.4428843259811401,
572
  "step": 156
573
  },
574
  {
575
+ "epoch": 0.39303482587064675,
576
+ "grad_norm": 0.71484375,
577
  "learning_rate": 5e-05,
578
+ "loss": 1.5057318210601807,
579
  "step": 158
580
  },
581
  {
582
+ "epoch": 0.39800995024875624,
583
+ "grad_norm": 0.66015625,
584
  "learning_rate": 5e-05,
585
+ "loss": 1.5654449462890625,
586
  "step": 160
587
  },
588
  {
589
+ "epoch": 0.40298507462686567,
590
+ "grad_norm": 0.73046875,
591
  "learning_rate": 5e-05,
592
+ "loss": 1.5439975261688232,
593
  "step": 162
594
  },
595
  {
596
+ "epoch": 0.4079601990049751,
597
+ "grad_norm": 0.7421875,
598
  "learning_rate": 5e-05,
599
+ "loss": 1.5199835300445557,
600
  "step": 164
601
  },
602
  {
603
+ "epoch": 0.4129353233830846,
604
+ "grad_norm": 0.73828125,
605
  "learning_rate": 5e-05,
606
+ "loss": 1.4676998853683472,
607
  "step": 166
608
  },
609
  {
610
+ "epoch": 0.417910447761194,
611
+ "grad_norm": 0.671875,
612
  "learning_rate": 5e-05,
613
+ "loss": 1.5374722480773926,
614
  "step": 168
615
  },
616
  {
617
+ "epoch": 0.4228855721393035,
618
+ "grad_norm": 0.75390625,
619
  "learning_rate": 5e-05,
620
+ "loss": 1.563814401626587,
621
  "step": 170
622
  },
623
  {
624
+ "epoch": 0.42786069651741293,
625
+ "grad_norm": 0.8046875,
626
  "learning_rate": 5e-05,
627
+ "loss": 1.568427562713623,
628
  "step": 172
629
  },
630
  {
631
+ "epoch": 0.43283582089552236,
632
+ "grad_norm": 0.78125,
633
  "learning_rate": 5e-05,
634
+ "loss": 1.5757570266723633,
635
  "step": 174
636
  },
637
  {
638
+ "epoch": 0.43781094527363185,
639
+ "grad_norm": 0.6875,
640
  "learning_rate": 5e-05,
641
+ "loss": 1.5818047523498535,
642
  "step": 176
643
  },
644
  {
645
+ "epoch": 0.4427860696517413,
646
+ "grad_norm": 0.734375,
647
  "learning_rate": 5e-05,
648
+ "loss": 1.5185985565185547,
649
  "step": 178
650
  },
651
  {
652
+ "epoch": 0.44776119402985076,
653
+ "grad_norm": 0.67578125,
654
  "learning_rate": 5e-05,
655
+ "loss": 1.5347332954406738,
656
  "step": 180
657
  },
658
  {
659
+ "epoch": 0.4527363184079602,
660
+ "grad_norm": 1.0546875,
661
  "learning_rate": 5e-05,
662
+ "loss": 1.466269850730896,
663
  "step": 182
664
  },
665
  {
666
+ "epoch": 0.4577114427860697,
667
+ "grad_norm": 0.828125,
668
  "learning_rate": 5e-05,
669
+ "loss": 1.535921335220337,
670
  "step": 184
671
  },
672
  {
673
+ "epoch": 0.4626865671641791,
674
+ "grad_norm": 0.71484375,
675
  "learning_rate": 5e-05,
676
+ "loss": 1.559277057647705,
677
  "step": 186
678
  },
679
  {
680
+ "epoch": 0.46766169154228854,
681
+ "grad_norm": 0.78125,
682
  "learning_rate": 5e-05,
683
+ "loss": 1.5251140594482422,
684
  "step": 188
685
  },
686
  {
687
+ "epoch": 0.472636815920398,
688
+ "grad_norm": 0.640625,
689
  "learning_rate": 5e-05,
690
+ "loss": 1.5697033405303955,
691
  "step": 190
692
  },
693
  {
694
+ "epoch": 0.47761194029850745,
695
+ "grad_norm": 0.71875,
696
  "learning_rate": 5e-05,
697
+ "loss": 1.4658384323120117,
698
  "step": 192
699
  },
700
  {
701
+ "epoch": 0.48258706467661694,
702
+ "grad_norm": 0.828125,
703
  "learning_rate": 5e-05,
704
+ "loss": 1.5391371250152588,
705
  "step": 194
706
  },
707
  {
708
+ "epoch": 0.48756218905472637,
709
+ "grad_norm": 0.62890625,
710
  "learning_rate": 5e-05,
711
+ "loss": 1.517061710357666,
712
  "step": 196
713
  },
714
  {
715
+ "epoch": 0.4925373134328358,
716
+ "grad_norm": 0.85546875,
717
  "learning_rate": 5e-05,
718
+ "loss": 1.549302339553833,
719
  "step": 198
720
  },
721
  {
722
+ "epoch": 0.4975124378109453,
723
+ "grad_norm": 0.70703125,
724
  "learning_rate": 5e-05,
725
+ "loss": 1.52018404006958,
726
  "step": 200
727
  },
728
  {
729
+ "epoch": 0.5024875621890548,
730
+ "grad_norm": 0.65625,
731
  "learning_rate": 5e-05,
732
+ "loss": 1.5727933645248413,
733
  "step": 202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  }
735
  ],
736
  "logging_steps": 2,
737
+ "max_steps": 503,
738
  "num_input_tokens_seen": 0,
739
+ "num_train_epochs": 2,
740
+ "save_steps": 202,
741
  "stateful_callbacks": {
742
  "TrainerControl": {
743
  "args": {
 
745
  "should_evaluate": false,
746
  "should_log": false,
747
  "should_save": true,
748
+ "should_training_stop": false
749
  },
750
  "attributes": {}
751
  }
752
  },
753
+ "total_flos": 4803482146045952.0,
754
  "train_batch_size": 4,
755
  "trial_name": null,
756
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15867342fd377cf74773a6b47c5bb552643dbadb054a78f267afaea78cf9a2a2
3
- size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfdf90fde84a79be9f401a8d95d02b50c23a941f33d8ecb04d6f5a91b2b29739
3
+ size 5841