SystemAdmin123 commited on
Commit
22e2d89
·
verified ·
1 Parent(s): 78f9b05

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65a553bd7056176e2288025437ccdeff8d47868eefca4ade3457c24cca308444
3
  size 723674912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb42890fd3e3733df15325188e71ea98cd125dad14aa982eb9d9229b15a8bdc
3
  size 723674912
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bfbfd154cb90e557e9d1d28cf0a383f1d2890c6a73e5538416e046c8ee482fe
3
  size 735625626
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aaaf6afb1371a4f19d7057b3f5ca8fca65fcc7a584da182d9f36fb7032085bb
3
  size 735625626
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19ab3d6cfcb43de67f16e412d0cb4f86309db602f8242d16f2b203a0212d6cbb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9feae33b2fec0a6229240e7adaee6ecc8f5cfdf1a8bd0e827b1d8a241424e3c0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c88b3aeb8ec2bf995149291b90b69667d3f268ff2f13afbeab1a220b8cc27590
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a673aaf85c0fe6b6c29cb8f3e7dbd829eef637110e4ad9a775f3fcf001c92591
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5918910920390648,
5
  "eval_steps": 200,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,1490 +11,306 @@
11
  {
12
  "epoch": 0.0002959455460195324,
13
  "eval_loss": 3.0371296405792236,
14
- "eval_runtime": 40.2878,
15
- "eval_samples_per_second": 37.282,
16
- "eval_steps_per_second": 9.333,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.002959455460195324,
21
  "grad_norm": 2.84375,
22
  "learning_rate": 1.6000000000000003e-05,
23
- "loss": 2.5732,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.005918910920390648,
28
- "grad_norm": 4.40625,
29
  "learning_rate": 3.2000000000000005e-05,
30
  "loss": 2.6895,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.008878366380585973,
35
- "grad_norm": 5.375,
36
  "learning_rate": 4.8e-05,
37
- "loss": 2.5336,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.011837821840781295,
42
- "grad_norm": 7.90625,
43
  "learning_rate": 6.400000000000001e-05,
44
- "loss": 2.9661,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.01479727730097662,
49
  "grad_norm": 25.375,
50
  "learning_rate": 8e-05,
51
- "loss": 3.4081,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.017756732761171946,
56
- "grad_norm": 3.171875,
57
  "learning_rate": 9.6e-05,
58
- "loss": 2.3133,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.020716188221367268,
63
- "grad_norm": 2.953125,
64
  "learning_rate": 0.00011200000000000001,
65
- "loss": 2.3044,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.02367564368156259,
70
  "grad_norm": 4.34375,
71
  "learning_rate": 0.00012800000000000002,
72
- "loss": 2.411,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.026635099141757917,
77
  "grad_norm": 5.5,
78
  "learning_rate": 0.000144,
79
- "loss": 2.326,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.02959455460195324,
84
- "grad_norm": 22.375,
85
  "learning_rate": 0.00016,
86
- "loss": 3.3354,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.032554010062148565,
91
- "grad_norm": 3.875,
92
  "learning_rate": 0.00017600000000000002,
93
- "loss": 2.4824,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.03551346552234389,
98
- "grad_norm": 2.90625,
99
  "learning_rate": 0.000192,
100
- "loss": 2.2624,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.03847292098253921,
105
- "grad_norm": 2.890625,
106
  "learning_rate": 0.0001999978128380225,
107
- "loss": 2.3521,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.041432376442734536,
112
- "grad_norm": 7.09375,
113
  "learning_rate": 0.0001999803161162393,
114
  "loss": 2.3626,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.04439183190292986,
119
- "grad_norm": 31.625,
120
  "learning_rate": 0.00019994532573409262,
121
- "loss": 2.2468,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.04735128736312518,
126
  "grad_norm": 5.6875,
127
  "learning_rate": 0.00019989284781388617,
128
- "loss": 2.3798,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.05031074282332051,
133
- "grad_norm": 2.578125,
134
  "learning_rate": 0.00019982289153773646,
135
- "loss": 2.3013,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.053270198283515834,
140
- "grad_norm": 4.375,
141
  "learning_rate": 0.00019973546914596623,
142
- "loss": 2.4314,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.05622965374371116,
147
  "grad_norm": 6.90625,
148
  "learning_rate": 0.00019963059593496268,
149
- "loss": 2.225,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.05918910920390648,
154
- "grad_norm": 25.75,
155
  "learning_rate": 0.00019950829025450114,
156
- "loss": 2.5078,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.05918910920390648,
161
- "eval_loss": 2.297968864440918,
162
- "eval_runtime": 38.145,
163
- "eval_samples_per_second": 39.376,
164
- "eval_steps_per_second": 9.857,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.062148564664101805,
169
- "grad_norm": 2.5625,
170
  "learning_rate": 0.0001993685735045343,
171
- "loss": 2.4115,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.06510802012429713,
176
  "grad_norm": 2.890625,
177
  "learning_rate": 0.0001992114701314478,
178
- "loss": 2.3576,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.06806747558449246,
183
- "grad_norm": 3.03125,
184
  "learning_rate": 0.000199037007623783,
185
- "loss": 2.2509,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.07102693104468778,
190
  "grad_norm": 6.65625,
191
  "learning_rate": 0.00019884521650742715,
192
- "loss": 2.2513,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.0739863865048831,
197
- "grad_norm": 39.25,
198
  "learning_rate": 0.00019863613034027224,
199
- "loss": 2.7071,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.07694584196507842,
204
- "grad_norm": 5.125,
205
  "learning_rate": 0.0001984097857063434,
206
- "loss": 2.1904,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.07990529742527375,
211
- "grad_norm": 2.734375,
212
  "learning_rate": 0.0001981662222093976,
213
- "loss": 2.4093,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.08286475288546907,
218
- "grad_norm": 3.140625,
219
  "learning_rate": 0.00019790548246599447,
220
- "loss": 2.2674,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.0858242083456644,
225
- "grad_norm": 5.34375,
226
  "learning_rate": 0.00019762761209803927,
227
- "loss": 2.052,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.08878366380585972,
232
- "grad_norm": 15.5,
233
  "learning_rate": 0.0001973326597248006,
234
- "loss": 2.2147,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.09174311926605505,
239
  "grad_norm": 3.234375,
240
  "learning_rate": 0.00019702067695440332,
241
- "loss": 2.4276,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.09470257472625036,
246
- "grad_norm": 2.875,
247
  "learning_rate": 0.00019669171837479873,
248
- "loss": 2.0212,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.09766203018644569,
253
  "grad_norm": 3.0,
254
  "learning_rate": 0.00019634584154421317,
255
- "loss": 2.2913,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.10062148564664102,
260
- "grad_norm": 5.53125,
261
  "learning_rate": 0.00019598310698107702,
262
- "loss": 2.1126,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.10358094110683634,
267
- "grad_norm": 23.375,
268
  "learning_rate": 0.00019560357815343577,
269
- "loss": 1.9027,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.10654039656703167,
274
  "grad_norm": 2.515625,
275
  "learning_rate": 0.00019520732146784491,
276
- "loss": 2.2814,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.109499852027227,
281
  "grad_norm": 3.09375,
282
  "learning_rate": 0.0001947944062577507,
283
- "loss": 2.0483,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.11245930748742232,
288
  "grad_norm": 4.125,
289
  "learning_rate": 0.00019436490477135878,
290
- "loss": 2.2125,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.11541876294761765,
295
- "grad_norm": 5.53125,
296
  "learning_rate": 0.00019391889215899299,
297
- "loss": 1.9433,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.11837821840781296,
302
- "grad_norm": 25.375,
303
  "learning_rate": 0.0001934564464599461,
304
- "loss": 1.858,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.11837821840781296,
309
- "eval_loss": 2.2061269283294678,
310
- "eval_runtime": 37.9861,
311
- "eval_samples_per_second": 39.541,
312
- "eval_steps_per_second": 9.898,
313
  "step": 400
314
- },
315
- {
316
- "epoch": 0.12133767386800828,
317
- "grad_norm": 1.9296875,
318
- "learning_rate": 0.00019297764858882514,
319
- "loss": 2.4536,
320
- "step": 410
321
- },
322
- {
323
- "epoch": 0.12429712932820361,
324
- "grad_norm": 3.796875,
325
- "learning_rate": 0.00019248258232139388,
326
- "loss": 2.2639,
327
- "step": 420
328
- },
329
- {
330
- "epoch": 0.12725658478839894,
331
- "grad_norm": 4.21875,
332
- "learning_rate": 0.00019197133427991436,
333
- "loss": 2.1831,
334
- "step": 430
335
- },
336
- {
337
- "epoch": 0.13021604024859426,
338
- "grad_norm": 3.78125,
339
- "learning_rate": 0.00019144399391799043,
340
- "loss": 2.0126,
341
- "step": 440
342
- },
343
- {
344
- "epoch": 0.1331754957087896,
345
- "grad_norm": 26.875,
346
- "learning_rate": 0.00019090065350491626,
347
- "loss": 2.449,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.1361349511689849,
352
- "grad_norm": 2.0625,
353
- "learning_rate": 0.0001903414081095315,
354
- "loss": 2.3837,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.13909440662918024,
359
- "grad_norm": 2.265625,
360
- "learning_rate": 0.00018976635558358722,
361
- "loss": 2.121,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.14205386208937557,
366
- "grad_norm": 3.953125,
367
- "learning_rate": 0.00018917559654462474,
368
- "loss": 2.1884,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.1450133175495709,
373
- "grad_norm": 5.21875,
374
- "learning_rate": 0.00018856923435837022,
375
- "loss": 2.1975,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.1479727730097662,
380
- "grad_norm": 19.75,
381
- "learning_rate": 0.0001879473751206489,
382
- "loss": 1.6031,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.15093222846996152,
387
- "grad_norm": 2.109375,
388
- "learning_rate": 0.00018731012763882133,
389
- "loss": 2.4093,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.15389168393015684,
394
- "grad_norm": 2.8125,
395
- "learning_rate": 0.00018665760341274505,
396
- "loss": 2.199,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.15685113939035217,
401
- "grad_norm": 3.765625,
402
- "learning_rate": 0.00018598991661526572,
403
- "loss": 2.2808,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.1598105948505475,
408
- "grad_norm": 3.796875,
409
- "learning_rate": 0.00018530718407223974,
410
- "loss": 2.221,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.16277005031074282,
415
- "grad_norm": 18.875,
416
- "learning_rate": 0.00018460952524209355,
417
- "loss": 1.9977,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.16572950577093815,
422
- "grad_norm": 2.84375,
423
- "learning_rate": 0.00018389706219492147,
424
- "loss": 2.2811,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.16868896123113347,
429
- "grad_norm": 2.3125,
430
- "learning_rate": 0.00018316991959112716,
431
- "loss": 2.3694,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.1716484166913288,
436
- "grad_norm": 2.796875,
437
- "learning_rate": 0.00018242822465961176,
438
- "loss": 1.8728,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.17460787215152412,
443
- "grad_norm": 8.0625,
444
- "learning_rate": 0.00018167210717551224,
445
- "loss": 2.0789,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.17756732761171945,
450
- "grad_norm": 17.375,
451
- "learning_rate": 0.00018090169943749476,
452
- "loss": 1.9051,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.17756732761171945,
457
- "eval_loss": 2.1772165298461914,
458
- "eval_runtime": 38.2926,
459
- "eval_samples_per_second": 39.224,
460
- "eval_steps_per_second": 9.819,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.18052678307191478,
465
- "grad_norm": 2.75,
466
- "learning_rate": 0.00018011713624460608,
467
- "loss": 2.0598,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.1834862385321101,
472
- "grad_norm": 2.203125,
473
- "learning_rate": 0.00017931855487268782,
474
- "loss": 2.0458,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.18644569399230543,
479
- "grad_norm": 3.046875,
480
- "learning_rate": 0.0001785060950503568,
481
- "loss": 2.4327,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.18940514945250073,
486
- "grad_norm": 10.5,
487
- "learning_rate": 0.00017767989893455698,
488
- "loss": 2.2315,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.19236460491269605,
493
- "grad_norm": 19.125,
494
- "learning_rate": 0.00017684011108568592,
495
- "loss": 2.2792,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.19532406037289138,
500
- "grad_norm": 2.140625,
501
- "learning_rate": 0.00017598687844230088,
502
- "loss": 2.4382,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.1982835158330867,
507
- "grad_norm": 2.78125,
508
- "learning_rate": 0.00017512035029540885,
509
- "loss": 2.1791,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.20124297129328203,
514
- "grad_norm": 5.1875,
515
- "learning_rate": 0.000174240678262345,
516
- "loss": 2.2385,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.20420242675347736,
521
- "grad_norm": 6.25,
522
- "learning_rate": 0.000173348016260244,
523
- "loss": 1.9492,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.20716188221367268,
528
- "grad_norm": 27.125,
529
- "learning_rate": 0.00017244252047910892,
530
- "loss": 1.8527,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.210121337673868,
535
- "grad_norm": 2.15625,
536
- "learning_rate": 0.00017152434935448256,
537
- "loss": 2.2149,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.21308079313406333,
542
- "grad_norm": 2.46875,
543
- "learning_rate": 0.0001705936635397259,
544
- "loss": 2.3124,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.21604024859425866,
549
- "grad_norm": 3.5625,
550
- "learning_rate": 0.00016965062587790823,
551
- "loss": 2.109,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.218999704054454,
556
- "grad_norm": 2.703125,
557
- "learning_rate": 0.00016869540137331445,
558
- "loss": 1.9086,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.2219591595146493,
563
- "grad_norm": 10.0625,
564
- "learning_rate": 0.00016772815716257412,
565
- "loss": 1.9308,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.22491861497484464,
570
- "grad_norm": 1.78125,
571
- "learning_rate": 0.00016674906248541726,
572
- "loss": 2.3023,
573
- "step": 760
574
- },
575
- {
576
- "epoch": 0.22787807043503996,
577
- "grad_norm": 3.78125,
578
- "learning_rate": 0.00016575828865506245,
579
- "loss": 2.1113,
580
- "step": 770
581
- },
582
- {
583
- "epoch": 0.2308375258952353,
584
- "grad_norm": 4.125,
585
- "learning_rate": 0.0001647560090282419,
586
- "loss": 2.0416,
587
- "step": 780
588
- },
589
- {
590
- "epoch": 0.2337969813554306,
591
- "grad_norm": 6.5625,
592
- "learning_rate": 0.000163742398974869,
593
- "loss": 1.9142,
594
- "step": 790
595
- },
596
- {
597
- "epoch": 0.23675643681562591,
598
- "grad_norm": 43.0,
599
- "learning_rate": 0.0001627176358473537,
600
- "loss": 2.1002,
601
- "step": 800
602
- },
603
- {
604
- "epoch": 0.23675643681562591,
605
- "eval_loss": 2.160681962966919,
606
- "eval_runtime": 38.0783,
607
- "eval_samples_per_second": 39.445,
608
- "eval_steps_per_second": 9.874,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.23971589227582124,
613
- "grad_norm": 2.390625,
614
- "learning_rate": 0.0001616818989495711,
615
- "loss": 2.4747,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.24267534773601657,
620
- "grad_norm": 3.59375,
621
- "learning_rate": 0.00016063536950548826,
622
- "loss": 2.1356,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.2456348031962119,
627
- "grad_norm": 2.484375,
628
- "learning_rate": 0.0001595782306274553,
629
- "loss": 2.0635,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.24859425865640722,
634
- "grad_norm": 5.625,
635
- "learning_rate": 0.00015851066728416618,
636
- "loss": 2.122,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.25155371411660254,
641
- "grad_norm": 27.75,
642
- "learning_rate": 0.00015743286626829437,
643
- "loss": 2.1348,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.25451316957679787,
648
- "grad_norm": 1.875,
649
- "learning_rate": 0.00015634501616380967,
650
- "loss": 2.2736,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.2574726250369932,
655
- "grad_norm": 3.34375,
656
- "learning_rate": 0.00015524730731298134,
657
- "loss": 2.1229,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.2604320804971885,
662
- "grad_norm": 3.640625,
663
- "learning_rate": 0.0001541399317830738,
664
- "loss": 2.3073,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.26339153595738385,
669
- "grad_norm": 4.4375,
670
- "learning_rate": 0.0001530230833327405,
671
- "loss": 1.9859,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.2663509914175792,
676
- "grad_norm": 12.25,
677
- "learning_rate": 0.00015189695737812152,
678
- "loss": 2.0202,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.2693104468777745,
683
- "grad_norm": 2.1875,
684
- "learning_rate": 0.0001507617509586517,
685
- "loss": 2.2262,
686
- "step": 910
687
- },
688
- {
689
- "epoch": 0.2722699023379698,
690
- "grad_norm": 2.125,
691
- "learning_rate": 0.00014961766270258422,
692
- "loss": 2.2005,
693
- "step": 920
694
- },
695
- {
696
- "epoch": 0.27522935779816515,
697
- "grad_norm": 2.734375,
698
- "learning_rate": 0.00014846489279223652,
699
- "loss": 1.8805,
700
- "step": 930
701
- },
702
- {
703
- "epoch": 0.2781888132583605,
704
- "grad_norm": 4.5,
705
- "learning_rate": 0.0001473036429289641,
706
- "loss": 2.272,
707
- "step": 940
708
- },
709
- {
710
- "epoch": 0.2811482687185558,
711
- "grad_norm": 23.5,
712
- "learning_rate": 0.0001461341162978688,
713
- "loss": 1.926,
714
- "step": 950
715
- },
716
- {
717
- "epoch": 0.28410772417875113,
718
- "grad_norm": 2.234375,
719
- "learning_rate": 0.00014495651753224705,
720
- "loss": 2.264,
721
- "step": 960
722
- },
723
- {
724
- "epoch": 0.28706717963894646,
725
- "grad_norm": 2.28125,
726
- "learning_rate": 0.00014377105267778518,
727
- "loss": 2.3195,
728
- "step": 970
729
- },
730
- {
731
- "epoch": 0.2900266350991418,
732
- "grad_norm": 2.859375,
733
- "learning_rate": 0.00014257792915650728,
734
- "loss": 2.1811,
735
- "step": 980
736
- },
737
- {
738
- "epoch": 0.2929860905593371,
739
- "grad_norm": 5.34375,
740
- "learning_rate": 0.00014137735573048233,
741
- "loss": 2.2588,
742
- "step": 990
743
- },
744
- {
745
- "epoch": 0.2959455460195324,
746
- "grad_norm": 37.75,
747
- "learning_rate": 0.00014016954246529696,
748
- "loss": 1.901,
749
- "step": 1000
750
- },
751
- {
752
- "epoch": 0.2959455460195324,
753
- "eval_loss": 2.1415603160858154,
754
- "eval_runtime": 37.3621,
755
- "eval_samples_per_second": 40.201,
756
- "eval_steps_per_second": 10.064,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 0.2989050014797277,
761
- "grad_norm": 2.15625,
762
- "learning_rate": 0.00013895470069330004,
763
- "loss": 2.4456,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 0.30186445693992303,
768
- "grad_norm": 2.984375,
769
- "learning_rate": 0.00013773304297662559,
770
- "loss": 2.0006,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 0.30482391240011836,
775
- "grad_norm": 3.5625,
776
- "learning_rate": 0.00013650478307000057,
777
- "loss": 2.1085,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 0.3077833678603137,
782
- "grad_norm": 9.5,
783
- "learning_rate": 0.00013527013588334415,
784
- "loss": 2.2559,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 0.310742823320509,
789
- "grad_norm": 19.375,
790
- "learning_rate": 0.00013402931744416433,
791
- "loss": 2.0562,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 0.31370227878070434,
796
- "grad_norm": 2.109375,
797
- "learning_rate": 0.00013278254485975976,
798
- "loss": 2.2395,
799
- "step": 1060
800
- },
801
- {
802
- "epoch": 0.31666173424089966,
803
- "grad_norm": 2.578125,
804
- "learning_rate": 0.00013153003627923218,
805
- "loss": 2.3515,
806
- "step": 1070
807
- },
808
- {
809
- "epoch": 0.319621189701095,
810
- "grad_norm": 3.890625,
811
- "learning_rate": 0.00013027201085531634,
812
- "loss": 2.3772,
813
- "step": 1080
814
- },
815
- {
816
- "epoch": 0.3225806451612903,
817
- "grad_norm": 3.390625,
818
- "learning_rate": 0.00012900868870603503,
819
- "loss": 2.0078,
820
- "step": 1090
821
- },
822
- {
823
- "epoch": 0.32554010062148564,
824
- "grad_norm": 17.75,
825
- "learning_rate": 0.00012774029087618446,
826
- "loss": 2.3486,
827
- "step": 1100
828
- },
829
- {
830
- "epoch": 0.32849955608168097,
831
- "grad_norm": 2.0,
832
- "learning_rate": 0.00012646703929865817,
833
- "loss": 2.1657,
834
- "step": 1110
835
- },
836
- {
837
- "epoch": 0.3314590115418763,
838
- "grad_norm": 2.578125,
839
- "learning_rate": 0.00012518915675561483,
840
- "loss": 2.3418,
841
- "step": 1120
842
- },
843
- {
844
- "epoch": 0.3344184670020716,
845
- "grad_norm": 3.390625,
846
- "learning_rate": 0.00012390686683949798,
847
- "loss": 2.0369,
848
- "step": 1130
849
- },
850
- {
851
- "epoch": 0.33737792246226694,
852
- "grad_norm": 7.90625,
853
- "learning_rate": 0.00012262039391391404,
854
- "loss": 2.0235,
855
- "step": 1140
856
- },
857
- {
858
- "epoch": 0.34033737792246227,
859
- "grad_norm": 31.75,
860
- "learning_rate": 0.0001213299630743747,
861
- "loss": 2.1905,
862
- "step": 1150
863
- },
864
- {
865
- "epoch": 0.3432968333826576,
866
- "grad_norm": 2.421875,
867
- "learning_rate": 0.00012003580010891213,
868
- "loss": 2.4817,
869
- "step": 1160
870
- },
871
- {
872
- "epoch": 0.3462562888428529,
873
- "grad_norm": 2.25,
874
- "learning_rate": 0.00011873813145857249,
875
- "loss": 2.195,
876
- "step": 1170
877
- },
878
- {
879
- "epoch": 0.34921574430304825,
880
- "grad_norm": 4.0,
881
- "learning_rate": 0.00011743718417779517,
882
- "loss": 2.307,
883
- "step": 1180
884
- },
885
- {
886
- "epoch": 0.3521751997632436,
887
- "grad_norm": 4.59375,
888
- "learning_rate": 0.00011613318589468511,
889
- "loss": 1.9753,
890
- "step": 1190
891
- },
892
- {
893
- "epoch": 0.3551346552234389,
894
- "grad_norm": 11.3125,
895
- "learning_rate": 0.0001148263647711842,
896
- "loss": 1.53,
897
- "step": 1200
898
- },
899
- {
900
- "epoch": 0.3551346552234389,
901
- "eval_loss": 2.1453933715820312,
902
- "eval_runtime": 37.6799,
903
- "eval_samples_per_second": 39.862,
904
- "eval_steps_per_second": 9.979,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 0.3580941106836342,
909
- "grad_norm": 2.46875,
910
- "learning_rate": 0.0001135169494631497,
911
- "loss": 2.2422,
912
- "step": 1210
913
- },
914
- {
915
- "epoch": 0.36105356614382955,
916
- "grad_norm": 2.46875,
917
- "learning_rate": 0.00011220516908034601,
918
- "loss": 2.1766,
919
- "step": 1220
920
- },
921
- {
922
- "epoch": 0.3640130216040249,
923
- "grad_norm": 4.40625,
924
- "learning_rate": 0.00011089125314635726,
925
- "loss": 2.2057,
926
- "step": 1230
927
- },
928
- {
929
- "epoch": 0.3669724770642202,
930
- "grad_norm": 5.09375,
931
- "learning_rate": 0.00010957543155842702,
932
- "loss": 1.7583,
933
- "step": 1240
934
- },
935
- {
936
- "epoch": 0.36993193252441553,
937
- "grad_norm": 16.125,
938
- "learning_rate": 0.00010825793454723325,
939
- "loss": 1.9914,
940
- "step": 1250
941
- },
942
- {
943
- "epoch": 0.37289138798461086,
944
- "grad_norm": 2.046875,
945
- "learning_rate": 0.00010693899263660441,
946
- "loss": 2.4002,
947
- "step": 1260
948
- },
949
- {
950
- "epoch": 0.3758508434448062,
951
- "grad_norm": 3.0,
952
- "learning_rate": 0.00010561883660318455,
953
- "loss": 2.2105,
954
- "step": 1270
955
- },
956
- {
957
- "epoch": 0.37881029890500145,
958
- "grad_norm": 3.953125,
959
- "learning_rate": 0.00010429769743605407,
960
- "loss": 1.9458,
961
- "step": 1280
962
- },
963
- {
964
- "epoch": 0.3817697543651968,
965
- "grad_norm": 3.953125,
966
- "learning_rate": 0.00010297580629631325,
967
- "loss": 1.7066,
968
- "step": 1290
969
- },
970
- {
971
- "epoch": 0.3847292098253921,
972
- "grad_norm": 20.5,
973
- "learning_rate": 0.00010165339447663587,
974
- "loss": 1.9038,
975
- "step": 1300
976
- },
977
- {
978
- "epoch": 0.38768866528558743,
979
- "grad_norm": 2.109375,
980
- "learning_rate": 0.00010033069336079952,
981
- "loss": 2.2051,
982
- "step": 1310
983
- },
984
- {
985
- "epoch": 0.39064812074578276,
986
- "grad_norm": 4.15625,
987
- "learning_rate": 9.900793438320037e-05,
988
- "loss": 2.137,
989
- "step": 1320
990
- },
991
- {
992
- "epoch": 0.3936075762059781,
993
- "grad_norm": 3.40625,
994
- "learning_rate": 9.768534898835862e-05,
995
- "loss": 2.072,
996
- "step": 1330
997
- },
998
- {
999
- "epoch": 0.3965670316661734,
1000
- "grad_norm": 4.125,
1001
- "learning_rate": 9.636316859042259e-05,
1002
- "loss": 2.2351,
1003
- "step": 1340
1004
- },
1005
- {
1006
- "epoch": 0.39952648712636873,
1007
- "grad_norm": 12.5625,
1008
- "learning_rate": 9.504162453267777e-05,
1009
- "loss": 2.0439,
1010
- "step": 1350
1011
- },
1012
- {
1013
- "epoch": 0.40248594258656406,
1014
- "grad_norm": 2.875,
1015
- "learning_rate": 9.372094804706867e-05,
1016
- "loss": 2.5911,
1017
- "step": 1360
1018
- },
1019
- {
1020
- "epoch": 0.4054453980467594,
1021
- "grad_norm": 3.75,
1022
- "learning_rate": 9.24013702137397e-05,
1023
- "loss": 2.1565,
1024
- "step": 1370
1025
- },
1026
- {
1027
- "epoch": 0.4084048535069547,
1028
- "grad_norm": 4.0,
1029
- "learning_rate": 9.108312192060298e-05,
1030
- "loss": 2.2613,
1031
- "step": 1380
1032
- },
1033
- {
1034
- "epoch": 0.41136430896715004,
1035
- "grad_norm": 4.875,
1036
- "learning_rate": 8.97664338229395e-05,
1037
- "loss": 1.8462,
1038
- "step": 1390
1039
- },
1040
- {
1041
- "epoch": 0.41432376442734536,
1042
- "grad_norm": 24.375,
1043
- "learning_rate": 8.845153630304139e-05,
1044
- "loss": 2.1576,
1045
- "step": 1400
1046
- },
1047
- {
1048
- "epoch": 0.41432376442734536,
1049
- "eval_loss": 2.132373571395874,
1050
- "eval_runtime": 38.4807,
1051
- "eval_samples_per_second": 39.033,
1052
- "eval_steps_per_second": 9.771,
1053
- "step": 1400
1054
- },
1055
- {
1056
- "epoch": 0.4172832198875407,
1057
- "grad_norm": 2.046875,
1058
- "learning_rate": 8.713865942990141e-05,
1059
- "loss": 2.4016,
1060
- "step": 1410
1061
- },
1062
- {
1063
- "epoch": 0.420242675347736,
1064
- "grad_norm": 3.515625,
1065
- "learning_rate": 8.582803291895758e-05,
1066
- "loss": 2.3257,
1067
- "step": 1420
1068
- },
1069
- {
1070
- "epoch": 0.42320213080793134,
1071
- "grad_norm": 6.3125,
1072
- "learning_rate": 8.451988609189987e-05,
1073
- "loss": 2.0979,
1074
- "step": 1430
1075
- },
1076
- {
1077
- "epoch": 0.42616158626812667,
1078
- "grad_norm": 4.25,
1079
- "learning_rate": 8.321444783654524e-05,
1080
- "loss": 1.8707,
1081
- "step": 1440
1082
- },
1083
- {
1084
- "epoch": 0.429121041728322,
1085
- "grad_norm": 13.4375,
1086
- "learning_rate": 8.191194656678904e-05,
1087
- "loss": 2.05,
1088
- "step": 1450
1089
- },
1090
- {
1091
- "epoch": 0.4320804971885173,
1092
- "grad_norm": 1.8671875,
1093
- "learning_rate": 8.061261018263919e-05,
1094
- "loss": 2.3737,
1095
- "step": 1460
1096
- },
1097
- {
1098
- "epoch": 0.43503995264871265,
1099
- "grad_norm": 3.015625,
1100
- "learning_rate": 7.931666603034033e-05,
1101
- "loss": 2.3148,
1102
- "step": 1470
1103
- },
1104
- {
1105
- "epoch": 0.437999408108908,
1106
- "grad_norm": 3.296875,
1107
- "learning_rate": 7.80243408625947e-05,
1108
- "loss": 2.3303,
1109
- "step": 1480
1110
- },
1111
- {
1112
- "epoch": 0.4409588635691033,
1113
- "grad_norm": 4.15625,
1114
- "learning_rate": 7.673586079888698e-05,
1115
- "loss": 2.2395,
1116
- "step": 1490
1117
- },
1118
- {
1119
- "epoch": 0.4439183190292986,
1120
- "grad_norm": 18.375,
1121
- "learning_rate": 7.54514512859201e-05,
1122
- "loss": 2.1118,
1123
- "step": 1500
1124
- },
1125
- {
1126
- "epoch": 0.44687777448949395,
1127
- "grad_norm": 1.78125,
1128
- "learning_rate": 7.417133705816837e-05,
1129
- "loss": 2.2514,
1130
- "step": 1510
1131
- },
1132
- {
1133
- "epoch": 0.4498372299496893,
1134
- "grad_norm": 2.359375,
1135
- "learning_rate": 7.289574209855559e-05,
1136
- "loss": 2.3813,
1137
- "step": 1520
1138
- },
1139
- {
1140
- "epoch": 0.4527966854098846,
1141
- "grad_norm": 3.25,
1142
- "learning_rate": 7.16248895992645e-05,
1143
- "loss": 2.0406,
1144
- "step": 1530
1145
- },
1146
- {
1147
- "epoch": 0.45575614087007993,
1148
- "grad_norm": 6.15625,
1149
- "learning_rate": 7.035900192268464e-05,
1150
- "loss": 1.8527,
1151
- "step": 1540
1152
- },
1153
- {
1154
- "epoch": 0.45871559633027525,
1155
- "grad_norm": 24.125,
1156
- "learning_rate": 6.909830056250527e-05,
1157
- "loss": 1.8146,
1158
- "step": 1550
1159
- },
1160
- {
1161
- "epoch": 0.4616750517904706,
1162
- "grad_norm": 1.9453125,
1163
- "learning_rate": 6.784300610496048e-05,
1164
- "loss": 2.4665,
1165
- "step": 1560
1166
- },
1167
- {
1168
- "epoch": 0.46463450725066585,
1169
- "grad_norm": 2.859375,
1170
- "learning_rate": 6.65933381902329e-05,
1171
- "loss": 2.2625,
1172
- "step": 1570
1173
- },
1174
- {
1175
- "epoch": 0.4675939627108612,
1176
- "grad_norm": 3.609375,
1177
- "learning_rate": 6.534951547402322e-05,
1178
- "loss": 2.2715,
1179
- "step": 1580
1180
- },
1181
- {
1182
- "epoch": 0.4705534181710565,
1183
- "grad_norm": 4.5625,
1184
- "learning_rate": 6.411175558929152e-05,
1185
- "loss": 1.9711,
1186
- "step": 1590
1187
- },
1188
- {
1189
- "epoch": 0.47351287363125183,
1190
- "grad_norm": 11.125,
1191
- "learning_rate": 6.28802751081779e-05,
1192
- "loss": 2.1731,
1193
- "step": 1600
1194
- },
1195
- {
1196
- "epoch": 0.47351287363125183,
1197
- "eval_loss": 2.12680983543396,
1198
- "eval_runtime": 39.4155,
1199
- "eval_samples_per_second": 38.107,
1200
- "eval_steps_per_second": 9.539,
1201
- "step": 1600
1202
- },
1203
- {
1204
- "epoch": 0.47647232909144716,
1205
- "grad_norm": 1.96875,
1206
- "learning_rate": 6.165528950410884e-05,
1207
- "loss": 2.1656,
1208
- "step": 1610
1209
- },
1210
- {
1211
- "epoch": 0.4794317845516425,
1212
- "grad_norm": 3.1875,
1213
- "learning_rate": 6.0437013114095195e-05,
1214
- "loss": 2.2137,
1215
- "step": 1620
1216
- },
1217
- {
1218
- "epoch": 0.4823912400118378,
1219
- "grad_norm": 3.71875,
1220
- "learning_rate": 5.922565910122967e-05,
1221
- "loss": 2.253,
1222
- "step": 1630
1223
- },
1224
- {
1225
- "epoch": 0.48535069547203313,
1226
- "grad_norm": 7.21875,
1227
- "learning_rate": 5.8021439417389444e-05,
1228
- "loss": 2.077,
1229
- "step": 1640
1230
- },
1231
- {
1232
- "epoch": 0.48831015093222846,
1233
- "grad_norm": 18.375,
1234
- "learning_rate": 5.6824564766150726e-05,
1235
- "loss": 1.8215,
1236
- "step": 1650
1237
- },
1238
- {
1239
- "epoch": 0.4912696063924238,
1240
- "grad_norm": 1.953125,
1241
- "learning_rate": 5.563524456592163e-05,
1242
- "loss": 2.2989,
1243
- "step": 1660
1244
- },
1245
- {
1246
- "epoch": 0.4942290618526191,
1247
- "grad_norm": 3.0625,
1248
- "learning_rate": 5.4453686913300074e-05,
1249
- "loss": 2.1334,
1250
- "step": 1670
1251
- },
1252
- {
1253
- "epoch": 0.49718851731281444,
1254
- "grad_norm": 4.125,
1255
- "learning_rate": 5.328009854666303e-05,
1256
- "loss": 2.3074,
1257
- "step": 1680
1258
- },
1259
- {
1260
- "epoch": 0.5001479727730098,
1261
- "grad_norm": 4.3125,
1262
- "learning_rate": 5.2114684809993044e-05,
1263
- "loss": 2.1466,
1264
- "step": 1690
1265
- },
1266
- {
1267
- "epoch": 0.5031074282332051,
1268
- "grad_norm": 21.375,
1269
- "learning_rate": 5.095764961694922e-05,
1270
- "loss": 1.9702,
1271
- "step": 1700
1272
- },
1273
- {
1274
- "epoch": 0.5060668836934004,
1275
- "grad_norm": 2.671875,
1276
- "learning_rate": 4.980919541518796e-05,
1277
- "loss": 2.3311,
1278
- "step": 1710
1279
- },
1280
- {
1281
- "epoch": 0.5090263391535957,
1282
- "grad_norm": 3.546875,
1283
- "learning_rate": 4.866952315094088e-05,
1284
- "loss": 2.1625,
1285
- "step": 1720
1286
- },
1287
- {
1288
- "epoch": 0.511985794613791,
1289
- "grad_norm": 4.9375,
1290
- "learning_rate": 4.753883223385467e-05,
1291
- "loss": 2.1047,
1292
- "step": 1730
1293
- },
1294
- {
1295
- "epoch": 0.5149452500739864,
1296
- "grad_norm": 5.40625,
1297
- "learning_rate": 4.6417320502100316e-05,
1298
- "loss": 1.9118,
1299
- "step": 1740
1300
- },
1301
- {
1302
- "epoch": 0.5179047055341817,
1303
- "grad_norm": 22.5,
1304
- "learning_rate": 4.530518418775733e-05,
1305
- "loss": 2.3183,
1306
- "step": 1750
1307
- },
1308
- {
1309
- "epoch": 0.520864160994377,
1310
- "grad_norm": 2.09375,
1311
- "learning_rate": 4.4202617882478405e-05,
1312
- "loss": 2.3082,
1313
- "step": 1760
1314
- },
1315
- {
1316
- "epoch": 0.5238236164545723,
1317
- "grad_norm": 2.46875,
1318
- "learning_rate": 4.310981450344189e-05,
1319
- "loss": 2.1393,
1320
- "step": 1770
1321
- },
1322
- {
1323
- "epoch": 0.5267830719147677,
1324
- "grad_norm": 3.109375,
1325
- "learning_rate": 4.2026965259596666e-05,
1326
- "loss": 2.0223,
1327
- "step": 1780
1328
- },
1329
- {
1330
- "epoch": 0.529742527374963,
1331
- "grad_norm": 7.875,
1332
- "learning_rate": 4.0954259618206295e-05,
1333
- "loss": 1.9122,
1334
- "step": 1790
1335
- },
1336
- {
1337
- "epoch": 0.5327019828351583,
1338
- "grad_norm": 38.25,
1339
- "learning_rate": 3.9891885271697496e-05,
1340
- "loss": 2.149,
1341
- "step": 1800
1342
- },
1343
- {
1344
- "epoch": 0.5327019828351583,
1345
- "eval_loss": 2.1312239170074463,
1346
- "eval_runtime": 37.8217,
1347
- "eval_samples_per_second": 39.713,
1348
- "eval_steps_per_second": 9.941,
1349
- "step": 1800
1350
- },
1351
- {
1352
- "epoch": 0.5356614382953536,
1353
- "grad_norm": 1.7890625,
1354
- "learning_rate": 3.884002810481958e-05,
1355
- "loss": 2.3824,
1356
- "step": 1810
1357
- },
1358
- {
1359
- "epoch": 0.538620893755549,
1360
- "grad_norm": 2.859375,
1361
- "learning_rate": 3.779887216211995e-05,
1362
- "loss": 2.1831,
1363
- "step": 1820
1364
- },
1365
- {
1366
- "epoch": 0.5415803492157443,
1367
- "grad_norm": 4.25,
1368
- "learning_rate": 3.676859961574162e-05,
1369
- "loss": 2.0926,
1370
- "step": 1830
1371
- },
1372
- {
1373
- "epoch": 0.5445398046759397,
1374
- "grad_norm": 5.4375,
1375
- "learning_rate": 3.574939073354838e-05,
1376
- "loss": 2.107,
1377
- "step": 1840
1378
- },
1379
- {
1380
- "epoch": 0.5474992601361349,
1381
- "grad_norm": 22.75,
1382
- "learning_rate": 3.4741423847583134e-05,
1383
- "loss": 2.1866,
1384
- "step": 1850
1385
- },
1386
- {
1387
- "epoch": 0.5504587155963303,
1388
- "grad_norm": 2.046875,
1389
- "learning_rate": 3.3744875322865034e-05,
1390
- "loss": 2.3345,
1391
- "step": 1860
1392
- },
1393
- {
1394
- "epoch": 0.5534181710565256,
1395
- "grad_norm": 2.734375,
1396
- "learning_rate": 3.275991952653054e-05,
1397
- "loss": 2.2518,
1398
- "step": 1870
1399
- },
1400
- {
1401
- "epoch": 0.556377626516721,
1402
- "grad_norm": 6.0,
1403
- "learning_rate": 3.178672879732435e-05,
1404
- "loss": 2.0092,
1405
- "step": 1880
1406
- },
1407
- {
1408
- "epoch": 0.5593370819769162,
1409
- "grad_norm": 7.0625,
1410
- "learning_rate": 3.0825473415445074e-05,
1411
- "loss": 1.7693,
1412
- "step": 1890
1413
- },
1414
- {
1415
- "epoch": 0.5622965374371116,
1416
- "grad_norm": 34.25,
1417
- "learning_rate": 2.9876321572751144e-05,
1418
- "loss": 2.3584,
1419
- "step": 1900
1420
- },
1421
- {
1422
- "epoch": 0.5652559928973069,
1423
- "grad_norm": 1.8828125,
1424
- "learning_rate": 2.8939439343332086e-05,
1425
- "loss": 2.4875,
1426
- "step": 1910
1427
- },
1428
- {
1429
- "epoch": 0.5682154483575023,
1430
- "grad_norm": 2.625,
1431
- "learning_rate": 2.8014990654450325e-05,
1432
- "loss": 2.3285,
1433
- "step": 1920
1434
- },
1435
- {
1436
- "epoch": 0.5711749038176975,
1437
- "grad_norm": 3.5,
1438
- "learning_rate": 2.7103137257858868e-05,
1439
- "loss": 2.2535,
1440
- "step": 1930
1441
- },
1442
- {
1443
- "epoch": 0.5741343592778929,
1444
- "grad_norm": 5.875,
1445
- "learning_rate": 2.6204038701499056e-05,
1446
- "loss": 2.0563,
1447
- "step": 1940
1448
- },
1449
- {
1450
- "epoch": 0.5770938147380882,
1451
- "grad_norm": 16.5,
1452
- "learning_rate": 2.5317852301584643e-05,
1453
- "loss": 1.8811,
1454
- "step": 1950
1455
- },
1456
- {
1457
- "epoch": 0.5800532701982836,
1458
- "grad_norm": 1.796875,
1459
- "learning_rate": 2.4444733115075823e-05,
1460
- "loss": 2.3993,
1461
- "step": 1960
1462
- },
1463
- {
1464
- "epoch": 0.5830127256584788,
1465
- "grad_norm": 2.359375,
1466
- "learning_rate": 2.3584833912548888e-05,
1467
- "loss": 2.1668,
1468
- "step": 1970
1469
- },
1470
- {
1471
- "epoch": 0.5859721811186742,
1472
- "grad_norm": 3.71875,
1473
- "learning_rate": 2.2738305151465645e-05,
1474
- "loss": 2.1702,
1475
- "step": 1980
1476
- },
1477
- {
1478
- "epoch": 0.5889316365788695,
1479
- "grad_norm": 6.53125,
1480
- "learning_rate": 2.190529494984782e-05,
1481
- "loss": 2.2042,
1482
- "step": 1990
1483
- },
1484
- {
1485
- "epoch": 0.5918910920390648,
1486
- "grad_norm": 18.375,
1487
- "learning_rate": 2.1085949060360654e-05,
1488
- "loss": 2.7263,
1489
- "step": 2000
1490
- },
1491
- {
1492
- "epoch": 0.5918910920390648,
1493
- "eval_loss": 2.1315317153930664,
1494
- "eval_runtime": 38.3028,
1495
- "eval_samples_per_second": 39.214,
1496
- "eval_steps_per_second": 9.817,
1497
- "step": 2000
1498
  }
1499
  ],
1500
  "logging_steps": 10,
@@ -1514,7 +330,7 @@
1514
  "attributes": {}
1515
  }
1516
  },
1517
- "total_flos": 3.09608285995008e+16,
1518
  "train_batch_size": 4,
1519
  "trial_name": null,
1520
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.11837821840781296,
5
  "eval_steps": 200,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.0002959455460195324,
13
  "eval_loss": 3.0371296405792236,
14
+ "eval_runtime": 24.5776,
15
+ "eval_samples_per_second": 61.112,
16
+ "eval_steps_per_second": 15.298,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.002959455460195324,
21
  "grad_norm": 2.84375,
22
  "learning_rate": 1.6000000000000003e-05,
23
+ "loss": 2.5729,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.005918910920390648,
28
+ "grad_norm": 4.4375,
29
  "learning_rate": 3.2000000000000005e-05,
30
  "loss": 2.6895,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.008878366380585973,
35
+ "grad_norm": 5.3125,
36
  "learning_rate": 4.8e-05,
37
+ "loss": 2.5333,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.011837821840781295,
42
+ "grad_norm": 7.9375,
43
  "learning_rate": 6.400000000000001e-05,
44
+ "loss": 2.9663,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.01479727730097662,
49
  "grad_norm": 25.375,
50
  "learning_rate": 8e-05,
51
+ "loss": 3.4106,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.017756732761171946,
56
+ "grad_norm": 3.125,
57
  "learning_rate": 9.6e-05,
58
+ "loss": 2.3132,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.020716188221367268,
63
+ "grad_norm": 2.9375,
64
  "learning_rate": 0.00011200000000000001,
65
+ "loss": 2.3039,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.02367564368156259,
70
  "grad_norm": 4.34375,
71
  "learning_rate": 0.00012800000000000002,
72
+ "loss": 2.4118,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.026635099141757917,
77
  "grad_norm": 5.5,
78
  "learning_rate": 0.000144,
79
+ "loss": 2.3272,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.02959455460195324,
84
+ "grad_norm": 22.0,
85
  "learning_rate": 0.00016,
86
+ "loss": 3.3301,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.032554010062148565,
91
+ "grad_norm": 3.890625,
92
  "learning_rate": 0.00017600000000000002,
93
+ "loss": 2.4817,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.03551346552234389,
98
+ "grad_norm": 2.921875,
99
  "learning_rate": 0.000192,
100
+ "loss": 2.2621,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.03847292098253921,
105
+ "grad_norm": 2.859375,
106
  "learning_rate": 0.0001999978128380225,
107
+ "loss": 2.3509,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.041432376442734536,
112
+ "grad_norm": 7.125,
113
  "learning_rate": 0.0001999803161162393,
114
  "loss": 2.3626,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.04439183190292986,
119
+ "grad_norm": 32.5,
120
  "learning_rate": 0.00019994532573409262,
121
+ "loss": 2.2664,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.04735128736312518,
126
  "grad_norm": 5.6875,
127
  "learning_rate": 0.00019989284781388617,
128
+ "loss": 2.378,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.05031074282332051,
133
+ "grad_norm": 2.609375,
134
  "learning_rate": 0.00019982289153773646,
135
+ "loss": 2.3008,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.053270198283515834,
140
+ "grad_norm": 4.40625,
141
  "learning_rate": 0.00019973546914596623,
142
+ "loss": 2.4329,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.05622965374371116,
147
  "grad_norm": 6.90625,
148
  "learning_rate": 0.00019963059593496268,
149
+ "loss": 2.2273,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.05918910920390648,
154
+ "grad_norm": 26.125,
155
  "learning_rate": 0.00019950829025450114,
156
+ "loss": 2.5059,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.05918910920390648,
161
+ "eval_loss": 2.298793077468872,
162
+ "eval_runtime": 23.8828,
163
+ "eval_samples_per_second": 62.891,
164
+ "eval_steps_per_second": 15.744,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.062148564664101805,
169
+ "grad_norm": 2.53125,
170
  "learning_rate": 0.0001993685735045343,
171
+ "loss": 2.4118,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.06510802012429713,
176
  "grad_norm": 2.890625,
177
  "learning_rate": 0.0001992114701314478,
178
+ "loss": 2.3572,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.06806747558449246,
183
+ "grad_norm": 3.046875,
184
  "learning_rate": 0.000199037007623783,
185
+ "loss": 2.2494,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.07102693104468778,
190
  "grad_norm": 6.65625,
191
  "learning_rate": 0.00019884521650742715,
192
+ "loss": 2.2517,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.0739863865048831,
197
+ "grad_norm": 39.75,
198
  "learning_rate": 0.00019863613034027224,
199
+ "loss": 2.7168,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.07694584196507842,
204
+ "grad_norm": 5.09375,
205
  "learning_rate": 0.0001984097857063434,
206
+ "loss": 2.1898,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.07990529742527375,
211
+ "grad_norm": 2.75,
212
  "learning_rate": 0.0001981662222093976,
213
+ "loss": 2.409,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.08286475288546907,
218
+ "grad_norm": 3.15625,
219
  "learning_rate": 0.00019790548246599447,
220
+ "loss": 2.2657,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.0858242083456644,
225
+ "grad_norm": 5.375,
226
  "learning_rate": 0.00019762761209803927,
227
+ "loss": 2.0511,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.08878366380585972,
232
+ "grad_norm": 15.3125,
233
  "learning_rate": 0.0001973326597248006,
234
+ "loss": 2.2033,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.09174311926605505,
239
  "grad_norm": 3.234375,
240
  "learning_rate": 0.00019702067695440332,
241
+ "loss": 2.4281,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.09470257472625036,
246
+ "grad_norm": 2.890625,
247
  "learning_rate": 0.00019669171837479873,
248
+ "loss": 2.0234,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.09766203018644569,
253
  "grad_norm": 3.0,
254
  "learning_rate": 0.00019634584154421317,
255
+ "loss": 2.2922,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.10062148564664102,
260
+ "grad_norm": 5.5,
261
  "learning_rate": 0.00019598310698107702,
262
+ "loss": 2.1127,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.10358094110683634,
267
+ "grad_norm": 23.125,
268
  "learning_rate": 0.00019560357815343577,
269
+ "loss": 1.9014,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.10654039656703167,
274
  "grad_norm": 2.515625,
275
  "learning_rate": 0.00019520732146784491,
276
+ "loss": 2.2815,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.109499852027227,
281
  "grad_norm": 3.09375,
282
  "learning_rate": 0.0001947944062577507,
283
+ "loss": 2.0484,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.11245930748742232,
288
  "grad_norm": 4.125,
289
  "learning_rate": 0.00019436490477135878,
290
+ "loss": 2.2128,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.11541876294761765,
295
+ "grad_norm": 5.5625,
296
  "learning_rate": 0.00019391889215899299,
297
+ "loss": 1.9424,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.11837821840781296,
302
+ "grad_norm": 25.5,
303
  "learning_rate": 0.0001934564464599461,
304
+ "loss": 1.8624,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.11837821840781296,
309
+ "eval_loss": 2.2062976360321045,
310
+ "eval_runtime": 23.7998,
311
+ "eval_samples_per_second": 63.11,
312
+ "eval_steps_per_second": 15.798,
313
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  ],
316
  "logging_steps": 10,
 
330
  "attributes": {}
331
  }
332
  },
333
+ "total_flos": 6216909638860800.0,
334
  "train_batch_size": 4,
335
  "trial_name": null,
336
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8728015aed6a4d50cfee4eda0cc3ff69e5a4901af146b3fd10ebc2972270b3a8
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c005ee5b7b542fe5c31ab145aa5cb83513af587ae4027e5fb31ac777061c2a2
3
  size 6840