madhuHuggingface commited on
Commit
1b9237e
·
verified ·
1 Parent(s): 70b41d2

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cb557daf6d75831c1e4da4535fdc7690f5ccf481dc20f270738e296bd0bdd5c
3
  size 60785144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6eda6363323c6d79792a81bd69d938b3a58d34e1eb645e055ee597b8bf472ad
3
  size 60785144
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:148ccd6ae7f5f80cfe57bd86a288b885812f4fc65258f37d567f0168c8f6621a
3
- size 31803787
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebc1f15cea2729790f76d46a9aa205d07f6d4e3b1ddf01b2a41c940267766469
3
+ size 31149205
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e028462760f837e3f0e379c4b9e8963a3ff91e1441d984e3578a111bec3744ac
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd1b8760497cf2afe6ff758fde5edda9af4f73c2c6f23659a7e5f0cb97215d93
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -4,1169 +4,1064 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 1650,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.01818181818181818,
14
- "grad_norm": 2.724055290222168,
15
  "learning_rate": 9e-05,
16
- "loss": 0.8719,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.03636363636363636,
21
- "grad_norm": 1.765032410621643,
22
  "learning_rate": 0.00019,
23
- "loss": 0.0961,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.05454545454545454,
28
- "grad_norm": 2.5369484424591064,
29
- "learning_rate": 0.00019998495581483372,
30
- "loss": 0.2139,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.07272727272727272,
35
- "grad_norm": 1.9454748630523682,
36
- "learning_rate": 0.00019993295703551577,
37
- "loss": 0.2203,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 0.09090909090909091,
42
- "grad_norm": 10.630720138549805,
43
- "learning_rate": 0.00019984383724218924,
44
- "loss": 0.1146,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.10909090909090909,
49
- "grad_norm": 0.7372597455978394,
50
- "learning_rate": 0.00019971762953921923,
51
- "loss": 0.0478,
52
  "step": 60
53
  },
54
  {
55
- "epoch": 0.12727272727272726,
56
- "grad_norm": 0.2733962833881378,
57
- "learning_rate": 0.00019955438080761524,
58
- "loss": 0.0542,
59
  "step": 70
60
  },
61
  {
62
- "epoch": 0.14545454545454545,
63
- "grad_norm": 0.24756257236003876,
64
- "learning_rate": 0.00019935415168761682,
65
- "loss": 0.0428,
66
  "step": 80
67
  },
68
  {
69
- "epoch": 0.16363636363636364,
70
- "grad_norm": 0.9512230753898621,
71
- "learning_rate": 0.00019911701655616818,
72
- "loss": 0.0108,
73
  "step": 90
74
  },
75
  {
76
- "epoch": 0.18181818181818182,
77
- "grad_norm": 0.02735295332968235,
78
- "learning_rate": 0.00019884306349929017,
79
- "loss": 0.0079,
80
  "step": 100
81
  },
82
  {
83
- "epoch": 0.2,
84
- "grad_norm": 0.17632828652858734,
85
- "learning_rate": 0.00019853239427935994,
86
- "loss": 0.0156,
87
  "step": 110
88
  },
89
  {
90
- "epoch": 0.21818181818181817,
91
- "grad_norm": 1.9710729122161865,
92
- "learning_rate": 0.0001981851242973103,
93
- "loss": 0.0087,
94
  "step": 120
95
  },
96
  {
97
- "epoch": 0.23636363636363636,
98
- "grad_norm": 1.9644619226455688,
99
- "learning_rate": 0.00019780138254976308,
100
- "loss": 0.015,
101
  "step": 130
102
  },
103
  {
104
- "epoch": 0.2545454545454545,
105
- "grad_norm": 3.3607516288757324,
106
- "learning_rate": 0.0001973813115811122,
107
- "loss": 0.0164,
108
  "step": 140
109
  },
110
  {
111
- "epoch": 0.2727272727272727,
112
- "grad_norm": 0.11576563864946365,
113
- "learning_rate": 0.00019692506743057404,
114
- "loss": 0.0072,
115
  "step": 150
116
  },
117
  {
118
- "epoch": 0.2909090909090909,
119
- "grad_norm": 0.01492376159876585,
120
- "learning_rate": 0.00019643281957422545,
121
- "loss": 0.0124,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 0.3090909090909091,
126
- "grad_norm": 0.09349380433559418,
127
- "learning_rate": 0.0001959047508620502,
128
- "loss": 0.019,
129
  "step": 170
130
  },
131
  {
132
- "epoch": 0.32727272727272727,
133
- "grad_norm": 1.256628155708313,
134
- "learning_rate": 0.00019534105745001762,
135
- "loss": 0.0115,
136
  "step": 180
137
  },
138
  {
139
- "epoch": 0.34545454545454546,
140
- "grad_norm": 0.023120613768696785,
141
- "learning_rate": 0.00019474194872721892,
142
- "loss": 0.0071,
143
  "step": 190
144
  },
145
  {
146
- "epoch": 0.36363636363636365,
147
- "grad_norm": 0.7280399799346924,
148
- "learning_rate": 0.0001941076472380873,
149
- "loss": 0.0091,
150
  "step": 200
151
  },
152
  {
153
- "epoch": 0.38181818181818183,
154
- "grad_norm": 0.3502386808395386,
155
- "learning_rate": 0.00019343838859973219,
156
- "loss": 0.0073,
157
  "step": 210
158
  },
159
  {
160
- "epoch": 0.4,
161
- "grad_norm": 0.02848837897181511,
162
- "learning_rate": 0.0001927344214144167,
163
- "loss": 0.0085,
164
  "step": 220
165
  },
166
  {
167
- "epoch": 0.41818181818181815,
168
- "grad_norm": 0.06355214864015579,
169
- "learning_rate": 0.00019199600717721243,
170
- "loss": 0.0089,
171
  "step": 230
172
  },
173
  {
174
- "epoch": 0.43636363636363634,
175
- "grad_norm": 0.043841782957315445,
176
- "learning_rate": 0.0001912234201788645,
177
- "loss": 0.0056,
178
  "step": 240
179
  },
180
  {
181
- "epoch": 0.45454545454545453,
182
- "grad_norm": 0.4108541011810303,
183
- "learning_rate": 0.00019041694740390362,
184
- "loss": 0.0136,
185
  "step": 250
186
  },
187
  {
188
- "epoch": 0.4727272727272727,
189
- "grad_norm": 1.324217677116394,
190
- "learning_rate": 0.00018957688842404337,
191
- "loss": 0.013,
192
  "step": 260
193
  },
194
  {
195
- "epoch": 0.4909090909090909,
196
- "grad_norm": 1.3584219217300415,
197
- "learning_rate": 0.00018870355528690134,
198
- "loss": 0.018,
199
  "step": 270
200
  },
201
  {
202
- "epoch": 0.509090909090909,
203
- "grad_norm": 0.11316097527742386,
204
- "learning_rate": 0.00018779727240008618,
205
- "loss": 0.0114,
206
  "step": 280
207
  },
208
  {
209
- "epoch": 0.5272727272727272,
210
- "grad_norm": 0.015650872141122818,
211
- "learning_rate": 0.00018685837641069342,
212
- "loss": 0.0067,
213
  "step": 290
214
  },
215
  {
216
- "epoch": 0.5454545454545454,
217
- "grad_norm": 0.011964845471084118,
218
- "learning_rate": 0.0001858872160802549,
219
- "loss": 0.0049,
220
  "step": 300
221
  },
222
  {
223
- "epoch": 0.5636363636363636,
224
- "grad_norm": 0.1816367506980896,
225
- "learning_rate": 0.00018488415215518807,
226
- "loss": 0.0055,
227
  "step": 310
228
  },
229
  {
230
- "epoch": 0.5818181818181818,
231
- "grad_norm": 0.0085946349427104,
232
- "learning_rate": 0.00018384955723279325,
233
- "loss": 0.0043,
234
  "step": 320
235
  },
236
  {
237
- "epoch": 0.6,
238
- "grad_norm": 0.0685335248708725,
239
- "learning_rate": 0.00018278381562284926,
240
- "loss": 0.0053,
241
  "step": 330
242
  },
243
  {
244
- "epoch": 0.6181818181818182,
245
- "grad_norm": 0.06212463974952698,
246
- "learning_rate": 0.00018168732320485774,
247
- "loss": 0.0045,
248
  "step": 340
249
  },
250
  {
251
- "epoch": 0.6363636363636364,
252
- "grad_norm": 0.002532408805564046,
253
- "learning_rate": 0.00018056048728099024,
254
- "loss": 0.0017,
255
  "step": 350
256
  },
257
  {
258
- "epoch": 0.6545454545454545,
259
- "grad_norm": 0.19549418985843658,
260
- "learning_rate": 0.000179403726424792,
261
- "loss": 0.0196,
262
  "step": 360
263
  },
264
  {
265
- "epoch": 0.6727272727272727,
266
- "grad_norm": 0.20419709384441376,
267
- "learning_rate": 0.00017821747032569906,
268
- "loss": 0.0073,
269
  "step": 370
270
  },
271
  {
272
- "epoch": 0.6909090909090909,
273
- "grad_norm": 0.038685109466314316,
274
- "learning_rate": 0.0001770021596294261,
275
- "loss": 0.0044,
276
  "step": 380
277
  },
278
  {
279
- "epoch": 0.7090909090909091,
280
- "grad_norm": 0.023335754871368408,
281
- "learning_rate": 0.00017575824577428453,
282
- "loss": 0.008,
283
  "step": 390
284
  },
285
  {
286
- "epoch": 0.7272727272727273,
287
- "grad_norm": 0.0774979218840599,
288
- "learning_rate": 0.00017448619082349165,
289
- "loss": 0.0046,
290
  "step": 400
291
  },
292
  {
293
- "epoch": 0.7454545454545455,
294
- "grad_norm": 0.009631125256419182,
295
- "learning_rate": 0.000173186467293533,
296
- "loss": 0.0043,
297
  "step": 410
298
  },
299
  {
300
- "epoch": 0.7636363636363637,
301
- "grad_norm": 0.04730548709630966,
302
- "learning_rate": 0.00017185955797864184,
303
- "loss": 0.0039,
304
  "step": 420
305
  },
306
  {
307
- "epoch": 0.7818181818181819,
308
- "grad_norm": 0.05997833237051964,
309
- "learning_rate": 0.00017050595577146061,
310
- "loss": 0.004,
311
  "step": 430
312
  },
313
  {
314
- "epoch": 0.8,
315
- "grad_norm": 0.7156815528869629,
316
- "learning_rate": 0.00016912616347995157,
317
- "loss": 0.0053,
318
  "step": 440
319
  },
320
  {
321
- "epoch": 0.8181818181818182,
322
- "grad_norm": 1.1794134378433228,
323
- "learning_rate": 0.00016772069364062432,
324
- "loss": 0.0063,
325
  "step": 450
326
  },
327
  {
328
- "epoch": 0.8363636363636363,
329
- "grad_norm": 0.9097093939781189,
330
- "learning_rate": 0.0001662900683281491,
331
- "loss": 0.0085,
332
  "step": 460
333
  },
334
  {
335
- "epoch": 0.8545454545454545,
336
- "grad_norm": 0.008992375805974007,
337
- "learning_rate": 0.0001648348189614275,
338
- "loss": 0.0037,
339
  "step": 470
340
  },
341
  {
342
- "epoch": 0.8727272727272727,
343
- "grad_norm": 0.024826984852552414,
344
- "learning_rate": 0.00016335548610619215,
345
- "loss": 0.0045,
346
  "step": 480
347
  },
348
  {
349
- "epoch": 0.8909090909090909,
350
- "grad_norm": 0.04557984322309494,
351
- "learning_rate": 0.00016185261927420845,
352
  "loss": 0.0039,
353
  "step": 490
354
  },
355
  {
356
- "epoch": 0.9090909090909091,
357
- "grad_norm": 0.016058191657066345,
358
- "learning_rate": 0.00016032677671915343,
359
- "loss": 0.0048,
360
  "step": 500
361
  },
362
  {
363
- "epoch": 0.9272727272727272,
364
- "grad_norm": 0.08177982270717621,
365
- "learning_rate": 0.00015877852522924732,
366
- "loss": 0.0047,
367
  "step": 510
368
  },
369
  {
370
- "epoch": 0.9454545454545454,
371
- "grad_norm": 0.00327894976362586,
372
- "learning_rate": 0.00015720843991671486,
373
- "loss": 0.0033,
374
  "step": 520
375
  },
376
  {
377
- "epoch": 0.9636363636363636,
378
- "grad_norm": 0.0032260078005492687,
379
- "learning_rate": 0.0001556171040041546,
380
- "loss": 0.0027,
381
  "step": 530
382
  },
383
  {
384
- "epoch": 0.9818181818181818,
385
- "grad_norm": 0.0032725839409977198,
386
- "learning_rate": 0.00015400510860789546,
387
- "loss": 0.0034,
388
  "step": 540
389
  },
390
  {
391
- "epoch": 1.0,
392
- "grad_norm": 0.005383970681577921,
393
- "learning_rate": 0.00015237305251842122,
394
- "loss": 0.003,
395
  "step": 550
396
  },
397
  {
398
- "epoch": 1.018181818181818,
399
- "grad_norm": 0.03116321749985218,
400
- "learning_rate": 0.00015072154197794422,
401
- "loss": 0.0038,
402
  "step": 560
403
  },
404
  {
405
- "epoch": 1.0363636363636364,
406
- "grad_norm": 0.08080937713384628,
407
- "learning_rate": 0.00014905119045521115,
408
- "loss": 0.0022,
409
  "step": 570
410
  },
411
  {
412
- "epoch": 1.0545454545454545,
413
- "grad_norm": 0.08265340328216553,
414
- "learning_rate": 0.00014736261841762454,
415
- "loss": 0.0052,
416
  "step": 580
417
  },
418
  {
419
- "epoch": 1.0727272727272728,
420
- "grad_norm": 0.001597880502231419,
421
- "learning_rate": 0.00014565645310076427,
422
- "loss": 0.0043,
423
  "step": 590
424
  },
425
  {
426
- "epoch": 1.0909090909090908,
427
- "grad_norm": 0.02374003641307354,
428
- "learning_rate": 0.0001439333282753954,
429
- "loss": 0.0045,
430
  "step": 600
431
  },
432
  {
433
- "epoch": 1.1090909090909091,
434
- "grad_norm": 0.024072911590337753,
435
- "learning_rate": 0.00014219388401204796,
436
- "loss": 0.0059,
437
  "step": 610
438
  },
439
  {
440
- "epoch": 1.1272727272727272,
441
- "grad_norm": 0.0032293887343257666,
442
- "learning_rate": 0.00014043876644325703,
443
- "loss": 0.0042,
444
  "step": 620
445
  },
446
  {
447
- "epoch": 1.1454545454545455,
448
- "grad_norm": 0.03435875102877617,
449
- "learning_rate": 0.00013866862752355088,
450
- "loss": 0.0035,
451
  "step": 630
452
  },
453
  {
454
- "epoch": 1.1636363636363636,
455
- "grad_norm": 0.0040184855461120605,
456
- "learning_rate": 0.00013688412478727634,
457
- "loss": 0.0035,
458
  "step": 640
459
  },
460
  {
461
- "epoch": 1.1818181818181819,
462
- "grad_norm": 0.002403195947408676,
463
- "learning_rate": 0.0001350859211043517,
464
- "loss": 0.0036,
465
  "step": 650
466
  },
467
  {
468
- "epoch": 1.2,
469
- "grad_norm": 0.0033931646030396223,
470
- "learning_rate": 0.00013327468443403783,
471
- "loss": 0.0037,
472
  "step": 660
473
  },
474
  {
475
- "epoch": 1.2181818181818183,
476
- "grad_norm": 0.05022850260138512,
477
- "learning_rate": 0.00013145108757681818,
478
- "loss": 0.0039,
479
  "step": 670
480
  },
481
  {
482
- "epoch": 1.2363636363636363,
483
- "grad_norm": 0.03751479461789131,
484
- "learning_rate": 0.00012961580792448106,
485
- "loss": 0.0068,
486
  "step": 680
487
  },
488
  {
489
- "epoch": 1.2545454545454544,
490
- "grad_norm": 0.02040746621787548,
491
- "learning_rate": 0.00012776952720849636,
492
  "loss": 0.0035,
493
  "step": 690
494
  },
495
  {
496
- "epoch": 1.2727272727272727,
497
- "grad_norm": 0.07843760401010513,
498
- "learning_rate": 0.0001259129312467799,
499
- "loss": 0.0053,
500
  "step": 700
501
  },
502
  {
503
- "epoch": 1.290909090909091,
504
- "grad_norm": 0.03921971097588539,
505
- "learning_rate": 0.00012404670968894037,
506
- "loss": 0.0029,
507
  "step": 710
508
  },
509
  {
510
- "epoch": 1.309090909090909,
511
- "grad_norm": 0.01010841503739357,
512
- "learning_rate": 0.00012217155576010224,
513
- "loss": 0.0043,
514
  "step": 720
515
  },
516
  {
517
- "epoch": 1.3272727272727272,
518
- "grad_norm": 0.007104328367859125,
519
- "learning_rate": 0.00012028816600340136,
520
- "loss": 0.0029,
521
  "step": 730
522
  },
523
  {
524
- "epoch": 1.3454545454545455,
525
- "grad_norm": 0.0017335556913167238,
526
- "learning_rate": 0.0001183972400212473,
527
- "loss": 0.0029,
528
  "step": 740
529
  },
530
  {
531
- "epoch": 1.3636363636363638,
532
- "grad_norm": 0.03576100617647171,
533
- "learning_rate": 0.00011649948021544979,
534
- "loss": 0.0033,
535
  "step": 750
536
  },
537
  {
538
- "epoch": 1.3818181818181818,
539
- "grad_norm": 0.0428117960691452,
540
- "learning_rate": 0.00011459559152630511,
541
- "loss": 0.0047,
542
  "step": 760
543
  },
544
  {
545
- "epoch": 1.4,
546
- "grad_norm": 0.005341957323253155,
547
- "learning_rate": 0.00011268628117073939,
548
- "loss": 0.0026,
549
  "step": 770
550
  },
551
  {
552
- "epoch": 1.4181818181818182,
553
- "grad_norm": 0.0017387029947713017,
554
- "learning_rate": 0.00011077225837960659,
555
- "loss": 0.003,
556
  "step": 780
557
  },
558
  {
559
- "epoch": 1.4363636363636363,
560
- "grad_norm": 0.3902546167373657,
561
- "learning_rate": 0.00010885423413423812,
562
- "loss": 0.0032,
563
  "step": 790
564
  },
565
  {
566
- "epoch": 1.4545454545454546,
567
- "grad_norm": 0.0021570881363004446,
568
- "learning_rate": 0.00010693292090234228,
569
- "loss": 0.0023,
570
  "step": 800
571
  },
572
  {
573
- "epoch": 1.4727272727272727,
574
- "grad_norm": 0.0013360620941966772,
575
- "learning_rate": 0.00010500903237335156,
576
- "loss": 0.0028,
577
  "step": 810
578
  },
579
  {
580
- "epoch": 1.490909090909091,
581
- "grad_norm": 0.0020807127002626657,
582
- "learning_rate": 0.00010308328319331621,
583
- "loss": 0.004,
584
  "step": 820
585
  },
586
  {
587
- "epoch": 1.509090909090909,
588
- "grad_norm": 0.002286926843225956,
589
- "learning_rate": 0.00010115638869944238,
590
- "loss": 0.0026,
591
  "step": 830
592
  },
593
  {
594
- "epoch": 1.5272727272727273,
595
- "grad_norm": 0.00438573257997632,
596
- "learning_rate": 9.922906465437359e-05,
597
- "loss": 0.005,
598
  "step": 840
599
  },
600
  {
601
- "epoch": 1.5454545454545454,
602
- "grad_norm": 0.008608890697360039,
603
- "learning_rate": 9.730202698031409e-05,
604
- "loss": 0.0036,
605
  "step": 850
606
  },
607
  {
608
- "epoch": 1.5636363636363635,
609
- "grad_norm": 0.07345500588417053,
610
- "learning_rate": 9.537599149309288e-05,
611
- "loss": 0.0031,
612
  "step": 860
613
  },
614
  {
615
- "epoch": 1.5818181818181818,
616
- "grad_norm": 0.03977720066905022,
617
- "learning_rate": 9.345167363626764e-05,
618
- "loss": 0.0021,
619
  "step": 870
620
  },
621
  {
622
- "epoch": 1.6,
623
- "grad_norm": 0.04508666321635246,
624
- "learning_rate": 9.15297882153664e-05,
625
- "loss": 0.0036,
626
  "step": 880
627
  },
628
  {
629
- "epoch": 1.6181818181818182,
630
- "grad_norm": 0.001087481272406876,
631
- "learning_rate": 8.961104913236644e-05,
632
- "loss": 0.0018,
633
  "step": 890
634
  },
635
  {
636
- "epoch": 1.6363636363636362,
637
- "grad_norm": 0.0006801167037338018,
638
- "learning_rate": 8.769616912050914e-05,
639
- "loss": 0.0023,
640
  "step": 900
641
  },
642
  {
643
- "epoch": 1.6545454545454545,
644
- "grad_norm": 0.0018512771930545568,
645
- "learning_rate": 8.578585947954832e-05,
646
- "loss": 0.0029,
647
  "step": 910
648
  },
649
  {
650
- "epoch": 1.6727272727272728,
651
- "grad_norm": 0.002133868169039488,
652
- "learning_rate": 8.388082981153165e-05,
653
  "loss": 0.003,
654
  "step": 920
655
  },
656
  {
657
- "epoch": 1.690909090909091,
658
- "grad_norm": 0.027992183342576027,
659
- "learning_rate": 8.198178775721249e-05,
660
- "loss": 0.0034,
661
  "step": 930
662
  },
663
  {
664
- "epoch": 1.709090909090909,
665
- "grad_norm": 0.01641463302075863,
666
- "learning_rate": 8.008943873319001e-05,
667
- "loss": 0.0021,
668
  "step": 940
669
  },
670
  {
671
- "epoch": 1.7272727272727273,
672
- "grad_norm": 0.0009675964247435331,
673
- "learning_rate": 7.820448566987582e-05,
674
- "loss": 0.0028,
675
  "step": 950
676
  },
677
  {
678
- "epoch": 1.7454545454545456,
679
- "grad_norm": 0.0009243777021765709,
680
- "learning_rate": 7.632762875038421e-05,
681
- "loss": 0.0038,
682
  "step": 960
683
  },
684
  {
685
- "epoch": 1.7636363636363637,
686
- "grad_norm": 0.0014060670509934425,
687
- "learning_rate": 7.445956515044248e-05,
688
- "loss": 0.0022,
689
  "step": 970
690
  },
691
  {
692
- "epoch": 1.7818181818181817,
693
- "grad_norm": 0.0010076279286295176,
694
- "learning_rate": 7.260098877941856e-05,
695
- "loss": 0.0027,
696
  "step": 980
697
  },
698
  {
699
- "epoch": 1.8,
700
- "grad_norm": 0.03766465559601784,
701
- "learning_rate": 7.075259002256233e-05,
702
- "loss": 0.0051,
703
  "step": 990
704
  },
705
  {
706
- "epoch": 1.8181818181818183,
707
- "grad_norm": 0.0012187482789158821,
708
- "learning_rate": 6.891505548455539e-05,
709
- "loss": 0.0026,
710
  "step": 1000
711
  },
712
  {
713
- "epoch": 1.8363636363636364,
714
- "grad_norm": 0.0012040241854265332,
715
- "learning_rate": 6.708906773446544e-05,
716
- "loss": 0.0022,
717
  "step": 1010
718
  },
719
  {
720
- "epoch": 1.8545454545454545,
721
- "grad_norm": 0.021220913156867027,
722
- "learning_rate": 6.527530505220008e-05,
723
- "loss": 0.003,
724
  "step": 1020
725
  },
726
  {
727
- "epoch": 1.8727272727272726,
728
- "grad_norm": 0.0018169321119785309,
729
- "learning_rate": 6.347444117655306e-05,
730
- "loss": 0.0032,
731
  "step": 1030
732
  },
733
  {
734
- "epoch": 1.8909090909090909,
735
- "grad_norm": 0.0020051824394613504,
736
- "learning_rate": 6.16871450549381e-05,
737
- "loss": 0.0028,
738
  "step": 1040
739
  },
740
  {
741
- "epoch": 1.9090909090909092,
742
- "grad_norm": 0.04109319671988487,
743
- "learning_rate": 5.9914080594902235e-05,
744
- "loss": 0.0033,
745
  "step": 1050
746
  },
747
  {
748
- "epoch": 1.9272727272727272,
749
- "grad_norm": 0.028795786201953888,
750
- "learning_rate": 5.815590641751112e-05,
751
- "loss": 0.0034,
752
  "step": 1060
753
  },
754
  {
755
- "epoch": 1.9454545454545453,
756
- "grad_norm": 0.0015263812383636832,
757
- "learning_rate": 5.641327561269828e-05,
758
- "loss": 0.0025,
759
  "step": 1070
760
  },
761
  {
762
- "epoch": 1.9636363636363636,
763
- "grad_norm": 0.0009399221162311733,
764
- "learning_rate": 5.468683549666884e-05,
765
- "loss": 0.0024,
766
  "step": 1080
767
  },
768
  {
769
- "epoch": 1.981818181818182,
770
- "grad_norm": 0.034878093749284744,
771
- "learning_rate": 5.297722737144802e-05,
772
- "loss": 0.0028,
773
  "step": 1090
774
  },
775
  {
776
- "epoch": 2.0,
777
- "grad_norm": 0.045912522822618484,
778
- "learning_rate": 5.128508628666364e-05,
779
- "loss": 0.0032,
780
  "step": 1100
781
  },
782
  {
783
- "epoch": 2.018181818181818,
784
- "grad_norm": 0.0015094137052074075,
785
- "learning_rate": 4.96110408036509e-05,
786
- "loss": 0.0022,
787
  "step": 1110
788
  },
789
  {
790
- "epoch": 2.036363636363636,
791
- "grad_norm": 0.0008388591813854873,
792
- "learning_rate": 4.7955712761967785e-05,
793
- "loss": 0.0021,
794
  "step": 1120
795
  },
796
  {
797
- "epoch": 2.0545454545454547,
798
- "grad_norm": 0.046201568096876144,
799
- "learning_rate": 4.631971704840685e-05,
800
- "loss": 0.0067,
801
  "step": 1130
802
  },
803
  {
804
- "epoch": 2.0727272727272728,
805
- "grad_norm": 0.09966272115707397,
806
- "learning_rate": 4.470366136858994e-05,
807
- "loss": 0.0034,
808
  "step": 1140
809
  },
810
  {
811
- "epoch": 2.090909090909091,
812
- "grad_norm": 0.0031047267839312553,
813
- "learning_rate": 4.310814602123047e-05,
814
- "loss": 0.003,
815
  "step": 1150
816
  },
817
  {
818
- "epoch": 2.109090909090909,
819
- "grad_norm": 0.04409536346793175,
820
- "learning_rate": 4.153376367514673e-05,
821
  "loss": 0.0031,
822
  "step": 1160
823
  },
824
  {
825
- "epoch": 2.1272727272727274,
826
- "grad_norm": 0.03463288024067879,
827
- "learning_rate": 3.998109914910978e-05,
828
- "loss": 0.0034,
829
  "step": 1170
830
  },
831
  {
832
- "epoch": 2.1454545454545455,
833
- "grad_norm": 0.04570608213543892,
834
- "learning_rate": 3.845072919460717e-05,
835
- "loss": 0.0038,
836
  "step": 1180
837
  },
838
  {
839
- "epoch": 2.1636363636363636,
840
- "grad_norm": 0.0019727866165339947,
841
- "learning_rate": 3.694322228160325e-05,
842
- "loss": 0.0031,
843
  "step": 1190
844
  },
845
  {
846
- "epoch": 2.1818181818181817,
847
- "grad_norm": 0.0016377349384129047,
848
- "learning_rate": 3.545913838737567e-05,
849
- "loss": 0.0027,
850
  "step": 1200
851
  },
852
  {
853
- "epoch": 2.2,
854
- "grad_norm": 0.053264349699020386,
855
- "learning_rate": 3.399902878850693e-05,
856
- "loss": 0.0019,
857
  "step": 1210
858
  },
859
  {
860
- "epoch": 2.2181818181818183,
861
- "grad_norm": 0.03723418712615967,
862
- "learning_rate": 3.256343585610739e-05,
863
- "loss": 0.0024,
864
  "step": 1220
865
  },
866
  {
867
- "epoch": 2.2363636363636363,
868
- "grad_norm": 0.03567889332771301,
869
- "learning_rate": 3.115289285434671e-05,
870
- "loss": 0.0038,
871
  "step": 1230
872
  },
873
  {
874
- "epoch": 2.2545454545454544,
875
- "grad_norm": 0.05600603297352791,
876
- "learning_rate": 2.9767923742367942e-05,
877
- "loss": 0.0025,
878
  "step": 1240
879
  },
880
  {
881
- "epoch": 2.2727272727272725,
882
- "grad_norm": 0.036135077476501465,
883
- "learning_rate": 2.8409042979657995e-05,
884
- "loss": 0.0022,
885
  "step": 1250
886
  },
887
  {
888
- "epoch": 2.290909090909091,
889
- "grad_norm": 0.03194129467010498,
890
- "learning_rate": 2.7076755334947122e-05,
891
- "loss": 0.0027,
892
  "step": 1260
893
  },
894
  {
895
- "epoch": 2.309090909090909,
896
- "grad_norm": 0.0014442523242905736,
897
- "learning_rate": 2.5771555698707804e-05,
898
- "loss": 0.0024,
899
  "step": 1270
900
  },
901
  {
902
- "epoch": 2.327272727272727,
903
- "grad_norm": 0.02508995682001114,
904
- "learning_rate": 2.449392889932315e-05,
905
- "loss": 0.0033,
906
  "step": 1280
907
  },
908
  {
909
- "epoch": 2.3454545454545457,
910
- "grad_norm": 0.056714046746492386,
911
- "learning_rate": 2.324434952299298e-05,
912
- "loss": 0.0026,
913
  "step": 1290
914
  },
915
  {
916
- "epoch": 2.3636363636363638,
917
- "grad_norm": 0.03895105794072151,
918
- "learning_rate": 2.2023281737444435e-05,
919
- "loss": 0.0031,
920
  "step": 1300
921
  },
922
  {
923
- "epoch": 2.381818181818182,
924
- "grad_norm": 0.0020949903409928083,
925
- "learning_rate": 2.0831179119512623e-05,
926
- "loss": 0.0032,
927
  "step": 1310
928
  },
929
  {
930
- "epoch": 2.4,
931
- "grad_norm": 0.04687955975532532,
932
- "learning_rate": 1.966848448665529e-05,
933
- "loss": 0.0037,
934
  "step": 1320
935
  },
936
  {
937
- "epoch": 2.418181818181818,
938
- "grad_norm": 0.05086053907871246,
939
- "learning_rate": 1.853562973246421e-05,
940
- "loss": 0.0038,
941
  "step": 1330
942
  },
943
  {
944
- "epoch": 2.4363636363636365,
945
- "grad_norm": 0.04270637780427933,
946
- "learning_rate": 1.7433035666234442e-05,
947
- "loss": 0.0025,
948
  "step": 1340
949
  },
950
  {
951
- "epoch": 2.4545454545454546,
952
- "grad_norm": 0.03331838920712471,
953
- "learning_rate": 1.6361111856650768e-05,
954
- "loss": 0.0029,
955
  "step": 1350
956
  },
957
  {
958
- "epoch": 2.4727272727272727,
959
- "grad_norm": 0.0021509944926947355,
960
- "learning_rate": 1.5320256479649607e-05,
961
- "loss": 0.002,
962
  "step": 1360
963
  },
964
  {
965
- "epoch": 2.4909090909090907,
966
- "grad_norm": 0.0010530983563512564,
967
- "learning_rate": 1.4310856170513087e-05,
968
- "loss": 0.0032,
969
  "step": 1370
970
  },
971
  {
972
- "epoch": 2.509090909090909,
973
- "grad_norm": 0.044435929507017136,
974
- "learning_rate": 1.333328588024959e-05,
975
- "loss": 0.0035,
976
  "step": 1380
977
  },
978
  {
979
- "epoch": 2.5272727272727273,
980
- "grad_norm": 0.00098559504840523,
981
- "learning_rate": 1.2387908736314923e-05,
982
- "loss": 0.0031,
983
  "step": 1390
984
  },
985
  {
986
- "epoch": 2.5454545454545454,
987
- "grad_norm": 0.11319796741008759,
988
- "learning_rate": 1.1475075907725253e-05,
989
- "loss": 0.0031,
990
  "step": 1400
991
  },
992
  {
993
- "epoch": 2.5636363636363635,
994
- "grad_norm": 0.0011413119500502944,
995
- "learning_rate": 1.0595126474612106e-05,
996
- "loss": 0.0023,
997
  "step": 1410
998
  },
999
  {
1000
- "epoch": 2.581818181818182,
1001
- "grad_norm": 0.0011832962045446038,
1002
- "learning_rate": 9.748387302268036e-06,
1003
- "loss": 0.0023,
1004
  "step": 1420
1005
  },
1006
  {
1007
- "epoch": 2.6,
1008
- "grad_norm": 0.0010551942978054285,
1009
- "learning_rate": 8.935172919729373e-06,
1010
- "loss": 0.0028,
1011
  "step": 1430
1012
  },
1013
  {
1014
- "epoch": 2.618181818181818,
1015
- "grad_norm": 0.0023533699568361044,
1016
- "learning_rate": 8.155785402941684e-06,
1017
- "loss": 0.0029,
1018
  "step": 1440
1019
  },
1020
  {
1021
- "epoch": 2.6363636363636362,
1022
- "grad_norm": 0.0014758601319044828,
1023
- "learning_rate": 7.410514262550749e-06,
1024
- "loss": 0.002,
1025
  "step": 1450
1026
  },
1027
  {
1028
- "epoch": 2.6545454545454543,
1029
- "grad_norm": 0.001560390810482204,
1030
- "learning_rate": 6.6996363363612925e-06,
1031
- "loss": 0.0033,
1032
  "step": 1460
1033
  },
1034
  {
1035
- "epoch": 2.672727272727273,
1036
- "grad_norm": 0.0016578083159402013,
1037
- "learning_rate": 6.023415686502942e-06,
1038
- "loss": 0.0018,
1039
  "step": 1470
1040
  },
1041
  {
1042
- "epoch": 2.690909090909091,
1043
- "grad_norm": 0.002960038837045431,
1044
- "learning_rate": 5.382103501341973e-06,
1045
  "loss": 0.0026,
1046
  "step": 1480
1047
  },
1048
  {
1049
- "epoch": 2.709090909090909,
1050
- "grad_norm": 0.03903718665242195,
1051
- "learning_rate": 4.775938002175129e-06,
1052
- "loss": 0.003,
1053
- "step": 1490
1054
- },
1055
- {
1056
- "epoch": 2.7272727272727275,
1057
- "grad_norm": 0.03972185030579567,
1058
- "learning_rate": 4.205144354740032e-06,
1059
- "loss": 0.0031,
1060
- "step": 1500
1061
- },
1062
- {
1063
- "epoch": 2.7454545454545456,
1064
- "grad_norm": 0.0010361653985455632,
1065
- "learning_rate": 3.6699345855753855e-06,
1066
- "loss": 0.0021,
1067
- "step": 1510
1068
- },
1069
- {
1070
- "epoch": 2.7636363636363637,
1071
- "grad_norm": 0.0898214727640152,
1072
- "learning_rate": 3.170507503261766e-06,
1073
- "loss": 0.0027,
1074
- "step": 1520
1075
- },
1076
- {
1077
- "epoch": 2.7818181818181817,
1078
- "grad_norm": 0.004568600095808506,
1079
- "learning_rate": 2.7070486245722837e-06,
1080
- "loss": 0.0017,
1081
- "step": 1530
1082
- },
1083
- {
1084
- "epoch": 2.8,
1085
- "grad_norm": 0.0027899593114852905,
1086
- "learning_rate": 2.2797301055607513e-06,
1087
- "loss": 0.0025,
1088
- "step": 1540
1089
- },
1090
- {
1091
- "epoch": 2.8181818181818183,
1092
- "grad_norm": 0.0008926861337386072,
1093
- "learning_rate": 1.888710677612693e-06,
1094
- "loss": 0.0014,
1095
- "step": 1550
1096
- },
1097
- {
1098
- "epoch": 2.8363636363636364,
1099
- "grad_norm": 0.0007694001542404294,
1100
- "learning_rate": 1.5341355884831431e-06,
1101
- "loss": 0.0024,
1102
- "step": 1560
1103
- },
1104
- {
1105
- "epoch": 2.8545454545454545,
1106
- "grad_norm": 0.03972714766860008,
1107
- "learning_rate": 1.2161365483429942e-06,
1108
- "loss": 0.0039,
1109
- "step": 1570
1110
- },
1111
- {
1112
- "epoch": 2.8727272727272726,
1113
- "grad_norm": 0.06516830623149872,
1114
- "learning_rate": 9.348316808541091e-07,
1115
- "loss": 0.0031,
1116
- "step": 1580
1117
- },
1118
- {
1119
- "epoch": 2.8909090909090907,
1120
- "grad_norm": 0.044061657041311264,
1121
- "learning_rate": 6.903254792911318e-07,
1122
- "loss": 0.003,
1123
- "step": 1590
1124
- },
1125
- {
1126
- "epoch": 2.909090909090909,
1127
- "grad_norm": 0.0019460883922874928,
1128
- "learning_rate": 4.827087677265585e-07,
1129
  "loss": 0.0028,
1130
- "step": 1600
1131
- },
1132
- {
1133
- "epoch": 2.9272727272727272,
1134
- "grad_norm": 0.03875862807035446,
1135
- "learning_rate": 3.1205866729324687e-07,
1136
- "loss": 0.0024,
1137
- "step": 1610
1138
- },
1139
- {
1140
- "epoch": 2.9454545454545453,
1141
- "grad_norm": 0.002218488836660981,
1142
- "learning_rate": 1.784385675371425e-07,
1143
- "loss": 0.0019,
1144
- "step": 1620
1145
- },
1146
- {
1147
- "epoch": 2.963636363636364,
1148
- "grad_norm": 0.043530985713005066,
1149
- "learning_rate": 8.189810287055899e-08,
1150
- "loss": 0.0028,
1151
- "step": 1630
1152
- },
1153
- {
1154
- "epoch": 2.981818181818182,
1155
- "grad_norm": 0.044648706912994385,
1156
- "learning_rate": 2.247313413507035e-08,
1157
- "loss": 0.0031,
1158
- "step": 1640
1159
  },
1160
  {
1161
  "epoch": 3.0,
1162
- "grad_norm": 0.04245592653751373,
1163
- "learning_rate": 1.8573528069998346e-10,
1164
- "loss": 0.0031,
1165
- "step": 1650
1166
  }
1167
  ],
1168
  "logging_steps": 10,
1169
- "max_steps": 1650,
1170
  "num_input_tokens_seen": 0,
1171
  "num_train_epochs": 3,
1172
  "save_steps": 100,
@@ -1182,7 +1077,7 @@
1182
  "attributes": {}
1183
  }
1184
  },
1185
- "total_flos": 2568571720559616.0,
1186
  "train_batch_size": 2,
1187
  "trial_name": null,
1188
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.02,
14
+ "grad_norm": 6.955190181732178,
15
  "learning_rate": 9e-05,
16
+ "loss": 1.8499,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.04,
21
+ "grad_norm": 8.705879211425781,
22
  "learning_rate": 0.00019,
23
+ "loss": 0.1145,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.06,
28
+ "grad_norm": 1.1119884252548218,
29
+ "learning_rate": 0.00019998175187996916,
30
+ "loss": 0.0391,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.08,
35
+ "grad_norm": 1.489068865776062,
36
+ "learning_rate": 0.0001999186805091047,
37
+ "loss": 0.0224,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.8914257884025574,
43
+ "learning_rate": 0.00019981058901312606,
44
+ "loss": 0.0211,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.17663325369358063,
50
+ "learning_rate": 0.00019965752609456464,
51
+ "loss": 0.0078,
52
  "step": 60
53
  },
54
  {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.03448864817619324,
57
+ "learning_rate": 0.00019945956071862003,
58
+ "loss": 0.0081,
59
  "step": 70
60
  },
61
  {
62
+ "epoch": 0.16,
63
+ "grad_norm": 4.913799285888672,
64
+ "learning_rate": 0.00019921678208208654,
65
+ "loss": 0.0047,
66
  "step": 80
67
  },
68
  {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.6515750885009766,
71
+ "learning_rate": 0.00019892929957316397,
72
+ "loss": 0.0115,
73
  "step": 90
74
  },
75
  {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.25924447178840637,
78
+ "learning_rate": 0.00019859724272217099,
79
+ "loss": 0.0168,
80
  "step": 100
81
  },
82
  {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.21993209421634674,
85
+ "learning_rate": 0.0001982207611431827,
86
+ "loss": 0.0097,
87
  "step": 110
88
  },
89
  {
90
+ "epoch": 0.24,
91
+ "grad_norm": 1.2233567237854004,
92
+ "learning_rate": 0.00019780002446661966,
93
+ "loss": 0.0079,
94
  "step": 120
95
  },
96
  {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.14659936726093292,
99
+ "learning_rate": 0.0001973352222628176,
100
+ "loss": 0.0041,
101
  "step": 130
102
  },
103
  {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.13708041608333588,
106
+ "learning_rate": 0.0001968265639566135,
107
+ "loss": 0.0047,
108
  "step": 140
109
  },
110
  {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.11654426902532578,
113
+ "learning_rate": 0.0001962742787329852,
114
+ "loss": 0.0039,
115
  "step": 150
116
  },
117
  {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.1725812405347824,
120
+ "learning_rate": 0.00019567861543378837,
121
+ "loss": 0.0041,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.002145808655768633,
127
+ "learning_rate": 0.00019503984244563616,
128
+ "loss": 0.0017,
129
  "step": 170
130
  },
131
  {
132
+ "epoch": 0.36,
133
+ "grad_norm": 2.0850086212158203,
134
+ "learning_rate": 0.000194358247578973,
135
+ "loss": 0.0048,
136
  "step": 180
137
  },
138
  {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.07945302873849869,
141
+ "learning_rate": 0.00019363413793839658,
142
+ "loss": 0.0051,
143
  "step": 190
144
  },
145
  {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.231564462184906,
148
+ "learning_rate": 0.00019286783978428624,
149
+ "loss": 0.0078,
150
  "step": 200
151
  },
152
  {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.013585828244686127,
155
+ "learning_rate": 0.00019205969838580094,
156
+ "loss": 0.0042,
157
  "step": 210
158
  },
159
  {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.009324288927018642,
162
+ "learning_rate": 0.00019121007786531178,
163
+ "loss": 0.0056,
164
  "step": 220
165
  },
166
  {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.0634087324142456,
169
+ "learning_rate": 0.00019031936103434044,
170
+ "loss": 0.0046,
171
  "step": 230
172
  },
173
  {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.06287066638469696,
176
+ "learning_rate": 0.00018938794922107675,
177
+ "loss": 0.0028,
178
  "step": 240
179
  },
180
  {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.0060167331248521805,
183
+ "learning_rate": 0.00018841626208955292,
184
+ "loss": 0.0027,
185
  "step": 250
186
  },
187
  {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.0029571247287094593,
190
+ "learning_rate": 0.0001874047374505569,
191
+ "loss": 0.0046,
192
  "step": 260
193
  },
194
  {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.0028790035285055637,
197
+ "learning_rate": 0.00018635383106436855,
198
+ "loss": 0.0028,
199
  "step": 270
200
  },
201
  {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.04535164311528206,
204
+ "learning_rate": 0.00018526401643540922,
205
+ "loss": 0.0046,
206
  "step": 280
207
  },
208
  {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.09899382293224335,
211
+ "learning_rate": 0.0001841357845988957,
212
+ "loss": 0.0037,
213
  "step": 290
214
  },
215
  {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.6568487286567688,
218
+ "learning_rate": 0.00018296964389959578,
219
+ "loss": 0.0101,
220
  "step": 300
221
  },
222
  {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.16598820686340332,
225
+ "learning_rate": 0.00018176611976278441,
226
+ "loss": 0.0237,
227
  "step": 310
228
  },
229
  {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.08521094173192978,
232
+ "learning_rate": 0.00018052575445750419,
233
+ "loss": 0.0058,
234
  "step": 320
235
  },
236
  {
237
+ "epoch": 0.66,
238
+ "grad_norm": 1.5081530809402466,
239
+ "learning_rate": 0.00017924910685223643,
240
+ "loss": 0.0205,
241
  "step": 330
242
  },
243
  {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.35874947905540466,
246
+ "learning_rate": 0.0001779367521630931,
247
+ "loss": 0.1046,
248
  "step": 340
249
  },
250
  {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.229485422372818,
253
+ "learning_rate": 0.00017658928169464312,
254
+ "loss": 0.0129,
255
  "step": 350
256
  },
257
  {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.4974204897880554,
260
+ "learning_rate": 0.00017520730257348946,
261
+ "loss": 0.006,
262
  "step": 360
263
  },
264
  {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.8693443536758423,
267
+ "learning_rate": 0.00017379143747471768,
268
+ "loss": 0.0075,
269
  "step": 370
270
  },
271
  {
272
+ "epoch": 0.76,
273
+ "grad_norm": 1.4635262489318848,
274
+ "learning_rate": 0.00017234232434133883,
275
+ "loss": 0.0077,
276
  "step": 380
277
  },
278
  {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.047906193882226944,
281
+ "learning_rate": 0.00017086061609685257,
282
+ "loss": 0.0094,
283
  "step": 390
284
  },
285
  {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.13906188309192657,
288
+ "learning_rate": 0.00016934698035106133,
289
+ "loss": 0.0096,
290
  "step": 400
291
  },
292
  {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.11411689221858978,
295
+ "learning_rate": 0.00016780209909926676,
296
+ "loss": 0.0092,
297
  "step": 410
298
  },
299
  {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.011227969080209732,
302
+ "learning_rate": 0.00016622666841498463,
303
+ "loss": 0.0084,
304
  "step": 420
305
  },
306
  {
307
+ "epoch": 0.86,
308
+ "grad_norm": 1.5423718690872192,
309
+ "learning_rate": 0.00016462139813631693,
310
+ "loss": 0.0041,
311
  "step": 430
312
  },
313
  {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.347548246383667,
316
+ "learning_rate": 0.00016298701154612147,
317
+ "loss": 0.0046,
318
  "step": 440
319
  },
320
  {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.03537672758102417,
323
+ "learning_rate": 0.00016132424504612406,
324
+ "loss": 0.016,
325
  "step": 450
326
  },
327
  {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.05080743879079819,
330
+ "learning_rate": 0.00015963384782511993,
331
+ "loss": 0.006,
332
  "step": 460
333
  },
334
  {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.00858448538929224,
337
+ "learning_rate": 0.00015791658152141327,
338
+ "loss": 0.0047,
339
  "step": 470
340
  },
341
  {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.19455233216285706,
344
+ "learning_rate": 0.00015617321987964776,
345
+ "loss": 0.0044,
346
  "step": 480
347
  },
348
  {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.08450737595558167,
351
+ "learning_rate": 0.00015440454840218225,
352
  "loss": 0.0039,
353
  "step": 490
354
  },
355
  {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.006426886655390263,
358
+ "learning_rate": 0.00015261136399516873,
359
+ "loss": 0.003,
360
  "step": 500
361
  },
362
  {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.11707904934883118,
365
+ "learning_rate": 0.00015079447460949238,
366
+ "loss": 0.0053,
367
  "step": 510
368
  },
369
  {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.005810865201056004,
372
+ "learning_rate": 0.00014895469887673483,
373
+ "loss": 0.0028,
374
  "step": 520
375
  },
376
  {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.007626334670931101,
379
+ "learning_rate": 0.00014709286574032536,
380
+ "loss": 0.0036,
381
  "step": 530
382
  },
383
  {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.012294676154851913,
386
+ "learning_rate": 0.00014520981408204574,
387
+ "loss": 0.0014,
388
  "step": 540
389
  },
390
  {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.0394788458943367,
393
+ "learning_rate": 0.00014330639234405742,
394
+ "loss": 0.0032,
395
  "step": 550
396
  },
397
  {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.006509778555482626,
400
+ "learning_rate": 0.00014138345814662068,
401
+ "loss": 0.0026,
402
  "step": 560
403
  },
404
  {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.12478422373533249,
407
+ "learning_rate": 0.0001394418779016789,
408
+ "loss": 0.0072,
409
  "step": 570
410
  },
411
  {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.03920350596308708,
414
+ "learning_rate": 0.00013748252642248115,
415
+ "loss": 0.0034,
416
  "step": 580
417
  },
418
  {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.06001276522874832,
421
+ "learning_rate": 0.00013550628652941985,
422
+ "loss": 0.003,
423
  "step": 590
424
  },
425
  {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.016110895201563835,
428
+ "learning_rate": 0.0001335140486522604,
429
+ "loss": 0.0028,
430
  "step": 600
431
  },
432
  {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.012471065856516361,
435
+ "learning_rate": 0.00013150671042894228,
436
+ "loss": 0.003,
437
  "step": 610
438
  },
439
  {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.03785092383623123,
442
+ "learning_rate": 0.00012948517630113245,
443
+ "loss": 0.0036,
444
  "step": 620
445
  },
446
  {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.06877022981643677,
449
+ "learning_rate": 0.0001274503571067131,
450
+ "loss": 0.0032,
451
  "step": 630
452
  },
453
  {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.14263379573822021,
456
+ "learning_rate": 0.00012540316966938795,
457
+ "loss": 0.0038,
458
  "step": 640
459
  },
460
  {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.011557388119399548,
463
+ "learning_rate": 0.00012334453638559057,
464
+ "loss": 0.0034,
465
  "step": 650
466
  },
467
  {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.05995357036590576,
470
+ "learning_rate": 0.00012127538480888283,
471
+ "loss": 0.0034,
472
  "step": 660
473
  },
474
  {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.06527257710695267,
477
+ "learning_rate": 0.00011919664723202906,
478
+ "loss": 0.0028,
479
  "step": 670
480
  },
481
  {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.12198451906442642,
484
+ "learning_rate": 0.00011710926026693525,
485
+ "loss": 0.003,
486
  "step": 680
487
  },
488
  {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.005978007335215807,
491
+ "learning_rate": 0.00011501416442264184,
492
  "loss": 0.0035,
493
  "step": 690
494
  },
495
  {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.01665549911558628,
498
+ "learning_rate": 0.00011291230368156087,
499
+ "loss": 0.0065,
500
  "step": 700
501
  },
502
  {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.24193237721920013,
505
+ "learning_rate": 0.00011080462507414806,
506
+ "loss": 0.0066,
507
  "step": 710
508
  },
509
  {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.04041582718491554,
512
+ "learning_rate": 0.00010869207825220147,
513
+ "loss": 0.0025,
514
  "step": 720
515
  },
516
  {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.0575215183198452,
519
+ "learning_rate": 0.0001065756150609792,
520
+ "loss": 0.0066,
521
  "step": 730
522
  },
523
  {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.06933045387268066,
526
+ "learning_rate": 0.00010445618911032853,
527
+ "loss": 0.0031,
528
  "step": 740
529
  },
530
  {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.08046724647283554,
533
+ "learning_rate": 0.00010233475534502042,
534
+ "loss": 0.0035,
535
  "step": 750
536
  },
537
  {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.12715864181518555,
540
+ "learning_rate": 0.00010021226961448209,
541
+ "loss": 0.0031,
542
  "step": 760
543
  },
544
  {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.05433020740747452,
547
+ "learning_rate": 9.808968824212234e-05,
548
+ "loss": 0.0016,
549
  "step": 770
550
  },
551
  {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.005047277547419071,
554
+ "learning_rate": 9.596796759444293e-05,
555
+ "loss": 0.0023,
556
  "step": 780
557
  },
558
  {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.06136553734540939,
561
+ "learning_rate": 9.384806365013113e-05,
562
+ "loss": 0.0037,
563
  "step": 790
564
  },
565
  {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.07095402479171753,
568
+ "learning_rate": 9.173093156932623e-05,
569
+ "loss": 0.0032,
570
  "step": 800
571
  },
572
  {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.007647352758795023,
575
+ "learning_rate": 8.961752526325565e-05,
576
+ "loss": 0.0031,
577
  "step": 810
578
  },
579
  {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.10909626632928848,
582
+ "learning_rate": 8.750879696443321e-05,
583
+ "loss": 0.0044,
584
  "step": 820
585
  },
586
  {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.005036857444792986,
589
+ "learning_rate": 8.540569679761391e-05,
590
+ "loss": 0.0017,
591
  "step": 830
592
  },
593
  {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.04456391558051109,
596
+ "learning_rate": 8.330917235169867e-05,
597
+ "loss": 0.0028,
598
  "step": 840
599
  },
600
  {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.026686355471611023,
603
+ "learning_rate": 8.12201682527811e-05,
604
+ "loss": 0.0009,
605
  "step": 850
606
  },
607
  {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.5429267883300781,
610
+ "learning_rate": 7.913962573852996e-05,
611
+ "loss": 0.0042,
612
  "step": 860
613
  },
614
  {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.09535812586545944,
617
+ "learning_rate": 7.706848223409759e-05,
618
+ "loss": 0.003,
619
  "step": 870
620
  },
621
  {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.004211663268506527,
624
+ "learning_rate": 7.500767092974647e-05,
625
+ "loss": 0.0034,
626
  "step": 880
627
  },
628
  {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.007590270135551691,
631
+ "learning_rate": 7.295812036038407e-05,
632
+ "loss": 0.0023,
633
  "step": 890
634
  },
635
  {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.09613944590091705,
638
+ "learning_rate": 7.092075398719502e-05,
639
+ "loss": 0.0029,
640
  "step": 900
641
  },
642
  {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.04118485003709793,
645
+ "learning_rate": 6.889648978155909e-05,
646
+ "loss": 0.0018,
647
  "step": 910
648
  },
649
  {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.03493022918701172,
652
+ "learning_rate": 6.688623981144339e-05,
653
  "loss": 0.003,
654
  "step": 920
655
  },
656
  {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.05923795700073242,
659
+ "learning_rate": 6.489090983045379e-05,
660
+ "loss": 0.0023,
661
  "step": 930
662
  },
663
  {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.004041542299091816,
666
+ "learning_rate": 6.291139886973169e-05,
667
+ "loss": 0.0019,
668
  "step": 940
669
  },
670
  {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.058228544890880585,
673
+ "learning_rate": 6.094859883287977e-05,
674
+ "loss": 0.0027,
675
  "step": 950
676
  },
677
  {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.0040581803768873215,
680
+ "learning_rate": 5.90033940940989e-05,
681
+ "loss": 0.0013,
682
  "step": 960
683
  },
684
  {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.0043375324457883835,
687
+ "learning_rate": 5.7076661099717986e-05,
688
+ "loss": 0.0025,
689
  "step": 970
690
  },
691
  {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.0034573073498904705,
694
+ "learning_rate": 5.5169267973295294e-05,
695
+ "loss": 0.002,
696
  "step": 980
697
  },
698
  {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.03163556754589081,
701
+ "learning_rate": 5.3282074124470284e-05,
702
+ "loss": 0.0021,
703
  "step": 990
704
  },
705
  {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.027402976527810097,
708
+ "learning_rate": 5.141592986174151e-05,
709
+ "loss": 0.0032,
710
  "step": 1000
711
  },
712
  {
713
+ "epoch": 2.02,
714
+ "grad_norm": 0.03729909658432007,
715
+ "learning_rate": 4.957167600934474e-05,
716
+ "loss": 0.0027,
717
  "step": 1010
718
  },
719
  {
720
+ "epoch": 2.04,
721
+ "grad_norm": 0.06650708615779877,
722
+ "learning_rate": 4.7750143528405126e-05,
723
+ "loss": 0.0031,
724
  "step": 1020
725
  },
726
  {
727
+ "epoch": 2.06,
728
+ "grad_norm": 0.05407334491610527,
729
+ "learning_rate": 4.595215314253285e-05,
730
+ "loss": 0.0024,
731
  "step": 1030
732
  },
733
  {
734
+ "epoch": 2.08,
735
+ "grad_norm": 0.06878269463777542,
736
+ "learning_rate": 4.417851496803164e-05,
737
+ "loss": 0.0031,
738
  "step": 1040
739
  },
740
  {
741
+ "epoch": 2.1,
742
+ "grad_norm": 0.11301030218601227,
743
+ "learning_rate": 4.243002814888656e-05,
744
+ "loss": 0.0029,
745
  "step": 1050
746
  },
747
  {
748
+ "epoch": 2.12,
749
+ "grad_norm": 0.0333506241440773,
750
+ "learning_rate": 4.0707480496695514e-05,
751
+ "loss": 0.0022,
752
  "step": 1060
753
  },
754
  {
755
+ "epoch": 2.14,
756
+ "grad_norm": 0.07614877074956894,
757
+ "learning_rate": 3.9011648135706966e-05,
758
+ "loss": 0.0022,
759
  "step": 1070
760
  },
761
  {
762
+ "epoch": 2.16,
763
+ "grad_norm": 0.0026465761475265026,
764
+ "learning_rate": 3.734329515312349e-05,
765
+ "loss": 0.0023,
766
  "step": 1080
767
  },
768
  {
769
+ "epoch": 2.18,
770
+ "grad_norm": 0.003543775761500001,
771
+ "learning_rate": 3.570317325482847e-05,
772
+ "loss": 0.0017,
773
  "step": 1090
774
  },
775
  {
776
+ "epoch": 2.2,
777
+ "grad_norm": 0.058600034564733505,
778
+ "learning_rate": 3.409202142669213e-05,
779
+ "loss": 0.0035,
780
  "step": 1100
781
  },
782
  {
783
+ "epoch": 2.22,
784
+ "grad_norm": 0.04403044655919075,
785
+ "learning_rate": 3.251056560160821e-05,
786
+ "loss": 0.003,
787
  "step": 1110
788
  },
789
  {
790
+ "epoch": 2.24,
791
+ "grad_norm": 0.12272568047046661,
792
+ "learning_rate": 3.095951833241213e-05,
793
+ "loss": 0.003,
794
  "step": 1120
795
  },
796
  {
797
+ "epoch": 2.26,
798
+ "grad_norm": 0.003978225402534008,
799
+ "learning_rate": 2.9439578470827755e-05,
800
+ "loss": 0.0026,
801
  "step": 1130
802
  },
803
  {
804
+ "epoch": 2.2800000000000002,
805
+ "grad_norm": 0.0032513344194740057,
806
+ "learning_rate": 2.7951430852587268e-05,
807
+ "loss": 0.0035,
808
  "step": 1140
809
  },
810
  {
811
+ "epoch": 2.3,
812
+ "grad_norm": 0.0030118192080408335,
813
+ "learning_rate": 2.649574598886665e-05,
814
+ "loss": 0.0021,
815
  "step": 1150
816
  },
817
  {
818
+ "epoch": 2.32,
819
+ "grad_norm": 0.050590354949235916,
820
+ "learning_rate": 2.507317976417475e-05,
821
  "loss": 0.0031,
822
  "step": 1160
823
  },
824
  {
825
+ "epoch": 2.34,
826
+ "grad_norm": 0.024627618491649628,
827
+ "learning_rate": 2.3684373140833016e-05,
828
+ "loss": 0.0029,
829
  "step": 1170
830
  },
831
  {
832
+ "epoch": 2.36,
833
+ "grad_norm": 0.05296558514237404,
834
+ "learning_rate": 2.2329951870178655e-05,
835
+ "loss": 0.0041,
836
  "step": 1180
837
  },
838
  {
839
+ "epoch": 2.38,
840
+ "grad_norm": 0.00513384910300374,
841
+ "learning_rate": 2.1010526210621406e-05,
842
+ "loss": 0.002,
843
  "step": 1190
844
  },
845
  {
846
+ "epoch": 2.4,
847
+ "grad_norm": 0.008860021829605103,
848
+ "learning_rate": 1.9726690652680578e-05,
849
+ "loss": 0.0019,
850
  "step": 1200
851
  },
852
  {
853
+ "epoch": 2.42,
854
+ "grad_norm": 0.0037229766603559256,
855
+ "learning_rate": 1.8479023651127115e-05,
856
+ "loss": 0.0023,
857
  "step": 1210
858
  },
859
  {
860
+ "epoch": 2.44,
861
+ "grad_norm": 0.004817347973585129,
862
+ "learning_rate": 1.726808736435046e-05,
863
+ "loss": 0.0015,
864
  "step": 1220
865
  },
866
  {
867
+ "epoch": 2.46,
868
+ "grad_norm": 0.05881744623184204,
869
+ "learning_rate": 1.6094427401068224e-05,
870
+ "loss": 0.0028,
871
  "step": 1230
872
  },
873
  {
874
+ "epoch": 2.48,
875
+ "grad_norm": 0.00562276691198349,
876
+ "learning_rate": 1.4958572574492501e-05,
877
+ "loss": 0.0023,
878
  "step": 1240
879
  },
880
  {
881
+ "epoch": 2.5,
882
+ "grad_norm": 0.0030358799267560244,
883
+ "learning_rate": 1.38610346640637e-05,
884
+ "loss": 0.0018,
885
  "step": 1250
886
  },
887
  {
888
+ "epoch": 2.52,
889
+ "grad_norm": 0.003225122345611453,
890
+ "learning_rate": 1.2802308184859502e-05,
891
+ "loss": 0.0013,
892
  "step": 1260
893
  },
894
  {
895
+ "epoch": 2.54,
896
+ "grad_norm": 0.054522693157196045,
897
+ "learning_rate": 1.1782870164782111e-05,
898
+ "loss": 0.0019,
899
  "step": 1270
900
  },
901
  {
902
+ "epoch": 2.56,
903
+ "grad_norm": 0.041042644530534744,
904
+ "learning_rate": 1.0803179929624973e-05,
905
+ "loss": 0.0021,
906
  "step": 1280
907
  },
908
  {
909
+ "epoch": 2.58,
910
+ "grad_norm": 0.00768931582570076,
911
+ "learning_rate": 9.863678896115559e-06,
912
+ "loss": 0.0022,
913
  "step": 1290
914
  },
915
  {
916
+ "epoch": 2.6,
917
+ "grad_norm": 0.0033417909871786833,
918
+ "learning_rate": 8.964790373027132e-06,
919
+ "loss": 0.0017,
920
  "step": 1300
921
  },
922
  {
923
+ "epoch": 2.62,
924
+ "grad_norm": 0.04406670480966568,
925
+ "learning_rate": 8.106919370449572e-06,
926
+ "loss": 0.0027,
927
  "step": 1310
928
  },
929
  {
930
+ "epoch": 2.64,
931
+ "grad_norm": 0.043826743960380554,
932
+ "learning_rate": 7.290452417304916e-06,
933
+ "loss": 0.0024,
934
  "step": 1320
935
  },
936
  {
937
+ "epoch": 2.66,
938
+ "grad_norm": 0.08563917130231857,
939
+ "learning_rate": 6.515757387189902e-06,
940
+ "loss": 0.0017,
941
  "step": 1330
942
  },
943
  {
944
+ "epoch": 2.68,
945
+ "grad_norm": 0.003863748861476779,
946
+ "learning_rate": 5.783183332624098e-06,
947
+ "loss": 0.0028,
948
  "step": 1340
949
  },
950
  {
951
+ "epoch": 2.7,
952
+ "grad_norm": 0.00339197413995862,
953
+ "learning_rate": 5.093060327778043e-06,
954
+ "loss": 0.0028,
955
  "step": 1350
956
  },
957
  {
958
+ "epoch": 2.7199999999999998,
959
+ "grad_norm": 0.11382139474153519,
960
+ "learning_rate": 4.445699319752539e-06,
961
+ "loss": 0.0034,
962
  "step": 1360
963
  },
964
  {
965
+ "epoch": 2.74,
966
+ "grad_norm": 0.0028220233507454395,
967
+ "learning_rate": 3.841391988476018e-06,
968
+ "loss": 0.0018,
969
  "step": 1370
970
  },
971
  {
972
+ "epoch": 2.76,
973
+ "grad_norm": 0.058400608599185944,
974
+ "learning_rate": 3.2804106152828582e-06,
975
+ "loss": 0.0027,
976
  "step": 1380
977
  },
978
  {
979
+ "epoch": 2.7800000000000002,
980
+ "grad_norm": 0.110735222697258,
981
+ "learning_rate": 2.7630079602323442e-06,
982
+ "loss": 0.0026,
983
  "step": 1390
984
  },
985
  {
986
+ "epoch": 2.8,
987
+ "grad_norm": 0.0034612929448485374,
988
+ "learning_rate": 2.289417148223094e-06,
989
+ "loss": 0.0021,
990
  "step": 1400
991
  },
992
  {
993
+ "epoch": 2.82,
994
+ "grad_norm": 0.04820137843489647,
995
+ "learning_rate": 1.8598515639545622e-06,
996
+ "loss": 0.0021,
997
  "step": 1410
998
  },
999
  {
1000
+ "epoch": 2.84,
1001
+ "grad_norm": 0.08901210874319077,
1002
+ "learning_rate": 1.4745047557827796e-06,
1003
+ "loss": 0.0017,
1004
  "step": 1420
1005
  },
1006
  {
1007
+ "epoch": 2.86,
1008
+ "grad_norm": 0.052542563527822495,
1009
+ "learning_rate": 1.133550348513701e-06,
1010
+ "loss": 0.0017,
1011
  "step": 1430
1012
  },
1013
  {
1014
+ "epoch": 2.88,
1015
+ "grad_norm": 0.0500030443072319,
1016
+ "learning_rate": 8.371419651735268e-07,
1017
+ "loss": 0.0018,
1018
  "step": 1440
1019
  },
1020
  {
1021
+ "epoch": 2.9,
1022
+ "grad_norm": 0.05235590413212776,
1023
+ "learning_rate": 5.854131577911259e-07,
1024
+ "loss": 0.0025,
1025
  "step": 1450
1026
  },
1027
  {
1028
+ "epoch": 2.92,
1029
+ "grad_norm": 0.050380345433950424,
1030
+ "learning_rate": 3.7847734722378234e-07,
1031
+ "loss": 0.0021,
1032
  "step": 1460
1033
  },
1034
  {
1035
+ "epoch": 2.94,
1036
+ "grad_norm": 0.04665536433458328,
1037
+ "learning_rate": 2.1642777205346242e-07,
1038
+ "loss": 0.0021,
1039
  "step": 1470
1040
  },
1041
  {
1042
+ "epoch": 2.96,
1043
+ "grad_norm": 0.08257055282592773,
1044
+ "learning_rate": 9.933744657651956e-08,
1045
  "loss": 0.0026,
1046
  "step": 1480
1047
  },
1048
  {
1049
+ "epoch": 2.98,
1050
+ "grad_norm": 0.04862203821539879,
1051
+ "learning_rate": 2.7259127905776562e-08,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1052
  "loss": 0.0028,
1053
+ "step": 1490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1054
  },
1055
  {
1056
  "epoch": 3.0,
1057
+ "grad_norm": 0.05159607157111168,
1058
+ "learning_rate": 2.252921999401636e-10,
1059
+ "loss": 0.0029,
1060
+ "step": 1500
1061
  }
1062
  ],
1063
  "logging_steps": 10,
1064
+ "max_steps": 1500,
1065
  "num_input_tokens_seen": 0,
1066
  "num_train_epochs": 3,
1067
  "save_steps": 100,
 
1077
  "attributes": {}
1078
  }
1079
  },
1080
+ "total_flos": 2975560292163072.0,
1081
  "train_batch_size": 2,
1082
  "trial_name": null,
1083
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abc526e0d2d37a3bd6eaa08f79cb4da62578b543694bda368ffceb921df35e95
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bad76c15491571a2b0d904f43b98d3ec2521c42abf54bc17c579eedfa7b8332a
3
  size 6353