TahalliAnas commited on
Commit
a4a26db
·
verified ·
1 Parent(s): 3301a1e

first upload from kaggle

Browse files
checkpoint-1182/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cdac161c65a4b68dcf9ba007ba6f52dfacfd42d2554a1490e37b5030ba7dca9
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841f002f0348e373f18727bdd1a79c542747fbd6590d9f8f3330abfc78d84b62
3
  size 891644712
checkpoint-1182/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d191bf2d3cb3a44e8c60e2a9e3184e23cb1ebff95585e6e7013497a4b922c487
3
  size 1783444794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:061e75092c0432414c945a6b9b5a4e47e663d5dcf11823594c8e79a2ab4cbd2b
3
  size 1783444794
checkpoint-1182/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:835d84d51ea3b187a26482459bf2fecaf3fcb0bbe41bff40605191987b4fecf0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cfc0127b3222c5a66e96a5ce914651ccebce1c55569e992240485bf1c037abd
3
  size 14244
checkpoint-1182/trainer_state.json CHANGED
@@ -11,876 +11,876 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.050761421319796954,
14
- "grad_norm": 2.660665988922119,
15
  "learning_rate": 4.961928934010153e-05,
16
- "loss": 6.8339,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.10152284263959391,
21
- "grad_norm": 1.533976674079895,
22
  "learning_rate": 4.919627749576988e-05,
23
- "loss": 3.429,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.15228426395939088,
28
- "grad_norm": 1.221410870552063,
29
  "learning_rate": 4.877326565143824e-05,
30
- "loss": 2.5135,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.20304568527918782,
35
- "grad_norm": 1.1820554733276367,
36
  "learning_rate": 4.83502538071066e-05,
37
- "loss": 1.8569,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.25380710659898476,
42
- "grad_norm": 1.0971927642822266,
43
  "learning_rate": 4.792724196277496e-05,
44
- "loss": 1.4046,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.30456852791878175,
49
- "grad_norm": 1.0558425188064575,
50
  "learning_rate": 4.750423011844332e-05,
51
- "loss": 1.0788,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.3553299492385787,
56
- "grad_norm": 0.9039331674575806,
57
  "learning_rate": 4.7081218274111674e-05,
58
- "loss": 0.8565,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.40609137055837563,
63
- "grad_norm": 0.9374755620956421,
64
  "learning_rate": 4.665820642978004e-05,
65
- "loss": 0.6552,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.45685279187817257,
70
- "grad_norm": 0.924466073513031,
71
  "learning_rate": 4.6235194585448395e-05,
72
- "loss": 0.5584,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5076142131979695,
77
- "grad_norm": 0.7860057950019836,
78
  "learning_rate": 4.5812182741116755e-05,
79
- "loss": 0.4996,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.5583756345177665,
84
- "grad_norm": 0.6625454425811768,
85
  "learning_rate": 4.538917089678511e-05,
86
- "loss": 0.4086,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.6091370558375635,
91
- "grad_norm": 0.6682830452919006,
92
  "learning_rate": 4.496615905245347e-05,
93
- "loss": 0.3645,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.6598984771573604,
98
- "grad_norm": 0.6779201626777649,
99
  "learning_rate": 4.454314720812183e-05,
100
- "loss": 0.334,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.7106598984771574,
105
- "grad_norm": 0.5805050730705261,
106
  "learning_rate": 4.412013536379019e-05,
107
- "loss": 0.2835,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.7614213197969543,
112
- "grad_norm": 0.6666281819343567,
113
  "learning_rate": 4.369712351945855e-05,
114
- "loss": 0.2735,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.8121827411167513,
119
- "grad_norm": 0.5769293308258057,
120
  "learning_rate": 4.32741116751269e-05,
121
- "loss": 0.2341,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.8629441624365483,
126
- "grad_norm": 0.5870254635810852,
127
  "learning_rate": 4.285109983079527e-05,
128
- "loss": 0.2358,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.9137055837563451,
133
- "grad_norm": 0.5556586384773254,
134
  "learning_rate": 4.242808798646362e-05,
135
- "loss": 0.2271,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.9644670050761421,
140
- "grad_norm": 0.5297289490699768,
141
  "learning_rate": 4.200507614213198e-05,
142
- "loss": 0.2185,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.0,
147
- "eval_loss": 0.11792419105768204,
148
- "eval_runtime": 8.2824,
149
- "eval_samples_per_second": 42.258,
150
- "eval_steps_per_second": 2.656,
151
  "step": 197
152
  },
153
  {
154
  "epoch": 1.015228426395939,
155
- "grad_norm": 0.5531201958656311,
156
  "learning_rate": 4.1582064297800336e-05,
157
- "loss": 0.1987,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.0659898477157361,
162
- "grad_norm": 0.5119745135307312,
163
  "learning_rate": 4.1159052453468696e-05,
164
- "loss": 0.1823,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.116751269035533,
169
- "grad_norm": 0.4964381158351898,
170
  "learning_rate": 4.073604060913706e-05,
171
- "loss": 0.1783,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.16751269035533,
176
- "grad_norm": 0.49353310465812683,
177
  "learning_rate": 4.0313028764805416e-05,
178
- "loss": 0.1678,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.218274111675127,
183
- "grad_norm": 0.44556906819343567,
184
  "learning_rate": 3.9890016920473777e-05,
185
- "loss": 0.1704,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2690355329949239,
190
- "grad_norm": 0.4472508132457733,
191
  "learning_rate": 3.946700507614213e-05,
192
- "loss": 0.1661,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.3197969543147208,
197
- "grad_norm": 0.4651392996311188,
198
  "learning_rate": 3.90439932318105e-05,
199
- "loss": 0.1597,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.3705583756345177,
204
- "grad_norm": 0.43394404649734497,
205
  "learning_rate": 3.862098138747885e-05,
206
- "loss": 0.1557,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.4213197969543148,
211
- "grad_norm": 0.40666043758392334,
212
  "learning_rate": 3.819796954314721e-05,
213
- "loss": 0.1532,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.4720812182741116,
218
- "grad_norm": 0.35698583722114563,
219
  "learning_rate": 3.7774957698815564e-05,
220
- "loss": 0.1423,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.5228426395939088,
225
- "grad_norm": 0.38902562856674194,
226
  "learning_rate": 3.735194585448393e-05,
227
- "loss": 0.1446,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.5736040609137056,
232
- "grad_norm": 0.43386396765708923,
233
  "learning_rate": 3.692893401015229e-05,
234
- "loss": 0.1451,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.6243654822335025,
239
- "grad_norm": 0.44021275639533997,
240
  "learning_rate": 3.6505922165820644e-05,
241
- "loss": 0.1423,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.6751269035532994,
246
- "grad_norm": 0.35672277212142944,
247
  "learning_rate": 3.6082910321489004e-05,
248
- "loss": 0.1355,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.7258883248730963,
253
- "grad_norm": 0.4129314124584198,
254
  "learning_rate": 3.565989847715736e-05,
255
- "loss": 0.137,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.7766497461928934,
260
- "grad_norm": 0.33145004510879517,
261
  "learning_rate": 3.5236886632825724e-05,
262
- "loss": 0.1325,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.8274111675126905,
267
- "grad_norm": 0.3832840323448181,
268
  "learning_rate": 3.481387478849408e-05,
269
- "loss": 0.1299,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.8781725888324874,
274
- "grad_norm": 0.4180988073348999,
275
  "learning_rate": 3.439086294416244e-05,
276
- "loss": 0.1344,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.9289340101522843,
281
- "grad_norm": 0.35554927587509155,
282
  "learning_rate": 3.396785109983079e-05,
283
- "loss": 0.1207,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.9796954314720812,
288
- "grad_norm": 0.3630838096141815,
289
  "learning_rate": 3.354483925549916e-05,
290
- "loss": 0.1226,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 2.0,
295
- "eval_loss": 0.08632376790046692,
296
- "eval_runtime": 8.3339,
297
- "eval_samples_per_second": 41.997,
298
- "eval_steps_per_second": 2.64,
299
  "step": 394
300
  },
301
  {
302
  "epoch": 2.030456852791878,
303
- "grad_norm": 0.32654863595962524,
304
  "learning_rate": 3.312182741116752e-05,
305
- "loss": 0.1246,
306
  "step": 400
307
  },
308
  {
309
  "epoch": 2.081218274111675,
310
- "grad_norm": 0.37500277161598206,
311
  "learning_rate": 3.269881556683587e-05,
312
- "loss": 0.122,
313
  "step": 410
314
  },
315
  {
316
  "epoch": 2.1319796954314723,
317
- "grad_norm": 0.31452369689941406,
318
  "learning_rate": 3.227580372250423e-05,
319
- "loss": 0.1195,
320
  "step": 420
321
  },
322
  {
323
  "epoch": 2.182741116751269,
324
- "grad_norm": 0.29448235034942627,
325
  "learning_rate": 3.185279187817259e-05,
326
- "loss": 0.1186,
327
  "step": 430
328
  },
329
  {
330
  "epoch": 2.233502538071066,
331
- "grad_norm": 0.2858859598636627,
332
  "learning_rate": 3.142978003384095e-05,
333
- "loss": 0.1169,
334
  "step": 440
335
  },
336
  {
337
  "epoch": 2.284263959390863,
338
- "grad_norm": 0.32630932331085205,
339
  "learning_rate": 3.1006768189509306e-05,
340
- "loss": 0.1161,
341
  "step": 450
342
  },
343
  {
344
  "epoch": 2.33502538071066,
345
- "grad_norm": 0.31745606660842896,
346
  "learning_rate": 3.0583756345177666e-05,
347
- "loss": 0.1175,
348
  "step": 460
349
  },
350
  {
351
  "epoch": 2.3857868020304567,
352
- "grad_norm": 0.359678715467453,
353
  "learning_rate": 3.016074450084603e-05,
354
- "loss": 0.112,
355
  "step": 470
356
  },
357
  {
358
  "epoch": 2.436548223350254,
359
- "grad_norm": 0.3830055594444275,
360
  "learning_rate": 2.9737732656514383e-05,
361
- "loss": 0.1154,
362
  "step": 480
363
  },
364
  {
365
  "epoch": 2.487309644670051,
366
- "grad_norm": 0.35126161575317383,
367
  "learning_rate": 2.9314720812182743e-05,
368
- "loss": 0.1159,
369
  "step": 490
370
  },
371
  {
372
  "epoch": 2.5380710659898478,
373
- "grad_norm": 0.32588812708854675,
374
  "learning_rate": 2.88917089678511e-05,
375
- "loss": 0.112,
376
  "step": 500
377
  },
378
  {
379
  "epoch": 2.5888324873096447,
380
- "grad_norm": 0.349881649017334,
381
  "learning_rate": 2.846869712351946e-05,
382
- "loss": 0.1088,
383
  "step": 510
384
  },
385
  {
386
  "epoch": 2.6395939086294415,
387
- "grad_norm": 0.287137508392334,
388
  "learning_rate": 2.8045685279187816e-05,
389
- "loss": 0.1077,
390
  "step": 520
391
  },
392
  {
393
  "epoch": 2.6903553299492384,
394
- "grad_norm": 0.32511353492736816,
395
  "learning_rate": 2.7622673434856176e-05,
396
- "loss": 0.1111,
397
  "step": 530
398
  },
399
  {
400
  "epoch": 2.7411167512690353,
401
- "grad_norm": 0.2147095799446106,
402
  "learning_rate": 2.7199661590524533e-05,
403
- "loss": 0.1083,
404
  "step": 540
405
  },
406
  {
407
  "epoch": 2.7918781725888326,
408
- "grad_norm": 0.30077311396598816,
409
  "learning_rate": 2.6776649746192893e-05,
410
- "loss": 0.1124,
411
  "step": 550
412
  },
413
  {
414
  "epoch": 2.8426395939086295,
415
- "grad_norm": 0.2887895703315735,
416
  "learning_rate": 2.6353637901861257e-05,
417
- "loss": 0.1078,
418
  "step": 560
419
  },
420
  {
421
  "epoch": 2.8934010152284264,
422
- "grad_norm": 0.28950536251068115,
423
  "learning_rate": 2.593062605752961e-05,
424
- "loss": 0.106,
425
  "step": 570
426
  },
427
  {
428
  "epoch": 2.9441624365482233,
429
- "grad_norm": 0.3079921007156372,
430
  "learning_rate": 2.5507614213197974e-05,
431
- "loss": 0.1096,
432
  "step": 580
433
  },
434
  {
435
  "epoch": 2.99492385786802,
436
- "grad_norm": 0.23043136298656464,
437
  "learning_rate": 2.5084602368866327e-05,
438
- "loss": 0.1034,
439
  "step": 590
440
  },
441
  {
442
  "epoch": 3.0,
443
- "eval_loss": 0.0816434845328331,
444
- "eval_runtime": 8.6929,
445
- "eval_samples_per_second": 40.263,
446
- "eval_steps_per_second": 2.531,
447
  "step": 591
448
  },
449
  {
450
  "epoch": 3.045685279187817,
451
- "grad_norm": 0.2885898947715759,
452
  "learning_rate": 2.466159052453469e-05,
453
- "loss": 0.1027,
454
  "step": 600
455
  },
456
  {
457
  "epoch": 3.0964467005076144,
458
- "grad_norm": 0.3026517927646637,
459
  "learning_rate": 2.4238578680203047e-05,
460
- "loss": 0.1033,
461
  "step": 610
462
  },
463
  {
464
  "epoch": 3.1472081218274113,
465
- "grad_norm": 0.3037049472332001,
466
  "learning_rate": 2.3815566835871404e-05,
467
- "loss": 0.1056,
468
  "step": 620
469
  },
470
  {
471
  "epoch": 3.197969543147208,
472
- "grad_norm": 0.2880629897117615,
473
  "learning_rate": 2.3392554991539764e-05,
474
- "loss": 0.102,
475
  "step": 630
476
  },
477
  {
478
  "epoch": 3.248730964467005,
479
- "grad_norm": 0.23089496791362762,
480
  "learning_rate": 2.296954314720812e-05,
481
- "loss": 0.1057,
482
  "step": 640
483
  },
484
  {
485
  "epoch": 3.299492385786802,
486
- "grad_norm": 0.30170634388923645,
487
  "learning_rate": 2.254653130287648e-05,
488
- "loss": 0.1064,
489
  "step": 650
490
  },
491
  {
492
  "epoch": 3.350253807106599,
493
- "grad_norm": 0.21023182570934296,
494
  "learning_rate": 2.2123519458544838e-05,
495
- "loss": 0.1054,
496
  "step": 660
497
  },
498
  {
499
  "epoch": 3.401015228426396,
500
- "grad_norm": 0.33246785402297974,
501
  "learning_rate": 2.17005076142132e-05,
502
- "loss": 0.1053,
503
  "step": 670
504
  },
505
  {
506
  "epoch": 3.451776649746193,
507
- "grad_norm": 0.2597705125808716,
508
  "learning_rate": 2.1277495769881558e-05,
509
- "loss": 0.1041,
510
  "step": 680
511
  },
512
  {
513
  "epoch": 3.50253807106599,
514
- "grad_norm": 0.28724995255470276,
515
  "learning_rate": 2.085448392554992e-05,
516
- "loss": 0.1002,
517
  "step": 690
518
  },
519
  {
520
  "epoch": 3.553299492385787,
521
- "grad_norm": 0.23450523614883423,
522
  "learning_rate": 2.0431472081218275e-05,
523
- "loss": 0.1011,
524
  "step": 700
525
  },
526
  {
527
  "epoch": 3.6040609137055837,
528
- "grad_norm": 0.3018731474876404,
529
  "learning_rate": 2.0008460236886635e-05,
530
- "loss": 0.0987,
531
  "step": 710
532
  },
533
  {
534
  "epoch": 3.6548223350253806,
535
- "grad_norm": 0.2904147207736969,
536
  "learning_rate": 1.9585448392554992e-05,
537
- "loss": 0.1031,
538
  "step": 720
539
  },
540
  {
541
  "epoch": 3.7055837563451774,
542
- "grad_norm": 0.25796592235565186,
543
  "learning_rate": 1.916243654822335e-05,
544
- "loss": 0.1032,
545
  "step": 730
546
  },
547
  {
548
  "epoch": 3.7563451776649748,
549
- "grad_norm": 0.2625073194503784,
550
  "learning_rate": 1.873942470389171e-05,
551
- "loss": 0.0975,
552
  "step": 740
553
  },
554
  {
555
  "epoch": 3.8071065989847717,
556
- "grad_norm": 0.2199327051639557,
557
  "learning_rate": 1.831641285956007e-05,
558
- "loss": 0.1015,
559
  "step": 750
560
  },
561
  {
562
  "epoch": 3.8578680203045685,
563
- "grad_norm": 0.24463722109794617,
564
  "learning_rate": 1.789340101522843e-05,
565
- "loss": 0.103,
566
  "step": 760
567
  },
568
  {
569
  "epoch": 3.9086294416243654,
570
- "grad_norm": 0.25082287192344666,
571
  "learning_rate": 1.7470389170896786e-05,
572
- "loss": 0.0978,
573
  "step": 770
574
  },
575
  {
576
  "epoch": 3.9593908629441623,
577
- "grad_norm": 0.24737493693828583,
578
  "learning_rate": 1.7047377326565146e-05,
579
- "loss": 0.1014,
580
  "step": 780
581
  },
582
  {
583
  "epoch": 4.0,
584
- "eval_loss": 0.0787789523601532,
585
- "eval_runtime": 8.3689,
586
- "eval_samples_per_second": 41.822,
587
- "eval_steps_per_second": 2.629,
588
  "step": 788
589
  },
590
  {
591
  "epoch": 4.01015228426396,
592
- "grad_norm": 0.35116952657699585,
593
  "learning_rate": 1.6624365482233503e-05,
594
- "loss": 0.1018,
595
  "step": 790
596
  },
597
  {
598
  "epoch": 4.060913705583756,
599
- "grad_norm": 0.258379191160202,
600
  "learning_rate": 1.6201353637901863e-05,
601
- "loss": 0.1005,
602
  "step": 800
603
  },
604
  {
605
  "epoch": 4.111675126903553,
606
- "grad_norm": 0.25503265857696533,
607
  "learning_rate": 1.577834179357022e-05,
608
- "loss": 0.0992,
609
  "step": 810
610
  },
611
  {
612
  "epoch": 4.16243654822335,
613
- "grad_norm": 0.23493419587612152,
614
  "learning_rate": 1.535532994923858e-05,
615
- "loss": 0.1006,
616
  "step": 820
617
  },
618
  {
619
  "epoch": 4.213197969543147,
620
- "grad_norm": 0.23928825557231903,
621
  "learning_rate": 1.493231810490694e-05,
622
- "loss": 0.1013,
623
  "step": 830
624
  },
625
  {
626
  "epoch": 4.2639593908629445,
627
- "grad_norm": 0.28521081805229187,
628
  "learning_rate": 1.4509306260575298e-05,
629
- "loss": 0.1008,
630
  "step": 840
631
  },
632
  {
633
  "epoch": 4.314720812182741,
634
- "grad_norm": 0.2244846224784851,
635
  "learning_rate": 1.4086294416243657e-05,
636
- "loss": 0.0978,
637
  "step": 850
638
  },
639
  {
640
  "epoch": 4.365482233502538,
641
- "grad_norm": 0.32869383692741394,
642
  "learning_rate": 1.3663282571912014e-05,
643
- "loss": 0.0969,
644
  "step": 860
645
  },
646
  {
647
  "epoch": 4.416243654822335,
648
- "grad_norm": 0.23633809387683868,
649
  "learning_rate": 1.3240270727580372e-05,
650
- "loss": 0.1013,
651
  "step": 870
652
  },
653
  {
654
  "epoch": 4.467005076142132,
655
- "grad_norm": 0.3110709488391876,
656
  "learning_rate": 1.281725888324873e-05,
657
- "loss": 0.0937,
658
  "step": 880
659
  },
660
  {
661
  "epoch": 4.517766497461929,
662
- "grad_norm": 0.22026929259300232,
663
  "learning_rate": 1.239424703891709e-05,
664
- "loss": 0.0982,
665
  "step": 890
666
  },
667
  {
668
  "epoch": 4.568527918781726,
669
- "grad_norm": 0.23756134510040283,
670
  "learning_rate": 1.1971235194585449e-05,
671
- "loss": 0.0954,
672
  "step": 900
673
  },
674
  {
675
  "epoch": 4.619289340101523,
676
- "grad_norm": 0.24070966243743896,
677
  "learning_rate": 1.1548223350253808e-05,
678
- "loss": 0.1013,
679
  "step": 910
680
  },
681
  {
682
  "epoch": 4.67005076142132,
683
- "grad_norm": 0.22478370368480682,
684
  "learning_rate": 1.1125211505922166e-05,
685
- "loss": 0.0928,
686
  "step": 920
687
  },
688
  {
689
  "epoch": 4.720812182741117,
690
- "grad_norm": 0.19010470807552338,
691
  "learning_rate": 1.0702199661590526e-05,
692
- "loss": 0.0986,
693
  "step": 930
694
  },
695
  {
696
  "epoch": 4.771573604060913,
697
- "grad_norm": 0.26357847452163696,
698
  "learning_rate": 1.0279187817258885e-05,
699
- "loss": 0.0898,
700
  "step": 940
701
  },
702
  {
703
  "epoch": 4.822335025380711,
704
- "grad_norm": 0.2300788015127182,
705
  "learning_rate": 9.856175972927243e-06,
706
- "loss": 0.0958,
707
  "step": 950
708
  },
709
  {
710
  "epoch": 4.873096446700508,
711
- "grad_norm": 0.25872883200645447,
712
  "learning_rate": 9.433164128595601e-06,
713
- "loss": 0.0957,
714
  "step": 960
715
  },
716
  {
717
  "epoch": 4.9238578680203045,
718
- "grad_norm": 0.21223337948322296,
719
  "learning_rate": 9.01015228426396e-06,
720
- "loss": 0.0992,
721
  "step": 970
722
  },
723
  {
724
  "epoch": 4.974619289340102,
725
- "grad_norm": 0.2555055320262909,
726
  "learning_rate": 8.587140439932318e-06,
727
- "loss": 0.0953,
728
  "step": 980
729
  },
730
  {
731
  "epoch": 5.0,
732
- "eval_loss": 0.07795918732881546,
733
- "eval_runtime": 8.314,
734
- "eval_samples_per_second": 42.098,
735
- "eval_steps_per_second": 2.646,
736
  "step": 985
737
  },
738
  {
739
  "epoch": 5.025380710659898,
740
- "grad_norm": 0.22976677119731903,
741
  "learning_rate": 8.164128595600677e-06,
742
- "loss": 0.0938,
743
  "step": 990
744
  },
745
  {
746
  "epoch": 5.0761421319796955,
747
- "grad_norm": 0.21435701847076416,
748
  "learning_rate": 7.741116751269035e-06,
749
- "loss": 0.0938,
750
  "step": 1000
751
  },
752
  {
753
  "epoch": 5.126903553299492,
754
- "grad_norm": 0.22609326243400574,
755
  "learning_rate": 7.318104906937395e-06,
756
- "loss": 0.0977,
757
  "step": 1010
758
  },
759
  {
760
  "epoch": 5.177664974619289,
761
- "grad_norm": 0.19491638243198395,
762
  "learning_rate": 6.895093062605754e-06,
763
- "loss": 0.0958,
764
  "step": 1020
765
  },
766
  {
767
  "epoch": 5.228426395939087,
768
- "grad_norm": 0.23073458671569824,
769
  "learning_rate": 6.472081218274112e-06,
770
- "loss": 0.0877,
771
  "step": 1030
772
  },
773
  {
774
  "epoch": 5.279187817258883,
775
- "grad_norm": 0.24976614117622375,
776
  "learning_rate": 6.049069373942471e-06,
777
- "loss": 0.0939,
778
  "step": 1040
779
  },
780
  {
781
  "epoch": 5.32994923857868,
782
- "grad_norm": 0.18394580483436584,
783
  "learning_rate": 5.626057529610829e-06,
784
- "loss": 0.0907,
785
  "step": 1050
786
  },
787
  {
788
  "epoch": 5.380710659898477,
789
- "grad_norm": 0.24421803653240204,
790
  "learning_rate": 5.203045685279188e-06,
791
- "loss": 0.0956,
792
  "step": 1060
793
  },
794
  {
795
  "epoch": 5.431472081218274,
796
- "grad_norm": 0.24214482307434082,
797
  "learning_rate": 4.780033840947547e-06,
798
- "loss": 0.0989,
799
  "step": 1070
800
  },
801
  {
802
  "epoch": 5.482233502538071,
803
- "grad_norm": 0.24704593420028687,
804
  "learning_rate": 4.357021996615906e-06,
805
- "loss": 0.0963,
806
  "step": 1080
807
  },
808
  {
809
  "epoch": 5.532994923857868,
810
- "grad_norm": 0.2930368483066559,
811
  "learning_rate": 3.934010152284264e-06,
812
- "loss": 0.0919,
813
  "step": 1090
814
  },
815
  {
816
  "epoch": 5.583756345177665,
817
- "grad_norm": 0.17989520728588104,
818
  "learning_rate": 3.5109983079526226e-06,
819
- "loss": 0.0965,
820
  "step": 1100
821
  },
822
  {
823
  "epoch": 5.634517766497462,
824
- "grad_norm": 0.19147488474845886,
825
  "learning_rate": 3.0879864636209815e-06,
826
- "loss": 0.0923,
827
  "step": 1110
828
  },
829
  {
830
  "epoch": 5.685279187817259,
831
- "grad_norm": 0.2020110785961151,
832
  "learning_rate": 2.6649746192893404e-06,
833
- "loss": 0.0968,
834
  "step": 1120
835
  },
836
  {
837
  "epoch": 5.7360406091370555,
838
- "grad_norm": 0.2407018393278122,
839
  "learning_rate": 2.241962774957699e-06,
840
- "loss": 0.0956,
841
  "step": 1130
842
  },
843
  {
844
  "epoch": 5.786802030456853,
845
- "grad_norm": 0.3431318998336792,
846
  "learning_rate": 1.8189509306260577e-06,
847
- "loss": 0.0911,
848
  "step": 1140
849
  },
850
  {
851
  "epoch": 5.837563451776649,
852
- "grad_norm": 0.26111626625061035,
853
  "learning_rate": 1.3959390862944163e-06,
854
- "loss": 0.0944,
855
  "step": 1150
856
  },
857
  {
858
  "epoch": 5.888324873096447,
859
- "grad_norm": 0.2500187158584595,
860
  "learning_rate": 9.72927241962775e-07,
861
- "loss": 0.0945,
862
  "step": 1160
863
  },
864
  {
865
  "epoch": 5.939086294416244,
866
- "grad_norm": 0.22575554251670837,
867
  "learning_rate": 5.499153976311337e-07,
868
- "loss": 0.0983,
869
  "step": 1170
870
  },
871
  {
872
  "epoch": 5.98984771573604,
873
- "grad_norm": 0.19752255082130432,
874
  "learning_rate": 1.2690355329949238e-07,
875
- "loss": 0.0967,
876
  "step": 1180
877
  },
878
  {
879
  "epoch": 6.0,
880
- "eval_loss": 0.07735521346330643,
881
- "eval_runtime": 8.3449,
882
- "eval_samples_per_second": 41.942,
883
- "eval_steps_per_second": 2.636,
884
  "step": 1182
885
  }
886
  ],
@@ -901,7 +901,7 @@
901
  "attributes": {}
902
  }
903
  },
904
- "total_flos": 2580518529792000.0,
905
  "train_batch_size": 16,
906
  "trial_name": null,
907
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.050761421319796954,
14
+ "grad_norm": 1.9596132040023804,
15
  "learning_rate": 4.961928934010153e-05,
16
+ "loss": 3.0434,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.10152284263959391,
21
+ "grad_norm": 1.4552912712097168,
22
  "learning_rate": 4.919627749576988e-05,
23
+ "loss": 1.6984,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.15228426395939088,
28
+ "grad_norm": 1.4020640850067139,
29
  "learning_rate": 4.877326565143824e-05,
30
+ "loss": 1.0141,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.20304568527918782,
35
+ "grad_norm": 1.3040558099746704,
36
  "learning_rate": 4.83502538071066e-05,
37
+ "loss": 0.6325,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.25380710659898476,
42
+ "grad_norm": 0.9710696935653687,
43
  "learning_rate": 4.792724196277496e-05,
44
+ "loss": 0.3562,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.30456852791878175,
49
+ "grad_norm": 0.9529483914375305,
50
  "learning_rate": 4.750423011844332e-05,
51
+ "loss": 0.2681,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.3553299492385787,
56
+ "grad_norm": 0.9297605156898499,
57
  "learning_rate": 4.7081218274111674e-05,
58
+ "loss": 0.1867,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.40609137055837563,
63
+ "grad_norm": 0.6723515391349792,
64
  "learning_rate": 4.665820642978004e-05,
65
+ "loss": 0.1557,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.45685279187817257,
70
+ "grad_norm": 0.5906422734260559,
71
  "learning_rate": 4.6235194585448395e-05,
72
+ "loss": 0.1332,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5076142131979695,
77
+ "grad_norm": 0.562096357345581,
78
  "learning_rate": 4.5812182741116755e-05,
79
+ "loss": 0.1113,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.5583756345177665,
84
+ "grad_norm": 0.6856290102005005,
85
  "learning_rate": 4.538917089678511e-05,
86
+ "loss": 0.0982,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.6091370558375635,
91
+ "grad_norm": 0.3303697407245636,
92
  "learning_rate": 4.496615905245347e-05,
93
+ "loss": 0.0794,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.6598984771573604,
98
+ "grad_norm": 0.5941248536109924,
99
  "learning_rate": 4.454314720812183e-05,
100
+ "loss": 0.0799,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.7106598984771574,
105
+ "grad_norm": 0.46145302057266235,
106
  "learning_rate": 4.412013536379019e-05,
107
+ "loss": 0.0728,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.7614213197969543,
112
+ "grad_norm": 0.5075628161430359,
113
  "learning_rate": 4.369712351945855e-05,
114
+ "loss": 0.0705,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.8121827411167513,
119
+ "grad_norm": 0.2965494394302368,
120
  "learning_rate": 4.32741116751269e-05,
121
+ "loss": 0.0634,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.8629441624365483,
126
+ "grad_norm": 0.3922906219959259,
127
  "learning_rate": 4.285109983079527e-05,
128
+ "loss": 0.0599,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.9137055837563451,
133
+ "grad_norm": 0.3413899540901184,
134
  "learning_rate": 4.242808798646362e-05,
135
+ "loss": 0.0529,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.9644670050761421,
140
+ "grad_norm": 0.37600159645080566,
141
  "learning_rate": 4.200507614213198e-05,
142
+ "loss": 0.0548,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.0,
147
+ "eval_loss": 0.03613473102450371,
148
+ "eval_runtime": 6.8779,
149
+ "eval_samples_per_second": 50.887,
150
+ "eval_steps_per_second": 3.199,
151
  "step": 197
152
  },
153
  {
154
  "epoch": 1.015228426395939,
155
+ "grad_norm": 0.4854377508163452,
156
  "learning_rate": 4.1582064297800336e-05,
157
+ "loss": 0.058,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.0659898477157361,
162
+ "grad_norm": 0.39907264709472656,
163
  "learning_rate": 4.1159052453468696e-05,
164
+ "loss": 0.0609,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.116751269035533,
169
+ "grad_norm": 0.24890871345996857,
170
  "learning_rate": 4.073604060913706e-05,
171
+ "loss": 0.0473,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.16751269035533,
176
+ "grad_norm": 0.4353676736354828,
177
  "learning_rate": 4.0313028764805416e-05,
178
+ "loss": 0.0513,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.218274111675127,
183
+ "grad_norm": 0.38258448243141174,
184
  "learning_rate": 3.9890016920473777e-05,
185
+ "loss": 0.0503,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2690355329949239,
190
+ "grad_norm": 0.3302125334739685,
191
  "learning_rate": 3.946700507614213e-05,
192
+ "loss": 0.0478,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.3197969543147208,
197
+ "grad_norm": 0.401644229888916,
198
  "learning_rate": 3.90439932318105e-05,
199
+ "loss": 0.0457,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.3705583756345177,
204
+ "grad_norm": 0.31225109100341797,
205
  "learning_rate": 3.862098138747885e-05,
206
+ "loss": 0.0452,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.4213197969543148,
211
+ "grad_norm": 0.30924656987190247,
212
  "learning_rate": 3.819796954314721e-05,
213
+ "loss": 0.0428,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.4720812182741116,
218
+ "grad_norm": 0.543154239654541,
219
  "learning_rate": 3.7774957698815564e-05,
220
+ "loss": 0.048,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.5228426395939088,
225
+ "grad_norm": 0.2982091009616852,
226
  "learning_rate": 3.735194585448393e-05,
227
+ "loss": 0.0427,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.5736040609137056,
232
+ "grad_norm": 0.3622360825538635,
233
  "learning_rate": 3.692893401015229e-05,
234
+ "loss": 0.0431,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.6243654822335025,
239
+ "grad_norm": 0.2379499226808548,
240
  "learning_rate": 3.6505922165820644e-05,
241
+ "loss": 0.0408,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.6751269035532994,
246
+ "grad_norm": 0.2724953889846802,
247
  "learning_rate": 3.6082910321489004e-05,
248
+ "loss": 0.0419,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.7258883248730963,
253
+ "grad_norm": 0.21542227268218994,
254
  "learning_rate": 3.565989847715736e-05,
255
+ "loss": 0.0439,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.7766497461928934,
260
+ "grad_norm": 0.24891333281993866,
261
  "learning_rate": 3.5236886632825724e-05,
262
+ "loss": 0.0393,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.8274111675126905,
267
+ "grad_norm": 0.18472662568092346,
268
  "learning_rate": 3.481387478849408e-05,
269
+ "loss": 0.0372,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.8781725888324874,
274
+ "grad_norm": 0.1834375113248825,
275
  "learning_rate": 3.439086294416244e-05,
276
+ "loss": 0.0383,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.9289340101522843,
281
+ "grad_norm": 0.26916465163230896,
282
  "learning_rate": 3.396785109983079e-05,
283
+ "loss": 0.0419,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.9796954314720812,
288
+ "grad_norm": 0.2296602427959442,
289
  "learning_rate": 3.354483925549916e-05,
290
+ "loss": 0.0391,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 2.0,
295
+ "eval_loss": 0.029596656560897827,
296
+ "eval_runtime": 6.9466,
297
+ "eval_samples_per_second": 50.384,
298
+ "eval_steps_per_second": 3.167,
299
  "step": 394
300
  },
301
  {
302
  "epoch": 2.030456852791878,
303
+ "grad_norm": 0.22394953668117523,
304
  "learning_rate": 3.312182741116752e-05,
305
+ "loss": 0.0368,
306
  "step": 400
307
  },
308
  {
309
  "epoch": 2.081218274111675,
310
+ "grad_norm": 0.24742868542671204,
311
  "learning_rate": 3.269881556683587e-05,
312
+ "loss": 0.0417,
313
  "step": 410
314
  },
315
  {
316
  "epoch": 2.1319796954314723,
317
+ "grad_norm": 0.17821934819221497,
318
  "learning_rate": 3.227580372250423e-05,
319
+ "loss": 0.039,
320
  "step": 420
321
  },
322
  {
323
  "epoch": 2.182741116751269,
324
+ "grad_norm": 0.17562909424304962,
325
  "learning_rate": 3.185279187817259e-05,
326
+ "loss": 0.036,
327
  "step": 430
328
  },
329
  {
330
  "epoch": 2.233502538071066,
331
+ "grad_norm": 0.24495473504066467,
332
  "learning_rate": 3.142978003384095e-05,
333
+ "loss": 0.038,
334
  "step": 440
335
  },
336
  {
337
  "epoch": 2.284263959390863,
338
+ "grad_norm": 0.21984700858592987,
339
  "learning_rate": 3.1006768189509306e-05,
340
+ "loss": 0.0364,
341
  "step": 450
342
  },
343
  {
344
  "epoch": 2.33502538071066,
345
+ "grad_norm": 0.263046532869339,
346
  "learning_rate": 3.0583756345177666e-05,
347
+ "loss": 0.0393,
348
  "step": 460
349
  },
350
  {
351
  "epoch": 2.3857868020304567,
352
+ "grad_norm": 0.494204044342041,
353
  "learning_rate": 3.016074450084603e-05,
354
+ "loss": 0.0342,
355
  "step": 470
356
  },
357
  {
358
  "epoch": 2.436548223350254,
359
+ "grad_norm": 0.24457719922065735,
360
  "learning_rate": 2.9737732656514383e-05,
361
+ "loss": 0.0371,
362
  "step": 480
363
  },
364
  {
365
  "epoch": 2.487309644670051,
366
+ "grad_norm": 0.2866905629634857,
367
  "learning_rate": 2.9314720812182743e-05,
368
+ "loss": 0.0375,
369
  "step": 490
370
  },
371
  {
372
  "epoch": 2.5380710659898478,
373
+ "grad_norm": 0.1922035664319992,
374
  "learning_rate": 2.88917089678511e-05,
375
+ "loss": 0.0339,
376
  "step": 500
377
  },
378
  {
379
  "epoch": 2.5888324873096447,
380
+ "grad_norm": 0.2251596301794052,
381
  "learning_rate": 2.846869712351946e-05,
382
+ "loss": 0.0316,
383
  "step": 510
384
  },
385
  {
386
  "epoch": 2.6395939086294415,
387
+ "grad_norm": 0.19956769049167633,
388
  "learning_rate": 2.8045685279187816e-05,
389
+ "loss": 0.0367,
390
  "step": 520
391
  },
392
  {
393
  "epoch": 2.6903553299492384,
394
+ "grad_norm": 0.23161649703979492,
395
  "learning_rate": 2.7622673434856176e-05,
396
+ "loss": 0.0335,
397
  "step": 530
398
  },
399
  {
400
  "epoch": 2.7411167512690353,
401
+ "grad_norm": 0.2735691964626312,
402
  "learning_rate": 2.7199661590524533e-05,
403
+ "loss": 0.0367,
404
  "step": 540
405
  },
406
  {
407
  "epoch": 2.7918781725888326,
408
+ "grad_norm": 0.3856474757194519,
409
  "learning_rate": 2.6776649746192893e-05,
410
+ "loss": 0.0362,
411
  "step": 550
412
  },
413
  {
414
  "epoch": 2.8426395939086295,
415
+ "grad_norm": 0.24519683420658112,
416
  "learning_rate": 2.6353637901861257e-05,
417
+ "loss": 0.031,
418
  "step": 560
419
  },
420
  {
421
  "epoch": 2.8934010152284264,
422
+ "grad_norm": 0.12949654459953308,
423
  "learning_rate": 2.593062605752961e-05,
424
+ "loss": 0.032,
425
  "step": 570
426
  },
427
  {
428
  "epoch": 2.9441624365482233,
429
+ "grad_norm": 0.1476690173149109,
430
  "learning_rate": 2.5507614213197974e-05,
431
+ "loss": 0.0351,
432
  "step": 580
433
  },
434
  {
435
  "epoch": 2.99492385786802,
436
+ "grad_norm": 0.24033169448375702,
437
  "learning_rate": 2.5084602368866327e-05,
438
+ "loss": 0.0337,
439
  "step": 590
440
  },
441
  {
442
  "epoch": 3.0,
443
+ "eval_loss": 0.02790662832558155,
444
+ "eval_runtime": 6.9258,
445
+ "eval_samples_per_second": 50.536,
446
+ "eval_steps_per_second": 3.177,
447
  "step": 591
448
  },
449
  {
450
  "epoch": 3.045685279187817,
451
+ "grad_norm": 0.25604188442230225,
452
  "learning_rate": 2.466159052453469e-05,
453
+ "loss": 0.0339,
454
  "step": 600
455
  },
456
  {
457
  "epoch": 3.0964467005076144,
458
+ "grad_norm": 0.15198302268981934,
459
  "learning_rate": 2.4238578680203047e-05,
460
+ "loss": 0.0316,
461
  "step": 610
462
  },
463
  {
464
  "epoch": 3.1472081218274113,
465
+ "grad_norm": 0.18943068385124207,
466
  "learning_rate": 2.3815566835871404e-05,
467
+ "loss": 0.0302,
468
  "step": 620
469
  },
470
  {
471
  "epoch": 3.197969543147208,
472
+ "grad_norm": 0.23807291686534882,
473
  "learning_rate": 2.3392554991539764e-05,
474
+ "loss": 0.0338,
475
  "step": 630
476
  },
477
  {
478
  "epoch": 3.248730964467005,
479
+ "grad_norm": 0.2615777552127838,
480
  "learning_rate": 2.296954314720812e-05,
481
+ "loss": 0.0291,
482
  "step": 640
483
  },
484
  {
485
  "epoch": 3.299492385786802,
486
+ "grad_norm": 0.20456817746162415,
487
  "learning_rate": 2.254653130287648e-05,
488
+ "loss": 0.0331,
489
  "step": 650
490
  },
491
  {
492
  "epoch": 3.350253807106599,
493
+ "grad_norm": 0.29629555344581604,
494
  "learning_rate": 2.2123519458544838e-05,
495
+ "loss": 0.0324,
496
  "step": 660
497
  },
498
  {
499
  "epoch": 3.401015228426396,
500
+ "grad_norm": 0.19070571660995483,
501
  "learning_rate": 2.17005076142132e-05,
502
+ "loss": 0.0312,
503
  "step": 670
504
  },
505
  {
506
  "epoch": 3.451776649746193,
507
+ "grad_norm": 0.17927491664886475,
508
  "learning_rate": 2.1277495769881558e-05,
509
+ "loss": 0.0331,
510
  "step": 680
511
  },
512
  {
513
  "epoch": 3.50253807106599,
514
+ "grad_norm": 0.16211186349391937,
515
  "learning_rate": 2.085448392554992e-05,
516
+ "loss": 0.0324,
517
  "step": 690
518
  },
519
  {
520
  "epoch": 3.553299492385787,
521
+ "grad_norm": 0.13928809762001038,
522
  "learning_rate": 2.0431472081218275e-05,
523
+ "loss": 0.0315,
524
  "step": 700
525
  },
526
  {
527
  "epoch": 3.6040609137055837,
528
+ "grad_norm": 0.2813867926597595,
529
  "learning_rate": 2.0008460236886635e-05,
530
+ "loss": 0.03,
531
  "step": 710
532
  },
533
  {
534
  "epoch": 3.6548223350253806,
535
+ "grad_norm": 0.2689349353313446,
536
  "learning_rate": 1.9585448392554992e-05,
537
+ "loss": 0.0333,
538
  "step": 720
539
  },
540
  {
541
  "epoch": 3.7055837563451774,
542
+ "grad_norm": 0.2879869043827057,
543
  "learning_rate": 1.916243654822335e-05,
544
+ "loss": 0.035,
545
  "step": 730
546
  },
547
  {
548
  "epoch": 3.7563451776649748,
549
+ "grad_norm": 0.1638893336057663,
550
  "learning_rate": 1.873942470389171e-05,
551
+ "loss": 0.0331,
552
  "step": 740
553
  },
554
  {
555
  "epoch": 3.8071065989847717,
556
+ "grad_norm": 0.08905433863401413,
557
  "learning_rate": 1.831641285956007e-05,
558
+ "loss": 0.0291,
559
  "step": 750
560
  },
561
  {
562
  "epoch": 3.8578680203045685,
563
+ "grad_norm": 0.2221483290195465,
564
  "learning_rate": 1.789340101522843e-05,
565
+ "loss": 0.0334,
566
  "step": 760
567
  },
568
  {
569
  "epoch": 3.9086294416243654,
570
+ "grad_norm": 0.16910187900066376,
571
  "learning_rate": 1.7470389170896786e-05,
572
+ "loss": 0.0331,
573
  "step": 770
574
  },
575
  {
576
  "epoch": 3.9593908629441623,
577
+ "grad_norm": 0.20653125643730164,
578
  "learning_rate": 1.7047377326565146e-05,
579
+ "loss": 0.0328,
580
  "step": 780
581
  },
582
  {
583
  "epoch": 4.0,
584
+ "eval_loss": 0.02677118219435215,
585
+ "eval_runtime": 6.9163,
586
+ "eval_samples_per_second": 50.605,
587
+ "eval_steps_per_second": 3.181,
588
  "step": 788
589
  },
590
  {
591
  "epoch": 4.01015228426396,
592
+ "grad_norm": 0.5460578203201294,
593
  "learning_rate": 1.6624365482233503e-05,
594
+ "loss": 0.0317,
595
  "step": 790
596
  },
597
  {
598
  "epoch": 4.060913705583756,
599
+ "grad_norm": 0.1273794323205948,
600
  "learning_rate": 1.6201353637901863e-05,
601
+ "loss": 0.0324,
602
  "step": 800
603
  },
604
  {
605
  "epoch": 4.111675126903553,
606
+ "grad_norm": 0.2069994956254959,
607
  "learning_rate": 1.577834179357022e-05,
608
+ "loss": 0.0328,
609
  "step": 810
610
  },
611
  {
612
  "epoch": 4.16243654822335,
613
+ "grad_norm": 0.13560791313648224,
614
  "learning_rate": 1.535532994923858e-05,
615
+ "loss": 0.0289,
616
  "step": 820
617
  },
618
  {
619
  "epoch": 4.213197969543147,
620
+ "grad_norm": 0.13835355639457703,
621
  "learning_rate": 1.493231810490694e-05,
622
+ "loss": 0.0285,
623
  "step": 830
624
  },
625
  {
626
  "epoch": 4.2639593908629445,
627
+ "grad_norm": 0.17146103084087372,
628
  "learning_rate": 1.4509306260575298e-05,
629
+ "loss": 0.0328,
630
  "step": 840
631
  },
632
  {
633
  "epoch": 4.314720812182741,
634
+ "grad_norm": 0.25955504179000854,
635
  "learning_rate": 1.4086294416243657e-05,
636
+ "loss": 0.0295,
637
  "step": 850
638
  },
639
  {
640
  "epoch": 4.365482233502538,
641
+ "grad_norm": 0.24718697369098663,
642
  "learning_rate": 1.3663282571912014e-05,
643
+ "loss": 0.0307,
644
  "step": 860
645
  },
646
  {
647
  "epoch": 4.416243654822335,
648
+ "grad_norm": 0.12164635211229324,
649
  "learning_rate": 1.3240270727580372e-05,
650
+ "loss": 0.0287,
651
  "step": 870
652
  },
653
  {
654
  "epoch": 4.467005076142132,
655
+ "grad_norm": 0.17382808029651642,
656
  "learning_rate": 1.281725888324873e-05,
657
+ "loss": 0.0472,
658
  "step": 880
659
  },
660
  {
661
  "epoch": 4.517766497461929,
662
+ "grad_norm": 0.17402203381061554,
663
  "learning_rate": 1.239424703891709e-05,
664
+ "loss": 0.0343,
665
  "step": 890
666
  },
667
  {
668
  "epoch": 4.568527918781726,
669
+ "grad_norm": 0.17245104908943176,
670
  "learning_rate": 1.1971235194585449e-05,
671
+ "loss": 0.0318,
672
  "step": 900
673
  },
674
  {
675
  "epoch": 4.619289340101523,
676
+ "grad_norm": 0.1376132220029831,
677
  "learning_rate": 1.1548223350253808e-05,
678
+ "loss": 0.0319,
679
  "step": 910
680
  },
681
  {
682
  "epoch": 4.67005076142132,
683
+ "grad_norm": 0.17528069019317627,
684
  "learning_rate": 1.1125211505922166e-05,
685
+ "loss": 0.0302,
686
  "step": 920
687
  },
688
  {
689
  "epoch": 4.720812182741117,
690
+ "grad_norm": 0.2443544864654541,
691
  "learning_rate": 1.0702199661590526e-05,
692
+ "loss": 0.0295,
693
  "step": 930
694
  },
695
  {
696
  "epoch": 4.771573604060913,
697
+ "grad_norm": 0.21152476966381073,
698
  "learning_rate": 1.0279187817258885e-05,
699
+ "loss": 0.0331,
700
  "step": 940
701
  },
702
  {
703
  "epoch": 4.822335025380711,
704
+ "grad_norm": 0.13216163218021393,
705
  "learning_rate": 9.856175972927243e-06,
706
+ "loss": 0.0283,
707
  "step": 950
708
  },
709
  {
710
  "epoch": 4.873096446700508,
711
+ "grad_norm": 0.1937057226896286,
712
  "learning_rate": 9.433164128595601e-06,
713
+ "loss": 0.0285,
714
  "step": 960
715
  },
716
  {
717
  "epoch": 4.9238578680203045,
718
+ "grad_norm": 0.1196654811501503,
719
  "learning_rate": 9.01015228426396e-06,
720
+ "loss": 0.0299,
721
  "step": 970
722
  },
723
  {
724
  "epoch": 4.974619289340102,
725
+ "grad_norm": 0.14108304679393768,
726
  "learning_rate": 8.587140439932318e-06,
727
+ "loss": 0.0326,
728
  "step": 980
729
  },
730
  {
731
  "epoch": 5.0,
732
+ "eval_loss": 0.026293369010090828,
733
+ "eval_runtime": 7.2804,
734
+ "eval_samples_per_second": 48.074,
735
+ "eval_steps_per_second": 3.022,
736
  "step": 985
737
  },
738
  {
739
  "epoch": 5.025380710659898,
740
+ "grad_norm": 0.11325781047344208,
741
  "learning_rate": 8.164128595600677e-06,
742
+ "loss": 0.0303,
743
  "step": 990
744
  },
745
  {
746
  "epoch": 5.0761421319796955,
747
+ "grad_norm": 0.1742030531167984,
748
  "learning_rate": 7.741116751269035e-06,
749
+ "loss": 0.029,
750
  "step": 1000
751
  },
752
  {
753
  "epoch": 5.126903553299492,
754
+ "grad_norm": 0.19924026727676392,
755
  "learning_rate": 7.318104906937395e-06,
756
+ "loss": 0.0271,
757
  "step": 1010
758
  },
759
  {
760
  "epoch": 5.177664974619289,
761
+ "grad_norm": 0.23700544238090515,
762
  "learning_rate": 6.895093062605754e-06,
763
+ "loss": 0.0306,
764
  "step": 1020
765
  },
766
  {
767
  "epoch": 5.228426395939087,
768
+ "grad_norm": 0.12165335565805435,
769
  "learning_rate": 6.472081218274112e-06,
770
+ "loss": 0.0318,
771
  "step": 1030
772
  },
773
  {
774
  "epoch": 5.279187817258883,
775
+ "grad_norm": 0.21364423632621765,
776
  "learning_rate": 6.049069373942471e-06,
777
+ "loss": 0.03,
778
  "step": 1040
779
  },
780
  {
781
  "epoch": 5.32994923857868,
782
+ "grad_norm": 0.19045327603816986,
783
  "learning_rate": 5.626057529610829e-06,
784
+ "loss": 0.0325,
785
  "step": 1050
786
  },
787
  {
788
  "epoch": 5.380710659898477,
789
+ "grad_norm": 0.10052906721830368,
790
  "learning_rate": 5.203045685279188e-06,
791
+ "loss": 0.0278,
792
  "step": 1060
793
  },
794
  {
795
  "epoch": 5.431472081218274,
796
+ "grad_norm": 0.2044578194618225,
797
  "learning_rate": 4.780033840947547e-06,
798
+ "loss": 0.0309,
799
  "step": 1070
800
  },
801
  {
802
  "epoch": 5.482233502538071,
803
+ "grad_norm": 0.19502834975719452,
804
  "learning_rate": 4.357021996615906e-06,
805
+ "loss": 0.0318,
806
  "step": 1080
807
  },
808
  {
809
  "epoch": 5.532994923857868,
810
+ "grad_norm": 0.13834765553474426,
811
  "learning_rate": 3.934010152284264e-06,
812
+ "loss": 0.0305,
813
  "step": 1090
814
  },
815
  {
816
  "epoch": 5.583756345177665,
817
+ "grad_norm": 0.19017720222473145,
818
  "learning_rate": 3.5109983079526226e-06,
819
+ "loss": 0.0305,
820
  "step": 1100
821
  },
822
  {
823
  "epoch": 5.634517766497462,
824
+ "grad_norm": 0.14318296313285828,
825
  "learning_rate": 3.0879864636209815e-06,
826
+ "loss": 0.0304,
827
  "step": 1110
828
  },
829
  {
830
  "epoch": 5.685279187817259,
831
+ "grad_norm": 0.13694196939468384,
832
  "learning_rate": 2.6649746192893404e-06,
833
+ "loss": 0.0301,
834
  "step": 1120
835
  },
836
  {
837
  "epoch": 5.7360406091370555,
838
+ "grad_norm": 0.11877632886171341,
839
  "learning_rate": 2.241962774957699e-06,
840
+ "loss": 0.0303,
841
  "step": 1130
842
  },
843
  {
844
  "epoch": 5.786802030456853,
845
+ "grad_norm": 0.1271430402994156,
846
  "learning_rate": 1.8189509306260577e-06,
847
+ "loss": 0.0321,
848
  "step": 1140
849
  },
850
  {
851
  "epoch": 5.837563451776649,
852
+ "grad_norm": 0.1693529337644577,
853
  "learning_rate": 1.3959390862944163e-06,
854
+ "loss": 0.0318,
855
  "step": 1150
856
  },
857
  {
858
  "epoch": 5.888324873096447,
859
+ "grad_norm": 0.1400621086359024,
860
  "learning_rate": 9.72927241962775e-07,
861
+ "loss": 0.0273,
862
  "step": 1160
863
  },
864
  {
865
  "epoch": 5.939086294416244,
866
+ "grad_norm": 0.20201422274112701,
867
  "learning_rate": 5.499153976311337e-07,
868
+ "loss": 0.0287,
869
  "step": 1170
870
  },
871
  {
872
  "epoch": 5.98984771573604,
873
+ "grad_norm": 0.12207765877246857,
874
  "learning_rate": 1.2690355329949238e-07,
875
+ "loss": 0.0313,
876
  "step": 1180
877
  },
878
  {
879
  "epoch": 6.0,
880
+ "eval_loss": 0.02633434534072876,
881
+ "eval_runtime": 6.8922,
882
+ "eval_samples_per_second": 50.782,
883
+ "eval_steps_per_second": 3.192,
884
  "step": 1182
885
  }
886
  ],
 
901
  "attributes": {}
902
  }
903
  },
904
+ "total_flos": 2581201228677120.0,
905
  "train_batch_size": 16,
906
  "trial_name": null,
907
  "trial_params": null
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cdac161c65a4b68dcf9ba007ba6f52dfacfd42d2554a1490e37b5030ba7dca9
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841f002f0348e373f18727bdd1a79c542747fbd6590d9f8f3330abfc78d84b62
3
  size 891644712