{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997592681752527,
"eval_steps": 500,
"global_step": 4153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001203659123736158,
"grad_norm": 5.4375,
"learning_rate": 8.594285714285714e-06,
"loss": 1.9523,
"step": 5
},
{
"epoch": 0.002407318247472316,
"grad_norm": 3.6875,
"learning_rate": 1.9337142857142854e-05,
"loss": 1.9164,
"step": 10
},
{
"epoch": 0.0036109773712084737,
"grad_norm": 3.046875,
"learning_rate": 3.008e-05,
"loss": 1.8413,
"step": 15
},
{
"epoch": 0.004814636494944632,
"grad_norm": 3.03125,
"learning_rate": 4.082285714285714e-05,
"loss": 1.7485,
"step": 20
},
{
"epoch": 0.00601829561868079,
"grad_norm": 2.78125,
"learning_rate": 5.156571428571429e-05,
"loss": 1.7032,
"step": 25
},
{
"epoch": 0.007221954742416947,
"grad_norm": 2.71875,
"learning_rate": 6.230857142857143e-05,
"loss": 1.5993,
"step": 30
},
{
"epoch": 0.008425613866153106,
"grad_norm": 2.78125,
"learning_rate": 7.305142857142857e-05,
"loss": 1.5406,
"step": 35
},
{
"epoch": 0.009629272989889264,
"grad_norm": 2.703125,
"learning_rate": 7.519999190126141e-05,
"loss": 1.4727,
"step": 40
},
{
"epoch": 0.010832932113625422,
"grad_norm": 2.796875,
"learning_rate": 7.519995900014385e-05,
"loss": 1.452,
"step": 45
},
{
"epoch": 0.01203659123736158,
"grad_norm": 2.78125,
"learning_rate": 7.519990079050565e-05,
"loss": 1.3904,
"step": 50
},
{
"epoch": 0.013240250361097737,
"grad_norm": 2.59375,
"learning_rate": 7.519981727239906e-05,
"loss": 1.3752,
"step": 55
},
{
"epoch": 0.014443909484833895,
"grad_norm": 2.765625,
"learning_rate": 7.519970844589904e-05,
"loss": 1.3351,
"step": 60
},
{
"epoch": 0.015647568608570053,
"grad_norm": 2.59375,
"learning_rate": 7.519957431110327e-05,
"loss": 1.342,
"step": 65
},
{
"epoch": 0.016851227732306212,
"grad_norm": 2.96875,
"learning_rate": 7.51994148681321e-05,
"loss": 1.3116,
"step": 70
},
{
"epoch": 0.018054886856042368,
"grad_norm": 2.734375,
"learning_rate": 7.519923011712865e-05,
"loss": 1.3081,
"step": 75
},
{
"epoch": 0.019258545979778528,
"grad_norm": 2.640625,
"learning_rate": 7.519902005825872e-05,
"loss": 1.2885,
"step": 80
},
{
"epoch": 0.020462205103514684,
"grad_norm": 2.703125,
"learning_rate": 7.519878469171081e-05,
"loss": 1.2879,
"step": 85
},
{
"epoch": 0.021665864227250843,
"grad_norm": 2.5625,
"learning_rate": 7.519852401769621e-05,
"loss": 1.2741,
"step": 90
},
{
"epoch": 0.022869523350987,
"grad_norm": 2.609375,
"learning_rate": 7.519823803644881e-05,
"loss": 1.2429,
"step": 95
},
{
"epoch": 0.02407318247472316,
"grad_norm": 2.984375,
"learning_rate": 7.519792674822529e-05,
"loss": 1.2462,
"step": 100
},
{
"epoch": 0.025276841598459315,
"grad_norm": 2.75,
"learning_rate": 7.519759015330501e-05,
"loss": 1.217,
"step": 105
},
{
"epoch": 0.026480500722195474,
"grad_norm": 2.484375,
"learning_rate": 7.519722825199007e-05,
"loss": 1.2431,
"step": 110
},
{
"epoch": 0.027684159845931634,
"grad_norm": 2.703125,
"learning_rate": 7.519684104460526e-05,
"loss": 1.242,
"step": 115
},
{
"epoch": 0.02888781896966779,
"grad_norm": 2.578125,
"learning_rate": 7.519642853149806e-05,
"loss": 1.2239,
"step": 120
},
{
"epoch": 0.03009147809340395,
"grad_norm": 2.59375,
"learning_rate": 7.519599071303875e-05,
"loss": 1.1809,
"step": 125
},
{
"epoch": 0.031295137217140105,
"grad_norm": 2.625,
"learning_rate": 7.519552758962019e-05,
"loss": 1.2366,
"step": 130
},
{
"epoch": 0.03249879634087626,
"grad_norm": 2.5,
"learning_rate": 7.519503916165803e-05,
"loss": 1.1634,
"step": 135
},
{
"epoch": 0.033702455464612424,
"grad_norm": 2.84375,
"learning_rate": 7.519452542959066e-05,
"loss": 1.1719,
"step": 140
},
{
"epoch": 0.03490611458834858,
"grad_norm": 2.953125,
"learning_rate": 7.51939863938791e-05,
"loss": 1.1596,
"step": 145
},
{
"epoch": 0.036109773712084736,
"grad_norm": 2.390625,
"learning_rate": 7.519342205500712e-05,
"loss": 1.1627,
"step": 150
},
{
"epoch": 0.03731343283582089,
"grad_norm": 2.46875,
"learning_rate": 7.519283241348121e-05,
"loss": 1.166,
"step": 155
},
{
"epoch": 0.038517091959557055,
"grad_norm": 2.59375,
"learning_rate": 7.519221746983052e-05,
"loss": 1.1952,
"step": 160
},
{
"epoch": 0.03972075108329321,
"grad_norm": 2.703125,
"learning_rate": 7.5191577224607e-05,
"loss": 1.1565,
"step": 165
},
{
"epoch": 0.04092441020702937,
"grad_norm": 2.671875,
"learning_rate": 7.519091167838519e-05,
"loss": 1.1575,
"step": 170
},
{
"epoch": 0.04212806933076553,
"grad_norm": 2.21875,
"learning_rate": 7.519022083176244e-05,
"loss": 1.1399,
"step": 175
},
{
"epoch": 0.043331728454501686,
"grad_norm": 2.3125,
"learning_rate": 7.518950468535872e-05,
"loss": 1.1503,
"step": 180
},
{
"epoch": 0.04453538757823784,
"grad_norm": 2.53125,
"learning_rate": 7.518876323981678e-05,
"loss": 1.1025,
"step": 185
},
{
"epoch": 0.045739046701974,
"grad_norm": 2.765625,
"learning_rate": 7.518799649580204e-05,
"loss": 1.1512,
"step": 190
},
{
"epoch": 0.04694270582571016,
"grad_norm": 2.71875,
"learning_rate": 7.518720445400261e-05,
"loss": 1.1202,
"step": 195
},
{
"epoch": 0.04814636494944632,
"grad_norm": 2.90625,
"learning_rate": 7.518638711512932e-05,
"loss": 1.1038,
"step": 200
},
{
"epoch": 0.04935002407318247,
"grad_norm": 2.40625,
"learning_rate": 7.518554447991572e-05,
"loss": 1.1074,
"step": 205
},
{
"epoch": 0.05055368319691863,
"grad_norm": 2.359375,
"learning_rate": 7.518467654911806e-05,
"loss": 1.1035,
"step": 210
},
{
"epoch": 0.05175734232065479,
"grad_norm": 2.703125,
"learning_rate": 7.518378332351524e-05,
"loss": 1.1083,
"step": 215
},
{
"epoch": 0.05296100144439095,
"grad_norm": 2.84375,
"learning_rate": 7.518286480390892e-05,
"loss": 1.1062,
"step": 220
},
{
"epoch": 0.054164660568127104,
"grad_norm": 2.515625,
"learning_rate": 7.518192099112345e-05,
"loss": 1.1028,
"step": 225
},
{
"epoch": 0.05536831969186327,
"grad_norm": 2.46875,
"learning_rate": 7.518095188600586e-05,
"loss": 1.1036,
"step": 230
},
{
"epoch": 0.05657197881559942,
"grad_norm": 2.59375,
"learning_rate": 7.517995748942589e-05,
"loss": 1.0876,
"step": 235
},
{
"epoch": 0.05777563793933558,
"grad_norm": 2.359375,
"learning_rate": 7.517893780227597e-05,
"loss": 1.0686,
"step": 240
},
{
"epoch": 0.058979297063071735,
"grad_norm": 2.34375,
"learning_rate": 7.517789282547126e-05,
"loss": 1.0863,
"step": 245
},
{
"epoch": 0.0601829561868079,
"grad_norm": 2.515625,
"learning_rate": 7.517682255994956e-05,
"loss": 1.0745,
"step": 250
},
{
"epoch": 0.061386615310544054,
"grad_norm": 2.25,
"learning_rate": 7.517572700667141e-05,
"loss": 1.0997,
"step": 255
},
{
"epoch": 0.06259027443428021,
"grad_norm": 2.53125,
"learning_rate": 7.517460616662005e-05,
"loss": 1.0501,
"step": 260
},
{
"epoch": 0.06379393355801637,
"grad_norm": 2.5,
"learning_rate": 7.517346004080137e-05,
"loss": 1.0777,
"step": 265
},
{
"epoch": 0.06499759268175252,
"grad_norm": 2.546875,
"learning_rate": 7.5172288630244e-05,
"loss": 1.0623,
"step": 270
},
{
"epoch": 0.06620125180548869,
"grad_norm": 2.890625,
"learning_rate": 7.517109193599923e-05,
"loss": 1.0649,
"step": 275
},
{
"epoch": 0.06740491092922485,
"grad_norm": 2.65625,
"learning_rate": 7.516986995914106e-05,
"loss": 1.0468,
"step": 280
},
{
"epoch": 0.068608570052961,
"grad_norm": 3.0625,
"learning_rate": 7.516862270076615e-05,
"loss": 1.0485,
"step": 285
},
{
"epoch": 0.06981222917669716,
"grad_norm": 2.515625,
"learning_rate": 7.516735016199392e-05,
"loss": 1.0412,
"step": 290
},
{
"epoch": 0.07101588830043332,
"grad_norm": 2.390625,
"learning_rate": 7.516605234396639e-05,
"loss": 1.0392,
"step": 295
},
{
"epoch": 0.07221954742416947,
"grad_norm": 2.421875,
"learning_rate": 7.516472924784832e-05,
"loss": 1.0129,
"step": 300
},
{
"epoch": 0.07342320654790563,
"grad_norm": 2.546875,
"learning_rate": 7.516338087482715e-05,
"loss": 1.0365,
"step": 305
},
{
"epoch": 0.07462686567164178,
"grad_norm": 2.234375,
"learning_rate": 7.5162007226113e-05,
"loss": 1.0767,
"step": 310
},
{
"epoch": 0.07583052479537795,
"grad_norm": 2.34375,
"learning_rate": 7.516060830293867e-05,
"loss": 1.0139,
"step": 315
},
{
"epoch": 0.07703418391911411,
"grad_norm": 2.484375,
"learning_rate": 7.515918410655963e-05,
"loss": 1.0152,
"step": 320
},
{
"epoch": 0.07823784304285027,
"grad_norm": 2.390625,
"learning_rate": 7.515773463825409e-05,
"loss": 1.0269,
"step": 325
},
{
"epoch": 0.07944150216658642,
"grad_norm": 2.640625,
"learning_rate": 7.515625989932286e-05,
"loss": 1.0453,
"step": 330
},
{
"epoch": 0.08064516129032258,
"grad_norm": 2.578125,
"learning_rate": 7.515475989108947e-05,
"loss": 1.0238,
"step": 335
},
{
"epoch": 0.08184882041405873,
"grad_norm": 2.875,
"learning_rate": 7.515323461490016e-05,
"loss": 1.022,
"step": 340
},
{
"epoch": 0.08305247953779489,
"grad_norm": 2.578125,
"learning_rate": 7.515168407212379e-05,
"loss": 1.0004,
"step": 345
},
{
"epoch": 0.08425613866153106,
"grad_norm": 2.515625,
"learning_rate": 7.515010826415193e-05,
"loss": 1.0361,
"step": 350
},
{
"epoch": 0.08545979778526722,
"grad_norm": 2.3125,
"learning_rate": 7.51485071923988e-05,
"loss": 1.0119,
"step": 355
},
{
"epoch": 0.08666345690900337,
"grad_norm": 2.3125,
"learning_rate": 7.514688085830133e-05,
"loss": 1.0128,
"step": 360
},
{
"epoch": 0.08786711603273953,
"grad_norm": 2.421875,
"learning_rate": 7.514522926331908e-05,
"loss": 1.0119,
"step": 365
},
{
"epoch": 0.08907077515647568,
"grad_norm": 2.390625,
"learning_rate": 7.51435524089343e-05,
"loss": 1.0205,
"step": 370
},
{
"epoch": 0.09027443428021184,
"grad_norm": 2.328125,
"learning_rate": 7.514185029665195e-05,
"loss": 1.0289,
"step": 375
},
{
"epoch": 0.091478093403948,
"grad_norm": 2.25,
"learning_rate": 7.514012292799957e-05,
"loss": 0.9974,
"step": 380
},
{
"epoch": 0.09268175252768417,
"grad_norm": 2.3125,
"learning_rate": 7.513837030452745e-05,
"loss": 1.0058,
"step": 385
},
{
"epoch": 0.09388541165142032,
"grad_norm": 2.328125,
"learning_rate": 7.513659242780848e-05,
"loss": 0.9894,
"step": 390
},
{
"epoch": 0.09508907077515648,
"grad_norm": 2.390625,
"learning_rate": 7.513478929943828e-05,
"loss": 0.9879,
"step": 395
},
{
"epoch": 0.09629272989889263,
"grad_norm": 2.5625,
"learning_rate": 7.513296092103507e-05,
"loss": 1.0006,
"step": 400
},
{
"epoch": 0.09749638902262879,
"grad_norm": 2.640625,
"learning_rate": 7.513110729423976e-05,
"loss": 0.9984,
"step": 405
},
{
"epoch": 0.09870004814636495,
"grad_norm": 2.515625,
"learning_rate": 7.512922842071594e-05,
"loss": 1.0084,
"step": 410
},
{
"epoch": 0.0999037072701011,
"grad_norm": 2.3125,
"learning_rate": 7.512732430214982e-05,
"loss": 1.0034,
"step": 415
},
{
"epoch": 0.10110736639383726,
"grad_norm": 2.546875,
"learning_rate": 7.512539494025027e-05,
"loss": 1.0019,
"step": 420
},
{
"epoch": 0.10231102551757343,
"grad_norm": 2.96875,
"learning_rate": 7.512344033674885e-05,
"loss": 0.9941,
"step": 425
},
{
"epoch": 0.10351468464130958,
"grad_norm": 2.625,
"learning_rate": 7.512146049339975e-05,
"loss": 0.9523,
"step": 430
},
{
"epoch": 0.10471834376504574,
"grad_norm": 2.4375,
"learning_rate": 7.51194554119798e-05,
"loss": 0.9821,
"step": 435
},
{
"epoch": 0.1059220028887819,
"grad_norm": 2.25,
"learning_rate": 7.51174250942885e-05,
"loss": 0.9661,
"step": 440
},
{
"epoch": 0.10712566201251805,
"grad_norm": 2.609375,
"learning_rate": 7.5115369542148e-05,
"loss": 0.9926,
"step": 445
},
{
"epoch": 0.10832932113625421,
"grad_norm": 2.484375,
"learning_rate": 7.511328875740308e-05,
"loss": 0.9999,
"step": 450
},
{
"epoch": 0.10953298025999036,
"grad_norm": 2.40625,
"learning_rate": 7.511118274192118e-05,
"loss": 1.0023,
"step": 455
},
{
"epoch": 0.11073663938372653,
"grad_norm": 2.25,
"learning_rate": 7.510905149759237e-05,
"loss": 0.9643,
"step": 460
},
{
"epoch": 0.11194029850746269,
"grad_norm": 2.375,
"learning_rate": 7.510689502632937e-05,
"loss": 0.9565,
"step": 465
},
{
"epoch": 0.11314395763119885,
"grad_norm": 2.265625,
"learning_rate": 7.510471333006756e-05,
"loss": 0.9777,
"step": 470
},
{
"epoch": 0.114347616754935,
"grad_norm": 2.515625,
"learning_rate": 7.510250641076491e-05,
"loss": 1.0148,
"step": 475
},
{
"epoch": 0.11555127587867116,
"grad_norm": 2.265625,
"learning_rate": 7.51002742704021e-05,
"loss": 0.9534,
"step": 480
},
{
"epoch": 0.11675493500240731,
"grad_norm": 2.421875,
"learning_rate": 7.509801691098234e-05,
"loss": 0.96,
"step": 485
},
{
"epoch": 0.11795859412614347,
"grad_norm": 2.359375,
"learning_rate": 7.50957343345316e-05,
"loss": 0.9168,
"step": 490
},
{
"epoch": 0.11916225324987964,
"grad_norm": 2.25,
"learning_rate": 7.509342654309836e-05,
"loss": 0.9506,
"step": 495
},
{
"epoch": 0.1203659123736158,
"grad_norm": 2.515625,
"learning_rate": 7.509109353875383e-05,
"loss": 0.967,
"step": 500
},
{
"epoch": 0.1203659123736158,
"eval_loss": 0.8579447865486145,
"eval_runtime": 2.4166,
"eval_samples_per_second": 82.761,
"eval_steps_per_second": 82.761,
"step": 500
},
{
"epoch": 0.12156957149735195,
"grad_norm": 2.375,
"learning_rate": 7.508873532359177e-05,
"loss": 0.9136,
"step": 505
},
{
"epoch": 0.12277323062108811,
"grad_norm": 2.328125,
"learning_rate": 7.508635189972863e-05,
"loss": 0.9422,
"step": 510
},
{
"epoch": 0.12397688974482426,
"grad_norm": 2.203125,
"learning_rate": 7.508394326930342e-05,
"loss": 0.9751,
"step": 515
},
{
"epoch": 0.12518054886856042,
"grad_norm": 2.359375,
"learning_rate": 7.508150943447782e-05,
"loss": 0.9974,
"step": 520
},
{
"epoch": 0.12638420799229658,
"grad_norm": 2.265625,
"learning_rate": 7.507905039743612e-05,
"loss": 0.9835,
"step": 525
},
{
"epoch": 0.12758786711603273,
"grad_norm": 2.328125,
"learning_rate": 7.507656616038523e-05,
"loss": 0.9457,
"step": 530
},
{
"epoch": 0.1287915262397689,
"grad_norm": 2.328125,
"learning_rate": 7.507405672555465e-05,
"loss": 0.9453,
"step": 535
},
{
"epoch": 0.12999518536350504,
"grad_norm": 2.5625,
"learning_rate": 7.507152209519653e-05,
"loss": 0.9403,
"step": 540
},
{
"epoch": 0.1311988444872412,
"grad_norm": 2.296875,
"learning_rate": 7.506896227158561e-05,
"loss": 0.9566,
"step": 545
},
{
"epoch": 0.13240250361097738,
"grad_norm": 2.390625,
"learning_rate": 7.506637725701925e-05,
"loss": 0.9112,
"step": 550
},
{
"epoch": 0.13360616273471354,
"grad_norm": 2.4375,
"learning_rate": 7.50637670538174e-05,
"loss": 0.9529,
"step": 555
},
{
"epoch": 0.1348098218584497,
"grad_norm": 2.28125,
"learning_rate": 7.506113166432265e-05,
"loss": 0.9439,
"step": 560
},
{
"epoch": 0.13601348098218585,
"grad_norm": 2.296875,
"learning_rate": 7.505847109090016e-05,
"loss": 0.9204,
"step": 565
},
{
"epoch": 0.137217140105922,
"grad_norm": 2.25,
"learning_rate": 7.505578533593771e-05,
"loss": 0.9252,
"step": 570
},
{
"epoch": 0.13842079922965816,
"grad_norm": 2.25,
"learning_rate": 7.505307440184569e-05,
"loss": 0.8843,
"step": 575
},
{
"epoch": 0.13962445835339432,
"grad_norm": 2.21875,
"learning_rate": 7.505033829105704e-05,
"loss": 0.9302,
"step": 580
},
{
"epoch": 0.14082811747713048,
"grad_norm": 2.484375,
"learning_rate": 7.504757700602735e-05,
"loss": 0.9238,
"step": 585
},
{
"epoch": 0.14203177660086663,
"grad_norm": 2.75,
"learning_rate": 7.504479054923478e-05,
"loss": 0.9393,
"step": 590
},
{
"epoch": 0.1432354357246028,
"grad_norm": 2.578125,
"learning_rate": 7.504197892318008e-05,
"loss": 0.9297,
"step": 595
},
{
"epoch": 0.14443909484833894,
"grad_norm": 2.359375,
"learning_rate": 7.50391421303866e-05,
"loss": 0.9065,
"step": 600
},
{
"epoch": 0.1456427539720751,
"grad_norm": 2.234375,
"learning_rate": 7.503628017340025e-05,
"loss": 0.9263,
"step": 605
},
{
"epoch": 0.14684641309581126,
"grad_norm": 2.109375,
"learning_rate": 7.503339305478953e-05,
"loss": 0.9169,
"step": 610
},
{
"epoch": 0.1480500722195474,
"grad_norm": 2.34375,
"learning_rate": 7.503048077714556e-05,
"loss": 0.9369,
"step": 615
},
{
"epoch": 0.14925373134328357,
"grad_norm": 2.171875,
"learning_rate": 7.5027543343082e-05,
"loss": 0.9541,
"step": 620
},
{
"epoch": 0.15045739046701975,
"grad_norm": 2.359375,
"learning_rate": 7.502458075523511e-05,
"loss": 0.9273,
"step": 625
},
{
"epoch": 0.1516610495907559,
"grad_norm": 2.421875,
"learning_rate": 7.50215930162637e-05,
"loss": 0.9541,
"step": 630
},
{
"epoch": 0.15286470871449206,
"grad_norm": 2.375,
"learning_rate": 7.501858012884915e-05,
"loss": 0.9334,
"step": 635
},
{
"epoch": 0.15406836783822822,
"grad_norm": 2.15625,
"learning_rate": 7.501554209569548e-05,
"loss": 0.9156,
"step": 640
},
{
"epoch": 0.15527202696196438,
"grad_norm": 2.3125,
"learning_rate": 7.501247891952918e-05,
"loss": 0.9295,
"step": 645
},
{
"epoch": 0.15647568608570053,
"grad_norm": 2.25,
"learning_rate": 7.500939060309934e-05,
"loss": 0.9318,
"step": 650
},
{
"epoch": 0.1576793452094367,
"grad_norm": 2.390625,
"learning_rate": 7.500627714917765e-05,
"loss": 0.9627,
"step": 655
},
{
"epoch": 0.15888300433317284,
"grad_norm": 2.421875,
"learning_rate": 7.500313856055832e-05,
"loss": 0.9144,
"step": 660
},
{
"epoch": 0.160086663456909,
"grad_norm": 2.25,
"learning_rate": 7.499997484005813e-05,
"loss": 0.9378,
"step": 665
},
{
"epoch": 0.16129032258064516,
"grad_norm": 2.46875,
"learning_rate": 7.499678599051639e-05,
"loss": 0.9226,
"step": 670
},
{
"epoch": 0.1624939817043813,
"grad_norm": 2.21875,
"learning_rate": 7.499357201479502e-05,
"loss": 0.8941,
"step": 675
},
{
"epoch": 0.16369764082811747,
"grad_norm": 2.53125,
"learning_rate": 7.499033291577844e-05,
"loss": 0.9054,
"step": 680
},
{
"epoch": 0.16490129995185362,
"grad_norm": 2.4375,
"learning_rate": 7.498706869637364e-05,
"loss": 0.9043,
"step": 685
},
{
"epoch": 0.16610495907558978,
"grad_norm": 2.375,
"learning_rate": 7.498377935951014e-05,
"loss": 0.907,
"step": 690
},
{
"epoch": 0.16730861819932596,
"grad_norm": 2.1875,
"learning_rate": 7.498046490814001e-05,
"loss": 0.8948,
"step": 695
},
{
"epoch": 0.16851227732306212,
"grad_norm": 2.515625,
"learning_rate": 7.497712534523786e-05,
"loss": 0.8884,
"step": 700
},
{
"epoch": 0.16971593644679828,
"grad_norm": 2.375,
"learning_rate": 7.497376067380085e-05,
"loss": 0.9339,
"step": 705
},
{
"epoch": 0.17091959557053443,
"grad_norm": 2.546875,
"learning_rate": 7.497037089684863e-05,
"loss": 0.9214,
"step": 710
},
{
"epoch": 0.1721232546942706,
"grad_norm": 2.359375,
"learning_rate": 7.496695601742344e-05,
"loss": 0.909,
"step": 715
},
{
"epoch": 0.17332691381800674,
"grad_norm": 2.171875,
"learning_rate": 7.496351603859001e-05,
"loss": 0.8977,
"step": 720
},
{
"epoch": 0.1745305729417429,
"grad_norm": 2.46875,
"learning_rate": 7.496005096343561e-05,
"loss": 0.9395,
"step": 725
},
{
"epoch": 0.17573423206547906,
"grad_norm": 2.609375,
"learning_rate": 7.495656079507003e-05,
"loss": 0.902,
"step": 730
},
{
"epoch": 0.1769378911892152,
"grad_norm": 2.4375,
"learning_rate": 7.495304553662555e-05,
"loss": 0.9075,
"step": 735
},
{
"epoch": 0.17814155031295137,
"grad_norm": 2.390625,
"learning_rate": 7.494950519125705e-05,
"loss": 0.8822,
"step": 740
},
{
"epoch": 0.17934520943668752,
"grad_norm": 2.203125,
"learning_rate": 7.494593976214182e-05,
"loss": 0.8719,
"step": 745
},
{
"epoch": 0.18054886856042368,
"grad_norm": 2.484375,
"learning_rate": 7.494234925247975e-05,
"loss": 0.8644,
"step": 750
},
{
"epoch": 0.18175252768415984,
"grad_norm": 2.203125,
"learning_rate": 7.493873366549319e-05,
"loss": 0.8841,
"step": 755
},
{
"epoch": 0.182956186807896,
"grad_norm": 2.265625,
"learning_rate": 7.4935093004427e-05,
"loss": 0.8557,
"step": 760
},
{
"epoch": 0.18415984593163215,
"grad_norm": 2.25,
"learning_rate": 7.493142727254856e-05,
"loss": 0.8904,
"step": 765
},
{
"epoch": 0.18536350505536833,
"grad_norm": 2.203125,
"learning_rate": 7.492773647314775e-05,
"loss": 0.8465,
"step": 770
},
{
"epoch": 0.1865671641791045,
"grad_norm": 2.296875,
"learning_rate": 7.492402060953692e-05,
"loss": 0.9323,
"step": 775
},
{
"epoch": 0.18777082330284064,
"grad_norm": 2.390625,
"learning_rate": 7.492027968505095e-05,
"loss": 0.8839,
"step": 780
},
{
"epoch": 0.1889744824265768,
"grad_norm": 2.359375,
"learning_rate": 7.49165137030472e-05,
"loss": 0.9033,
"step": 785
},
{
"epoch": 0.19017814155031296,
"grad_norm": 2.4375,
"learning_rate": 7.491272266690549e-05,
"loss": 0.8841,
"step": 790
},
{
"epoch": 0.1913818006740491,
"grad_norm": 2.21875,
"learning_rate": 7.490890658002814e-05,
"loss": 0.8432,
"step": 795
},
{
"epoch": 0.19258545979778527,
"grad_norm": 2.3125,
"learning_rate": 7.490506544584e-05,
"loss": 0.8822,
"step": 800
},
{
"epoch": 0.19378911892152142,
"grad_norm": 2.28125,
"learning_rate": 7.490119926778834e-05,
"loss": 0.889,
"step": 805
},
{
"epoch": 0.19499277804525758,
"grad_norm": 2.265625,
"learning_rate": 7.489730804934292e-05,
"loss": 0.8852,
"step": 810
},
{
"epoch": 0.19619643716899374,
"grad_norm": 2.40625,
"learning_rate": 7.489339179399597e-05,
"loss": 0.8688,
"step": 815
},
{
"epoch": 0.1974000962927299,
"grad_norm": 2.40625,
"learning_rate": 7.488945050526224e-05,
"loss": 0.8844,
"step": 820
},
{
"epoch": 0.19860375541646605,
"grad_norm": 2.515625,
"learning_rate": 7.488548418667887e-05,
"loss": 0.8692,
"step": 825
},
{
"epoch": 0.1998074145402022,
"grad_norm": 2.34375,
"learning_rate": 7.48814928418055e-05,
"loss": 0.8846,
"step": 830
},
{
"epoch": 0.20101107366393836,
"grad_norm": 2.296875,
"learning_rate": 7.487747647422422e-05,
"loss": 0.895,
"step": 835
},
{
"epoch": 0.20221473278767452,
"grad_norm": 2.40625,
"learning_rate": 7.48734350875396e-05,
"loss": 0.865,
"step": 840
},
{
"epoch": 0.2034183919114107,
"grad_norm": 2.109375,
"learning_rate": 7.486936868537866e-05,
"loss": 0.8804,
"step": 845
},
{
"epoch": 0.20462205103514686,
"grad_norm": 2.296875,
"learning_rate": 7.486527727139085e-05,
"loss": 0.892,
"step": 850
},
{
"epoch": 0.205825710158883,
"grad_norm": 2.171875,
"learning_rate": 7.486116084924808e-05,
"loss": 0.9048,
"step": 855
},
{
"epoch": 0.20702936928261917,
"grad_norm": 2.3125,
"learning_rate": 7.485701942264469e-05,
"loss": 0.8856,
"step": 860
},
{
"epoch": 0.20823302840635532,
"grad_norm": 2.125,
"learning_rate": 7.485285299529746e-05,
"loss": 0.9206,
"step": 865
},
{
"epoch": 0.20943668753009148,
"grad_norm": 2.34375,
"learning_rate": 7.484866157094568e-05,
"loss": 0.902,
"step": 870
},
{
"epoch": 0.21064034665382764,
"grad_norm": 2.5,
"learning_rate": 7.484444515335095e-05,
"loss": 0.8681,
"step": 875
},
{
"epoch": 0.2118440057775638,
"grad_norm": 2.171875,
"learning_rate": 7.484020374629738e-05,
"loss": 0.8925,
"step": 880
},
{
"epoch": 0.21304766490129995,
"grad_norm": 2.234375,
"learning_rate": 7.483593735359151e-05,
"loss": 0.8729,
"step": 885
},
{
"epoch": 0.2142513240250361,
"grad_norm": 2.203125,
"learning_rate": 7.483164597906225e-05,
"loss": 0.8567,
"step": 890
},
{
"epoch": 0.21545498314877226,
"grad_norm": 2.40625,
"learning_rate": 7.482732962656101e-05,
"loss": 0.867,
"step": 895
},
{
"epoch": 0.21665864227250842,
"grad_norm": 2.078125,
"learning_rate": 7.482298829996155e-05,
"loss": 0.8476,
"step": 900
},
{
"epoch": 0.21786230139624457,
"grad_norm": 2.4375,
"learning_rate": 7.481862200316005e-05,
"loss": 0.8878,
"step": 905
},
{
"epoch": 0.21906596051998073,
"grad_norm": 2.46875,
"learning_rate": 7.481423074007512e-05,
"loss": 0.8733,
"step": 910
},
{
"epoch": 0.2202696196437169,
"grad_norm": 2.28125,
"learning_rate": 7.48098145146478e-05,
"loss": 0.8523,
"step": 915
},
{
"epoch": 0.22147327876745307,
"grad_norm": 2.328125,
"learning_rate": 7.480537333084149e-05,
"loss": 0.8696,
"step": 920
},
{
"epoch": 0.22267693789118922,
"grad_norm": 2.65625,
"learning_rate": 7.480090719264199e-05,
"loss": 0.8744,
"step": 925
},
{
"epoch": 0.22388059701492538,
"grad_norm": 2.421875,
"learning_rate": 7.479641610405752e-05,
"loss": 0.8644,
"step": 930
},
{
"epoch": 0.22508425613866154,
"grad_norm": 2.3125,
"learning_rate": 7.479190006911868e-05,
"loss": 0.8718,
"step": 935
},
{
"epoch": 0.2262879152623977,
"grad_norm": 2.34375,
"learning_rate": 7.478735909187847e-05,
"loss": 0.8723,
"step": 940
},
{
"epoch": 0.22749157438613385,
"grad_norm": 2.40625,
"learning_rate": 7.478279317641225e-05,
"loss": 0.8696,
"step": 945
},
{
"epoch": 0.22869523350987,
"grad_norm": 2.1875,
"learning_rate": 7.47782023268178e-05,
"loss": 0.8958,
"step": 950
},
{
"epoch": 0.22989889263360616,
"grad_norm": 2.296875,
"learning_rate": 7.477358654721523e-05,
"loss": 0.8537,
"step": 955
},
{
"epoch": 0.23110255175734232,
"grad_norm": 2.3125,
"learning_rate": 7.476894584174705e-05,
"loss": 0.8586,
"step": 960
},
{
"epoch": 0.23230621088107847,
"grad_norm": 2.375,
"learning_rate": 7.476428021457815e-05,
"loss": 0.8727,
"step": 965
},
{
"epoch": 0.23350987000481463,
"grad_norm": 2.59375,
"learning_rate": 7.475958966989575e-05,
"loss": 0.8582,
"step": 970
},
{
"epoch": 0.23471352912855079,
"grad_norm": 2.328125,
"learning_rate": 7.47548742119095e-05,
"loss": 0.8351,
"step": 975
},
{
"epoch": 0.23591718825228694,
"grad_norm": 2.3125,
"learning_rate": 7.475013384485134e-05,
"loss": 0.841,
"step": 980
},
{
"epoch": 0.2371208473760231,
"grad_norm": 2.109375,
"learning_rate": 7.474536857297558e-05,
"loss": 0.8406,
"step": 985
},
{
"epoch": 0.23832450649975928,
"grad_norm": 2.28125,
"learning_rate": 7.474057840055891e-05,
"loss": 0.8378,
"step": 990
},
{
"epoch": 0.23952816562349544,
"grad_norm": 2.046875,
"learning_rate": 7.473576333190034e-05,
"loss": 0.8534,
"step": 995
},
{
"epoch": 0.2407318247472316,
"grad_norm": 2.28125,
"learning_rate": 7.473092337132126e-05,
"loss": 0.8428,
"step": 1000
},
{
"epoch": 0.2407318247472316,
"eval_loss": 0.7515629529953003,
"eval_runtime": 2.4162,
"eval_samples_per_second": 82.774,
"eval_steps_per_second": 82.774,
"step": 1000
},
{
"epoch": 0.24193548387096775,
"grad_norm": 2.125,
"learning_rate": 7.472605852316533e-05,
"loss": 0.8745,
"step": 1005
},
{
"epoch": 0.2431391429947039,
"grad_norm": 2.171875,
"learning_rate": 7.47211687917986e-05,
"loss": 0.8463,
"step": 1010
},
{
"epoch": 0.24434280211844006,
"grad_norm": 2.265625,
"learning_rate": 7.471625418160947e-05,
"loss": 0.8593,
"step": 1015
},
{
"epoch": 0.24554646124217622,
"grad_norm": 2.453125,
"learning_rate": 7.471131469700862e-05,
"loss": 0.8309,
"step": 1020
},
{
"epoch": 0.24675012036591237,
"grad_norm": 2.125,
"learning_rate": 7.470635034242906e-05,
"loss": 0.8165,
"step": 1025
},
{
"epoch": 0.24795377948964853,
"grad_norm": 2.34375,
"learning_rate": 7.470136112232614e-05,
"loss": 0.8193,
"step": 1030
},
{
"epoch": 0.24915743861338469,
"grad_norm": 2.515625,
"learning_rate": 7.469634704117752e-05,
"loss": 0.8642,
"step": 1035
},
{
"epoch": 0.25036109773712084,
"grad_norm": 2.140625,
"learning_rate": 7.469130810348318e-05,
"loss": 0.8601,
"step": 1040
},
{
"epoch": 0.251564756860857,
"grad_norm": 2.28125,
"learning_rate": 7.468624431376538e-05,
"loss": 0.7957,
"step": 1045
},
{
"epoch": 0.25276841598459315,
"grad_norm": 2.09375,
"learning_rate": 7.468115567656872e-05,
"loss": 0.8385,
"step": 1050
},
{
"epoch": 0.2539720751083293,
"grad_norm": 2.28125,
"learning_rate": 7.467604219646007e-05,
"loss": 0.7962,
"step": 1055
},
{
"epoch": 0.25517573423206547,
"grad_norm": 2.140625,
"learning_rate": 7.467090387802862e-05,
"loss": 0.8701,
"step": 1060
},
{
"epoch": 0.2563793933558016,
"grad_norm": 2.203125,
"learning_rate": 7.466574072588581e-05,
"loss": 0.8678,
"step": 1065
},
{
"epoch": 0.2575830524795378,
"grad_norm": 2.34375,
"learning_rate": 7.466055274466543e-05,
"loss": 0.8385,
"step": 1070
},
{
"epoch": 0.25878671160327393,
"grad_norm": 2.375,
"learning_rate": 7.46553399390235e-05,
"loss": 0.8711,
"step": 1075
},
{
"epoch": 0.2599903707270101,
"grad_norm": 2.21875,
"learning_rate": 7.465010231363835e-05,
"loss": 0.8953,
"step": 1080
},
{
"epoch": 0.26119402985074625,
"grad_norm": 2.234375,
"learning_rate": 7.464483987321056e-05,
"loss": 0.8106,
"step": 1085
},
{
"epoch": 0.2623976889744824,
"grad_norm": 2.28125,
"learning_rate": 7.463955262246301e-05,
"loss": 0.8329,
"step": 1090
},
{
"epoch": 0.26360134809821856,
"grad_norm": 2.21875,
"learning_rate": 7.463424056614082e-05,
"loss": 0.8217,
"step": 1095
},
{
"epoch": 0.26480500722195477,
"grad_norm": 2.296875,
"learning_rate": 7.46289037090114e-05,
"loss": 0.8368,
"step": 1100
},
{
"epoch": 0.2660086663456909,
"grad_norm": 2.140625,
"learning_rate": 7.462354205586437e-05,
"loss": 0.8145,
"step": 1105
},
{
"epoch": 0.2672123254694271,
"grad_norm": 2.109375,
"learning_rate": 7.461815561151166e-05,
"loss": 0.7885,
"step": 1110
},
{
"epoch": 0.26841598459316324,
"grad_norm": 2.28125,
"learning_rate": 7.461274438078741e-05,
"loss": 0.845,
"step": 1115
},
{
"epoch": 0.2696196437168994,
"grad_norm": 2.453125,
"learning_rate": 7.460730836854803e-05,
"loss": 0.7927,
"step": 1120
},
{
"epoch": 0.27082330284063555,
"grad_norm": 2.15625,
"learning_rate": 7.460184757967215e-05,
"loss": 0.85,
"step": 1125
},
{
"epoch": 0.2720269619643717,
"grad_norm": 2.15625,
"learning_rate": 7.459636201906066e-05,
"loss": 0.8376,
"step": 1130
},
{
"epoch": 0.27323062108810786,
"grad_norm": 2.3125,
"learning_rate": 7.459085169163664e-05,
"loss": 0.866,
"step": 1135
},
{
"epoch": 0.274434280211844,
"grad_norm": 2.125,
"learning_rate": 7.458531660234546e-05,
"loss": 0.8382,
"step": 1140
},
{
"epoch": 0.2756379393355802,
"grad_norm": 2.234375,
"learning_rate": 7.457975675615464e-05,
"loss": 0.8455,
"step": 1145
},
{
"epoch": 0.27684159845931633,
"grad_norm": 2.15625,
"learning_rate": 7.457417215805399e-05,
"loss": 0.8559,
"step": 1150
},
{
"epoch": 0.2780452575830525,
"grad_norm": 2.078125,
"learning_rate": 7.456856281305547e-05,
"loss": 0.8299,
"step": 1155
},
{
"epoch": 0.27924891670678864,
"grad_norm": 2.1875,
"learning_rate": 7.45629287261933e-05,
"loss": 0.8586,
"step": 1160
},
{
"epoch": 0.2804525758305248,
"grad_norm": 2.265625,
"learning_rate": 7.455726990252389e-05,
"loss": 0.7975,
"step": 1165
},
{
"epoch": 0.28165623495426095,
"grad_norm": 2.484375,
"learning_rate": 7.455158634712583e-05,
"loss": 0.8304,
"step": 1170
},
{
"epoch": 0.2828598940779971,
"grad_norm": 2.125,
"learning_rate": 7.454587806509992e-05,
"loss": 0.819,
"step": 1175
},
{
"epoch": 0.28406355320173327,
"grad_norm": 2.453125,
"learning_rate": 7.454014506156915e-05,
"loss": 0.8544,
"step": 1180
},
{
"epoch": 0.2852672123254694,
"grad_norm": 2.515625,
"learning_rate": 7.453438734167873e-05,
"loss": 0.8258,
"step": 1185
},
{
"epoch": 0.2864708714492056,
"grad_norm": 2.15625,
"learning_rate": 7.452860491059598e-05,
"loss": 0.8564,
"step": 1190
},
{
"epoch": 0.28767453057294173,
"grad_norm": 2.046875,
"learning_rate": 7.452279777351046e-05,
"loss": 0.8325,
"step": 1195
},
{
"epoch": 0.2888781896966779,
"grad_norm": 2.28125,
"learning_rate": 7.451696593563388e-05,
"loss": 0.8374,
"step": 1200
},
{
"epoch": 0.29008184882041405,
"grad_norm": 2.0,
"learning_rate": 7.451110940220013e-05,
"loss": 0.7921,
"step": 1205
},
{
"epoch": 0.2912855079441502,
"grad_norm": 2.296875,
"learning_rate": 7.450522817846522e-05,
"loss": 0.8379,
"step": 1210
},
{
"epoch": 0.29248916706788636,
"grad_norm": 2.359375,
"learning_rate": 7.449932226970739e-05,
"loss": 0.8362,
"step": 1215
},
{
"epoch": 0.2936928261916225,
"grad_norm": 2.234375,
"learning_rate": 7.449339168122696e-05,
"loss": 0.8319,
"step": 1220
},
{
"epoch": 0.29489648531535867,
"grad_norm": 2.296875,
"learning_rate": 7.448743641834646e-05,
"loss": 0.8261,
"step": 1225
},
{
"epoch": 0.2961001444390948,
"grad_norm": 2.203125,
"learning_rate": 7.448145648641054e-05,
"loss": 0.8369,
"step": 1230
},
{
"epoch": 0.297303803562831,
"grad_norm": 2.421875,
"learning_rate": 7.447545189078597e-05,
"loss": 0.8054,
"step": 1235
},
{
"epoch": 0.29850746268656714,
"grad_norm": 2.203125,
"learning_rate": 7.446942263686169e-05,
"loss": 0.8111,
"step": 1240
},
{
"epoch": 0.29971112181030335,
"grad_norm": 2.15625,
"learning_rate": 7.446336873004875e-05,
"loss": 0.8285,
"step": 1245
},
{
"epoch": 0.3009147809340395,
"grad_norm": 2.3125,
"learning_rate": 7.445729017578033e-05,
"loss": 0.8248,
"step": 1250
},
{
"epoch": 0.30211844005777566,
"grad_norm": 2.25,
"learning_rate": 7.445118697951173e-05,
"loss": 0.8131,
"step": 1255
},
{
"epoch": 0.3033220991815118,
"grad_norm": 1.9765625,
"learning_rate": 7.444505914672035e-05,
"loss": 0.8288,
"step": 1260
},
{
"epoch": 0.304525758305248,
"grad_norm": 2.1875,
"learning_rate": 7.443890668290574e-05,
"loss": 0.7962,
"step": 1265
},
{
"epoch": 0.30572941742898413,
"grad_norm": 2.15625,
"learning_rate": 7.443272959358952e-05,
"loss": 0.8235,
"step": 1270
},
{
"epoch": 0.3069330765527203,
"grad_norm": 2.109375,
"learning_rate": 7.442652788431541e-05,
"loss": 0.8137,
"step": 1275
},
{
"epoch": 0.30813673567645644,
"grad_norm": 2.03125,
"learning_rate": 7.442030156064925e-05,
"loss": 0.7973,
"step": 1280
},
{
"epoch": 0.3093403948001926,
"grad_norm": 2.453125,
"learning_rate": 7.441405062817895e-05,
"loss": 0.8416,
"step": 1285
},
{
"epoch": 0.31054405392392875,
"grad_norm": 2.359375,
"learning_rate": 7.440777509251453e-05,
"loss": 0.8208,
"step": 1290
},
{
"epoch": 0.3117477130476649,
"grad_norm": 2.203125,
"learning_rate": 7.440147495928803e-05,
"loss": 0.8301,
"step": 1295
},
{
"epoch": 0.31295137217140107,
"grad_norm": 2.375,
"learning_rate": 7.439515023415366e-05,
"loss": 0.7933,
"step": 1300
},
{
"epoch": 0.3141550312951372,
"grad_norm": 2.28125,
"learning_rate": 7.438880092278763e-05,
"loss": 0.7935,
"step": 1305
},
{
"epoch": 0.3153586904188734,
"grad_norm": 2.40625,
"learning_rate": 7.438242703088822e-05,
"loss": 0.8092,
"step": 1310
},
{
"epoch": 0.31656234954260953,
"grad_norm": 2.1875,
"learning_rate": 7.43760285641758e-05,
"loss": 0.841,
"step": 1315
},
{
"epoch": 0.3177660086663457,
"grad_norm": 2.109375,
"learning_rate": 7.436960552839279e-05,
"loss": 0.8307,
"step": 1320
},
{
"epoch": 0.31896966779008185,
"grad_norm": 2.1875,
"learning_rate": 7.436315792930362e-05,
"loss": 0.823,
"step": 1325
},
{
"epoch": 0.320173326913818,
"grad_norm": 2.28125,
"learning_rate": 7.435668577269483e-05,
"loss": 0.8125,
"step": 1330
},
{
"epoch": 0.32137698603755416,
"grad_norm": 2.1875,
"learning_rate": 7.435018906437495e-05,
"loss": 0.8152,
"step": 1335
},
{
"epoch": 0.3225806451612903,
"grad_norm": 2.15625,
"learning_rate": 7.434366781017453e-05,
"loss": 0.7877,
"step": 1340
},
{
"epoch": 0.32378430428502647,
"grad_norm": 2.359375,
"learning_rate": 7.433712201594622e-05,
"loss": 0.7896,
"step": 1345
},
{
"epoch": 0.3249879634087626,
"grad_norm": 2.1875,
"learning_rate": 7.433055168756462e-05,
"loss": 0.7763,
"step": 1350
},
{
"epoch": 0.3261916225324988,
"grad_norm": 2.171875,
"learning_rate": 7.432395683092641e-05,
"loss": 0.8121,
"step": 1355
},
{
"epoch": 0.32739528165623494,
"grad_norm": 2.21875,
"learning_rate": 7.431733745195025e-05,
"loss": 0.7965,
"step": 1360
},
{
"epoch": 0.3285989407799711,
"grad_norm": 2.390625,
"learning_rate": 7.431069355657676e-05,
"loss": 0.8458,
"step": 1365
},
{
"epoch": 0.32980259990370725,
"grad_norm": 2.09375,
"learning_rate": 7.430402515076869e-05,
"loss": 0.7621,
"step": 1370
},
{
"epoch": 0.3310062590274434,
"grad_norm": 2.109375,
"learning_rate": 7.429733224051065e-05,
"loss": 0.8226,
"step": 1375
},
{
"epoch": 0.33220991815117956,
"grad_norm": 2.1875,
"learning_rate": 7.429061483180935e-05,
"loss": 0.7758,
"step": 1380
},
{
"epoch": 0.3334135772749157,
"grad_norm": 2.265625,
"learning_rate": 7.428387293069341e-05,
"loss": 0.7796,
"step": 1385
},
{
"epoch": 0.33461723639865193,
"grad_norm": 2.578125,
"learning_rate": 7.427710654321345e-05,
"loss": 0.8098,
"step": 1390
},
{
"epoch": 0.3358208955223881,
"grad_norm": 2.15625,
"learning_rate": 7.427031567544212e-05,
"loss": 0.8161,
"step": 1395
},
{
"epoch": 0.33702455464612424,
"grad_norm": 2.28125,
"learning_rate": 7.426350033347396e-05,
"loss": 0.8314,
"step": 1400
},
{
"epoch": 0.3382282137698604,
"grad_norm": 2.203125,
"learning_rate": 7.425666052342554e-05,
"loss": 0.7734,
"step": 1405
},
{
"epoch": 0.33943187289359655,
"grad_norm": 2.078125,
"learning_rate": 7.424979625143531e-05,
"loss": 0.8005,
"step": 1410
},
{
"epoch": 0.3406355320173327,
"grad_norm": 2.375,
"learning_rate": 7.424290752366379e-05,
"loss": 0.8085,
"step": 1415
},
{
"epoch": 0.34183919114106887,
"grad_norm": 2.1875,
"learning_rate": 7.423599434629334e-05,
"loss": 0.81,
"step": 1420
},
{
"epoch": 0.343042850264805,
"grad_norm": 2.328125,
"learning_rate": 7.422905672552831e-05,
"loss": 0.8262,
"step": 1425
},
{
"epoch": 0.3442465093885412,
"grad_norm": 2.203125,
"learning_rate": 7.4222094667595e-05,
"loss": 0.7969,
"step": 1430
},
{
"epoch": 0.34545016851227733,
"grad_norm": 2.125,
"learning_rate": 7.421510817874162e-05,
"loss": 0.8157,
"step": 1435
},
{
"epoch": 0.3466538276360135,
"grad_norm": 2.15625,
"learning_rate": 7.42080972652383e-05,
"loss": 0.791,
"step": 1440
},
{
"epoch": 0.34785748675974965,
"grad_norm": 2.109375,
"learning_rate": 7.42010619333771e-05,
"loss": 0.7623,
"step": 1445
},
{
"epoch": 0.3490611458834858,
"grad_norm": 2.203125,
"learning_rate": 7.419400218947201e-05,
"loss": 0.7848,
"step": 1450
},
{
"epoch": 0.35026480500722196,
"grad_norm": 2.171875,
"learning_rate": 7.41869180398589e-05,
"loss": 0.77,
"step": 1455
},
{
"epoch": 0.3514684641309581,
"grad_norm": 2.546875,
"learning_rate": 7.417980949089556e-05,
"loss": 0.7763,
"step": 1460
},
{
"epoch": 0.35267212325469427,
"grad_norm": 2.28125,
"learning_rate": 7.417267654896169e-05,
"loss": 0.7987,
"step": 1465
},
{
"epoch": 0.3538757823784304,
"grad_norm": 2.390625,
"learning_rate": 7.416551922045884e-05,
"loss": 0.8275,
"step": 1470
},
{
"epoch": 0.3550794415021666,
"grad_norm": 2.25,
"learning_rate": 7.415833751181048e-05,
"loss": 0.811,
"step": 1475
},
{
"epoch": 0.35628310062590274,
"grad_norm": 2.125,
"learning_rate": 7.415113142946199e-05,
"loss": 0.7969,
"step": 1480
},
{
"epoch": 0.3574867597496389,
"grad_norm": 2.40625,
"learning_rate": 7.414390097988053e-05,
"loss": 0.7832,
"step": 1485
},
{
"epoch": 0.35869041887337505,
"grad_norm": 2.171875,
"learning_rate": 7.413664616955524e-05,
"loss": 0.7666,
"step": 1490
},
{
"epoch": 0.3598940779971112,
"grad_norm": 2.1875,
"learning_rate": 7.412936700499703e-05,
"loss": 0.7793,
"step": 1495
},
{
"epoch": 0.36109773712084736,
"grad_norm": 2.234375,
"learning_rate": 7.412206349273873e-05,
"loss": 0.7734,
"step": 1500
},
{
"epoch": 0.36109773712084736,
"eval_loss": 0.687716543674469,
"eval_runtime": 2.4175,
"eval_samples_per_second": 82.729,
"eval_steps_per_second": 82.729,
"step": 1500
},
{
"epoch": 0.3623013962445835,
"grad_norm": 2.28125,
"learning_rate": 7.411473563933497e-05,
"loss": 0.8028,
"step": 1505
},
{
"epoch": 0.3635050553683197,
"grad_norm": 2.203125,
"learning_rate": 7.410738345136231e-05,
"loss": 0.7837,
"step": 1510
},
{
"epoch": 0.36470871449205583,
"grad_norm": 2.203125,
"learning_rate": 7.410000693541903e-05,
"loss": 0.7968,
"step": 1515
},
{
"epoch": 0.365912373615792,
"grad_norm": 2.125,
"learning_rate": 7.409260609812534e-05,
"loss": 0.7674,
"step": 1520
},
{
"epoch": 0.36711603273952814,
"grad_norm": 2.203125,
"learning_rate": 7.408518094612324e-05,
"loss": 0.7536,
"step": 1525
},
{
"epoch": 0.3683196918632643,
"grad_norm": 2.1875,
"learning_rate": 7.407773148607656e-05,
"loss": 0.8126,
"step": 1530
},
{
"epoch": 0.36952335098700045,
"grad_norm": 2.359375,
"learning_rate": 7.407025772467092e-05,
"loss": 0.8111,
"step": 1535
},
{
"epoch": 0.37072701011073667,
"grad_norm": 2.265625,
"learning_rate": 7.406275966861379e-05,
"loss": 0.8091,
"step": 1540
},
{
"epoch": 0.3719306692344728,
"grad_norm": 2.203125,
"learning_rate": 7.405523732463444e-05,
"loss": 0.7743,
"step": 1545
},
{
"epoch": 0.373134328358209,
"grad_norm": 2.328125,
"learning_rate": 7.404769069948389e-05,
"loss": 0.7793,
"step": 1550
},
{
"epoch": 0.37433798748194513,
"grad_norm": 2.3125,
"learning_rate": 7.404011979993499e-05,
"loss": 0.7935,
"step": 1555
},
{
"epoch": 0.3755416466056813,
"grad_norm": 2.140625,
"learning_rate": 7.403252463278238e-05,
"loss": 0.7894,
"step": 1560
},
{
"epoch": 0.37674530572941745,
"grad_norm": 2.265625,
"learning_rate": 7.402490520484246e-05,
"loss": 0.7806,
"step": 1565
},
{
"epoch": 0.3779489648531536,
"grad_norm": 2.15625,
"learning_rate": 7.401726152295342e-05,
"loss": 0.8119,
"step": 1570
},
{
"epoch": 0.37915262397688976,
"grad_norm": 2.25,
"learning_rate": 7.40095935939752e-05,
"loss": 0.7975,
"step": 1575
},
{
"epoch": 0.3803562831006259,
"grad_norm": 2.28125,
"learning_rate": 7.400190142478953e-05,
"loss": 0.7802,
"step": 1580
},
{
"epoch": 0.38155994222436207,
"grad_norm": 2.203125,
"learning_rate": 7.399418502229986e-05,
"loss": 0.7909,
"step": 1585
},
{
"epoch": 0.3827636013480982,
"grad_norm": 2.375,
"learning_rate": 7.398644439343139e-05,
"loss": 0.8037,
"step": 1590
},
{
"epoch": 0.3839672604718344,
"grad_norm": 2.421875,
"learning_rate": 7.397867954513109e-05,
"loss": 0.7849,
"step": 1595
},
{
"epoch": 0.38517091959557054,
"grad_norm": 2.140625,
"learning_rate": 7.397089048436767e-05,
"loss": 0.7871,
"step": 1600
},
{
"epoch": 0.3863745787193067,
"grad_norm": 2.09375,
"learning_rate": 7.396307721813152e-05,
"loss": 0.7793,
"step": 1605
},
{
"epoch": 0.38757823784304285,
"grad_norm": 2.234375,
"learning_rate": 7.395523975343479e-05,
"loss": 0.7851,
"step": 1610
},
{
"epoch": 0.388781896966779,
"grad_norm": 2.21875,
"learning_rate": 7.394737809731136e-05,
"loss": 0.797,
"step": 1615
},
{
"epoch": 0.38998555609051516,
"grad_norm": 1.9296875,
"learning_rate": 7.39394922568168e-05,
"loss": 0.7627,
"step": 1620
},
{
"epoch": 0.3911892152142513,
"grad_norm": 2.078125,
"learning_rate": 7.393158223902837e-05,
"loss": 0.8324,
"step": 1625
},
{
"epoch": 0.3923928743379875,
"grad_norm": 2.109375,
"learning_rate": 7.392364805104507e-05,
"loss": 0.7787,
"step": 1630
},
{
"epoch": 0.39359653346172363,
"grad_norm": 2.265625,
"learning_rate": 7.391568969998755e-05,
"loss": 0.7932,
"step": 1635
},
{
"epoch": 0.3948001925854598,
"grad_norm": 2.34375,
"learning_rate": 7.390770719299817e-05,
"loss": 0.801,
"step": 1640
},
{
"epoch": 0.39600385170919594,
"grad_norm": 2.25,
"learning_rate": 7.389970053724096e-05,
"loss": 0.7666,
"step": 1645
},
{
"epoch": 0.3972075108329321,
"grad_norm": 2.296875,
"learning_rate": 7.389166973990165e-05,
"loss": 0.7781,
"step": 1650
},
{
"epoch": 0.39841116995666825,
"grad_norm": 2.15625,
"learning_rate": 7.388361480818758e-05,
"loss": 0.7947,
"step": 1655
},
{
"epoch": 0.3996148290804044,
"grad_norm": 1.9921875,
"learning_rate": 7.38755357493278e-05,
"loss": 0.7934,
"step": 1660
},
{
"epoch": 0.40081848820414057,
"grad_norm": 2.203125,
"learning_rate": 7.386743257057299e-05,
"loss": 0.769,
"step": 1665
},
{
"epoch": 0.4020221473278767,
"grad_norm": 2.203125,
"learning_rate": 7.385930527919548e-05,
"loss": 0.7539,
"step": 1670
},
{
"epoch": 0.4032258064516129,
"grad_norm": 2.234375,
"learning_rate": 7.385115388248925e-05,
"loss": 0.7754,
"step": 1675
},
{
"epoch": 0.40442946557534903,
"grad_norm": 2.125,
"learning_rate": 7.384297838776988e-05,
"loss": 0.8041,
"step": 1680
},
{
"epoch": 0.40563312469908525,
"grad_norm": 2.265625,
"learning_rate": 7.383477880237465e-05,
"loss": 0.7606,
"step": 1685
},
{
"epoch": 0.4068367838228214,
"grad_norm": 2.234375,
"learning_rate": 7.382655513366237e-05,
"loss": 0.7865,
"step": 1690
},
{
"epoch": 0.40804044294655756,
"grad_norm": 1.953125,
"learning_rate": 7.381830738901354e-05,
"loss": 0.7656,
"step": 1695
},
{
"epoch": 0.4092441020702937,
"grad_norm": 2.15625,
"learning_rate": 7.381003557583022e-05,
"loss": 0.76,
"step": 1700
},
{
"epoch": 0.41044776119402987,
"grad_norm": 2.234375,
"learning_rate": 7.380173970153607e-05,
"loss": 0.793,
"step": 1705
},
{
"epoch": 0.411651420317766,
"grad_norm": 2.140625,
"learning_rate": 7.37934197735764e-05,
"loss": 0.756,
"step": 1710
},
{
"epoch": 0.4128550794415022,
"grad_norm": 2.15625,
"learning_rate": 7.378507579941802e-05,
"loss": 0.7674,
"step": 1715
},
{
"epoch": 0.41405873856523834,
"grad_norm": 2.171875,
"learning_rate": 7.377670778654941e-05,
"loss": 0.7861,
"step": 1720
},
{
"epoch": 0.4152623976889745,
"grad_norm": 2.375,
"learning_rate": 7.376831574248056e-05,
"loss": 0.7743,
"step": 1725
},
{
"epoch": 0.41646605681271065,
"grad_norm": 2.03125,
"learning_rate": 7.375989967474304e-05,
"loss": 0.7511,
"step": 1730
},
{
"epoch": 0.4176697159364468,
"grad_norm": 2.296875,
"learning_rate": 7.375145959089001e-05,
"loss": 0.7772,
"step": 1735
},
{
"epoch": 0.41887337506018296,
"grad_norm": 2.125,
"learning_rate": 7.374299549849616e-05,
"loss": 0.7708,
"step": 1740
},
{
"epoch": 0.4200770341839191,
"grad_norm": 2.125,
"learning_rate": 7.373450740515772e-05,
"loss": 0.7664,
"step": 1745
},
{
"epoch": 0.4212806933076553,
"grad_norm": 2.171875,
"learning_rate": 7.372599531849249e-05,
"loss": 0.7721,
"step": 1750
},
{
"epoch": 0.42248435243139143,
"grad_norm": 2.09375,
"learning_rate": 7.371745924613975e-05,
"loss": 0.7751,
"step": 1755
},
{
"epoch": 0.4236880115551276,
"grad_norm": 2.1875,
"learning_rate": 7.370889919576037e-05,
"loss": 0.7575,
"step": 1760
},
{
"epoch": 0.42489167067886374,
"grad_norm": 2.0,
"learning_rate": 7.370031517503668e-05,
"loss": 0.7773,
"step": 1765
},
{
"epoch": 0.4260953298025999,
"grad_norm": 2.234375,
"learning_rate": 7.36917071916726e-05,
"loss": 0.7559,
"step": 1770
},
{
"epoch": 0.42729898892633605,
"grad_norm": 2.15625,
"learning_rate": 7.368307525339345e-05,
"loss": 0.7386,
"step": 1775
},
{
"epoch": 0.4285026480500722,
"grad_norm": 2.328125,
"learning_rate": 7.367441936794613e-05,
"loss": 0.7575,
"step": 1780
},
{
"epoch": 0.42970630717380837,
"grad_norm": 2.1875,
"learning_rate": 7.366573954309902e-05,
"loss": 0.7845,
"step": 1785
},
{
"epoch": 0.4309099662975445,
"grad_norm": 2.421875,
"learning_rate": 7.365703578664196e-05,
"loss": 0.8023,
"step": 1790
},
{
"epoch": 0.4321136254212807,
"grad_norm": 2.046875,
"learning_rate": 7.364830810638628e-05,
"loss": 0.7781,
"step": 1795
},
{
"epoch": 0.43331728454501683,
"grad_norm": 2.046875,
"learning_rate": 7.36395565101648e-05,
"loss": 0.7705,
"step": 1800
},
{
"epoch": 0.434520943668753,
"grad_norm": 2.203125,
"learning_rate": 7.363078100583177e-05,
"loss": 0.8125,
"step": 1805
},
{
"epoch": 0.43572460279248915,
"grad_norm": 2.09375,
"learning_rate": 7.36219816012629e-05,
"loss": 0.7666,
"step": 1810
},
{
"epoch": 0.4369282619162253,
"grad_norm": 2.21875,
"learning_rate": 7.361315830435537e-05,
"loss": 0.7514,
"step": 1815
},
{
"epoch": 0.43813192103996146,
"grad_norm": 2.296875,
"learning_rate": 7.360431112302781e-05,
"loss": 0.7494,
"step": 1820
},
{
"epoch": 0.4393355801636976,
"grad_norm": 2.25,
"learning_rate": 7.359544006522026e-05,
"loss": 0.7663,
"step": 1825
},
{
"epoch": 0.4405392392874338,
"grad_norm": 2.15625,
"learning_rate": 7.358654513889417e-05,
"loss": 0.7493,
"step": 1830
},
{
"epoch": 0.44174289841117,
"grad_norm": 2.1875,
"learning_rate": 7.357762635203247e-05,
"loss": 0.7722,
"step": 1835
},
{
"epoch": 0.44294655753490614,
"grad_norm": 2.15625,
"learning_rate": 7.35686837126395e-05,
"loss": 0.7896,
"step": 1840
},
{
"epoch": 0.4441502166586423,
"grad_norm": 1.8984375,
"learning_rate": 7.355971722874091e-05,
"loss": 0.7486,
"step": 1845
},
{
"epoch": 0.44535387578237845,
"grad_norm": 2.125,
"learning_rate": 7.355072690838387e-05,
"loss": 0.7846,
"step": 1850
},
{
"epoch": 0.4465575349061146,
"grad_norm": 2.21875,
"learning_rate": 7.354171275963688e-05,
"loss": 0.7665,
"step": 1855
},
{
"epoch": 0.44776119402985076,
"grad_norm": 2.171875,
"learning_rate": 7.353267479058982e-05,
"loss": 0.7758,
"step": 1860
},
{
"epoch": 0.4489648531535869,
"grad_norm": 2.15625,
"learning_rate": 7.3523613009354e-05,
"loss": 0.723,
"step": 1865
},
{
"epoch": 0.4501685122773231,
"grad_norm": 2.359375,
"learning_rate": 7.351452742406204e-05,
"loss": 0.7733,
"step": 1870
},
{
"epoch": 0.45137217140105923,
"grad_norm": 2.296875,
"learning_rate": 7.350541804286795e-05,
"loss": 0.7683,
"step": 1875
},
{
"epoch": 0.4525758305247954,
"grad_norm": 1.9609375,
"learning_rate": 7.34962848739471e-05,
"loss": 0.7656,
"step": 1880
},
{
"epoch": 0.45377948964853154,
"grad_norm": 2.25,
"learning_rate": 7.348712792549623e-05,
"loss": 0.7732,
"step": 1885
},
{
"epoch": 0.4549831487722677,
"grad_norm": 2.578125,
"learning_rate": 7.347794720573334e-05,
"loss": 0.7221,
"step": 1890
},
{
"epoch": 0.45618680789600385,
"grad_norm": 2.171875,
"learning_rate": 7.346874272289787e-05,
"loss": 0.728,
"step": 1895
},
{
"epoch": 0.45739046701974,
"grad_norm": 2.359375,
"learning_rate": 7.34595144852505e-05,
"loss": 0.8017,
"step": 1900
},
{
"epoch": 0.45859412614347617,
"grad_norm": 2.3125,
"learning_rate": 7.345026250107328e-05,
"loss": 0.7741,
"step": 1905
},
{
"epoch": 0.4597977852672123,
"grad_norm": 2.234375,
"learning_rate": 7.344098677866956e-05,
"loss": 0.7762,
"step": 1910
},
{
"epoch": 0.4610014443909485,
"grad_norm": 2.28125,
"learning_rate": 7.343168732636399e-05,
"loss": 0.7609,
"step": 1915
},
{
"epoch": 0.46220510351468463,
"grad_norm": 2.28125,
"learning_rate": 7.342236415250251e-05,
"loss": 0.7588,
"step": 1920
},
{
"epoch": 0.4634087626384208,
"grad_norm": 2.125,
"learning_rate": 7.341301726545236e-05,
"loss": 0.7907,
"step": 1925
},
{
"epoch": 0.46461242176215695,
"grad_norm": 2.1875,
"learning_rate": 7.340364667360207e-05,
"loss": 0.7583,
"step": 1930
},
{
"epoch": 0.4658160808858931,
"grad_norm": 2.109375,
"learning_rate": 7.339425238536141e-05,
"loss": 0.7541,
"step": 1935
},
{
"epoch": 0.46701974000962926,
"grad_norm": 2.09375,
"learning_rate": 7.338483440916145e-05,
"loss": 0.7562,
"step": 1940
},
{
"epoch": 0.4682233991333654,
"grad_norm": 2.34375,
"learning_rate": 7.337539275345452e-05,
"loss": 0.7563,
"step": 1945
},
{
"epoch": 0.46942705825710157,
"grad_norm": 2.171875,
"learning_rate": 7.336592742671419e-05,
"loss": 0.7385,
"step": 1950
},
{
"epoch": 0.4706307173808377,
"grad_norm": 2.0625,
"learning_rate": 7.335643843743526e-05,
"loss": 0.7353,
"step": 1955
},
{
"epoch": 0.4718343765045739,
"grad_norm": 2.1875,
"learning_rate": 7.334692579413379e-05,
"loss": 0.7242,
"step": 1960
},
{
"epoch": 0.47303803562831004,
"grad_norm": 2.375,
"learning_rate": 7.333738950534705e-05,
"loss": 0.7719,
"step": 1965
},
{
"epoch": 0.4742416947520462,
"grad_norm": 2.125,
"learning_rate": 7.332782957963356e-05,
"loss": 0.7788,
"step": 1970
},
{
"epoch": 0.4754453538757824,
"grad_norm": 2.171875,
"learning_rate": 7.3318246025573e-05,
"loss": 0.7635,
"step": 1975
},
{
"epoch": 0.47664901299951856,
"grad_norm": 2.234375,
"learning_rate": 7.330863885176631e-05,
"loss": 0.7608,
"step": 1980
},
{
"epoch": 0.4778526721232547,
"grad_norm": 2.1875,
"learning_rate": 7.329900806683563e-05,
"loss": 0.7329,
"step": 1985
},
{
"epoch": 0.4790563312469909,
"grad_norm": 2.09375,
"learning_rate": 7.328935367942422e-05,
"loss": 0.751,
"step": 1990
},
{
"epoch": 0.48025999037072703,
"grad_norm": 2.171875,
"learning_rate": 7.32796756981966e-05,
"loss": 0.7366,
"step": 1995
},
{
"epoch": 0.4814636494944632,
"grad_norm": 2.078125,
"learning_rate": 7.326997413183845e-05,
"loss": 0.7259,
"step": 2000
},
{
"epoch": 0.4814636494944632,
"eval_loss": 0.6541061997413635,
"eval_runtime": 2.4161,
"eval_samples_per_second": 82.778,
"eval_steps_per_second": 82.778,
"step": 2000
},
{
"epoch": 0.48266730861819934,
"grad_norm": 2.21875,
"learning_rate": 7.326024898905656e-05,
"loss": 0.7437,
"step": 2005
},
{
"epoch": 0.4838709677419355,
"grad_norm": 2.15625,
"learning_rate": 7.325050027857896e-05,
"loss": 0.7322,
"step": 2010
},
{
"epoch": 0.48507462686567165,
"grad_norm": 2.09375,
"learning_rate": 7.324072800915476e-05,
"loss": 0.7525,
"step": 2015
},
{
"epoch": 0.4862782859894078,
"grad_norm": 2.203125,
"learning_rate": 7.323093218955426e-05,
"loss": 0.7395,
"step": 2020
},
{
"epoch": 0.48748194511314397,
"grad_norm": 2.140625,
"learning_rate": 7.322111282856888e-05,
"loss": 0.7477,
"step": 2025
},
{
"epoch": 0.4886856042368801,
"grad_norm": 2.234375,
"learning_rate": 7.321126993501118e-05,
"loss": 0.7167,
"step": 2030
},
{
"epoch": 0.4898892633606163,
"grad_norm": 2.484375,
"learning_rate": 7.32014035177148e-05,
"loss": 0.7711,
"step": 2035
},
{
"epoch": 0.49109292248435243,
"grad_norm": 2.4375,
"learning_rate": 7.319151358553453e-05,
"loss": 0.7454,
"step": 2040
},
{
"epoch": 0.4922965816080886,
"grad_norm": 2.328125,
"learning_rate": 7.318160014734628e-05,
"loss": 0.7272,
"step": 2045
},
{
"epoch": 0.49350024073182475,
"grad_norm": 2.25,
"learning_rate": 7.3171663212047e-05,
"loss": 0.7585,
"step": 2050
},
{
"epoch": 0.4947038998555609,
"grad_norm": 2.140625,
"learning_rate": 7.316170278855475e-05,
"loss": 0.7301,
"step": 2055
},
{
"epoch": 0.49590755897929706,
"grad_norm": 2.203125,
"learning_rate": 7.315171888580872e-05,
"loss": 0.7209,
"step": 2060
},
{
"epoch": 0.4971112181030332,
"grad_norm": 2.21875,
"learning_rate": 7.314171151276908e-05,
"loss": 0.7412,
"step": 2065
},
{
"epoch": 0.49831487722676937,
"grad_norm": 2.125,
"learning_rate": 7.313168067841716e-05,
"loss": 0.7563,
"step": 2070
},
{
"epoch": 0.4995185363505055,
"grad_norm": 2.0625,
"learning_rate": 7.312162639175524e-05,
"loss": 0.7186,
"step": 2075
},
{
"epoch": 0.5007221954742417,
"grad_norm": 2.21875,
"learning_rate": 7.311154866180677e-05,
"loss": 0.7328,
"step": 2080
},
{
"epoch": 0.5019258545979779,
"grad_norm": 2.046875,
"learning_rate": 7.310144749761613e-05,
"loss": 0.7683,
"step": 2085
},
{
"epoch": 0.503129513721714,
"grad_norm": 2.09375,
"learning_rate": 7.30913229082488e-05,
"loss": 0.7706,
"step": 2090
},
{
"epoch": 0.5043331728454502,
"grad_norm": 1.9140625,
"learning_rate": 7.308117490279124e-05,
"loss": 0.7109,
"step": 2095
},
{
"epoch": 0.5055368319691863,
"grad_norm": 2.046875,
"learning_rate": 7.307100349035097e-05,
"loss": 0.7755,
"step": 2100
},
{
"epoch": 0.5067404910929225,
"grad_norm": 2.046875,
"learning_rate": 7.306080868005648e-05,
"loss": 0.7243,
"step": 2105
},
{
"epoch": 0.5079441502166586,
"grad_norm": 2.125,
"learning_rate": 7.305059048105727e-05,
"loss": 0.7462,
"step": 2110
},
{
"epoch": 0.5091478093403948,
"grad_norm": 2.1875,
"learning_rate": 7.304034890252383e-05,
"loss": 0.7665,
"step": 2115
},
{
"epoch": 0.5103514684641309,
"grad_norm": 2.265625,
"learning_rate": 7.303008395364765e-05,
"loss": 0.7395,
"step": 2120
},
{
"epoch": 0.5115551275878671,
"grad_norm": 2.109375,
"learning_rate": 7.301979564364117e-05,
"loss": 0.7747,
"step": 2125
},
{
"epoch": 0.5127587867116032,
"grad_norm": 2.25,
"learning_rate": 7.300948398173779e-05,
"loss": 0.6931,
"step": 2130
},
{
"epoch": 0.5139624458353395,
"grad_norm": 2.203125,
"learning_rate": 7.299914897719191e-05,
"loss": 0.723,
"step": 2135
},
{
"epoch": 0.5151661049590756,
"grad_norm": 2.03125,
"learning_rate": 7.298879063927882e-05,
"loss": 0.7726,
"step": 2140
},
{
"epoch": 0.5163697640828118,
"grad_norm": 2.125,
"learning_rate": 7.297840897729481e-05,
"loss": 0.7356,
"step": 2145
},
{
"epoch": 0.5175734232065479,
"grad_norm": 2.109375,
"learning_rate": 7.296800400055706e-05,
"loss": 0.7247,
"step": 2150
},
{
"epoch": 0.5187770823302841,
"grad_norm": 2.15625,
"learning_rate": 7.295757571840368e-05,
"loss": 0.7482,
"step": 2155
},
{
"epoch": 0.5199807414540202,
"grad_norm": 2.0,
"learning_rate": 7.294712414019372e-05,
"loss": 0.7282,
"step": 2160
},
{
"epoch": 0.5211844005777564,
"grad_norm": 2.25,
"learning_rate": 7.293664927530712e-05,
"loss": 0.757,
"step": 2165
},
{
"epoch": 0.5223880597014925,
"grad_norm": 2.125,
"learning_rate": 7.292615113314472e-05,
"loss": 0.7544,
"step": 2170
},
{
"epoch": 0.5235917188252287,
"grad_norm": 2.078125,
"learning_rate": 7.291562972312825e-05,
"loss": 0.7363,
"step": 2175
},
{
"epoch": 0.5247953779489648,
"grad_norm": 2.09375,
"learning_rate": 7.290508505470032e-05,
"loss": 0.7396,
"step": 2180
},
{
"epoch": 0.525999037072701,
"grad_norm": 2.03125,
"learning_rate": 7.289451713732443e-05,
"loss": 0.7563,
"step": 2185
},
{
"epoch": 0.5272026961964371,
"grad_norm": 2.046875,
"learning_rate": 7.288392598048492e-05,
"loss": 0.7385,
"step": 2190
},
{
"epoch": 0.5284063553201733,
"grad_norm": 2.0625,
"learning_rate": 7.2873311593687e-05,
"loss": 0.7356,
"step": 2195
},
{
"epoch": 0.5296100144439095,
"grad_norm": 2.171875,
"learning_rate": 7.286267398645673e-05,
"loss": 0.7428,
"step": 2200
},
{
"epoch": 0.5308136735676456,
"grad_norm": 2.265625,
"learning_rate": 7.285201316834101e-05,
"loss": 0.7507,
"step": 2205
},
{
"epoch": 0.5320173326913819,
"grad_norm": 2.109375,
"learning_rate": 7.284132914890758e-05,
"loss": 0.7333,
"step": 2210
},
{
"epoch": 0.533220991815118,
"grad_norm": 2.140625,
"learning_rate": 7.283062193774495e-05,
"loss": 0.7249,
"step": 2215
},
{
"epoch": 0.5344246509388542,
"grad_norm": 2.21875,
"learning_rate": 7.281989154446253e-05,
"loss": 0.7518,
"step": 2220
},
{
"epoch": 0.5356283100625903,
"grad_norm": 2.203125,
"learning_rate": 7.280913797869046e-05,
"loss": 0.7485,
"step": 2225
},
{
"epoch": 0.5368319691863265,
"grad_norm": 2.40625,
"learning_rate": 7.279836125007971e-05,
"loss": 0.7355,
"step": 2230
},
{
"epoch": 0.5380356283100626,
"grad_norm": 2.03125,
"learning_rate": 7.278756136830206e-05,
"loss": 0.7594,
"step": 2235
},
{
"epoch": 0.5392392874337988,
"grad_norm": 2.25,
"learning_rate": 7.277673834305001e-05,
"loss": 0.7225,
"step": 2240
},
{
"epoch": 0.5404429465575349,
"grad_norm": 2.0625,
"learning_rate": 7.276589218403688e-05,
"loss": 0.7132,
"step": 2245
},
{
"epoch": 0.5416466056812711,
"grad_norm": 2.015625,
"learning_rate": 7.275502290099672e-05,
"loss": 0.7118,
"step": 2250
},
{
"epoch": 0.5428502648050072,
"grad_norm": 2.046875,
"learning_rate": 7.274413050368438e-05,
"loss": 0.734,
"step": 2255
},
{
"epoch": 0.5440539239287434,
"grad_norm": 2.078125,
"learning_rate": 7.273321500187538e-05,
"loss": 0.7491,
"step": 2260
},
{
"epoch": 0.5452575830524795,
"grad_norm": 2.15625,
"learning_rate": 7.272227640536604e-05,
"loss": 0.7673,
"step": 2265
},
{
"epoch": 0.5464612421762157,
"grad_norm": 2.03125,
"learning_rate": 7.271131472397339e-05,
"loss": 0.7483,
"step": 2270
},
{
"epoch": 0.5476649012999518,
"grad_norm": 2.34375,
"learning_rate": 7.270032996753517e-05,
"loss": 0.7284,
"step": 2275
},
{
"epoch": 0.548868560423688,
"grad_norm": 2.203125,
"learning_rate": 7.268932214590982e-05,
"loss": 0.7643,
"step": 2280
},
{
"epoch": 0.5500722195474241,
"grad_norm": 2.0625,
"learning_rate": 7.267829126897652e-05,
"loss": 0.7348,
"step": 2285
},
{
"epoch": 0.5512758786711603,
"grad_norm": 2.078125,
"learning_rate": 7.266723734663508e-05,
"loss": 0.7307,
"step": 2290
},
{
"epoch": 0.5524795377948964,
"grad_norm": 1.8828125,
"learning_rate": 7.265616038880603e-05,
"loss": 0.7181,
"step": 2295
},
{
"epoch": 0.5536831969186327,
"grad_norm": 2.09375,
"learning_rate": 7.26450604054306e-05,
"loss": 0.7386,
"step": 2300
},
{
"epoch": 0.5548868560423688,
"grad_norm": 2.3125,
"learning_rate": 7.263393740647062e-05,
"loss": 0.7537,
"step": 2305
},
{
"epoch": 0.556090515166105,
"grad_norm": 2.28125,
"learning_rate": 7.262279140190863e-05,
"loss": 0.7102,
"step": 2310
},
{
"epoch": 0.5572941742898411,
"grad_norm": 2.1875,
"learning_rate": 7.261162240174778e-05,
"loss": 0.7147,
"step": 2315
},
{
"epoch": 0.5584978334135773,
"grad_norm": 2.1875,
"learning_rate": 7.260043041601189e-05,
"loss": 0.7572,
"step": 2320
},
{
"epoch": 0.5597014925373134,
"grad_norm": 2.046875,
"learning_rate": 7.258921545474539e-05,
"loss": 0.7161,
"step": 2325
},
{
"epoch": 0.5609051516610496,
"grad_norm": 1.9453125,
"learning_rate": 7.257797752801332e-05,
"loss": 0.7251,
"step": 2330
},
{
"epoch": 0.5621088107847857,
"grad_norm": 2.0,
"learning_rate": 7.256671664590136e-05,
"loss": 0.6989,
"step": 2335
},
{
"epoch": 0.5633124699085219,
"grad_norm": 2.203125,
"learning_rate": 7.255543281851577e-05,
"loss": 0.753,
"step": 2340
},
{
"epoch": 0.5645161290322581,
"grad_norm": 2.046875,
"learning_rate": 7.25441260559834e-05,
"loss": 0.7316,
"step": 2345
},
{
"epoch": 0.5657197881559942,
"grad_norm": 2.296875,
"learning_rate": 7.253279636845171e-05,
"loss": 0.7296,
"step": 2350
},
{
"epoch": 0.5669234472797304,
"grad_norm": 2.15625,
"learning_rate": 7.252144376608869e-05,
"loss": 0.6987,
"step": 2355
},
{
"epoch": 0.5681271064034665,
"grad_norm": 2.1875,
"learning_rate": 7.251006825908295e-05,
"loss": 0.7098,
"step": 2360
},
{
"epoch": 0.5693307655272027,
"grad_norm": 1.9609375,
"learning_rate": 7.24986698576436e-05,
"loss": 0.6956,
"step": 2365
},
{
"epoch": 0.5705344246509388,
"grad_norm": 1.984375,
"learning_rate": 7.248724857200034e-05,
"loss": 0.6961,
"step": 2370
},
{
"epoch": 0.571738083774675,
"grad_norm": 2.1875,
"learning_rate": 7.24758044124034e-05,
"loss": 0.7157,
"step": 2375
},
{
"epoch": 0.5729417428984112,
"grad_norm": 1.9453125,
"learning_rate": 7.246433738912352e-05,
"loss": 0.7143,
"step": 2380
},
{
"epoch": 0.5741454020221474,
"grad_norm": 2.125,
"learning_rate": 7.245284751245195e-05,
"loss": 0.726,
"step": 2385
},
{
"epoch": 0.5753490611458835,
"grad_norm": 2.046875,
"learning_rate": 7.24413347927005e-05,
"loss": 0.7714,
"step": 2390
},
{
"epoch": 0.5765527202696197,
"grad_norm": 2.171875,
"learning_rate": 7.242979924020144e-05,
"loss": 0.7224,
"step": 2395
},
{
"epoch": 0.5777563793933558,
"grad_norm": 2.203125,
"learning_rate": 7.241824086530754e-05,
"loss": 0.7367,
"step": 2400
},
{
"epoch": 0.578960038517092,
"grad_norm": 2.125,
"learning_rate": 7.240665967839207e-05,
"loss": 0.7353,
"step": 2405
},
{
"epoch": 0.5801636976408281,
"grad_norm": 2.03125,
"learning_rate": 7.239505568984874e-05,
"loss": 0.6976,
"step": 2410
},
{
"epoch": 0.5813673567645643,
"grad_norm": 2.0625,
"learning_rate": 7.238342891009176e-05,
"loss": 0.6909,
"step": 2415
},
{
"epoch": 0.5825710158883004,
"grad_norm": 2.140625,
"learning_rate": 7.237177934955575e-05,
"loss": 0.749,
"step": 2420
},
{
"epoch": 0.5837746750120366,
"grad_norm": 2.03125,
"learning_rate": 7.236010701869583e-05,
"loss": 0.7254,
"step": 2425
},
{
"epoch": 0.5849783341357727,
"grad_norm": 2.828125,
"learning_rate": 7.23484119279875e-05,
"loss": 0.7448,
"step": 2430
},
{
"epoch": 0.5861819932595089,
"grad_norm": 2.265625,
"learning_rate": 7.233669408792673e-05,
"loss": 0.7108,
"step": 2435
},
{
"epoch": 0.587385652383245,
"grad_norm": 2.15625,
"learning_rate": 7.232495350902989e-05,
"loss": 0.7044,
"step": 2440
},
{
"epoch": 0.5885893115069812,
"grad_norm": 2.171875,
"learning_rate": 7.231319020183376e-05,
"loss": 0.7287,
"step": 2445
},
{
"epoch": 0.5897929706307173,
"grad_norm": 2.171875,
"learning_rate": 7.23014041768955e-05,
"loss": 0.7299,
"step": 2450
},
{
"epoch": 0.5909966297544536,
"grad_norm": 2.296875,
"learning_rate": 7.228959544479267e-05,
"loss": 0.7104,
"step": 2455
},
{
"epoch": 0.5922002888781897,
"grad_norm": 1.8671875,
"learning_rate": 7.227776401612323e-05,
"loss": 0.704,
"step": 2460
},
{
"epoch": 0.5934039480019259,
"grad_norm": 2.4375,
"learning_rate": 7.22659099015055e-05,
"loss": 0.7279,
"step": 2465
},
{
"epoch": 0.594607607125662,
"grad_norm": 2.140625,
"learning_rate": 7.225403311157814e-05,
"loss": 0.722,
"step": 2470
},
{
"epoch": 0.5958112662493982,
"grad_norm": 2.078125,
"learning_rate": 7.224213365700016e-05,
"loss": 0.7195,
"step": 2475
},
{
"epoch": 0.5970149253731343,
"grad_norm": 2.078125,
"learning_rate": 7.223021154845092e-05,
"loss": 0.7581,
"step": 2480
},
{
"epoch": 0.5982185844968705,
"grad_norm": 2.0,
"learning_rate": 7.221826679663015e-05,
"loss": 0.7929,
"step": 2485
},
{
"epoch": 0.5994222436206067,
"grad_norm": 1.984375,
"learning_rate": 7.220629941225782e-05,
"loss": 0.7036,
"step": 2490
},
{
"epoch": 0.6006259027443428,
"grad_norm": 2.375,
"learning_rate": 7.21943094060743e-05,
"loss": 0.7072,
"step": 2495
},
{
"epoch": 0.601829561868079,
"grad_norm": 2.109375,
"learning_rate": 7.218229678884018e-05,
"loss": 0.7199,
"step": 2500
},
{
"epoch": 0.601829561868079,
"eval_loss": 0.6185581088066101,
"eval_runtime": 2.4024,
"eval_samples_per_second": 83.25,
"eval_steps_per_second": 83.25,
"step": 2500
},
{
"epoch": 0.6030332209918151,
"grad_norm": 2.203125,
"learning_rate": 7.21702615713364e-05,
"loss": 0.7025,
"step": 2505
},
{
"epoch": 0.6042368801155513,
"grad_norm": 1.9921875,
"learning_rate": 7.215820376436418e-05,
"loss": 0.7126,
"step": 2510
},
{
"epoch": 0.6054405392392874,
"grad_norm": 2.09375,
"learning_rate": 7.214612337874497e-05,
"loss": 0.7045,
"step": 2515
},
{
"epoch": 0.6066441983630236,
"grad_norm": 2.078125,
"learning_rate": 7.213402042532054e-05,
"loss": 0.7276,
"step": 2520
},
{
"epoch": 0.6078478574867597,
"grad_norm": 1.984375,
"learning_rate": 7.212189491495289e-05,
"loss": 0.7343,
"step": 2525
},
{
"epoch": 0.609051516610496,
"grad_norm": 2.203125,
"learning_rate": 7.210974685852423e-05,
"loss": 0.7073,
"step": 2530
},
{
"epoch": 0.610255175734232,
"grad_norm": 1.984375,
"learning_rate": 7.209757626693704e-05,
"loss": 0.6977,
"step": 2535
},
{
"epoch": 0.6114588348579683,
"grad_norm": 2.03125,
"learning_rate": 7.208538315111404e-05,
"loss": 0.6994,
"step": 2540
},
{
"epoch": 0.6126624939817044,
"grad_norm": 2.15625,
"learning_rate": 7.207316752199813e-05,
"loss": 0.7094,
"step": 2545
},
{
"epoch": 0.6138661531054406,
"grad_norm": 2.046875,
"learning_rate": 7.206092939055242e-05,
"loss": 0.7154,
"step": 2550
},
{
"epoch": 0.6150698122291767,
"grad_norm": 2.078125,
"learning_rate": 7.204866876776024e-05,
"loss": 0.7031,
"step": 2555
},
{
"epoch": 0.6162734713529129,
"grad_norm": 2.203125,
"learning_rate": 7.203638566462509e-05,
"loss": 0.6997,
"step": 2560
},
{
"epoch": 0.617477130476649,
"grad_norm": 2.0625,
"learning_rate": 7.202408009217063e-05,
"loss": 0.7273,
"step": 2565
},
{
"epoch": 0.6186807896003852,
"grad_norm": 2.125,
"learning_rate": 7.201175206144072e-05,
"loss": 0.7183,
"step": 2570
},
{
"epoch": 0.6198844487241213,
"grad_norm": 2.03125,
"learning_rate": 7.199940158349934e-05,
"loss": 0.6838,
"step": 2575
},
{
"epoch": 0.6210881078478575,
"grad_norm": 2.125,
"learning_rate": 7.198702866943061e-05,
"loss": 0.6794,
"step": 2580
},
{
"epoch": 0.6222917669715936,
"grad_norm": 2.046875,
"learning_rate": 7.197463333033886e-05,
"loss": 0.7418,
"step": 2585
},
{
"epoch": 0.6234954260953298,
"grad_norm": 2.171875,
"learning_rate": 7.196221557734845e-05,
"loss": 0.706,
"step": 2590
},
{
"epoch": 0.6246990852190659,
"grad_norm": 2.140625,
"learning_rate": 7.194977542160393e-05,
"loss": 0.7136,
"step": 2595
},
{
"epoch": 0.6259027443428021,
"grad_norm": 1.9921875,
"learning_rate": 7.19373128742699e-05,
"loss": 0.7062,
"step": 2600
},
{
"epoch": 0.6271064034665382,
"grad_norm": 2.203125,
"learning_rate": 7.192482794653109e-05,
"loss": 0.7187,
"step": 2605
},
{
"epoch": 0.6283100625902744,
"grad_norm": 2.078125,
"learning_rate": 7.191232064959229e-05,
"loss": 0.7383,
"step": 2610
},
{
"epoch": 0.6295137217140105,
"grad_norm": 2.0,
"learning_rate": 7.18997909946784e-05,
"loss": 0.7232,
"step": 2615
},
{
"epoch": 0.6307173808377468,
"grad_norm": 1.9375,
"learning_rate": 7.188723899303436e-05,
"loss": 0.6968,
"step": 2620
},
{
"epoch": 0.6319210399614829,
"grad_norm": 2.125,
"learning_rate": 7.187466465592516e-05,
"loss": 0.749,
"step": 2625
},
{
"epoch": 0.6331246990852191,
"grad_norm": 2.015625,
"learning_rate": 7.186206799463587e-05,
"loss": 0.7269,
"step": 2630
},
{
"epoch": 0.6343283582089553,
"grad_norm": 2.171875,
"learning_rate": 7.184944902047154e-05,
"loss": 0.7076,
"step": 2635
},
{
"epoch": 0.6355320173326914,
"grad_norm": 2.125,
"learning_rate": 7.183680774475732e-05,
"loss": 0.7502,
"step": 2640
},
{
"epoch": 0.6367356764564276,
"grad_norm": 2.109375,
"learning_rate": 7.182414417883831e-05,
"loss": 0.7216,
"step": 2645
},
{
"epoch": 0.6379393355801637,
"grad_norm": 1.9921875,
"learning_rate": 7.181145833407964e-05,
"loss": 0.7058,
"step": 2650
},
{
"epoch": 0.6391429947038999,
"grad_norm": 2.296875,
"learning_rate": 7.179875022186641e-05,
"loss": 0.7297,
"step": 2655
},
{
"epoch": 0.640346653827636,
"grad_norm": 1.9453125,
"learning_rate": 7.178601985360377e-05,
"loss": 0.712,
"step": 2660
},
{
"epoch": 0.6415503129513722,
"grad_norm": 2.109375,
"learning_rate": 7.177326724071674e-05,
"loss": 0.7122,
"step": 2665
},
{
"epoch": 0.6427539720751083,
"grad_norm": 2.015625,
"learning_rate": 7.176049239465043e-05,
"loss": 0.6803,
"step": 2670
},
{
"epoch": 0.6439576311988445,
"grad_norm": 1.8984375,
"learning_rate": 7.174769532686981e-05,
"loss": 0.7044,
"step": 2675
},
{
"epoch": 0.6451612903225806,
"grad_norm": 2.46875,
"learning_rate": 7.17348760488598e-05,
"loss": 0.7183,
"step": 2680
},
{
"epoch": 0.6463649494463168,
"grad_norm": 2.078125,
"learning_rate": 7.172203457212529e-05,
"loss": 0.7206,
"step": 2685
},
{
"epoch": 0.6475686085700529,
"grad_norm": 2.140625,
"learning_rate": 7.170917090819108e-05,
"loss": 0.7073,
"step": 2690
},
{
"epoch": 0.6487722676937892,
"grad_norm": 2.078125,
"learning_rate": 7.169628506860189e-05,
"loss": 0.7037,
"step": 2695
},
{
"epoch": 0.6499759268175253,
"grad_norm": 2.0625,
"learning_rate": 7.16833770649223e-05,
"loss": 0.7078,
"step": 2700
},
{
"epoch": 0.6511795859412615,
"grad_norm": 1.921875,
"learning_rate": 7.167044690873683e-05,
"loss": 0.7619,
"step": 2705
},
{
"epoch": 0.6523832450649976,
"grad_norm": 2.0625,
"learning_rate": 7.165749461164988e-05,
"loss": 0.6917,
"step": 2710
},
{
"epoch": 0.6535869041887338,
"grad_norm": 1.953125,
"learning_rate": 7.164452018528565e-05,
"loss": 0.7178,
"step": 2715
},
{
"epoch": 0.6547905633124699,
"grad_norm": 1.96875,
"learning_rate": 7.163152364128831e-05,
"loss": 0.7089,
"step": 2720
},
{
"epoch": 0.6559942224362061,
"grad_norm": 2.203125,
"learning_rate": 7.16185049913218e-05,
"loss": 0.6982,
"step": 2725
},
{
"epoch": 0.6571978815599422,
"grad_norm": 2.234375,
"learning_rate": 7.160546424706991e-05,
"loss": 0.7445,
"step": 2730
},
{
"epoch": 0.6584015406836784,
"grad_norm": 1.9765625,
"learning_rate": 7.15924014202363e-05,
"loss": 0.7561,
"step": 2735
},
{
"epoch": 0.6596051998074145,
"grad_norm": 2.03125,
"learning_rate": 7.157931652254441e-05,
"loss": 0.6975,
"step": 2740
},
{
"epoch": 0.6608088589311507,
"grad_norm": 1.9453125,
"learning_rate": 7.156620956573748e-05,
"loss": 0.6788,
"step": 2745
},
{
"epoch": 0.6620125180548868,
"grad_norm": 2.03125,
"learning_rate": 7.155308056157859e-05,
"loss": 0.7178,
"step": 2750
},
{
"epoch": 0.663216177178623,
"grad_norm": 2.03125,
"learning_rate": 7.153992952185058e-05,
"loss": 0.7256,
"step": 2755
},
{
"epoch": 0.6644198363023591,
"grad_norm": 2.03125,
"learning_rate": 7.152675645835607e-05,
"loss": 0.7036,
"step": 2760
},
{
"epoch": 0.6656234954260953,
"grad_norm": 2.171875,
"learning_rate": 7.151356138291742e-05,
"loss": 0.7168,
"step": 2765
},
{
"epoch": 0.6668271545498314,
"grad_norm": 2.125,
"learning_rate": 7.150034430737679e-05,
"loss": 0.7073,
"step": 2770
},
{
"epoch": 0.6680308136735676,
"grad_norm": 2.078125,
"learning_rate": 7.148710524359607e-05,
"loss": 0.6977,
"step": 2775
},
{
"epoch": 0.6692344727973039,
"grad_norm": 2.03125,
"learning_rate": 7.147384420345685e-05,
"loss": 0.7269,
"step": 2780
},
{
"epoch": 0.67043813192104,
"grad_norm": 2.0625,
"learning_rate": 7.14605611988605e-05,
"loss": 0.7017,
"step": 2785
},
{
"epoch": 0.6716417910447762,
"grad_norm": 2.1875,
"learning_rate": 7.144725624172805e-05,
"loss": 0.6911,
"step": 2790
},
{
"epoch": 0.6728454501685123,
"grad_norm": 2.0625,
"learning_rate": 7.143392934400028e-05,
"loss": 0.7137,
"step": 2795
},
{
"epoch": 0.6740491092922485,
"grad_norm": 2.046875,
"learning_rate": 7.142058051763761e-05,
"loss": 0.7144,
"step": 2800
},
{
"epoch": 0.6752527684159846,
"grad_norm": 2.015625,
"learning_rate": 7.140720977462018e-05,
"loss": 0.7026,
"step": 2805
},
{
"epoch": 0.6764564275397208,
"grad_norm": 2.28125,
"learning_rate": 7.139381712694777e-05,
"loss": 0.712,
"step": 2810
},
{
"epoch": 0.6776600866634569,
"grad_norm": 2.296875,
"learning_rate": 7.138040258663984e-05,
"loss": 0.7336,
"step": 2815
},
{
"epoch": 0.6788637457871931,
"grad_norm": 2.125,
"learning_rate": 7.13669661657355e-05,
"loss": 0.7178,
"step": 2820
},
{
"epoch": 0.6800674049109292,
"grad_norm": 2.03125,
"learning_rate": 7.135350787629349e-05,
"loss": 0.6975,
"step": 2825
},
{
"epoch": 0.6812710640346654,
"grad_norm": 2.015625,
"learning_rate": 7.134002773039217e-05,
"loss": 0.6854,
"step": 2830
},
{
"epoch": 0.6824747231584015,
"grad_norm": 2.0,
"learning_rate": 7.13265257401295e-05,
"loss": 0.7039,
"step": 2835
},
{
"epoch": 0.6836783822821377,
"grad_norm": 1.984375,
"learning_rate": 7.131300191762311e-05,
"loss": 0.7228,
"step": 2840
},
{
"epoch": 0.6848820414058738,
"grad_norm": 2.03125,
"learning_rate": 7.129945627501013e-05,
"loss": 0.7109,
"step": 2845
},
{
"epoch": 0.68608570052961,
"grad_norm": 1.9921875,
"learning_rate": 7.128588882444734e-05,
"loss": 0.6984,
"step": 2850
},
{
"epoch": 0.6872893596533461,
"grad_norm": 2.421875,
"learning_rate": 7.127229957811112e-05,
"loss": 0.6898,
"step": 2855
},
{
"epoch": 0.6884930187770824,
"grad_norm": 2.078125,
"learning_rate": 7.125868854819727e-05,
"loss": 0.7012,
"step": 2860
},
{
"epoch": 0.6896966779008185,
"grad_norm": 2.15625,
"learning_rate": 7.124505574692132e-05,
"loss": 0.7063,
"step": 2865
},
{
"epoch": 0.6909003370245547,
"grad_norm": 2.0625,
"learning_rate": 7.123140118651819e-05,
"loss": 0.6994,
"step": 2870
},
{
"epoch": 0.6921039961482908,
"grad_norm": 1.90625,
"learning_rate": 7.121772487924245e-05,
"loss": 0.6898,
"step": 2875
},
{
"epoch": 0.693307655272027,
"grad_norm": 2.015625,
"learning_rate": 7.12040268373681e-05,
"loss": 0.7002,
"step": 2880
},
{
"epoch": 0.6945113143957631,
"grad_norm": 1.96875,
"learning_rate": 7.119030707318866e-05,
"loss": 0.7231,
"step": 2885
},
{
"epoch": 0.6957149735194993,
"grad_norm": 1.96875,
"learning_rate": 7.117656559901716e-05,
"loss": 0.7083,
"step": 2890
},
{
"epoch": 0.6969186326432354,
"grad_norm": 2.09375,
"learning_rate": 7.116280242718616e-05,
"loss": 0.7255,
"step": 2895
},
{
"epoch": 0.6981222917669716,
"grad_norm": 2.1875,
"learning_rate": 7.11490175700476e-05,
"loss": 0.6818,
"step": 2900
},
{
"epoch": 0.6993259508907077,
"grad_norm": 1.890625,
"learning_rate": 7.113521103997295e-05,
"loss": 0.7098,
"step": 2905
},
{
"epoch": 0.7005296100144439,
"grad_norm": 2.140625,
"learning_rate": 7.112138284935309e-05,
"loss": 0.6684,
"step": 2910
},
{
"epoch": 0.70173326913818,
"grad_norm": 2.078125,
"learning_rate": 7.110753301059837e-05,
"loss": 0.7065,
"step": 2915
},
{
"epoch": 0.7029369282619162,
"grad_norm": 1.9296875,
"learning_rate": 7.109366153613856e-05,
"loss": 0.6378,
"step": 2920
},
{
"epoch": 0.7041405873856523,
"grad_norm": 2.046875,
"learning_rate": 7.107976843842285e-05,
"loss": 0.717,
"step": 2925
},
{
"epoch": 0.7053442465093885,
"grad_norm": 2.046875,
"learning_rate": 7.106585372991983e-05,
"loss": 0.6748,
"step": 2930
},
{
"epoch": 0.7065479056331248,
"grad_norm": 2.171875,
"learning_rate": 7.105191742311748e-05,
"loss": 0.6826,
"step": 2935
},
{
"epoch": 0.7077515647568609,
"grad_norm": 2.0625,
"learning_rate": 7.103795953052316e-05,
"loss": 0.6717,
"step": 2940
},
{
"epoch": 0.7089552238805971,
"grad_norm": 1.9375,
"learning_rate": 7.102398006466362e-05,
"loss": 0.7121,
"step": 2945
},
{
"epoch": 0.7101588830043332,
"grad_norm": 2.0,
"learning_rate": 7.100997903808498e-05,
"loss": 0.7021,
"step": 2950
},
{
"epoch": 0.7113625421280694,
"grad_norm": 2.078125,
"learning_rate": 7.099595646335266e-05,
"loss": 0.6888,
"step": 2955
},
{
"epoch": 0.7125662012518055,
"grad_norm": 2.078125,
"learning_rate": 7.098191235305148e-05,
"loss": 0.6547,
"step": 2960
},
{
"epoch": 0.7137698603755417,
"grad_norm": 2.234375,
"learning_rate": 7.096784671978555e-05,
"loss": 0.6816,
"step": 2965
},
{
"epoch": 0.7149735194992778,
"grad_norm": 2.140625,
"learning_rate": 7.09537595761783e-05,
"loss": 0.695,
"step": 2970
},
{
"epoch": 0.716177178623014,
"grad_norm": 2.0625,
"learning_rate": 7.093965093487248e-05,
"loss": 0.6777,
"step": 2975
},
{
"epoch": 0.7173808377467501,
"grad_norm": 2.078125,
"learning_rate": 7.092552080853013e-05,
"loss": 0.6849,
"step": 2980
},
{
"epoch": 0.7185844968704863,
"grad_norm": 1.9765625,
"learning_rate": 7.091136920983255e-05,
"loss": 0.7043,
"step": 2985
},
{
"epoch": 0.7197881559942224,
"grad_norm": 2.1875,
"learning_rate": 7.089719615148034e-05,
"loss": 0.7,
"step": 2990
},
{
"epoch": 0.7209918151179586,
"grad_norm": 2.0625,
"learning_rate": 7.088300164619332e-05,
"loss": 0.6847,
"step": 2995
},
{
"epoch": 0.7221954742416947,
"grad_norm": 1.96875,
"learning_rate": 7.086878570671062e-05,
"loss": 0.6825,
"step": 3000
},
{
"epoch": 0.7221954742416947,
"eval_loss": 0.5935443043708801,
"eval_runtime": 2.4083,
"eval_samples_per_second": 83.047,
"eval_steps_per_second": 83.047,
"step": 3000
},
{
"epoch": 0.7233991333654309,
"grad_norm": 2.171875,
"learning_rate": 7.085454834579054e-05,
"loss": 0.7262,
"step": 3005
},
{
"epoch": 0.724602792489167,
"grad_norm": 2.015625,
"learning_rate": 7.084028957621066e-05,
"loss": 0.7577,
"step": 3010
},
{
"epoch": 0.7258064516129032,
"grad_norm": 1.953125,
"learning_rate": 7.082600941076773e-05,
"loss": 0.6923,
"step": 3015
},
{
"epoch": 0.7270101107366393,
"grad_norm": 1.9375,
"learning_rate": 7.081170786227776e-05,
"loss": 0.6833,
"step": 3020
},
{
"epoch": 0.7282137698603756,
"grad_norm": 2.125,
"learning_rate": 7.079738494357583e-05,
"loss": 0.6757,
"step": 3025
},
{
"epoch": 0.7294174289841117,
"grad_norm": 2.125,
"learning_rate": 7.078304066751637e-05,
"loss": 0.7042,
"step": 3030
},
{
"epoch": 0.7306210881078479,
"grad_norm": 1.9296875,
"learning_rate": 7.076867504697283e-05,
"loss": 0.6797,
"step": 3035
},
{
"epoch": 0.731824747231584,
"grad_norm": 2.046875,
"learning_rate": 7.075428809483791e-05,
"loss": 0.6647,
"step": 3040
},
{
"epoch": 0.7330284063553202,
"grad_norm": 2.203125,
"learning_rate": 7.07398798240234e-05,
"loss": 0.6718,
"step": 3045
},
{
"epoch": 0.7342320654790563,
"grad_norm": 1.9609375,
"learning_rate": 7.072545024746024e-05,
"loss": 0.7162,
"step": 3050
},
{
"epoch": 0.7354357246027925,
"grad_norm": 2.40625,
"learning_rate": 7.07109993780985e-05,
"loss": 0.661,
"step": 3055
},
{
"epoch": 0.7366393837265286,
"grad_norm": 2.109375,
"learning_rate": 7.069652722890736e-05,
"loss": 0.7114,
"step": 3060
},
{
"epoch": 0.7378430428502648,
"grad_norm": 2.21875,
"learning_rate": 7.068203381287507e-05,
"loss": 0.6964,
"step": 3065
},
{
"epoch": 0.7390467019740009,
"grad_norm": 1.9609375,
"learning_rate": 7.0667519143009e-05,
"loss": 0.727,
"step": 3070
},
{
"epoch": 0.7402503610977371,
"grad_norm": 2.015625,
"learning_rate": 7.065298323233558e-05,
"loss": 0.7187,
"step": 3075
},
{
"epoch": 0.7414540202214733,
"grad_norm": 1.9453125,
"learning_rate": 7.06384260939003e-05,
"loss": 0.6952,
"step": 3080
},
{
"epoch": 0.7426576793452094,
"grad_norm": 1.8828125,
"learning_rate": 7.06238477407677e-05,
"loss": 0.6252,
"step": 3085
},
{
"epoch": 0.7438613384689456,
"grad_norm": 2.3125,
"learning_rate": 7.060924818602138e-05,
"loss": 0.722,
"step": 3090
},
{
"epoch": 0.7450649975926817,
"grad_norm": 2.0,
"learning_rate": 7.059462744276395e-05,
"loss": 0.6839,
"step": 3095
},
{
"epoch": 0.746268656716418,
"grad_norm": 2.015625,
"learning_rate": 7.057998552411702e-05,
"loss": 0.6984,
"step": 3100
},
{
"epoch": 0.7474723158401541,
"grad_norm": 2.328125,
"learning_rate": 7.056532244322123e-05,
"loss": 0.6827,
"step": 3105
},
{
"epoch": 0.7486759749638903,
"grad_norm": 2.078125,
"learning_rate": 7.055063821323621e-05,
"loss": 0.6519,
"step": 3110
},
{
"epoch": 0.7498796340876264,
"grad_norm": 1.7890625,
"learning_rate": 7.053593284734058e-05,
"loss": 0.6937,
"step": 3115
},
{
"epoch": 0.7510832932113626,
"grad_norm": 1.7734375,
"learning_rate": 7.052120635873189e-05,
"loss": 0.6719,
"step": 3120
},
{
"epoch": 0.7522869523350987,
"grad_norm": 1.8984375,
"learning_rate": 7.050645876062669e-05,
"loss": 0.6803,
"step": 3125
},
{
"epoch": 0.7534906114588349,
"grad_norm": 2.171875,
"learning_rate": 7.049169006626043e-05,
"loss": 0.7005,
"step": 3130
},
{
"epoch": 0.754694270582571,
"grad_norm": 2.21875,
"learning_rate": 7.047690028888756e-05,
"loss": 0.6623,
"step": 3135
},
{
"epoch": 0.7558979297063072,
"grad_norm": 2.046875,
"learning_rate": 7.046208944178136e-05,
"loss": 0.7266,
"step": 3140
},
{
"epoch": 0.7571015888300433,
"grad_norm": 1.96875,
"learning_rate": 7.044725753823412e-05,
"loss": 0.6812,
"step": 3145
},
{
"epoch": 0.7583052479537795,
"grad_norm": 2.21875,
"learning_rate": 7.043240459155696e-05,
"loss": 0.6907,
"step": 3150
},
{
"epoch": 0.7595089070775156,
"grad_norm": 2.015625,
"learning_rate": 7.041753061507987e-05,
"loss": 0.6656,
"step": 3155
},
{
"epoch": 0.7607125662012518,
"grad_norm": 2.0,
"learning_rate": 7.04026356221518e-05,
"loss": 0.6933,
"step": 3160
},
{
"epoch": 0.7619162253249879,
"grad_norm": 2.078125,
"learning_rate": 7.038771962614047e-05,
"loss": 0.682,
"step": 3165
},
{
"epoch": 0.7631198844487241,
"grad_norm": 1.984375,
"learning_rate": 7.037278264043252e-05,
"loss": 0.6681,
"step": 3170
},
{
"epoch": 0.7643235435724602,
"grad_norm": 2.234375,
"learning_rate": 7.035782467843336e-05,
"loss": 0.6903,
"step": 3175
},
{
"epoch": 0.7655272026961965,
"grad_norm": 2.125,
"learning_rate": 7.034284575356729e-05,
"loss": 0.6795,
"step": 3180
},
{
"epoch": 0.7667308618199326,
"grad_norm": 2.078125,
"learning_rate": 7.032784587927738e-05,
"loss": 0.6882,
"step": 3185
},
{
"epoch": 0.7679345209436688,
"grad_norm": 2.03125,
"learning_rate": 7.031282506902551e-05,
"loss": 0.6924,
"step": 3190
},
{
"epoch": 0.7691381800674049,
"grad_norm": 1.96875,
"learning_rate": 7.029778333629238e-05,
"loss": 0.6932,
"step": 3195
},
{
"epoch": 0.7703418391911411,
"grad_norm": 2.015625,
"learning_rate": 7.028272069457741e-05,
"loss": 0.7174,
"step": 3200
},
{
"epoch": 0.7715454983148772,
"grad_norm": 2.015625,
"learning_rate": 7.026763715739883e-05,
"loss": 0.6819,
"step": 3205
},
{
"epoch": 0.7727491574386134,
"grad_norm": 2.078125,
"learning_rate": 7.025253273829363e-05,
"loss": 0.7052,
"step": 3210
},
{
"epoch": 0.7739528165623495,
"grad_norm": 1.90625,
"learning_rate": 7.02374074508175e-05,
"loss": 0.6917,
"step": 3215
},
{
"epoch": 0.7751564756860857,
"grad_norm": 2.0625,
"learning_rate": 7.022226130854488e-05,
"loss": 0.665,
"step": 3220
},
{
"epoch": 0.7763601348098219,
"grad_norm": 2.03125,
"learning_rate": 7.020709432506894e-05,
"loss": 0.7044,
"step": 3225
},
{
"epoch": 0.777563793933558,
"grad_norm": 1.90625,
"learning_rate": 7.019190651400152e-05,
"loss": 0.7384,
"step": 3230
},
{
"epoch": 0.7787674530572942,
"grad_norm": 1.921875,
"learning_rate": 7.017669788897319e-05,
"loss": 0.7046,
"step": 3235
},
{
"epoch": 0.7799711121810303,
"grad_norm": 2.078125,
"learning_rate": 7.016146846363318e-05,
"loss": 0.6768,
"step": 3240
},
{
"epoch": 0.7811747713047665,
"grad_norm": 2.171875,
"learning_rate": 7.014621825164938e-05,
"loss": 0.6342,
"step": 3245
},
{
"epoch": 0.7823784304285026,
"grad_norm": 1.828125,
"learning_rate": 7.013094726670837e-05,
"loss": 0.6916,
"step": 3250
},
{
"epoch": 0.7835820895522388,
"grad_norm": 2.203125,
"learning_rate": 7.011565552251531e-05,
"loss": 0.6637,
"step": 3255
},
{
"epoch": 0.784785748675975,
"grad_norm": 1.984375,
"learning_rate": 7.010034303279406e-05,
"loss": 0.6942,
"step": 3260
},
{
"epoch": 0.7859894077997112,
"grad_norm": 1.96875,
"learning_rate": 7.008500981128708e-05,
"loss": 0.6655,
"step": 3265
},
{
"epoch": 0.7871930669234473,
"grad_norm": 1.9453125,
"learning_rate": 7.006965587175538e-05,
"loss": 0.661,
"step": 3270
},
{
"epoch": 0.7883967260471835,
"grad_norm": 2.140625,
"learning_rate": 7.005428122797864e-05,
"loss": 0.706,
"step": 3275
},
{
"epoch": 0.7896003851709196,
"grad_norm": 1.84375,
"learning_rate": 7.003888589375508e-05,
"loss": 0.6508,
"step": 3280
},
{
"epoch": 0.7908040442946558,
"grad_norm": 1.90625,
"learning_rate": 7.002346988290149e-05,
"loss": 0.6981,
"step": 3285
},
{
"epoch": 0.7920077034183919,
"grad_norm": 1.984375,
"learning_rate": 7.000803320925323e-05,
"loss": 0.6719,
"step": 3290
},
{
"epoch": 0.7932113625421281,
"grad_norm": 1.984375,
"learning_rate": 6.999257588666419e-05,
"loss": 0.6823,
"step": 3295
},
{
"epoch": 0.7944150216658642,
"grad_norm": 2.359375,
"learning_rate": 6.997709792900683e-05,
"loss": 0.6894,
"step": 3300
},
{
"epoch": 0.7956186807896004,
"grad_norm": 1.9609375,
"learning_rate": 6.996159935017208e-05,
"loss": 0.6801,
"step": 3305
},
{
"epoch": 0.7968223399133365,
"grad_norm": 2.03125,
"learning_rate": 6.994608016406938e-05,
"loss": 0.6678,
"step": 3310
},
{
"epoch": 0.7980259990370727,
"grad_norm": 2.25,
"learning_rate": 6.993054038462671e-05,
"loss": 0.6815,
"step": 3315
},
{
"epoch": 0.7992296581608088,
"grad_norm": 1.9140625,
"learning_rate": 6.991498002579048e-05,
"loss": 0.6926,
"step": 3320
},
{
"epoch": 0.800433317284545,
"grad_norm": 1.9140625,
"learning_rate": 6.989939910152561e-05,
"loss": 0.6916,
"step": 3325
},
{
"epoch": 0.8016369764082811,
"grad_norm": 1.9765625,
"learning_rate": 6.988379762581545e-05,
"loss": 0.6819,
"step": 3330
},
{
"epoch": 0.8028406355320173,
"grad_norm": 2.171875,
"learning_rate": 6.986817561266181e-05,
"loss": 0.6759,
"step": 3335
},
{
"epoch": 0.8040442946557534,
"grad_norm": 2.21875,
"learning_rate": 6.985253307608491e-05,
"loss": 0.6942,
"step": 3340
},
{
"epoch": 0.8052479537794897,
"grad_norm": 2.109375,
"learning_rate": 6.983687003012341e-05,
"loss": 0.6792,
"step": 3345
},
{
"epoch": 0.8064516129032258,
"grad_norm": 1.984375,
"learning_rate": 6.982118648883438e-05,
"loss": 0.6402,
"step": 3350
},
{
"epoch": 0.807655272026962,
"grad_norm": 1.9453125,
"learning_rate": 6.980548246629326e-05,
"loss": 0.6802,
"step": 3355
},
{
"epoch": 0.8088589311506981,
"grad_norm": 1.828125,
"learning_rate": 6.978975797659389e-05,
"loss": 0.615,
"step": 3360
},
{
"epoch": 0.8100625902744343,
"grad_norm": 2.0625,
"learning_rate": 6.97740130338485e-05,
"loss": 0.6543,
"step": 3365
},
{
"epoch": 0.8112662493981705,
"grad_norm": 1.9453125,
"learning_rate": 6.97582476521876e-05,
"loss": 0.6766,
"step": 3370
},
{
"epoch": 0.8124699085219066,
"grad_norm": 1.8515625,
"learning_rate": 6.974246184576012e-05,
"loss": 0.6788,
"step": 3375
},
{
"epoch": 0.8136735676456428,
"grad_norm": 2.078125,
"learning_rate": 6.97266556287333e-05,
"loss": 0.6849,
"step": 3380
},
{
"epoch": 0.8148772267693789,
"grad_norm": 2.015625,
"learning_rate": 6.971082901529267e-05,
"loss": 0.6419,
"step": 3385
},
{
"epoch": 0.8160808858931151,
"grad_norm": 2.046875,
"learning_rate": 6.969498201964212e-05,
"loss": 0.7203,
"step": 3390
},
{
"epoch": 0.8172845450168512,
"grad_norm": 2.203125,
"learning_rate": 6.967911465600376e-05,
"loss": 0.674,
"step": 3395
},
{
"epoch": 0.8184882041405874,
"grad_norm": 1.875,
"learning_rate": 6.966322693861804e-05,
"loss": 0.6785,
"step": 3400
},
{
"epoch": 0.8196918632643235,
"grad_norm": 2.359375,
"learning_rate": 6.964731888174366e-05,
"loss": 0.7204,
"step": 3405
},
{
"epoch": 0.8208955223880597,
"grad_norm": 2.234375,
"learning_rate": 6.963139049965758e-05,
"loss": 0.6844,
"step": 3410
},
{
"epoch": 0.8220991815117958,
"grad_norm": 2.15625,
"learning_rate": 6.961544180665494e-05,
"loss": 0.6818,
"step": 3415
},
{
"epoch": 0.823302840635532,
"grad_norm": 2.015625,
"learning_rate": 6.959947281704922e-05,
"loss": 0.6544,
"step": 3420
},
{
"epoch": 0.8245064997592682,
"grad_norm": 2.046875,
"learning_rate": 6.9583483545172e-05,
"loss": 0.7053,
"step": 3425
},
{
"epoch": 0.8257101588830044,
"grad_norm": 2.03125,
"learning_rate": 6.956747400537315e-05,
"loss": 0.7212,
"step": 3430
},
{
"epoch": 0.8269138180067405,
"grad_norm": 2.0,
"learning_rate": 6.955144421202071e-05,
"loss": 0.6408,
"step": 3435
},
{
"epoch": 0.8281174771304767,
"grad_norm": 1.9921875,
"learning_rate": 6.953539417950085e-05,
"loss": 0.6501,
"step": 3440
},
{
"epoch": 0.8293211362542128,
"grad_norm": 1.921875,
"learning_rate": 6.951932392221796e-05,
"loss": 0.6593,
"step": 3445
},
{
"epoch": 0.830524795377949,
"grad_norm": 2.09375,
"learning_rate": 6.950323345459454e-05,
"loss": 0.6657,
"step": 3450
},
{
"epoch": 0.8317284545016851,
"grad_norm": 1.8828125,
"learning_rate": 6.948712279107125e-05,
"loss": 0.685,
"step": 3455
},
{
"epoch": 0.8329321136254213,
"grad_norm": 2.078125,
"learning_rate": 6.947099194610689e-05,
"loss": 0.7025,
"step": 3460
},
{
"epoch": 0.8341357727491574,
"grad_norm": 1.9921875,
"learning_rate": 6.945484093417835e-05,
"loss": 0.6594,
"step": 3465
},
{
"epoch": 0.8353394318728936,
"grad_norm": 1.953125,
"learning_rate": 6.94386697697806e-05,
"loss": 0.6699,
"step": 3470
},
{
"epoch": 0.8365430909966297,
"grad_norm": 1.9921875,
"learning_rate": 6.942247846742674e-05,
"loss": 0.6582,
"step": 3475
},
{
"epoch": 0.8377467501203659,
"grad_norm": 2.015625,
"learning_rate": 6.940626704164793e-05,
"loss": 0.6745,
"step": 3480
},
{
"epoch": 0.838950409244102,
"grad_norm": 1.8046875,
"learning_rate": 6.939003550699337e-05,
"loss": 0.6824,
"step": 3485
},
{
"epoch": 0.8401540683678382,
"grad_norm": 2.09375,
"learning_rate": 6.93737838780303e-05,
"loss": 0.6271,
"step": 3490
},
{
"epoch": 0.8413577274915743,
"grad_norm": 2.109375,
"learning_rate": 6.935751216934407e-05,
"loss": 0.7001,
"step": 3495
},
{
"epoch": 0.8425613866153105,
"grad_norm": 1.96875,
"learning_rate": 6.934122039553793e-05,
"loss": 0.7044,
"step": 3500
},
{
"epoch": 0.8425613866153105,
"eval_loss": 0.5733353495597839,
"eval_runtime": 2.4041,
"eval_samples_per_second": 83.193,
"eval_steps_per_second": 83.193,
"step": 3500
},
{
"epoch": 0.8437650457390466,
"grad_norm": 1.890625,
"learning_rate": 6.932490857123324e-05,
"loss": 0.685,
"step": 3505
},
{
"epoch": 0.8449687048627829,
"grad_norm": 2.109375,
"learning_rate": 6.930857671106932e-05,
"loss": 0.6795,
"step": 3510
},
{
"epoch": 0.8461723639865191,
"grad_norm": 2.046875,
"learning_rate": 6.929222482970345e-05,
"loss": 0.6792,
"step": 3515
},
{
"epoch": 0.8473760231102552,
"grad_norm": 2.09375,
"learning_rate": 6.92758529418109e-05,
"loss": 0.6647,
"step": 3520
},
{
"epoch": 0.8485796822339914,
"grad_norm": 2.125,
"learning_rate": 6.925946106208492e-05,
"loss": 0.6924,
"step": 3525
},
{
"epoch": 0.8497833413577275,
"grad_norm": 2.03125,
"learning_rate": 6.924304920523662e-05,
"loss": 0.6794,
"step": 3530
},
{
"epoch": 0.8509870004814637,
"grad_norm": 2.0,
"learning_rate": 6.922661738599514e-05,
"loss": 0.7257,
"step": 3535
},
{
"epoch": 0.8521906596051998,
"grad_norm": 1.9609375,
"learning_rate": 6.921016561910748e-05,
"loss": 0.6848,
"step": 3540
},
{
"epoch": 0.853394318728936,
"grad_norm": 2.15625,
"learning_rate": 6.919369391933853e-05,
"loss": 0.6732,
"step": 3545
},
{
"epoch": 0.8545979778526721,
"grad_norm": 1.90625,
"learning_rate": 6.917720230147111e-05,
"loss": 0.6457,
"step": 3550
},
{
"epoch": 0.8558016369764083,
"grad_norm": 2.03125,
"learning_rate": 6.91606907803059e-05,
"loss": 0.6906,
"step": 3555
},
{
"epoch": 0.8570052961001444,
"grad_norm": 1.84375,
"learning_rate": 6.914415937066142e-05,
"loss": 0.6813,
"step": 3560
},
{
"epoch": 0.8582089552238806,
"grad_norm": 1.796875,
"learning_rate": 6.912760808737405e-05,
"loss": 0.7021,
"step": 3565
},
{
"epoch": 0.8594126143476167,
"grad_norm": 2.03125,
"learning_rate": 6.911103694529805e-05,
"loss": 0.6774,
"step": 3570
},
{
"epoch": 0.8606162734713529,
"grad_norm": 1.9375,
"learning_rate": 6.909444595930544e-05,
"loss": 0.6874,
"step": 3575
},
{
"epoch": 0.861819932595089,
"grad_norm": 2.078125,
"learning_rate": 6.907783514428607e-05,
"loss": 0.6654,
"step": 3580
},
{
"epoch": 0.8630235917188253,
"grad_norm": 2.0,
"learning_rate": 6.906120451514761e-05,
"loss": 0.6499,
"step": 3585
},
{
"epoch": 0.8642272508425614,
"grad_norm": 2.078125,
"learning_rate": 6.90445540868155e-05,
"loss": 0.6703,
"step": 3590
},
{
"epoch": 0.8654309099662976,
"grad_norm": 2.1875,
"learning_rate": 6.902788387423292e-05,
"loss": 0.6915,
"step": 3595
},
{
"epoch": 0.8666345690900337,
"grad_norm": 2.28125,
"learning_rate": 6.901119389236082e-05,
"loss": 0.6694,
"step": 3600
},
{
"epoch": 0.8678382282137699,
"grad_norm": 1.953125,
"learning_rate": 6.899448415617794e-05,
"loss": 0.6693,
"step": 3605
},
{
"epoch": 0.869041887337506,
"grad_norm": 2.015625,
"learning_rate": 6.897775468068067e-05,
"loss": 0.6575,
"step": 3610
},
{
"epoch": 0.8702455464612422,
"grad_norm": 1.9375,
"learning_rate": 6.896100548088318e-05,
"loss": 0.6947,
"step": 3615
},
{
"epoch": 0.8714492055849783,
"grad_norm": 2.03125,
"learning_rate": 6.894423657181731e-05,
"loss": 0.6578,
"step": 3620
},
{
"epoch": 0.8726528647087145,
"grad_norm": 1.96875,
"learning_rate": 6.89274479685326e-05,
"loss": 0.6838,
"step": 3625
},
{
"epoch": 0.8738565238324506,
"grad_norm": 1.9921875,
"learning_rate": 6.891063968609624e-05,
"loss": 0.6947,
"step": 3630
},
{
"epoch": 0.8750601829561868,
"grad_norm": 1.9609375,
"learning_rate": 6.889381173959314e-05,
"loss": 0.6484,
"step": 3635
},
{
"epoch": 0.8762638420799229,
"grad_norm": 1.875,
"learning_rate": 6.887696414412577e-05,
"loss": 0.7085,
"step": 3640
},
{
"epoch": 0.8774675012036591,
"grad_norm": 2.09375,
"learning_rate": 6.886009691481434e-05,
"loss": 0.6785,
"step": 3645
},
{
"epoch": 0.8786711603273952,
"grad_norm": 1.9609375,
"learning_rate": 6.884321006679656e-05,
"loss": 0.6721,
"step": 3650
},
{
"epoch": 0.8798748194511314,
"grad_norm": 1.921875,
"learning_rate": 6.882630361522787e-05,
"loss": 0.6621,
"step": 3655
},
{
"epoch": 0.8810784785748677,
"grad_norm": 2.015625,
"learning_rate": 6.880937757528123e-05,
"loss": 0.6415,
"step": 3660
},
{
"epoch": 0.8822821376986038,
"grad_norm": 1.984375,
"learning_rate": 6.879243196214718e-05,
"loss": 0.6314,
"step": 3665
},
{
"epoch": 0.88348579682234,
"grad_norm": 2.015625,
"learning_rate": 6.877546679103384e-05,
"loss": 0.701,
"step": 3670
},
{
"epoch": 0.8846894559460761,
"grad_norm": 2.0,
"learning_rate": 6.875848207716689e-05,
"loss": 0.686,
"step": 3675
},
{
"epoch": 0.8858931150698123,
"grad_norm": 1.9453125,
"learning_rate": 6.874147783578954e-05,
"loss": 0.6813,
"step": 3680
},
{
"epoch": 0.8870967741935484,
"grad_norm": 1.9453125,
"learning_rate": 6.872445408216255e-05,
"loss": 0.6357,
"step": 3685
},
{
"epoch": 0.8883004333172846,
"grad_norm": 2.015625,
"learning_rate": 6.870741083156415e-05,
"loss": 0.6627,
"step": 3690
},
{
"epoch": 0.8895040924410207,
"grad_norm": 2.15625,
"learning_rate": 6.86903480992901e-05,
"loss": 0.6747,
"step": 3695
},
{
"epoch": 0.8907077515647569,
"grad_norm": 2.15625,
"learning_rate": 6.867326590065361e-05,
"loss": 0.6878,
"step": 3700
},
{
"epoch": 0.891911410688493,
"grad_norm": 1.7265625,
"learning_rate": 6.86561642509854e-05,
"loss": 0.6376,
"step": 3705
},
{
"epoch": 0.8931150698122292,
"grad_norm": 1.9921875,
"learning_rate": 6.863904316563362e-05,
"loss": 0.6647,
"step": 3710
},
{
"epoch": 0.8943187289359653,
"grad_norm": 1.890625,
"learning_rate": 6.862190265996387e-05,
"loss": 0.6701,
"step": 3715
},
{
"epoch": 0.8955223880597015,
"grad_norm": 1.8046875,
"learning_rate": 6.86047427493592e-05,
"loss": 0.6583,
"step": 3720
},
{
"epoch": 0.8967260471834376,
"grad_norm": 1.9296875,
"learning_rate": 6.858756344922003e-05,
"loss": 0.6701,
"step": 3725
},
{
"epoch": 0.8979297063071738,
"grad_norm": 1.921875,
"learning_rate": 6.857036477496424e-05,
"loss": 0.6863,
"step": 3730
},
{
"epoch": 0.8991333654309099,
"grad_norm": 1.9765625,
"learning_rate": 6.855314674202704e-05,
"loss": 0.6299,
"step": 3735
},
{
"epoch": 0.9003370245546461,
"grad_norm": 1.8984375,
"learning_rate": 6.853590936586105e-05,
"loss": 0.6614,
"step": 3740
},
{
"epoch": 0.9015406836783822,
"grad_norm": 2.15625,
"learning_rate": 6.851865266193622e-05,
"loss": 0.6342,
"step": 3745
},
{
"epoch": 0.9027443428021185,
"grad_norm": 1.9921875,
"learning_rate": 6.850137664573988e-05,
"loss": 0.6648,
"step": 3750
},
{
"epoch": 0.9039480019258546,
"grad_norm": 1.8203125,
"learning_rate": 6.848408133277669e-05,
"loss": 0.6791,
"step": 3755
},
{
"epoch": 0.9051516610495908,
"grad_norm": 1.984375,
"learning_rate": 6.84667667385686e-05,
"loss": 0.6739,
"step": 3760
},
{
"epoch": 0.9063553201733269,
"grad_norm": 2.140625,
"learning_rate": 6.844943287865487e-05,
"loss": 0.702,
"step": 3765
},
{
"epoch": 0.9075589792970631,
"grad_norm": 1.8828125,
"learning_rate": 6.843207976859207e-05,
"loss": 0.6633,
"step": 3770
},
{
"epoch": 0.9087626384207992,
"grad_norm": 2.0,
"learning_rate": 6.841470742395405e-05,
"loss": 0.6723,
"step": 3775
},
{
"epoch": 0.9099662975445354,
"grad_norm": 2.0625,
"learning_rate": 6.839731586033188e-05,
"loss": 0.6841,
"step": 3780
},
{
"epoch": 0.9111699566682715,
"grad_norm": 2.0,
"learning_rate": 6.837990509333393e-05,
"loss": 0.6754,
"step": 3785
},
{
"epoch": 0.9123736157920077,
"grad_norm": 1.984375,
"learning_rate": 6.836247513858579e-05,
"loss": 0.661,
"step": 3790
},
{
"epoch": 0.9135772749157438,
"grad_norm": 1.8984375,
"learning_rate": 6.834502601173023e-05,
"loss": 0.6476,
"step": 3795
},
{
"epoch": 0.91478093403948,
"grad_norm": 2.09375,
"learning_rate": 6.832755772842727e-05,
"loss": 0.6827,
"step": 3800
},
{
"epoch": 0.9159845931632162,
"grad_norm": 1.8828125,
"learning_rate": 6.831007030435414e-05,
"loss": 0.6691,
"step": 3805
},
{
"epoch": 0.9171882522869523,
"grad_norm": 2.0,
"learning_rate": 6.829256375520516e-05,
"loss": 0.7024,
"step": 3810
},
{
"epoch": 0.9183919114106885,
"grad_norm": 1.9609375,
"learning_rate": 6.827503809669192e-05,
"loss": 0.6433,
"step": 3815
},
{
"epoch": 0.9195955705344246,
"grad_norm": 1.9453125,
"learning_rate": 6.825749334454311e-05,
"loss": 0.6887,
"step": 3820
},
{
"epoch": 0.9207992296581609,
"grad_norm": 1.9765625,
"learning_rate": 6.823992951450455e-05,
"loss": 0.6566,
"step": 3825
},
{
"epoch": 0.922002888781897,
"grad_norm": 1.9765625,
"learning_rate": 6.822234662233916e-05,
"loss": 0.6828,
"step": 3830
},
{
"epoch": 0.9232065479056332,
"grad_norm": 1.9609375,
"learning_rate": 6.820474468382704e-05,
"loss": 0.6761,
"step": 3835
},
{
"epoch": 0.9244102070293693,
"grad_norm": 1.9765625,
"learning_rate": 6.818712371476534e-05,
"loss": 0.626,
"step": 3840
},
{
"epoch": 0.9256138661531055,
"grad_norm": 2.140625,
"learning_rate": 6.816948373096826e-05,
"loss": 0.6551,
"step": 3845
},
{
"epoch": 0.9268175252768416,
"grad_norm": 1.859375,
"learning_rate": 6.815182474826712e-05,
"loss": 0.665,
"step": 3850
},
{
"epoch": 0.9280211844005778,
"grad_norm": 1.9609375,
"learning_rate": 6.813414678251028e-05,
"loss": 0.7109,
"step": 3855
},
{
"epoch": 0.9292248435243139,
"grad_norm": 1.9453125,
"learning_rate": 6.811644984956307e-05,
"loss": 0.6588,
"step": 3860
},
{
"epoch": 0.9304285026480501,
"grad_norm": 2.015625,
"learning_rate": 6.809873396530795e-05,
"loss": 0.6724,
"step": 3865
},
{
"epoch": 0.9316321617717862,
"grad_norm": 2.078125,
"learning_rate": 6.808099914564431e-05,
"loss": 0.691,
"step": 3870
},
{
"epoch": 0.9328358208955224,
"grad_norm": 1.953125,
"learning_rate": 6.806324540648856e-05,
"loss": 0.6624,
"step": 3875
},
{
"epoch": 0.9340394800192585,
"grad_norm": 2.0,
"learning_rate": 6.80454727637741e-05,
"loss": 0.6777,
"step": 3880
},
{
"epoch": 0.9352431391429947,
"grad_norm": 1.9609375,
"learning_rate": 6.802768123345126e-05,
"loss": 0.6342,
"step": 3885
},
{
"epoch": 0.9364467982667308,
"grad_norm": 1.9921875,
"learning_rate": 6.800987083148736e-05,
"loss": 0.661,
"step": 3890
},
{
"epoch": 0.937650457390467,
"grad_norm": 2.0625,
"learning_rate": 6.799204157386665e-05,
"loss": 0.6604,
"step": 3895
},
{
"epoch": 0.9388541165142031,
"grad_norm": 2.09375,
"learning_rate": 6.797419347659026e-05,
"loss": 0.6768,
"step": 3900
},
{
"epoch": 0.9400577756379394,
"grad_norm": 1.8828125,
"learning_rate": 6.795632655567628e-05,
"loss": 0.6441,
"step": 3905
},
{
"epoch": 0.9412614347616755,
"grad_norm": 2.046875,
"learning_rate": 6.793844082715967e-05,
"loss": 0.6903,
"step": 3910
},
{
"epoch": 0.9424650938854117,
"grad_norm": 2.0625,
"learning_rate": 6.79205363070923e-05,
"loss": 0.6843,
"step": 3915
},
{
"epoch": 0.9436687530091478,
"grad_norm": 1.9921875,
"learning_rate": 6.790261301154283e-05,
"loss": 0.6827,
"step": 3920
},
{
"epoch": 0.944872412132884,
"grad_norm": 1.859375,
"learning_rate": 6.788467095659686e-05,
"loss": 0.6374,
"step": 3925
},
{
"epoch": 0.9460760712566201,
"grad_norm": 1.9765625,
"learning_rate": 6.786671015835677e-05,
"loss": 0.6569,
"step": 3930
},
{
"epoch": 0.9472797303803563,
"grad_norm": 2.140625,
"learning_rate": 6.784873063294177e-05,
"loss": 0.6511,
"step": 3935
},
{
"epoch": 0.9484833895040924,
"grad_norm": 1.8984375,
"learning_rate": 6.783073239648788e-05,
"loss": 0.6392,
"step": 3940
},
{
"epoch": 0.9496870486278286,
"grad_norm": 1.90625,
"learning_rate": 6.781271546514794e-05,
"loss": 0.6284,
"step": 3945
},
{
"epoch": 0.9508907077515648,
"grad_norm": 1.90625,
"learning_rate": 6.779467985509152e-05,
"loss": 0.6342,
"step": 3950
},
{
"epoch": 0.9520943668753009,
"grad_norm": 1.9296875,
"learning_rate": 6.777662558250498e-05,
"loss": 0.63,
"step": 3955
},
{
"epoch": 0.9532980259990371,
"grad_norm": 1.9609375,
"learning_rate": 6.775855266359144e-05,
"loss": 0.6278,
"step": 3960
},
{
"epoch": 0.9545016851227732,
"grad_norm": 1.9765625,
"learning_rate": 6.774046111457075e-05,
"loss": 0.6682,
"step": 3965
},
{
"epoch": 0.9557053442465094,
"grad_norm": 2.15625,
"learning_rate": 6.772235095167942e-05,
"loss": 0.6455,
"step": 3970
},
{
"epoch": 0.9569090033702455,
"grad_norm": 1.9921875,
"learning_rate": 6.770422219117076e-05,
"loss": 0.6545,
"step": 3975
},
{
"epoch": 0.9581126624939817,
"grad_norm": 2.171875,
"learning_rate": 6.76860748493147e-05,
"loss": 0.6731,
"step": 3980
},
{
"epoch": 0.9593163216177178,
"grad_norm": 2.25,
"learning_rate": 6.766790894239793e-05,
"loss": 0.6858,
"step": 3985
},
{
"epoch": 0.9605199807414541,
"grad_norm": 2.0,
"learning_rate": 6.764972448672365e-05,
"loss": 0.6308,
"step": 3990
},
{
"epoch": 0.9617236398651902,
"grad_norm": 1.9375,
"learning_rate": 6.763152149861189e-05,
"loss": 0.6771,
"step": 3995
},
{
"epoch": 0.9629272989889264,
"grad_norm": 2.4375,
"learning_rate": 6.761329999439916e-05,
"loss": 0.6341,
"step": 4000
},
{
"epoch": 0.9629272989889264,
"eval_loss": 0.5589016675949097,
"eval_runtime": 2.406,
"eval_samples_per_second": 83.126,
"eval_steps_per_second": 83.126,
"step": 4000
},
{
"epoch": 0.9641309581126625,
"grad_norm": 1.8203125,
"learning_rate": 6.759505999043869e-05,
"loss": 0.7023,
"step": 4005
},
{
"epoch": 0.9653346172363987,
"grad_norm": 2.078125,
"learning_rate": 6.757680150310026e-05,
"loss": 0.66,
"step": 4010
},
{
"epoch": 0.9665382763601348,
"grad_norm": 2.0625,
"learning_rate": 6.755852454877027e-05,
"loss": 0.6577,
"step": 4015
},
{
"epoch": 0.967741935483871,
"grad_norm": 1.96875,
"learning_rate": 6.754022914385163e-05,
"loss": 0.6657,
"step": 4020
},
{
"epoch": 0.9689455946076071,
"grad_norm": 2.078125,
"learning_rate": 6.75219153047639e-05,
"loss": 0.6462,
"step": 4025
},
{
"epoch": 0.9701492537313433,
"grad_norm": 1.8828125,
"learning_rate": 6.750358304794312e-05,
"loss": 0.6606,
"step": 4030
},
{
"epoch": 0.9713529128550794,
"grad_norm": 1.8125,
"learning_rate": 6.748523238984188e-05,
"loss": 0.6602,
"step": 4035
},
{
"epoch": 0.9725565719788156,
"grad_norm": 1.9453125,
"learning_rate": 6.746686334692929e-05,
"loss": 0.6587,
"step": 4040
},
{
"epoch": 0.9737602311025517,
"grad_norm": 1.9921875,
"learning_rate": 6.744847593569092e-05,
"loss": 0.6497,
"step": 4045
},
{
"epoch": 0.9749638902262879,
"grad_norm": 1.9375,
"learning_rate": 6.74300701726289e-05,
"loss": 0.6741,
"step": 4050
},
{
"epoch": 0.976167549350024,
"grad_norm": 2.1875,
"learning_rate": 6.741164607426177e-05,
"loss": 0.6446,
"step": 4055
},
{
"epoch": 0.9773712084737602,
"grad_norm": 2.03125,
"learning_rate": 6.739320365712451e-05,
"loss": 0.6547,
"step": 4060
},
{
"epoch": 0.9785748675974963,
"grad_norm": 2.125,
"learning_rate": 6.737474293776865e-05,
"loss": 0.6354,
"step": 4065
},
{
"epoch": 0.9797785267212326,
"grad_norm": 1.9453125,
"learning_rate": 6.7356263932762e-05,
"loss": 0.6489,
"step": 4070
},
{
"epoch": 0.9809821858449687,
"grad_norm": 2.203125,
"learning_rate": 6.733776665868885e-05,
"loss": 0.7068,
"step": 4075
},
{
"epoch": 0.9821858449687049,
"grad_norm": 1.90625,
"learning_rate": 6.731925113214994e-05,
"loss": 0.6695,
"step": 4080
},
{
"epoch": 0.983389504092441,
"grad_norm": 2.046875,
"learning_rate": 6.730071736976229e-05,
"loss": 0.6576,
"step": 4085
},
{
"epoch": 0.9845931632161772,
"grad_norm": 1.8515625,
"learning_rate": 6.728216538815934e-05,
"loss": 0.6666,
"step": 4090
},
{
"epoch": 0.9857968223399133,
"grad_norm": 2.0625,
"learning_rate": 6.726359520399088e-05,
"loss": 0.6542,
"step": 4095
},
{
"epoch": 0.9870004814636495,
"grad_norm": 2.09375,
"learning_rate": 6.724500683392303e-05,
"loss": 0.6726,
"step": 4100
},
{
"epoch": 0.9882041405873857,
"grad_norm": 2.015625,
"learning_rate": 6.722640029463823e-05,
"loss": 0.6588,
"step": 4105
},
{
"epoch": 0.9894077997111218,
"grad_norm": 1.9921875,
"learning_rate": 6.720777560283523e-05,
"loss": 0.6522,
"step": 4110
},
{
"epoch": 0.990611458834858,
"grad_norm": 1.8359375,
"learning_rate": 6.718913277522905e-05,
"loss": 0.6492,
"step": 4115
},
{
"epoch": 0.9918151179585941,
"grad_norm": 2.0,
"learning_rate": 6.717047182855104e-05,
"loss": 0.6672,
"step": 4120
},
{
"epoch": 0.9930187770823303,
"grad_norm": 1.953125,
"learning_rate": 6.715179277954874e-05,
"loss": 0.6509,
"step": 4125
},
{
"epoch": 0.9942224362060664,
"grad_norm": 1.9921875,
"learning_rate": 6.713309564498599e-05,
"loss": 0.6461,
"step": 4130
},
{
"epoch": 0.9954260953298026,
"grad_norm": 1.9375,
"learning_rate": 6.711438044164282e-05,
"loss": 0.6566,
"step": 4135
},
{
"epoch": 0.9966297544535387,
"grad_norm": 2.046875,
"learning_rate": 6.709564718631556e-05,
"loss": 0.6447,
"step": 4140
},
{
"epoch": 0.997833413577275,
"grad_norm": 1.9609375,
"learning_rate": 6.707689589581662e-05,
"loss": 0.6736,
"step": 4145
},
{
"epoch": 0.999037072701011,
"grad_norm": 1.9140625,
"learning_rate": 6.705812658697467e-05,
"loss": 0.6542,
"step": 4150
},
{
"epoch": 0.9997592681752527,
"eval_loss": 0.5545368194580078,
"eval_runtime": 2.4068,
"eval_samples_per_second": 83.099,
"eval_steps_per_second": 83.099,
"step": 4153
}
],
"logging_steps": 5,
"max_steps": 16616,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.04173997654016e+17,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}