instruct_1_our / trainer_state.json
bimabk's picture
Upload task output 1
f1d3cf8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999277804525758,
"eval_steps": 500,
"global_step": 12459,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001203659123736158,
"grad_norm": 4.21875,
"learning_rate": 2.5919093035841547e-05,
"loss": 1.9379,
"step": 5
},
{
"epoch": 0.002407318247472316,
"grad_norm": 4.09375,
"learning_rate": 5.831795933064347e-05,
"loss": 1.807,
"step": 10
},
{
"epoch": 0.0036109773712084737,
"grad_norm": 3.03125,
"learning_rate": 9.071682562544542e-05,
"loss": 1.682,
"step": 15
},
{
"epoch": 0.004814636494944632,
"grad_norm": 3.015625,
"learning_rate": 0.00012311569192024734,
"loss": 1.5536,
"step": 20
},
{
"epoch": 0.00601829561868079,
"grad_norm": 3.09375,
"learning_rate": 0.0001555145582150493,
"loss": 1.4964,
"step": 25
},
{
"epoch": 0.007221954742416947,
"grad_norm": 3.015625,
"learning_rate": 0.00018791342450985124,
"loss": 1.4161,
"step": 30
},
{
"epoch": 0.008425613866153106,
"grad_norm": 3.0625,
"learning_rate": 0.00022031229080465316,
"loss": 1.3743,
"step": 35
},
{
"epoch": 0.009629272989889264,
"grad_norm": 3.15625,
"learning_rate": 0.0002267920205809419,
"loss": 1.3171,
"step": 40
},
{
"epoch": 0.010832932113625422,
"grad_norm": 2.84375,
"learning_rate": 0.0002267918439326645,
"loss": 1.3092,
"step": 45
},
{
"epoch": 0.01203659123736158,
"grad_norm": 3.125,
"learning_rate": 0.00022679153140139644,
"loss": 1.2614,
"step": 50
},
{
"epoch": 0.013240250361097737,
"grad_norm": 3.140625,
"learning_rate": 0.00022679108298763706,
"loss": 1.2531,
"step": 55
},
{
"epoch": 0.014443909484833895,
"grad_norm": 2.984375,
"learning_rate": 0.00022679049869210282,
"loss": 1.2076,
"step": 60
},
{
"epoch": 0.015647568608570053,
"grad_norm": 2.828125,
"learning_rate": 0.00022678977851572723,
"loss": 1.2066,
"step": 65
},
{
"epoch": 0.016851227732306212,
"grad_norm": 2.984375,
"learning_rate": 0.00022678892245966094,
"loss": 1.1777,
"step": 70
},
{
"epoch": 0.018054886856042368,
"grad_norm": 2.390625,
"learning_rate": 0.00022678793052527177,
"loss": 1.1732,
"step": 75
},
{
"epoch": 0.019258545979778528,
"grad_norm": 2.828125,
"learning_rate": 0.00022678680271414454,
"loss": 1.1521,
"step": 80
},
{
"epoch": 0.020462205103514684,
"grad_norm": 2.453125,
"learning_rate": 0.00022678553902808117,
"loss": 1.152,
"step": 85
},
{
"epoch": 0.021665864227250843,
"grad_norm": 2.578125,
"learning_rate": 0.0002267841394691008,
"loss": 1.1442,
"step": 90
},
{
"epoch": 0.022869523350987,
"grad_norm": 2.484375,
"learning_rate": 0.00022678260403943945,
"loss": 1.1272,
"step": 95
},
{
"epoch": 0.02407318247472316,
"grad_norm": 2.359375,
"learning_rate": 0.00022678093274155044,
"loss": 1.1132,
"step": 100
},
{
"epoch": 0.025276841598459315,
"grad_norm": 2.875,
"learning_rate": 0.00022677912557810405,
"loss": 1.0841,
"step": 105
},
{
"epoch": 0.026480500722195474,
"grad_norm": 2.171875,
"learning_rate": 0.0002267771825519876,
"loss": 1.1196,
"step": 110
},
{
"epoch": 0.027684159845931634,
"grad_norm": 2.46875,
"learning_rate": 0.00022677510366630565,
"loss": 1.1167,
"step": 115
},
{
"epoch": 0.02888781896966779,
"grad_norm": 2.90625,
"learning_rate": 0.00022677288892437964,
"loss": 1.0928,
"step": 120
},
{
"epoch": 0.03009147809340395,
"grad_norm": 2.296875,
"learning_rate": 0.0002267705383297482,
"loss": 1.042,
"step": 125
},
{
"epoch": 0.031295137217140105,
"grad_norm": 2.546875,
"learning_rate": 0.000226768051886167,
"loss": 1.1231,
"step": 130
},
{
"epoch": 0.03249879634087626,
"grad_norm": 2.4375,
"learning_rate": 0.00022676542959760868,
"loss": 1.0561,
"step": 135
},
{
"epoch": 0.033702455464612424,
"grad_norm": 2.453125,
"learning_rate": 0.00022676267146826303,
"loss": 1.0566,
"step": 140
},
{
"epoch": 0.03490611458834858,
"grad_norm": 2.671875,
"learning_rate": 0.00022675977750253681,
"loss": 1.0463,
"step": 145
},
{
"epoch": 0.036109773712084736,
"grad_norm": 2.203125,
"learning_rate": 0.00022675674770505387,
"loss": 1.0553,
"step": 150
},
{
"epoch": 0.03731343283582089,
"grad_norm": 2.390625,
"learning_rate": 0.000226753582080655,
"loss": 1.0489,
"step": 155
},
{
"epoch": 0.038517091959557055,
"grad_norm": 2.171875,
"learning_rate": 0.0002267502806343981,
"loss": 1.0717,
"step": 160
},
{
"epoch": 0.03972075108329321,
"grad_norm": 2.171875,
"learning_rate": 0.000226746843371558,
"loss": 1.0378,
"step": 165
},
{
"epoch": 0.04092441020702937,
"grad_norm": 2.328125,
"learning_rate": 0.00022674327029762663,
"loss": 1.0443,
"step": 170
},
{
"epoch": 0.04212806933076553,
"grad_norm": 2.046875,
"learning_rate": 0.00022673956141831277,
"loss": 1.0333,
"step": 175
},
{
"epoch": 0.043331728454501686,
"grad_norm": 2.390625,
"learning_rate": 0.00022673571673954227,
"loss": 1.0333,
"step": 180
},
{
"epoch": 0.04453538757823784,
"grad_norm": 2.015625,
"learning_rate": 0.00022673173626745798,
"loss": 0.9786,
"step": 185
},
{
"epoch": 0.045739046701974,
"grad_norm": 2.296875,
"learning_rate": 0.00022672762000841963,
"loss": 1.0438,
"step": 190
},
{
"epoch": 0.04694270582571016,
"grad_norm": 2.03125,
"learning_rate": 0.000226723367969004,
"loss": 0.9916,
"step": 195
},
{
"epoch": 0.04814636494944632,
"grad_norm": 2.1875,
"learning_rate": 0.0002267189801560047,
"loss": 0.9889,
"step": 200
},
{
"epoch": 0.04935002407318247,
"grad_norm": 1.921875,
"learning_rate": 0.00022671445657643235,
"loss": 0.987,
"step": 205
},
{
"epoch": 0.05055368319691863,
"grad_norm": 1.8515625,
"learning_rate": 0.0002267097972375145,
"loss": 0.9922,
"step": 210
},
{
"epoch": 0.05175734232065479,
"grad_norm": 2.484375,
"learning_rate": 0.00022670500214669556,
"loss": 1.0039,
"step": 215
},
{
"epoch": 0.05296100144439095,
"grad_norm": 2.125,
"learning_rate": 0.00022670007131163683,
"loss": 0.9978,
"step": 220
},
{
"epoch": 0.054164660568127104,
"grad_norm": 2.15625,
"learning_rate": 0.00022669500474021656,
"loss": 0.9857,
"step": 225
},
{
"epoch": 0.05536831969186327,
"grad_norm": 2.078125,
"learning_rate": 0.00022668980244052982,
"loss": 0.9795,
"step": 230
},
{
"epoch": 0.05657197881559942,
"grad_norm": 1.953125,
"learning_rate": 0.00022668446442088852,
"loss": 0.9548,
"step": 235
},
{
"epoch": 0.05777563793933558,
"grad_norm": 2.15625,
"learning_rate": 0.0002266789906898215,
"loss": 0.9545,
"step": 240
},
{
"epoch": 0.058979297063071735,
"grad_norm": 1.96875,
"learning_rate": 0.00022667338125607434,
"loss": 0.9769,
"step": 245
},
{
"epoch": 0.0601829561868079,
"grad_norm": 1.984375,
"learning_rate": 0.00022666763612860948,
"loss": 0.9618,
"step": 250
},
{
"epoch": 0.061386615310544054,
"grad_norm": 1.8359375,
"learning_rate": 0.0002266617553166062,
"loss": 0.9863,
"step": 255
},
{
"epoch": 0.06259027443428021,
"grad_norm": 1.953125,
"learning_rate": 0.00022665573882946045,
"loss": 0.9239,
"step": 260
},
{
"epoch": 0.06379393355801637,
"grad_norm": 1.984375,
"learning_rate": 0.00022664958667678516,
"loss": 0.9602,
"step": 265
},
{
"epoch": 0.06499759268175252,
"grad_norm": 2.1875,
"learning_rate": 0.0002266432988684098,
"loss": 0.9458,
"step": 270
},
{
"epoch": 0.06620125180548869,
"grad_norm": 2.046875,
"learning_rate": 0.00022663687541438066,
"loss": 0.9507,
"step": 275
},
{
"epoch": 0.06740491092922485,
"grad_norm": 1.9765625,
"learning_rate": 0.00022663031632496082,
"loss": 0.9438,
"step": 280
},
{
"epoch": 0.068608570052961,
"grad_norm": 2.078125,
"learning_rate": 0.00022662362161063,
"loss": 0.9258,
"step": 285
},
{
"epoch": 0.06981222917669716,
"grad_norm": 2.0,
"learning_rate": 0.00022661679128208466,
"loss": 0.9269,
"step": 290
},
{
"epoch": 0.07101588830043332,
"grad_norm": 2.109375,
"learning_rate": 0.0002266098253502379,
"loss": 0.923,
"step": 295
},
{
"epoch": 0.07221954742416947,
"grad_norm": 2.046875,
"learning_rate": 0.0002266027238262195,
"loss": 0.899,
"step": 300
},
{
"epoch": 0.07342320654790563,
"grad_norm": 2.09375,
"learning_rate": 0.0002265954867213759,
"loss": 0.9169,
"step": 305
},
{
"epoch": 0.07462686567164178,
"grad_norm": 1.9765625,
"learning_rate": 0.00022658811404727006,
"loss": 0.9648,
"step": 310
},
{
"epoch": 0.07583052479537795,
"grad_norm": 2.078125,
"learning_rate": 0.00022658060581568168,
"loss": 0.8962,
"step": 315
},
{
"epoch": 0.07703418391911411,
"grad_norm": 2.171875,
"learning_rate": 0.00022657296203860703,
"loss": 0.9045,
"step": 320
},
{
"epoch": 0.07823784304285027,
"grad_norm": 2.28125,
"learning_rate": 0.0002265651827282588,
"loss": 0.9542,
"step": 325
},
{
"epoch": 0.07944150216658642,
"grad_norm": 2.03125,
"learning_rate": 0.00022655726789706638,
"loss": 0.9492,
"step": 330
},
{
"epoch": 0.08064516129032258,
"grad_norm": 2.046875,
"learning_rate": 0.0002265492175576757,
"loss": 0.9183,
"step": 335
},
{
"epoch": 0.08184882041405873,
"grad_norm": 2.046875,
"learning_rate": 0.00022654103172294905,
"loss": 0.9093,
"step": 340
},
{
"epoch": 0.08305247953779489,
"grad_norm": 2.046875,
"learning_rate": 0.00022653271040596538,
"loss": 0.8886,
"step": 345
},
{
"epoch": 0.08425613866153106,
"grad_norm": 1.8203125,
"learning_rate": 0.00022652425362001993,
"loss": 0.9305,
"step": 350
},
{
"epoch": 0.08545979778526722,
"grad_norm": 1.953125,
"learning_rate": 0.00022651566137862455,
"loss": 0.9008,
"step": 355
},
{
"epoch": 0.08666345690900337,
"grad_norm": 1.6640625,
"learning_rate": 0.0002265069336955074,
"loss": 0.9077,
"step": 360
},
{
"epoch": 0.08786711603273953,
"grad_norm": 2.0,
"learning_rate": 0.0002264980705846131,
"loss": 0.9076,
"step": 365
},
{
"epoch": 0.08907077515647568,
"grad_norm": 1.7578125,
"learning_rate": 0.00022648907206010264,
"loss": 0.9081,
"step": 370
},
{
"epoch": 0.09027443428021184,
"grad_norm": 1.8125,
"learning_rate": 0.00022647993813635332,
"loss": 0.9151,
"step": 375
},
{
"epoch": 0.091478093403948,
"grad_norm": 1.671875,
"learning_rate": 0.00022647066882795883,
"loss": 0.8817,
"step": 380
},
{
"epoch": 0.09268175252768417,
"grad_norm": 2.015625,
"learning_rate": 0.00022646126414972915,
"loss": 0.9013,
"step": 385
},
{
"epoch": 0.09388541165142032,
"grad_norm": 1.84375,
"learning_rate": 0.00022645172411669054,
"loss": 0.8935,
"step": 390
},
{
"epoch": 0.09508907077515648,
"grad_norm": 1.875,
"learning_rate": 0.0002264420487440855,
"loss": 0.8913,
"step": 395
},
{
"epoch": 0.09629272989889263,
"grad_norm": 1.953125,
"learning_rate": 0.00022643223804737285,
"loss": 0.885,
"step": 400
},
{
"epoch": 0.09749638902262879,
"grad_norm": 1.734375,
"learning_rate": 0.00022642229204222753,
"loss": 0.8901,
"step": 405
},
{
"epoch": 0.09870004814636495,
"grad_norm": 1.7890625,
"learning_rate": 0.00022641221074454071,
"loss": 0.8957,
"step": 410
},
{
"epoch": 0.0999037072701011,
"grad_norm": 1.8125,
"learning_rate": 0.00022640199417041975,
"loss": 0.9,
"step": 415
},
{
"epoch": 0.10110736639383726,
"grad_norm": 1.96875,
"learning_rate": 0.00022639164233618806,
"loss": 0.8978,
"step": 420
},
{
"epoch": 0.10231102551757343,
"grad_norm": 2.0,
"learning_rate": 0.00022638115525838528,
"loss": 0.8986,
"step": 425
},
{
"epoch": 0.10351468464130958,
"grad_norm": 1.9140625,
"learning_rate": 0.00022637053295376702,
"loss": 0.8391,
"step": 430
},
{
"epoch": 0.10471834376504574,
"grad_norm": 1.7265625,
"learning_rate": 0.00022635977543930503,
"loss": 0.8886,
"step": 435
},
{
"epoch": 0.1059220028887819,
"grad_norm": 1.9296875,
"learning_rate": 0.00022634888273218704,
"loss": 0.8602,
"step": 440
},
{
"epoch": 0.10712566201251805,
"grad_norm": 1.9375,
"learning_rate": 0.0002263378548498168,
"loss": 0.8885,
"step": 445
},
{
"epoch": 0.10832932113625421,
"grad_norm": 1.6484375,
"learning_rate": 0.00022632669180981408,
"loss": 0.8981,
"step": 450
},
{
"epoch": 0.10953298025999036,
"grad_norm": 1.5859375,
"learning_rate": 0.00022631539363001456,
"loss": 0.8979,
"step": 455
},
{
"epoch": 0.11073663938372653,
"grad_norm": 1.6640625,
"learning_rate": 0.00022630396032846976,
"loss": 0.8628,
"step": 460
},
{
"epoch": 0.11194029850746269,
"grad_norm": 1.7890625,
"learning_rate": 0.00022629239192344726,
"loss": 0.8569,
"step": 465
},
{
"epoch": 0.11314395763119885,
"grad_norm": 1.6953125,
"learning_rate": 0.0002262806884334303,
"loss": 0.8698,
"step": 470
},
{
"epoch": 0.114347616754935,
"grad_norm": 1.828125,
"learning_rate": 0.00022626884987711815,
"loss": 0.9038,
"step": 475
},
{
"epoch": 0.11555127587867116,
"grad_norm": 1.6328125,
"learning_rate": 0.0002262568762734257,
"loss": 0.8517,
"step": 480
},
{
"epoch": 0.11675493500240731,
"grad_norm": 1.7421875,
"learning_rate": 0.00022624476764148384,
"loss": 0.8671,
"step": 485
},
{
"epoch": 0.11795859412614347,
"grad_norm": 1.6328125,
"learning_rate": 0.00022623252400063893,
"loss": 0.8056,
"step": 490
},
{
"epoch": 0.11916225324987964,
"grad_norm": 1.71875,
"learning_rate": 0.00022622014537045318,
"loss": 0.8404,
"step": 495
},
{
"epoch": 0.1203659123736158,
"grad_norm": 1.7734375,
"learning_rate": 0.00022620763177070452,
"loss": 0.8565,
"step": 500
},
{
"epoch": 0.1203659123736158,
"eval_loss": 0.7673972845077515,
"eval_runtime": 2.3652,
"eval_samples_per_second": 84.558,
"eval_steps_per_second": 84.558,
"step": 500
},
{
"epoch": 0.12156957149735195,
"grad_norm": 1.8984375,
"learning_rate": 0.00022619498322138643,
"loss": 0.8111,
"step": 505
},
{
"epoch": 0.12277323062108811,
"grad_norm": 2.015625,
"learning_rate": 0.0002261821997427081,
"loss": 0.8504,
"step": 510
},
{
"epoch": 0.12397688974482426,
"grad_norm": 1.8046875,
"learning_rate": 0.0002261692813550942,
"loss": 0.8873,
"step": 515
},
{
"epoch": 0.12518054886856042,
"grad_norm": 1.875,
"learning_rate": 0.00022615622807918505,
"loss": 0.9048,
"step": 520
},
{
"epoch": 0.12638420799229658,
"grad_norm": 1.828125,
"learning_rate": 0.0002261430399358364,
"loss": 0.8834,
"step": 525
},
{
"epoch": 0.12758786711603273,
"grad_norm": 1.90625,
"learning_rate": 0.00022612971694611954,
"loss": 0.8471,
"step": 530
},
{
"epoch": 0.1287915262397689,
"grad_norm": 1.9453125,
"learning_rate": 0.0002261162591313212,
"loss": 0.8373,
"step": 535
},
{
"epoch": 0.12999518536350504,
"grad_norm": 1.7265625,
"learning_rate": 0.00022610266651294347,
"loss": 0.8399,
"step": 540
},
{
"epoch": 0.1311988444872412,
"grad_norm": 1.734375,
"learning_rate": 0.00022608893911270394,
"loss": 0.8563,
"step": 545
},
{
"epoch": 0.13240250361097738,
"grad_norm": 1.84375,
"learning_rate": 0.00022607507695253541,
"loss": 0.8144,
"step": 550
},
{
"epoch": 0.13360616273471354,
"grad_norm": 1.7421875,
"learning_rate": 0.00022606108005458612,
"loss": 0.855,
"step": 555
},
{
"epoch": 0.1348098218584497,
"grad_norm": 1.671875,
"learning_rate": 0.00022604694844121948,
"loss": 0.842,
"step": 560
},
{
"epoch": 0.13601348098218585,
"grad_norm": 1.7890625,
"learning_rate": 0.00022603268213501425,
"loss": 0.8178,
"step": 565
},
{
"epoch": 0.137217140105922,
"grad_norm": 1.6875,
"learning_rate": 0.00022601828115876422,
"loss": 0.8178,
"step": 570
},
{
"epoch": 0.13842079922965816,
"grad_norm": 1.671875,
"learning_rate": 0.00022600374553547852,
"loss": 0.775,
"step": 575
},
{
"epoch": 0.13962445835339432,
"grad_norm": 1.703125,
"learning_rate": 0.00022598907528838139,
"loss": 0.8318,
"step": 580
},
{
"epoch": 0.14082811747713048,
"grad_norm": 1.765625,
"learning_rate": 0.00022597427044091206,
"loss": 0.8249,
"step": 585
},
{
"epoch": 0.14203177660086663,
"grad_norm": 1.71875,
"learning_rate": 0.00022595933101672488,
"loss": 0.8299,
"step": 590
},
{
"epoch": 0.1432354357246028,
"grad_norm": 1.6796875,
"learning_rate": 0.00022594425703968926,
"loss": 0.8309,
"step": 595
},
{
"epoch": 0.14443909484833894,
"grad_norm": 1.6796875,
"learning_rate": 0.00022592904853388957,
"loss": 0.8033,
"step": 600
},
{
"epoch": 0.1456427539720751,
"grad_norm": 1.5703125,
"learning_rate": 0.00022591370552362504,
"loss": 0.8265,
"step": 605
},
{
"epoch": 0.14684641309581126,
"grad_norm": 1.59375,
"learning_rate": 0.0002258982280334099,
"loss": 0.8166,
"step": 610
},
{
"epoch": 0.1480500722195474,
"grad_norm": 1.6171875,
"learning_rate": 0.0002258826160879732,
"loss": 0.8376,
"step": 615
},
{
"epoch": 0.14925373134328357,
"grad_norm": 1.5234375,
"learning_rate": 0.00022586686971225886,
"loss": 0.8574,
"step": 620
},
{
"epoch": 0.15045739046701975,
"grad_norm": 1.828125,
"learning_rate": 0.0002258509889314255,
"loss": 0.8235,
"step": 625
},
{
"epoch": 0.1516610495907559,
"grad_norm": 1.6640625,
"learning_rate": 0.00022583497377084654,
"loss": 0.8483,
"step": 630
},
{
"epoch": 0.15286470871449206,
"grad_norm": 1.65625,
"learning_rate": 0.00022581882425611017,
"loss": 0.8378,
"step": 635
},
{
"epoch": 0.15406836783822822,
"grad_norm": 1.5078125,
"learning_rate": 0.00022580254041301912,
"loss": 0.8122,
"step": 640
},
{
"epoch": 0.15527202696196438,
"grad_norm": 1.7265625,
"learning_rate": 0.0002257861222675908,
"loss": 0.8318,
"step": 645
},
{
"epoch": 0.15647568608570053,
"grad_norm": 1.53125,
"learning_rate": 0.0002257695698460572,
"loss": 0.8372,
"step": 650
},
{
"epoch": 0.1576793452094367,
"grad_norm": 1.6328125,
"learning_rate": 0.00022575288317486488,
"loss": 0.8612,
"step": 655
},
{
"epoch": 0.15888300433317284,
"grad_norm": 1.6640625,
"learning_rate": 0.00022573606228067477,
"loss": 0.823,
"step": 660
},
{
"epoch": 0.160086663456909,
"grad_norm": 1.6171875,
"learning_rate": 0.00022571910719036245,
"loss": 0.8474,
"step": 665
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.8125,
"learning_rate": 0.00022570201793101777,
"loss": 0.8252,
"step": 670
},
{
"epoch": 0.1624939817043813,
"grad_norm": 1.7265625,
"learning_rate": 0.00022568479452994496,
"loss": 0.7871,
"step": 675
},
{
"epoch": 0.16369764082811747,
"grad_norm": 1.6640625,
"learning_rate": 0.00022566743701466264,
"loss": 0.8156,
"step": 680
},
{
"epoch": 0.16490129995185362,
"grad_norm": 1.671875,
"learning_rate": 0.00022564994541290366,
"loss": 0.8035,
"step": 685
},
{
"epoch": 0.16610495907558978,
"grad_norm": 1.5859375,
"learning_rate": 0.00022563231975261506,
"loss": 0.8049,
"step": 690
},
{
"epoch": 0.16730861819932596,
"grad_norm": 1.6484375,
"learning_rate": 0.00022561456006195825,
"loss": 0.7977,
"step": 695
},
{
"epoch": 0.16851227732306212,
"grad_norm": 1.6328125,
"learning_rate": 0.00022559666636930853,
"loss": 0.792,
"step": 700
},
{
"epoch": 0.16971593644679828,
"grad_norm": 1.8125,
"learning_rate": 0.0002255786387032555,
"loss": 0.8435,
"step": 705
},
{
"epoch": 0.17091959557053443,
"grad_norm": 1.765625,
"learning_rate": 0.00022556047709260273,
"loss": 0.8189,
"step": 710
},
{
"epoch": 0.1721232546942706,
"grad_norm": 1.5859375,
"learning_rate": 0.00022554218156636783,
"loss": 0.8183,
"step": 715
},
{
"epoch": 0.17332691381800674,
"grad_norm": 1.7109375,
"learning_rate": 0.00022552375215378242,
"loss": 0.7978,
"step": 720
},
{
"epoch": 0.1745305729417429,
"grad_norm": 1.7890625,
"learning_rate": 0.00022550518888429184,
"loss": 0.8575,
"step": 725
},
{
"epoch": 0.17573423206547906,
"grad_norm": 1.734375,
"learning_rate": 0.00022548649178755556,
"loss": 0.8085,
"step": 730
},
{
"epoch": 0.1769378911892152,
"grad_norm": 1.6953125,
"learning_rate": 0.00022546766089344666,
"loss": 0.8164,
"step": 735
},
{
"epoch": 0.17814155031295137,
"grad_norm": 1.6015625,
"learning_rate": 0.00022544869623205215,
"loss": 0.7827,
"step": 740
},
{
"epoch": 0.17934520943668752,
"grad_norm": 1.5703125,
"learning_rate": 0.00022542959783367265,
"loss": 0.7821,
"step": 745
},
{
"epoch": 0.18054886856042368,
"grad_norm": 1.546875,
"learning_rate": 0.00022541036572882255,
"loss": 0.7637,
"step": 750
},
{
"epoch": 0.18175252768415984,
"grad_norm": 1.53125,
"learning_rate": 0.00022539099994822978,
"loss": 0.7903,
"step": 755
},
{
"epoch": 0.182956186807896,
"grad_norm": 1.484375,
"learning_rate": 0.00022537150052283589,
"loss": 0.7552,
"step": 760
},
{
"epoch": 0.18415984593163215,
"grad_norm": 1.7890625,
"learning_rate": 0.000225351867483796,
"loss": 0.7977,
"step": 765
},
{
"epoch": 0.18536350505536833,
"grad_norm": 1.390625,
"learning_rate": 0.00022533210086247865,
"loss": 0.7503,
"step": 770
},
{
"epoch": 0.1865671641791045,
"grad_norm": 1.5859375,
"learning_rate": 0.00022531220069046585,
"loss": 0.8362,
"step": 775
},
{
"epoch": 0.18777082330284064,
"grad_norm": 1.703125,
"learning_rate": 0.00022529216699955295,
"loss": 0.7911,
"step": 780
},
{
"epoch": 0.1889744824265768,
"grad_norm": 1.5390625,
"learning_rate": 0.00022527199982174865,
"loss": 0.8015,
"step": 785
},
{
"epoch": 0.19017814155031296,
"grad_norm": 1.7421875,
"learning_rate": 0.000225251699189275,
"loss": 0.7788,
"step": 790
},
{
"epoch": 0.1913818006740491,
"grad_norm": 1.4453125,
"learning_rate": 0.0002252312651345671,
"loss": 0.7523,
"step": 795
},
{
"epoch": 0.19258545979778527,
"grad_norm": 1.6484375,
"learning_rate": 0.0002252106976902734,
"loss": 0.7903,
"step": 800
},
{
"epoch": 0.19378911892152142,
"grad_norm": 1.734375,
"learning_rate": 0.00022518999688925538,
"loss": 0.7934,
"step": 805
},
{
"epoch": 0.19499277804525758,
"grad_norm": 1.5546875,
"learning_rate": 0.00022516916276458764,
"loss": 0.7886,
"step": 810
},
{
"epoch": 0.19619643716899374,
"grad_norm": 1.59375,
"learning_rate": 0.00022514819534955773,
"loss": 0.773,
"step": 815
},
{
"epoch": 0.1974000962927299,
"grad_norm": 3.140625,
"learning_rate": 0.00022512709467766622,
"loss": 0.7914,
"step": 820
},
{
"epoch": 0.19860375541646605,
"grad_norm": 1.5078125,
"learning_rate": 0.0002251058607826266,
"loss": 0.7788,
"step": 825
},
{
"epoch": 0.1998074145402022,
"grad_norm": 1.640625,
"learning_rate": 0.00022508449369836514,
"loss": 0.7857,
"step": 830
},
{
"epoch": 0.20101107366393836,
"grad_norm": 1.6015625,
"learning_rate": 0.00022506299345902102,
"loss": 0.7934,
"step": 835
},
{
"epoch": 0.20221473278767452,
"grad_norm": 1.5859375,
"learning_rate": 0.00022504136009894607,
"loss": 0.7677,
"step": 840
},
{
"epoch": 0.2034183919114107,
"grad_norm": 1.46875,
"learning_rate": 0.00022501959365270487,
"loss": 0.7907,
"step": 845
},
{
"epoch": 0.20462205103514686,
"grad_norm": 1.5703125,
"learning_rate": 0.00022499769415507462,
"loss": 0.7947,
"step": 850
},
{
"epoch": 0.205825710158883,
"grad_norm": 1.453125,
"learning_rate": 0.00022497566164104507,
"loss": 0.8128,
"step": 855
},
{
"epoch": 0.20702936928261917,
"grad_norm": 1.484375,
"learning_rate": 0.00022495349614581862,
"loss": 0.7916,
"step": 860
},
{
"epoch": 0.20823302840635532,
"grad_norm": 1.5703125,
"learning_rate": 0.00022493119770480995,
"loss": 0.8329,
"step": 865
},
{
"epoch": 0.20943668753009148,
"grad_norm": 1.5078125,
"learning_rate": 0.00022490876635364627,
"loss": 0.7984,
"step": 870
},
{
"epoch": 0.21064034665382764,
"grad_norm": 1.6796875,
"learning_rate": 0.00022488620212816722,
"loss": 0.7822,
"step": 875
},
{
"epoch": 0.2118440057775638,
"grad_norm": 1.421875,
"learning_rate": 0.00022486350506442453,
"loss": 0.8054,
"step": 880
},
{
"epoch": 0.21304766490129995,
"grad_norm": 1.609375,
"learning_rate": 0.00022484067519868236,
"loss": 0.7822,
"step": 885
},
{
"epoch": 0.2142513240250361,
"grad_norm": 1.453125,
"learning_rate": 0.00022481771256741695,
"loss": 0.765,
"step": 890
},
{
"epoch": 0.21545498314877226,
"grad_norm": 1.734375,
"learning_rate": 0.0002247946172073167,
"loss": 0.7738,
"step": 895
},
{
"epoch": 0.21665864227250842,
"grad_norm": 1.375,
"learning_rate": 0.0002247713891552821,
"loss": 0.7587,
"step": 900
},
{
"epoch": 0.21786230139624457,
"grad_norm": 1.6875,
"learning_rate": 0.00022474802844842562,
"loss": 0.7927,
"step": 905
},
{
"epoch": 0.21906596051998073,
"grad_norm": 1.4921875,
"learning_rate": 0.00022472453512407164,
"loss": 0.7889,
"step": 910
},
{
"epoch": 0.2202696196437169,
"grad_norm": 1.5234375,
"learning_rate": 0.00022470090921975652,
"loss": 0.7653,
"step": 915
},
{
"epoch": 0.22147327876745307,
"grad_norm": 1.5234375,
"learning_rate": 0.00022467715077322835,
"loss": 0.7868,
"step": 920
},
{
"epoch": 0.22267693789118922,
"grad_norm": 5.34375,
"learning_rate": 0.0002246532598224471,
"loss": 0.7801,
"step": 925
},
{
"epoch": 0.22388059701492538,
"grad_norm": 1.65625,
"learning_rate": 0.00022462923640558435,
"loss": 0.775,
"step": 930
},
{
"epoch": 0.22508425613866154,
"grad_norm": 1.5390625,
"learning_rate": 0.0002246050805610233,
"loss": 0.7802,
"step": 935
},
{
"epoch": 0.2262879152623977,
"grad_norm": 1.5625,
"learning_rate": 0.0002245807923273589,
"loss": 0.7876,
"step": 940
},
{
"epoch": 0.22749157438613385,
"grad_norm": 1.515625,
"learning_rate": 0.00022455637174339748,
"loss": 0.7789,
"step": 945
},
{
"epoch": 0.22869523350987,
"grad_norm": 1.6015625,
"learning_rate": 0.0002245318188481569,
"loss": 0.8027,
"step": 950
},
{
"epoch": 0.22989889263360616,
"grad_norm": 1.546875,
"learning_rate": 0.0002245071336808663,
"loss": 0.748,
"step": 955
},
{
"epoch": 0.23110255175734232,
"grad_norm": 1.578125,
"learning_rate": 0.00022448231628096634,
"loss": 0.7687,
"step": 960
},
{
"epoch": 0.23230621088107847,
"grad_norm": 1.5703125,
"learning_rate": 0.00022445736668810887,
"loss": 0.7748,
"step": 965
},
{
"epoch": 0.23350987000481463,
"grad_norm": 1.328125,
"learning_rate": 0.00022443228494215686,
"loss": 0.769,
"step": 970
},
{
"epoch": 0.23471352912855079,
"grad_norm": 1.5078125,
"learning_rate": 0.0002244070710831846,
"loss": 0.7438,
"step": 975
},
{
"epoch": 0.23591718825228694,
"grad_norm": 1.4921875,
"learning_rate": 0.0002243817251514773,
"loss": 0.7448,
"step": 980
},
{
"epoch": 0.2371208473760231,
"grad_norm": 1.359375,
"learning_rate": 0.0002243562471875313,
"loss": 0.7501,
"step": 985
},
{
"epoch": 0.23832450649975928,
"grad_norm": 1.421875,
"learning_rate": 0.00022433063723205387,
"loss": 0.7518,
"step": 990
},
{
"epoch": 0.23952816562349544,
"grad_norm": 1.515625,
"learning_rate": 0.00022430489532596312,
"loss": 0.762,
"step": 995
},
{
"epoch": 0.2407318247472316,
"grad_norm": 1.5078125,
"learning_rate": 0.000224279021510388,
"loss": 0.7501,
"step": 1000
},
{
"epoch": 0.2407318247472316,
"eval_loss": 0.6653648614883423,
"eval_runtime": 2.3753,
"eval_samples_per_second": 84.199,
"eval_steps_per_second": 84.199,
"step": 1000
},
{
"epoch": 0.24193548387096775,
"grad_norm": 1.5390625,
"learning_rate": 0.00022425301582666831,
"loss": 0.7778,
"step": 1005
},
{
"epoch": 0.2431391429947039,
"grad_norm": 1.5078125,
"learning_rate": 0.0002242268783163544,
"loss": 0.7564,
"step": 1010
},
{
"epoch": 0.24434280211844006,
"grad_norm": 1.4296875,
"learning_rate": 0.00022420060902120735,
"loss": 0.759,
"step": 1015
},
{
"epoch": 0.24554646124217622,
"grad_norm": 1.578125,
"learning_rate": 0.00022417420798319872,
"loss": 0.731,
"step": 1020
},
{
"epoch": 0.24675012036591237,
"grad_norm": 1.328125,
"learning_rate": 0.00022414767524451065,
"loss": 0.7295,
"step": 1025
},
{
"epoch": 0.24795377948964853,
"grad_norm": 1.4453125,
"learning_rate": 0.00022412101084753557,
"loss": 0.7235,
"step": 1030
},
{
"epoch": 0.24915743861338469,
"grad_norm": 1.6171875,
"learning_rate": 0.00022409421483487644,
"loss": 0.7701,
"step": 1035
},
{
"epoch": 0.25036109773712084,
"grad_norm": 1.4375,
"learning_rate": 0.0002240672872493464,
"loss": 0.7675,
"step": 1040
},
{
"epoch": 0.251564756860857,
"grad_norm": 1.5703125,
"learning_rate": 0.0002240402281339688,
"loss": 0.7081,
"step": 1045
},
{
"epoch": 0.25276841598459315,
"grad_norm": 1.4453125,
"learning_rate": 0.00022401303753197716,
"loss": 0.7421,
"step": 1050
},
{
"epoch": 0.2539720751083293,
"grad_norm": 1.390625,
"learning_rate": 0.00022398571548681517,
"loss": 0.6996,
"step": 1055
},
{
"epoch": 0.25517573423206547,
"grad_norm": 1.5234375,
"learning_rate": 0.00022395826204213635,
"loss": 0.7696,
"step": 1060
},
{
"epoch": 0.2563793933558016,
"grad_norm": 1.390625,
"learning_rate": 0.00022393067724180436,
"loss": 0.7645,
"step": 1065
},
{
"epoch": 0.2575830524795378,
"grad_norm": 1.53125,
"learning_rate": 0.00022390296112989258,
"loss": 0.7477,
"step": 1070
},
{
"epoch": 0.25878671160327393,
"grad_norm": 1.421875,
"learning_rate": 0.00022387511375068425,
"loss": 0.7773,
"step": 1075
},
{
"epoch": 0.2599903707270101,
"grad_norm": 1.3984375,
"learning_rate": 0.0002238471351486724,
"loss": 0.8072,
"step": 1080
},
{
"epoch": 0.26119402985074625,
"grad_norm": 1.5703125,
"learning_rate": 0.00022381902536855957,
"loss": 0.7145,
"step": 1085
},
{
"epoch": 0.2623976889744824,
"grad_norm": 1.3515625,
"learning_rate": 0.00022379078445525807,
"loss": 0.7482,
"step": 1090
},
{
"epoch": 0.26360134809821856,
"grad_norm": 1.5390625,
"learning_rate": 0.0002237624124538896,
"loss": 0.7366,
"step": 1095
},
{
"epoch": 0.26480500722195477,
"grad_norm": 1.3984375,
"learning_rate": 0.00022373390940978537,
"loss": 0.7482,
"step": 1100
},
{
"epoch": 0.2660086663456909,
"grad_norm": 1.4296875,
"learning_rate": 0.00022370527536848592,
"loss": 0.7286,
"step": 1105
},
{
"epoch": 0.2672123254694271,
"grad_norm": 1.4296875,
"learning_rate": 0.00022367651037574106,
"loss": 0.6981,
"step": 1110
},
{
"epoch": 0.26841598459316324,
"grad_norm": 1.6953125,
"learning_rate": 0.00022364761447751002,
"loss": 0.758,
"step": 1115
},
{
"epoch": 0.2696196437168994,
"grad_norm": 1.4765625,
"learning_rate": 0.00022361858771996086,
"loss": 0.711,
"step": 1120
},
{
"epoch": 0.27082330284063555,
"grad_norm": 1.453125,
"learning_rate": 0.00022358943014947098,
"loss": 0.7665,
"step": 1125
},
{
"epoch": 0.2720269619643717,
"grad_norm": 1.5078125,
"learning_rate": 0.00022356014181262673,
"loss": 0.7477,
"step": 1130
},
{
"epoch": 0.27323062108810786,
"grad_norm": 1.5234375,
"learning_rate": 0.00022353072275622333,
"loss": 0.774,
"step": 1135
},
{
"epoch": 0.274434280211844,
"grad_norm": 1.4453125,
"learning_rate": 0.00022350117302726488,
"loss": 0.7503,
"step": 1140
},
{
"epoch": 0.2756379393355802,
"grad_norm": 1.5546875,
"learning_rate": 0.00022347149267296432,
"loss": 0.7568,
"step": 1145
},
{
"epoch": 0.27684159845931633,
"grad_norm": 1.578125,
"learning_rate": 0.00022344168174074318,
"loss": 0.7762,
"step": 1150
},
{
"epoch": 0.2780452575830525,
"grad_norm": 1.5234375,
"learning_rate": 0.00022341174027823172,
"loss": 0.7479,
"step": 1155
},
{
"epoch": 0.27924891670678864,
"grad_norm": 1.4765625,
"learning_rate": 0.00022338166833326875,
"loss": 0.7696,
"step": 1160
},
{
"epoch": 0.2804525758305248,
"grad_norm": 1.3671875,
"learning_rate": 0.0002233514659539015,
"loss": 0.7076,
"step": 1165
},
{
"epoch": 0.28165623495426095,
"grad_norm": 1.5234375,
"learning_rate": 0.00022332113318838563,
"loss": 0.7393,
"step": 1170
},
{
"epoch": 0.2828598940779971,
"grad_norm": 1.3046875,
"learning_rate": 0.0002232906700851851,
"loss": 0.7392,
"step": 1175
},
{
"epoch": 0.28406355320173327,
"grad_norm": 1.390625,
"learning_rate": 0.0002232600766929722,
"loss": 0.7585,
"step": 1180
},
{
"epoch": 0.2852672123254694,
"grad_norm": 1.421875,
"learning_rate": 0.00022322935306062726,
"loss": 0.7381,
"step": 1185
},
{
"epoch": 0.2864708714492056,
"grad_norm": 1.6328125,
"learning_rate": 0.00022319849923723884,
"loss": 0.766,
"step": 1190
},
{
"epoch": 0.28767453057294173,
"grad_norm": 1.40625,
"learning_rate": 0.0002231675152721034,
"loss": 0.7428,
"step": 1195
},
{
"epoch": 0.2888781896966779,
"grad_norm": 1.4609375,
"learning_rate": 0.00022313640121472532,
"loss": 0.7461,
"step": 1200
},
{
"epoch": 0.29008184882041405,
"grad_norm": 1.25,
"learning_rate": 0.00022310515711481698,
"loss": 0.711,
"step": 1205
},
{
"epoch": 0.2912855079441502,
"grad_norm": 1.515625,
"learning_rate": 0.0002230737830222984,
"loss": 0.7463,
"step": 1210
},
{
"epoch": 0.29248916706788636,
"grad_norm": 1.375,
"learning_rate": 0.00022304227898729739,
"loss": 0.7507,
"step": 1215
},
{
"epoch": 0.2936928261916225,
"grad_norm": 1.375,
"learning_rate": 0.00022301064506014922,
"loss": 0.7481,
"step": 1220
},
{
"epoch": 0.29489648531535867,
"grad_norm": 1.375,
"learning_rate": 0.00022297888129139685,
"loss": 0.7347,
"step": 1225
},
{
"epoch": 0.2961001444390948,
"grad_norm": 1.421875,
"learning_rate": 0.00022294698773179066,
"loss": 0.7457,
"step": 1230
},
{
"epoch": 0.297303803562831,
"grad_norm": 1.4765625,
"learning_rate": 0.00022291496443228834,
"loss": 0.7161,
"step": 1235
},
{
"epoch": 0.29850746268656714,
"grad_norm": 1.5703125,
"learning_rate": 0.000222882811444055,
"loss": 0.7232,
"step": 1240
},
{
"epoch": 0.29971112181030335,
"grad_norm": 1.359375,
"learning_rate": 0.00022285052881846276,
"loss": 0.7334,
"step": 1245
},
{
"epoch": 0.3009147809340395,
"grad_norm": 1.3828125,
"learning_rate": 0.00022281811660709101,
"loss": 0.7327,
"step": 1250
},
{
"epoch": 0.30211844005777566,
"grad_norm": 1.4765625,
"learning_rate": 0.0002227855748617262,
"loss": 0.7254,
"step": 1255
},
{
"epoch": 0.3033220991815118,
"grad_norm": 1.296875,
"learning_rate": 0.00022275290363436167,
"loss": 0.734,
"step": 1260
},
{
"epoch": 0.304525758305248,
"grad_norm": 1.40625,
"learning_rate": 0.00022272010297719766,
"loss": 0.7085,
"step": 1265
},
{
"epoch": 0.30572941742898413,
"grad_norm": 1.375,
"learning_rate": 0.00022268717294264122,
"loss": 0.7319,
"step": 1270
},
{
"epoch": 0.3069330765527203,
"grad_norm": 1.2421875,
"learning_rate": 0.0002226541135833061,
"loss": 0.7253,
"step": 1275
},
{
"epoch": 0.30813673567645644,
"grad_norm": 1.3828125,
"learning_rate": 0.0002226209249520127,
"loss": 0.7115,
"step": 1280
},
{
"epoch": 0.3093403948001926,
"grad_norm": 1.3515625,
"learning_rate": 0.0002225876071017879,
"loss": 0.7489,
"step": 1285
},
{
"epoch": 0.31054405392392875,
"grad_norm": 1.484375,
"learning_rate": 0.00022255416008586513,
"loss": 0.7314,
"step": 1290
},
{
"epoch": 0.3117477130476649,
"grad_norm": 1.3671875,
"learning_rate": 0.00022252058395768413,
"loss": 0.7498,
"step": 1295
},
{
"epoch": 0.31295137217140107,
"grad_norm": 1.40625,
"learning_rate": 0.00022248687877089092,
"loss": 0.7041,
"step": 1300
},
{
"epoch": 0.3141550312951372,
"grad_norm": 1.546875,
"learning_rate": 0.0002224530445793378,
"loss": 0.7143,
"step": 1305
},
{
"epoch": 0.3153586904188734,
"grad_norm": 1.640625,
"learning_rate": 0.00022241908143708308,
"loss": 0.7291,
"step": 1310
},
{
"epoch": 0.31656234954260953,
"grad_norm": 1.46875,
"learning_rate": 0.0002223849893983912,
"loss": 0.7553,
"step": 1315
},
{
"epoch": 0.3177660086663457,
"grad_norm": 1.46875,
"learning_rate": 0.00022235076851773248,
"loss": 0.7362,
"step": 1320
},
{
"epoch": 0.31896966779008185,
"grad_norm": 1.40625,
"learning_rate": 0.00022231641884978314,
"loss": 0.7415,
"step": 1325
},
{
"epoch": 0.320173326913818,
"grad_norm": 1.421875,
"learning_rate": 0.00022228194044942505,
"loss": 0.7206,
"step": 1330
},
{
"epoch": 0.32137698603755416,
"grad_norm": 1.5390625,
"learning_rate": 0.00022224733337174597,
"loss": 0.7212,
"step": 1335
},
{
"epoch": 0.3225806451612903,
"grad_norm": 1.296875,
"learning_rate": 0.0002222125976720391,
"loss": 0.6955,
"step": 1340
},
{
"epoch": 0.32378430428502647,
"grad_norm": 1.3828125,
"learning_rate": 0.00022217773340580315,
"loss": 0.6881,
"step": 1345
},
{
"epoch": 0.3249879634087626,
"grad_norm": 1.4765625,
"learning_rate": 0.00022214274062874232,
"loss": 0.6833,
"step": 1350
},
{
"epoch": 0.3261916225324988,
"grad_norm": 1.4453125,
"learning_rate": 0.00022210761939676606,
"loss": 0.7225,
"step": 1355
},
{
"epoch": 0.32739528165623494,
"grad_norm": 1.46875,
"learning_rate": 0.00022207236976598917,
"loss": 0.7155,
"step": 1360
},
{
"epoch": 0.3285989407799711,
"grad_norm": 1.3671875,
"learning_rate": 0.00022203699179273144,
"loss": 0.7572,
"step": 1365
},
{
"epoch": 0.32980259990370725,
"grad_norm": 1.3203125,
"learning_rate": 0.00022200148553351781,
"loss": 0.6694,
"step": 1370
},
{
"epoch": 0.3310062590274434,
"grad_norm": 1.3359375,
"learning_rate": 0.00022196585104507823,
"loss": 0.7324,
"step": 1375
},
{
"epoch": 0.33220991815117956,
"grad_norm": 1.3671875,
"learning_rate": 0.00022193008838434746,
"loss": 0.6888,
"step": 1380
},
{
"epoch": 0.3334135772749157,
"grad_norm": 1.1953125,
"learning_rate": 0.00022189419760846503,
"loss": 0.6846,
"step": 1385
},
{
"epoch": 0.33461723639865193,
"grad_norm": 1.3828125,
"learning_rate": 0.00022185817877477525,
"loss": 0.7213,
"step": 1390
},
{
"epoch": 0.3358208955223881,
"grad_norm": 1.3203125,
"learning_rate": 0.00022182203194082693,
"loss": 0.7268,
"step": 1395
},
{
"epoch": 0.33702455464612424,
"grad_norm": 1.28125,
"learning_rate": 0.0002217857571643735,
"loss": 0.7412,
"step": 1400
},
{
"epoch": 0.3382282137698604,
"grad_norm": 1.421875,
"learning_rate": 0.0002217493545033727,
"loss": 0.6811,
"step": 1405
},
{
"epoch": 0.33943187289359655,
"grad_norm": 1.2421875,
"learning_rate": 0.0002217128240159867,
"loss": 0.7167,
"step": 1410
},
{
"epoch": 0.3406355320173327,
"grad_norm": 1.3203125,
"learning_rate": 0.00022167616576058183,
"loss": 0.72,
"step": 1415
},
{
"epoch": 0.34183919114106887,
"grad_norm": 1.5546875,
"learning_rate": 0.00022163937979572857,
"loss": 0.735,
"step": 1420
},
{
"epoch": 0.343042850264805,
"grad_norm": 1.3671875,
"learning_rate": 0.00022160246618020145,
"loss": 0.7331,
"step": 1425
},
{
"epoch": 0.3442465093885412,
"grad_norm": 1.4765625,
"learning_rate": 0.000221565424972979,
"loss": 0.7071,
"step": 1430
},
{
"epoch": 0.34545016851227733,
"grad_norm": 1.359375,
"learning_rate": 0.0002215282562332436,
"loss": 0.7253,
"step": 1435
},
{
"epoch": 0.3466538276360135,
"grad_norm": 1.328125,
"learning_rate": 0.00022149096002038133,
"loss": 0.7084,
"step": 1440
},
{
"epoch": 0.34785748675974965,
"grad_norm": 1.3203125,
"learning_rate": 0.00022145353639398197,
"loss": 0.6772,
"step": 1445
},
{
"epoch": 0.3490611458834858,
"grad_norm": 1.3671875,
"learning_rate": 0.00022141598541383889,
"loss": 0.6878,
"step": 1450
},
{
"epoch": 0.35026480500722196,
"grad_norm": 1.2890625,
"learning_rate": 0.0002213783071399489,
"loss": 0.6812,
"step": 1455
},
{
"epoch": 0.3514684641309581,
"grad_norm": 1.3984375,
"learning_rate": 0.0002213405016325123,
"loss": 0.6928,
"step": 1460
},
{
"epoch": 0.35267212325469427,
"grad_norm": 1.34375,
"learning_rate": 0.00022130256895193254,
"loss": 0.713,
"step": 1465
},
{
"epoch": 0.3538757823784304,
"grad_norm": 1.34375,
"learning_rate": 0.0002212645091588163,
"loss": 0.7452,
"step": 1470
},
{
"epoch": 0.3550794415021666,
"grad_norm": 1.2890625,
"learning_rate": 0.00022122632231397346,
"loss": 0.7227,
"step": 1475
},
{
"epoch": 0.35628310062590274,
"grad_norm": 1.328125,
"learning_rate": 0.0002211880084784167,
"loss": 0.7189,
"step": 1480
},
{
"epoch": 0.3574867597496389,
"grad_norm": 1.421875,
"learning_rate": 0.00022114956771336177,
"loss": 0.6905,
"step": 1485
},
{
"epoch": 0.35869041887337505,
"grad_norm": 1.4296875,
"learning_rate": 0.0002211110000802272,
"loss": 0.6784,
"step": 1490
},
{
"epoch": 0.3598940779971112,
"grad_norm": 1.3125,
"learning_rate": 0.00022107230564063409,
"loss": 0.6913,
"step": 1495
},
{
"epoch": 0.36109773712084736,
"grad_norm": 1.5390625,
"learning_rate": 0.00022103348445640626,
"loss": 0.6823,
"step": 1500
},
{
"epoch": 0.36109773712084736,
"eval_loss": 0.6051262021064758,
"eval_runtime": 2.3791,
"eval_samples_per_second": 84.065,
"eval_steps_per_second": 84.065,
"step": 1500
},
{
"epoch": 0.3623013962445835,
"grad_norm": 1.3984375,
"learning_rate": 0.00022099453658957005,
"loss": 0.7171,
"step": 1505
},
{
"epoch": 0.3635050553683197,
"grad_norm": 1.3515625,
"learning_rate": 0.00022095546210235416,
"loss": 0.6987,
"step": 1510
},
{
"epoch": 0.36470871449205583,
"grad_norm": 1.4453125,
"learning_rate": 0.00022091626105718955,
"loss": 0.7139,
"step": 1515
},
{
"epoch": 0.365912373615792,
"grad_norm": 1.2734375,
"learning_rate": 0.0002208769335167095,
"loss": 0.6801,
"step": 1520
},
{
"epoch": 0.36711603273952814,
"grad_norm": 1.3125,
"learning_rate": 0.0002208374795437493,
"loss": 0.6718,
"step": 1525
},
{
"epoch": 0.3683196918632643,
"grad_norm": 1.4453125,
"learning_rate": 0.0002207978992013463,
"loss": 0.7374,
"step": 1530
},
{
"epoch": 0.36952335098700045,
"grad_norm": 1.4453125,
"learning_rate": 0.00022075819255273977,
"loss": 0.7337,
"step": 1535
},
{
"epoch": 0.37072701011073667,
"grad_norm": 1.3984375,
"learning_rate": 0.00022071835966137068,
"loss": 0.7181,
"step": 1540
},
{
"epoch": 0.3719306692344728,
"grad_norm": 1.3203125,
"learning_rate": 0.00022067840059088187,
"loss": 0.6915,
"step": 1545
},
{
"epoch": 0.373134328358209,
"grad_norm": 1.3671875,
"learning_rate": 0.0002206383154051176,
"loss": 0.6925,
"step": 1550
},
{
"epoch": 0.37433798748194513,
"grad_norm": 1.3828125,
"learning_rate": 0.00022059810416812377,
"loss": 0.7125,
"step": 1555
},
{
"epoch": 0.3755416466056813,
"grad_norm": 1.421875,
"learning_rate": 0.00022055776694414767,
"loss": 0.7095,
"step": 1560
},
{
"epoch": 0.37674530572941745,
"grad_norm": 1.359375,
"learning_rate": 0.00022051730379763778,
"loss": 0.6915,
"step": 1565
},
{
"epoch": 0.3779489648531536,
"grad_norm": 1.3046875,
"learning_rate": 0.00022047671479324385,
"loss": 0.7315,
"step": 1570
},
{
"epoch": 0.37915262397688976,
"grad_norm": 1.3125,
"learning_rate": 0.00022043599999581673,
"loss": 0.7121,
"step": 1575
},
{
"epoch": 0.3803562831006259,
"grad_norm": 1.3125,
"learning_rate": 0.00022039515947040817,
"loss": 0.6884,
"step": 1580
},
{
"epoch": 0.38155994222436207,
"grad_norm": 1.2421875,
"learning_rate": 0.0002203541932822709,
"loss": 0.6939,
"step": 1585
},
{
"epoch": 0.3827636013480982,
"grad_norm": 1.375,
"learning_rate": 0.00022031310149685842,
"loss": 0.7078,
"step": 1590
},
{
"epoch": 0.3839672604718344,
"grad_norm": 1.375,
"learning_rate": 0.0002202718841798248,
"loss": 0.6957,
"step": 1595
},
{
"epoch": 0.38517091959557054,
"grad_norm": 1.3203125,
"learning_rate": 0.0002202305413970248,
"loss": 0.6991,
"step": 1600
},
{
"epoch": 0.3863745787193067,
"grad_norm": 1.2265625,
"learning_rate": 0.00022018907321451356,
"loss": 0.6963,
"step": 1605
},
{
"epoch": 0.38757823784304285,
"grad_norm": 1.40625,
"learning_rate": 0.0002201474796985466,
"loss": 0.6975,
"step": 1610
},
{
"epoch": 0.388781896966779,
"grad_norm": 1.3203125,
"learning_rate": 0.00022010576091557974,
"loss": 0.7146,
"step": 1615
},
{
"epoch": 0.38998555609051516,
"grad_norm": 1.25,
"learning_rate": 0.00022006391693226885,
"loss": 0.6847,
"step": 1620
},
{
"epoch": 0.3911892152142513,
"grad_norm": 1.375,
"learning_rate": 0.0002200219478154699,
"loss": 0.7524,
"step": 1625
},
{
"epoch": 0.3923928743379875,
"grad_norm": 1.25,
"learning_rate": 0.00021997985363223882,
"loss": 0.6945,
"step": 1630
},
{
"epoch": 0.39359653346172363,
"grad_norm": 1.328125,
"learning_rate": 0.00021993763444983126,
"loss": 0.706,
"step": 1635
},
{
"epoch": 0.3948001925854598,
"grad_norm": 1.46875,
"learning_rate": 0.0002198952903357027,
"loss": 0.7133,
"step": 1640
},
{
"epoch": 0.39600385170919594,
"grad_norm": 1.4140625,
"learning_rate": 0.00021985282135750817,
"loss": 0.6806,
"step": 1645
},
{
"epoch": 0.3972075108329321,
"grad_norm": 1.4453125,
"learning_rate": 0.00021981022758310216,
"loss": 0.6917,
"step": 1650
},
{
"epoch": 0.39841116995666825,
"grad_norm": 1.34375,
"learning_rate": 0.00021976750908053868,
"loss": 0.7009,
"step": 1655
},
{
"epoch": 0.3996148290804044,
"grad_norm": 1.265625,
"learning_rate": 0.0002197246659180709,
"loss": 0.7041,
"step": 1660
},
{
"epoch": 0.40081848820414057,
"grad_norm": 1.28125,
"learning_rate": 0.00021968169816415125,
"loss": 0.675,
"step": 1665
},
{
"epoch": 0.4020221473278767,
"grad_norm": 1.3671875,
"learning_rate": 0.00021963860588743113,
"loss": 0.6726,
"step": 1670
},
{
"epoch": 0.4032258064516129,
"grad_norm": 1.3671875,
"learning_rate": 0.000219595389156761,
"loss": 0.6908,
"step": 1675
},
{
"epoch": 0.40442946557534903,
"grad_norm": 1.3984375,
"learning_rate": 0.00021955204804119003,
"loss": 0.7241,
"step": 1680
},
{
"epoch": 0.40563312469908525,
"grad_norm": 1.328125,
"learning_rate": 0.00021950858260996633,
"loss": 0.6698,
"step": 1685
},
{
"epoch": 0.4068367838228214,
"grad_norm": 1.5703125,
"learning_rate": 0.00021946499293253646,
"loss": 0.7039,
"step": 1690
},
{
"epoch": 0.40804044294655756,
"grad_norm": 1.2578125,
"learning_rate": 0.00021942127907854556,
"loss": 0.6867,
"step": 1695
},
{
"epoch": 0.4092441020702937,
"grad_norm": 1.34375,
"learning_rate": 0.00021937744111783717,
"loss": 0.6789,
"step": 1700
},
{
"epoch": 0.41044776119402987,
"grad_norm": 1.4140625,
"learning_rate": 0.00021933347912045305,
"loss": 0.7098,
"step": 1705
},
{
"epoch": 0.411651420317766,
"grad_norm": 1.2578125,
"learning_rate": 0.00021928939315663331,
"loss": 0.6668,
"step": 1710
},
{
"epoch": 0.4128550794415022,
"grad_norm": 1.234375,
"learning_rate": 0.00021924518329681592,
"loss": 0.673,
"step": 1715
},
{
"epoch": 0.41405873856523834,
"grad_norm": 1.3125,
"learning_rate": 0.00021920084961163697,
"loss": 0.7019,
"step": 1720
},
{
"epoch": 0.4152623976889745,
"grad_norm": 1.25,
"learning_rate": 0.00021915639217193027,
"loss": 0.6921,
"step": 1725
},
{
"epoch": 0.41646605681271065,
"grad_norm": 1.2734375,
"learning_rate": 0.00021911181104872747,
"loss": 0.6582,
"step": 1730
},
{
"epoch": 0.4176697159364468,
"grad_norm": 1.4140625,
"learning_rate": 0.00021906710631325774,
"loss": 0.6907,
"step": 1735
},
{
"epoch": 0.41887337506018296,
"grad_norm": 1.3984375,
"learning_rate": 0.00021902227803694774,
"loss": 0.6837,
"step": 1740
},
{
"epoch": 0.4200770341839191,
"grad_norm": 1.3046875,
"learning_rate": 0.00021897732629142167,
"loss": 0.6737,
"step": 1745
},
{
"epoch": 0.4212806933076553,
"grad_norm": 1.265625,
"learning_rate": 0.00021893225114850086,
"loss": 0.6939,
"step": 1750
},
{
"epoch": 0.42248435243139143,
"grad_norm": 1.265625,
"learning_rate": 0.00021888705268020378,
"loss": 0.6971,
"step": 1755
},
{
"epoch": 0.4236880115551276,
"grad_norm": 1.40625,
"learning_rate": 0.00021884173095874603,
"loss": 0.6749,
"step": 1760
},
{
"epoch": 0.42489167067886374,
"grad_norm": 1.3125,
"learning_rate": 0.0002187962860565401,
"loss": 0.6916,
"step": 1765
},
{
"epoch": 0.4260953298025999,
"grad_norm": 1.3828125,
"learning_rate": 0.00021875071804619534,
"loss": 0.675,
"step": 1770
},
{
"epoch": 0.42729898892633605,
"grad_norm": 1.359375,
"learning_rate": 0.00021870502700051765,
"loss": 0.6473,
"step": 1775
},
{
"epoch": 0.4285026480500722,
"grad_norm": 1.234375,
"learning_rate": 0.0002186592129925097,
"loss": 0.6831,
"step": 1780
},
{
"epoch": 0.42970630717380837,
"grad_norm": 1.1953125,
"learning_rate": 0.0002186132760953705,
"loss": 0.7097,
"step": 1785
},
{
"epoch": 0.4309099662975445,
"grad_norm": 1.25,
"learning_rate": 0.00021856721638249541,
"loss": 0.7204,
"step": 1790
},
{
"epoch": 0.4321136254212807,
"grad_norm": 1.40625,
"learning_rate": 0.0002185210339274761,
"loss": 0.6994,
"step": 1795
},
{
"epoch": 0.43331728454501683,
"grad_norm": 1.265625,
"learning_rate": 0.0002184747288041002,
"loss": 0.69,
"step": 1800
},
{
"epoch": 0.434520943668753,
"grad_norm": 1.25,
"learning_rate": 0.00021842830108635155,
"loss": 0.7252,
"step": 1805
},
{
"epoch": 0.43572460279248915,
"grad_norm": 1.1328125,
"learning_rate": 0.00021838175084840962,
"loss": 0.6817,
"step": 1810
},
{
"epoch": 0.4369282619162253,
"grad_norm": 1.3828125,
"learning_rate": 0.00021833507816464986,
"loss": 0.6627,
"step": 1815
},
{
"epoch": 0.43813192103996146,
"grad_norm": 1.453125,
"learning_rate": 0.00021828828310964317,
"loss": 0.6695,
"step": 1820
},
{
"epoch": 0.4393355801636976,
"grad_norm": 1.265625,
"learning_rate": 0.00021824136575815612,
"loss": 0.6805,
"step": 1825
},
{
"epoch": 0.4405392392874338,
"grad_norm": 1.2734375,
"learning_rate": 0.00021819432618515054,
"loss": 0.6649,
"step": 1830
},
{
"epoch": 0.44174289841117,
"grad_norm": 1.2890625,
"learning_rate": 0.00021814716446578368,
"loss": 0.6936,
"step": 1835
},
{
"epoch": 0.44294655753490614,
"grad_norm": 1.3125,
"learning_rate": 0.00021809988067540787,
"loss": 0.7111,
"step": 1840
},
{
"epoch": 0.4441502166586423,
"grad_norm": 1.25,
"learning_rate": 0.00021805247488957042,
"loss": 0.6651,
"step": 1845
},
{
"epoch": 0.44535387578237845,
"grad_norm": 1.234375,
"learning_rate": 0.00021800494718401367,
"loss": 0.6938,
"step": 1850
},
{
"epoch": 0.4465575349061146,
"grad_norm": 1.2109375,
"learning_rate": 0.00021795729763467473,
"loss": 0.6769,
"step": 1855
},
{
"epoch": 0.44776119402985076,
"grad_norm": 1.203125,
"learning_rate": 0.0002179095263176853,
"loss": 0.6861,
"step": 1860
},
{
"epoch": 0.4489648531535869,
"grad_norm": 1.234375,
"learning_rate": 0.00021786163330937176,
"loss": 0.6313,
"step": 1865
},
{
"epoch": 0.4501685122773231,
"grad_norm": 1.4453125,
"learning_rate": 0.00021781361868625484,
"loss": 0.6947,
"step": 1870
},
{
"epoch": 0.45137217140105923,
"grad_norm": 1.28125,
"learning_rate": 0.00021776548252504957,
"loss": 0.6837,
"step": 1875
},
{
"epoch": 0.4525758305247954,
"grad_norm": 1.2578125,
"learning_rate": 0.00021771722490266526,
"loss": 0.6743,
"step": 1880
},
{
"epoch": 0.45377948964853154,
"grad_norm": 1.3671875,
"learning_rate": 0.00021766884589620518,
"loss": 0.6883,
"step": 1885
},
{
"epoch": 0.4549831487722677,
"grad_norm": 1.3359375,
"learning_rate": 0.00021762034558296656,
"loss": 0.6427,
"step": 1890
},
{
"epoch": 0.45618680789600385,
"grad_norm": 1.3515625,
"learning_rate": 0.00021757172404044049,
"loss": 0.6481,
"step": 1895
},
{
"epoch": 0.45739046701974,
"grad_norm": 1.2890625,
"learning_rate": 0.00021752298134631174,
"loss": 0.7225,
"step": 1900
},
{
"epoch": 0.45859412614347617,
"grad_norm": 1.2578125,
"learning_rate": 0.0002174741175784586,
"loss": 0.689,
"step": 1905
},
{
"epoch": 0.4597977852672123,
"grad_norm": 1.3125,
"learning_rate": 0.00021742513281495292,
"loss": 0.6866,
"step": 1910
},
{
"epoch": 0.4610014443909485,
"grad_norm": 1.2265625,
"learning_rate": 0.00021737602713405976,
"loss": 0.6759,
"step": 1915
},
{
"epoch": 0.46220510351468463,
"grad_norm": 1.375,
"learning_rate": 0.00021732680061423734,
"loss": 0.6743,
"step": 1920
},
{
"epoch": 0.4634087626384208,
"grad_norm": 1.234375,
"learning_rate": 0.00021727745333413712,
"loss": 0.711,
"step": 1925
},
{
"epoch": 0.46461242176215695,
"grad_norm": 1.4609375,
"learning_rate": 0.00021722798537260335,
"loss": 0.6761,
"step": 1930
},
{
"epoch": 0.4658160808858931,
"grad_norm": 1.2734375,
"learning_rate": 0.00021717839680867316,
"loss": 0.6692,
"step": 1935
},
{
"epoch": 0.46701974000962926,
"grad_norm": 1.1953125,
"learning_rate": 0.00021712868772157638,
"loss": 0.6692,
"step": 1940
},
{
"epoch": 0.4682233991333654,
"grad_norm": 1.265625,
"learning_rate": 0.00021707885819073535,
"loss": 0.6688,
"step": 1945
},
{
"epoch": 0.46942705825710157,
"grad_norm": 1.2421875,
"learning_rate": 0.00021702890829576493,
"loss": 0.6461,
"step": 1950
},
{
"epoch": 0.4706307173808377,
"grad_norm": 1.2265625,
"learning_rate": 0.00021697883811647224,
"loss": 0.6528,
"step": 1955
},
{
"epoch": 0.4718343765045739,
"grad_norm": 1.203125,
"learning_rate": 0.00021692864773285655,
"loss": 0.6357,
"step": 1960
},
{
"epoch": 0.47303803562831004,
"grad_norm": 1.3828125,
"learning_rate": 0.0002168783372251093,
"loss": 0.6863,
"step": 1965
},
{
"epoch": 0.4742416947520462,
"grad_norm": 1.21875,
"learning_rate": 0.0002168279066736137,
"loss": 0.6932,
"step": 1970
},
{
"epoch": 0.4754453538757824,
"grad_norm": 1.265625,
"learning_rate": 0.00021677735615894487,
"loss": 0.6815,
"step": 1975
},
{
"epoch": 0.47664901299951856,
"grad_norm": 1.3828125,
"learning_rate": 0.0002167266857618696,
"loss": 0.673,
"step": 1980
},
{
"epoch": 0.4778526721232547,
"grad_norm": 1.2578125,
"learning_rate": 0.00021667589556334621,
"loss": 0.6475,
"step": 1985
},
{
"epoch": 0.4790563312469909,
"grad_norm": 1.25,
"learning_rate": 0.00021662498564452436,
"loss": 0.6617,
"step": 1990
},
{
"epoch": 0.48025999037072703,
"grad_norm": 1.4609375,
"learning_rate": 0.0002165739560867451,
"loss": 0.646,
"step": 1995
},
{
"epoch": 0.4814636494944632,
"grad_norm": 1.1796875,
"learning_rate": 0.00021652280697154056,
"loss": 0.6383,
"step": 2000
},
{
"epoch": 0.4814636494944632,
"eval_loss": 0.5746923089027405,
"eval_runtime": 2.3802,
"eval_samples_per_second": 84.028,
"eval_steps_per_second": 84.028,
"step": 2000
},
{
"epoch": 0.48266730861819934,
"grad_norm": 1.3828125,
"learning_rate": 0.00021647153838063392,
"loss": 0.6696,
"step": 2005
},
{
"epoch": 0.4838709677419355,
"grad_norm": 1.25,
"learning_rate": 0.0002164201503959392,
"loss": 0.6469,
"step": 2010
},
{
"epoch": 0.48507462686567165,
"grad_norm": 1.265625,
"learning_rate": 0.0002163686430995613,
"loss": 0.6751,
"step": 2015
},
{
"epoch": 0.4862782859894078,
"grad_norm": 1.2578125,
"learning_rate": 0.00021631701657379564,
"loss": 0.656,
"step": 2020
},
{
"epoch": 0.48748194511314397,
"grad_norm": 1.234375,
"learning_rate": 0.00021626527090112815,
"loss": 0.6699,
"step": 2025
},
{
"epoch": 0.4886856042368801,
"grad_norm": 1.296875,
"learning_rate": 0.0002162134061642352,
"loss": 0.6329,
"step": 2030
},
{
"epoch": 0.4898892633606163,
"grad_norm": 1.359375,
"learning_rate": 0.00021616142244598328,
"loss": 0.6946,
"step": 2035
},
{
"epoch": 0.49109292248435243,
"grad_norm": 1.421875,
"learning_rate": 0.0002161093198294291,
"loss": 0.6635,
"step": 2040
},
{
"epoch": 0.4922965816080886,
"grad_norm": 1.265625,
"learning_rate": 0.00021605709839781932,
"loss": 0.6447,
"step": 2045
},
{
"epoch": 0.49350024073182475,
"grad_norm": 1.28125,
"learning_rate": 0.0002160047582345903,
"loss": 0.6768,
"step": 2050
},
{
"epoch": 0.4947038998555609,
"grad_norm": 1.2578125,
"learning_rate": 0.00021595229942336826,
"loss": 0.6497,
"step": 2055
},
{
"epoch": 0.49590755897929706,
"grad_norm": 1.2578125,
"learning_rate": 0.00021589972204796891,
"loss": 0.6347,
"step": 2060
},
{
"epoch": 0.4971112181030332,
"grad_norm": 1.203125,
"learning_rate": 0.00021584702619239748,
"loss": 0.6583,
"step": 2065
},
{
"epoch": 0.49831487722676937,
"grad_norm": 1.234375,
"learning_rate": 0.00021579421194084836,
"loss": 0.6702,
"step": 2070
},
{
"epoch": 0.4995185363505055,
"grad_norm": 1.28125,
"learning_rate": 0.00021574127937770522,
"loss": 0.636,
"step": 2075
},
{
"epoch": 0.5007221954742417,
"grad_norm": 1.4453125,
"learning_rate": 0.00021568822858754073,
"loss": 0.6476,
"step": 2080
},
{
"epoch": 0.5019258545979779,
"grad_norm": 1.328125,
"learning_rate": 0.00021563505965511642,
"loss": 0.6897,
"step": 2085
},
{
"epoch": 0.503129513721714,
"grad_norm": 1.171875,
"learning_rate": 0.00021558177266538267,
"loss": 0.6855,
"step": 2090
},
{
"epoch": 0.5043331728454502,
"grad_norm": 1.125,
"learning_rate": 0.00021552836770347836,
"loss": 0.6295,
"step": 2095
},
{
"epoch": 0.5055368319691863,
"grad_norm": 1.234375,
"learning_rate": 0.00021547484485473102,
"loss": 0.6907,
"step": 2100
},
{
"epoch": 0.5067404910929225,
"grad_norm": 1.21875,
"learning_rate": 0.00021542120420465637,
"loss": 0.6367,
"step": 2105
},
{
"epoch": 0.5079441502166586,
"grad_norm": 1.28125,
"learning_rate": 0.00021536744583895842,
"loss": 0.6674,
"step": 2110
},
{
"epoch": 0.5091478093403948,
"grad_norm": 1.3984375,
"learning_rate": 0.0002153135698435293,
"loss": 0.6758,
"step": 2115
},
{
"epoch": 0.5103514684641309,
"grad_norm": 1.296875,
"learning_rate": 0.00021525957630444902,
"loss": 0.6559,
"step": 2120
},
{
"epoch": 0.5115551275878671,
"grad_norm": 1.2578125,
"learning_rate": 0.00021520546530798536,
"loss": 0.693,
"step": 2125
},
{
"epoch": 0.5127587867116032,
"grad_norm": 1.1953125,
"learning_rate": 0.0002151512369405939,
"loss": 0.6042,
"step": 2130
},
{
"epoch": 0.5139624458353395,
"grad_norm": 1.1328125,
"learning_rate": 0.00021509689128891763,
"loss": 0.6254,
"step": 2135
},
{
"epoch": 0.5151661049590756,
"grad_norm": 1.1796875,
"learning_rate": 0.00021504242843978696,
"loss": 0.6867,
"step": 2140
},
{
"epoch": 0.5163697640828118,
"grad_norm": 1.2734375,
"learning_rate": 0.00021498784848021963,
"loss": 0.6537,
"step": 2145
},
{
"epoch": 0.5175734232065479,
"grad_norm": 1.234375,
"learning_rate": 0.00021493315149742035,
"loss": 0.6447,
"step": 2150
},
{
"epoch": 0.5187770823302841,
"grad_norm": 1.3515625,
"learning_rate": 0.0002148783375787809,
"loss": 0.6665,
"step": 2155
},
{
"epoch": 0.5199807414540202,
"grad_norm": 1.1796875,
"learning_rate": 0.00021482340681187984,
"loss": 0.6511,
"step": 2160
},
{
"epoch": 0.5211844005777564,
"grad_norm": 1.34375,
"learning_rate": 0.00021476835928448254,
"loss": 0.6739,
"step": 2165
},
{
"epoch": 0.5223880597014925,
"grad_norm": 1.3125,
"learning_rate": 0.00021471319508454073,
"loss": 0.6801,
"step": 2170
},
{
"epoch": 0.5235917188252287,
"grad_norm": 1.2265625,
"learning_rate": 0.00021465791430019273,
"loss": 0.649,
"step": 2175
},
{
"epoch": 0.5247953779489648,
"grad_norm": 1.1328125,
"learning_rate": 0.00021460251701976306,
"loss": 0.6565,
"step": 2180
},
{
"epoch": 0.525999037072701,
"grad_norm": 1.1796875,
"learning_rate": 0.00021454700333176232,
"loss": 0.6726,
"step": 2185
},
{
"epoch": 0.5272026961964371,
"grad_norm": 1.203125,
"learning_rate": 0.00021449137332488723,
"loss": 0.653,
"step": 2190
},
{
"epoch": 0.5284063553201733,
"grad_norm": 1.2265625,
"learning_rate": 0.00021443562708802023,
"loss": 0.6582,
"step": 2195
},
{
"epoch": 0.5296100144439095,
"grad_norm": 1.1484375,
"learning_rate": 0.00021437976471022952,
"loss": 0.6571,
"step": 2200
},
{
"epoch": 0.5308136735676456,
"grad_norm": 1.2734375,
"learning_rate": 0.00021432378628076883,
"loss": 0.6659,
"step": 2205
},
{
"epoch": 0.5320173326913819,
"grad_norm": 1.265625,
"learning_rate": 0.00021426769188907742,
"loss": 0.6504,
"step": 2210
},
{
"epoch": 0.533220991815118,
"grad_norm": 1.109375,
"learning_rate": 0.00021421148162477965,
"loss": 0.6393,
"step": 2215
},
{
"epoch": 0.5344246509388542,
"grad_norm": 1.3046875,
"learning_rate": 0.0002141551555776852,
"loss": 0.6687,
"step": 2220
},
{
"epoch": 0.5356283100625903,
"grad_norm": 1.3046875,
"learning_rate": 0.00021409871383778865,
"loss": 0.6692,
"step": 2225
},
{
"epoch": 0.5368319691863265,
"grad_norm": 1.3984375,
"learning_rate": 0.00021404215649526936,
"loss": 0.6524,
"step": 2230
},
{
"epoch": 0.5380356283100626,
"grad_norm": 1.0859375,
"learning_rate": 0.0002139854836404915,
"loss": 0.6812,
"step": 2235
},
{
"epoch": 0.5392392874337988,
"grad_norm": 1.25,
"learning_rate": 0.0002139286953640038,
"loss": 0.6381,
"step": 2240
},
{
"epoch": 0.5404429465575349,
"grad_norm": 1.171875,
"learning_rate": 0.00021387179175653932,
"loss": 0.6285,
"step": 2245
},
{
"epoch": 0.5416466056812711,
"grad_norm": 1.1484375,
"learning_rate": 0.00021381477290901546,
"loss": 0.6201,
"step": 2250
},
{
"epoch": 0.5428502648050072,
"grad_norm": 1.2734375,
"learning_rate": 0.00021375763891253369,
"loss": 0.6546,
"step": 2255
},
{
"epoch": 0.5440539239287434,
"grad_norm": 1.265625,
"learning_rate": 0.0002137003898583795,
"loss": 0.6643,
"step": 2260
},
{
"epoch": 0.5452575830524795,
"grad_norm": 1.3671875,
"learning_rate": 0.00021364302583802227,
"loss": 0.6825,
"step": 2265
},
{
"epoch": 0.5464612421762157,
"grad_norm": 1.1171875,
"learning_rate": 0.00021358554694311493,
"loss": 0.6575,
"step": 2270
},
{
"epoch": 0.5476649012999518,
"grad_norm": 1.2890625,
"learning_rate": 0.00021352795326549405,
"loss": 0.6442,
"step": 2275
},
{
"epoch": 0.548868560423688,
"grad_norm": 1.296875,
"learning_rate": 0.00021347024489717952,
"loss": 0.6766,
"step": 2280
},
{
"epoch": 0.5500722195474241,
"grad_norm": 1.25,
"learning_rate": 0.00021341242193037455,
"loss": 0.6524,
"step": 2285
},
{
"epoch": 0.5512758786711603,
"grad_norm": 1.21875,
"learning_rate": 0.00021335448445746543,
"loss": 0.65,
"step": 2290
},
{
"epoch": 0.5524795377948964,
"grad_norm": 1.1640625,
"learning_rate": 0.00021329643257102137,
"loss": 0.6366,
"step": 2295
},
{
"epoch": 0.5536831969186327,
"grad_norm": 1.2109375,
"learning_rate": 0.00021323826636379445,
"loss": 0.6531,
"step": 2300
},
{
"epoch": 0.5548868560423688,
"grad_norm": 1.265625,
"learning_rate": 0.00021317998592871925,
"loss": 0.6635,
"step": 2305
},
{
"epoch": 0.556090515166105,
"grad_norm": 1.359375,
"learning_rate": 0.00021312159135891305,
"loss": 0.6369,
"step": 2310
},
{
"epoch": 0.5572941742898411,
"grad_norm": 1.2734375,
"learning_rate": 0.00021306308274767537,
"loss": 0.6303,
"step": 2315
},
{
"epoch": 0.5584978334135773,
"grad_norm": 1.2265625,
"learning_rate": 0.00021300446018848802,
"loss": 0.6758,
"step": 2320
},
{
"epoch": 0.5597014925373134,
"grad_norm": 1.2421875,
"learning_rate": 0.00021294572377501478,
"loss": 0.6296,
"step": 2325
},
{
"epoch": 0.5609051516610496,
"grad_norm": 1.203125,
"learning_rate": 0.00021288687360110137,
"loss": 0.6405,
"step": 2330
},
{
"epoch": 0.5621088107847857,
"grad_norm": 1.1484375,
"learning_rate": 0.0002128279097607753,
"loss": 0.6103,
"step": 2335
},
{
"epoch": 0.5633124699085219,
"grad_norm": 1.2109375,
"learning_rate": 0.0002127688323482457,
"loss": 0.666,
"step": 2340
},
{
"epoch": 0.5645161290322581,
"grad_norm": 1.15625,
"learning_rate": 0.00021270964145790307,
"loss": 0.6503,
"step": 2345
},
{
"epoch": 0.5657197881559942,
"grad_norm": 1.296875,
"learning_rate": 0.00021265033718431933,
"loss": 0.6518,
"step": 2350
},
{
"epoch": 0.5669234472797304,
"grad_norm": 1.2890625,
"learning_rate": 0.0002125909196222475,
"loss": 0.6189,
"step": 2355
},
{
"epoch": 0.5681271064034665,
"grad_norm": 1.1640625,
"learning_rate": 0.00021253138886662156,
"loss": 0.6235,
"step": 2360
},
{
"epoch": 0.5693307655272027,
"grad_norm": 1.1484375,
"learning_rate": 0.00021247174501255647,
"loss": 0.6191,
"step": 2365
},
{
"epoch": 0.5705344246509388,
"grad_norm": 1.234375,
"learning_rate": 0.00021241198815534777,
"loss": 0.6141,
"step": 2370
},
{
"epoch": 0.571738083774675,
"grad_norm": 1.3203125,
"learning_rate": 0.00021235211839047162,
"loss": 0.6335,
"step": 2375
},
{
"epoch": 0.5729417428984112,
"grad_norm": 1.1328125,
"learning_rate": 0.00021229213581358455,
"loss": 0.629,
"step": 2380
},
{
"epoch": 0.5741454020221474,
"grad_norm": 1.1640625,
"learning_rate": 0.00021223204052052332,
"loss": 0.6352,
"step": 2385
},
{
"epoch": 0.5753490611458835,
"grad_norm": 1.1640625,
"learning_rate": 0.00021217183260730486,
"loss": 0.6869,
"step": 2390
},
{
"epoch": 0.5765527202696197,
"grad_norm": 1.1015625,
"learning_rate": 0.00021211151217012593,
"loss": 0.6343,
"step": 2395
},
{
"epoch": 0.5777563793933558,
"grad_norm": 1.2421875,
"learning_rate": 0.00021205107930536316,
"loss": 0.6506,
"step": 2400
},
{
"epoch": 0.578960038517092,
"grad_norm": 1.140625,
"learning_rate": 0.00021199053410957274,
"loss": 0.6532,
"step": 2405
},
{
"epoch": 0.5801636976408281,
"grad_norm": 1.1640625,
"learning_rate": 0.0002119298766794904,
"loss": 0.6083,
"step": 2410
},
{
"epoch": 0.5813673567645643,
"grad_norm": 1.1953125,
"learning_rate": 0.00021186910711203116,
"loss": 0.612,
"step": 2415
},
{
"epoch": 0.5825710158883004,
"grad_norm": 1.21875,
"learning_rate": 0.00021180822550428917,
"loss": 0.666,
"step": 2420
},
{
"epoch": 0.5837746750120366,
"grad_norm": 1.109375,
"learning_rate": 0.00021174723195353768,
"loss": 0.6313,
"step": 2425
},
{
"epoch": 0.5849783341357727,
"grad_norm": 1.6953125,
"learning_rate": 0.00021168612655722872,
"loss": 0.6574,
"step": 2430
},
{
"epoch": 0.5861819932595089,
"grad_norm": 1.3125,
"learning_rate": 0.0002116249094129931,
"loss": 0.6311,
"step": 2435
},
{
"epoch": 0.587385652383245,
"grad_norm": 1.2109375,
"learning_rate": 0.00021156358061864006,
"loss": 0.6233,
"step": 2440
},
{
"epoch": 0.5885893115069812,
"grad_norm": 1.296875,
"learning_rate": 0.0002115021402721573,
"loss": 0.6412,
"step": 2445
},
{
"epoch": 0.5897929706307173,
"grad_norm": 1.1640625,
"learning_rate": 0.00021144058847171078,
"loss": 0.65,
"step": 2450
},
{
"epoch": 0.5909966297544536,
"grad_norm": 1.21875,
"learning_rate": 0.0002113789253156445,
"loss": 0.6345,
"step": 2455
},
{
"epoch": 0.5922002888781897,
"grad_norm": 1.125,
"learning_rate": 0.00021131715090248033,
"loss": 0.6247,
"step": 2460
},
{
"epoch": 0.5934039480019259,
"grad_norm": 1.390625,
"learning_rate": 0.00021125526533091797,
"loss": 0.64,
"step": 2465
},
{
"epoch": 0.594607607125662,
"grad_norm": 1.328125,
"learning_rate": 0.0002111932686998347,
"loss": 0.6463,
"step": 2470
},
{
"epoch": 0.5958112662493982,
"grad_norm": 1.25,
"learning_rate": 0.00021113116110828528,
"loss": 0.6334,
"step": 2475
},
{
"epoch": 0.5970149253731343,
"grad_norm": 1.3203125,
"learning_rate": 0.0002110689426555016,
"loss": 0.6584,
"step": 2480
},
{
"epoch": 0.5982185844968705,
"grad_norm": 1.2109375,
"learning_rate": 0.00021100661344089296,
"loss": 0.7124,
"step": 2485
},
{
"epoch": 0.5994222436206067,
"grad_norm": 1.171875,
"learning_rate": 0.00021094417356404534,
"loss": 0.6188,
"step": 2490
},
{
"epoch": 0.6006259027443428,
"grad_norm": 1.4140625,
"learning_rate": 0.00021088162312472172,
"loss": 0.6233,
"step": 2495
},
{
"epoch": 0.601829561868079,
"grad_norm": 1.140625,
"learning_rate": 0.00021081896222286168,
"loss": 0.6319,
"step": 2500
},
{
"epoch": 0.601829561868079,
"eval_loss": 0.535412073135376,
"eval_runtime": 2.3675,
"eval_samples_per_second": 84.477,
"eval_steps_per_second": 84.477,
"step": 2500
},
{
"epoch": 0.6030332209918151,
"grad_norm": 1.25,
"learning_rate": 0.0002107561909585812,
"loss": 0.613,
"step": 2505
},
{
"epoch": 0.6042368801155513,
"grad_norm": 1.09375,
"learning_rate": 0.00021069330943217275,
"loss": 0.642,
"step": 2510
},
{
"epoch": 0.6054405392392874,
"grad_norm": 1.25,
"learning_rate": 0.00021063031774410483,
"loss": 0.6138,
"step": 2515
},
{
"epoch": 0.6066441983630236,
"grad_norm": 1.1328125,
"learning_rate": 0.00021056721599502207,
"loss": 0.6437,
"step": 2520
},
{
"epoch": 0.6078478574867597,
"grad_norm": 1.2109375,
"learning_rate": 0.00021050400428574483,
"loss": 0.6455,
"step": 2525
},
{
"epoch": 0.609051516610496,
"grad_norm": 1.2265625,
"learning_rate": 0.00021044068271726924,
"loss": 0.6267,
"step": 2530
},
{
"epoch": 0.610255175734232,
"grad_norm": 1.15625,
"learning_rate": 0.00021037725139076694,
"loss": 0.6143,
"step": 2535
},
{
"epoch": 0.6114588348579683,
"grad_norm": 1.2109375,
"learning_rate": 0.00021031371040758498,
"loss": 0.6115,
"step": 2540
},
{
"epoch": 0.6126624939817044,
"grad_norm": 1.171875,
"learning_rate": 0.0002102500598692454,
"loss": 0.6314,
"step": 2545
},
{
"epoch": 0.6138661531054406,
"grad_norm": 1.1484375,
"learning_rate": 0.00021018629987744564,
"loss": 0.6347,
"step": 2550
},
{
"epoch": 0.6150698122291767,
"grad_norm": 1.1015625,
"learning_rate": 0.00021012243053405768,
"loss": 0.6177,
"step": 2555
},
{
"epoch": 0.6162734713529129,
"grad_norm": 1.203125,
"learning_rate": 0.00021005845194112846,
"loss": 0.6154,
"step": 2560
},
{
"epoch": 0.617477130476649,
"grad_norm": 1.171875,
"learning_rate": 0.00020999436420087928,
"loss": 0.6396,
"step": 2565
},
{
"epoch": 0.6186807896003852,
"grad_norm": 1.203125,
"learning_rate": 0.000209930167415706,
"loss": 0.6254,
"step": 2570
},
{
"epoch": 0.6198844487241213,
"grad_norm": 1.1484375,
"learning_rate": 0.00020986586168817852,
"loss": 0.5998,
"step": 2575
},
{
"epoch": 0.6210881078478575,
"grad_norm": 1.2265625,
"learning_rate": 0.00020980144712104103,
"loss": 0.5929,
"step": 2580
},
{
"epoch": 0.6222917669715936,
"grad_norm": 1.109375,
"learning_rate": 0.0002097369238172114,
"loss": 0.6568,
"step": 2585
},
{
"epoch": 0.6234954260953298,
"grad_norm": 1.3046875,
"learning_rate": 0.0002096722918797814,
"loss": 0.6236,
"step": 2590
},
{
"epoch": 0.6246990852190659,
"grad_norm": 1.2421875,
"learning_rate": 0.00020960755141201625,
"loss": 0.633,
"step": 2595
},
{
"epoch": 0.6259027443428021,
"grad_norm": 1.265625,
"learning_rate": 0.00020954270251735465,
"loss": 0.6277,
"step": 2600
},
{
"epoch": 0.6271064034665382,
"grad_norm": 1.2734375,
"learning_rate": 0.0002094777452994085,
"loss": 0.6335,
"step": 2605
},
{
"epoch": 0.6283100625902744,
"grad_norm": 1.2890625,
"learning_rate": 0.00020941267986196275,
"loss": 0.6532,
"step": 2610
},
{
"epoch": 0.6295137217140105,
"grad_norm": 1.2265625,
"learning_rate": 0.00020934750630897535,
"loss": 0.6416,
"step": 2615
},
{
"epoch": 0.6307173808377468,
"grad_norm": 1.1484375,
"learning_rate": 0.00020928222474457688,
"loss": 0.6184,
"step": 2620
},
{
"epoch": 0.6319210399614829,
"grad_norm": 1.234375,
"learning_rate": 0.00020921683527307054,
"loss": 0.6667,
"step": 2625
},
{
"epoch": 0.6331246990852191,
"grad_norm": 1.140625,
"learning_rate": 0.00020915133799893202,
"loss": 0.6361,
"step": 2630
},
{
"epoch": 0.6343283582089553,
"grad_norm": 1.15625,
"learning_rate": 0.0002090857330268091,
"loss": 0.6221,
"step": 2635
},
{
"epoch": 0.6355320173326914,
"grad_norm": 1.1953125,
"learning_rate": 0.0002090200204615217,
"loss": 0.6652,
"step": 2640
},
{
"epoch": 0.6367356764564276,
"grad_norm": 1.296875,
"learning_rate": 0.0002089542004080617,
"loss": 0.6372,
"step": 2645
},
{
"epoch": 0.6379393355801637,
"grad_norm": 1.2578125,
"learning_rate": 0.00020888827297159266,
"loss": 0.6238,
"step": 2650
},
{
"epoch": 0.6391429947038999,
"grad_norm": 1.3203125,
"learning_rate": 0.0002088222382574497,
"loss": 0.6455,
"step": 2655
},
{
"epoch": 0.640346653827636,
"grad_norm": 1.2578125,
"learning_rate": 0.0002087560963711394,
"loss": 0.6246,
"step": 2660
},
{
"epoch": 0.6415503129513722,
"grad_norm": 1.15625,
"learning_rate": 0.0002086898474183395,
"loss": 0.6196,
"step": 2665
},
{
"epoch": 0.6427539720751083,
"grad_norm": 1.109375,
"learning_rate": 0.00020862349150489886,
"loss": 0.5973,
"step": 2670
},
{
"epoch": 0.6439576311988445,
"grad_norm": 1.09375,
"learning_rate": 0.00020855702873683724,
"loss": 0.6189,
"step": 2675
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.3984375,
"learning_rate": 0.00020849045922034508,
"loss": 0.631,
"step": 2680
},
{
"epoch": 0.6463649494463168,
"grad_norm": 1.1796875,
"learning_rate": 0.00020842378306178342,
"loss": 0.6271,
"step": 2685
},
{
"epoch": 0.6475686085700529,
"grad_norm": 1.203125,
"learning_rate": 0.00020835700036768364,
"loss": 0.6229,
"step": 2690
},
{
"epoch": 0.6487722676937892,
"grad_norm": 1.1796875,
"learning_rate": 0.00020829011124474738,
"loss": 0.6131,
"step": 2695
},
{
"epoch": 0.6499759268175253,
"grad_norm": 1.21875,
"learning_rate": 0.00020822311579984636,
"loss": 0.6246,
"step": 2700
},
{
"epoch": 0.6511795859412615,
"grad_norm": 1.1328125,
"learning_rate": 0.00020815601414002203,
"loss": 0.6777,
"step": 2705
},
{
"epoch": 0.6523832450649976,
"grad_norm": 1.140625,
"learning_rate": 0.00020808880637248573,
"loss": 0.6058,
"step": 2710
},
{
"epoch": 0.6535869041887338,
"grad_norm": 1.109375,
"learning_rate": 0.0002080214926046182,
"loss": 0.6386,
"step": 2715
},
{
"epoch": 0.6547905633124699,
"grad_norm": 1.1953125,
"learning_rate": 0.00020795407294396954,
"loss": 0.6198,
"step": 2720
},
{
"epoch": 0.6559942224362061,
"grad_norm": 1.203125,
"learning_rate": 0.0002078865474982592,
"loss": 0.6168,
"step": 2725
},
{
"epoch": 0.6571978815599422,
"grad_norm": 1.2421875,
"learning_rate": 0.00020781891637537542,
"loss": 0.6586,
"step": 2730
},
{
"epoch": 0.6584015406836784,
"grad_norm": 1.234375,
"learning_rate": 0.0002077511796833755,
"loss": 0.6667,
"step": 2735
},
{
"epoch": 0.6596051998074145,
"grad_norm": 1.09375,
"learning_rate": 0.0002076833375304852,
"loss": 0.6159,
"step": 2740
},
{
"epoch": 0.6608088589311507,
"grad_norm": 1.15625,
"learning_rate": 0.00020761539002509897,
"loss": 0.5914,
"step": 2745
},
{
"epoch": 0.6620125180548868,
"grad_norm": 1.140625,
"learning_rate": 0.00020754733727577945,
"loss": 0.6357,
"step": 2750
},
{
"epoch": 0.663216177178623,
"grad_norm": 1.140625,
"learning_rate": 0.00020747917939125757,
"loss": 0.6456,
"step": 2755
},
{
"epoch": 0.6644198363023591,
"grad_norm": 1.1171875,
"learning_rate": 0.00020741091648043204,
"loss": 0.6176,
"step": 2760
},
{
"epoch": 0.6656234954260953,
"grad_norm": 1.1953125,
"learning_rate": 0.0002073425486523696,
"loss": 0.6324,
"step": 2765
},
{
"epoch": 0.6668271545498314,
"grad_norm": 1.171875,
"learning_rate": 0.00020727407601630447,
"loss": 0.6396,
"step": 2770
},
{
"epoch": 0.6680308136735676,
"grad_norm": 1.3359375,
"learning_rate": 0.00020720549868163835,
"loss": 0.6223,
"step": 2775
},
{
"epoch": 0.6692344727973039,
"grad_norm": 1.1640625,
"learning_rate": 0.00020713681675794027,
"loss": 0.6403,
"step": 2780
},
{
"epoch": 0.67043813192104,
"grad_norm": 1.125,
"learning_rate": 0.0002070680303549463,
"loss": 0.6291,
"step": 2785
},
{
"epoch": 0.6716417910447762,
"grad_norm": 1.2265625,
"learning_rate": 0.00020699913958255951,
"loss": 0.602,
"step": 2790
},
{
"epoch": 0.6728454501685123,
"grad_norm": 1.1328125,
"learning_rate": 0.0002069301445508497,
"loss": 0.6251,
"step": 2795
},
{
"epoch": 0.6740491092922485,
"grad_norm": 1.1328125,
"learning_rate": 0.00020686104537005322,
"loss": 0.6268,
"step": 2800
},
{
"epoch": 0.6752527684159846,
"grad_norm": 1.1875,
"learning_rate": 0.00020679184215057286,
"loss": 0.6092,
"step": 2805
},
{
"epoch": 0.6764564275397208,
"grad_norm": 1.359375,
"learning_rate": 0.00020672253500297766,
"loss": 0.6273,
"step": 2810
},
{
"epoch": 0.6776600866634569,
"grad_norm": 1.34375,
"learning_rate": 0.00020665312403800258,
"loss": 0.6499,
"step": 2815
},
{
"epoch": 0.6788637457871931,
"grad_norm": 1.234375,
"learning_rate": 0.00020658360936654866,
"loss": 0.6342,
"step": 2820
},
{
"epoch": 0.6800674049109292,
"grad_norm": 1.5625,
"learning_rate": 0.00020651399109968243,
"loss": 0.6196,
"step": 2825
},
{
"epoch": 0.6812710640346654,
"grad_norm": 1.125,
"learning_rate": 0.0002064442693486361,
"loss": 0.5978,
"step": 2830
},
{
"epoch": 0.6824747231584015,
"grad_norm": 1.109375,
"learning_rate": 0.0002063744442248071,
"loss": 0.6216,
"step": 2835
},
{
"epoch": 0.6836783822821377,
"grad_norm": 1.171875,
"learning_rate": 0.00020630451583975812,
"loss": 0.64,
"step": 2840
},
{
"epoch": 0.6848820414058738,
"grad_norm": 1.1875,
"learning_rate": 0.0002062344843052168,
"loss": 0.6171,
"step": 2845
},
{
"epoch": 0.68608570052961,
"grad_norm": 1.1796875,
"learning_rate": 0.00020616434973307553,
"loss": 0.6148,
"step": 2850
},
{
"epoch": 0.6872893596533461,
"grad_norm": 1.1796875,
"learning_rate": 0.00020609411223539143,
"loss": 0.5956,
"step": 2855
},
{
"epoch": 0.6884930187770824,
"grad_norm": 1.1640625,
"learning_rate": 0.00020602377192438601,
"loss": 0.6167,
"step": 2860
},
{
"epoch": 0.6896966779008185,
"grad_norm": 1.1875,
"learning_rate": 0.00020595332891244503,
"loss": 0.6228,
"step": 2865
},
{
"epoch": 0.6909003370245547,
"grad_norm": 1.1796875,
"learning_rate": 0.00020588278331211833,
"loss": 0.6118,
"step": 2870
},
{
"epoch": 0.6921039961482908,
"grad_norm": 1.171875,
"learning_rate": 0.00020581213523611976,
"loss": 0.6046,
"step": 2875
},
{
"epoch": 0.693307655272027,
"grad_norm": 1.1171875,
"learning_rate": 0.00020574138479732682,
"loss": 0.6188,
"step": 2880
},
{
"epoch": 0.6945113143957631,
"grad_norm": 1.1328125,
"learning_rate": 0.00020567053210878057,
"loss": 0.6364,
"step": 2885
},
{
"epoch": 0.6957149735194993,
"grad_norm": 1.0859375,
"learning_rate": 0.00020559957728368545,
"loss": 0.6237,
"step": 2890
},
{
"epoch": 0.6969186326432354,
"grad_norm": 1.2109375,
"learning_rate": 0.00020552852043540903,
"loss": 0.6406,
"step": 2895
},
{
"epoch": 0.6981222917669716,
"grad_norm": 1.328125,
"learning_rate": 0.000205457361677482,
"loss": 0.5886,
"step": 2900
},
{
"epoch": 0.6993259508907077,
"grad_norm": 1.15625,
"learning_rate": 0.00020538610112359783,
"loss": 0.6265,
"step": 2905
},
{
"epoch": 0.7005296100144439,
"grad_norm": 1.078125,
"learning_rate": 0.0002053147388876125,
"loss": 0.5859,
"step": 2910
},
{
"epoch": 0.70173326913818,
"grad_norm": 1.203125,
"learning_rate": 0.0002052432750835447,
"loss": 0.6309,
"step": 2915
},
{
"epoch": 0.7029369282619162,
"grad_norm": 1.1484375,
"learning_rate": 0.00020517170982557522,
"loss": 0.5542,
"step": 2920
},
{
"epoch": 0.7041405873856523,
"grad_norm": 1.1796875,
"learning_rate": 0.000205100043228047,
"loss": 0.6296,
"step": 2925
},
{
"epoch": 0.7053442465093885,
"grad_norm": 1.1328125,
"learning_rate": 0.00020502827540546485,
"loss": 0.5848,
"step": 2930
},
{
"epoch": 0.7065479056331248,
"grad_norm": 1.2421875,
"learning_rate": 0.00020495640647249537,
"loss": 0.5947,
"step": 2935
},
{
"epoch": 0.7077515647568609,
"grad_norm": 1.109375,
"learning_rate": 0.00020488443654396676,
"loss": 0.5835,
"step": 2940
},
{
"epoch": 0.7089552238805971,
"grad_norm": 1.0703125,
"learning_rate": 0.00020481236573486846,
"loss": 0.6321,
"step": 2945
},
{
"epoch": 0.7101588830043332,
"grad_norm": 1.3125,
"learning_rate": 0.00020474019416035115,
"loss": 0.621,
"step": 2950
},
{
"epoch": 0.7113625421280694,
"grad_norm": 1.1875,
"learning_rate": 0.0002046679219357265,
"loss": 0.6051,
"step": 2955
},
{
"epoch": 0.7125662012518055,
"grad_norm": 1.15625,
"learning_rate": 0.00020459554917646699,
"loss": 0.5716,
"step": 2960
},
{
"epoch": 0.7137698603755417,
"grad_norm": 1.1171875,
"learning_rate": 0.00020452307599820577,
"loss": 0.5959,
"step": 2965
},
{
"epoch": 0.7149735194992778,
"grad_norm": 1.125,
"learning_rate": 0.00020445050251673635,
"loss": 0.6137,
"step": 2970
},
{
"epoch": 0.716177178623014,
"grad_norm": 1.2421875,
"learning_rate": 0.0002043778288480126,
"loss": 0.5872,
"step": 2975
},
{
"epoch": 0.7173808377467501,
"grad_norm": 1.1484375,
"learning_rate": 0.0002043050551081484,
"loss": 0.6065,
"step": 2980
},
{
"epoch": 0.7185844968704863,
"grad_norm": 1.0859375,
"learning_rate": 0.00020423218141341754,
"loss": 0.6223,
"step": 2985
},
{
"epoch": 0.7197881559942224,
"grad_norm": 1.2578125,
"learning_rate": 0.00020415920788025344,
"loss": 0.6176,
"step": 2990
},
{
"epoch": 0.7209918151179586,
"grad_norm": 1.1640625,
"learning_rate": 0.00020408613462524918,
"loss": 0.597,
"step": 2995
},
{
"epoch": 0.7221954742416947,
"grad_norm": 1.140625,
"learning_rate": 0.00020401296176515704,
"loss": 0.5909,
"step": 3000
},
{
"epoch": 0.7221954742416947,
"eval_loss": 0.5192618370056152,
"eval_runtime": 2.3635,
"eval_samples_per_second": 84.619,
"eval_steps_per_second": 84.619,
"step": 3000
},
{
"epoch": 0.7233991333654309,
"grad_norm": 1.203125,
"learning_rate": 0.00020393968941688853,
"loss": 0.6374,
"step": 3005
},
{
"epoch": 0.724602792489167,
"grad_norm": 1.21875,
"learning_rate": 0.00020386631769751402,
"loss": 0.6696,
"step": 3010
},
{
"epoch": 0.7258064516129032,
"grad_norm": 1.0859375,
"learning_rate": 0.00020379284672426278,
"loss": 0.6053,
"step": 3015
},
{
"epoch": 0.7270101107366393,
"grad_norm": 1.109375,
"learning_rate": 0.0002037192766145225,
"loss": 0.5901,
"step": 3020
},
{
"epoch": 0.7282137698603756,
"grad_norm": 1.078125,
"learning_rate": 0.00020364560748583946,
"loss": 0.5916,
"step": 3025
},
{
"epoch": 0.7294174289841117,
"grad_norm": 1.2265625,
"learning_rate": 0.00020357183945591797,
"loss": 0.6242,
"step": 3030
},
{
"epoch": 0.7306210881078479,
"grad_norm": 1.0546875,
"learning_rate": 0.00020349797264262046,
"loss": 0.5957,
"step": 3035
},
{
"epoch": 0.731824747231584,
"grad_norm": 1.140625,
"learning_rate": 0.00020342400716396718,
"loss": 0.58,
"step": 3040
},
{
"epoch": 0.7330284063553202,
"grad_norm": 1.140625,
"learning_rate": 0.00020334994313813597,
"loss": 0.5831,
"step": 3045
},
{
"epoch": 0.7342320654790563,
"grad_norm": 1.0625,
"learning_rate": 0.00020327578068346212,
"loss": 0.6318,
"step": 3050
},
{
"epoch": 0.7354357246027925,
"grad_norm": 1.3046875,
"learning_rate": 0.00020320151991843832,
"loss": 0.5763,
"step": 3055
},
{
"epoch": 0.7366393837265286,
"grad_norm": 1.2734375,
"learning_rate": 0.00020312716096171417,
"loss": 0.6231,
"step": 3060
},
{
"epoch": 0.7378430428502648,
"grad_norm": 1.1484375,
"learning_rate": 0.0002030527039320962,
"loss": 0.6096,
"step": 3065
},
{
"epoch": 0.7390467019740009,
"grad_norm": 1.1796875,
"learning_rate": 0.00020297814894854773,
"loss": 0.6414,
"step": 3070
},
{
"epoch": 0.7402503610977371,
"grad_norm": 1.1015625,
"learning_rate": 0.00020290349613018846,
"loss": 0.6351,
"step": 3075
},
{
"epoch": 0.7414540202214733,
"grad_norm": 1.109375,
"learning_rate": 0.00020282874559629445,
"loss": 0.6088,
"step": 3080
},
{
"epoch": 0.7426576793452094,
"grad_norm": 1.09375,
"learning_rate": 0.00020275389746629793,
"loss": 0.5377,
"step": 3085
},
{
"epoch": 0.7438613384689456,
"grad_norm": 1.1328125,
"learning_rate": 0.00020267895185978704,
"loss": 0.634,
"step": 3090
},
{
"epoch": 0.7450649975926817,
"grad_norm": 1.015625,
"learning_rate": 0.00020260390889650554,
"loss": 0.5934,
"step": 3095
},
{
"epoch": 0.746268656716418,
"grad_norm": 1.0859375,
"learning_rate": 0.00020252876869635293,
"loss": 0.6131,
"step": 3100
},
{
"epoch": 0.7474723158401541,
"grad_norm": 1.2109375,
"learning_rate": 0.00020245353137938397,
"loss": 0.6048,
"step": 3105
},
{
"epoch": 0.7486759749638903,
"grad_norm": 1.265625,
"learning_rate": 0.00020237819706580865,
"loss": 0.5675,
"step": 3110
},
{
"epoch": 0.7498796340876264,
"grad_norm": 0.96875,
"learning_rate": 0.00020230276587599182,
"loss": 0.6105,
"step": 3115
},
{
"epoch": 0.7510832932113626,
"grad_norm": 1.0546875,
"learning_rate": 0.00020222723793045323,
"loss": 0.5907,
"step": 3120
},
{
"epoch": 0.7522869523350987,
"grad_norm": 1.0859375,
"learning_rate": 0.00020215161334986715,
"loss": 0.5972,
"step": 3125
},
{
"epoch": 0.7534906114588349,
"grad_norm": 1.1484375,
"learning_rate": 0.00020207589225506228,
"loss": 0.619,
"step": 3130
},
{
"epoch": 0.754694270582571,
"grad_norm": 1.1796875,
"learning_rate": 0.0002020000747670215,
"loss": 0.5716,
"step": 3135
},
{
"epoch": 0.7558979297063072,
"grad_norm": 1.046875,
"learning_rate": 0.00020192416100688176,
"loss": 0.6358,
"step": 3140
},
{
"epoch": 0.7571015888300433,
"grad_norm": 1.1328125,
"learning_rate": 0.00020184815109593377,
"loss": 0.5854,
"step": 3145
},
{
"epoch": 0.7583052479537795,
"grad_norm": 1.1328125,
"learning_rate": 0.00020177204515562188,
"loss": 0.6118,
"step": 3150
},
{
"epoch": 0.7595089070775156,
"grad_norm": 1.09375,
"learning_rate": 0.00020169584330754389,
"loss": 0.5739,
"step": 3155
},
{
"epoch": 0.7607125662012518,
"grad_norm": 1.0703125,
"learning_rate": 0.00020161954567345078,
"loss": 0.6189,
"step": 3160
},
{
"epoch": 0.7619162253249879,
"grad_norm": 1.1484375,
"learning_rate": 0.00020154315237524666,
"loss": 0.6023,
"step": 3165
},
{
"epoch": 0.7631198844487241,
"grad_norm": 1.0234375,
"learning_rate": 0.00020146666353498843,
"loss": 0.5824,
"step": 3170
},
{
"epoch": 0.7643235435724602,
"grad_norm": 1.09375,
"learning_rate": 0.0002013900792748856,
"loss": 0.6014,
"step": 3175
},
{
"epoch": 0.7655272026961965,
"grad_norm": 1.15625,
"learning_rate": 0.0002013133997173002,
"loss": 0.5952,
"step": 3180
},
{
"epoch": 0.7667308618199326,
"grad_norm": 1.2265625,
"learning_rate": 0.00020123662498474653,
"loss": 0.6018,
"step": 3185
},
{
"epoch": 0.7679345209436688,
"grad_norm": 1.09375,
"learning_rate": 0.00020115975519989092,
"loss": 0.6101,
"step": 3190
},
{
"epoch": 0.7691381800674049,
"grad_norm": 1.0859375,
"learning_rate": 0.00020108279048555158,
"loss": 0.6157,
"step": 3195
},
{
"epoch": 0.7703418391911411,
"grad_norm": 1.1484375,
"learning_rate": 0.0002010057309646984,
"loss": 0.6372,
"step": 3200
},
{
"epoch": 0.7715454983148772,
"grad_norm": 1.1484375,
"learning_rate": 0.00020092857676045272,
"loss": 0.5911,
"step": 3205
},
{
"epoch": 0.7727491574386134,
"grad_norm": 1.0859375,
"learning_rate": 0.0002008513279960872,
"loss": 0.6214,
"step": 3210
},
{
"epoch": 0.7739528165623495,
"grad_norm": 1.09375,
"learning_rate": 0.0002007739847950256,
"loss": 0.6084,
"step": 3215
},
{
"epoch": 0.7751564756860857,
"grad_norm": 1.1484375,
"learning_rate": 0.00020069654728084243,
"loss": 0.5793,
"step": 3220
},
{
"epoch": 0.7763601348098219,
"grad_norm": 1.0703125,
"learning_rate": 0.00020061901557726308,
"loss": 0.6221,
"step": 3225
},
{
"epoch": 0.777563793933558,
"grad_norm": 1.09375,
"learning_rate": 0.0002005413898081633,
"loss": 0.6532,
"step": 3230
},
{
"epoch": 0.7787674530572942,
"grad_norm": 1.1953125,
"learning_rate": 0.0002004636700975691,
"loss": 0.6124,
"step": 3235
},
{
"epoch": 0.7799711121810303,
"grad_norm": 1.1484375,
"learning_rate": 0.00020038585656965684,
"loss": 0.5842,
"step": 3240
},
{
"epoch": 0.7811747713047665,
"grad_norm": 1.2578125,
"learning_rate": 0.00020030794934875238,
"loss": 0.5539,
"step": 3245
},
{
"epoch": 0.7823784304285026,
"grad_norm": 1.015625,
"learning_rate": 0.0002002299485593316,
"loss": 0.6064,
"step": 3250
},
{
"epoch": 0.7835820895522388,
"grad_norm": 1.3125,
"learning_rate": 0.00020015185432601976,
"loss": 0.5736,
"step": 3255
},
{
"epoch": 0.784785748675975,
"grad_norm": 1.1484375,
"learning_rate": 0.00020007366677359138,
"loss": 0.615,
"step": 3260
},
{
"epoch": 0.7859894077997112,
"grad_norm": 1.015625,
"learning_rate": 0.0001999953860269702,
"loss": 0.5785,
"step": 3265
},
{
"epoch": 0.7871930669234473,
"grad_norm": 1.046875,
"learning_rate": 0.00019991701221122872,
"loss": 0.562,
"step": 3270
},
{
"epoch": 0.7883967260471835,
"grad_norm": 1.21875,
"learning_rate": 0.00019983854545158823,
"loss": 0.6186,
"step": 3275
},
{
"epoch": 0.7896003851709196,
"grad_norm": 1.078125,
"learning_rate": 0.0001997599858734185,
"loss": 0.5616,
"step": 3280
},
{
"epoch": 0.7908040442946558,
"grad_norm": 1.1484375,
"learning_rate": 0.00019968133360223758,
"loss": 0.621,
"step": 3285
},
{
"epoch": 0.7920077034183919,
"grad_norm": 1.078125,
"learning_rate": 0.00019960258876371164,
"loss": 0.5845,
"step": 3290
},
{
"epoch": 0.7932113625421281,
"grad_norm": 1.1484375,
"learning_rate": 0.00019952375148365477,
"loss": 0.602,
"step": 3295
},
{
"epoch": 0.7944150216658642,
"grad_norm": 1.1796875,
"learning_rate": 0.00019944482188802873,
"loss": 0.5973,
"step": 3300
},
{
"epoch": 0.7956186807896004,
"grad_norm": 1.0703125,
"learning_rate": 0.00019936580010294273,
"loss": 0.5952,
"step": 3305
},
{
"epoch": 0.7968223399133365,
"grad_norm": 1.234375,
"learning_rate": 0.0001992866862546534,
"loss": 0.5821,
"step": 3310
},
{
"epoch": 0.7980259990370727,
"grad_norm": 1.1875,
"learning_rate": 0.00019920748046956433,
"loss": 0.6004,
"step": 3315
},
{
"epoch": 0.7992296581608088,
"grad_norm": 1.0703125,
"learning_rate": 0.0001991281828742261,
"loss": 0.6098,
"step": 3320
},
{
"epoch": 0.800433317284545,
"grad_norm": 1.125,
"learning_rate": 0.0001990487935953359,
"loss": 0.6058,
"step": 3325
},
{
"epoch": 0.8016369764082811,
"grad_norm": 1.0859375,
"learning_rate": 0.00019896931275973747,
"loss": 0.5879,
"step": 3330
},
{
"epoch": 0.8028406355320173,
"grad_norm": 1.2109375,
"learning_rate": 0.00019888974049442077,
"loss": 0.5833,
"step": 3335
},
{
"epoch": 0.8040442946557534,
"grad_norm": 1.25,
"learning_rate": 0.00019881007692652199,
"loss": 0.6149,
"step": 3340
},
{
"epoch": 0.8052479537794897,
"grad_norm": 1.15625,
"learning_rate": 0.00019873032218332298,
"loss": 0.5989,
"step": 3345
},
{
"epoch": 0.8064516129032258,
"grad_norm": 1.15625,
"learning_rate": 0.00019865047639225142,
"loss": 0.5539,
"step": 3350
},
{
"epoch": 0.807655272026962,
"grad_norm": 1.125,
"learning_rate": 0.00019857053968088038,
"loss": 0.5937,
"step": 3355
},
{
"epoch": 0.8088589311506981,
"grad_norm": 1.046875,
"learning_rate": 0.0001984905121769283,
"loss": 0.5273,
"step": 3360
},
{
"epoch": 0.8100625902744343,
"grad_norm": 1.15625,
"learning_rate": 0.00019841039400825852,
"loss": 0.5724,
"step": 3365
},
{
"epoch": 0.8112662493981705,
"grad_norm": 1.1015625,
"learning_rate": 0.00019833018530287944,
"loss": 0.589,
"step": 3370
},
{
"epoch": 0.8124699085219066,
"grad_norm": 1.046875,
"learning_rate": 0.00019824988618894398,
"loss": 0.5946,
"step": 3375
},
{
"epoch": 0.8136735676456428,
"grad_norm": 1.125,
"learning_rate": 0.00019816949679474948,
"loss": 0.5985,
"step": 3380
},
{
"epoch": 0.8148772267693789,
"grad_norm": 0.9921875,
"learning_rate": 0.00019808901724873763,
"loss": 0.5461,
"step": 3385
},
{
"epoch": 0.8160808858931151,
"grad_norm": 1.2734375,
"learning_rate": 0.0001980084476794941,
"loss": 0.6385,
"step": 3390
},
{
"epoch": 0.8172845450168512,
"grad_norm": 1.234375,
"learning_rate": 0.00019792778821574843,
"loss": 0.5913,
"step": 3395
},
{
"epoch": 0.8184882041405874,
"grad_norm": 1.078125,
"learning_rate": 0.00019784703898637372,
"loss": 0.5912,
"step": 3400
},
{
"epoch": 0.8196918632643235,
"grad_norm": 1.359375,
"learning_rate": 0.00019776620012038661,
"loss": 0.6319,
"step": 3405
},
{
"epoch": 0.8208955223880597,
"grad_norm": 1.109375,
"learning_rate": 0.00019768527174694682,
"loss": 0.5934,
"step": 3410
},
{
"epoch": 0.8220991815117958,
"grad_norm": 1.1953125,
"learning_rate": 0.00019760425399535718,
"loss": 0.5981,
"step": 3415
},
{
"epoch": 0.823302840635532,
"grad_norm": 1.2265625,
"learning_rate": 0.00019752314699506327,
"loss": 0.5668,
"step": 3420
},
{
"epoch": 0.8245064997592682,
"grad_norm": 1.140625,
"learning_rate": 0.00019744195087565328,
"loss": 0.6208,
"step": 3425
},
{
"epoch": 0.8257101588830044,
"grad_norm": 1.1328125,
"learning_rate": 0.00019736066576685784,
"loss": 0.6283,
"step": 3430
},
{
"epoch": 0.8269138180067405,
"grad_norm": 1.1015625,
"learning_rate": 0.00019727929179854962,
"loss": 0.5476,
"step": 3435
},
{
"epoch": 0.8281174771304767,
"grad_norm": 2.46875,
"learning_rate": 0.00019719782910074347,
"loss": 0.5595,
"step": 3440
},
{
"epoch": 0.8293211362542128,
"grad_norm": 1.0859375,
"learning_rate": 0.0001971162778035958,
"loss": 0.5679,
"step": 3445
},
{
"epoch": 0.830524795377949,
"grad_norm": 1.15625,
"learning_rate": 0.0001970346380374048,
"loss": 0.5718,
"step": 3450
},
{
"epoch": 0.8317284545016851,
"grad_norm": 1.1171875,
"learning_rate": 0.00019695290993260978,
"loss": 0.598,
"step": 3455
},
{
"epoch": 0.8329321136254213,
"grad_norm": 1.2734375,
"learning_rate": 0.00019687109361979133,
"loss": 0.6097,
"step": 3460
},
{
"epoch": 0.8341357727491574,
"grad_norm": 1.0390625,
"learning_rate": 0.00019678918922967094,
"loss": 0.5711,
"step": 3465
},
{
"epoch": 0.8353394318728936,
"grad_norm": 1.0390625,
"learning_rate": 0.00019670719689311085,
"loss": 0.5832,
"step": 3470
},
{
"epoch": 0.8365430909966297,
"grad_norm": 1.125,
"learning_rate": 0.0001966251167411138,
"loss": 0.5661,
"step": 3475
},
{
"epoch": 0.8377467501203659,
"grad_norm": 1.1328125,
"learning_rate": 0.0001965429489048228,
"loss": 0.5885,
"step": 3480
},
{
"epoch": 0.838950409244102,
"grad_norm": 1.046875,
"learning_rate": 0.00019646069351552097,
"loss": 0.6013,
"step": 3485
},
{
"epoch": 0.8401540683678382,
"grad_norm": 1.1171875,
"learning_rate": 0.00019637835070463141,
"loss": 0.541,
"step": 3490
},
{
"epoch": 0.8413577274915743,
"grad_norm": 1.21875,
"learning_rate": 0.00019629592060371674,
"loss": 0.6132,
"step": 3495
},
{
"epoch": 0.8425613866153105,
"grad_norm": 1.0703125,
"learning_rate": 0.00019621340334447922,
"loss": 0.6197,
"step": 3500
},
{
"epoch": 0.8425613866153105,
"eval_loss": 0.49436959624290466,
"eval_runtime": 2.3689,
"eval_samples_per_second": 84.427,
"eval_steps_per_second": 84.427,
"step": 3500
},
{
"epoch": 0.8437650457390466,
"grad_norm": 1.109375,
"learning_rate": 0.0001961307990587602,
"loss": 0.5976,
"step": 3505
},
{
"epoch": 0.8449687048627829,
"grad_norm": 1.109375,
"learning_rate": 0.0001960481078785402,
"loss": 0.5945,
"step": 3510
},
{
"epoch": 0.8461723639865191,
"grad_norm": 1.203125,
"learning_rate": 0.0001959653299359385,
"loss": 0.5928,
"step": 3515
},
{
"epoch": 0.8473760231102552,
"grad_norm": 1.1171875,
"learning_rate": 0.00019588246536321303,
"loss": 0.5758,
"step": 3520
},
{
"epoch": 0.8485796822339914,
"grad_norm": 1.1171875,
"learning_rate": 0.00019579951429276013,
"loss": 0.6009,
"step": 3525
},
{
"epoch": 0.8497833413577275,
"grad_norm": 1.1796875,
"learning_rate": 0.0001957164768571144,
"loss": 0.5906,
"step": 3530
},
{
"epoch": 0.8509870004814637,
"grad_norm": 1.21875,
"learning_rate": 0.00019563335318894832,
"loss": 0.6478,
"step": 3535
},
{
"epoch": 0.8521906596051998,
"grad_norm": 1.0546875,
"learning_rate": 0.00019555014342107223,
"loss": 0.5956,
"step": 3540
},
{
"epoch": 0.853394318728936,
"grad_norm": 1.1875,
"learning_rate": 0.00019546684768643397,
"loss": 0.5881,
"step": 3545
},
{
"epoch": 0.8545979778526721,
"grad_norm": 1.125,
"learning_rate": 0.00019538346611811883,
"loss": 0.5621,
"step": 3550
},
{
"epoch": 0.8558016369764083,
"grad_norm": 1.0703125,
"learning_rate": 0.0001952999988493491,
"loss": 0.6075,
"step": 3555
},
{
"epoch": 0.8570052961001444,
"grad_norm": 1.0078125,
"learning_rate": 0.00019521644601348418,
"loss": 0.5892,
"step": 3560
},
{
"epoch": 0.8582089552238806,
"grad_norm": 1.046875,
"learning_rate": 0.00019513280774402004,
"loss": 0.6142,
"step": 3565
},
{
"epoch": 0.8594126143476167,
"grad_norm": 1.1484375,
"learning_rate": 0.00019504908417458916,
"loss": 0.591,
"step": 3570
},
{
"epoch": 0.8606162734713529,
"grad_norm": 1.0625,
"learning_rate": 0.00019496527543896034,
"loss": 0.5976,
"step": 3575
},
{
"epoch": 0.861819932595089,
"grad_norm": 1.1875,
"learning_rate": 0.00019488138167103852,
"loss": 0.5781,
"step": 3580
},
{
"epoch": 0.8630235917188253,
"grad_norm": 1.015625,
"learning_rate": 0.0001947974030048644,
"loss": 0.56,
"step": 3585
},
{
"epoch": 0.8642272508425614,
"grad_norm": 1.0546875,
"learning_rate": 0.0001947133395746143,
"loss": 0.5765,
"step": 3590
},
{
"epoch": 0.8654309099662976,
"grad_norm": 1.109375,
"learning_rate": 0.00019462919151460014,
"loss": 0.5993,
"step": 3595
},
{
"epoch": 0.8666345690900337,
"grad_norm": 1.1484375,
"learning_rate": 0.00019454495895926887,
"loss": 0.577,
"step": 3600
},
{
"epoch": 0.8678382282137699,
"grad_norm": 1.09375,
"learning_rate": 0.00019446064204320257,
"loss": 0.5855,
"step": 3605
},
{
"epoch": 0.869041887337506,
"grad_norm": 1.1640625,
"learning_rate": 0.00019437624090111802,
"loss": 0.5685,
"step": 3610
},
{
"epoch": 0.8702455464612422,
"grad_norm": 1.046875,
"learning_rate": 0.0001942917556678666,
"loss": 0.6119,
"step": 3615
},
{
"epoch": 0.8714492055849783,
"grad_norm": 1.1796875,
"learning_rate": 0.00019420718647843413,
"loss": 0.5716,
"step": 3620
},
{
"epoch": 0.8726528647087145,
"grad_norm": 1.1171875,
"learning_rate": 0.00019412253346794042,
"loss": 0.5957,
"step": 3625
},
{
"epoch": 0.8738565238324506,
"grad_norm": 1.1015625,
"learning_rate": 0.00019403779677163927,
"loss": 0.6095,
"step": 3630
},
{
"epoch": 0.8750601829561868,
"grad_norm": 1.1171875,
"learning_rate": 0.00019395297652491825,
"loss": 0.5677,
"step": 3635
},
{
"epoch": 0.8762638420799229,
"grad_norm": 1.0078125,
"learning_rate": 0.00019386807286329836,
"loss": 0.613,
"step": 3640
},
{
"epoch": 0.8774675012036591,
"grad_norm": 1.2109375,
"learning_rate": 0.00019378308592243388,
"loss": 0.5871,
"step": 3645
},
{
"epoch": 0.8786711603273952,
"grad_norm": 1.1875,
"learning_rate": 0.00019369801583811214,
"loss": 0.5831,
"step": 3650
},
{
"epoch": 0.8798748194511314,
"grad_norm": 1.0546875,
"learning_rate": 0.00019361286274625333,
"loss": 0.5734,
"step": 3655
},
{
"epoch": 0.8810784785748677,
"grad_norm": 1.078125,
"learning_rate": 0.0001935276267829103,
"loss": 0.5495,
"step": 3660
},
{
"epoch": 0.8822821376986038,
"grad_norm": 1.1015625,
"learning_rate": 0.00019344230808426822,
"loss": 0.5392,
"step": 3665
},
{
"epoch": 0.88348579682234,
"grad_norm": 1.0546875,
"learning_rate": 0.00019335690678664452,
"loss": 0.6094,
"step": 3670
},
{
"epoch": 0.8846894559460761,
"grad_norm": 1.125,
"learning_rate": 0.00019327142302648855,
"loss": 0.5895,
"step": 3675
},
{
"epoch": 0.8858931150698123,
"grad_norm": 1.203125,
"learning_rate": 0.0001931858569403815,
"loss": 0.5965,
"step": 3680
},
{
"epoch": 0.8870967741935484,
"grad_norm": 1.1015625,
"learning_rate": 0.000193100208665036,
"loss": 0.5508,
"step": 3685
},
{
"epoch": 0.8883004333172846,
"grad_norm": 1.1875,
"learning_rate": 0.00019301447833729607,
"loss": 0.578,
"step": 3690
},
{
"epoch": 0.8895040924410207,
"grad_norm": 1.234375,
"learning_rate": 0.00019292866609413675,
"loss": 0.5845,
"step": 3695
},
{
"epoch": 0.8907077515647569,
"grad_norm": 1.171875,
"learning_rate": 0.00019284277207266408,
"loss": 0.6025,
"step": 3700
},
{
"epoch": 0.891911410688493,
"grad_norm": 0.9375,
"learning_rate": 0.0001927567964101146,
"loss": 0.5551,
"step": 3705
},
{
"epoch": 0.8931150698122292,
"grad_norm": 1.0703125,
"learning_rate": 0.00019267073924385546,
"loss": 0.5814,
"step": 3710
},
{
"epoch": 0.8943187289359653,
"grad_norm": 1.078125,
"learning_rate": 0.00019258460071138389,
"loss": 0.5823,
"step": 3715
},
{
"epoch": 0.8955223880597015,
"grad_norm": 1.0078125,
"learning_rate": 0.00019249838095032718,
"loss": 0.5693,
"step": 3720
},
{
"epoch": 0.8967260471834376,
"grad_norm": 1.0,
"learning_rate": 0.00019241208009844246,
"loss": 0.5843,
"step": 3725
},
{
"epoch": 0.8979297063071738,
"grad_norm": 0.97265625,
"learning_rate": 0.00019232569829361632,
"loss": 0.5916,
"step": 3730
},
{
"epoch": 0.8991333654309099,
"grad_norm": 1.09375,
"learning_rate": 0.00019223923567386478,
"loss": 0.5436,
"step": 3735
},
{
"epoch": 0.9003370245546461,
"grad_norm": 1.0,
"learning_rate": 0.0001921526923773329,
"loss": 0.5652,
"step": 3740
},
{
"epoch": 0.9015406836783822,
"grad_norm": 1.109375,
"learning_rate": 0.00019206606854229468,
"loss": 0.5477,
"step": 3745
},
{
"epoch": 0.9027443428021185,
"grad_norm": 1.0703125,
"learning_rate": 0.00019197936430715286,
"loss": 0.5735,
"step": 3750
},
{
"epoch": 0.9039480019258546,
"grad_norm": 1.03125,
"learning_rate": 0.00019189257981043852,
"loss": 0.5843,
"step": 3755
},
{
"epoch": 0.9051516610495908,
"grad_norm": 1.0625,
"learning_rate": 0.00019180571519081108,
"loss": 0.5866,
"step": 3760
},
{
"epoch": 0.9063553201733269,
"grad_norm": 1.1875,
"learning_rate": 0.0001917187705870579,
"loss": 0.6164,
"step": 3765
},
{
"epoch": 0.9075589792970631,
"grad_norm": 1.015625,
"learning_rate": 0.00019163174613809423,
"loss": 0.5736,
"step": 3770
},
{
"epoch": 0.9087626384207992,
"grad_norm": 1.0546875,
"learning_rate": 0.00019154464198296273,
"loss": 0.5858,
"step": 3775
},
{
"epoch": 0.9099662975445354,
"grad_norm": 1.09375,
"learning_rate": 0.0001914574582608336,
"loss": 0.5868,
"step": 3780
},
{
"epoch": 0.9111699566682715,
"grad_norm": 1.015625,
"learning_rate": 0.00019137019511100402,
"loss": 0.5738,
"step": 3785
},
{
"epoch": 0.9123736157920077,
"grad_norm": 1.0859375,
"learning_rate": 0.0001912828526728982,
"loss": 0.5732,
"step": 3790
},
{
"epoch": 0.9135772749157438,
"grad_norm": 1.078125,
"learning_rate": 0.00019119543108606687,
"loss": 0.5582,
"step": 3795
},
{
"epoch": 0.91478093403948,
"grad_norm": 1.109375,
"learning_rate": 0.0001911079304901874,
"loss": 0.5933,
"step": 3800
},
{
"epoch": 0.9159845931632162,
"grad_norm": 1.0390625,
"learning_rate": 0.00019102035102506326,
"loss": 0.5816,
"step": 3805
},
{
"epoch": 0.9171882522869523,
"grad_norm": 1.1328125,
"learning_rate": 0.00019093269283062403,
"loss": 0.6079,
"step": 3810
},
{
"epoch": 0.9183919114106885,
"grad_norm": 1.0546875,
"learning_rate": 0.000190844956046925,
"loss": 0.5637,
"step": 3815
},
{
"epoch": 0.9195955705344246,
"grad_norm": 1.1875,
"learning_rate": 0.00019075714081414705,
"loss": 0.6033,
"step": 3820
},
{
"epoch": 0.9207992296581609,
"grad_norm": 1.203125,
"learning_rate": 0.00019066924727259644,
"loss": 0.5661,
"step": 3825
},
{
"epoch": 0.922002888781897,
"grad_norm": 1.1015625,
"learning_rate": 0.00019058127556270451,
"loss": 0.5875,
"step": 3830
},
{
"epoch": 0.9232065479056332,
"grad_norm": 1.140625,
"learning_rate": 0.00019049322582502748,
"loss": 0.5951,
"step": 3835
},
{
"epoch": 0.9244102070293693,
"grad_norm": 1.0859375,
"learning_rate": 0.00019040509820024626,
"loss": 0.536,
"step": 3840
},
{
"epoch": 0.9256138661531055,
"grad_norm": 1.171875,
"learning_rate": 0.00019031689282916623,
"loss": 0.5662,
"step": 3845
},
{
"epoch": 0.9268175252768416,
"grad_norm": 1.0,
"learning_rate": 0.000190228609852717,
"loss": 0.5794,
"step": 3850
},
{
"epoch": 0.9280211844005778,
"grad_norm": 1.171875,
"learning_rate": 0.00019014024941195202,
"loss": 0.6226,
"step": 3855
},
{
"epoch": 0.9292248435243139,
"grad_norm": 1.1015625,
"learning_rate": 0.00019005181164804874,
"loss": 0.5725,
"step": 3860
},
{
"epoch": 0.9304285026480501,
"grad_norm": 1.109375,
"learning_rate": 0.0001899632967023079,
"loss": 0.5924,
"step": 3865
},
{
"epoch": 0.9316321617717862,
"grad_norm": 1.1640625,
"learning_rate": 0.00018987470471615382,
"loss": 0.6017,
"step": 3870
},
{
"epoch": 0.9328358208955224,
"grad_norm": 1.0546875,
"learning_rate": 0.00018978603583113374,
"loss": 0.5651,
"step": 3875
},
{
"epoch": 0.9340394800192585,
"grad_norm": 1.046875,
"learning_rate": 0.0001896972901889178,
"loss": 0.5857,
"step": 3880
},
{
"epoch": 0.9352431391429947,
"grad_norm": 1.0234375,
"learning_rate": 0.00018960846793129876,
"loss": 0.5409,
"step": 3885
},
{
"epoch": 0.9364467982667308,
"grad_norm": 1.125,
"learning_rate": 0.0001895195692001919,
"loss": 0.5686,
"step": 3890
},
{
"epoch": 0.937650457390467,
"grad_norm": 1.125,
"learning_rate": 0.00018943059413763452,
"loss": 0.5615,
"step": 3895
},
{
"epoch": 0.9388541165142031,
"grad_norm": 1.171875,
"learning_rate": 0.00018934154288578598,
"loss": 0.5917,
"step": 3900
},
{
"epoch": 0.9400577756379394,
"grad_norm": 0.99609375,
"learning_rate": 0.00018925241558692742,
"loss": 0.5641,
"step": 3905
},
{
"epoch": 0.9412614347616755,
"grad_norm": 1.09375,
"learning_rate": 0.0001891632123834613,
"loss": 0.5998,
"step": 3910
},
{
"epoch": 0.9424650938854117,
"grad_norm": 1.0703125,
"learning_rate": 0.00018907393341791154,
"loss": 0.5929,
"step": 3915
},
{
"epoch": 0.9436687530091478,
"grad_norm": 1.1015625,
"learning_rate": 0.00018898457883292306,
"loss": 0.5961,
"step": 3920
},
{
"epoch": 0.944872412132884,
"grad_norm": 1.0,
"learning_rate": 0.00018889514877126155,
"loss": 0.5484,
"step": 3925
},
{
"epoch": 0.9460760712566201,
"grad_norm": 1.1328125,
"learning_rate": 0.00018880564337581332,
"loss": 0.5637,
"step": 3930
},
{
"epoch": 0.9472797303803563,
"grad_norm": 1.0546875,
"learning_rate": 0.00018871606278958501,
"loss": 0.564,
"step": 3935
},
{
"epoch": 0.9484833895040924,
"grad_norm": 1.0234375,
"learning_rate": 0.0001886264071557035,
"loss": 0.546,
"step": 3940
},
{
"epoch": 0.9496870486278286,
"grad_norm": 1.140625,
"learning_rate": 0.0001885366766174155,
"loss": 0.5458,
"step": 3945
},
{
"epoch": 0.9508907077515648,
"grad_norm": 1.1015625,
"learning_rate": 0.00018844687131808741,
"loss": 0.5492,
"step": 3950
},
{
"epoch": 0.9520943668753009,
"grad_norm": 1.0078125,
"learning_rate": 0.00018835699140120504,
"loss": 0.5385,
"step": 3955
},
{
"epoch": 0.9532980259990371,
"grad_norm": 1.0390625,
"learning_rate": 0.00018826703701037344,
"loss": 0.5381,
"step": 3960
},
{
"epoch": 0.9545016851227732,
"grad_norm": 1.0390625,
"learning_rate": 0.00018817700828931675,
"loss": 0.5846,
"step": 3965
},
{
"epoch": 0.9557053442465094,
"grad_norm": 1.125,
"learning_rate": 0.0001880869053818777,
"loss": 0.5579,
"step": 3970
},
{
"epoch": 0.9569090033702455,
"grad_norm": 1.0859375,
"learning_rate": 0.0001879967284320177,
"loss": 0.5652,
"step": 3975
},
{
"epoch": 0.9581126624939817,
"grad_norm": 1.21875,
"learning_rate": 0.00018790647758381638,
"loss": 0.5809,
"step": 3980
},
{
"epoch": 0.9593163216177178,
"grad_norm": 1.21875,
"learning_rate": 0.00018781615298147142,
"loss": 0.5934,
"step": 3985
},
{
"epoch": 0.9605199807414541,
"grad_norm": 1.03125,
"learning_rate": 0.00018772575476929846,
"loss": 0.537,
"step": 3990
},
{
"epoch": 0.9617236398651902,
"grad_norm": 1.109375,
"learning_rate": 0.0001876352830917306,
"loss": 0.5878,
"step": 3995
},
{
"epoch": 0.9629272989889264,
"grad_norm": 1.3671875,
"learning_rate": 0.00018754473809331842,
"loss": 0.5488,
"step": 4000
},
{
"epoch": 0.9629272989889264,
"eval_loss": 0.473175972700119,
"eval_runtime": 2.3626,
"eval_samples_per_second": 84.654,
"eval_steps_per_second": 84.654,
"step": 4000
},
{
"epoch": 0.9641309581126625,
"grad_norm": 1.1015625,
"learning_rate": 0.00018745411991872958,
"loss": 0.6129,
"step": 4005
},
{
"epoch": 0.9653346172363987,
"grad_norm": 1.15625,
"learning_rate": 0.00018736342871274872,
"loss": 0.5727,
"step": 4010
},
{
"epoch": 0.9665382763601348,
"grad_norm": 1.140625,
"learning_rate": 0.00018727266462027715,
"loss": 0.5706,
"step": 4015
},
{
"epoch": 0.967741935483871,
"grad_norm": 1.125,
"learning_rate": 0.0001871818277863326,
"loss": 0.5742,
"step": 4020
},
{
"epoch": 0.9689455946076071,
"grad_norm": 1.171875,
"learning_rate": 0.0001870909183560491,
"loss": 0.5577,
"step": 4025
},
{
"epoch": 0.9701492537313433,
"grad_norm": 1.078125,
"learning_rate": 0.00018699993647467656,
"loss": 0.5688,
"step": 4030
},
{
"epoch": 0.9713529128550794,
"grad_norm": 1.0390625,
"learning_rate": 0.00018690888228758068,
"loss": 0.5731,
"step": 4035
},
{
"epoch": 0.9725565719788156,
"grad_norm": 1.1015625,
"learning_rate": 0.00018681775594024276,
"loss": 0.5729,
"step": 4040
},
{
"epoch": 0.9737602311025517,
"grad_norm": 1.0859375,
"learning_rate": 0.0001867265575782593,
"loss": 0.5622,
"step": 4045
},
{
"epoch": 0.9749638902262879,
"grad_norm": 1.078125,
"learning_rate": 0.000186635287347342,
"loss": 0.5817,
"step": 4050
},
{
"epoch": 0.976167549350024,
"grad_norm": 1.0703125,
"learning_rate": 0.00018654394539331719,
"loss": 0.555,
"step": 4055
},
{
"epoch": 0.9773712084737602,
"grad_norm": 1.09375,
"learning_rate": 0.00018645253186212586,
"loss": 0.5637,
"step": 4060
},
{
"epoch": 0.9785748675974963,
"grad_norm": 1.1171875,
"learning_rate": 0.00018636104689982353,
"loss": 0.5477,
"step": 4065
},
{
"epoch": 0.9797785267212326,
"grad_norm": 0.99609375,
"learning_rate": 0.0001862694906525796,
"loss": 0.5537,
"step": 4070
},
{
"epoch": 0.9809821858449687,
"grad_norm": 1.21875,
"learning_rate": 0.0001861778632666776,
"loss": 0.6147,
"step": 4075
},
{
"epoch": 0.9821858449687049,
"grad_norm": 0.96484375,
"learning_rate": 0.00018608616488851454,
"loss": 0.5849,
"step": 4080
},
{
"epoch": 0.983389504092441,
"grad_norm": 1.0,
"learning_rate": 0.00018599439566460084,
"loss": 0.5631,
"step": 4085
},
{
"epoch": 0.9845931632161772,
"grad_norm": 0.9921875,
"learning_rate": 0.00018590255574156032,
"loss": 0.5792,
"step": 4090
},
{
"epoch": 0.9857968223399133,
"grad_norm": 1.109375,
"learning_rate": 0.00018581064526612963,
"loss": 0.5629,
"step": 4095
},
{
"epoch": 0.9870004814636495,
"grad_norm": 1.1875,
"learning_rate": 0.00018571866438515805,
"loss": 0.5797,
"step": 4100
},
{
"epoch": 0.9882041405873857,
"grad_norm": 1.09375,
"learning_rate": 0.00018562661324560752,
"loss": 0.5703,
"step": 4105
},
{
"epoch": 0.9894077997111218,
"grad_norm": 1.125,
"learning_rate": 0.00018553449199455214,
"loss": 0.5649,
"step": 4110
},
{
"epoch": 0.990611458834858,
"grad_norm": 1.109375,
"learning_rate": 0.0001854423007791781,
"loss": 0.556,
"step": 4115
},
{
"epoch": 0.9918151179585941,
"grad_norm": 1.125,
"learning_rate": 0.00018535003974678324,
"loss": 0.5756,
"step": 4120
},
{
"epoch": 0.9930187770823303,
"grad_norm": 1.109375,
"learning_rate": 0.0001852577090447771,
"loss": 0.561,
"step": 4125
},
{
"epoch": 0.9942224362060664,
"grad_norm": 1.0546875,
"learning_rate": 0.00018516530882068053,
"loss": 0.5537,
"step": 4130
},
{
"epoch": 0.9954260953298026,
"grad_norm": 0.98828125,
"learning_rate": 0.00018507283922212524,
"loss": 0.5629,
"step": 4135
},
{
"epoch": 0.9966297544535387,
"grad_norm": 1.1640625,
"learning_rate": 0.0001849803003968541,
"loss": 0.5457,
"step": 4140
},
{
"epoch": 0.997833413577275,
"grad_norm": 1.265625,
"learning_rate": 0.00018488769249272035,
"loss": 0.5801,
"step": 4145
},
{
"epoch": 0.999037072701011,
"grad_norm": 1.265625,
"learning_rate": 0.00018479501565768768,
"loss": 0.5627,
"step": 4150
},
{
"epoch": 0.9997592681752527,
"eval_loss": 0.4714100658893585,
"eval_runtime": 2.3556,
"eval_samples_per_second": 84.904,
"eval_steps_per_second": 84.904,
"step": 4153
},
{
"epoch": 1.0002407318247473,
"grad_norm": 1.1015625,
"learning_rate": 0.00018470227003982996,
"loss": 0.5758,
"step": 4155
},
{
"epoch": 1.0014443909484834,
"grad_norm": 1.09375,
"learning_rate": 0.00018460945578733083,
"loss": 0.5154,
"step": 4160
},
{
"epoch": 1.0026480500722195,
"grad_norm": 0.98828125,
"learning_rate": 0.00018451657304848377,
"loss": 0.5032,
"step": 4165
},
{
"epoch": 1.0038517091959558,
"grad_norm": 0.984375,
"learning_rate": 0.00018442362197169154,
"loss": 0.5071,
"step": 4170
},
{
"epoch": 1.0050553683196919,
"grad_norm": 1.0078125,
"learning_rate": 0.00018433060270546612,
"loss": 0.5021,
"step": 4175
},
{
"epoch": 1.006259027443428,
"grad_norm": 0.99609375,
"learning_rate": 0.00018423751539842846,
"loss": 0.5127,
"step": 4180
},
{
"epoch": 1.007462686567164,
"grad_norm": 1.140625,
"learning_rate": 0.00018414436019930825,
"loss": 0.5438,
"step": 4185
},
{
"epoch": 1.0086663456909004,
"grad_norm": 1.15625,
"learning_rate": 0.00018405113725694357,
"loss": 0.5077,
"step": 4190
},
{
"epoch": 1.0098700048146365,
"grad_norm": 1.046875,
"learning_rate": 0.00018395784672028083,
"loss": 0.5246,
"step": 4195
},
{
"epoch": 1.0110736639383726,
"grad_norm": 1.046875,
"learning_rate": 0.00018386448873837434,
"loss": 0.5296,
"step": 4200
},
{
"epoch": 1.0122773230621087,
"grad_norm": 1.125,
"learning_rate": 0.00018377106346038625,
"loss": 0.5501,
"step": 4205
},
{
"epoch": 1.013480982185845,
"grad_norm": 0.9609375,
"learning_rate": 0.00018367757103558618,
"loss": 0.538,
"step": 4210
},
{
"epoch": 1.0146846413095811,
"grad_norm": 1.0390625,
"learning_rate": 0.0001835840116133511,
"loss": 0.4937,
"step": 4215
},
{
"epoch": 1.0158883004333172,
"grad_norm": 1.125,
"learning_rate": 0.00018349038534316495,
"loss": 0.5249,
"step": 4220
},
{
"epoch": 1.0170919595570533,
"grad_norm": 1.1328125,
"learning_rate": 0.00018339669237461853,
"loss": 0.5116,
"step": 4225
},
{
"epoch": 1.0182956186807897,
"grad_norm": 1.0390625,
"learning_rate": 0.00018330293285740915,
"loss": 0.5282,
"step": 4230
},
{
"epoch": 1.0194992778045258,
"grad_norm": 1.109375,
"learning_rate": 0.00018320910694134054,
"loss": 0.5156,
"step": 4235
},
{
"epoch": 1.0207029369282619,
"grad_norm": 1.0625,
"learning_rate": 0.0001831152147763224,
"loss": 0.5151,
"step": 4240
},
{
"epoch": 1.021906596051998,
"grad_norm": 1.0859375,
"learning_rate": 0.0001830212565123704,
"loss": 0.5333,
"step": 4245
},
{
"epoch": 1.0231102551757343,
"grad_norm": 1.0078125,
"learning_rate": 0.0001829272322996057,
"loss": 0.5131,
"step": 4250
},
{
"epoch": 1.0243139142994704,
"grad_norm": 0.95703125,
"learning_rate": 0.0001828331422882549,
"loss": 0.5325,
"step": 4255
},
{
"epoch": 1.0255175734232065,
"grad_norm": 1.0703125,
"learning_rate": 0.00018273898662864982,
"loss": 0.5368,
"step": 4260
},
{
"epoch": 1.0267212325469428,
"grad_norm": 1.078125,
"learning_rate": 0.0001826447654712269,
"loss": 0.5401,
"step": 4265
},
{
"epoch": 1.027924891670679,
"grad_norm": 1.015625,
"learning_rate": 0.00018255047896652754,
"loss": 0.5176,
"step": 4270
},
{
"epoch": 1.029128550794415,
"grad_norm": 1.0625,
"learning_rate": 0.00018245612726519733,
"loss": 0.5359,
"step": 4275
},
{
"epoch": 1.030332209918151,
"grad_norm": 0.97265625,
"learning_rate": 0.00018236171051798608,
"loss": 0.5106,
"step": 4280
},
{
"epoch": 1.0315358690418874,
"grad_norm": 0.9921875,
"learning_rate": 0.00018226722887574757,
"loss": 0.5135,
"step": 4285
},
{
"epoch": 1.0327395281656235,
"grad_norm": 1.1484375,
"learning_rate": 0.00018217268248943932,
"loss": 0.4892,
"step": 4290
},
{
"epoch": 1.0339431872893596,
"grad_norm": 0.97265625,
"learning_rate": 0.0001820780715101221,
"loss": 0.4732,
"step": 4295
},
{
"epoch": 1.0351468464130957,
"grad_norm": 0.9921875,
"learning_rate": 0.0001819833960889601,
"loss": 0.4807,
"step": 4300
},
{
"epoch": 1.036350505536832,
"grad_norm": 1.203125,
"learning_rate": 0.00018188865637722027,
"loss": 0.527,
"step": 4305
},
{
"epoch": 1.0375541646605682,
"grad_norm": 1.1015625,
"learning_rate": 0.00018179385252627245,
"loss": 0.5214,
"step": 4310
},
{
"epoch": 1.0387578237843043,
"grad_norm": 1.1015625,
"learning_rate": 0.00018169898468758892,
"loss": 0.5155,
"step": 4315
},
{
"epoch": 1.0399614829080404,
"grad_norm": 1.0078125,
"learning_rate": 0.00018160405301274413,
"loss": 0.5111,
"step": 4320
},
{
"epoch": 1.0411651420317767,
"grad_norm": 0.984375,
"learning_rate": 0.00018150905765341454,
"loss": 0.5386,
"step": 4325
},
{
"epoch": 1.0423688011555128,
"grad_norm": 1.03125,
"learning_rate": 0.0001814139987613784,
"loss": 0.5081,
"step": 4330
},
{
"epoch": 1.0435724602792489,
"grad_norm": 1.109375,
"learning_rate": 0.0001813188764885154,
"loss": 0.4958,
"step": 4335
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.9609375,
"learning_rate": 0.00018122369098680667,
"loss": 0.4992,
"step": 4340
},
{
"epoch": 1.0459797785267213,
"grad_norm": 1.078125,
"learning_rate": 0.00018112844240833413,
"loss": 0.4991,
"step": 4345
},
{
"epoch": 1.0471834376504574,
"grad_norm": 1.125,
"learning_rate": 0.00018103313090528066,
"loss": 0.53,
"step": 4350
},
{
"epoch": 1.0483870967741935,
"grad_norm": 1.140625,
"learning_rate": 0.00018093775662992957,
"loss": 0.4915,
"step": 4355
},
{
"epoch": 1.0495907558979296,
"grad_norm": 1.09375,
"learning_rate": 0.00018084231973466449,
"loss": 0.4983,
"step": 4360
},
{
"epoch": 1.050794415021666,
"grad_norm": 1.15625,
"learning_rate": 0.00018074682037196914,
"loss": 0.5029,
"step": 4365
},
{
"epoch": 1.051998074145402,
"grad_norm": 0.99609375,
"learning_rate": 0.00018065125869442703,
"loss": 0.4978,
"step": 4370
},
{
"epoch": 1.0532017332691381,
"grad_norm": 0.94921875,
"learning_rate": 0.00018055563485472122,
"loss": 0.5173,
"step": 4375
},
{
"epoch": 1.0544053923928742,
"grad_norm": 1.0625,
"learning_rate": 0.0001804599490056341,
"loss": 0.4853,
"step": 4380
},
{
"epoch": 1.0556090515166106,
"grad_norm": 1.0859375,
"learning_rate": 0.00018036420130004702,
"loss": 0.5271,
"step": 4385
},
{
"epoch": 1.0568127106403467,
"grad_norm": 1.09375,
"learning_rate": 0.0001802683918909404,
"loss": 0.54,
"step": 4390
},
{
"epoch": 1.0580163697640828,
"grad_norm": 1.09375,
"learning_rate": 0.00018017252093139308,
"loss": 0.5277,
"step": 4395
},
{
"epoch": 1.059220028887819,
"grad_norm": 0.98046875,
"learning_rate": 0.0001800765885745823,
"loss": 0.5134,
"step": 4400
},
{
"epoch": 1.0604236880115552,
"grad_norm": 1.0234375,
"learning_rate": 0.0001799805949737833,
"loss": 0.5182,
"step": 4405
},
{
"epoch": 1.0616273471352913,
"grad_norm": 1.03125,
"learning_rate": 0.00017988454028236927,
"loss": 0.5119,
"step": 4410
},
{
"epoch": 1.0628310062590274,
"grad_norm": 1.046875,
"learning_rate": 0.000179788424653811,
"loss": 0.513,
"step": 4415
},
{
"epoch": 1.0640346653827637,
"grad_norm": 1.0625,
"learning_rate": 0.00017969224824167666,
"loss": 0.5183,
"step": 4420
},
{
"epoch": 1.0652383245064998,
"grad_norm": 1.1640625,
"learning_rate": 0.0001795960111996315,
"loss": 0.5302,
"step": 4425
},
{
"epoch": 1.066441983630236,
"grad_norm": 1.109375,
"learning_rate": 0.00017949971368143755,
"loss": 0.5186,
"step": 4430
},
{
"epoch": 1.067645642753972,
"grad_norm": 1.0859375,
"learning_rate": 0.00017940335584095362,
"loss": 0.5549,
"step": 4435
},
{
"epoch": 1.0688493018777083,
"grad_norm": 1.046875,
"learning_rate": 0.00017930693783213485,
"loss": 0.512,
"step": 4440
},
{
"epoch": 1.0700529610014444,
"grad_norm": 1.046875,
"learning_rate": 0.00017921045980903244,
"loss": 0.5112,
"step": 4445
},
{
"epoch": 1.0712566201251805,
"grad_norm": 0.99609375,
"learning_rate": 0.00017911392192579369,
"loss": 0.5075,
"step": 4450
},
{
"epoch": 1.0724602792489166,
"grad_norm": 1.03125,
"learning_rate": 0.00017901732433666122,
"loss": 0.5167,
"step": 4455
},
{
"epoch": 1.073663938372653,
"grad_norm": 0.95703125,
"learning_rate": 0.00017892066719597325,
"loss": 0.4909,
"step": 4460
},
{
"epoch": 1.074867597496389,
"grad_norm": 1.015625,
"learning_rate": 0.00017882395065816316,
"loss": 0.5075,
"step": 4465
},
{
"epoch": 1.0760712566201251,
"grad_norm": 1.0546875,
"learning_rate": 0.00017872717487775916,
"loss": 0.5174,
"step": 4470
},
{
"epoch": 1.0772749157438612,
"grad_norm": 1.078125,
"learning_rate": 0.00017863034000938416,
"loss": 0.5006,
"step": 4475
},
{
"epoch": 1.0784785748675976,
"grad_norm": 1.0859375,
"learning_rate": 0.00017853344620775546,
"loss": 0.4767,
"step": 4480
},
{
"epoch": 1.0796822339913337,
"grad_norm": 1.0390625,
"learning_rate": 0.00017843649362768446,
"loss": 0.5045,
"step": 4485
},
{
"epoch": 1.0808858931150698,
"grad_norm": 1.0078125,
"learning_rate": 0.00017833948242407656,
"loss": 0.5218,
"step": 4490
},
{
"epoch": 1.0820895522388059,
"grad_norm": 0.98046875,
"learning_rate": 0.00017824241275193084,
"loss": 0.4977,
"step": 4495
},
{
"epoch": 1.0832932113625422,
"grad_norm": 0.94140625,
"learning_rate": 0.00017814528476633966,
"loss": 0.5185,
"step": 4500
},
{
"epoch": 1.0832932113625422,
"eval_loss": 0.4601253271102905,
"eval_runtime": 2.3727,
"eval_samples_per_second": 84.292,
"eval_steps_per_second": 84.292,
"step": 4500
},
{
"epoch": 1.0844968704862783,
"grad_norm": 0.94921875,
"learning_rate": 0.00017804809862248877,
"loss": 0.5012,
"step": 4505
},
{
"epoch": 1.0857005296100144,
"grad_norm": 1.0,
"learning_rate": 0.0001779508544756566,
"loss": 0.5471,
"step": 4510
},
{
"epoch": 1.0869041887337505,
"grad_norm": 1.046875,
"learning_rate": 0.00017785355248121438,
"loss": 0.5089,
"step": 4515
},
{
"epoch": 1.0881078478574868,
"grad_norm": 1.0,
"learning_rate": 0.00017775619279462575,
"loss": 0.5724,
"step": 4520
},
{
"epoch": 1.089311506981223,
"grad_norm": 1.09375,
"learning_rate": 0.0001776587755714466,
"loss": 0.504,
"step": 4525
},
{
"epoch": 1.090515166104959,
"grad_norm": 1.0625,
"learning_rate": 0.00017756130096732465,
"loss": 0.5172,
"step": 4530
},
{
"epoch": 1.0917188252286953,
"grad_norm": 1.1484375,
"learning_rate": 0.0001774637691379993,
"loss": 0.5528,
"step": 4535
},
{
"epoch": 1.0929224843524314,
"grad_norm": 1.1015625,
"learning_rate": 0.0001773661802393014,
"loss": 0.4926,
"step": 4540
},
{
"epoch": 1.0941261434761675,
"grad_norm": 1.03125,
"learning_rate": 0.00017726853442715307,
"loss": 0.5231,
"step": 4545
},
{
"epoch": 1.0953298025999036,
"grad_norm": 1.203125,
"learning_rate": 0.00017717083185756724,
"loss": 0.4928,
"step": 4550
},
{
"epoch": 1.09653346172364,
"grad_norm": 1.046875,
"learning_rate": 0.00017707307268664753,
"loss": 0.5282,
"step": 4555
},
{
"epoch": 1.097737120847376,
"grad_norm": 1.203125,
"learning_rate": 0.00017697525707058813,
"loss": 0.5398,
"step": 4560
},
{
"epoch": 1.0989407799711122,
"grad_norm": 1.0234375,
"learning_rate": 0.00017687738516567323,
"loss": 0.5118,
"step": 4565
},
{
"epoch": 1.1001444390948483,
"grad_norm": 0.96875,
"learning_rate": 0.00017677945712827705,
"loss": 0.5231,
"step": 4570
},
{
"epoch": 1.1013480982185846,
"grad_norm": 1.0390625,
"learning_rate": 0.00017668147311486354,
"loss": 0.4934,
"step": 4575
},
{
"epoch": 1.1025517573423207,
"grad_norm": 1.1015625,
"learning_rate": 0.000176583433281986,
"loss": 0.4904,
"step": 4580
},
{
"epoch": 1.1037554164660568,
"grad_norm": 1.1015625,
"learning_rate": 0.00017648533778628697,
"loss": 0.531,
"step": 4585
},
{
"epoch": 1.104959075589793,
"grad_norm": 1.1015625,
"learning_rate": 0.0001763871867844979,
"loss": 0.5164,
"step": 4590
},
{
"epoch": 1.1061627347135292,
"grad_norm": 1.0078125,
"learning_rate": 0.0001762889804334389,
"loss": 0.4862,
"step": 4595
},
{
"epoch": 1.1073663938372653,
"grad_norm": 1.1015625,
"learning_rate": 0.0001761907188900186,
"loss": 0.516,
"step": 4600
},
{
"epoch": 1.1085700529610014,
"grad_norm": 1.0625,
"learning_rate": 0.00017609240231123368,
"loss": 0.4761,
"step": 4605
},
{
"epoch": 1.1097737120847375,
"grad_norm": 1.015625,
"learning_rate": 0.0001759940308541689,
"loss": 0.5258,
"step": 4610
},
{
"epoch": 1.1109773712084738,
"grad_norm": 1.0625,
"learning_rate": 0.00017589560467599663,
"loss": 0.5182,
"step": 4615
},
{
"epoch": 1.11218103033221,
"grad_norm": 1.109375,
"learning_rate": 0.0001757971239339766,
"loss": 0.4912,
"step": 4620
},
{
"epoch": 1.113384689455946,
"grad_norm": 1.125,
"learning_rate": 0.0001756985887854559,
"loss": 0.5332,
"step": 4625
},
{
"epoch": 1.1145883485796821,
"grad_norm": 1.0234375,
"learning_rate": 0.0001755999993878683,
"loss": 0.5434,
"step": 4630
},
{
"epoch": 1.1157920077034185,
"grad_norm": 1.2265625,
"learning_rate": 0.0001755013558987345,
"loss": 0.5253,
"step": 4635
},
{
"epoch": 1.1169956668271546,
"grad_norm": 1.0625,
"learning_rate": 0.00017540265847566146,
"loss": 0.5071,
"step": 4640
},
{
"epoch": 1.1181993259508907,
"grad_norm": 1.0,
"learning_rate": 0.00017530390727634238,
"loss": 0.5191,
"step": 4645
},
{
"epoch": 1.1194029850746268,
"grad_norm": 1.140625,
"learning_rate": 0.00017520510245855632,
"loss": 0.4931,
"step": 4650
},
{
"epoch": 1.120606644198363,
"grad_norm": 1.0234375,
"learning_rate": 0.0001751062441801681,
"loss": 0.5007,
"step": 4655
},
{
"epoch": 1.1218103033220992,
"grad_norm": 1.0,
"learning_rate": 0.00017500733259912787,
"loss": 0.5007,
"step": 4660
},
{
"epoch": 1.1230139624458353,
"grad_norm": 1.015625,
"learning_rate": 0.00017490836787347104,
"loss": 0.5392,
"step": 4665
},
{
"epoch": 1.1242176215695716,
"grad_norm": 1.03125,
"learning_rate": 0.00017480935016131777,
"loss": 0.4719,
"step": 4670
},
{
"epoch": 1.1254212806933077,
"grad_norm": 0.9609375,
"learning_rate": 0.00017471027962087302,
"loss": 0.5314,
"step": 4675
},
{
"epoch": 1.1266249398170438,
"grad_norm": 1.0546875,
"learning_rate": 0.0001746111564104262,
"loss": 0.5051,
"step": 4680
},
{
"epoch": 1.12782859894078,
"grad_norm": 1.03125,
"learning_rate": 0.00017451198068835067,
"loss": 0.4988,
"step": 4685
},
{
"epoch": 1.129032258064516,
"grad_norm": 1.046875,
"learning_rate": 0.0001744127526131039,
"loss": 0.5184,
"step": 4690
},
{
"epoch": 1.1302359171882523,
"grad_norm": 1.03125,
"learning_rate": 0.0001743134723432269,
"loss": 0.5001,
"step": 4695
},
{
"epoch": 1.1314395763119884,
"grad_norm": 0.9609375,
"learning_rate": 0.000174214140037344,
"loss": 0.508,
"step": 4700
},
{
"epoch": 1.1326432354357245,
"grad_norm": 1.03125,
"learning_rate": 0.0001741147558541629,
"loss": 0.4887,
"step": 4705
},
{
"epoch": 1.1338468945594609,
"grad_norm": 1.109375,
"learning_rate": 0.00017401531995247393,
"loss": 0.5084,
"step": 4710
},
{
"epoch": 1.135050553683197,
"grad_norm": 1.1640625,
"learning_rate": 0.00017391583249115025,
"loss": 0.5178,
"step": 4715
},
{
"epoch": 1.136254212806933,
"grad_norm": 1.015625,
"learning_rate": 0.0001738162936291473,
"loss": 0.5362,
"step": 4720
},
{
"epoch": 1.1374578719306692,
"grad_norm": 1.078125,
"learning_rate": 0.0001737167035255026,
"loss": 0.5131,
"step": 4725
},
{
"epoch": 1.1386615310544055,
"grad_norm": 1.1953125,
"learning_rate": 0.0001736170623393357,
"loss": 0.5142,
"step": 4730
},
{
"epoch": 1.1398651901781416,
"grad_norm": 0.98828125,
"learning_rate": 0.0001735173702298476,
"loss": 0.5335,
"step": 4735
},
{
"epoch": 1.1410688493018777,
"grad_norm": 1.0,
"learning_rate": 0.0001734176273563208,
"loss": 0.4964,
"step": 4740
},
{
"epoch": 1.1422725084256138,
"grad_norm": 0.99609375,
"learning_rate": 0.0001733178338781188,
"loss": 0.5262,
"step": 4745
},
{
"epoch": 1.14347616754935,
"grad_norm": 0.96875,
"learning_rate": 0.000173217989954686,
"loss": 0.5173,
"step": 4750
},
{
"epoch": 1.1446798266730862,
"grad_norm": 1.125,
"learning_rate": 0.0001731180957455474,
"loss": 0.4965,
"step": 4755
},
{
"epoch": 1.1458834857968223,
"grad_norm": 1.0234375,
"learning_rate": 0.00017301815141030833,
"loss": 0.5187,
"step": 4760
},
{
"epoch": 1.1470871449205584,
"grad_norm": 1.078125,
"learning_rate": 0.0001729181571086542,
"loss": 0.4978,
"step": 4765
},
{
"epoch": 1.1482908040442947,
"grad_norm": 1.09375,
"learning_rate": 0.00017281811300035033,
"loss": 0.5058,
"step": 4770
},
{
"epoch": 1.1494944631680308,
"grad_norm": 1.0859375,
"learning_rate": 0.00017271801924524153,
"loss": 0.4835,
"step": 4775
},
{
"epoch": 1.150698122291767,
"grad_norm": 1.1171875,
"learning_rate": 0.00017261787600325192,
"loss": 0.5231,
"step": 4780
},
{
"epoch": 1.151901781415503,
"grad_norm": 1.2109375,
"learning_rate": 0.00017251768343438478,
"loss": 0.5286,
"step": 4785
},
{
"epoch": 1.1531054405392394,
"grad_norm": 0.96875,
"learning_rate": 0.00017241744169872213,
"loss": 0.5147,
"step": 4790
},
{
"epoch": 1.1543090996629755,
"grad_norm": 1.0234375,
"learning_rate": 0.00017231715095642456,
"loss": 0.5082,
"step": 4795
},
{
"epoch": 1.1555127587867116,
"grad_norm": 1.015625,
"learning_rate": 0.00017221681136773102,
"loss": 0.4931,
"step": 4800
},
{
"epoch": 1.1567164179104479,
"grad_norm": 0.96875,
"learning_rate": 0.00017211642309295836,
"loss": 0.5091,
"step": 4805
},
{
"epoch": 1.157920077034184,
"grad_norm": 1.0,
"learning_rate": 0.00017201598629250136,
"loss": 0.5141,
"step": 4810
},
{
"epoch": 1.15912373615792,
"grad_norm": 1.078125,
"learning_rate": 0.00017191550112683227,
"loss": 0.5256,
"step": 4815
},
{
"epoch": 1.1603273952816562,
"grad_norm": 1.1015625,
"learning_rate": 0.00017181496775650065,
"loss": 0.5224,
"step": 4820
},
{
"epoch": 1.1615310544053923,
"grad_norm": 1.0390625,
"learning_rate": 0.000171714386342133,
"loss": 0.5274,
"step": 4825
},
{
"epoch": 1.1627347135291286,
"grad_norm": 1.015625,
"learning_rate": 0.00017161375704443271,
"loss": 0.4672,
"step": 4830
},
{
"epoch": 1.1639383726528647,
"grad_norm": 1.140625,
"learning_rate": 0.0001715130800241795,
"loss": 0.4977,
"step": 4835
},
{
"epoch": 1.1651420317766008,
"grad_norm": 1.03125,
"learning_rate": 0.00017141235544222957,
"loss": 0.5154,
"step": 4840
},
{
"epoch": 1.1663456909003371,
"grad_norm": 1.046875,
"learning_rate": 0.0001713115834595149,
"loss": 0.505,
"step": 4845
},
{
"epoch": 1.1675493500240732,
"grad_norm": 1.015625,
"learning_rate": 0.00017121076423704326,
"loss": 0.5299,
"step": 4850
},
{
"epoch": 1.1687530091478093,
"grad_norm": 0.98828125,
"learning_rate": 0.000171109897935898,
"loss": 0.4598,
"step": 4855
},
{
"epoch": 1.1699566682715454,
"grad_norm": 1.125,
"learning_rate": 0.00017100898471723755,
"loss": 0.5144,
"step": 4860
},
{
"epoch": 1.1711603273952818,
"grad_norm": 1.0546875,
"learning_rate": 0.00017090802474229537,
"loss": 0.4919,
"step": 4865
},
{
"epoch": 1.1723639865190179,
"grad_norm": 1.0625,
"learning_rate": 0.00017080701817237962,
"loss": 0.4874,
"step": 4870
},
{
"epoch": 1.173567645642754,
"grad_norm": 1.1328125,
"learning_rate": 0.00017070596516887296,
"loss": 0.53,
"step": 4875
},
{
"epoch": 1.17477130476649,
"grad_norm": 1.0703125,
"learning_rate": 0.00017060486589323212,
"loss": 0.5203,
"step": 4880
},
{
"epoch": 1.1759749638902264,
"grad_norm": 0.96484375,
"learning_rate": 0.00017050372050698786,
"loss": 0.5252,
"step": 4885
},
{
"epoch": 1.1771786230139625,
"grad_norm": 1.078125,
"learning_rate": 0.00017040252917174456,
"loss": 0.4889,
"step": 4890
},
{
"epoch": 1.1783822821376986,
"grad_norm": 1.0234375,
"learning_rate": 0.00017030129204918004,
"loss": 0.4896,
"step": 4895
},
{
"epoch": 1.1795859412614347,
"grad_norm": 1.0625,
"learning_rate": 0.00017020000930104528,
"loss": 0.4754,
"step": 4900
},
{
"epoch": 1.180789600385171,
"grad_norm": 0.94921875,
"learning_rate": 0.00017009868108916408,
"loss": 0.4965,
"step": 4905
},
{
"epoch": 1.181993259508907,
"grad_norm": 1.078125,
"learning_rate": 0.00016999730757543308,
"loss": 0.5535,
"step": 4910
},
{
"epoch": 1.1831969186326432,
"grad_norm": 1.0546875,
"learning_rate": 0.00016989588892182107,
"loss": 0.4808,
"step": 4915
},
{
"epoch": 1.1844005777563793,
"grad_norm": 1.0546875,
"learning_rate": 0.00016979442529036905,
"loss": 0.5116,
"step": 4920
},
{
"epoch": 1.1856042368801156,
"grad_norm": 1.0390625,
"learning_rate": 0.00016969291684318995,
"loss": 0.4961,
"step": 4925
},
{
"epoch": 1.1868078960038517,
"grad_norm": 1.0546875,
"learning_rate": 0.00016959136374246822,
"loss": 0.5296,
"step": 4930
},
{
"epoch": 1.1880115551275878,
"grad_norm": 1.0390625,
"learning_rate": 0.00016948976615045966,
"loss": 0.5195,
"step": 4935
},
{
"epoch": 1.1892152142513241,
"grad_norm": 0.9453125,
"learning_rate": 0.00016938812422949122,
"loss": 0.4838,
"step": 4940
},
{
"epoch": 1.1904188733750602,
"grad_norm": 1.0859375,
"learning_rate": 0.0001692864381419606,
"loss": 0.5207,
"step": 4945
},
{
"epoch": 1.1916225324987963,
"grad_norm": 1.0078125,
"learning_rate": 0.00016918470805033615,
"loss": 0.5011,
"step": 4950
},
{
"epoch": 1.1928261916225325,
"grad_norm": 1.015625,
"learning_rate": 0.0001690829341171564,
"loss": 0.4778,
"step": 4955
},
{
"epoch": 1.1940298507462686,
"grad_norm": 1.0703125,
"learning_rate": 0.00016898111650503006,
"loss": 0.5275,
"step": 4960
},
{
"epoch": 1.1952335098700049,
"grad_norm": 1.109375,
"learning_rate": 0.00016887925537663556,
"loss": 0.4944,
"step": 4965
},
{
"epoch": 1.196437168993741,
"grad_norm": 0.99609375,
"learning_rate": 0.00016877735089472089,
"loss": 0.4771,
"step": 4970
},
{
"epoch": 1.197640828117477,
"grad_norm": 1.1015625,
"learning_rate": 0.00016867540322210322,
"loss": 0.5073,
"step": 4975
},
{
"epoch": 1.1988444872412134,
"grad_norm": 1.0546875,
"learning_rate": 0.00016857341252166886,
"loss": 0.494,
"step": 4980
},
{
"epoch": 1.2000481463649495,
"grad_norm": 1.1015625,
"learning_rate": 0.0001684713789563728,
"loss": 0.5127,
"step": 4985
},
{
"epoch": 1.2012518054886856,
"grad_norm": 1.0546875,
"learning_rate": 0.0001683693026892385,
"loss": 0.4949,
"step": 4990
},
{
"epoch": 1.2024554646124217,
"grad_norm": 1.125,
"learning_rate": 0.00016826718388335767,
"loss": 0.5115,
"step": 4995
},
{
"epoch": 1.2036591237361578,
"grad_norm": 1.1875,
"learning_rate": 0.00016816502270189002,
"loss": 0.5322,
"step": 5000
},
{
"epoch": 1.2036591237361578,
"eval_loss": 0.45123574137687683,
"eval_runtime": 2.3732,
"eval_samples_per_second": 84.276,
"eval_steps_per_second": 84.276,
"step": 5000
},
{
"epoch": 1.2048627828598941,
"grad_norm": 1.0234375,
"learning_rate": 0.00016806281930806287,
"loss": 0.4787,
"step": 5005
},
{
"epoch": 1.2060664419836302,
"grad_norm": 1.0390625,
"learning_rate": 0.0001679605738651711,
"loss": 0.4762,
"step": 5010
},
{
"epoch": 1.2072701011073663,
"grad_norm": 0.9765625,
"learning_rate": 0.00016785828653657667,
"loss": 0.5031,
"step": 5015
},
{
"epoch": 1.2084737602311026,
"grad_norm": 0.953125,
"learning_rate": 0.00016775595748570854,
"loss": 0.5028,
"step": 5020
},
{
"epoch": 1.2096774193548387,
"grad_norm": 0.94921875,
"learning_rate": 0.0001676535868760623,
"loss": 0.5268,
"step": 5025
},
{
"epoch": 1.2108810784785748,
"grad_norm": 1.0546875,
"learning_rate": 0.00016755117487119987,
"loss": 0.5078,
"step": 5030
},
{
"epoch": 1.212084737602311,
"grad_norm": 1.0078125,
"learning_rate": 0.0001674487216347495,
"loss": 0.5108,
"step": 5035
},
{
"epoch": 1.2132883967260473,
"grad_norm": 1.0546875,
"learning_rate": 0.00016734622733040514,
"loss": 0.5243,
"step": 5040
},
{
"epoch": 1.2144920558497834,
"grad_norm": 1.0234375,
"learning_rate": 0.00016724369212192637,
"loss": 0.5167,
"step": 5045
},
{
"epoch": 1.2156957149735195,
"grad_norm": 1.015625,
"learning_rate": 0.0001671411161731382,
"loss": 0.5034,
"step": 5050
},
{
"epoch": 1.2168993740972556,
"grad_norm": 1.03125,
"learning_rate": 0.00016703849964793077,
"loss": 0.5159,
"step": 5055
},
{
"epoch": 1.218103033220992,
"grad_norm": 0.9375,
"learning_rate": 0.00016693584271025892,
"loss": 0.4952,
"step": 5060
},
{
"epoch": 1.219306692344728,
"grad_norm": 0.984375,
"learning_rate": 0.00016683314552414207,
"loss": 0.4768,
"step": 5065
},
{
"epoch": 1.220510351468464,
"grad_norm": 1.0703125,
"learning_rate": 0.0001667304082536641,
"loss": 0.4991,
"step": 5070
},
{
"epoch": 1.2217140105922002,
"grad_norm": 0.94140625,
"learning_rate": 0.00016662763106297274,
"loss": 0.4968,
"step": 5075
},
{
"epoch": 1.2229176697159365,
"grad_norm": 1.046875,
"learning_rate": 0.00016652481411627966,
"loss": 0.5338,
"step": 5080
},
{
"epoch": 1.2241213288396726,
"grad_norm": 0.90234375,
"learning_rate": 0.00016642195757785995,
"loss": 0.4641,
"step": 5085
},
{
"epoch": 1.2253249879634087,
"grad_norm": 1.0390625,
"learning_rate": 0.000166319061612052,
"loss": 0.5103,
"step": 5090
},
{
"epoch": 1.2265286470871448,
"grad_norm": 1.0234375,
"learning_rate": 0.00016621612638325717,
"loss": 0.4952,
"step": 5095
},
{
"epoch": 1.2277323062108811,
"grad_norm": 0.97265625,
"learning_rate": 0.00016611315205593958,
"loss": 0.5161,
"step": 5100
},
{
"epoch": 1.2289359653346172,
"grad_norm": 1.0703125,
"learning_rate": 0.00016601013879462585,
"loss": 0.4879,
"step": 5105
},
{
"epoch": 1.2301396244583533,
"grad_norm": 1.046875,
"learning_rate": 0.0001659070867639047,
"loss": 0.4961,
"step": 5110
},
{
"epoch": 1.2313432835820897,
"grad_norm": 0.97265625,
"learning_rate": 0.00016580399612842685,
"loss": 0.5181,
"step": 5115
},
{
"epoch": 1.2325469427058258,
"grad_norm": 0.98828125,
"learning_rate": 0.00016570086705290478,
"loss": 0.4983,
"step": 5120
},
{
"epoch": 1.2337506018295619,
"grad_norm": 1.0703125,
"learning_rate": 0.00016559769970211224,
"loss": 0.4885,
"step": 5125
},
{
"epoch": 1.234954260953298,
"grad_norm": 1.1484375,
"learning_rate": 0.00016549449424088425,
"loss": 0.4803,
"step": 5130
},
{
"epoch": 1.236157920077034,
"grad_norm": 0.9375,
"learning_rate": 0.00016539125083411672,
"loss": 0.4483,
"step": 5135
},
{
"epoch": 1.2373615792007704,
"grad_norm": 0.9921875,
"learning_rate": 0.00016528796964676606,
"loss": 0.5276,
"step": 5140
},
{
"epoch": 1.2385652383245065,
"grad_norm": 0.97265625,
"learning_rate": 0.00016518465084384916,
"loss": 0.5143,
"step": 5145
},
{
"epoch": 1.2397688974482426,
"grad_norm": 1.0078125,
"learning_rate": 0.00016508129459044302,
"loss": 0.4913,
"step": 5150
},
{
"epoch": 1.240972556571979,
"grad_norm": 1.046875,
"learning_rate": 0.0001649779010516844,
"loss": 0.4699,
"step": 5155
},
{
"epoch": 1.242176215695715,
"grad_norm": 1.0390625,
"learning_rate": 0.00016487447039276968,
"loss": 0.5147,
"step": 5160
},
{
"epoch": 1.2433798748194511,
"grad_norm": 1.140625,
"learning_rate": 0.00016477100277895456,
"loss": 0.4911,
"step": 5165
},
{
"epoch": 1.2445835339431872,
"grad_norm": 1.0,
"learning_rate": 0.0001646674983755537,
"loss": 0.5115,
"step": 5170
},
{
"epoch": 1.2457871930669235,
"grad_norm": 1.1171875,
"learning_rate": 0.00016456395734794064,
"loss": 0.4906,
"step": 5175
},
{
"epoch": 1.2469908521906596,
"grad_norm": 1.03125,
"learning_rate": 0.00016446037986154744,
"loss": 0.5136,
"step": 5180
},
{
"epoch": 1.2481945113143957,
"grad_norm": 1.0234375,
"learning_rate": 0.00016435676608186434,
"loss": 0.4902,
"step": 5185
},
{
"epoch": 1.2493981704381318,
"grad_norm": 1.0234375,
"learning_rate": 0.0001642531161744396,
"loss": 0.4867,
"step": 5190
},
{
"epoch": 1.2506018295618682,
"grad_norm": 1.0078125,
"learning_rate": 0.00016414943030487915,
"loss": 0.4988,
"step": 5195
},
{
"epoch": 1.2518054886856043,
"grad_norm": 0.921875,
"learning_rate": 0.00016404570863884647,
"loss": 0.5086,
"step": 5200
},
{
"epoch": 1.2530091478093404,
"grad_norm": 1.0234375,
"learning_rate": 0.0001639419513420622,
"loss": 0.4979,
"step": 5205
},
{
"epoch": 1.2542128069330767,
"grad_norm": 1.0390625,
"learning_rate": 0.00016383815858030392,
"loss": 0.4958,
"step": 5210
},
{
"epoch": 1.2554164660568128,
"grad_norm": 1.03125,
"learning_rate": 0.00016373433051940583,
"loss": 0.4773,
"step": 5215
},
{
"epoch": 1.2566201251805489,
"grad_norm": 1.140625,
"learning_rate": 0.00016363046732525855,
"loss": 0.5022,
"step": 5220
},
{
"epoch": 1.257823784304285,
"grad_norm": 0.99609375,
"learning_rate": 0.00016352656916380885,
"loss": 0.4968,
"step": 5225
},
{
"epoch": 1.259027443428021,
"grad_norm": 1.015625,
"learning_rate": 0.00016342263620105937,
"loss": 0.49,
"step": 5230
},
{
"epoch": 1.2602311025517574,
"grad_norm": 1.03125,
"learning_rate": 0.00016331866860306828,
"loss": 0.5107,
"step": 5235
},
{
"epoch": 1.2614347616754935,
"grad_norm": 1.0546875,
"learning_rate": 0.00016321466653594925,
"loss": 0.5059,
"step": 5240
},
{
"epoch": 1.2626384207992296,
"grad_norm": 1.0,
"learning_rate": 0.00016311063016587082,
"loss": 0.492,
"step": 5245
},
{
"epoch": 1.263842079922966,
"grad_norm": 1.078125,
"learning_rate": 0.0001630065596590565,
"loss": 0.5178,
"step": 5250
},
{
"epoch": 1.265045739046702,
"grad_norm": 0.99609375,
"learning_rate": 0.00016290245518178428,
"loss": 0.4905,
"step": 5255
},
{
"epoch": 1.2662493981704381,
"grad_norm": 1.1171875,
"learning_rate": 0.00016279831690038643,
"loss": 0.5072,
"step": 5260
},
{
"epoch": 1.2674530572941742,
"grad_norm": 1.0625,
"learning_rate": 0.0001626941449812492,
"loss": 0.5378,
"step": 5265
},
{
"epoch": 1.2686567164179103,
"grad_norm": 1.0078125,
"learning_rate": 0.0001625899395908126,
"loss": 0.5014,
"step": 5270
},
{
"epoch": 1.2698603755416467,
"grad_norm": 1.03125,
"learning_rate": 0.00016248570089557018,
"loss": 0.4826,
"step": 5275
},
{
"epoch": 1.2710640346653828,
"grad_norm": 1.109375,
"learning_rate": 0.0001623814290620686,
"loss": 0.516,
"step": 5280
},
{
"epoch": 1.2722676937891189,
"grad_norm": 0.984375,
"learning_rate": 0.00016227712425690758,
"loss": 0.518,
"step": 5285
},
{
"epoch": 1.2734713529128552,
"grad_norm": 1.015625,
"learning_rate": 0.00016217278664673945,
"loss": 0.4844,
"step": 5290
},
{
"epoch": 1.2746750120365913,
"grad_norm": 0.9375,
"learning_rate": 0.0001620684163982689,
"loss": 0.4966,
"step": 5295
},
{
"epoch": 1.2758786711603274,
"grad_norm": 0.953125,
"learning_rate": 0.00016196401367825285,
"loss": 0.5107,
"step": 5300
},
{
"epoch": 1.2770823302840635,
"grad_norm": 0.97265625,
"learning_rate": 0.00016185957865350015,
"loss": 0.4942,
"step": 5305
},
{
"epoch": 1.2782859894077996,
"grad_norm": 1.125,
"learning_rate": 0.00016175511149087114,
"loss": 0.5508,
"step": 5310
},
{
"epoch": 1.279489648531536,
"grad_norm": 1.0390625,
"learning_rate": 0.00016165061235727757,
"loss": 0.4882,
"step": 5315
},
{
"epoch": 1.280693307655272,
"grad_norm": 1.0703125,
"learning_rate": 0.0001615460814196823,
"loss": 0.4969,
"step": 5320
},
{
"epoch": 1.281896966779008,
"grad_norm": 1.078125,
"learning_rate": 0.0001614415188450989,
"loss": 0.4946,
"step": 5325
},
{
"epoch": 1.2831006259027444,
"grad_norm": 1.015625,
"learning_rate": 0.00016133692480059167,
"loss": 0.512,
"step": 5330
},
{
"epoch": 1.2843042850264805,
"grad_norm": 1.03125,
"learning_rate": 0.00016123229945327498,
"loss": 0.5171,
"step": 5335
},
{
"epoch": 1.2855079441502166,
"grad_norm": 1.1796875,
"learning_rate": 0.00016112764297031333,
"loss": 0.5174,
"step": 5340
},
{
"epoch": 1.286711603273953,
"grad_norm": 1.078125,
"learning_rate": 0.00016102295551892102,
"loss": 0.4815,
"step": 5345
},
{
"epoch": 1.287915262397689,
"grad_norm": 1.03125,
"learning_rate": 0.0001609182372663617,
"loss": 0.4913,
"step": 5350
},
{
"epoch": 1.2891189215214252,
"grad_norm": 1.0390625,
"learning_rate": 0.00016081348837994827,
"loss": 0.5244,
"step": 5355
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.1015625,
"learning_rate": 0.00016070870902704266,
"loss": 0.5241,
"step": 5360
},
{
"epoch": 1.2915262397688974,
"grad_norm": 0.95703125,
"learning_rate": 0.00016060389937505538,
"loss": 0.4976,
"step": 5365
},
{
"epoch": 1.2927298988926337,
"grad_norm": 0.8515625,
"learning_rate": 0.0001604990595914454,
"loss": 0.465,
"step": 5370
},
{
"epoch": 1.2939335580163698,
"grad_norm": 1.0390625,
"learning_rate": 0.0001603941898437198,
"loss": 0.4837,
"step": 5375
},
{
"epoch": 1.2951372171401059,
"grad_norm": 0.9765625,
"learning_rate": 0.00016028929029943356,
"loss": 0.5047,
"step": 5380
},
{
"epoch": 1.2963408762638422,
"grad_norm": 0.984375,
"learning_rate": 0.0001601843611261893,
"loss": 0.5246,
"step": 5385
},
{
"epoch": 1.2975445353875783,
"grad_norm": 1.0,
"learning_rate": 0.00016007940249163687,
"loss": 0.5101,
"step": 5390
},
{
"epoch": 1.2987481945113144,
"grad_norm": 1.078125,
"learning_rate": 0.00015997441456347332,
"loss": 0.5028,
"step": 5395
},
{
"epoch": 1.2999518536350505,
"grad_norm": 1.0078125,
"learning_rate": 0.0001598693975094424,
"loss": 0.4503,
"step": 5400
},
{
"epoch": 1.3011555127587866,
"grad_norm": 0.9609375,
"learning_rate": 0.00015976435149733445,
"loss": 0.5075,
"step": 5405
},
{
"epoch": 1.302359171882523,
"grad_norm": 0.98046875,
"learning_rate": 0.00015965927669498616,
"loss": 0.5204,
"step": 5410
},
{
"epoch": 1.303562831006259,
"grad_norm": 0.9609375,
"learning_rate": 0.00015955417327027997,
"loss": 0.4779,
"step": 5415
},
{
"epoch": 1.3047664901299951,
"grad_norm": 1.0078125,
"learning_rate": 0.00015944904139114435,
"loss": 0.4874,
"step": 5420
},
{
"epoch": 1.3059701492537314,
"grad_norm": 1.0625,
"learning_rate": 0.000159343881225553,
"loss": 0.471,
"step": 5425
},
{
"epoch": 1.3071738083774675,
"grad_norm": 1.0078125,
"learning_rate": 0.00015923869294152493,
"loss": 0.4856,
"step": 5430
},
{
"epoch": 1.3083774675012037,
"grad_norm": 1.0,
"learning_rate": 0.0001591334767071241,
"loss": 0.4937,
"step": 5435
},
{
"epoch": 1.3095811266249398,
"grad_norm": 1.03125,
"learning_rate": 0.000159028232690459,
"loss": 0.5043,
"step": 5440
},
{
"epoch": 1.3107847857486759,
"grad_norm": 1.03125,
"learning_rate": 0.00015892296105968266,
"loss": 0.5179,
"step": 5445
},
{
"epoch": 1.3119884448724122,
"grad_norm": 1.0234375,
"learning_rate": 0.00015881766198299209,
"loss": 0.4825,
"step": 5450
},
{
"epoch": 1.3131921039961483,
"grad_norm": 0.9921875,
"learning_rate": 0.00015871233562862828,
"loss": 0.4928,
"step": 5455
},
{
"epoch": 1.3143957631198844,
"grad_norm": 0.98046875,
"learning_rate": 0.0001586069821648757,
"loss": 0.4961,
"step": 5460
},
{
"epoch": 1.3155994222436207,
"grad_norm": 1.0234375,
"learning_rate": 0.00015850160176006226,
"loss": 0.4984,
"step": 5465
},
{
"epoch": 1.3168030813673568,
"grad_norm": 1.0625,
"learning_rate": 0.00015839619458255874,
"loss": 0.4624,
"step": 5470
},
{
"epoch": 1.318006740491093,
"grad_norm": 0.94140625,
"learning_rate": 0.00015829076080077883,
"loss": 0.4883,
"step": 5475
},
{
"epoch": 1.319210399614829,
"grad_norm": 0.94140625,
"learning_rate": 0.00015818530058317869,
"loss": 0.4877,
"step": 5480
},
{
"epoch": 1.320414058738565,
"grad_norm": 1.0703125,
"learning_rate": 0.00015807981409825665,
"loss": 0.4761,
"step": 5485
},
{
"epoch": 1.3216177178623014,
"grad_norm": 0.97265625,
"learning_rate": 0.00015797430151455317,
"loss": 0.5066,
"step": 5490
},
{
"epoch": 1.3228213769860375,
"grad_norm": 0.96875,
"learning_rate": 0.00015786876300065024,
"loss": 0.4823,
"step": 5495
},
{
"epoch": 1.3240250361097736,
"grad_norm": 0.99609375,
"learning_rate": 0.00015776319872517136,
"loss": 0.4982,
"step": 5500
},
{
"epoch": 1.3240250361097736,
"eval_loss": 0.43385958671569824,
"eval_runtime": 2.3671,
"eval_samples_per_second": 84.49,
"eval_steps_per_second": 84.49,
"step": 5500
},
{
"epoch": 1.32522869523351,
"grad_norm": 1.1015625,
"learning_rate": 0.00015765760885678122,
"loss": 0.5061,
"step": 5505
},
{
"epoch": 1.326432354357246,
"grad_norm": 0.984375,
"learning_rate": 0.0001575519935641853,
"loss": 0.5144,
"step": 5510
},
{
"epoch": 1.3276360134809821,
"grad_norm": 1.015625,
"learning_rate": 0.00015744635301612983,
"loss": 0.4655,
"step": 5515
},
{
"epoch": 1.3288396726047185,
"grad_norm": 1.0859375,
"learning_rate": 0.0001573406873814013,
"loss": 0.4866,
"step": 5520
},
{
"epoch": 1.3300433317284546,
"grad_norm": 1.0703125,
"learning_rate": 0.00015723499682882626,
"loss": 0.501,
"step": 5525
},
{
"epoch": 1.3312469908521907,
"grad_norm": 0.9609375,
"learning_rate": 0.0001571292815272712,
"loss": 0.476,
"step": 5530
},
{
"epoch": 1.3324506499759268,
"grad_norm": 1.0859375,
"learning_rate": 0.00015702354164564197,
"loss": 0.4883,
"step": 5535
},
{
"epoch": 1.3336543090996629,
"grad_norm": 1.0078125,
"learning_rate": 0.00015691777735288387,
"loss": 0.4761,
"step": 5540
},
{
"epoch": 1.3348579682233992,
"grad_norm": 0.890625,
"learning_rate": 0.00015681198881798116,
"loss": 0.5028,
"step": 5545
},
{
"epoch": 1.3360616273471353,
"grad_norm": 1.046875,
"learning_rate": 0.0001567061762099567,
"loss": 0.5045,
"step": 5550
},
{
"epoch": 1.3372652864708714,
"grad_norm": 1.09375,
"learning_rate": 0.00015660033969787198,
"loss": 0.4935,
"step": 5555
},
{
"epoch": 1.3384689455946077,
"grad_norm": 1.109375,
"learning_rate": 0.00015649447945082656,
"loss": 0.5069,
"step": 5560
},
{
"epoch": 1.3396726047183438,
"grad_norm": 1.109375,
"learning_rate": 0.00015638859563795804,
"loss": 0.5222,
"step": 5565
},
{
"epoch": 1.34087626384208,
"grad_norm": 0.98828125,
"learning_rate": 0.0001562826884284416,
"loss": 0.4737,
"step": 5570
},
{
"epoch": 1.342079922965816,
"grad_norm": 1.0078125,
"learning_rate": 0.0001561767579914898,
"loss": 0.5146,
"step": 5575
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.94921875,
"learning_rate": 0.0001560708044963523,
"loss": 0.4892,
"step": 5580
},
{
"epoch": 1.3444872412132884,
"grad_norm": 0.94921875,
"learning_rate": 0.00015596482811231565,
"loss": 0.4682,
"step": 5585
},
{
"epoch": 1.3456909003370245,
"grad_norm": 1.0546875,
"learning_rate": 0.0001558588290087029,
"loss": 0.48,
"step": 5590
},
{
"epoch": 1.3468945594607606,
"grad_norm": 1.0390625,
"learning_rate": 0.0001557528073548735,
"loss": 0.5319,
"step": 5595
},
{
"epoch": 1.348098218584497,
"grad_norm": 1.015625,
"learning_rate": 0.00015564676332022287,
"loss": 0.5075,
"step": 5600
},
{
"epoch": 1.349301877708233,
"grad_norm": 1.03125,
"learning_rate": 0.00015554069707418217,
"loss": 0.472,
"step": 5605
},
{
"epoch": 1.3505055368319692,
"grad_norm": 1.09375,
"learning_rate": 0.00015543460878621805,
"loss": 0.4913,
"step": 5610
},
{
"epoch": 1.3517091959557053,
"grad_norm": 1.015625,
"learning_rate": 0.00015532849862583245,
"loss": 0.4841,
"step": 5615
},
{
"epoch": 1.3529128550794414,
"grad_norm": 1.03125,
"learning_rate": 0.00015522236676256216,
"loss": 0.4816,
"step": 5620
},
{
"epoch": 1.3541165142031777,
"grad_norm": 1.0546875,
"learning_rate": 0.00015511621336597876,
"loss": 0.5104,
"step": 5625
},
{
"epoch": 1.3553201733269138,
"grad_norm": 1.0234375,
"learning_rate": 0.00015501003860568809,
"loss": 0.4823,
"step": 5630
},
{
"epoch": 1.35652383245065,
"grad_norm": 1.0234375,
"learning_rate": 0.00015490384265133021,
"loss": 0.4823,
"step": 5635
},
{
"epoch": 1.3577274915743862,
"grad_norm": 1.03125,
"learning_rate": 0.00015479762567257904,
"loss": 0.4985,
"step": 5640
},
{
"epoch": 1.3589311506981223,
"grad_norm": 1.0546875,
"learning_rate": 0.00015469138783914208,
"loss": 0.5233,
"step": 5645
},
{
"epoch": 1.3601348098218584,
"grad_norm": 1.0234375,
"learning_rate": 0.0001545851293207602,
"loss": 0.5209,
"step": 5650
},
{
"epoch": 1.3613384689455947,
"grad_norm": 1.0625,
"learning_rate": 0.0001544788502872072,
"loss": 0.489,
"step": 5655
},
{
"epoch": 1.3625421280693308,
"grad_norm": 0.953125,
"learning_rate": 0.00015437255090828983,
"loss": 0.4907,
"step": 5660
},
{
"epoch": 1.363745787193067,
"grad_norm": 1.0390625,
"learning_rate": 0.0001542662313538471,
"loss": 0.4894,
"step": 5665
},
{
"epoch": 1.364949446316803,
"grad_norm": 1.0703125,
"learning_rate": 0.0001541598917937505,
"loss": 0.4831,
"step": 5670
},
{
"epoch": 1.3661531054405391,
"grad_norm": 1.046875,
"learning_rate": 0.00015405353239790343,
"loss": 0.49,
"step": 5675
},
{
"epoch": 1.3673567645642755,
"grad_norm": 0.921875,
"learning_rate": 0.00015394715333624088,
"loss": 0.4663,
"step": 5680
},
{
"epoch": 1.3685604236880116,
"grad_norm": 0.96484375,
"learning_rate": 0.00015384075477872927,
"loss": 0.5082,
"step": 5685
},
{
"epoch": 1.3697640828117477,
"grad_norm": 0.99609375,
"learning_rate": 0.00015373433689536627,
"loss": 0.4546,
"step": 5690
},
{
"epoch": 1.370967741935484,
"grad_norm": 1.0625,
"learning_rate": 0.0001536278998561804,
"loss": 0.5197,
"step": 5695
},
{
"epoch": 1.37217140105922,
"grad_norm": 1.078125,
"learning_rate": 0.00015352144383123074,
"loss": 0.4984,
"step": 5700
},
{
"epoch": 1.3733750601829562,
"grad_norm": 1.03125,
"learning_rate": 0.00015341496899060677,
"loss": 0.4855,
"step": 5705
},
{
"epoch": 1.3745787193066923,
"grad_norm": 1.0078125,
"learning_rate": 0.00015330847550442788,
"loss": 0.4654,
"step": 5710
},
{
"epoch": 1.3757823784304284,
"grad_norm": 1.109375,
"learning_rate": 0.00015320196354284347,
"loss": 0.5112,
"step": 5715
},
{
"epoch": 1.3769860375541647,
"grad_norm": 0.91796875,
"learning_rate": 0.00015309543327603228,
"loss": 0.4769,
"step": 5720
},
{
"epoch": 1.3781896966779008,
"grad_norm": 1.015625,
"learning_rate": 0.00015298888487420243,
"loss": 0.4979,
"step": 5725
},
{
"epoch": 1.379393355801637,
"grad_norm": 1.1171875,
"learning_rate": 0.00015288231850759093,
"loss": 0.4767,
"step": 5730
},
{
"epoch": 1.3805970149253732,
"grad_norm": 1.0390625,
"learning_rate": 0.00015277573434646348,
"loss": 0.5094,
"step": 5735
},
{
"epoch": 1.3818006740491093,
"grad_norm": 0.95703125,
"learning_rate": 0.00015266913256111426,
"loss": 0.4962,
"step": 5740
},
{
"epoch": 1.3830043331728454,
"grad_norm": 0.9609375,
"learning_rate": 0.0001525625133218656,
"loss": 0.4601,
"step": 5745
},
{
"epoch": 1.3842079922965815,
"grad_norm": 1.046875,
"learning_rate": 0.00015245587679906775,
"loss": 0.4983,
"step": 5750
},
{
"epoch": 1.3854116514203176,
"grad_norm": 0.890625,
"learning_rate": 0.0001523492231630985,
"loss": 0.4999,
"step": 5755
},
{
"epoch": 1.386615310544054,
"grad_norm": 1.046875,
"learning_rate": 0.00015224255258436306,
"loss": 0.515,
"step": 5760
},
{
"epoch": 1.38781896966779,
"grad_norm": 0.95703125,
"learning_rate": 0.0001521358652332936,
"loss": 0.4706,
"step": 5765
},
{
"epoch": 1.3890226287915262,
"grad_norm": 1.0390625,
"learning_rate": 0.00015202916128034916,
"loss": 0.4743,
"step": 5770
},
{
"epoch": 1.3902262879152625,
"grad_norm": 1.0234375,
"learning_rate": 0.00015192244089601536,
"loss": 0.5071,
"step": 5775
},
{
"epoch": 1.3914299470389986,
"grad_norm": 0.96875,
"learning_rate": 0.000151815704250804,
"loss": 0.4571,
"step": 5780
},
{
"epoch": 1.3926336061627347,
"grad_norm": 0.98828125,
"learning_rate": 0.00015170895151525287,
"loss": 0.506,
"step": 5785
},
{
"epoch": 1.393837265286471,
"grad_norm": 1.0234375,
"learning_rate": 0.00015160218285992547,
"loss": 0.484,
"step": 5790
},
{
"epoch": 1.395040924410207,
"grad_norm": 0.98046875,
"learning_rate": 0.00015149539845541073,
"loss": 0.4771,
"step": 5795
},
{
"epoch": 1.3962445835339432,
"grad_norm": 0.9921875,
"learning_rate": 0.00015138859847232277,
"loss": 0.4879,
"step": 5800
},
{
"epoch": 1.3974482426576793,
"grad_norm": 0.96875,
"learning_rate": 0.0001512817830813006,
"loss": 0.5126,
"step": 5805
},
{
"epoch": 1.3986519017814154,
"grad_norm": 1.078125,
"learning_rate": 0.00015117495245300783,
"loss": 0.5133,
"step": 5810
},
{
"epoch": 1.3998555609051517,
"grad_norm": 1.046875,
"learning_rate": 0.0001510681067581324,
"loss": 0.4955,
"step": 5815
},
{
"epoch": 1.4010592200288878,
"grad_norm": 0.96875,
"learning_rate": 0.0001509612461673863,
"loss": 0.4837,
"step": 5820
},
{
"epoch": 1.402262879152624,
"grad_norm": 0.9921875,
"learning_rate": 0.00015085437085150545,
"loss": 0.4916,
"step": 5825
},
{
"epoch": 1.4034665382763603,
"grad_norm": 0.94140625,
"learning_rate": 0.00015074748098124912,
"loss": 0.4977,
"step": 5830
},
{
"epoch": 1.4046701974000964,
"grad_norm": 1.0234375,
"learning_rate": 0.00015064057672739995,
"loss": 0.5073,
"step": 5835
},
{
"epoch": 1.4058738565238325,
"grad_norm": 1.0390625,
"learning_rate": 0.00015053365826076364,
"loss": 0.487,
"step": 5840
},
{
"epoch": 1.4070775156475686,
"grad_norm": 1.1171875,
"learning_rate": 0.00015042672575216832,
"loss": 0.4576,
"step": 5845
},
{
"epoch": 1.4082811747713047,
"grad_norm": 0.88671875,
"learning_rate": 0.00015031977937246478,
"loss": 0.4557,
"step": 5850
},
{
"epoch": 1.409484833895041,
"grad_norm": 0.984375,
"learning_rate": 0.00015021281929252598,
"loss": 0.4708,
"step": 5855
},
{
"epoch": 1.410688493018777,
"grad_norm": 1.03125,
"learning_rate": 0.00015010584568324667,
"loss": 0.4612,
"step": 5860
},
{
"epoch": 1.4118921521425132,
"grad_norm": 1.046875,
"learning_rate": 0.00014999885871554326,
"loss": 0.4657,
"step": 5865
},
{
"epoch": 1.4130958112662495,
"grad_norm": 1.0625,
"learning_rate": 0.0001498918585603535,
"loss": 0.4711,
"step": 5870
},
{
"epoch": 1.4142994703899856,
"grad_norm": 1.0,
"learning_rate": 0.00014978484538863613,
"loss": 0.4623,
"step": 5875
},
{
"epoch": 1.4155031295137217,
"grad_norm": 1.03125,
"learning_rate": 0.00014967781937137088,
"loss": 0.4759,
"step": 5880
},
{
"epoch": 1.4167067886374578,
"grad_norm": 1.0703125,
"learning_rate": 0.00014957078067955786,
"loss": 0.4573,
"step": 5885
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.95703125,
"learning_rate": 0.0001494637294842174,
"loss": 0.4783,
"step": 5890
},
{
"epoch": 1.4191141068849302,
"grad_norm": 0.9921875,
"learning_rate": 0.00014935666595639,
"loss": 0.4803,
"step": 5895
},
{
"epoch": 1.4203177660086663,
"grad_norm": 0.9453125,
"learning_rate": 0.00014924959026713555,
"loss": 0.4584,
"step": 5900
},
{
"epoch": 1.4215214251324024,
"grad_norm": 1.0546875,
"learning_rate": 0.0001491425025875337,
"loss": 0.4669,
"step": 5905
},
{
"epoch": 1.4227250842561388,
"grad_norm": 0.953125,
"learning_rate": 0.000149035403088683,
"loss": 0.4869,
"step": 5910
},
{
"epoch": 1.4239287433798749,
"grad_norm": 0.9453125,
"learning_rate": 0.00014892829194170111,
"loss": 0.5024,
"step": 5915
},
{
"epoch": 1.425132402503611,
"grad_norm": 1.0234375,
"learning_rate": 0.00014882116931772408,
"loss": 0.4892,
"step": 5920
},
{
"epoch": 1.4263360616273473,
"grad_norm": 1.0078125,
"learning_rate": 0.00014871403538790649,
"loss": 0.4639,
"step": 5925
},
{
"epoch": 1.4275397207510834,
"grad_norm": 0.984375,
"learning_rate": 0.0001486068903234208,
"loss": 0.4735,
"step": 5930
},
{
"epoch": 1.4287433798748195,
"grad_norm": 0.90234375,
"learning_rate": 0.00014849973429545744,
"loss": 0.4837,
"step": 5935
},
{
"epoch": 1.4299470389985556,
"grad_norm": 0.9453125,
"learning_rate": 0.0001483925674752242,
"loss": 0.4899,
"step": 5940
},
{
"epoch": 1.4311506981222917,
"grad_norm": 0.99609375,
"learning_rate": 0.0001482853900339463,
"loss": 0.4888,
"step": 5945
},
{
"epoch": 1.432354357246028,
"grad_norm": 1.015625,
"learning_rate": 0.00014817820214286568,
"loss": 0.4698,
"step": 5950
},
{
"epoch": 1.433558016369764,
"grad_norm": 0.9609375,
"learning_rate": 0.00014807100397324122,
"loss": 0.5042,
"step": 5955
},
{
"epoch": 1.4347616754935002,
"grad_norm": 1.0390625,
"learning_rate": 0.00014796379569634804,
"loss": 0.4935,
"step": 5960
},
{
"epoch": 1.4359653346172365,
"grad_norm": 1.0078125,
"learning_rate": 0.00014785657748347752,
"loss": 0.4821,
"step": 5965
},
{
"epoch": 1.4371689937409726,
"grad_norm": 1.0234375,
"learning_rate": 0.00014774934950593686,
"loss": 0.4639,
"step": 5970
},
{
"epoch": 1.4383726528647087,
"grad_norm": 1.0625,
"learning_rate": 0.00014764211193504895,
"loss": 0.4849,
"step": 5975
},
{
"epoch": 1.4395763119884448,
"grad_norm": 1.03125,
"learning_rate": 0.0001475348649421518,
"loss": 0.5058,
"step": 5980
},
{
"epoch": 1.440779971112181,
"grad_norm": 1.0625,
"learning_rate": 0.0001474276086985987,
"loss": 0.4944,
"step": 5985
},
{
"epoch": 1.4419836302359172,
"grad_norm": 0.9140625,
"learning_rate": 0.00014732034337575767,
"loss": 0.4737,
"step": 5990
},
{
"epoch": 1.4431872893596533,
"grad_norm": 1.0390625,
"learning_rate": 0.00014721306914501113,
"loss": 0.5094,
"step": 5995
},
{
"epoch": 1.4443909484833894,
"grad_norm": 0.9609375,
"learning_rate": 0.00014710578617775584,
"loss": 0.4894,
"step": 6000
},
{
"epoch": 1.4443909484833894,
"eval_loss": 0.4217573404312134,
"eval_runtime": 2.3697,
"eval_samples_per_second": 84.399,
"eval_steps_per_second": 84.399,
"step": 6000
},
{
"epoch": 1.4455946076071258,
"grad_norm": 1.0078125,
"learning_rate": 0.0001469984946454024,
"loss": 0.4888,
"step": 6005
},
{
"epoch": 1.4467982667308619,
"grad_norm": 1.0078125,
"learning_rate": 0.0001468911947193753,
"loss": 0.468,
"step": 6010
},
{
"epoch": 1.448001925854598,
"grad_norm": 0.98828125,
"learning_rate": 0.00014678388657111223,
"loss": 0.4693,
"step": 6015
},
{
"epoch": 1.449205584978334,
"grad_norm": 0.96484375,
"learning_rate": 0.0001466765703720641,
"loss": 0.4642,
"step": 6020
},
{
"epoch": 1.4504092441020702,
"grad_norm": 0.9453125,
"learning_rate": 0.00014656924629369473,
"loss": 0.4988,
"step": 6025
},
{
"epoch": 1.4516129032258065,
"grad_norm": 0.96484375,
"learning_rate": 0.00014646191450748045,
"loss": 0.4574,
"step": 6030
},
{
"epoch": 1.4528165623495426,
"grad_norm": 0.94921875,
"learning_rate": 0.00014635457518490994,
"loss": 0.4875,
"step": 6035
},
{
"epoch": 1.4540202214732787,
"grad_norm": 0.9609375,
"learning_rate": 0.00014624722849748397,
"loss": 0.4506,
"step": 6040
},
{
"epoch": 1.455223880597015,
"grad_norm": 0.9296875,
"learning_rate": 0.00014613987461671498,
"loss": 0.4625,
"step": 6045
},
{
"epoch": 1.4564275397207511,
"grad_norm": 1.0234375,
"learning_rate": 0.00014603251371412697,
"loss": 0.4864,
"step": 6050
},
{
"epoch": 1.4576311988444872,
"grad_norm": 0.9921875,
"learning_rate": 0.0001459251459612551,
"loss": 0.5142,
"step": 6055
},
{
"epoch": 1.4588348579682233,
"grad_norm": 1.0703125,
"learning_rate": 0.00014581777152964555,
"loss": 0.4851,
"step": 6060
},
{
"epoch": 1.4600385170919594,
"grad_norm": 1.0078125,
"learning_rate": 0.00014571039059085516,
"loss": 0.4754,
"step": 6065
},
{
"epoch": 1.4612421762156957,
"grad_norm": 1.0078125,
"learning_rate": 0.0001456030033164511,
"loss": 0.4664,
"step": 6070
},
{
"epoch": 1.4624458353394318,
"grad_norm": 0.98046875,
"learning_rate": 0.00014549560987801074,
"loss": 0.4779,
"step": 6075
},
{
"epoch": 1.463649494463168,
"grad_norm": 1.0078125,
"learning_rate": 0.00014538821044712128,
"loss": 0.5191,
"step": 6080
},
{
"epoch": 1.4648531535869043,
"grad_norm": 0.98828125,
"learning_rate": 0.00014528080519537933,
"loss": 0.4675,
"step": 6085
},
{
"epoch": 1.4660568127106404,
"grad_norm": 0.88671875,
"learning_rate": 0.00014517339429439115,
"loss": 0.4618,
"step": 6090
},
{
"epoch": 1.4672604718343765,
"grad_norm": 1.015625,
"learning_rate": 0.0001450659779157717,
"loss": 0.49,
"step": 6095
},
{
"epoch": 1.4684641309581128,
"grad_norm": 0.95703125,
"learning_rate": 0.00014495855623114485,
"loss": 0.4862,
"step": 6100
},
{
"epoch": 1.469667790081849,
"grad_norm": 1.046875,
"learning_rate": 0.000144851129412143,
"loss": 0.4855,
"step": 6105
},
{
"epoch": 1.470871449205585,
"grad_norm": 0.9609375,
"learning_rate": 0.00014474369763040648,
"loss": 0.4627,
"step": 6110
},
{
"epoch": 1.472075108329321,
"grad_norm": 0.953125,
"learning_rate": 0.00014463626105758388,
"loss": 0.4996,
"step": 6115
},
{
"epoch": 1.4732787674530572,
"grad_norm": 1.0859375,
"learning_rate": 0.00014452881986533127,
"loss": 0.498,
"step": 6120
},
{
"epoch": 1.4744824265767935,
"grad_norm": 0.9765625,
"learning_rate": 0.00014442137422531216,
"loss": 0.4749,
"step": 6125
},
{
"epoch": 1.4756860857005296,
"grad_norm": 1.0390625,
"learning_rate": 0.00014431392430919705,
"loss": 0.4834,
"step": 6130
},
{
"epoch": 1.4768897448242657,
"grad_norm": 1.09375,
"learning_rate": 0.00014420647028866342,
"loss": 0.5004,
"step": 6135
},
{
"epoch": 1.478093403948002,
"grad_norm": 1.0625,
"learning_rate": 0.0001440990123353953,
"loss": 0.503,
"step": 6140
},
{
"epoch": 1.4792970630717381,
"grad_norm": 0.98828125,
"learning_rate": 0.00014399155062108285,
"loss": 0.5002,
"step": 6145
},
{
"epoch": 1.4805007221954742,
"grad_norm": 1.1015625,
"learning_rate": 0.0001438840853174224,
"loss": 0.5183,
"step": 6150
},
{
"epoch": 1.4817043813192103,
"grad_norm": 0.88671875,
"learning_rate": 0.00014377661659611596,
"loss": 0.4445,
"step": 6155
},
{
"epoch": 1.4829080404429464,
"grad_norm": 0.890625,
"learning_rate": 0.00014366914462887102,
"loss": 0.4669,
"step": 6160
},
{
"epoch": 1.4841116995666828,
"grad_norm": 1.0546875,
"learning_rate": 0.0001435616695874001,
"loss": 0.4944,
"step": 6165
},
{
"epoch": 1.4853153586904189,
"grad_norm": 1.03125,
"learning_rate": 0.0001434541916434209,
"loss": 0.5061,
"step": 6170
},
{
"epoch": 1.486519017814155,
"grad_norm": 1.0390625,
"learning_rate": 0.0001433467109686556,
"loss": 0.4779,
"step": 6175
},
{
"epoch": 1.4877226769378913,
"grad_norm": 0.95703125,
"learning_rate": 0.00014323922773483075,
"loss": 0.481,
"step": 6180
},
{
"epoch": 1.4889263360616274,
"grad_norm": 1.015625,
"learning_rate": 0.00014313174211367697,
"loss": 0.4802,
"step": 6185
},
{
"epoch": 1.4901299951853635,
"grad_norm": 1.0234375,
"learning_rate": 0.00014302425427692878,
"loss": 0.4754,
"step": 6190
},
{
"epoch": 1.4913336543090996,
"grad_norm": 1.078125,
"learning_rate": 0.00014291676439632414,
"loss": 0.481,
"step": 6195
},
{
"epoch": 1.4925373134328357,
"grad_norm": 1.046875,
"learning_rate": 0.00014280927264360442,
"loss": 0.5069,
"step": 6200
},
{
"epoch": 1.493740972556572,
"grad_norm": 1.09375,
"learning_rate": 0.00014270177919051375,
"loss": 0.4674,
"step": 6205
},
{
"epoch": 1.4949446316803081,
"grad_norm": 1.0625,
"learning_rate": 0.00014259428420879922,
"loss": 0.5081,
"step": 6210
},
{
"epoch": 1.4961482908040442,
"grad_norm": 0.953125,
"learning_rate": 0.0001424867878702102,
"loss": 0.4584,
"step": 6215
},
{
"epoch": 1.4973519499277805,
"grad_norm": 1.015625,
"learning_rate": 0.0001423792903464983,
"loss": 0.4802,
"step": 6220
},
{
"epoch": 1.4985556090515166,
"grad_norm": 1.0859375,
"learning_rate": 0.000142271791809417,
"loss": 0.4609,
"step": 6225
},
{
"epoch": 1.4997592681752527,
"grad_norm": 0.94921875,
"learning_rate": 0.0001421642924307214,
"loss": 0.4733,
"step": 6230
},
{
"epoch": 1.500962927298989,
"grad_norm": 0.953125,
"learning_rate": 0.00014205679238216796,
"loss": 0.4642,
"step": 6235
},
{
"epoch": 1.502166586422725,
"grad_norm": 1.0390625,
"learning_rate": 0.0001419492918355142,
"loss": 0.4922,
"step": 6240
},
{
"epoch": 1.5033702455464613,
"grad_norm": 1.0390625,
"learning_rate": 0.00014184179096251844,
"loss": 0.4529,
"step": 6245
},
{
"epoch": 1.5045739046701974,
"grad_norm": 0.92578125,
"learning_rate": 0.00014173428993493947,
"loss": 0.4291,
"step": 6250
},
{
"epoch": 1.5057775637939335,
"grad_norm": 0.9921875,
"learning_rate": 0.00014162678892453643,
"loss": 0.4755,
"step": 6255
},
{
"epoch": 1.5069812229176698,
"grad_norm": 0.97265625,
"learning_rate": 0.00014151928810306836,
"loss": 0.478,
"step": 6260
},
{
"epoch": 1.5081848820414059,
"grad_norm": 1.0234375,
"learning_rate": 0.000141411787642294,
"loss": 0.4878,
"step": 6265
},
{
"epoch": 1.509388541165142,
"grad_norm": 0.9765625,
"learning_rate": 0.00014130428771397157,
"loss": 0.4984,
"step": 6270
},
{
"epoch": 1.5105922002888783,
"grad_norm": 0.9921875,
"learning_rate": 0.00014119678848985837,
"loss": 0.4885,
"step": 6275
},
{
"epoch": 1.5117958594126142,
"grad_norm": 1.046875,
"learning_rate": 0.00014108929014171055,
"loss": 0.4762,
"step": 6280
},
{
"epoch": 1.5129995185363505,
"grad_norm": 1.0625,
"learning_rate": 0.000140981792841283,
"loss": 0.5052,
"step": 6285
},
{
"epoch": 1.5142031776600868,
"grad_norm": 1.0078125,
"learning_rate": 0.00014087429676032883,
"loss": 0.4856,
"step": 6290
},
{
"epoch": 1.5154068367838227,
"grad_norm": 1.0390625,
"learning_rate": 0.0001407668020705992,
"loss": 0.4549,
"step": 6295
},
{
"epoch": 1.516610495907559,
"grad_norm": 1.0078125,
"learning_rate": 0.00014065930894384307,
"loss": 0.4686,
"step": 6300
},
{
"epoch": 1.5178141550312951,
"grad_norm": 1.1796875,
"learning_rate": 0.00014055181755180687,
"loss": 0.4618,
"step": 6305
},
{
"epoch": 1.5190178141550312,
"grad_norm": 0.9921875,
"learning_rate": 0.00014044432806623432,
"loss": 0.4848,
"step": 6310
},
{
"epoch": 1.5202214732787676,
"grad_norm": 0.9296875,
"learning_rate": 0.0001403368406588661,
"loss": 0.5021,
"step": 6315
},
{
"epoch": 1.5214251324025037,
"grad_norm": 1.09375,
"learning_rate": 0.00014022935550143947,
"loss": 0.508,
"step": 6320
},
{
"epoch": 1.5226287915262398,
"grad_norm": 0.9375,
"learning_rate": 0.00014012187276568822,
"loss": 0.4749,
"step": 6325
},
{
"epoch": 1.523832450649976,
"grad_norm": 0.97265625,
"learning_rate": 0.00014001439262334211,
"loss": 0.4905,
"step": 6330
},
{
"epoch": 1.525036109773712,
"grad_norm": 1.078125,
"learning_rate": 0.00013990691524612696,
"loss": 0.4888,
"step": 6335
},
{
"epoch": 1.5262397688974483,
"grad_norm": 0.9609375,
"learning_rate": 0.000139799440805764,
"loss": 0.4817,
"step": 6340
},
{
"epoch": 1.5274434280211844,
"grad_norm": 0.97265625,
"learning_rate": 0.00013969196947396988,
"loss": 0.445,
"step": 6345
},
{
"epoch": 1.5286470871449205,
"grad_norm": 0.9375,
"learning_rate": 0.0001395845014224562,
"loss": 0.4809,
"step": 6350
},
{
"epoch": 1.5298507462686568,
"grad_norm": 0.98828125,
"learning_rate": 0.00013947703682292936,
"loss": 0.439,
"step": 6355
},
{
"epoch": 1.531054405392393,
"grad_norm": 0.93359375,
"learning_rate": 0.00013936957584709028,
"loss": 0.4718,
"step": 6360
},
{
"epoch": 1.532258064516129,
"grad_norm": 1.0234375,
"learning_rate": 0.00013926211866663402,
"loss": 0.4421,
"step": 6365
},
{
"epoch": 1.5334617236398653,
"grad_norm": 1.0078125,
"learning_rate": 0.0001391546654532496,
"loss": 0.4284,
"step": 6370
},
{
"epoch": 1.5346653827636012,
"grad_norm": 0.89453125,
"learning_rate": 0.00013904721637861975,
"loss": 0.4921,
"step": 6375
},
{
"epoch": 1.5358690418873375,
"grad_norm": 0.86328125,
"learning_rate": 0.00013893977161442045,
"loss": 0.434,
"step": 6380
},
{
"epoch": 1.5370727010110736,
"grad_norm": 1.0234375,
"learning_rate": 0.00013883233133232098,
"loss": 0.4782,
"step": 6385
},
{
"epoch": 1.5382763601348097,
"grad_norm": 0.91015625,
"learning_rate": 0.0001387248957039834,
"loss": 0.4726,
"step": 6390
},
{
"epoch": 1.539480019258546,
"grad_norm": 1.0390625,
"learning_rate": 0.0001386174649010622,
"loss": 0.5012,
"step": 6395
},
{
"epoch": 1.5406836783822822,
"grad_norm": 0.9921875,
"learning_rate": 0.00013851003909520434,
"loss": 0.4516,
"step": 6400
},
{
"epoch": 1.5418873375060183,
"grad_norm": 0.95703125,
"learning_rate": 0.00013840261845804867,
"loss": 0.4751,
"step": 6405
},
{
"epoch": 1.5430909966297546,
"grad_norm": 0.921875,
"learning_rate": 0.00013829520316122583,
"loss": 0.4894,
"step": 6410
},
{
"epoch": 1.5442946557534905,
"grad_norm": 1.046875,
"learning_rate": 0.00013818779337635797,
"loss": 0.4568,
"step": 6415
},
{
"epoch": 1.5454983148772268,
"grad_norm": 0.89453125,
"learning_rate": 0.0001380803892750584,
"loss": 0.5007,
"step": 6420
},
{
"epoch": 1.5467019740009629,
"grad_norm": 1.046875,
"learning_rate": 0.00013797299102893124,
"loss": 0.4596,
"step": 6425
},
{
"epoch": 1.547905633124699,
"grad_norm": 1.046875,
"learning_rate": 0.0001378655988095715,
"loss": 0.5106,
"step": 6430
},
{
"epoch": 1.5491092922484353,
"grad_norm": 0.9921875,
"learning_rate": 0.0001377582127885643,
"loss": 0.4743,
"step": 6435
},
{
"epoch": 1.5503129513721714,
"grad_norm": 0.94921875,
"learning_rate": 0.000137650833137485,
"loss": 0.47,
"step": 6440
},
{
"epoch": 1.5515166104959075,
"grad_norm": 0.94921875,
"learning_rate": 0.0001375434600278988,
"loss": 0.4698,
"step": 6445
},
{
"epoch": 1.5527202696196438,
"grad_norm": 1.0,
"learning_rate": 0.00013743609363136037,
"loss": 0.4831,
"step": 6450
},
{
"epoch": 1.55392392874338,
"grad_norm": 1.0234375,
"learning_rate": 0.00013732873411941368,
"loss": 0.4759,
"step": 6455
},
{
"epoch": 1.555127587867116,
"grad_norm": 1.015625,
"learning_rate": 0.00013722138166359177,
"loss": 0.4482,
"step": 6460
},
{
"epoch": 1.5563312469908523,
"grad_norm": 1.078125,
"learning_rate": 0.00013711403643541624,
"loss": 0.4768,
"step": 6465
},
{
"epoch": 1.5575349061145882,
"grad_norm": 0.97265625,
"learning_rate": 0.0001370066986063973,
"loss": 0.4643,
"step": 6470
},
{
"epoch": 1.5587385652383245,
"grad_norm": 0.9375,
"learning_rate": 0.00013689936834803331,
"loss": 0.4877,
"step": 6475
},
{
"epoch": 1.5599422243620606,
"grad_norm": 0.9921875,
"learning_rate": 0.0001367920458318105,
"loss": 0.4637,
"step": 6480
},
{
"epoch": 1.5611458834857967,
"grad_norm": 0.92578125,
"learning_rate": 0.0001366847312292027,
"loss": 0.4779,
"step": 6485
},
{
"epoch": 1.562349542609533,
"grad_norm": 0.953125,
"learning_rate": 0.00013657742471167114,
"loss": 0.5096,
"step": 6490
},
{
"epoch": 1.5635532017332692,
"grad_norm": 0.97265625,
"learning_rate": 0.00013647012645066412,
"loss": 0.4628,
"step": 6495
},
{
"epoch": 1.5647568608570053,
"grad_norm": 0.9921875,
"learning_rate": 0.00013636283661761685,
"loss": 0.4939,
"step": 6500
},
{
"epoch": 1.5647568608570053,
"eval_loss": 0.40878283977508545,
"eval_runtime": 2.4446,
"eval_samples_per_second": 81.814,
"eval_steps_per_second": 81.814,
"step": 6500
},
{
"epoch": 1.5659605199807416,
"grad_norm": 1.0546875,
"learning_rate": 0.00013625555538395088,
"loss": 0.487,
"step": 6505
},
{
"epoch": 1.5671641791044775,
"grad_norm": 1.046875,
"learning_rate": 0.00013614828292107418,
"loss": 0.4773,
"step": 6510
},
{
"epoch": 1.5683678382282138,
"grad_norm": 1.0625,
"learning_rate": 0.00013604101940038057,
"loss": 0.4734,
"step": 6515
},
{
"epoch": 1.56957149735195,
"grad_norm": 0.8984375,
"learning_rate": 0.00013593376499324968,
"loss": 0.471,
"step": 6520
},
{
"epoch": 1.570775156475686,
"grad_norm": 0.90625,
"learning_rate": 0.00013582651987104665,
"loss": 0.4358,
"step": 6525
},
{
"epoch": 1.5719788155994223,
"grad_norm": 0.9609375,
"learning_rate": 0.0001357192842051216,
"loss": 0.5044,
"step": 6530
},
{
"epoch": 1.5731824747231584,
"grad_norm": 0.9921875,
"learning_rate": 0.00013561205816680965,
"loss": 0.487,
"step": 6535
},
{
"epoch": 1.5743861338468945,
"grad_norm": 0.87890625,
"learning_rate": 0.0001355048419274305,
"loss": 0.4617,
"step": 6540
},
{
"epoch": 1.5755897929706308,
"grad_norm": 0.99609375,
"learning_rate": 0.00013539763565828826,
"loss": 0.4796,
"step": 6545
},
{
"epoch": 1.5767934520943667,
"grad_norm": 0.984375,
"learning_rate": 0.00013529043953067107,
"loss": 0.4636,
"step": 6550
},
{
"epoch": 1.577997111218103,
"grad_norm": 1.0078125,
"learning_rate": 0.00013518325371585083,
"loss": 0.4638,
"step": 6555
},
{
"epoch": 1.5792007703418391,
"grad_norm": 1.0,
"learning_rate": 0.00013507607838508302,
"loss": 0.4575,
"step": 6560
},
{
"epoch": 1.5804044294655752,
"grad_norm": 1.0,
"learning_rate": 0.0001349689137096063,
"loss": 0.4627,
"step": 6565
},
{
"epoch": 1.5816080885893116,
"grad_norm": 0.9140625,
"learning_rate": 0.0001348617598606424,
"loss": 0.4768,
"step": 6570
},
{
"epoch": 1.5828117477130477,
"grad_norm": 0.96875,
"learning_rate": 0.00013475461700939573,
"loss": 0.4946,
"step": 6575
},
{
"epoch": 1.5840154068367838,
"grad_norm": 0.83203125,
"learning_rate": 0.00013464748532705296,
"loss": 0.4773,
"step": 6580
},
{
"epoch": 1.58521906596052,
"grad_norm": 1.0,
"learning_rate": 0.00013454036498478322,
"loss": 0.4616,
"step": 6585
},
{
"epoch": 1.5864227250842562,
"grad_norm": 1.0234375,
"learning_rate": 0.00013443325615373724,
"loss": 0.4782,
"step": 6590
},
{
"epoch": 1.5876263842079923,
"grad_norm": 0.9921875,
"learning_rate": 0.0001343261590050475,
"loss": 0.4529,
"step": 6595
},
{
"epoch": 1.5888300433317286,
"grad_norm": 1.0859375,
"learning_rate": 0.00013421907370982786,
"loss": 0.456,
"step": 6600
},
{
"epoch": 1.5900337024554645,
"grad_norm": 0.9375,
"learning_rate": 0.0001341120004391731,
"loss": 0.4851,
"step": 6605
},
{
"epoch": 1.5912373615792008,
"grad_norm": 1.046875,
"learning_rate": 0.00013400493936415887,
"loss": 0.4686,
"step": 6610
},
{
"epoch": 1.592441020702937,
"grad_norm": 0.9765625,
"learning_rate": 0.00013389789065584132,
"loss": 0.5021,
"step": 6615
},
{
"epoch": 1.593644679826673,
"grad_norm": 0.984375,
"learning_rate": 0.00013379085448525683,
"loss": 0.4656,
"step": 6620
},
{
"epoch": 1.5948483389504093,
"grad_norm": 1.0625,
"learning_rate": 0.00013368383102342184,
"loss": 0.4202,
"step": 6625
},
{
"epoch": 1.5960519980741454,
"grad_norm": 0.89453125,
"learning_rate": 0.0001335768204413323,
"loss": 0.469,
"step": 6630
},
{
"epoch": 1.5972556571978815,
"grad_norm": 0.94921875,
"learning_rate": 0.00013346982290996377,
"loss": 0.4255,
"step": 6635
},
{
"epoch": 1.5984593163216179,
"grad_norm": 0.92578125,
"learning_rate": 0.00013336283860027084,
"loss": 0.4514,
"step": 6640
},
{
"epoch": 1.5996629754453537,
"grad_norm": 1.03125,
"learning_rate": 0.00013325586768318695,
"loss": 0.4855,
"step": 6645
},
{
"epoch": 1.60086663456909,
"grad_norm": 1.0,
"learning_rate": 0.00013314891032962438,
"loss": 0.4682,
"step": 6650
},
{
"epoch": 1.6020702936928262,
"grad_norm": 0.94921875,
"learning_rate": 0.00013304196671047334,
"loss": 0.4808,
"step": 6655
},
{
"epoch": 1.6032739528165623,
"grad_norm": 0.99609375,
"learning_rate": 0.00013293503699660252,
"loss": 0.497,
"step": 6660
},
{
"epoch": 1.6044776119402986,
"grad_norm": 0.94921875,
"learning_rate": 0.00013282812135885803,
"loss": 0.5002,
"step": 6665
},
{
"epoch": 1.6056812710640347,
"grad_norm": 1.0703125,
"learning_rate": 0.00013272121996806376,
"loss": 0.4976,
"step": 6670
},
{
"epoch": 1.6068849301877708,
"grad_norm": 0.953125,
"learning_rate": 0.00013261433299502066,
"loss": 0.4829,
"step": 6675
},
{
"epoch": 1.608088589311507,
"grad_norm": 1.0078125,
"learning_rate": 0.00013250746061050674,
"loss": 0.5119,
"step": 6680
},
{
"epoch": 1.609292248435243,
"grad_norm": 0.94921875,
"learning_rate": 0.0001324006029852767,
"loss": 0.4876,
"step": 6685
},
{
"epoch": 1.6104959075589793,
"grad_norm": 0.99609375,
"learning_rate": 0.00013229376029006158,
"loss": 0.4485,
"step": 6690
},
{
"epoch": 1.6116995666827154,
"grad_norm": 0.93359375,
"learning_rate": 0.00013218693269556868,
"loss": 0.4623,
"step": 6695
},
{
"epoch": 1.6129032258064515,
"grad_norm": 1.0390625,
"learning_rate": 0.00013208012037248102,
"loss": 0.4732,
"step": 6700
},
{
"epoch": 1.6141068849301878,
"grad_norm": 0.921875,
"learning_rate": 0.00013197332349145738,
"loss": 0.4795,
"step": 6705
},
{
"epoch": 1.615310544053924,
"grad_norm": 0.97265625,
"learning_rate": 0.0001318665422231318,
"loss": 0.4558,
"step": 6710
},
{
"epoch": 1.61651420317766,
"grad_norm": 0.91015625,
"learning_rate": 0.00013175977673811335,
"loss": 0.4185,
"step": 6715
},
{
"epoch": 1.6177178623013964,
"grad_norm": 0.94921875,
"learning_rate": 0.0001316530272069859,
"loss": 0.4937,
"step": 6720
},
{
"epoch": 1.6189215214251322,
"grad_norm": 0.99609375,
"learning_rate": 0.00013154629380030786,
"loss": 0.4643,
"step": 6725
},
{
"epoch": 1.6201251805488686,
"grad_norm": 0.98828125,
"learning_rate": 0.0001314395766886118,
"loss": 0.4336,
"step": 6730
},
{
"epoch": 1.6213288396726049,
"grad_norm": 1.015625,
"learning_rate": 0.0001313328760424044,
"loss": 0.482,
"step": 6735
},
{
"epoch": 1.6225324987963408,
"grad_norm": 0.9453125,
"learning_rate": 0.00013122619203216585,
"loss": 0.4599,
"step": 6740
},
{
"epoch": 1.623736157920077,
"grad_norm": 0.9609375,
"learning_rate": 0.0001311195248283499,
"loss": 0.4718,
"step": 6745
},
{
"epoch": 1.6249398170438132,
"grad_norm": 0.984375,
"learning_rate": 0.0001310128746013834,
"loss": 0.4743,
"step": 6750
},
{
"epoch": 1.6261434761675493,
"grad_norm": 0.953125,
"learning_rate": 0.00013090624152166603,
"loss": 0.4933,
"step": 6755
},
{
"epoch": 1.6273471352912856,
"grad_norm": 0.9453125,
"learning_rate": 0.00013079962575957016,
"loss": 0.4797,
"step": 6760
},
{
"epoch": 1.6285507944150217,
"grad_norm": 1.0390625,
"learning_rate": 0.00013069302748544041,
"loss": 0.4828,
"step": 6765
},
{
"epoch": 1.6297544535387578,
"grad_norm": 1.046875,
"learning_rate": 0.00013058644686959352,
"loss": 0.4666,
"step": 6770
},
{
"epoch": 1.6309581126624941,
"grad_norm": 0.93359375,
"learning_rate": 0.00013047988408231798,
"loss": 0.4459,
"step": 6775
},
{
"epoch": 1.63216177178623,
"grad_norm": 1.0234375,
"learning_rate": 0.00013037333929387382,
"loss": 0.4466,
"step": 6780
},
{
"epoch": 1.6333654309099663,
"grad_norm": 0.94140625,
"learning_rate": 0.00013026681267449232,
"loss": 0.437,
"step": 6785
},
{
"epoch": 1.6345690900337024,
"grad_norm": 1.03125,
"learning_rate": 0.00013016030439437563,
"loss": 0.4854,
"step": 6790
},
{
"epoch": 1.6357727491574385,
"grad_norm": 1.0234375,
"learning_rate": 0.00013005381462369677,
"loss": 0.4688,
"step": 6795
},
{
"epoch": 1.6369764082811749,
"grad_norm": 1.0,
"learning_rate": 0.00012994734353259904,
"loss": 0.4848,
"step": 6800
},
{
"epoch": 1.638180067404911,
"grad_norm": 1.015625,
"learning_rate": 0.00012984089129119592,
"loss": 0.4789,
"step": 6805
},
{
"epoch": 1.639383726528647,
"grad_norm": 0.98828125,
"learning_rate": 0.00012973445806957088,
"loss": 0.4543,
"step": 6810
},
{
"epoch": 1.6405873856523834,
"grad_norm": 0.9921875,
"learning_rate": 0.00012962804403777686,
"loss": 0.4555,
"step": 6815
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.96875,
"learning_rate": 0.00012952164936583626,
"loss": 0.4523,
"step": 6820
},
{
"epoch": 1.6429947038998556,
"grad_norm": 1.015625,
"learning_rate": 0.00012941527422374047,
"loss": 0.4583,
"step": 6825
},
{
"epoch": 1.6441983630235917,
"grad_norm": 0.96875,
"learning_rate": 0.00012930891878144967,
"loss": 0.4757,
"step": 6830
},
{
"epoch": 1.6454020221473278,
"grad_norm": 1.015625,
"learning_rate": 0.00012920258320889264,
"loss": 0.4485,
"step": 6835
},
{
"epoch": 1.646605681271064,
"grad_norm": 0.91796875,
"learning_rate": 0.00012909626767596628,
"loss": 0.4796,
"step": 6840
},
{
"epoch": 1.6478093403948002,
"grad_norm": 0.9140625,
"learning_rate": 0.00012898997235253568,
"loss": 0.449,
"step": 6845
},
{
"epoch": 1.6490129995185363,
"grad_norm": 1.015625,
"learning_rate": 0.00012888369740843343,
"loss": 0.4921,
"step": 6850
},
{
"epoch": 1.6502166586422726,
"grad_norm": 1.015625,
"learning_rate": 0.00012877744301345963,
"loss": 0.4378,
"step": 6855
},
{
"epoch": 1.6514203177660085,
"grad_norm": 0.95703125,
"learning_rate": 0.0001286712093373817,
"loss": 0.4479,
"step": 6860
},
{
"epoch": 1.6526239768897448,
"grad_norm": 0.98046875,
"learning_rate": 0.00012856499654993362,
"loss": 0.4571,
"step": 6865
},
{
"epoch": 1.653827636013481,
"grad_norm": 0.94140625,
"learning_rate": 0.0001284588048208164,
"loss": 0.4955,
"step": 6870
},
{
"epoch": 1.655031295137217,
"grad_norm": 0.9453125,
"learning_rate": 0.00012835263431969704,
"loss": 0.4173,
"step": 6875
},
{
"epoch": 1.6562349542609534,
"grad_norm": 1.03125,
"learning_rate": 0.00012824648521620884,
"loss": 0.4715,
"step": 6880
},
{
"epoch": 1.6574386133846895,
"grad_norm": 0.95703125,
"learning_rate": 0.00012814035767995093,
"loss": 0.4965,
"step": 6885
},
{
"epoch": 1.6586422725084256,
"grad_norm": 0.96875,
"learning_rate": 0.00012803425188048775,
"loss": 0.48,
"step": 6890
},
{
"epoch": 1.6598459316321619,
"grad_norm": 1.0078125,
"learning_rate": 0.00012792816798734932,
"loss": 0.4535,
"step": 6895
},
{
"epoch": 1.661049590755898,
"grad_norm": 0.9453125,
"learning_rate": 0.0001278221061700304,
"loss": 0.4546,
"step": 6900
},
{
"epoch": 1.662253249879634,
"grad_norm": 0.9609375,
"learning_rate": 0.0001277160665979907,
"loss": 0.4663,
"step": 6905
},
{
"epoch": 1.6634569090033704,
"grad_norm": 0.99609375,
"learning_rate": 0.00012761004944065413,
"loss": 0.4511,
"step": 6910
},
{
"epoch": 1.6646605681271063,
"grad_norm": 0.95703125,
"learning_rate": 0.000127504054867409,
"loss": 0.4787,
"step": 6915
},
{
"epoch": 1.6658642272508426,
"grad_norm": 0.97265625,
"learning_rate": 0.00012739808304760753,
"loss": 0.4643,
"step": 6920
},
{
"epoch": 1.6670678863745787,
"grad_norm": 0.9296875,
"learning_rate": 0.0001272921341505654,
"loss": 0.461,
"step": 6925
},
{
"epoch": 1.6682715454983148,
"grad_norm": 1.015625,
"learning_rate": 0.00012718620834556186,
"loss": 0.5,
"step": 6930
},
{
"epoch": 1.6694752046220511,
"grad_norm": 0.96484375,
"learning_rate": 0.00012708030580183918,
"loss": 0.4438,
"step": 6935
},
{
"epoch": 1.6706788637457872,
"grad_norm": 0.96484375,
"learning_rate": 0.00012697442668860247,
"loss": 0.4514,
"step": 6940
},
{
"epoch": 1.6718825228695233,
"grad_norm": 1.046875,
"learning_rate": 0.00012686857117501945,
"loss": 0.4891,
"step": 6945
},
{
"epoch": 1.6730861819932596,
"grad_norm": 0.96484375,
"learning_rate": 0.00012676273943022,
"loss": 0.4262,
"step": 6950
},
{
"epoch": 1.6742898411169955,
"grad_norm": 0.98046875,
"learning_rate": 0.00012665693162329622,
"loss": 0.4749,
"step": 6955
},
{
"epoch": 1.6754935002407318,
"grad_norm": 1.015625,
"learning_rate": 0.0001265511479233018,
"loss": 0.4407,
"step": 6960
},
{
"epoch": 1.676697159364468,
"grad_norm": 1.0,
"learning_rate": 0.000126445388499252,
"loss": 0.4846,
"step": 6965
},
{
"epoch": 1.677900818488204,
"grad_norm": 0.953125,
"learning_rate": 0.00012633965352012327,
"loss": 0.4411,
"step": 6970
},
{
"epoch": 1.6791044776119404,
"grad_norm": 1.0625,
"learning_rate": 0.00012623394315485295,
"loss": 0.4615,
"step": 6975
},
{
"epoch": 1.6803081367356765,
"grad_norm": 0.9609375,
"learning_rate": 0.0001261282575723392,
"loss": 0.4301,
"step": 6980
},
{
"epoch": 1.6815117958594126,
"grad_norm": 0.953125,
"learning_rate": 0.00012602259694144042,
"loss": 0.4565,
"step": 6985
},
{
"epoch": 1.682715454983149,
"grad_norm": 1.0546875,
"learning_rate": 0.0001259169614309752,
"loss": 0.4921,
"step": 6990
},
{
"epoch": 1.6839191141068848,
"grad_norm": 0.9453125,
"learning_rate": 0.0001258113512097221,
"loss": 0.4478,
"step": 6995
},
{
"epoch": 1.685122773230621,
"grad_norm": 0.93359375,
"learning_rate": 0.00012570576644641902,
"loss": 0.4615,
"step": 7000
},
{
"epoch": 1.685122773230621,
"eval_loss": 0.39812684059143066,
"eval_runtime": 2.3544,
"eval_samples_per_second": 84.948,
"eval_steps_per_second": 84.948,
"step": 7000
},
{
"epoch": 1.6863264323543572,
"grad_norm": 1.0234375,
"learning_rate": 0.0001256002073097635,
"loss": 0.4651,
"step": 7005
},
{
"epoch": 1.6875300914780933,
"grad_norm": 0.98046875,
"learning_rate": 0.0001254946739684119,
"loss": 0.4577,
"step": 7010
},
{
"epoch": 1.6887337506018296,
"grad_norm": 1.0390625,
"learning_rate": 0.00012538916659097946,
"loss": 0.4781,
"step": 7015
},
{
"epoch": 1.6899374097255657,
"grad_norm": 0.96875,
"learning_rate": 0.00012528368534603994,
"loss": 0.4181,
"step": 7020
},
{
"epoch": 1.6911410688493018,
"grad_norm": 0.93359375,
"learning_rate": 0.0001251782304021253,
"loss": 0.4477,
"step": 7025
},
{
"epoch": 1.6923447279730381,
"grad_norm": 0.94140625,
"learning_rate": 0.00012507280192772553,
"loss": 0.4446,
"step": 7030
},
{
"epoch": 1.6935483870967742,
"grad_norm": 1.0703125,
"learning_rate": 0.00012496740009128828,
"loss": 0.4755,
"step": 7035
},
{
"epoch": 1.6947520462205103,
"grad_norm": 0.91796875,
"learning_rate": 0.0001248620250612187,
"loss": 0.4556,
"step": 7040
},
{
"epoch": 1.6959557053442467,
"grad_norm": 0.99609375,
"learning_rate": 0.00012475667700587907,
"loss": 0.4737,
"step": 7045
},
{
"epoch": 1.6971593644679825,
"grad_norm": 0.9921875,
"learning_rate": 0.00012465135609358852,
"loss": 0.4413,
"step": 7050
},
{
"epoch": 1.6983630235917189,
"grad_norm": 0.98828125,
"learning_rate": 0.00012454606249262298,
"loss": 0.4347,
"step": 7055
},
{
"epoch": 1.699566682715455,
"grad_norm": 1.0078125,
"learning_rate": 0.0001244407963712145,
"loss": 0.4535,
"step": 7060
},
{
"epoch": 1.700770341839191,
"grad_norm": 0.9375,
"learning_rate": 0.00012433555789755142,
"loss": 0.4329,
"step": 7065
},
{
"epoch": 1.7019740009629274,
"grad_norm": 1.0078125,
"learning_rate": 0.0001242303472397779,
"loss": 0.4694,
"step": 7070
},
{
"epoch": 1.7031776600866635,
"grad_norm": 1.015625,
"learning_rate": 0.00012412516456599348,
"loss": 0.4901,
"step": 7075
},
{
"epoch": 1.7043813192103996,
"grad_norm": 0.89453125,
"learning_rate": 0.00012402001004425318,
"loss": 0.4221,
"step": 7080
},
{
"epoch": 1.705584978334136,
"grad_norm": 0.94921875,
"learning_rate": 0.00012391488384256698,
"loss": 0.4446,
"step": 7085
},
{
"epoch": 1.7067886374578718,
"grad_norm": 1.0234375,
"learning_rate": 0.00012380978612889956,
"loss": 0.4455,
"step": 7090
},
{
"epoch": 1.7079922965816081,
"grad_norm": 0.96875,
"learning_rate": 0.0001237047170711702,
"loss": 0.4295,
"step": 7095
},
{
"epoch": 1.7091959557053442,
"grad_norm": 0.921875,
"learning_rate": 0.00012359967683725224,
"loss": 0.4672,
"step": 7100
},
{
"epoch": 1.7103996148290803,
"grad_norm": 1.0078125,
"learning_rate": 0.00012349466559497305,
"loss": 0.4446,
"step": 7105
},
{
"epoch": 1.7116032739528166,
"grad_norm": 1.0,
"learning_rate": 0.0001233896835121137,
"loss": 0.4776,
"step": 7110
},
{
"epoch": 1.7128069330765527,
"grad_norm": 0.91796875,
"learning_rate": 0.00012328473075640865,
"loss": 0.4582,
"step": 7115
},
{
"epoch": 1.7140105922002888,
"grad_norm": 0.921875,
"learning_rate": 0.0001231798074955455,
"loss": 0.4646,
"step": 7120
},
{
"epoch": 1.7152142513240252,
"grad_norm": 0.91796875,
"learning_rate": 0.0001230749138971647,
"loss": 0.4287,
"step": 7125
},
{
"epoch": 1.716417910447761,
"grad_norm": 1.1953125,
"learning_rate": 0.0001229700501288594,
"loss": 0.4927,
"step": 7130
},
{
"epoch": 1.7176215695714974,
"grad_norm": 1.0546875,
"learning_rate": 0.0001228652163581749,
"loss": 0.4705,
"step": 7135
},
{
"epoch": 1.7188252286952335,
"grad_norm": 0.9765625,
"learning_rate": 0.0001227604127526088,
"loss": 0.4653,
"step": 7140
},
{
"epoch": 1.7200288878189696,
"grad_norm": 0.984375,
"learning_rate": 0.00012265563947961032,
"loss": 0.4253,
"step": 7145
},
{
"epoch": 1.7212325469427059,
"grad_norm": 0.83984375,
"learning_rate": 0.00012255089670658035,
"loss": 0.4585,
"step": 7150
},
{
"epoch": 1.722436206066442,
"grad_norm": 0.91796875,
"learning_rate": 0.00012244618460087095,
"loss": 0.4567,
"step": 7155
},
{
"epoch": 1.723639865190178,
"grad_norm": 1.0234375,
"learning_rate": 0.00012234150332978523,
"loss": 0.474,
"step": 7160
},
{
"epoch": 1.7248435243139144,
"grad_norm": 0.92578125,
"learning_rate": 0.00012223685306057708,
"loss": 0.4286,
"step": 7165
},
{
"epoch": 1.7260471834376505,
"grad_norm": 0.94140625,
"learning_rate": 0.00012213223396045068,
"loss": 0.4509,
"step": 7170
},
{
"epoch": 1.7272508425613866,
"grad_norm": 1.0703125,
"learning_rate": 0.00012202764619656066,
"loss": 0.5018,
"step": 7175
},
{
"epoch": 1.728454501685123,
"grad_norm": 1.0234375,
"learning_rate": 0.00012192308993601139,
"loss": 0.4369,
"step": 7180
},
{
"epoch": 1.7296581608088588,
"grad_norm": 1.0078125,
"learning_rate": 0.00012181856534585694,
"loss": 0.478,
"step": 7185
},
{
"epoch": 1.7308618199325951,
"grad_norm": 1.0,
"learning_rate": 0.00012171407259310094,
"loss": 0.4917,
"step": 7190
},
{
"epoch": 1.7320654790563312,
"grad_norm": 0.96875,
"learning_rate": 0.00012160961184469586,
"loss": 0.4386,
"step": 7195
},
{
"epoch": 1.7332691381800673,
"grad_norm": 0.92578125,
"learning_rate": 0.0001215051832675433,
"loss": 0.4611,
"step": 7200
},
{
"epoch": 1.7344727973038037,
"grad_norm": 0.8984375,
"learning_rate": 0.00012140078702849334,
"loss": 0.4395,
"step": 7205
},
{
"epoch": 1.7356764564275398,
"grad_norm": 1.0078125,
"learning_rate": 0.00012129642329434436,
"loss": 0.4648,
"step": 7210
},
{
"epoch": 1.7368801155512759,
"grad_norm": 1.09375,
"learning_rate": 0.00012119209223184295,
"loss": 0.4223,
"step": 7215
},
{
"epoch": 1.7380837746750122,
"grad_norm": 0.99609375,
"learning_rate": 0.00012108779400768328,
"loss": 0.4612,
"step": 7220
},
{
"epoch": 1.739287433798748,
"grad_norm": 0.94921875,
"learning_rate": 0.00012098352878850726,
"loss": 0.4802,
"step": 7225
},
{
"epoch": 1.7404910929224844,
"grad_norm": 0.9453125,
"learning_rate": 0.00012087929674090398,
"loss": 0.4425,
"step": 7230
},
{
"epoch": 1.7416947520462205,
"grad_norm": 0.96875,
"learning_rate": 0.0001207750980314095,
"loss": 0.4666,
"step": 7235
},
{
"epoch": 1.7428984111699566,
"grad_norm": 1.03125,
"learning_rate": 0.00012067093282650665,
"loss": 0.4634,
"step": 7240
},
{
"epoch": 1.744102070293693,
"grad_norm": 0.87109375,
"learning_rate": 0.00012056680129262471,
"loss": 0.4579,
"step": 7245
},
{
"epoch": 1.745305729417429,
"grad_norm": 1.3515625,
"learning_rate": 0.00012046270359613924,
"loss": 0.4597,
"step": 7250
},
{
"epoch": 1.746509388541165,
"grad_norm": 0.92578125,
"learning_rate": 0.00012035863990337164,
"loss": 0.4435,
"step": 7255
},
{
"epoch": 1.7477130476649014,
"grad_norm": 0.91015625,
"learning_rate": 0.00012025461038058895,
"loss": 0.455,
"step": 7260
},
{
"epoch": 1.7489167067886373,
"grad_norm": 0.97265625,
"learning_rate": 0.00012015061519400376,
"loss": 0.4697,
"step": 7265
},
{
"epoch": 1.7501203659123736,
"grad_norm": 0.9453125,
"learning_rate": 0.00012004665450977369,
"loss": 0.4531,
"step": 7270
},
{
"epoch": 1.7513240250361097,
"grad_norm": 0.9375,
"learning_rate": 0.00011994272849400127,
"loss": 0.4369,
"step": 7275
},
{
"epoch": 1.7525276841598458,
"grad_norm": 0.9609375,
"learning_rate": 0.00011983883731273365,
"loss": 0.4385,
"step": 7280
},
{
"epoch": 1.7537313432835822,
"grad_norm": 0.93359375,
"learning_rate": 0.00011973498113196224,
"loss": 0.4429,
"step": 7285
},
{
"epoch": 1.7549350024073183,
"grad_norm": 0.96875,
"learning_rate": 0.00011963116011762266,
"loss": 0.4427,
"step": 7290
},
{
"epoch": 1.7561386615310544,
"grad_norm": 0.88671875,
"learning_rate": 0.00011952737443559425,
"loss": 0.4358,
"step": 7295
},
{
"epoch": 1.7573423206547907,
"grad_norm": 0.9609375,
"learning_rate": 0.0001194236242516999,
"loss": 0.4282,
"step": 7300
},
{
"epoch": 1.7585459797785266,
"grad_norm": 0.90625,
"learning_rate": 0.00011931990973170589,
"loss": 0.4479,
"step": 7305
},
{
"epoch": 1.7597496389022629,
"grad_norm": 0.97265625,
"learning_rate": 0.00011921623104132133,
"loss": 0.4801,
"step": 7310
},
{
"epoch": 1.7609532980259992,
"grad_norm": 0.8671875,
"learning_rate": 0.0001191125883461983,
"loss": 0.4263,
"step": 7315
},
{
"epoch": 1.762156957149735,
"grad_norm": 0.98046875,
"learning_rate": 0.00011900898181193111,
"loss": 0.4402,
"step": 7320
},
{
"epoch": 1.7633606162734714,
"grad_norm": 0.91796875,
"learning_rate": 0.00011890541160405657,
"loss": 0.475,
"step": 7325
},
{
"epoch": 1.7645642753972075,
"grad_norm": 0.96875,
"learning_rate": 0.0001188018778880533,
"loss": 0.4402,
"step": 7330
},
{
"epoch": 1.7657679345209436,
"grad_norm": 0.91015625,
"learning_rate": 0.0001186983808293416,
"loss": 0.478,
"step": 7335
},
{
"epoch": 1.76697159364468,
"grad_norm": 0.953125,
"learning_rate": 0.00011859492059328326,
"loss": 0.4553,
"step": 7340
},
{
"epoch": 1.768175252768416,
"grad_norm": 1.03125,
"learning_rate": 0.00011849149734518117,
"loss": 0.477,
"step": 7345
},
{
"epoch": 1.7693789118921521,
"grad_norm": 1.0078125,
"learning_rate": 0.00011838811125027922,
"loss": 0.438,
"step": 7350
},
{
"epoch": 1.7705825710158885,
"grad_norm": 0.890625,
"learning_rate": 0.00011828476247376191,
"loss": 0.4605,
"step": 7355
},
{
"epoch": 1.7717862301396243,
"grad_norm": 0.92578125,
"learning_rate": 0.00011818145118075404,
"loss": 0.4449,
"step": 7360
},
{
"epoch": 1.7729898892633607,
"grad_norm": 1.0625,
"learning_rate": 0.0001180781775363206,
"loss": 0.4812,
"step": 7365
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.9375,
"learning_rate": 0.00011797494170546634,
"loss": 0.4349,
"step": 7370
},
{
"epoch": 1.7753972075108329,
"grad_norm": 0.9609375,
"learning_rate": 0.00011787174385313575,
"loss": 0.4664,
"step": 7375
},
{
"epoch": 1.7766008666345692,
"grad_norm": 1.0390625,
"learning_rate": 0.00011776858414421245,
"loss": 0.4646,
"step": 7380
},
{
"epoch": 1.7778045257583053,
"grad_norm": 0.9921875,
"learning_rate": 0.00011766546274351928,
"loss": 0.4445,
"step": 7385
},
{
"epoch": 1.7790081848820414,
"grad_norm": 0.9296875,
"learning_rate": 0.00011756237981581779,
"loss": 0.4477,
"step": 7390
},
{
"epoch": 1.7802118440057777,
"grad_norm": 0.859375,
"learning_rate": 0.000117459335525808,
"loss": 0.4432,
"step": 7395
},
{
"epoch": 1.7814155031295136,
"grad_norm": 0.9609375,
"learning_rate": 0.00011735633003812841,
"loss": 0.4512,
"step": 7400
},
{
"epoch": 1.78261916225325,
"grad_norm": 1.0625,
"learning_rate": 0.00011725336351735521,
"loss": 0.4649,
"step": 7405
},
{
"epoch": 1.783822821376986,
"grad_norm": 0.8984375,
"learning_rate": 0.00011715043612800264,
"loss": 0.4466,
"step": 7410
},
{
"epoch": 1.785026480500722,
"grad_norm": 0.88671875,
"learning_rate": 0.00011704754803452227,
"loss": 0.4504,
"step": 7415
},
{
"epoch": 1.7862301396244584,
"grad_norm": 0.9140625,
"learning_rate": 0.00011694469940130282,
"loss": 0.4661,
"step": 7420
},
{
"epoch": 1.7874337987481945,
"grad_norm": 0.92578125,
"learning_rate": 0.0001168418903926701,
"loss": 0.4334,
"step": 7425
},
{
"epoch": 1.7886374578719306,
"grad_norm": 0.89453125,
"learning_rate": 0.00011673912117288654,
"loss": 0.451,
"step": 7430
},
{
"epoch": 1.789841116995667,
"grad_norm": 1.0546875,
"learning_rate": 0.00011663639190615098,
"loss": 0.4751,
"step": 7435
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.94921875,
"learning_rate": 0.00011653370275659851,
"loss": 0.4464,
"step": 7440
},
{
"epoch": 1.7922484352431391,
"grad_norm": 0.95703125,
"learning_rate": 0.00011643105388830002,
"loss": 0.4468,
"step": 7445
},
{
"epoch": 1.7934520943668752,
"grad_norm": 1.015625,
"learning_rate": 0.00011632844546526213,
"loss": 0.4679,
"step": 7450
},
{
"epoch": 1.7946557534906113,
"grad_norm": 0.9609375,
"learning_rate": 0.00011622587765142672,
"loss": 0.4461,
"step": 7455
},
{
"epoch": 1.7958594126143477,
"grad_norm": 1.03125,
"learning_rate": 0.00011612335061067093,
"loss": 0.4321,
"step": 7460
},
{
"epoch": 1.7970630717380838,
"grad_norm": 0.921875,
"learning_rate": 0.00011602086450680667,
"loss": 0.454,
"step": 7465
},
{
"epoch": 1.7982667308618199,
"grad_norm": 0.984375,
"learning_rate": 0.00011591841950358047,
"loss": 0.4331,
"step": 7470
},
{
"epoch": 1.7994703899855562,
"grad_norm": 1.09375,
"learning_rate": 0.00011581601576467318,
"loss": 0.4684,
"step": 7475
},
{
"epoch": 1.8006740491092923,
"grad_norm": 0.90234375,
"learning_rate": 0.00011571365345369971,
"loss": 0.4361,
"step": 7480
},
{
"epoch": 1.8018777082330284,
"grad_norm": 0.9296875,
"learning_rate": 0.00011561133273420877,
"loss": 0.4684,
"step": 7485
},
{
"epoch": 1.8030813673567647,
"grad_norm": 0.9921875,
"learning_rate": 0.00011550905376968271,
"loss": 0.4407,
"step": 7490
},
{
"epoch": 1.8042850264805006,
"grad_norm": 1.0546875,
"learning_rate": 0.00011540681672353703,
"loss": 0.4323,
"step": 7495
},
{
"epoch": 1.805488685604237,
"grad_norm": 0.875,
"learning_rate": 0.00011530462175912039,
"loss": 0.4272,
"step": 7500
},
{
"epoch": 1.805488685604237,
"eval_loss": 0.38952553272247314,
"eval_runtime": 2.3583,
"eval_samples_per_second": 84.806,
"eval_steps_per_second": 84.806,
"step": 7500
},
{
"epoch": 1.806692344727973,
"grad_norm": 1.0,
"learning_rate": 0.00011520246903971402,
"loss": 0.4794,
"step": 7505
},
{
"epoch": 1.8078960038517091,
"grad_norm": 0.95703125,
"learning_rate": 0.00011510035872853193,
"loss": 0.4363,
"step": 7510
},
{
"epoch": 1.8090996629754454,
"grad_norm": 1.015625,
"learning_rate": 0.00011499829098872011,
"loss": 0.453,
"step": 7515
},
{
"epoch": 1.8103033220991815,
"grad_norm": 0.93359375,
"learning_rate": 0.0001148962659833567,
"loss": 0.4377,
"step": 7520
},
{
"epoch": 1.8115069812229176,
"grad_norm": 1.0078125,
"learning_rate": 0.0001147942838754515,
"loss": 0.4275,
"step": 7525
},
{
"epoch": 1.812710640346654,
"grad_norm": 0.9375,
"learning_rate": 0.00011469234482794574,
"loss": 0.4642,
"step": 7530
},
{
"epoch": 1.8139142994703898,
"grad_norm": 0.94140625,
"learning_rate": 0.0001145904490037119,
"loss": 0.4863,
"step": 7535
},
{
"epoch": 1.8151179585941262,
"grad_norm": 1.09375,
"learning_rate": 0.00011448859656555343,
"loss": 0.4618,
"step": 7540
},
{
"epoch": 1.8163216177178623,
"grad_norm": 0.96484375,
"learning_rate": 0.00011438678767620438,
"loss": 0.4663,
"step": 7545
},
{
"epoch": 1.8175252768415984,
"grad_norm": 0.91015625,
"learning_rate": 0.00011428502249832926,
"loss": 0.4405,
"step": 7550
},
{
"epoch": 1.8187289359653347,
"grad_norm": 1.0,
"learning_rate": 0.00011418330119452268,
"loss": 0.4922,
"step": 7555
},
{
"epoch": 1.8199325950890708,
"grad_norm": 0.98828125,
"learning_rate": 0.00011408162392730925,
"loss": 0.4401,
"step": 7560
},
{
"epoch": 1.821136254212807,
"grad_norm": 0.95703125,
"learning_rate": 0.00011397999085914326,
"loss": 0.4526,
"step": 7565
},
{
"epoch": 1.8223399133365432,
"grad_norm": 0.91015625,
"learning_rate": 0.0001138784021524082,
"loss": 0.4366,
"step": 7570
},
{
"epoch": 1.823543572460279,
"grad_norm": 1.046875,
"learning_rate": 0.00011377685796941681,
"loss": 0.4534,
"step": 7575
},
{
"epoch": 1.8247472315840154,
"grad_norm": 0.95703125,
"learning_rate": 0.00011367535847241065,
"loss": 0.4468,
"step": 7580
},
{
"epoch": 1.8259508907077515,
"grad_norm": 0.9296875,
"learning_rate": 0.00011357390382355994,
"loss": 0.4426,
"step": 7585
},
{
"epoch": 1.8271545498314876,
"grad_norm": 1.078125,
"learning_rate": 0.00011347249418496313,
"loss": 0.4515,
"step": 7590
},
{
"epoch": 1.828358208955224,
"grad_norm": 0.90625,
"learning_rate": 0.00011337112971864687,
"loss": 0.4689,
"step": 7595
},
{
"epoch": 1.82956186807896,
"grad_norm": 0.9609375,
"learning_rate": 0.00011326981058656562,
"loss": 0.4258,
"step": 7600
},
{
"epoch": 1.8307655272026961,
"grad_norm": 1.0234375,
"learning_rate": 0.00011316853695060129,
"loss": 0.4449,
"step": 7605
},
{
"epoch": 1.8319691863264325,
"grad_norm": 0.8984375,
"learning_rate": 0.0001130673089725633,
"loss": 0.4597,
"step": 7610
},
{
"epoch": 1.8331728454501686,
"grad_norm": 0.82421875,
"learning_rate": 0.00011296612681418791,
"loss": 0.4413,
"step": 7615
},
{
"epoch": 1.8343765045739047,
"grad_norm": 0.984375,
"learning_rate": 0.00011286499063713833,
"loss": 0.4589,
"step": 7620
},
{
"epoch": 1.835580163697641,
"grad_norm": 1.046875,
"learning_rate": 0.00011276390060300422,
"loss": 0.4314,
"step": 7625
},
{
"epoch": 1.8367838228213769,
"grad_norm": 0.98046875,
"learning_rate": 0.00011266285687330156,
"loss": 0.4639,
"step": 7630
},
{
"epoch": 1.8379874819451132,
"grad_norm": 1.0625,
"learning_rate": 0.00011256185960947234,
"loss": 0.4339,
"step": 7635
},
{
"epoch": 1.8391911410688493,
"grad_norm": 0.94921875,
"learning_rate": 0.00011246090897288423,
"loss": 0.4705,
"step": 7640
},
{
"epoch": 1.8403948001925854,
"grad_norm": 0.9375,
"learning_rate": 0.00011236000512483051,
"loss": 0.4569,
"step": 7645
},
{
"epoch": 1.8415984593163217,
"grad_norm": 0.984375,
"learning_rate": 0.00011225914822652971,
"loss": 0.4694,
"step": 7650
},
{
"epoch": 1.8428021184400578,
"grad_norm": 0.97265625,
"learning_rate": 0.00011215833843912521,
"loss": 0.4501,
"step": 7655
},
{
"epoch": 1.844005777563794,
"grad_norm": 1.0625,
"learning_rate": 0.00011205757592368529,
"loss": 0.4504,
"step": 7660
},
{
"epoch": 1.8452094366875302,
"grad_norm": 1.0078125,
"learning_rate": 0.00011195686084120253,
"loss": 0.4722,
"step": 7665
},
{
"epoch": 1.8464130958112661,
"grad_norm": 0.97265625,
"learning_rate": 0.00011185619335259387,
"loss": 0.4609,
"step": 7670
},
{
"epoch": 1.8476167549350024,
"grad_norm": 0.88671875,
"learning_rate": 0.00011175557361870016,
"loss": 0.4594,
"step": 7675
},
{
"epoch": 1.8488204140587385,
"grad_norm": 0.89453125,
"learning_rate": 0.00011165500180028593,
"loss": 0.4189,
"step": 7680
},
{
"epoch": 1.8500240731824746,
"grad_norm": 0.984375,
"learning_rate": 0.00011155447805803916,
"loss": 0.4582,
"step": 7685
},
{
"epoch": 1.851227732306211,
"grad_norm": 1.0703125,
"learning_rate": 0.00011145400255257098,
"loss": 0.4728,
"step": 7690
},
{
"epoch": 1.852431391429947,
"grad_norm": 0.90625,
"learning_rate": 0.00011135357544441552,
"loss": 0.421,
"step": 7695
},
{
"epoch": 1.8536350505536832,
"grad_norm": 1.0,
"learning_rate": 0.00011125319689402963,
"loss": 0.4548,
"step": 7700
},
{
"epoch": 1.8548387096774195,
"grad_norm": 1.015625,
"learning_rate": 0.0001111528670617924,
"loss": 0.4404,
"step": 7705
},
{
"epoch": 1.8560423688011554,
"grad_norm": 0.88671875,
"learning_rate": 0.00011105258610800524,
"loss": 0.4346,
"step": 7710
},
{
"epoch": 1.8572460279248917,
"grad_norm": 0.99609375,
"learning_rate": 0.00011095235419289132,
"loss": 0.4626,
"step": 7715
},
{
"epoch": 1.8584496870486278,
"grad_norm": 1.015625,
"learning_rate": 0.00011085217147659563,
"loss": 0.457,
"step": 7720
},
{
"epoch": 1.8596533461723639,
"grad_norm": 0.99609375,
"learning_rate": 0.00011075203811918447,
"loss": 0.4359,
"step": 7725
},
{
"epoch": 1.8608570052961002,
"grad_norm": 1.0546875,
"learning_rate": 0.00011065195428064525,
"loss": 0.4443,
"step": 7730
},
{
"epoch": 1.8620606644198363,
"grad_norm": 1.0078125,
"learning_rate": 0.0001105519201208863,
"loss": 0.4684,
"step": 7735
},
{
"epoch": 1.8632643235435724,
"grad_norm": 0.88671875,
"learning_rate": 0.00011045193579973652,
"loss": 0.4472,
"step": 7740
},
{
"epoch": 1.8644679826673087,
"grad_norm": 0.90234375,
"learning_rate": 0.00011035200147694524,
"loss": 0.4154,
"step": 7745
},
{
"epoch": 1.8656716417910446,
"grad_norm": 1.0,
"learning_rate": 0.00011025211731218196,
"loss": 0.4633,
"step": 7750
},
{
"epoch": 1.866875300914781,
"grad_norm": 1.015625,
"learning_rate": 0.00011015228346503588,
"loss": 0.4339,
"step": 7755
},
{
"epoch": 1.8680789600385173,
"grad_norm": 1.125,
"learning_rate": 0.00011005250009501595,
"loss": 0.4509,
"step": 7760
},
{
"epoch": 1.8692826191622531,
"grad_norm": 0.96484375,
"learning_rate": 0.0001099527673615504,
"loss": 0.44,
"step": 7765
},
{
"epoch": 1.8704862782859895,
"grad_norm": 1.0078125,
"learning_rate": 0.00010985308542398652,
"loss": 0.4388,
"step": 7770
},
{
"epoch": 1.8716899374097256,
"grad_norm": 0.90234375,
"learning_rate": 0.0001097534544415906,
"loss": 0.4406,
"step": 7775
},
{
"epoch": 1.8728935965334617,
"grad_norm": 1.0390625,
"learning_rate": 0.00010965387457354735,
"loss": 0.4299,
"step": 7780
},
{
"epoch": 1.874097255657198,
"grad_norm": 0.95703125,
"learning_rate": 0.00010955434597895985,
"loss": 0.4439,
"step": 7785
},
{
"epoch": 1.875300914780934,
"grad_norm": 0.9921875,
"learning_rate": 0.0001094548688168493,
"loss": 0.4348,
"step": 7790
},
{
"epoch": 1.8765045739046702,
"grad_norm": 0.9609375,
"learning_rate": 0.00010935544324615469,
"loss": 0.4848,
"step": 7795
},
{
"epoch": 1.8777082330284065,
"grad_norm": 1.046875,
"learning_rate": 0.00010925606942573264,
"loss": 0.4538,
"step": 7800
},
{
"epoch": 1.8789118921521424,
"grad_norm": 0.95703125,
"learning_rate": 0.00010915674751435698,
"loss": 0.4778,
"step": 7805
},
{
"epoch": 1.8801155512758787,
"grad_norm": 0.95703125,
"learning_rate": 0.00010905747767071873,
"loss": 0.4506,
"step": 7810
},
{
"epoch": 1.8813192103996148,
"grad_norm": 0.9375,
"learning_rate": 0.0001089582600534256,
"loss": 0.4254,
"step": 7815
},
{
"epoch": 1.882522869523351,
"grad_norm": 1.0234375,
"learning_rate": 0.00010885909482100192,
"loss": 0.4755,
"step": 7820
},
{
"epoch": 1.8837265286470872,
"grad_norm": 0.98046875,
"learning_rate": 0.0001087599821318883,
"loss": 0.4571,
"step": 7825
},
{
"epoch": 1.8849301877708233,
"grad_norm": 0.94140625,
"learning_rate": 0.0001086609221444414,
"loss": 0.4718,
"step": 7830
},
{
"epoch": 1.8861338468945594,
"grad_norm": 1.0703125,
"learning_rate": 0.00010856191501693376,
"loss": 0.4541,
"step": 7835
},
{
"epoch": 1.8873375060182958,
"grad_norm": 0.94140625,
"learning_rate": 0.00010846296090755331,
"loss": 0.4599,
"step": 7840
},
{
"epoch": 1.8885411651420316,
"grad_norm": 0.953125,
"learning_rate": 0.00010836405997440341,
"loss": 0.4535,
"step": 7845
},
{
"epoch": 1.889744824265768,
"grad_norm": 1.0546875,
"learning_rate": 0.00010826521237550231,
"loss": 0.4192,
"step": 7850
},
{
"epoch": 1.890948483389504,
"grad_norm": 0.99609375,
"learning_rate": 0.0001081664182687832,
"loss": 0.4407,
"step": 7855
},
{
"epoch": 1.8921521425132402,
"grad_norm": 0.98046875,
"learning_rate": 0.00010806767781209375,
"loss": 0.4233,
"step": 7860
},
{
"epoch": 1.8933558016369765,
"grad_norm": 0.9140625,
"learning_rate": 0.00010796899116319585,
"loss": 0.4512,
"step": 7865
},
{
"epoch": 1.8945594607607126,
"grad_norm": 0.91796875,
"learning_rate": 0.00010787035847976552,
"loss": 0.4226,
"step": 7870
},
{
"epoch": 1.8957631198844487,
"grad_norm": 0.9375,
"learning_rate": 0.00010777177991939242,
"loss": 0.4644,
"step": 7875
},
{
"epoch": 1.896966779008185,
"grad_norm": 0.86328125,
"learning_rate": 0.0001076732556395799,
"loss": 0.3976,
"step": 7880
},
{
"epoch": 1.8981704381319209,
"grad_norm": 0.98046875,
"learning_rate": 0.00010757478579774447,
"loss": 0.4365,
"step": 7885
},
{
"epoch": 1.8993740972556572,
"grad_norm": 0.8984375,
"learning_rate": 0.00010747637055121569,
"loss": 0.4584,
"step": 7890
},
{
"epoch": 1.9005777563793933,
"grad_norm": 0.9140625,
"learning_rate": 0.00010737801005723593,
"loss": 0.4546,
"step": 7895
},
{
"epoch": 1.9017814155031294,
"grad_norm": 0.88671875,
"learning_rate": 0.00010727970447295998,
"loss": 0.4129,
"step": 7900
},
{
"epoch": 1.9029850746268657,
"grad_norm": 0.99609375,
"learning_rate": 0.00010718145395545498,
"loss": 0.4565,
"step": 7905
},
{
"epoch": 1.9041887337506018,
"grad_norm": 0.96875,
"learning_rate": 0.00010708325866170012,
"loss": 0.4336,
"step": 7910
},
{
"epoch": 1.905392392874338,
"grad_norm": 0.9375,
"learning_rate": 0.00010698511874858627,
"loss": 0.4316,
"step": 7915
},
{
"epoch": 1.9065960519980742,
"grad_norm": 0.96484375,
"learning_rate": 0.00010688703437291589,
"loss": 0.4559,
"step": 7920
},
{
"epoch": 1.9077997111218103,
"grad_norm": 1.046875,
"learning_rate": 0.0001067890056914026,
"loss": 0.4874,
"step": 7925
},
{
"epoch": 1.9090033702455464,
"grad_norm": 1.0078125,
"learning_rate": 0.00010669103286067112,
"loss": 0.4448,
"step": 7930
},
{
"epoch": 1.9102070293692828,
"grad_norm": 1.0078125,
"learning_rate": 0.00010659311603725699,
"loss": 0.4421,
"step": 7935
},
{
"epoch": 1.9114106884930187,
"grad_norm": 0.890625,
"learning_rate": 0.0001064952553776061,
"loss": 0.4267,
"step": 7940
},
{
"epoch": 1.912614347616755,
"grad_norm": 0.98046875,
"learning_rate": 0.00010639745103807476,
"loss": 0.4584,
"step": 7945
},
{
"epoch": 1.913818006740491,
"grad_norm": 0.91796875,
"learning_rate": 0.00010629970317492917,
"loss": 0.4249,
"step": 7950
},
{
"epoch": 1.9150216658642272,
"grad_norm": 1.0625,
"learning_rate": 0.00010620201194434547,
"loss": 0.4511,
"step": 7955
},
{
"epoch": 1.9162253249879635,
"grad_norm": 0.953125,
"learning_rate": 0.00010610437750240909,
"loss": 0.4595,
"step": 7960
},
{
"epoch": 1.9174289841116996,
"grad_norm": 0.93359375,
"learning_rate": 0.00010600680000511486,
"loss": 0.4414,
"step": 7965
},
{
"epoch": 1.9186326432354357,
"grad_norm": 0.84765625,
"learning_rate": 0.00010590927960836667,
"loss": 0.4072,
"step": 7970
},
{
"epoch": 1.919836302359172,
"grad_norm": 0.90234375,
"learning_rate": 0.00010581181646797702,
"loss": 0.4293,
"step": 7975
},
{
"epoch": 1.921039961482908,
"grad_norm": 1.078125,
"learning_rate": 0.00010571441073966709,
"loss": 0.4366,
"step": 7980
},
{
"epoch": 1.9222436206066442,
"grad_norm": 0.93359375,
"learning_rate": 0.00010561706257906627,
"loss": 0.4763,
"step": 7985
},
{
"epoch": 1.9234472797303803,
"grad_norm": 0.93359375,
"learning_rate": 0.00010551977214171191,
"loss": 0.4426,
"step": 7990
},
{
"epoch": 1.9246509388541164,
"grad_norm": 0.94921875,
"learning_rate": 0.00010542253958304926,
"loss": 0.4527,
"step": 7995
},
{
"epoch": 1.9258545979778527,
"grad_norm": 0.97265625,
"learning_rate": 0.00010532536505843094,
"loss": 0.4288,
"step": 8000
},
{
"epoch": 1.9258545979778527,
"eval_loss": 0.37830567359924316,
"eval_runtime": 2.3611,
"eval_samples_per_second": 84.707,
"eval_steps_per_second": 84.707,
"step": 8000
},
{
"epoch": 1.9270582571015888,
"grad_norm": 1.03125,
"learning_rate": 0.00010522824872311702,
"loss": 0.4604,
"step": 8005
},
{
"epoch": 1.928261916225325,
"grad_norm": 0.9453125,
"learning_rate": 0.00010513119073227441,
"loss": 0.422,
"step": 8010
},
{
"epoch": 1.9294655753490613,
"grad_norm": 1.03125,
"learning_rate": 0.000105034191240977,
"loss": 0.4488,
"step": 8015
},
{
"epoch": 1.9306692344727971,
"grad_norm": 1.0625,
"learning_rate": 0.00010493725040420506,
"loss": 0.4584,
"step": 8020
},
{
"epoch": 1.9318728935965335,
"grad_norm": 0.93359375,
"learning_rate": 0.00010484036837684515,
"loss": 0.4765,
"step": 8025
},
{
"epoch": 1.9330765527202696,
"grad_norm": 0.9453125,
"learning_rate": 0.00010474354531368998,
"loss": 0.4796,
"step": 8030
},
{
"epoch": 1.9342802118440057,
"grad_norm": 0.98828125,
"learning_rate": 0.00010464678136943798,
"loss": 0.4063,
"step": 8035
},
{
"epoch": 1.935483870967742,
"grad_norm": 1.0390625,
"learning_rate": 0.00010455007669869309,
"loss": 0.4747,
"step": 8040
},
{
"epoch": 1.936687530091478,
"grad_norm": 1.046875,
"learning_rate": 0.00010445343145596464,
"loss": 0.4296,
"step": 8045
},
{
"epoch": 1.9378911892152142,
"grad_norm": 0.96484375,
"learning_rate": 0.00010435684579566686,
"loss": 0.4419,
"step": 8050
},
{
"epoch": 1.9390948483389505,
"grad_norm": 0.9453125,
"learning_rate": 0.000104260319872119,
"loss": 0.4341,
"step": 8055
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.921875,
"learning_rate": 0.00010416385383954461,
"loss": 0.4469,
"step": 8060
},
{
"epoch": 1.9415021665864227,
"grad_norm": 0.94921875,
"learning_rate": 0.00010406744785207181,
"loss": 0.4366,
"step": 8065
},
{
"epoch": 1.942705825710159,
"grad_norm": 0.92578125,
"learning_rate": 0.00010397110206373257,
"loss": 0.4174,
"step": 8070
},
{
"epoch": 1.943909484833895,
"grad_norm": 0.9765625,
"learning_rate": 0.00010387481662846276,
"loss": 0.4326,
"step": 8075
},
{
"epoch": 1.9451131439576312,
"grad_norm": 1.0625,
"learning_rate": 0.00010377859170010186,
"loss": 0.4171,
"step": 8080
},
{
"epoch": 1.9463168030813673,
"grad_norm": 0.95703125,
"learning_rate": 0.0001036824274323926,
"loss": 0.3963,
"step": 8085
},
{
"epoch": 1.9475204622051034,
"grad_norm": 0.89453125,
"learning_rate": 0.00010358632397898084,
"loss": 0.4128,
"step": 8090
},
{
"epoch": 1.9487241213288398,
"grad_norm": 0.84375,
"learning_rate": 0.00010349028149341532,
"loss": 0.4213,
"step": 8095
},
{
"epoch": 1.9499277804525759,
"grad_norm": 1.1484375,
"learning_rate": 0.00010339430012914721,
"loss": 0.4529,
"step": 8100
},
{
"epoch": 1.951131439576312,
"grad_norm": 0.89453125,
"learning_rate": 0.00010329838003953023,
"loss": 0.4824,
"step": 8105
},
{
"epoch": 1.9523350987000483,
"grad_norm": 0.92578125,
"learning_rate": 0.00010320252137781999,
"loss": 0.4271,
"step": 8110
},
{
"epoch": 1.9535387578237842,
"grad_norm": 0.9296875,
"learning_rate": 0.00010310672429717416,
"loss": 0.4412,
"step": 8115
},
{
"epoch": 1.9547424169475205,
"grad_norm": 0.8828125,
"learning_rate": 0.00010301098895065184,
"loss": 0.4195,
"step": 8120
},
{
"epoch": 1.9559460760712566,
"grad_norm": 1.03125,
"learning_rate": 0.00010291531549121358,
"loss": 0.4446,
"step": 8125
},
{
"epoch": 1.9571497351949927,
"grad_norm": 1.1015625,
"learning_rate": 0.0001028197040717211,
"loss": 0.4385,
"step": 8130
},
{
"epoch": 1.958353394318729,
"grad_norm": 1.0078125,
"learning_rate": 0.00010272415484493687,
"loss": 0.4667,
"step": 8135
},
{
"epoch": 1.9595570534424651,
"grad_norm": 0.90625,
"learning_rate": 0.00010262866796352407,
"loss": 0.4352,
"step": 8140
},
{
"epoch": 1.9607607125662012,
"grad_norm": 0.9921875,
"learning_rate": 0.00010253324358004631,
"loss": 0.4366,
"step": 8145
},
{
"epoch": 1.9619643716899375,
"grad_norm": 1.0546875,
"learning_rate": 0.00010243788184696724,
"loss": 0.4098,
"step": 8150
},
{
"epoch": 1.9631680308136734,
"grad_norm": 0.95703125,
"learning_rate": 0.00010234258291665051,
"loss": 0.4504,
"step": 8155
},
{
"epoch": 1.9643716899374097,
"grad_norm": 0.9609375,
"learning_rate": 0.00010224734694135932,
"loss": 0.4285,
"step": 8160
},
{
"epoch": 1.9655753490611458,
"grad_norm": 0.9921875,
"learning_rate": 0.0001021521740732564,
"loss": 0.4287,
"step": 8165
},
{
"epoch": 1.966779008184882,
"grad_norm": 0.94140625,
"learning_rate": 0.00010205706446440356,
"loss": 0.433,
"step": 8170
},
{
"epoch": 1.9679826673086183,
"grad_norm": 0.9765625,
"learning_rate": 0.0001019620182667616,
"loss": 0.4459,
"step": 8175
},
{
"epoch": 1.9691863264323544,
"grad_norm": 0.84375,
"learning_rate": 0.00010186703563218998,
"loss": 0.4676,
"step": 8180
},
{
"epoch": 1.9703899855560905,
"grad_norm": 0.99609375,
"learning_rate": 0.00010177211671244654,
"loss": 0.4462,
"step": 8185
},
{
"epoch": 1.9715936446798268,
"grad_norm": 0.9921875,
"learning_rate": 0.00010167726165918744,
"loss": 0.4411,
"step": 8190
},
{
"epoch": 1.9727973038035629,
"grad_norm": 0.89453125,
"learning_rate": 0.00010158247062396676,
"loss": 0.4275,
"step": 8195
},
{
"epoch": 1.974000962927299,
"grad_norm": 0.9375,
"learning_rate": 0.0001014877437582362,
"loss": 0.4327,
"step": 8200
},
{
"epoch": 1.9752046220510353,
"grad_norm": 1.0078125,
"learning_rate": 0.00010139308121334507,
"loss": 0.4685,
"step": 8205
},
{
"epoch": 1.9764082811747712,
"grad_norm": 1.0078125,
"learning_rate": 0.00010129848314053981,
"loss": 0.4642,
"step": 8210
},
{
"epoch": 1.9776119402985075,
"grad_norm": 0.92578125,
"learning_rate": 0.0001012039496909639,
"loss": 0.4128,
"step": 8215
},
{
"epoch": 1.9788155994222436,
"grad_norm": 0.9296875,
"learning_rate": 0.00010110948101565761,
"loss": 0.4428,
"step": 8220
},
{
"epoch": 1.9800192585459797,
"grad_norm": 0.8671875,
"learning_rate": 0.00010101507726555761,
"loss": 0.4139,
"step": 8225
},
{
"epoch": 1.981222917669716,
"grad_norm": 0.94921875,
"learning_rate": 0.00010092073859149691,
"loss": 0.4373,
"step": 8230
},
{
"epoch": 1.9824265767934521,
"grad_norm": 0.94921875,
"learning_rate": 0.00010082646514420448,
"loss": 0.4306,
"step": 8235
},
{
"epoch": 1.9836302359171882,
"grad_norm": 0.90234375,
"learning_rate": 0.00010073225707430519,
"loss": 0.4366,
"step": 8240
},
{
"epoch": 1.9848338950409246,
"grad_norm": 0.9375,
"learning_rate": 0.00010063811453231937,
"loss": 0.429,
"step": 8245
},
{
"epoch": 1.9860375541646604,
"grad_norm": 0.94921875,
"learning_rate": 0.00010054403766866263,
"loss": 0.4654,
"step": 8250
},
{
"epoch": 1.9872412132883968,
"grad_norm": 0.90234375,
"learning_rate": 0.00010045002663364573,
"loss": 0.4301,
"step": 8255
},
{
"epoch": 1.9884448724121329,
"grad_norm": 0.9609375,
"learning_rate": 0.00010035608157747416,
"loss": 0.4344,
"step": 8260
},
{
"epoch": 1.989648531535869,
"grad_norm": 0.89453125,
"learning_rate": 0.00010026220265024805,
"loss": 0.4339,
"step": 8265
},
{
"epoch": 1.9908521906596053,
"grad_norm": 0.94921875,
"learning_rate": 0.0001001683900019619,
"loss": 0.4508,
"step": 8270
},
{
"epoch": 1.9920558497833414,
"grad_norm": 0.9140625,
"learning_rate": 0.00010007464378250427,
"loss": 0.4365,
"step": 8275
},
{
"epoch": 1.9932595089070775,
"grad_norm": 1.015625,
"learning_rate": 9.99809641416575e-05,
"loss": 0.4448,
"step": 8280
},
{
"epoch": 1.9944631680308138,
"grad_norm": 0.97265625,
"learning_rate": 9.988735122909773e-05,
"loss": 0.4563,
"step": 8285
},
{
"epoch": 1.9956668271545497,
"grad_norm": 1.0078125,
"learning_rate": 9.979380519439437e-05,
"loss": 0.447,
"step": 8290
},
{
"epoch": 1.996870486278286,
"grad_norm": 0.92578125,
"learning_rate": 9.970032618700996e-05,
"loss": 0.4359,
"step": 8295
},
{
"epoch": 1.998074145402022,
"grad_norm": 0.91796875,
"learning_rate": 9.960691435630003e-05,
"loss": 0.4056,
"step": 8300
},
{
"epoch": 1.9992778045257582,
"grad_norm": 0.9375,
"learning_rate": 9.951356985151279e-05,
"loss": 0.4193,
"step": 8305
},
{
"epoch": 1.9995185363505055,
"eval_loss": 0.3733465075492859,
"eval_runtime": 2.373,
"eval_samples_per_second": 84.283,
"eval_steps_per_second": 84.283,
"step": 8306
},
{
"epoch": 2.0004814636494945,
"grad_norm": 0.94140625,
"learning_rate": 9.942029282178871e-05,
"loss": 0.4239,
"step": 8310
},
{
"epoch": 2.0016851227732304,
"grad_norm": 0.875,
"learning_rate": 9.932708341616069e-05,
"loss": 0.3746,
"step": 8315
},
{
"epoch": 2.0028887818969667,
"grad_norm": 0.87109375,
"learning_rate": 9.92339417835534e-05,
"loss": 0.4221,
"step": 8320
},
{
"epoch": 2.004092441020703,
"grad_norm": 0.94140625,
"learning_rate": 9.914086807278328e-05,
"loss": 0.3929,
"step": 8325
},
{
"epoch": 2.005296100144439,
"grad_norm": 0.94140625,
"learning_rate": 9.904786243255833e-05,
"loss": 0.3775,
"step": 8330
},
{
"epoch": 2.0064997592681753,
"grad_norm": 0.91015625,
"learning_rate": 9.895492501147768e-05,
"loss": 0.4045,
"step": 8335
},
{
"epoch": 2.0077034183919116,
"grad_norm": 0.9765625,
"learning_rate": 9.88620559580315e-05,
"loss": 0.4301,
"step": 8340
},
{
"epoch": 2.0089070775156475,
"grad_norm": 0.8359375,
"learning_rate": 9.876925542060069e-05,
"loss": 0.3864,
"step": 8345
},
{
"epoch": 2.0101107366393838,
"grad_norm": 0.90234375,
"learning_rate": 9.867652354745677e-05,
"loss": 0.383,
"step": 8350
},
{
"epoch": 2.01131439576312,
"grad_norm": 0.92578125,
"learning_rate": 9.858386048676152e-05,
"loss": 0.4031,
"step": 8355
},
{
"epoch": 2.012518054886856,
"grad_norm": 0.94921875,
"learning_rate": 9.84912663865667e-05,
"loss": 0.3837,
"step": 8360
},
{
"epoch": 2.0137217140105923,
"grad_norm": 0.91015625,
"learning_rate": 9.8398741394814e-05,
"loss": 0.3913,
"step": 8365
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.8984375,
"learning_rate": 9.830628565933458e-05,
"loss": 0.3777,
"step": 8370
},
{
"epoch": 2.0161290322580645,
"grad_norm": 0.94921875,
"learning_rate": 9.821389932784905e-05,
"loss": 0.405,
"step": 8375
},
{
"epoch": 2.017332691381801,
"grad_norm": 0.86328125,
"learning_rate": 9.81215825479671e-05,
"loss": 0.3561,
"step": 8380
},
{
"epoch": 2.0185363505055367,
"grad_norm": 0.98828125,
"learning_rate": 9.802933546718724e-05,
"loss": 0.4055,
"step": 8385
},
{
"epoch": 2.019740009629273,
"grad_norm": 0.96875,
"learning_rate": 9.793715823289667e-05,
"loss": 0.3848,
"step": 8390
},
{
"epoch": 2.0209436687530093,
"grad_norm": 0.9453125,
"learning_rate": 9.784505099237094e-05,
"loss": 0.3719,
"step": 8395
},
{
"epoch": 2.0221473278767452,
"grad_norm": 0.95703125,
"learning_rate": 9.775301389277384e-05,
"loss": 0.392,
"step": 8400
},
{
"epoch": 2.0233509870004815,
"grad_norm": 0.91015625,
"learning_rate": 9.766104708115711e-05,
"loss": 0.4121,
"step": 8405
},
{
"epoch": 2.0245546461242174,
"grad_norm": 0.93359375,
"learning_rate": 9.756915070446007e-05,
"loss": 0.3852,
"step": 8410
},
{
"epoch": 2.0257583052479537,
"grad_norm": 0.9140625,
"learning_rate": 9.747732490950962e-05,
"loss": 0.3714,
"step": 8415
},
{
"epoch": 2.02696196437169,
"grad_norm": 0.8984375,
"learning_rate": 9.73855698430198e-05,
"loss": 0.3795,
"step": 8420
},
{
"epoch": 2.028165623495426,
"grad_norm": 0.90234375,
"learning_rate": 9.729388565159167e-05,
"loss": 0.4234,
"step": 8425
},
{
"epoch": 2.0293692826191623,
"grad_norm": 0.87890625,
"learning_rate": 9.720227248171316e-05,
"loss": 0.3845,
"step": 8430
},
{
"epoch": 2.0305729417428986,
"grad_norm": 0.8984375,
"learning_rate": 9.711073047975856e-05,
"loss": 0.3976,
"step": 8435
},
{
"epoch": 2.0317766008666345,
"grad_norm": 1.0234375,
"learning_rate": 9.70192597919885e-05,
"loss": 0.4157,
"step": 8440
},
{
"epoch": 2.032980259990371,
"grad_norm": 0.9140625,
"learning_rate": 9.692786056454974e-05,
"loss": 0.408,
"step": 8445
},
{
"epoch": 2.0341839191141067,
"grad_norm": 1.1171875,
"learning_rate": 9.683653294347478e-05,
"loss": 0.4,
"step": 8450
},
{
"epoch": 2.035387578237843,
"grad_norm": 0.85546875,
"learning_rate": 9.674527707468178e-05,
"loss": 0.388,
"step": 8455
},
{
"epoch": 2.0365912373615793,
"grad_norm": 0.99609375,
"learning_rate": 9.665409310397418e-05,
"loss": 0.3848,
"step": 8460
},
{
"epoch": 2.037794896485315,
"grad_norm": 0.9140625,
"learning_rate": 9.656298117704064e-05,
"loss": 0.3957,
"step": 8465
},
{
"epoch": 2.0389985556090515,
"grad_norm": 0.96875,
"learning_rate": 9.647194143945462e-05,
"loss": 0.3974,
"step": 8470
},
{
"epoch": 2.040202214732788,
"grad_norm": 0.94921875,
"learning_rate": 9.638097403667431e-05,
"loss": 0.3817,
"step": 8475
},
{
"epoch": 2.0414058738565237,
"grad_norm": 0.984375,
"learning_rate": 9.629007911404229e-05,
"loss": 0.378,
"step": 8480
},
{
"epoch": 2.04260953298026,
"grad_norm": 0.94140625,
"learning_rate": 9.619925681678533e-05,
"loss": 0.397,
"step": 8485
},
{
"epoch": 2.043813192103996,
"grad_norm": 0.9140625,
"learning_rate": 9.610850729001423e-05,
"loss": 0.4074,
"step": 8490
},
{
"epoch": 2.0450168512277322,
"grad_norm": 0.9375,
"learning_rate": 9.601783067872345e-05,
"loss": 0.3829,
"step": 8495
},
{
"epoch": 2.0462205103514686,
"grad_norm": 1.0234375,
"learning_rate": 9.592722712779095e-05,
"loss": 0.4026,
"step": 8500
},
{
"epoch": 2.0462205103514686,
"eval_loss": 0.3732355237007141,
"eval_runtime": 2.3642,
"eval_samples_per_second": 84.595,
"eval_steps_per_second": 84.595,
"step": 8500
},
{
"epoch": 2.0474241694752044,
"grad_norm": 0.92578125,
"learning_rate": 9.583669678197794e-05,
"loss": 0.3809,
"step": 8505
},
{
"epoch": 2.0486278285989408,
"grad_norm": 0.99609375,
"learning_rate": 9.574623978592874e-05,
"loss": 0.407,
"step": 8510
},
{
"epoch": 2.049831487722677,
"grad_norm": 0.9453125,
"learning_rate": 9.565585628417047e-05,
"loss": 0.3932,
"step": 8515
},
{
"epoch": 2.051035146846413,
"grad_norm": 0.859375,
"learning_rate": 9.556554642111277e-05,
"loss": 0.4076,
"step": 8520
},
{
"epoch": 2.0522388059701493,
"grad_norm": 0.9921875,
"learning_rate": 9.547531034104769e-05,
"loss": 0.3719,
"step": 8525
},
{
"epoch": 2.0534424650938856,
"grad_norm": 0.84765625,
"learning_rate": 9.538514818814925e-05,
"loss": 0.3558,
"step": 8530
},
{
"epoch": 2.0546461242176215,
"grad_norm": 0.87890625,
"learning_rate": 9.529506010647357e-05,
"loss": 0.3985,
"step": 8535
},
{
"epoch": 2.055849783341358,
"grad_norm": 0.84765625,
"learning_rate": 9.520504623995827e-05,
"loss": 0.38,
"step": 8540
},
{
"epoch": 2.0570534424650937,
"grad_norm": 0.98828125,
"learning_rate": 9.511510673242243e-05,
"loss": 0.3802,
"step": 8545
},
{
"epoch": 2.05825710158883,
"grad_norm": 0.921875,
"learning_rate": 9.502524172756631e-05,
"loss": 0.3604,
"step": 8550
},
{
"epoch": 2.0594607607125663,
"grad_norm": 1.0078125,
"learning_rate": 9.493545136897118e-05,
"loss": 0.4151,
"step": 8555
},
{
"epoch": 2.060664419836302,
"grad_norm": 0.9609375,
"learning_rate": 9.484573580009895e-05,
"loss": 0.4033,
"step": 8560
},
{
"epoch": 2.0618680789600385,
"grad_norm": 0.94140625,
"learning_rate": 9.475609516429222e-05,
"loss": 0.3877,
"step": 8565
},
{
"epoch": 2.063071738083775,
"grad_norm": 0.97265625,
"learning_rate": 9.466652960477364e-05,
"loss": 0.3882,
"step": 8570
},
{
"epoch": 2.0642753972075107,
"grad_norm": 0.953125,
"learning_rate": 9.457703926464607e-05,
"loss": 0.3618,
"step": 8575
},
{
"epoch": 2.065479056331247,
"grad_norm": 1.0,
"learning_rate": 9.448762428689208e-05,
"loss": 0.4004,
"step": 8580
},
{
"epoch": 2.066682715454983,
"grad_norm": 0.9453125,
"learning_rate": 9.439828481437394e-05,
"loss": 0.3861,
"step": 8585
},
{
"epoch": 2.0678863745787193,
"grad_norm": 0.9375,
"learning_rate": 9.43090209898332e-05,
"loss": 0.3965,
"step": 8590
},
{
"epoch": 2.0690900337024556,
"grad_norm": 0.921875,
"learning_rate": 9.42198329558906e-05,
"loss": 0.3757,
"step": 8595
},
{
"epoch": 2.0702936928261915,
"grad_norm": 0.94140625,
"learning_rate": 9.413072085504567e-05,
"loss": 0.402,
"step": 8600
},
{
"epoch": 2.071497351949928,
"grad_norm": 0.87890625,
"learning_rate": 9.40416848296768e-05,
"loss": 0.4025,
"step": 8605
},
{
"epoch": 2.072701011073664,
"grad_norm": 0.99609375,
"learning_rate": 9.395272502204067e-05,
"loss": 0.3962,
"step": 8610
},
{
"epoch": 2.0739046701974,
"grad_norm": 0.96484375,
"learning_rate": 9.386384157427228e-05,
"loss": 0.3881,
"step": 8615
},
{
"epoch": 2.0751083293211363,
"grad_norm": 0.93359375,
"learning_rate": 9.377503462838457e-05,
"loss": 0.3697,
"step": 8620
},
{
"epoch": 2.076311988444872,
"grad_norm": 1.0859375,
"learning_rate": 9.368630432626831e-05,
"loss": 0.4251,
"step": 8625
},
{
"epoch": 2.0775156475686085,
"grad_norm": 0.95703125,
"learning_rate": 9.359765080969173e-05,
"loss": 0.3864,
"step": 8630
},
{
"epoch": 2.078719306692345,
"grad_norm": 1.046875,
"learning_rate": 9.350907422030044e-05,
"loss": 0.4006,
"step": 8635
},
{
"epoch": 2.0799229658160807,
"grad_norm": 0.87109375,
"learning_rate": 9.342057469961716e-05,
"loss": 0.3969,
"step": 8640
},
{
"epoch": 2.081126624939817,
"grad_norm": 0.8828125,
"learning_rate": 9.333215238904137e-05,
"loss": 0.3691,
"step": 8645
},
{
"epoch": 2.0823302840635534,
"grad_norm": 0.8984375,
"learning_rate": 9.324380742984934e-05,
"loss": 0.381,
"step": 8650
},
{
"epoch": 2.0835339431872892,
"grad_norm": 0.953125,
"learning_rate": 9.315553996319361e-05,
"loss": 0.373,
"step": 8655
},
{
"epoch": 2.0847376023110256,
"grad_norm": 0.8984375,
"learning_rate": 9.306735013010294e-05,
"loss": 0.3941,
"step": 8660
},
{
"epoch": 2.085941261434762,
"grad_norm": 1.03125,
"learning_rate": 9.297923807148213e-05,
"loss": 0.3726,
"step": 8665
},
{
"epoch": 2.0871449205584978,
"grad_norm": 0.9921875,
"learning_rate": 9.289120392811164e-05,
"loss": 0.3868,
"step": 8670
},
{
"epoch": 2.088348579682234,
"grad_norm": 0.9453125,
"learning_rate": 9.280324784064746e-05,
"loss": 0.3857,
"step": 8675
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.93359375,
"learning_rate": 9.271536994962086e-05,
"loss": 0.3822,
"step": 8680
},
{
"epoch": 2.0907558979297063,
"grad_norm": 0.91015625,
"learning_rate": 9.262757039543819e-05,
"loss": 0.3804,
"step": 8685
},
{
"epoch": 2.0919595570534426,
"grad_norm": 0.93359375,
"learning_rate": 9.253984931838067e-05,
"loss": 0.4042,
"step": 8690
},
{
"epoch": 2.0931632161771785,
"grad_norm": 0.9765625,
"learning_rate": 9.245220685860405e-05,
"loss": 0.3839,
"step": 8695
},
{
"epoch": 2.094366875300915,
"grad_norm": 0.98828125,
"learning_rate": 9.236464315613853e-05,
"loss": 0.3835,
"step": 8700
},
{
"epoch": 2.095570534424651,
"grad_norm": 0.8984375,
"learning_rate": 9.227715835088844e-05,
"loss": 0.3557,
"step": 8705
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.97265625,
"learning_rate": 9.21897525826321e-05,
"loss": 0.368,
"step": 8710
},
{
"epoch": 2.0979778526721233,
"grad_norm": 1.015625,
"learning_rate": 9.21024259910215e-05,
"loss": 0.406,
"step": 8715
},
{
"epoch": 2.099181511795859,
"grad_norm": 0.8984375,
"learning_rate": 9.201517871558213e-05,
"loss": 0.4126,
"step": 8720
},
{
"epoch": 2.1003851709195955,
"grad_norm": 0.91796875,
"learning_rate": 9.192801089571282e-05,
"loss": 0.4155,
"step": 8725
},
{
"epoch": 2.101588830043332,
"grad_norm": 1.015625,
"learning_rate": 9.184092267068535e-05,
"loss": 0.3865,
"step": 8730
},
{
"epoch": 2.1027924891670677,
"grad_norm": 1.03125,
"learning_rate": 9.175391417964443e-05,
"loss": 0.3882,
"step": 8735
},
{
"epoch": 2.103996148290804,
"grad_norm": 0.90625,
"learning_rate": 9.166698556160725e-05,
"loss": 0.3843,
"step": 8740
},
{
"epoch": 2.1051998074145404,
"grad_norm": 0.89453125,
"learning_rate": 9.158013695546353e-05,
"loss": 0.3773,
"step": 8745
},
{
"epoch": 2.1064034665382763,
"grad_norm": 0.92578125,
"learning_rate": 9.149336849997505e-05,
"loss": 0.3813,
"step": 8750
},
{
"epoch": 2.1076071256620126,
"grad_norm": 0.95703125,
"learning_rate": 9.140668033377559e-05,
"loss": 0.3641,
"step": 8755
},
{
"epoch": 2.1088107847857485,
"grad_norm": 0.95703125,
"learning_rate": 9.132007259537052e-05,
"loss": 0.3499,
"step": 8760
},
{
"epoch": 2.110014443909485,
"grad_norm": 0.99609375,
"learning_rate": 9.123354542313694e-05,
"loss": 0.3831,
"step": 8765
},
{
"epoch": 2.111218103033221,
"grad_norm": 0.95703125,
"learning_rate": 9.114709895532298e-05,
"loss": 0.3912,
"step": 8770
},
{
"epoch": 2.112421762156957,
"grad_norm": 1.015625,
"learning_rate": 9.1060733330048e-05,
"loss": 0.4011,
"step": 8775
},
{
"epoch": 2.1136254212806933,
"grad_norm": 0.97265625,
"learning_rate": 9.097444868530207e-05,
"loss": 0.3968,
"step": 8780
},
{
"epoch": 2.1148290804044296,
"grad_norm": 0.95703125,
"learning_rate": 9.0888245158946e-05,
"loss": 0.4062,
"step": 8785
},
{
"epoch": 2.1160327395281655,
"grad_norm": 0.99609375,
"learning_rate": 9.080212288871087e-05,
"loss": 0.4159,
"step": 8790
},
{
"epoch": 2.117236398651902,
"grad_norm": 0.98046875,
"learning_rate": 9.071608201219801e-05,
"loss": 0.3902,
"step": 8795
},
{
"epoch": 2.118440057775638,
"grad_norm": 0.8828125,
"learning_rate": 9.063012266687872e-05,
"loss": 0.3651,
"step": 8800
},
{
"epoch": 2.119643716899374,
"grad_norm": 0.98046875,
"learning_rate": 9.054424499009393e-05,
"loss": 0.3812,
"step": 8805
},
{
"epoch": 2.1208473760231104,
"grad_norm": 0.91796875,
"learning_rate": 9.045844911905422e-05,
"loss": 0.4088,
"step": 8810
},
{
"epoch": 2.1220510351468462,
"grad_norm": 1.078125,
"learning_rate": 9.03727351908394e-05,
"loss": 0.413,
"step": 8815
},
{
"epoch": 2.1232546942705826,
"grad_norm": 0.9765625,
"learning_rate": 9.028710334239825e-05,
"loss": 0.3926,
"step": 8820
},
{
"epoch": 2.124458353394319,
"grad_norm": 0.9375,
"learning_rate": 9.020155371054863e-05,
"loss": 0.3944,
"step": 8825
},
{
"epoch": 2.1256620125180548,
"grad_norm": 1.0078125,
"learning_rate": 9.011608643197683e-05,
"loss": 0.4028,
"step": 8830
},
{
"epoch": 2.126865671641791,
"grad_norm": 0.8828125,
"learning_rate": 9.003070164323774e-05,
"loss": 0.3495,
"step": 8835
},
{
"epoch": 2.1280693307655274,
"grad_norm": 1.0625,
"learning_rate": 8.994539948075428e-05,
"loss": 0.3979,
"step": 8840
},
{
"epoch": 2.1292729898892633,
"grad_norm": 0.88671875,
"learning_rate": 8.986018008081748e-05,
"loss": 0.3892,
"step": 8845
},
{
"epoch": 2.1304766490129996,
"grad_norm": 1.03125,
"learning_rate": 8.977504357958612e-05,
"loss": 0.3633,
"step": 8850
},
{
"epoch": 2.1316803081367355,
"grad_norm": 0.91796875,
"learning_rate": 8.968999011308645e-05,
"loss": 0.3807,
"step": 8855
},
{
"epoch": 2.132883967260472,
"grad_norm": 0.9296875,
"learning_rate": 8.960501981721215e-05,
"loss": 0.3744,
"step": 8860
},
{
"epoch": 2.134087626384208,
"grad_norm": 0.8984375,
"learning_rate": 8.952013282772397e-05,
"loss": 0.3972,
"step": 8865
},
{
"epoch": 2.135291285507944,
"grad_norm": 0.890625,
"learning_rate": 8.943532928024951e-05,
"loss": 0.3795,
"step": 8870
},
{
"epoch": 2.1364949446316803,
"grad_norm": 0.86328125,
"learning_rate": 8.935060931028317e-05,
"loss": 0.3741,
"step": 8875
},
{
"epoch": 2.1376986037554166,
"grad_norm": 0.90234375,
"learning_rate": 8.926597305318563e-05,
"loss": 0.3736,
"step": 8880
},
{
"epoch": 2.1389022628791525,
"grad_norm": 0.88671875,
"learning_rate": 8.918142064418408e-05,
"loss": 0.3879,
"step": 8885
},
{
"epoch": 2.140105922002889,
"grad_norm": 0.88671875,
"learning_rate": 8.909695221837147e-05,
"loss": 0.3723,
"step": 8890
},
{
"epoch": 2.1413095811266247,
"grad_norm": 0.984375,
"learning_rate": 8.901256791070674e-05,
"loss": 0.3808,
"step": 8895
},
{
"epoch": 2.142513240250361,
"grad_norm": 0.94140625,
"learning_rate": 8.892826785601441e-05,
"loss": 0.4039,
"step": 8900
},
{
"epoch": 2.1437168993740974,
"grad_norm": 0.98828125,
"learning_rate": 8.884405218898433e-05,
"loss": 0.3713,
"step": 8905
},
{
"epoch": 2.1449205584978333,
"grad_norm": 1.0703125,
"learning_rate": 8.875992104417155e-05,
"loss": 0.398,
"step": 8910
},
{
"epoch": 2.1461242176215696,
"grad_norm": 1.0,
"learning_rate": 8.867587455599604e-05,
"loss": 0.4043,
"step": 8915
},
{
"epoch": 2.147327876745306,
"grad_norm": 0.953125,
"learning_rate": 8.85919128587426e-05,
"loss": 0.3738,
"step": 8920
},
{
"epoch": 2.1485315358690418,
"grad_norm": 0.95703125,
"learning_rate": 8.850803608656048e-05,
"loss": 0.3856,
"step": 8925
},
{
"epoch": 2.149735194992778,
"grad_norm": 0.984375,
"learning_rate": 8.842424437346322e-05,
"loss": 0.3765,
"step": 8930
},
{
"epoch": 2.1509388541165144,
"grad_norm": 0.91796875,
"learning_rate": 8.834053785332854e-05,
"loss": 0.3824,
"step": 8935
},
{
"epoch": 2.1521425132402503,
"grad_norm": 1.0078125,
"learning_rate": 8.825691665989796e-05,
"loss": 0.3897,
"step": 8940
},
{
"epoch": 2.1533461723639866,
"grad_norm": 0.984375,
"learning_rate": 8.817338092677676e-05,
"loss": 0.3986,
"step": 8945
},
{
"epoch": 2.1545498314877225,
"grad_norm": 0.921875,
"learning_rate": 8.808993078743364e-05,
"loss": 0.3793,
"step": 8950
},
{
"epoch": 2.155753490611459,
"grad_norm": 0.98828125,
"learning_rate": 8.800656637520044e-05,
"loss": 0.363,
"step": 8955
},
{
"epoch": 2.156957149735195,
"grad_norm": 0.97265625,
"learning_rate": 8.79232878232722e-05,
"loss": 0.3917,
"step": 8960
},
{
"epoch": 2.158160808858931,
"grad_norm": 1.0234375,
"learning_rate": 8.784009526470667e-05,
"loss": 0.3593,
"step": 8965
},
{
"epoch": 2.1593644679826673,
"grad_norm": 0.90234375,
"learning_rate": 8.775698883242425e-05,
"loss": 0.406,
"step": 8970
},
{
"epoch": 2.1605681271064037,
"grad_norm": 0.96875,
"learning_rate": 8.767396865920771e-05,
"loss": 0.378,
"step": 8975
},
{
"epoch": 2.1617717862301395,
"grad_norm": 1.015625,
"learning_rate": 8.759103487770195e-05,
"loss": 0.3682,
"step": 8980
},
{
"epoch": 2.162975445353876,
"grad_norm": 0.89453125,
"learning_rate": 8.750818762041396e-05,
"loss": 0.3636,
"step": 8985
},
{
"epoch": 2.1641791044776117,
"grad_norm": 1.0625,
"learning_rate": 8.74254270197124e-05,
"loss": 0.406,
"step": 8990
},
{
"epoch": 2.165382763601348,
"grad_norm": 0.953125,
"learning_rate": 8.734275320782748e-05,
"loss": 0.3768,
"step": 8995
},
{
"epoch": 2.1665864227250844,
"grad_norm": 0.86328125,
"learning_rate": 8.726016631685076e-05,
"loss": 0.3942,
"step": 9000
},
{
"epoch": 2.1665864227250844,
"eval_loss": 0.3683564364910126,
"eval_runtime": 2.3689,
"eval_samples_per_second": 84.428,
"eval_steps_per_second": 84.428,
"step": 9000
},
{
"epoch": 2.1677900818488203,
"grad_norm": 0.89453125,
"learning_rate": 8.717766647873494e-05,
"loss": 0.3749,
"step": 9005
},
{
"epoch": 2.1689937409725566,
"grad_norm": 0.98046875,
"learning_rate": 8.70952538252936e-05,
"loss": 0.4045,
"step": 9010
},
{
"epoch": 2.170197400096293,
"grad_norm": 0.9375,
"learning_rate": 8.701292848820101e-05,
"loss": 0.379,
"step": 9015
},
{
"epoch": 2.171401059220029,
"grad_norm": 0.9296875,
"learning_rate": 8.693069059899202e-05,
"loss": 0.3857,
"step": 9020
},
{
"epoch": 2.172604718343765,
"grad_norm": 0.96875,
"learning_rate": 8.684854028906164e-05,
"loss": 0.3791,
"step": 9025
},
{
"epoch": 2.173808377467501,
"grad_norm": 0.81640625,
"learning_rate": 8.6766477689665e-05,
"loss": 0.4051,
"step": 9030
},
{
"epoch": 2.1750120365912373,
"grad_norm": 0.9453125,
"learning_rate": 8.668450293191714e-05,
"loss": 0.3873,
"step": 9035
},
{
"epoch": 2.1762156957149736,
"grad_norm": 0.95703125,
"learning_rate": 8.660261614679265e-05,
"loss": 0.3943,
"step": 9040
},
{
"epoch": 2.1774193548387095,
"grad_norm": 0.9375,
"learning_rate": 8.652081746512568e-05,
"loss": 0.3886,
"step": 9045
},
{
"epoch": 2.178623013962446,
"grad_norm": 0.96875,
"learning_rate": 8.643910701760951e-05,
"loss": 0.3911,
"step": 9050
},
{
"epoch": 2.179826673086182,
"grad_norm": 0.98046875,
"learning_rate": 8.635748493479652e-05,
"loss": 0.373,
"step": 9055
},
{
"epoch": 2.181030332209918,
"grad_norm": 0.91796875,
"learning_rate": 8.627595134709787e-05,
"loss": 0.3669,
"step": 9060
},
{
"epoch": 2.1822339913336544,
"grad_norm": 0.96484375,
"learning_rate": 8.61945063847833e-05,
"loss": 0.4013,
"step": 9065
},
{
"epoch": 2.1834376504573907,
"grad_norm": 0.9609375,
"learning_rate": 8.611315017798102e-05,
"loss": 0.3809,
"step": 9070
},
{
"epoch": 2.1846413095811266,
"grad_norm": 0.921875,
"learning_rate": 8.603188285667738e-05,
"loss": 0.3842,
"step": 9075
},
{
"epoch": 2.185844968704863,
"grad_norm": 0.97265625,
"learning_rate": 8.595070455071673e-05,
"loss": 0.4009,
"step": 9080
},
{
"epoch": 2.1870486278285988,
"grad_norm": 0.83984375,
"learning_rate": 8.58696153898012e-05,
"loss": 0.3773,
"step": 9085
},
{
"epoch": 2.188252286952335,
"grad_norm": 0.94921875,
"learning_rate": 8.578861550349042e-05,
"loss": 0.3886,
"step": 9090
},
{
"epoch": 2.1894559460760714,
"grad_norm": 0.94140625,
"learning_rate": 8.570770502120153e-05,
"loss": 0.3883,
"step": 9095
},
{
"epoch": 2.1906596051998073,
"grad_norm": 1.0234375,
"learning_rate": 8.562688407220867e-05,
"loss": 0.4031,
"step": 9100
},
{
"epoch": 2.1918632643235436,
"grad_norm": 1.03125,
"learning_rate": 8.554615278564303e-05,
"loss": 0.3919,
"step": 9105
},
{
"epoch": 2.19306692344728,
"grad_norm": 0.921875,
"learning_rate": 8.546551129049254e-05,
"loss": 0.3779,
"step": 9110
},
{
"epoch": 2.194270582571016,
"grad_norm": 0.8828125,
"learning_rate": 8.53849597156016e-05,
"loss": 0.3994,
"step": 9115
},
{
"epoch": 2.195474241694752,
"grad_norm": 1.046875,
"learning_rate": 8.530449818967098e-05,
"loss": 0.4173,
"step": 9120
},
{
"epoch": 2.196677900818488,
"grad_norm": 0.9375,
"learning_rate": 8.522412684125755e-05,
"loss": 0.3897,
"step": 9125
},
{
"epoch": 2.1978815599422243,
"grad_norm": 0.94921875,
"learning_rate": 8.514384579877418e-05,
"loss": 0.3891,
"step": 9130
},
{
"epoch": 2.1990852190659607,
"grad_norm": 0.921875,
"learning_rate": 8.506365519048936e-05,
"loss": 0.4021,
"step": 9135
},
{
"epoch": 2.2002888781896965,
"grad_norm": 0.94140625,
"learning_rate": 8.49835551445271e-05,
"loss": 0.3999,
"step": 9140
},
{
"epoch": 2.201492537313433,
"grad_norm": 0.98828125,
"learning_rate": 8.490354578886679e-05,
"loss": 0.3983,
"step": 9145
},
{
"epoch": 2.202696196437169,
"grad_norm": 1.046875,
"learning_rate": 8.482362725134282e-05,
"loss": 0.4089,
"step": 9150
},
{
"epoch": 2.203899855560905,
"grad_norm": 0.87109375,
"learning_rate": 8.474379965964456e-05,
"loss": 0.3923,
"step": 9155
},
{
"epoch": 2.2051035146846414,
"grad_norm": 1.0,
"learning_rate": 8.466406314131606e-05,
"loss": 0.388,
"step": 9160
},
{
"epoch": 2.2063071738083773,
"grad_norm": 1.03125,
"learning_rate": 8.458441782375577e-05,
"loss": 0.3673,
"step": 9165
},
{
"epoch": 2.2075108329321136,
"grad_norm": 0.890625,
"learning_rate": 8.450486383421655e-05,
"loss": 0.3766,
"step": 9170
},
{
"epoch": 2.20871449205585,
"grad_norm": 1.0390625,
"learning_rate": 8.442540129980523e-05,
"loss": 0.3812,
"step": 9175
},
{
"epoch": 2.209918151179586,
"grad_norm": 0.89453125,
"learning_rate": 8.434603034748262e-05,
"loss": 0.365,
"step": 9180
},
{
"epoch": 2.211121810303322,
"grad_norm": 1.078125,
"learning_rate": 8.426675110406314e-05,
"loss": 0.3912,
"step": 9185
},
{
"epoch": 2.2123254694270584,
"grad_norm": 0.96875,
"learning_rate": 8.418756369621465e-05,
"loss": 0.3777,
"step": 9190
},
{
"epoch": 2.2135291285507943,
"grad_norm": 0.90625,
"learning_rate": 8.41084682504584e-05,
"loss": 0.3737,
"step": 9195
},
{
"epoch": 2.2147327876745306,
"grad_norm": 1.0625,
"learning_rate": 8.402946489316858e-05,
"loss": 0.4014,
"step": 9200
},
{
"epoch": 2.215936446798267,
"grad_norm": 1.0625,
"learning_rate": 8.395055375057235e-05,
"loss": 0.3725,
"step": 9205
},
{
"epoch": 2.217140105922003,
"grad_norm": 0.9140625,
"learning_rate": 8.387173494874944e-05,
"loss": 0.3642,
"step": 9210
},
{
"epoch": 2.218343765045739,
"grad_norm": 1.1171875,
"learning_rate": 8.379300861363211e-05,
"loss": 0.4425,
"step": 9215
},
{
"epoch": 2.219547424169475,
"grad_norm": 0.90234375,
"learning_rate": 8.371437487100489e-05,
"loss": 0.3777,
"step": 9220
},
{
"epoch": 2.2207510832932114,
"grad_norm": 0.9296875,
"learning_rate": 8.363583384650429e-05,
"loss": 0.3929,
"step": 9225
},
{
"epoch": 2.2219547424169477,
"grad_norm": 1.03125,
"learning_rate": 8.355738566561877e-05,
"loss": 0.4091,
"step": 9230
},
{
"epoch": 2.2231584015406836,
"grad_norm": 0.88671875,
"learning_rate": 8.347903045368839e-05,
"loss": 0.3942,
"step": 9235
},
{
"epoch": 2.22436206066442,
"grad_norm": 0.97265625,
"learning_rate": 8.340076833590473e-05,
"loss": 0.374,
"step": 9240
},
{
"epoch": 2.2255657197881558,
"grad_norm": 0.9609375,
"learning_rate": 8.332259943731055e-05,
"loss": 0.3801,
"step": 9245
},
{
"epoch": 2.226769378911892,
"grad_norm": 0.98046875,
"learning_rate": 8.324452388279971e-05,
"loss": 0.3746,
"step": 9250
},
{
"epoch": 2.2279730380356284,
"grad_norm": 0.90234375,
"learning_rate": 8.316654179711699e-05,
"loss": 0.3716,
"step": 9255
},
{
"epoch": 2.2291766971593643,
"grad_norm": 1.0234375,
"learning_rate": 8.30886533048577e-05,
"loss": 0.3951,
"step": 9260
},
{
"epoch": 2.2303803562831006,
"grad_norm": 0.9765625,
"learning_rate": 8.30108585304677e-05,
"loss": 0.3688,
"step": 9265
},
{
"epoch": 2.231584015406837,
"grad_norm": 0.92578125,
"learning_rate": 8.293315759824314e-05,
"loss": 0.3748,
"step": 9270
},
{
"epoch": 2.232787674530573,
"grad_norm": 0.87109375,
"learning_rate": 8.285555063233014e-05,
"loss": 0.3654,
"step": 9275
},
{
"epoch": 2.233991333654309,
"grad_norm": 0.88671875,
"learning_rate": 8.277803775672479e-05,
"loss": 0.3999,
"step": 9280
},
{
"epoch": 2.2351949927780455,
"grad_norm": 0.92578125,
"learning_rate": 8.270061909527272e-05,
"loss": 0.3721,
"step": 9285
},
{
"epoch": 2.2363986519017813,
"grad_norm": 1.0625,
"learning_rate": 8.262329477166919e-05,
"loss": 0.3944,
"step": 9290
},
{
"epoch": 2.2376023110255177,
"grad_norm": 0.97265625,
"learning_rate": 8.254606490945859e-05,
"loss": 0.4229,
"step": 9295
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.8828125,
"learning_rate": 8.246892963203444e-05,
"loss": 0.4217,
"step": 9300
},
{
"epoch": 2.24000962927299,
"grad_norm": 0.89453125,
"learning_rate": 8.239188906263917e-05,
"loss": 0.3763,
"step": 9305
},
{
"epoch": 2.241213288396726,
"grad_norm": 0.8984375,
"learning_rate": 8.231494332436382e-05,
"loss": 0.3791,
"step": 9310
},
{
"epoch": 2.242416947520462,
"grad_norm": 0.90625,
"learning_rate": 8.2238092540148e-05,
"loss": 0.3797,
"step": 9315
},
{
"epoch": 2.2436206066441984,
"grad_norm": 1.0,
"learning_rate": 8.216133683277955e-05,
"loss": 0.402,
"step": 9320
},
{
"epoch": 2.2448242657679347,
"grad_norm": 0.97265625,
"learning_rate": 8.20846763248944e-05,
"loss": 0.3827,
"step": 9325
},
{
"epoch": 2.2460279248916706,
"grad_norm": 1.015625,
"learning_rate": 8.20081111389764e-05,
"loss": 0.4026,
"step": 9330
},
{
"epoch": 2.247231584015407,
"grad_norm": 0.9453125,
"learning_rate": 8.193164139735707e-05,
"loss": 0.3664,
"step": 9335
},
{
"epoch": 2.2484352431391432,
"grad_norm": 0.97265625,
"learning_rate": 8.18552672222155e-05,
"loss": 0.4098,
"step": 9340
},
{
"epoch": 2.249638902262879,
"grad_norm": 0.90625,
"learning_rate": 8.177898873557799e-05,
"loss": 0.3989,
"step": 9345
},
{
"epoch": 2.2508425613866154,
"grad_norm": 1.015625,
"learning_rate": 8.170280605931803e-05,
"loss": 0.4073,
"step": 9350
},
{
"epoch": 2.2520462205103513,
"grad_norm": 0.95703125,
"learning_rate": 8.162671931515603e-05,
"loss": 0.3738,
"step": 9355
},
{
"epoch": 2.2532498796340876,
"grad_norm": 0.9765625,
"learning_rate": 8.155072862465905e-05,
"loss": 0.4167,
"step": 9360
},
{
"epoch": 2.254453538757824,
"grad_norm": 0.8984375,
"learning_rate": 8.147483410924076e-05,
"loss": 0.3754,
"step": 9365
},
{
"epoch": 2.25565719788156,
"grad_norm": 0.9140625,
"learning_rate": 8.139903589016116e-05,
"loss": 0.3792,
"step": 9370
},
{
"epoch": 2.256860857005296,
"grad_norm": 0.90625,
"learning_rate": 8.132333408852634e-05,
"loss": 0.3805,
"step": 9375
},
{
"epoch": 2.258064516129032,
"grad_norm": 1.03125,
"learning_rate": 8.12477288252884e-05,
"loss": 0.3934,
"step": 9380
},
{
"epoch": 2.2592681752527684,
"grad_norm": 0.921875,
"learning_rate": 8.117222022124516e-05,
"loss": 0.3975,
"step": 9385
},
{
"epoch": 2.2604718343765047,
"grad_norm": 1.015625,
"learning_rate": 8.109680839703998e-05,
"loss": 0.3769,
"step": 9390
},
{
"epoch": 2.2616754935002406,
"grad_norm": 1.0234375,
"learning_rate": 8.10214934731617e-05,
"loss": 0.3778,
"step": 9395
},
{
"epoch": 2.262879152623977,
"grad_norm": 0.93359375,
"learning_rate": 8.094627556994419e-05,
"loss": 0.3732,
"step": 9400
},
{
"epoch": 2.264082811747713,
"grad_norm": 0.96875,
"learning_rate": 8.087115480756642e-05,
"loss": 0.3703,
"step": 9405
},
{
"epoch": 2.265286470871449,
"grad_norm": 0.95703125,
"learning_rate": 8.079613130605205e-05,
"loss": 0.369,
"step": 9410
},
{
"epoch": 2.2664901299951854,
"grad_norm": 1.0703125,
"learning_rate": 8.072120518526948e-05,
"loss": 0.386,
"step": 9415
},
{
"epoch": 2.2676937891189217,
"grad_norm": 0.97265625,
"learning_rate": 8.064637656493139e-05,
"loss": 0.3926,
"step": 9420
},
{
"epoch": 2.2688974482426576,
"grad_norm": 0.96875,
"learning_rate": 8.057164556459475e-05,
"loss": 0.3906,
"step": 9425
},
{
"epoch": 2.270101107366394,
"grad_norm": 0.9140625,
"learning_rate": 8.049701230366056e-05,
"loss": 0.3657,
"step": 9430
},
{
"epoch": 2.27130476649013,
"grad_norm": 1.0078125,
"learning_rate": 8.042247690137359e-05,
"loss": 0.3916,
"step": 9435
},
{
"epoch": 2.272508425613866,
"grad_norm": 0.96484375,
"learning_rate": 8.034803947682238e-05,
"loss": 0.3867,
"step": 9440
},
{
"epoch": 2.2737120847376024,
"grad_norm": 0.88671875,
"learning_rate": 8.027370014893877e-05,
"loss": 0.3769,
"step": 9445
},
{
"epoch": 2.2749157438613383,
"grad_norm": 1.0234375,
"learning_rate": 8.019945903649802e-05,
"loss": 0.4066,
"step": 9450
},
{
"epoch": 2.2761194029850746,
"grad_norm": 0.921875,
"learning_rate": 8.012531625811835e-05,
"loss": 0.3761,
"step": 9455
},
{
"epoch": 2.277323062108811,
"grad_norm": 0.85546875,
"learning_rate": 8.005127193226091e-05,
"loss": 0.369,
"step": 9460
},
{
"epoch": 2.278526721232547,
"grad_norm": 1.015625,
"learning_rate": 7.997732617722959e-05,
"loss": 0.3794,
"step": 9465
},
{
"epoch": 2.279730380356283,
"grad_norm": 0.9609375,
"learning_rate": 7.99034791111707e-05,
"loss": 0.3937,
"step": 9470
},
{
"epoch": 2.2809340394800195,
"grad_norm": 0.87890625,
"learning_rate": 7.982973085207295e-05,
"loss": 0.4039,
"step": 9475
},
{
"epoch": 2.2821376986037554,
"grad_norm": 1.296875,
"learning_rate": 7.97560815177672e-05,
"loss": 0.4048,
"step": 9480
},
{
"epoch": 2.2833413577274917,
"grad_norm": 1.03125,
"learning_rate": 7.96825312259261e-05,
"loss": 0.3805,
"step": 9485
},
{
"epoch": 2.2845450168512276,
"grad_norm": 0.88671875,
"learning_rate": 7.960908009406425e-05,
"loss": 0.3901,
"step": 9490
},
{
"epoch": 2.285748675974964,
"grad_norm": 1.03125,
"learning_rate": 7.953572823953769e-05,
"loss": 0.4191,
"step": 9495
},
{
"epoch": 2.2869523350987,
"grad_norm": 1.0390625,
"learning_rate": 7.946247577954389e-05,
"loss": 0.4066,
"step": 9500
},
{
"epoch": 2.2869523350987,
"eval_loss": 0.36421453952789307,
"eval_runtime": 2.3689,
"eval_samples_per_second": 84.429,
"eval_steps_per_second": 84.429,
"step": 9500
},
{
"epoch": 2.288155994222436,
"grad_norm": 0.95703125,
"learning_rate": 7.938932283112149e-05,
"loss": 0.3765,
"step": 9505
},
{
"epoch": 2.2893596533461724,
"grad_norm": 0.88671875,
"learning_rate": 7.931626951115018e-05,
"loss": 0.3686,
"step": 9510
},
{
"epoch": 2.2905633124699083,
"grad_norm": 1.03125,
"learning_rate": 7.924331593635042e-05,
"loss": 0.3795,
"step": 9515
},
{
"epoch": 2.2917669715936446,
"grad_norm": 0.94140625,
"learning_rate": 7.917046222328329e-05,
"loss": 0.3993,
"step": 9520
},
{
"epoch": 2.292970630717381,
"grad_norm": 0.89453125,
"learning_rate": 7.909770848835036e-05,
"loss": 0.3773,
"step": 9525
},
{
"epoch": 2.294174289841117,
"grad_norm": 0.99609375,
"learning_rate": 7.902505484779351e-05,
"loss": 0.4038,
"step": 9530
},
{
"epoch": 2.295377948964853,
"grad_norm": 1.0234375,
"learning_rate": 7.895250141769457e-05,
"loss": 0.4033,
"step": 9535
},
{
"epoch": 2.2965816080885895,
"grad_norm": 0.90625,
"learning_rate": 7.888004831397534e-05,
"loss": 0.3917,
"step": 9540
},
{
"epoch": 2.2977852672123253,
"grad_norm": 0.8671875,
"learning_rate": 7.880769565239728e-05,
"loss": 0.3858,
"step": 9545
},
{
"epoch": 2.2989889263360617,
"grad_norm": 1.0,
"learning_rate": 7.873544354856142e-05,
"loss": 0.3822,
"step": 9550
},
{
"epoch": 2.300192585459798,
"grad_norm": 0.87890625,
"learning_rate": 7.866329211790813e-05,
"loss": 0.3767,
"step": 9555
},
{
"epoch": 2.301396244583534,
"grad_norm": 0.99609375,
"learning_rate": 7.859124147571687e-05,
"loss": 0.3946,
"step": 9560
},
{
"epoch": 2.30259990370727,
"grad_norm": 0.94921875,
"learning_rate": 7.85192917371061e-05,
"loss": 0.3595,
"step": 9565
},
{
"epoch": 2.303803562831006,
"grad_norm": 0.96484375,
"learning_rate": 7.844744301703301e-05,
"loss": 0.4093,
"step": 9570
},
{
"epoch": 2.3050072219547424,
"grad_norm": 0.9609375,
"learning_rate": 7.837569543029349e-05,
"loss": 0.3601,
"step": 9575
},
{
"epoch": 2.3062108810784787,
"grad_norm": 0.9453125,
"learning_rate": 7.830404909152181e-05,
"loss": 0.4082,
"step": 9580
},
{
"epoch": 2.3074145402022146,
"grad_norm": 0.90625,
"learning_rate": 7.82325041151904e-05,
"loss": 0.3772,
"step": 9585
},
{
"epoch": 2.308618199325951,
"grad_norm": 0.97265625,
"learning_rate": 7.816106061560983e-05,
"loss": 0.3969,
"step": 9590
},
{
"epoch": 2.3098218584496872,
"grad_norm": 0.97265625,
"learning_rate": 7.808971870692846e-05,
"loss": 0.4185,
"step": 9595
},
{
"epoch": 2.311025517573423,
"grad_norm": 0.921875,
"learning_rate": 7.801847850313239e-05,
"loss": 0.3943,
"step": 9600
},
{
"epoch": 2.3122291766971594,
"grad_norm": 0.94921875,
"learning_rate": 7.794734011804522e-05,
"loss": 0.3816,
"step": 9605
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.92578125,
"learning_rate": 7.787630366532785e-05,
"loss": 0.358,
"step": 9610
},
{
"epoch": 2.3146364949446316,
"grad_norm": 0.96875,
"learning_rate": 7.780536925847832e-05,
"loss": 0.3818,
"step": 9615
},
{
"epoch": 2.315840154068368,
"grad_norm": 0.92578125,
"learning_rate": 7.773453701083158e-05,
"loss": 0.4184,
"step": 9620
},
{
"epoch": 2.317043813192104,
"grad_norm": 0.92578125,
"learning_rate": 7.766380703555945e-05,
"loss": 0.3754,
"step": 9625
},
{
"epoch": 2.31824747231584,
"grad_norm": 0.90625,
"learning_rate": 7.75931794456703e-05,
"loss": 0.3994,
"step": 9630
},
{
"epoch": 2.3194511314395765,
"grad_norm": 0.9453125,
"learning_rate": 7.75226543540089e-05,
"loss": 0.3709,
"step": 9635
},
{
"epoch": 2.3206547905633124,
"grad_norm": 0.9375,
"learning_rate": 7.745223187325628e-05,
"loss": 0.3748,
"step": 9640
},
{
"epoch": 2.3218584496870487,
"grad_norm": 0.8515625,
"learning_rate": 7.738191211592948e-05,
"loss": 0.3813,
"step": 9645
},
{
"epoch": 2.3230621088107846,
"grad_norm": 0.99609375,
"learning_rate": 7.73116951943815e-05,
"loss": 0.3859,
"step": 9650
},
{
"epoch": 2.324265767934521,
"grad_norm": 0.93359375,
"learning_rate": 7.724158122080093e-05,
"loss": 0.3822,
"step": 9655
},
{
"epoch": 2.325469427058257,
"grad_norm": 0.9765625,
"learning_rate": 7.717157030721195e-05,
"loss": 0.3749,
"step": 9660
},
{
"epoch": 2.326673086181993,
"grad_norm": 0.94140625,
"learning_rate": 7.710166256547402e-05,
"loss": 0.3667,
"step": 9665
},
{
"epoch": 2.3278767453057294,
"grad_norm": 1.0390625,
"learning_rate": 7.703185810728186e-05,
"loss": 0.4044,
"step": 9670
},
{
"epoch": 2.3290804044294657,
"grad_norm": 0.92578125,
"learning_rate": 7.696215704416505e-05,
"loss": 0.387,
"step": 9675
},
{
"epoch": 2.3302840635532016,
"grad_norm": 0.9296875,
"learning_rate": 7.689255948748799e-05,
"loss": 0.3841,
"step": 9680
},
{
"epoch": 2.331487722676938,
"grad_norm": 0.91796875,
"learning_rate": 7.682306554844979e-05,
"loss": 0.4014,
"step": 9685
},
{
"epoch": 2.3326913818006743,
"grad_norm": 0.875,
"learning_rate": 7.675367533808395e-05,
"loss": 0.3702,
"step": 9690
},
{
"epoch": 2.33389504092441,
"grad_norm": 0.94921875,
"learning_rate": 7.668438896725818e-05,
"loss": 0.379,
"step": 9695
},
{
"epoch": 2.3350987000481465,
"grad_norm": 0.91796875,
"learning_rate": 7.661520654667441e-05,
"loss": 0.3923,
"step": 9700
},
{
"epoch": 2.3363023591718823,
"grad_norm": 0.9609375,
"learning_rate": 7.654612818686837e-05,
"loss": 0.3723,
"step": 9705
},
{
"epoch": 2.3375060182956187,
"grad_norm": 0.97265625,
"learning_rate": 7.647715399820956e-05,
"loss": 0.3814,
"step": 9710
},
{
"epoch": 2.338709677419355,
"grad_norm": 1.0,
"learning_rate": 7.64082840909011e-05,
"loss": 0.3738,
"step": 9715
},
{
"epoch": 2.339913336543091,
"grad_norm": 0.8984375,
"learning_rate": 7.633951857497943e-05,
"loss": 0.3713,
"step": 9720
},
{
"epoch": 2.341116995666827,
"grad_norm": 0.98828125,
"learning_rate": 7.627085756031421e-05,
"loss": 0.3854,
"step": 9725
},
{
"epoch": 2.3423206547905635,
"grad_norm": 0.94921875,
"learning_rate": 7.620230115660809e-05,
"loss": 0.3878,
"step": 9730
},
{
"epoch": 2.3435243139142994,
"grad_norm": 0.8828125,
"learning_rate": 7.61338494733967e-05,
"loss": 0.3914,
"step": 9735
},
{
"epoch": 2.3447279730380357,
"grad_norm": 0.96484375,
"learning_rate": 7.606550262004827e-05,
"loss": 0.3834,
"step": 9740
},
{
"epoch": 2.345931632161772,
"grad_norm": 0.90234375,
"learning_rate": 7.599726070576351e-05,
"loss": 0.3679,
"step": 9745
},
{
"epoch": 2.347135291285508,
"grad_norm": 0.93359375,
"learning_rate": 7.592912383957557e-05,
"loss": 0.4166,
"step": 9750
},
{
"epoch": 2.3483389504092442,
"grad_norm": 0.93359375,
"learning_rate": 7.586109213034963e-05,
"loss": 0.3567,
"step": 9755
},
{
"epoch": 2.34954260953298,
"grad_norm": 0.9375,
"learning_rate": 7.579316568678294e-05,
"loss": 0.3628,
"step": 9760
},
{
"epoch": 2.3507462686567164,
"grad_norm": 1.046875,
"learning_rate": 7.572534461740457e-05,
"loss": 0.3817,
"step": 9765
},
{
"epoch": 2.3519499277804528,
"grad_norm": 0.90625,
"learning_rate": 7.565762903057518e-05,
"loss": 0.3639,
"step": 9770
},
{
"epoch": 2.3531535869041886,
"grad_norm": 1.078125,
"learning_rate": 7.559001903448696e-05,
"loss": 0.3831,
"step": 9775
},
{
"epoch": 2.354357246027925,
"grad_norm": 0.9921875,
"learning_rate": 7.552251473716325e-05,
"loss": 0.3741,
"step": 9780
},
{
"epoch": 2.355560905151661,
"grad_norm": 0.91015625,
"learning_rate": 7.545511624645872e-05,
"loss": 0.3883,
"step": 9785
},
{
"epoch": 2.356764564275397,
"grad_norm": 1.0234375,
"learning_rate": 7.538782367005884e-05,
"loss": 0.4166,
"step": 9790
},
{
"epoch": 2.3579682233991335,
"grad_norm": 0.94140625,
"learning_rate": 7.532063711547986e-05,
"loss": 0.3791,
"step": 9795
},
{
"epoch": 2.3591718825228694,
"grad_norm": 1.0078125,
"learning_rate": 7.525355669006875e-05,
"loss": 0.3776,
"step": 9800
},
{
"epoch": 2.3603755416466057,
"grad_norm": 0.87109375,
"learning_rate": 7.518658250100275e-05,
"loss": 0.3692,
"step": 9805
},
{
"epoch": 2.361579200770342,
"grad_norm": 0.98828125,
"learning_rate": 7.511971465528949e-05,
"loss": 0.3742,
"step": 9810
},
{
"epoch": 2.362782859894078,
"grad_norm": 0.87890625,
"learning_rate": 7.505295325976668e-05,
"loss": 0.4001,
"step": 9815
},
{
"epoch": 2.363986519017814,
"grad_norm": 0.92578125,
"learning_rate": 7.498629842110183e-05,
"loss": 0.3867,
"step": 9820
},
{
"epoch": 2.3651901781415505,
"grad_norm": 0.92578125,
"learning_rate": 7.491975024579236e-05,
"loss": 0.4005,
"step": 9825
},
{
"epoch": 2.3663938372652864,
"grad_norm": 0.8671875,
"learning_rate": 7.485330884016519e-05,
"loss": 0.3574,
"step": 9830
},
{
"epoch": 2.3675974963890227,
"grad_norm": 0.9375,
"learning_rate": 7.478697431037657e-05,
"loss": 0.4245,
"step": 9835
},
{
"epoch": 2.3688011555127586,
"grad_norm": 0.9140625,
"learning_rate": 7.472074676241218e-05,
"loss": 0.3864,
"step": 9840
},
{
"epoch": 2.370004814636495,
"grad_norm": 1.0078125,
"learning_rate": 7.465462630208658e-05,
"loss": 0.3774,
"step": 9845
},
{
"epoch": 2.3712084737602313,
"grad_norm": 1.109375,
"learning_rate": 7.458861303504338e-05,
"loss": 0.4149,
"step": 9850
},
{
"epoch": 2.372412132883967,
"grad_norm": 0.87890625,
"learning_rate": 7.45227070667548e-05,
"loss": 0.3659,
"step": 9855
},
{
"epoch": 2.3736157920077035,
"grad_norm": 0.90234375,
"learning_rate": 7.445690850252173e-05,
"loss": 0.364,
"step": 9860
},
{
"epoch": 2.3748194511314393,
"grad_norm": 0.9140625,
"learning_rate": 7.439121744747338e-05,
"loss": 0.3781,
"step": 9865
},
{
"epoch": 2.3760231102551757,
"grad_norm": 1.0546875,
"learning_rate": 7.432563400656723e-05,
"loss": 0.3803,
"step": 9870
},
{
"epoch": 2.377226769378912,
"grad_norm": 1.0390625,
"learning_rate": 7.426015828458882e-05,
"loss": 0.382,
"step": 9875
},
{
"epoch": 2.3784304285026483,
"grad_norm": 0.9921875,
"learning_rate": 7.419479038615156e-05,
"loss": 0.4247,
"step": 9880
},
{
"epoch": 2.379634087626384,
"grad_norm": 0.99609375,
"learning_rate": 7.412953041569658e-05,
"loss": 0.3796,
"step": 9885
},
{
"epoch": 2.3808377467501205,
"grad_norm": 0.9609375,
"learning_rate": 7.406437847749255e-05,
"loss": 0.3707,
"step": 9890
},
{
"epoch": 2.3820414058738564,
"grad_norm": 0.97265625,
"learning_rate": 7.399933467563564e-05,
"loss": 0.4145,
"step": 9895
},
{
"epoch": 2.3832450649975927,
"grad_norm": 1.0234375,
"learning_rate": 7.393439911404913e-05,
"loss": 0.3705,
"step": 9900
},
{
"epoch": 2.384448724121329,
"grad_norm": 0.9609375,
"learning_rate": 7.38695718964834e-05,
"loss": 0.3733,
"step": 9905
},
{
"epoch": 2.385652383245065,
"grad_norm": 1.0859375,
"learning_rate": 7.380485312651573e-05,
"loss": 0.3785,
"step": 9910
},
{
"epoch": 2.3868560423688012,
"grad_norm": 0.91015625,
"learning_rate": 7.374024290755012e-05,
"loss": 0.3983,
"step": 9915
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.921875,
"learning_rate": 7.367574134281715e-05,
"loss": 0.378,
"step": 9920
},
{
"epoch": 2.3892633606162734,
"grad_norm": 0.93359375,
"learning_rate": 7.361134853537379e-05,
"loss": 0.3803,
"step": 9925
},
{
"epoch": 2.3904670197400097,
"grad_norm": 1.0390625,
"learning_rate": 7.354706458810322e-05,
"loss": 0.3675,
"step": 9930
},
{
"epoch": 2.3916706788637456,
"grad_norm": 0.8515625,
"learning_rate": 7.348288960371473e-05,
"loss": 0.3665,
"step": 9935
},
{
"epoch": 2.392874337987482,
"grad_norm": 1.0546875,
"learning_rate": 7.341882368474345e-05,
"loss": 0.3671,
"step": 9940
},
{
"epoch": 2.3940779971112183,
"grad_norm": 0.99609375,
"learning_rate": 7.335486693355033e-05,
"loss": 0.3712,
"step": 9945
},
{
"epoch": 2.395281656234954,
"grad_norm": 0.9375,
"learning_rate": 7.329101945232187e-05,
"loss": 0.3988,
"step": 9950
},
{
"epoch": 2.3964853153586905,
"grad_norm": 0.9921875,
"learning_rate": 7.322728134306994e-05,
"loss": 0.3701,
"step": 9955
},
{
"epoch": 2.397688974482427,
"grad_norm": 0.9453125,
"learning_rate": 7.316365270763175e-05,
"loss": 0.4033,
"step": 9960
},
{
"epoch": 2.3988926336061627,
"grad_norm": 0.98828125,
"learning_rate": 7.310013364766951e-05,
"loss": 0.3957,
"step": 9965
},
{
"epoch": 2.400096292729899,
"grad_norm": 0.9765625,
"learning_rate": 7.30367242646704e-05,
"loss": 0.381,
"step": 9970
},
{
"epoch": 2.401299951853635,
"grad_norm": 0.98046875,
"learning_rate": 7.297342465994638e-05,
"loss": 0.4178,
"step": 9975
},
{
"epoch": 2.402503610977371,
"grad_norm": 0.92578125,
"learning_rate": 7.291023493463395e-05,
"loss": 0.3695,
"step": 9980
},
{
"epoch": 2.4037072701011075,
"grad_norm": 1.0234375,
"learning_rate": 7.284715518969416e-05,
"loss": 0.3904,
"step": 9985
},
{
"epoch": 2.4049109292248434,
"grad_norm": 0.859375,
"learning_rate": 7.278418552591222e-05,
"loss": 0.3506,
"step": 9990
},
{
"epoch": 2.4061145883485797,
"grad_norm": 0.9765625,
"learning_rate": 7.27213260438975e-05,
"loss": 0.35,
"step": 9995
},
{
"epoch": 2.4073182474723156,
"grad_norm": 0.9609375,
"learning_rate": 7.265857684408339e-05,
"loss": 0.3886,
"step": 10000
},
{
"epoch": 2.4073182474723156,
"eval_loss": 0.35701805353164673,
"eval_runtime": 2.3753,
"eval_samples_per_second": 84.199,
"eval_steps_per_second": 84.199,
"step": 10000
},
{
"epoch": 2.408521906596052,
"grad_norm": 0.9765625,
"learning_rate": 7.259593802672696e-05,
"loss": 0.3658,
"step": 10005
},
{
"epoch": 2.4097255657197882,
"grad_norm": 0.890625,
"learning_rate": 7.253340969190904e-05,
"loss": 0.3637,
"step": 10010
},
{
"epoch": 2.410929224843524,
"grad_norm": 0.86328125,
"learning_rate": 7.247099193953384e-05,
"loss": 0.3735,
"step": 10015
},
{
"epoch": 2.4121328839672604,
"grad_norm": 1.0546875,
"learning_rate": 7.240868486932893e-05,
"loss": 0.3764,
"step": 10020
},
{
"epoch": 2.4133365430909968,
"grad_norm": 1.0,
"learning_rate": 7.234648858084507e-05,
"loss": 0.3701,
"step": 10025
},
{
"epoch": 2.4145402022147326,
"grad_norm": 0.90625,
"learning_rate": 7.228440317345595e-05,
"loss": 0.4003,
"step": 10030
},
{
"epoch": 2.415743861338469,
"grad_norm": 0.8984375,
"learning_rate": 7.222242874635819e-05,
"loss": 0.3802,
"step": 10035
},
{
"epoch": 2.4169475204622053,
"grad_norm": 0.99609375,
"learning_rate": 7.216056539857098e-05,
"loss": 0.38,
"step": 10040
},
{
"epoch": 2.418151179585941,
"grad_norm": 1.0,
"learning_rate": 7.209881322893608e-05,
"loss": 0.4088,
"step": 10045
},
{
"epoch": 2.4193548387096775,
"grad_norm": 1.0546875,
"learning_rate": 7.20371723361177e-05,
"loss": 0.398,
"step": 10050
},
{
"epoch": 2.4205584978334134,
"grad_norm": 0.99609375,
"learning_rate": 7.197564281860209e-05,
"loss": 0.3749,
"step": 10055
},
{
"epoch": 2.4217621569571497,
"grad_norm": 1.0078125,
"learning_rate": 7.191422477469773e-05,
"loss": 0.3968,
"step": 10060
},
{
"epoch": 2.422965816080886,
"grad_norm": 0.96484375,
"learning_rate": 7.185291830253486e-05,
"loss": 0.3708,
"step": 10065
},
{
"epoch": 2.424169475204622,
"grad_norm": 1.0546875,
"learning_rate": 7.179172350006551e-05,
"loss": 0.3669,
"step": 10070
},
{
"epoch": 2.425373134328358,
"grad_norm": 0.93359375,
"learning_rate": 7.173064046506333e-05,
"loss": 0.3746,
"step": 10075
},
{
"epoch": 2.4265767934520945,
"grad_norm": 1.0078125,
"learning_rate": 7.166966929512329e-05,
"loss": 0.3836,
"step": 10080
},
{
"epoch": 2.4277804525758304,
"grad_norm": 0.9921875,
"learning_rate": 7.160881008766172e-05,
"loss": 0.3863,
"step": 10085
},
{
"epoch": 2.4289841116995667,
"grad_norm": 0.91796875,
"learning_rate": 7.154806293991606e-05,
"loss": 0.3614,
"step": 10090
},
{
"epoch": 2.430187770823303,
"grad_norm": 0.9609375,
"learning_rate": 7.148742794894461e-05,
"loss": 0.359,
"step": 10095
},
{
"epoch": 2.431391429947039,
"grad_norm": 0.94140625,
"learning_rate": 7.142690521162662e-05,
"loss": 0.3852,
"step": 10100
},
{
"epoch": 2.4325950890707753,
"grad_norm": 1.0,
"learning_rate": 7.136649482466184e-05,
"loss": 0.4083,
"step": 10105
},
{
"epoch": 2.433798748194511,
"grad_norm": 0.83203125,
"learning_rate": 7.130619688457064e-05,
"loss": 0.3683,
"step": 10110
},
{
"epoch": 2.4350024073182475,
"grad_norm": 1.0703125,
"learning_rate": 7.124601148769362e-05,
"loss": 0.3938,
"step": 10115
},
{
"epoch": 2.436206066441984,
"grad_norm": 0.984375,
"learning_rate": 7.118593873019167e-05,
"loss": 0.3842,
"step": 10120
},
{
"epoch": 2.4374097255657197,
"grad_norm": 0.921875,
"learning_rate": 7.11259787080456e-05,
"loss": 0.3766,
"step": 10125
},
{
"epoch": 2.438613384689456,
"grad_norm": 0.95703125,
"learning_rate": 7.106613151705618e-05,
"loss": 0.3788,
"step": 10130
},
{
"epoch": 2.439817043813192,
"grad_norm": 1.03125,
"learning_rate": 7.100639725284389e-05,
"loss": 0.3808,
"step": 10135
},
{
"epoch": 2.441020702936928,
"grad_norm": 0.984375,
"learning_rate": 7.094677601084875e-05,
"loss": 0.3767,
"step": 10140
},
{
"epoch": 2.4422243620606645,
"grad_norm": 0.96484375,
"learning_rate": 7.088726788633027e-05,
"loss": 0.3911,
"step": 10145
},
{
"epoch": 2.4434280211844004,
"grad_norm": 0.92578125,
"learning_rate": 7.08278729743671e-05,
"loss": 0.3871,
"step": 10150
},
{
"epoch": 2.4446316803081367,
"grad_norm": 0.95703125,
"learning_rate": 7.076859136985713e-05,
"loss": 0.365,
"step": 10155
},
{
"epoch": 2.445835339431873,
"grad_norm": 1.0546875,
"learning_rate": 7.070942316751717e-05,
"loss": 0.4014,
"step": 10160
},
{
"epoch": 2.447038998555609,
"grad_norm": 0.99609375,
"learning_rate": 7.065036846188283e-05,
"loss": 0.381,
"step": 10165
},
{
"epoch": 2.4482426576793452,
"grad_norm": 0.90625,
"learning_rate": 7.059142734730838e-05,
"loss": 0.3877,
"step": 10170
},
{
"epoch": 2.4494463168030816,
"grad_norm": 0.97265625,
"learning_rate": 7.053259991796663e-05,
"loss": 0.3899,
"step": 10175
},
{
"epoch": 2.4506499759268174,
"grad_norm": 1.0078125,
"learning_rate": 7.04738862678487e-05,
"loss": 0.3881,
"step": 10180
},
{
"epoch": 2.4518536350505538,
"grad_norm": 0.96875,
"learning_rate": 7.0415286490764e-05,
"loss": 0.3723,
"step": 10185
},
{
"epoch": 2.4530572941742896,
"grad_norm": 0.8984375,
"learning_rate": 7.035680068033992e-05,
"loss": 0.371,
"step": 10190
},
{
"epoch": 2.454260953298026,
"grad_norm": 0.90234375,
"learning_rate": 7.029842893002179e-05,
"loss": 0.3842,
"step": 10195
},
{
"epoch": 2.4554646124217623,
"grad_norm": 0.94921875,
"learning_rate": 7.024017133307273e-05,
"loss": 0.3728,
"step": 10200
},
{
"epoch": 2.456668271545498,
"grad_norm": 1.0,
"learning_rate": 7.018202798257341e-05,
"loss": 0.3769,
"step": 10205
},
{
"epoch": 2.4578719306692345,
"grad_norm": 1.0234375,
"learning_rate": 7.012399897142203e-05,
"loss": 0.3838,
"step": 10210
},
{
"epoch": 2.459075589792971,
"grad_norm": 1.0078125,
"learning_rate": 7.006608439233404e-05,
"loss": 0.4098,
"step": 10215
},
{
"epoch": 2.4602792489167067,
"grad_norm": 0.890625,
"learning_rate": 7.000828433784213e-05,
"loss": 0.3997,
"step": 10220
},
{
"epoch": 2.461482908040443,
"grad_norm": 0.96875,
"learning_rate": 6.995059890029594e-05,
"loss": 0.3742,
"step": 10225
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.96484375,
"learning_rate": 6.989302817186201e-05,
"loss": 0.3805,
"step": 10230
},
{
"epoch": 2.463890226287915,
"grad_norm": 0.97265625,
"learning_rate": 6.983557224452366e-05,
"loss": 0.3806,
"step": 10235
},
{
"epoch": 2.4650938854116515,
"grad_norm": 0.9375,
"learning_rate": 6.977823121008066e-05,
"loss": 0.3629,
"step": 10240
},
{
"epoch": 2.4662975445353874,
"grad_norm": 0.8984375,
"learning_rate": 6.972100516014932e-05,
"loss": 0.4034,
"step": 10245
},
{
"epoch": 2.4675012036591237,
"grad_norm": 1.0390625,
"learning_rate": 6.96638941861622e-05,
"loss": 0.3701,
"step": 10250
},
{
"epoch": 2.46870486278286,
"grad_norm": 1.0078125,
"learning_rate": 6.960689837936796e-05,
"loss": 0.3881,
"step": 10255
},
{
"epoch": 2.469908521906596,
"grad_norm": 0.93359375,
"learning_rate": 6.955001783083136e-05,
"loss": 0.3857,
"step": 10260
},
{
"epoch": 2.4711121810303323,
"grad_norm": 0.94140625,
"learning_rate": 6.949325263143284e-05,
"loss": 0.3868,
"step": 10265
},
{
"epoch": 2.472315840154068,
"grad_norm": 1.03125,
"learning_rate": 6.943660287186872e-05,
"loss": 0.3868,
"step": 10270
},
{
"epoch": 2.4735194992778045,
"grad_norm": 1.0234375,
"learning_rate": 6.938006864265074e-05,
"loss": 0.401,
"step": 10275
},
{
"epoch": 2.474723158401541,
"grad_norm": 0.93359375,
"learning_rate": 6.932365003410615e-05,
"loss": 0.381,
"step": 10280
},
{
"epoch": 2.4759268175252767,
"grad_norm": 1.03125,
"learning_rate": 6.92673471363774e-05,
"loss": 0.3802,
"step": 10285
},
{
"epoch": 2.477130476649013,
"grad_norm": 1.0703125,
"learning_rate": 6.921116003942208e-05,
"loss": 0.3709,
"step": 10290
},
{
"epoch": 2.4783341357727493,
"grad_norm": 1.09375,
"learning_rate": 6.915508883301278e-05,
"loss": 0.3709,
"step": 10295
},
{
"epoch": 2.479537794896485,
"grad_norm": 0.86328125,
"learning_rate": 6.90991336067369e-05,
"loss": 0.3728,
"step": 10300
},
{
"epoch": 2.4807414540202215,
"grad_norm": 1.0078125,
"learning_rate": 6.904329444999657e-05,
"loss": 0.4006,
"step": 10305
},
{
"epoch": 2.481945113143958,
"grad_norm": 0.91015625,
"learning_rate": 6.898757145200843e-05,
"loss": 0.3556,
"step": 10310
},
{
"epoch": 2.4831487722676937,
"grad_norm": 1.046875,
"learning_rate": 6.893196470180354e-05,
"loss": 0.3875,
"step": 10315
},
{
"epoch": 2.48435243139143,
"grad_norm": 0.98046875,
"learning_rate": 6.887647428822726e-05,
"loss": 0.3741,
"step": 10320
},
{
"epoch": 2.485556090515166,
"grad_norm": 0.984375,
"learning_rate": 6.882110029993899e-05,
"loss": 0.396,
"step": 10325
},
{
"epoch": 2.4867597496389022,
"grad_norm": 0.890625,
"learning_rate": 6.876584282541223e-05,
"loss": 0.3651,
"step": 10330
},
{
"epoch": 2.4879634087626386,
"grad_norm": 0.98828125,
"learning_rate": 6.871070195293424e-05,
"loss": 0.3828,
"step": 10335
},
{
"epoch": 2.4891670678863744,
"grad_norm": 1.1171875,
"learning_rate": 6.865567777060598e-05,
"loss": 0.3785,
"step": 10340
},
{
"epoch": 2.4903707270101108,
"grad_norm": 0.9765625,
"learning_rate": 6.860077036634202e-05,
"loss": 0.4172,
"step": 10345
},
{
"epoch": 2.491574386133847,
"grad_norm": 0.94921875,
"learning_rate": 6.854597982787028e-05,
"loss": 0.3592,
"step": 10350
},
{
"epoch": 2.492778045257583,
"grad_norm": 1.0390625,
"learning_rate": 6.849130624273203e-05,
"loss": 0.3741,
"step": 10355
},
{
"epoch": 2.4939817043813193,
"grad_norm": 0.95703125,
"learning_rate": 6.843674969828162e-05,
"loss": 0.386,
"step": 10360
},
{
"epoch": 2.4951853635050556,
"grad_norm": 0.97265625,
"learning_rate": 6.838231028168644e-05,
"loss": 0.404,
"step": 10365
},
{
"epoch": 2.4963890226287915,
"grad_norm": 0.9921875,
"learning_rate": 6.83279880799267e-05,
"loss": 0.4001,
"step": 10370
},
{
"epoch": 2.497592681752528,
"grad_norm": 0.91796875,
"learning_rate": 6.827378317979534e-05,
"loss": 0.3935,
"step": 10375
},
{
"epoch": 2.4987963408762637,
"grad_norm": 1.046875,
"learning_rate": 6.821969566789795e-05,
"loss": 0.3751,
"step": 10380
},
{
"epoch": 2.5,
"grad_norm": 1.0078125,
"learning_rate": 6.816572563065244e-05,
"loss": 0.4075,
"step": 10385
},
{
"epoch": 2.5012036591237363,
"grad_norm": 0.96875,
"learning_rate": 6.811187315428915e-05,
"loss": 0.3555,
"step": 10390
},
{
"epoch": 2.502407318247472,
"grad_norm": 0.98828125,
"learning_rate": 6.805813832485053e-05,
"loss": 0.4228,
"step": 10395
},
{
"epoch": 2.5036109773712085,
"grad_norm": 0.90625,
"learning_rate": 6.800452122819103e-05,
"loss": 0.3705,
"step": 10400
},
{
"epoch": 2.5048146364949444,
"grad_norm": 0.94921875,
"learning_rate": 6.795102194997705e-05,
"loss": 0.3691,
"step": 10405
},
{
"epoch": 2.5060182956186807,
"grad_norm": 0.875,
"learning_rate": 6.789764057568671e-05,
"loss": 0.3758,
"step": 10410
},
{
"epoch": 2.507221954742417,
"grad_norm": 0.9765625,
"learning_rate": 6.784437719060974e-05,
"loss": 0.3784,
"step": 10415
},
{
"epoch": 2.5084256138661534,
"grad_norm": 0.93359375,
"learning_rate": 6.779123187984744e-05,
"loss": 0.3855,
"step": 10420
},
{
"epoch": 2.5096292729898892,
"grad_norm": 1.0,
"learning_rate": 6.773820472831233e-05,
"loss": 0.3967,
"step": 10425
},
{
"epoch": 2.5108329321136256,
"grad_norm": 0.98828125,
"learning_rate": 6.768529582072822e-05,
"loss": 0.394,
"step": 10430
},
{
"epoch": 2.5120365912373614,
"grad_norm": 0.9921875,
"learning_rate": 6.763250524162999e-05,
"loss": 0.3686,
"step": 10435
},
{
"epoch": 2.5132402503610978,
"grad_norm": 1.03125,
"learning_rate": 6.757983307536345e-05,
"loss": 0.3658,
"step": 10440
},
{
"epoch": 2.514443909484834,
"grad_norm": 1.0859375,
"learning_rate": 6.752727940608524e-05,
"loss": 0.3989,
"step": 10445
},
{
"epoch": 2.51564756860857,
"grad_norm": 0.921875,
"learning_rate": 6.747484431776261e-05,
"loss": 0.3621,
"step": 10450
},
{
"epoch": 2.5168512277323063,
"grad_norm": 0.97265625,
"learning_rate": 6.742252789417342e-05,
"loss": 0.3824,
"step": 10455
},
{
"epoch": 2.518054886856042,
"grad_norm": 0.9921875,
"learning_rate": 6.737033021890588e-05,
"loss": 0.3737,
"step": 10460
},
{
"epoch": 2.5192585459797785,
"grad_norm": 1.03125,
"learning_rate": 6.731825137535853e-05,
"loss": 0.38,
"step": 10465
},
{
"epoch": 2.520462205103515,
"grad_norm": 0.875,
"learning_rate": 6.726629144673997e-05,
"loss": 0.3723,
"step": 10470
},
{
"epoch": 2.5216658642272507,
"grad_norm": 1.0078125,
"learning_rate": 6.721445051606887e-05,
"loss": 0.3894,
"step": 10475
},
{
"epoch": 2.522869523350987,
"grad_norm": 0.96484375,
"learning_rate": 6.716272866617375e-05,
"loss": 0.3631,
"step": 10480
},
{
"epoch": 2.524073182474723,
"grad_norm": 0.99609375,
"learning_rate": 6.711112597969284e-05,
"loss": 0.3701,
"step": 10485
},
{
"epoch": 2.525276841598459,
"grad_norm": 1.0390625,
"learning_rate": 6.705964253907401e-05,
"loss": 0.3641,
"step": 10490
},
{
"epoch": 2.5264805007221955,
"grad_norm": 0.89453125,
"learning_rate": 6.700827842657465e-05,
"loss": 0.364,
"step": 10495
},
{
"epoch": 2.527684159845932,
"grad_norm": 0.875,
"learning_rate": 6.695703372426138e-05,
"loss": 0.3691,
"step": 10500
},
{
"epoch": 2.527684159845932,
"eval_loss": 0.35430702567100525,
"eval_runtime": 2.3696,
"eval_samples_per_second": 84.401,
"eval_steps_per_second": 84.401,
"step": 10500
},
{
"epoch": 2.5288878189696677,
"grad_norm": 1.0078125,
"learning_rate": 6.690590851401017e-05,
"loss": 0.3803,
"step": 10505
},
{
"epoch": 2.530091478093404,
"grad_norm": 0.984375,
"learning_rate": 6.685490287750592e-05,
"loss": 0.3845,
"step": 10510
},
{
"epoch": 2.53129513721714,
"grad_norm": 0.98828125,
"learning_rate": 6.68040168962426e-05,
"loss": 0.3698,
"step": 10515
},
{
"epoch": 2.5324987963408763,
"grad_norm": 0.9765625,
"learning_rate": 6.675325065152299e-05,
"loss": 0.3667,
"step": 10520
},
{
"epoch": 2.5337024554646126,
"grad_norm": 0.91796875,
"learning_rate": 6.670260422445847e-05,
"loss": 0.3787,
"step": 10525
},
{
"epoch": 2.5349061145883485,
"grad_norm": 0.984375,
"learning_rate": 6.665207769596911e-05,
"loss": 0.3537,
"step": 10530
},
{
"epoch": 2.536109773712085,
"grad_norm": 0.9453125,
"learning_rate": 6.66016711467833e-05,
"loss": 0.3789,
"step": 10535
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.9375,
"learning_rate": 6.655138465743781e-05,
"loss": 0.3788,
"step": 10540
},
{
"epoch": 2.538517091959557,
"grad_norm": 0.93359375,
"learning_rate": 6.650121830827758e-05,
"loss": 0.3779,
"step": 10545
},
{
"epoch": 2.5397207510832933,
"grad_norm": 0.9609375,
"learning_rate": 6.645117217945553e-05,
"loss": 0.3759,
"step": 10550
},
{
"epoch": 2.5409244102070296,
"grad_norm": 1.0625,
"learning_rate": 6.640124635093258e-05,
"loss": 0.3863,
"step": 10555
},
{
"epoch": 2.5421280693307655,
"grad_norm": 1.03125,
"learning_rate": 6.635144090247737e-05,
"loss": 0.3704,
"step": 10560
},
{
"epoch": 2.543331728454502,
"grad_norm": 0.9296875,
"learning_rate": 6.630175591366627e-05,
"loss": 0.3687,
"step": 10565
},
{
"epoch": 2.5445353875782377,
"grad_norm": 1.0234375,
"learning_rate": 6.625219146388313e-05,
"loss": 0.3731,
"step": 10570
},
{
"epoch": 2.545739046701974,
"grad_norm": 0.9609375,
"learning_rate": 6.62027476323193e-05,
"loss": 0.3477,
"step": 10575
},
{
"epoch": 2.5469427058257104,
"grad_norm": 0.8515625,
"learning_rate": 6.615342449797326e-05,
"loss": 0.3876,
"step": 10580
},
{
"epoch": 2.5481463649494462,
"grad_norm": 0.96484375,
"learning_rate": 6.610422213965077e-05,
"loss": 0.4155,
"step": 10585
},
{
"epoch": 2.5493500240731826,
"grad_norm": 1.078125,
"learning_rate": 6.60551406359646e-05,
"loss": 0.3834,
"step": 10590
},
{
"epoch": 2.5505536831969184,
"grad_norm": 0.89453125,
"learning_rate": 6.600618006533439e-05,
"loss": 0.4113,
"step": 10595
},
{
"epoch": 2.5517573423206548,
"grad_norm": 0.9765625,
"learning_rate": 6.59573405059866e-05,
"loss": 0.3763,
"step": 10600
},
{
"epoch": 2.552961001444391,
"grad_norm": 1.1328125,
"learning_rate": 6.590862203595433e-05,
"loss": 0.4034,
"step": 10605
},
{
"epoch": 2.554164660568127,
"grad_norm": 1.046875,
"learning_rate": 6.586002473307714e-05,
"loss": 0.3623,
"step": 10610
},
{
"epoch": 2.5553683196918633,
"grad_norm": 1.0390625,
"learning_rate": 6.581154867500117e-05,
"loss": 0.373,
"step": 10615
},
{
"epoch": 2.556571978815599,
"grad_norm": 0.93359375,
"learning_rate": 6.576319393917863e-05,
"loss": 0.3624,
"step": 10620
},
{
"epoch": 2.5577756379393355,
"grad_norm": 0.95703125,
"learning_rate": 6.571496060286808e-05,
"loss": 0.3665,
"step": 10625
},
{
"epoch": 2.558979297063072,
"grad_norm": 0.96875,
"learning_rate": 6.566684874313397e-05,
"loss": 0.3977,
"step": 10630
},
{
"epoch": 2.560182956186808,
"grad_norm": 0.91015625,
"learning_rate": 6.561885843684673e-05,
"loss": 0.3893,
"step": 10635
},
{
"epoch": 2.561386615310544,
"grad_norm": 0.921875,
"learning_rate": 6.557098976068259e-05,
"loss": 0.3812,
"step": 10640
},
{
"epoch": 2.5625902744342803,
"grad_norm": 0.89453125,
"learning_rate": 6.552324279112338e-05,
"loss": 0.401,
"step": 10645
},
{
"epoch": 2.563793933558016,
"grad_norm": 0.8984375,
"learning_rate": 6.547561760445653e-05,
"loss": 0.3697,
"step": 10650
},
{
"epoch": 2.5649975926817525,
"grad_norm": 0.92578125,
"learning_rate": 6.542811427677492e-05,
"loss": 0.3761,
"step": 10655
},
{
"epoch": 2.566201251805489,
"grad_norm": 0.9140625,
"learning_rate": 6.538073288397665e-05,
"loss": 0.3788,
"step": 10660
},
{
"epoch": 2.5674049109292247,
"grad_norm": 1.0234375,
"learning_rate": 6.533347350176507e-05,
"loss": 0.4021,
"step": 10665
},
{
"epoch": 2.568608570052961,
"grad_norm": 0.953125,
"learning_rate": 6.52863362056485e-05,
"loss": 0.3843,
"step": 10670
},
{
"epoch": 2.569812229176697,
"grad_norm": 0.9765625,
"learning_rate": 6.523932107094033e-05,
"loss": 0.3697,
"step": 10675
},
{
"epoch": 2.5710158883004333,
"grad_norm": 0.90625,
"learning_rate": 6.519242817275864e-05,
"loss": 0.3471,
"step": 10680
},
{
"epoch": 2.5722195474241696,
"grad_norm": 1.015625,
"learning_rate": 6.514565758602627e-05,
"loss": 0.3696,
"step": 10685
},
{
"epoch": 2.573423206547906,
"grad_norm": 1.0,
"learning_rate": 6.509900938547065e-05,
"loss": 0.389,
"step": 10690
},
{
"epoch": 2.574626865671642,
"grad_norm": 0.984375,
"learning_rate": 6.505248364562362e-05,
"loss": 0.3975,
"step": 10695
},
{
"epoch": 2.575830524795378,
"grad_norm": 0.95703125,
"learning_rate": 6.50060804408214e-05,
"loss": 0.3653,
"step": 10700
},
{
"epoch": 2.577034183919114,
"grad_norm": 0.9609375,
"learning_rate": 6.495979984520442e-05,
"loss": 0.381,
"step": 10705
},
{
"epoch": 2.5782378430428503,
"grad_norm": 1.046875,
"learning_rate": 6.491364193271718e-05,
"loss": 0.3869,
"step": 10710
},
{
"epoch": 2.5794415021665866,
"grad_norm": 0.96484375,
"learning_rate": 6.486760677710823e-05,
"loss": 0.3844,
"step": 10715
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.953125,
"learning_rate": 6.48216944519299e-05,
"loss": 0.4066,
"step": 10720
},
{
"epoch": 2.581848820414059,
"grad_norm": 0.953125,
"learning_rate": 6.477590503053839e-05,
"loss": 0.3668,
"step": 10725
},
{
"epoch": 2.5830524795377947,
"grad_norm": 0.87109375,
"learning_rate": 6.473023858609337e-05,
"loss": 0.3904,
"step": 10730
},
{
"epoch": 2.584256138661531,
"grad_norm": 0.99609375,
"learning_rate": 6.468469519155818e-05,
"loss": 0.3985,
"step": 10735
},
{
"epoch": 2.5854597977852674,
"grad_norm": 0.9375,
"learning_rate": 6.463927491969946e-05,
"loss": 0.3593,
"step": 10740
},
{
"epoch": 2.5866634569090032,
"grad_norm": 0.99609375,
"learning_rate": 6.459397784308715e-05,
"loss": 0.3942,
"step": 10745
},
{
"epoch": 2.5878671160327396,
"grad_norm": 0.9453125,
"learning_rate": 6.454880403409437e-05,
"loss": 0.3605,
"step": 10750
},
{
"epoch": 2.5890707751564754,
"grad_norm": 0.93359375,
"learning_rate": 6.450375356489732e-05,
"loss": 0.3816,
"step": 10755
},
{
"epoch": 2.5902744342802118,
"grad_norm": 1.0078125,
"learning_rate": 6.445882650747506e-05,
"loss": 0.3736,
"step": 10760
},
{
"epoch": 2.591478093403948,
"grad_norm": 1.0078125,
"learning_rate": 6.441402293360954e-05,
"loss": 0.3479,
"step": 10765
},
{
"epoch": 2.5926817525276844,
"grad_norm": 0.9921875,
"learning_rate": 6.436934291488535e-05,
"loss": 0.3725,
"step": 10770
},
{
"epoch": 2.5938854116514203,
"grad_norm": 0.94921875,
"learning_rate": 6.432478652268974e-05,
"loss": 0.3473,
"step": 10775
},
{
"epoch": 2.5950890707751566,
"grad_norm": 1.0625,
"learning_rate": 6.428035382821242e-05,
"loss": 0.4079,
"step": 10780
},
{
"epoch": 2.5962927298988925,
"grad_norm": 0.9296875,
"learning_rate": 6.42360449024454e-05,
"loss": 0.3576,
"step": 10785
},
{
"epoch": 2.597496389022629,
"grad_norm": 1.0,
"learning_rate": 6.4191859816183e-05,
"loss": 0.3801,
"step": 10790
},
{
"epoch": 2.598700048146365,
"grad_norm": 0.90234375,
"learning_rate": 6.414779864002165e-05,
"loss": 0.3791,
"step": 10795
},
{
"epoch": 2.599903707270101,
"grad_norm": 0.90234375,
"learning_rate": 6.410386144435985e-05,
"loss": 0.377,
"step": 10800
},
{
"epoch": 2.6011073663938373,
"grad_norm": 0.94921875,
"learning_rate": 6.406004829939794e-05,
"loss": 0.3605,
"step": 10805
},
{
"epoch": 2.602311025517573,
"grad_norm": 1.0390625,
"learning_rate": 6.40163592751381e-05,
"loss": 0.3925,
"step": 10810
},
{
"epoch": 2.6035146846413095,
"grad_norm": 0.94921875,
"learning_rate": 6.39727944413842e-05,
"loss": 0.3766,
"step": 10815
},
{
"epoch": 2.604718343765046,
"grad_norm": 0.91015625,
"learning_rate": 6.392935386774167e-05,
"loss": 0.3609,
"step": 10820
},
{
"epoch": 2.605922002888782,
"grad_norm": 0.859375,
"learning_rate": 6.388603762361743e-05,
"loss": 0.4011,
"step": 10825
},
{
"epoch": 2.607125662012518,
"grad_norm": 0.94921875,
"learning_rate": 6.38428457782197e-05,
"loss": 0.3974,
"step": 10830
},
{
"epoch": 2.6083293211362544,
"grad_norm": 0.98828125,
"learning_rate": 6.3799778400558e-05,
"loss": 0.4045,
"step": 10835
},
{
"epoch": 2.6095329802599903,
"grad_norm": 0.9453125,
"learning_rate": 6.375683555944297e-05,
"loss": 0.3594,
"step": 10840
},
{
"epoch": 2.6107366393837266,
"grad_norm": 0.9765625,
"learning_rate": 6.371401732348621e-05,
"loss": 0.3919,
"step": 10845
},
{
"epoch": 2.611940298507463,
"grad_norm": 1.0390625,
"learning_rate": 6.367132376110036e-05,
"loss": 0.3993,
"step": 10850
},
{
"epoch": 2.6131439576311988,
"grad_norm": 0.95703125,
"learning_rate": 6.362875494049874e-05,
"loss": 0.3814,
"step": 10855
},
{
"epoch": 2.614347616754935,
"grad_norm": 0.9609375,
"learning_rate": 6.358631092969539e-05,
"loss": 0.3851,
"step": 10860
},
{
"epoch": 2.615551275878671,
"grad_norm": 0.9296875,
"learning_rate": 6.354399179650503e-05,
"loss": 0.3816,
"step": 10865
},
{
"epoch": 2.6167549350024073,
"grad_norm": 0.97265625,
"learning_rate": 6.35017976085427e-05,
"loss": 0.3611,
"step": 10870
},
{
"epoch": 2.6179585941261436,
"grad_norm": 0.94140625,
"learning_rate": 6.345972843322398e-05,
"loss": 0.3951,
"step": 10875
},
{
"epoch": 2.6191622532498795,
"grad_norm": 1.0078125,
"learning_rate": 6.341778433776457e-05,
"loss": 0.3976,
"step": 10880
},
{
"epoch": 2.620365912373616,
"grad_norm": 1.015625,
"learning_rate": 6.33759653891804e-05,
"loss": 0.3881,
"step": 10885
},
{
"epoch": 2.6215695714973517,
"grad_norm": 0.9375,
"learning_rate": 6.333427165428746e-05,
"loss": 0.3684,
"step": 10890
},
{
"epoch": 2.622773230621088,
"grad_norm": 1.03125,
"learning_rate": 6.329270319970161e-05,
"loss": 0.3772,
"step": 10895
},
{
"epoch": 2.6239768897448243,
"grad_norm": 0.85546875,
"learning_rate": 6.325126009183858e-05,
"loss": 0.3443,
"step": 10900
},
{
"epoch": 2.6251805488685607,
"grad_norm": 0.9453125,
"learning_rate": 6.320994239691385e-05,
"loss": 0.3801,
"step": 10905
},
{
"epoch": 2.6263842079922965,
"grad_norm": 0.91796875,
"learning_rate": 6.316875018094249e-05,
"loss": 0.3892,
"step": 10910
},
{
"epoch": 2.627587867116033,
"grad_norm": 0.97265625,
"learning_rate": 6.312768350973913e-05,
"loss": 0.3677,
"step": 10915
},
{
"epoch": 2.6287915262397687,
"grad_norm": 1.015625,
"learning_rate": 6.308674244891776e-05,
"loss": 0.3721,
"step": 10920
},
{
"epoch": 2.629995185363505,
"grad_norm": 0.9765625,
"learning_rate": 6.304592706389172e-05,
"loss": 0.3752,
"step": 10925
},
{
"epoch": 2.6311988444872414,
"grad_norm": 0.9140625,
"learning_rate": 6.30052374198735e-05,
"loss": 0.3737,
"step": 10930
},
{
"epoch": 2.6324025036109773,
"grad_norm": 0.96875,
"learning_rate": 6.296467358187474e-05,
"loss": 0.385,
"step": 10935
},
{
"epoch": 2.6336061627347136,
"grad_norm": 0.875,
"learning_rate": 6.292423561470606e-05,
"loss": 0.4001,
"step": 10940
},
{
"epoch": 2.6348098218584495,
"grad_norm": 0.91796875,
"learning_rate": 6.288392358297697e-05,
"loss": 0.383,
"step": 10945
},
{
"epoch": 2.636013480982186,
"grad_norm": 0.9765625,
"learning_rate": 6.284373755109574e-05,
"loss": 0.4,
"step": 10950
},
{
"epoch": 2.637217140105922,
"grad_norm": 0.9375,
"learning_rate": 6.280367758326935e-05,
"loss": 0.3747,
"step": 10955
},
{
"epoch": 2.638420799229658,
"grad_norm": 0.95703125,
"learning_rate": 6.276374374350337e-05,
"loss": 0.3694,
"step": 10960
},
{
"epoch": 2.6396244583533943,
"grad_norm": 1.0546875,
"learning_rate": 6.272393609560185e-05,
"loss": 0.395,
"step": 10965
},
{
"epoch": 2.64082811747713,
"grad_norm": 1.0390625,
"learning_rate": 6.268425470316717e-05,
"loss": 0.4206,
"step": 10970
},
{
"epoch": 2.6420317766008665,
"grad_norm": 0.91796875,
"learning_rate": 6.264469962960005e-05,
"loss": 0.3573,
"step": 10975
},
{
"epoch": 2.643235435724603,
"grad_norm": 0.9609375,
"learning_rate": 6.260527093809936e-05,
"loss": 0.3801,
"step": 10980
},
{
"epoch": 2.644439094848339,
"grad_norm": 0.9375,
"learning_rate": 6.256596869166204e-05,
"loss": 0.3603,
"step": 10985
},
{
"epoch": 2.645642753972075,
"grad_norm": 0.9609375,
"learning_rate": 6.2526792953083e-05,
"loss": 0.4127,
"step": 10990
},
{
"epoch": 2.6468464130958114,
"grad_norm": 1.0078125,
"learning_rate": 6.248774378495501e-05,
"loss": 0.396,
"step": 10995
},
{
"epoch": 2.6480500722195472,
"grad_norm": 1.0234375,
"learning_rate": 6.244882124966866e-05,
"loss": 0.3875,
"step": 11000
},
{
"epoch": 2.6480500722195472,
"eval_loss": 0.351482629776001,
"eval_runtime": 2.3721,
"eval_samples_per_second": 84.314,
"eval_steps_per_second": 84.314,
"step": 11000
},
{
"epoch": 2.6492537313432836,
"grad_norm": 1.0234375,
"learning_rate": 6.241002540941217e-05,
"loss": 0.371,
"step": 11005
},
{
"epoch": 2.65045739046702,
"grad_norm": 0.921875,
"learning_rate": 6.237135632617133e-05,
"loss": 0.361,
"step": 11010
},
{
"epoch": 2.6516610495907558,
"grad_norm": 1.0078125,
"learning_rate": 6.233281406172947e-05,
"loss": 0.3823,
"step": 11015
},
{
"epoch": 2.652864708714492,
"grad_norm": 0.8359375,
"learning_rate": 6.22943986776672e-05,
"loss": 0.3949,
"step": 11020
},
{
"epoch": 2.654068367838228,
"grad_norm": 1.015625,
"learning_rate": 6.225611023536247e-05,
"loss": 0.4004,
"step": 11025
},
{
"epoch": 2.6552720269619643,
"grad_norm": 0.96484375,
"learning_rate": 6.22179487959904e-05,
"loss": 0.3594,
"step": 11030
},
{
"epoch": 2.6564756860857006,
"grad_norm": 0.92578125,
"learning_rate": 6.217991442052319e-05,
"loss": 0.3806,
"step": 11035
},
{
"epoch": 2.657679345209437,
"grad_norm": 1.0390625,
"learning_rate": 6.214200716973001e-05,
"loss": 0.3783,
"step": 11040
},
{
"epoch": 2.658883004333173,
"grad_norm": 0.8828125,
"learning_rate": 6.210422710417694e-05,
"loss": 0.3588,
"step": 11045
},
{
"epoch": 2.660086663456909,
"grad_norm": 0.89453125,
"learning_rate": 6.206657428422685e-05,
"loss": 0.3772,
"step": 11050
},
{
"epoch": 2.661290322580645,
"grad_norm": 0.9296875,
"learning_rate": 6.202904877003929e-05,
"loss": 0.3611,
"step": 11055
},
{
"epoch": 2.6624939817043813,
"grad_norm": 0.98046875,
"learning_rate": 6.199165062157037e-05,
"loss": 0.4088,
"step": 11060
},
{
"epoch": 2.6636976408281177,
"grad_norm": 0.90234375,
"learning_rate": 6.195437989857279e-05,
"loss": 0.3566,
"step": 11065
},
{
"epoch": 2.6649012999518535,
"grad_norm": 1.046875,
"learning_rate": 6.19172366605956e-05,
"loss": 0.4142,
"step": 11070
},
{
"epoch": 2.66610495907559,
"grad_norm": 0.91015625,
"learning_rate": 6.188022096698417e-05,
"loss": 0.3698,
"step": 11075
},
{
"epoch": 2.6673086181993257,
"grad_norm": 0.9609375,
"learning_rate": 6.184333287688008e-05,
"loss": 0.3919,
"step": 11080
},
{
"epoch": 2.668512277323062,
"grad_norm": 0.99609375,
"learning_rate": 6.180657244922108e-05,
"loss": 0.4065,
"step": 11085
},
{
"epoch": 2.6697159364467984,
"grad_norm": 1.0625,
"learning_rate": 6.176993974274084e-05,
"loss": 0.4269,
"step": 11090
},
{
"epoch": 2.6709195955705343,
"grad_norm": 0.9609375,
"learning_rate": 6.17334348159691e-05,
"loss": 0.3642,
"step": 11095
},
{
"epoch": 2.6721232546942706,
"grad_norm": 0.92578125,
"learning_rate": 6.169705772723136e-05,
"loss": 0.3579,
"step": 11100
},
{
"epoch": 2.6733269138180065,
"grad_norm": 0.9140625,
"learning_rate": 6.166080853464888e-05,
"loss": 0.3663,
"step": 11105
},
{
"epoch": 2.674530572941743,
"grad_norm": 0.98046875,
"learning_rate": 6.162468729613855e-05,
"loss": 0.3744,
"step": 11110
},
{
"epoch": 2.675734232065479,
"grad_norm": 0.984375,
"learning_rate": 6.158869406941286e-05,
"loss": 0.3934,
"step": 11115
},
{
"epoch": 2.6769378911892154,
"grad_norm": 0.8828125,
"learning_rate": 6.155282891197976e-05,
"loss": 0.391,
"step": 11120
},
{
"epoch": 2.6781415503129513,
"grad_norm": 0.90234375,
"learning_rate": 6.151709188114261e-05,
"loss": 0.3712,
"step": 11125
},
{
"epoch": 2.6793452094366876,
"grad_norm": 0.921875,
"learning_rate": 6.1481483034e-05,
"loss": 0.3784,
"step": 11130
},
{
"epoch": 2.6805488685604235,
"grad_norm": 0.9296875,
"learning_rate": 6.144600242744574e-05,
"loss": 0.3491,
"step": 11135
},
{
"epoch": 2.68175252768416,
"grad_norm": 0.89453125,
"learning_rate": 6.141065011816873e-05,
"loss": 0.3602,
"step": 11140
},
{
"epoch": 2.682956186807896,
"grad_norm": 1.0390625,
"learning_rate": 6.137542616265291e-05,
"loss": 0.3833,
"step": 11145
},
{
"epoch": 2.684159845931632,
"grad_norm": 1.5390625,
"learning_rate": 6.134033061717713e-05,
"loss": 0.3674,
"step": 11150
},
{
"epoch": 2.6853635050553684,
"grad_norm": 0.92578125,
"learning_rate": 6.130536353781511e-05,
"loss": 0.3695,
"step": 11155
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.91796875,
"learning_rate": 6.127052498043521e-05,
"loss": 0.3714,
"step": 11160
},
{
"epoch": 2.6877708233028406,
"grad_norm": 0.9609375,
"learning_rate": 6.123581500070057e-05,
"loss": 0.3799,
"step": 11165
},
{
"epoch": 2.688974482426577,
"grad_norm": 0.875,
"learning_rate": 6.12012336540688e-05,
"loss": 0.3944,
"step": 11170
},
{
"epoch": 2.690178141550313,
"grad_norm": 0.875,
"learning_rate": 6.116678099579206e-05,
"loss": 0.3827,
"step": 11175
},
{
"epoch": 2.691381800674049,
"grad_norm": 0.984375,
"learning_rate": 6.113245708091684e-05,
"loss": 0.3582,
"step": 11180
},
{
"epoch": 2.6925854597977854,
"grad_norm": 0.875,
"learning_rate": 6.109826196428398e-05,
"loss": 0.3662,
"step": 11185
},
{
"epoch": 2.6937891189215213,
"grad_norm": 0.8671875,
"learning_rate": 6.106419570052849e-05,
"loss": 0.3711,
"step": 11190
},
{
"epoch": 2.6949927780452576,
"grad_norm": 0.97265625,
"learning_rate": 6.103025834407954e-05,
"loss": 0.4055,
"step": 11195
},
{
"epoch": 2.696196437168994,
"grad_norm": 0.96875,
"learning_rate": 6.099644994916033e-05,
"loss": 0.3714,
"step": 11200
},
{
"epoch": 2.69740009629273,
"grad_norm": 0.9375,
"learning_rate": 6.096277056978799e-05,
"loss": 0.3605,
"step": 11205
},
{
"epoch": 2.698603755416466,
"grad_norm": 0.95703125,
"learning_rate": 6.092922025977354e-05,
"loss": 0.3816,
"step": 11210
},
{
"epoch": 2.699807414540202,
"grad_norm": 0.98046875,
"learning_rate": 6.0895799072721795e-05,
"loss": 0.3263,
"step": 11215
},
{
"epoch": 2.7010110736639383,
"grad_norm": 0.9296875,
"learning_rate": 6.0862507062031206e-05,
"loss": 0.3622,
"step": 11220
},
{
"epoch": 2.7022147327876747,
"grad_norm": 0.95703125,
"learning_rate": 6.082934428089391e-05,
"loss": 0.3419,
"step": 11225
},
{
"epoch": 2.7034183919114105,
"grad_norm": 0.9609375,
"learning_rate": 6.0796310782295507e-05,
"loss": 0.3705,
"step": 11230
},
{
"epoch": 2.704622051035147,
"grad_norm": 0.953125,
"learning_rate": 6.076340661901507e-05,
"loss": 0.3694,
"step": 11235
},
{
"epoch": 2.7058257101588827,
"grad_norm": 0.98046875,
"learning_rate": 6.073063184362501e-05,
"loss": 0.3928,
"step": 11240
},
{
"epoch": 2.707029369282619,
"grad_norm": 0.9375,
"learning_rate": 6.0697986508491e-05,
"loss": 0.3723,
"step": 11245
},
{
"epoch": 2.7082330284063554,
"grad_norm": 1.0078125,
"learning_rate": 6.066547066577197e-05,
"loss": 0.4044,
"step": 11250
},
{
"epoch": 2.7094366875300917,
"grad_norm": 0.85546875,
"learning_rate": 6.063308436741984e-05,
"loss": 0.367,
"step": 11255
},
{
"epoch": 2.7106403466538276,
"grad_norm": 0.890625,
"learning_rate": 6.060082766517967e-05,
"loss": 0.3639,
"step": 11260
},
{
"epoch": 2.711844005777564,
"grad_norm": 0.91796875,
"learning_rate": 6.0568700610589346e-05,
"loss": 0.3635,
"step": 11265
},
{
"epoch": 2.7130476649013,
"grad_norm": 0.94140625,
"learning_rate": 6.0536703254979707e-05,
"loss": 0.3627,
"step": 11270
},
{
"epoch": 2.714251324025036,
"grad_norm": 0.921875,
"learning_rate": 6.0504835649474296e-05,
"loss": 0.395,
"step": 11275
},
{
"epoch": 2.7154549831487724,
"grad_norm": 0.9609375,
"learning_rate": 6.0473097844989376e-05,
"loss": 0.3656,
"step": 11280
},
{
"epoch": 2.7166586422725083,
"grad_norm": 0.90234375,
"learning_rate": 6.0441489892233855e-05,
"loss": 0.3684,
"step": 11285
},
{
"epoch": 2.7178623013962446,
"grad_norm": 0.890625,
"learning_rate": 6.041001184170911e-05,
"loss": 0.3606,
"step": 11290
},
{
"epoch": 2.7190659605199805,
"grad_norm": 0.91796875,
"learning_rate": 6.0378663743709026e-05,
"loss": 0.3906,
"step": 11295
},
{
"epoch": 2.720269619643717,
"grad_norm": 0.91796875,
"learning_rate": 6.034744564831977e-05,
"loss": 0.3674,
"step": 11300
},
{
"epoch": 2.721473278767453,
"grad_norm": 0.97265625,
"learning_rate": 6.031635760541992e-05,
"loss": 0.3944,
"step": 11305
},
{
"epoch": 2.7226769378911895,
"grad_norm": 0.9453125,
"learning_rate": 6.028539966468016e-05,
"loss": 0.3809,
"step": 11310
},
{
"epoch": 2.7238805970149254,
"grad_norm": 0.96484375,
"learning_rate": 6.0254571875563366e-05,
"loss": 0.3567,
"step": 11315
},
{
"epoch": 2.7250842561386617,
"grad_norm": 1.078125,
"learning_rate": 6.0223874287324425e-05,
"loss": 0.3785,
"step": 11320
},
{
"epoch": 2.7262879152623976,
"grad_norm": 1.0,
"learning_rate": 6.019330694901022e-05,
"loss": 0.3667,
"step": 11325
},
{
"epoch": 2.727491574386134,
"grad_norm": 0.9609375,
"learning_rate": 6.0162869909459514e-05,
"loss": 0.3595,
"step": 11330
},
{
"epoch": 2.72869523350987,
"grad_norm": 1.0546875,
"learning_rate": 6.0132563217302914e-05,
"loss": 0.3867,
"step": 11335
},
{
"epoch": 2.729898892633606,
"grad_norm": 0.91796875,
"learning_rate": 6.010238692096272e-05,
"loss": 0.3655,
"step": 11340
},
{
"epoch": 2.7311025517573424,
"grad_norm": 0.890625,
"learning_rate": 6.007234106865294e-05,
"loss": 0.3605,
"step": 11345
},
{
"epoch": 2.7323062108810783,
"grad_norm": 0.8984375,
"learning_rate": 6.0042425708379124e-05,
"loss": 0.3525,
"step": 11350
},
{
"epoch": 2.7335098700048146,
"grad_norm": 0.9296875,
"learning_rate": 6.001264088793834e-05,
"loss": 0.3554,
"step": 11355
},
{
"epoch": 2.734713529128551,
"grad_norm": 0.92578125,
"learning_rate": 5.998298665491915e-05,
"loss": 0.3771,
"step": 11360
},
{
"epoch": 2.735917188252287,
"grad_norm": 0.94921875,
"learning_rate": 5.995346305670136e-05,
"loss": 0.3954,
"step": 11365
},
{
"epoch": 2.737120847376023,
"grad_norm": 0.9140625,
"learning_rate": 5.9924070140456144e-05,
"loss": 0.3789,
"step": 11370
},
{
"epoch": 2.738324506499759,
"grad_norm": 0.9921875,
"learning_rate": 5.989480795314583e-05,
"loss": 0.404,
"step": 11375
},
{
"epoch": 2.7395281656234953,
"grad_norm": 0.859375,
"learning_rate": 5.98656765415239e-05,
"loss": 0.36,
"step": 11380
},
{
"epoch": 2.7407318247472316,
"grad_norm": 0.87890625,
"learning_rate": 5.983667595213488e-05,
"loss": 0.3549,
"step": 11385
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.96484375,
"learning_rate": 5.980780623131426e-05,
"loss": 0.3851,
"step": 11390
},
{
"epoch": 2.743139142994704,
"grad_norm": 0.98828125,
"learning_rate": 5.977906742518849e-05,
"loss": 0.3756,
"step": 11395
},
{
"epoch": 2.74434280211844,
"grad_norm": 0.90625,
"learning_rate": 5.975045957967477e-05,
"loss": 0.3734,
"step": 11400
},
{
"epoch": 2.745546461242176,
"grad_norm": 0.9921875,
"learning_rate": 5.972198274048113e-05,
"loss": 0.3929,
"step": 11405
},
{
"epoch": 2.7467501203659124,
"grad_norm": 1.03125,
"learning_rate": 5.969363695310628e-05,
"loss": 0.4056,
"step": 11410
},
{
"epoch": 2.7479537794896487,
"grad_norm": 0.91796875,
"learning_rate": 5.9665422262839467e-05,
"loss": 0.3685,
"step": 11415
},
{
"epoch": 2.7491574386133846,
"grad_norm": 0.91015625,
"learning_rate": 5.96373387147606e-05,
"loss": 0.3846,
"step": 11420
},
{
"epoch": 2.750361097737121,
"grad_norm": 0.98828125,
"learning_rate": 5.960938635373993e-05,
"loss": 0.3603,
"step": 11425
},
{
"epoch": 2.7515647568608568,
"grad_norm": 0.984375,
"learning_rate": 5.958156522443819e-05,
"loss": 0.3879,
"step": 11430
},
{
"epoch": 2.752768415984593,
"grad_norm": 0.97265625,
"learning_rate": 5.955387537130642e-05,
"loss": 0.3887,
"step": 11435
},
{
"epoch": 2.7539720751083294,
"grad_norm": 0.93359375,
"learning_rate": 5.952631683858589e-05,
"loss": 0.3618,
"step": 11440
},
{
"epoch": 2.7551757342320657,
"grad_norm": 1.0078125,
"learning_rate": 5.9498889670308085e-05,
"loss": 0.3767,
"step": 11445
},
{
"epoch": 2.7563793933558016,
"grad_norm": 0.9453125,
"learning_rate": 5.947159391029458e-05,
"loss": 0.3863,
"step": 11450
},
{
"epoch": 2.757583052479538,
"grad_norm": 0.96484375,
"learning_rate": 5.944442960215698e-05,
"loss": 0.3761,
"step": 11455
},
{
"epoch": 2.758786711603274,
"grad_norm": 0.91015625,
"learning_rate": 5.9417396789296946e-05,
"loss": 0.3543,
"step": 11460
},
{
"epoch": 2.75999037072701,
"grad_norm": 0.9921875,
"learning_rate": 5.939049551490592e-05,
"loss": 0.3569,
"step": 11465
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.9765625,
"learning_rate": 5.936372582196529e-05,
"loss": 0.3839,
"step": 11470
},
{
"epoch": 2.7623976889744823,
"grad_norm": 0.9296875,
"learning_rate": 5.933708775324613e-05,
"loss": 0.3781,
"step": 11475
},
{
"epoch": 2.7636013480982187,
"grad_norm": 0.91015625,
"learning_rate": 5.9310581351309275e-05,
"loss": 0.3674,
"step": 11480
},
{
"epoch": 2.7648050072219545,
"grad_norm": 0.96484375,
"learning_rate": 5.928420665850513e-05,
"loss": 0.3729,
"step": 11485
},
{
"epoch": 2.766008666345691,
"grad_norm": 0.96484375,
"learning_rate": 5.9257963716973694e-05,
"loss": 0.408,
"step": 11490
},
{
"epoch": 2.767212325469427,
"grad_norm": 0.91796875,
"learning_rate": 5.923185256864449e-05,
"loss": 0.3561,
"step": 11495
},
{
"epoch": 2.768415984593163,
"grad_norm": 0.87890625,
"learning_rate": 5.920587325523642e-05,
"loss": 0.3637,
"step": 11500
},
{
"epoch": 2.768415984593163,
"eval_loss": 0.34961211681365967,
"eval_runtime": 2.3594,
"eval_samples_per_second": 84.768,
"eval_steps_per_second": 84.768,
"step": 11500
},
{
"epoch": 2.7696196437168994,
"grad_norm": 1.03125,
"learning_rate": 5.9180025818257755e-05,
"loss": 0.3503,
"step": 11505
},
{
"epoch": 2.7708233028406353,
"grad_norm": 0.9296875,
"learning_rate": 5.915431029900609e-05,
"loss": 0.3741,
"step": 11510
},
{
"epoch": 2.7720269619643716,
"grad_norm": 0.984375,
"learning_rate": 5.912872673856823e-05,
"loss": 0.3609,
"step": 11515
},
{
"epoch": 2.773230621088108,
"grad_norm": 0.9140625,
"learning_rate": 5.910327517782015e-05,
"loss": 0.3708,
"step": 11520
},
{
"epoch": 2.7744342802118442,
"grad_norm": 0.87890625,
"learning_rate": 5.907795565742691e-05,
"loss": 0.3677,
"step": 11525
},
{
"epoch": 2.77563793933558,
"grad_norm": 0.859375,
"learning_rate": 5.9052768217842614e-05,
"loss": 0.3658,
"step": 11530
},
{
"epoch": 2.7768415984593164,
"grad_norm": 0.921875,
"learning_rate": 5.9027712899310354e-05,
"loss": 0.3784,
"step": 11535
},
{
"epoch": 2.7780452575830523,
"grad_norm": 0.953125,
"learning_rate": 5.900278974186208e-05,
"loss": 0.382,
"step": 11540
},
{
"epoch": 2.7792489167067886,
"grad_norm": 0.99609375,
"learning_rate": 5.897799878531861e-05,
"loss": 0.3691,
"step": 11545
},
{
"epoch": 2.780452575830525,
"grad_norm": 0.91015625,
"learning_rate": 5.8953340069289544e-05,
"loss": 0.3906,
"step": 11550
},
{
"epoch": 2.781656234954261,
"grad_norm": 1.0546875,
"learning_rate": 5.8928813633173194e-05,
"loss": 0.3764,
"step": 11555
},
{
"epoch": 2.782859894077997,
"grad_norm": 0.9609375,
"learning_rate": 5.890441951615651e-05,
"loss": 0.3873,
"step": 11560
},
{
"epoch": 2.784063553201733,
"grad_norm": 0.84375,
"learning_rate": 5.888015775721504e-05,
"loss": 0.3741,
"step": 11565
},
{
"epoch": 2.7852672123254694,
"grad_norm": 0.95703125,
"learning_rate": 5.8856028395112874e-05,
"loss": 0.3669,
"step": 11570
},
{
"epoch": 2.7864708714492057,
"grad_norm": 1.0390625,
"learning_rate": 5.8832031468402505e-05,
"loss": 0.3791,
"step": 11575
},
{
"epoch": 2.787674530572942,
"grad_norm": 0.92578125,
"learning_rate": 5.880816701542492e-05,
"loss": 0.3581,
"step": 11580
},
{
"epoch": 2.788878189696678,
"grad_norm": 0.94921875,
"learning_rate": 5.878443507430935e-05,
"loss": 0.3855,
"step": 11585
},
{
"epoch": 2.790081848820414,
"grad_norm": 0.96484375,
"learning_rate": 5.8760835682973376e-05,
"loss": 0.3761,
"step": 11590
},
{
"epoch": 2.79128550794415,
"grad_norm": 1.0625,
"learning_rate": 5.873736887912278e-05,
"loss": 0.3846,
"step": 11595
},
{
"epoch": 2.7924891670678864,
"grad_norm": 0.87890625,
"learning_rate": 5.871403470025148e-05,
"loss": 0.3836,
"step": 11600
},
{
"epoch": 2.7936928261916227,
"grad_norm": 1.078125,
"learning_rate": 5.869083318364154e-05,
"loss": 0.3734,
"step": 11605
},
{
"epoch": 2.7948964853153586,
"grad_norm": 1.0390625,
"learning_rate": 5.866776436636302e-05,
"loss": 0.3697,
"step": 11610
},
{
"epoch": 2.796100144439095,
"grad_norm": 0.8828125,
"learning_rate": 5.864482828527397e-05,
"loss": 0.3624,
"step": 11615
},
{
"epoch": 2.797303803562831,
"grad_norm": 0.890625,
"learning_rate": 5.862202497702039e-05,
"loss": 0.3829,
"step": 11620
},
{
"epoch": 2.798507462686567,
"grad_norm": 1.0234375,
"learning_rate": 5.859935447803608e-05,
"loss": 0.3867,
"step": 11625
},
{
"epoch": 2.7997111218103035,
"grad_norm": 1.0,
"learning_rate": 5.8576816824542733e-05,
"loss": 0.3666,
"step": 11630
},
{
"epoch": 2.8009147809340393,
"grad_norm": 0.921875,
"learning_rate": 5.8554412052549716e-05,
"loss": 0.3648,
"step": 11635
},
{
"epoch": 2.8021184400577757,
"grad_norm": 1.0234375,
"learning_rate": 5.8532140197854114e-05,
"loss": 0.3785,
"step": 11640
},
{
"epoch": 2.8033220991815115,
"grad_norm": 0.9375,
"learning_rate": 5.851000129604065e-05,
"loss": 0.3488,
"step": 11645
},
{
"epoch": 2.804525758305248,
"grad_norm": 0.97265625,
"learning_rate": 5.848799538248159e-05,
"loss": 0.3802,
"step": 11650
},
{
"epoch": 2.805729417428984,
"grad_norm": 0.93359375,
"learning_rate": 5.846612249233677e-05,
"loss": 0.378,
"step": 11655
},
{
"epoch": 2.8069330765527205,
"grad_norm": 0.9765625,
"learning_rate": 5.844438266055344e-05,
"loss": 0.3668,
"step": 11660
},
{
"epoch": 2.8081367356764564,
"grad_norm": 0.92578125,
"learning_rate": 5.84227759218663e-05,
"loss": 0.3688,
"step": 11665
},
{
"epoch": 2.8093403948001927,
"grad_norm": 0.92578125,
"learning_rate": 5.8401302310797366e-05,
"loss": 0.3956,
"step": 11670
},
{
"epoch": 2.8105440539239286,
"grad_norm": 0.9375,
"learning_rate": 5.837996186165596e-05,
"loss": 0.3865,
"step": 11675
},
{
"epoch": 2.811747713047665,
"grad_norm": 0.91015625,
"learning_rate": 5.835875460853866e-05,
"loss": 0.378,
"step": 11680
},
{
"epoch": 2.8129513721714012,
"grad_norm": 0.875,
"learning_rate": 5.8337680585329203e-05,
"loss": 0.3852,
"step": 11685
},
{
"epoch": 2.814155031295137,
"grad_norm": 0.98046875,
"learning_rate": 5.8316739825698495e-05,
"loss": 0.4052,
"step": 11690
},
{
"epoch": 2.8153586904188734,
"grad_norm": 0.9453125,
"learning_rate": 5.829593236310451e-05,
"loss": 0.3547,
"step": 11695
},
{
"epoch": 2.8165623495426093,
"grad_norm": 0.9765625,
"learning_rate": 5.8275258230792205e-05,
"loss": 0.3902,
"step": 11700
},
{
"epoch": 2.8177660086663456,
"grad_norm": 0.921875,
"learning_rate": 5.8254717461793563e-05,
"loss": 0.3707,
"step": 11705
},
{
"epoch": 2.818969667790082,
"grad_norm": 0.93359375,
"learning_rate": 5.823431008892747e-05,
"loss": 0.3905,
"step": 11710
},
{
"epoch": 2.8201733269138183,
"grad_norm": 0.95703125,
"learning_rate": 5.8214036144799686e-05,
"loss": 0.3578,
"step": 11715
},
{
"epoch": 2.821376986037554,
"grad_norm": 0.92578125,
"learning_rate": 5.819389566180274e-05,
"loss": 0.3622,
"step": 11720
},
{
"epoch": 2.8225806451612905,
"grad_norm": 0.95703125,
"learning_rate": 5.817388867211597e-05,
"loss": 0.3725,
"step": 11725
},
{
"epoch": 2.8237843042850264,
"grad_norm": 0.890625,
"learning_rate": 5.815401520770547e-05,
"loss": 0.3713,
"step": 11730
},
{
"epoch": 2.8249879634087627,
"grad_norm": 0.94140625,
"learning_rate": 5.813427530032388e-05,
"loss": 0.3941,
"step": 11735
},
{
"epoch": 2.826191622532499,
"grad_norm": 1.0234375,
"learning_rate": 5.811466898151054e-05,
"loss": 0.3782,
"step": 11740
},
{
"epoch": 2.827395281656235,
"grad_norm": 0.96484375,
"learning_rate": 5.809519628259132e-05,
"loss": 0.3623,
"step": 11745
},
{
"epoch": 2.828598940779971,
"grad_norm": 0.984375,
"learning_rate": 5.807585723467857e-05,
"loss": 0.3722,
"step": 11750
},
{
"epoch": 2.829802599903707,
"grad_norm": 0.95703125,
"learning_rate": 5.8056651868671185e-05,
"loss": 0.3562,
"step": 11755
},
{
"epoch": 2.8310062590274434,
"grad_norm": 0.94140625,
"learning_rate": 5.803758021525437e-05,
"loss": 0.3757,
"step": 11760
},
{
"epoch": 2.8322099181511797,
"grad_norm": 0.98828125,
"learning_rate": 5.801864230489977e-05,
"loss": 0.3889,
"step": 11765
},
{
"epoch": 2.8334135772749156,
"grad_norm": 0.98828125,
"learning_rate": 5.7999838167865285e-05,
"loss": 0.3868,
"step": 11770
},
{
"epoch": 2.834617236398652,
"grad_norm": 0.99609375,
"learning_rate": 5.798116783419512e-05,
"loss": 0.3725,
"step": 11775
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.90234375,
"learning_rate": 5.796263133371969e-05,
"loss": 0.3593,
"step": 11780
},
{
"epoch": 2.837024554646124,
"grad_norm": 0.88671875,
"learning_rate": 5.794422869605555e-05,
"loss": 0.3748,
"step": 11785
},
{
"epoch": 2.8382282137698605,
"grad_norm": 1.0,
"learning_rate": 5.7925959950605414e-05,
"loss": 0.3795,
"step": 11790
},
{
"epoch": 2.8394318728935968,
"grad_norm": 0.98828125,
"learning_rate": 5.790782512655804e-05,
"loss": 0.3898,
"step": 11795
},
{
"epoch": 2.8406355320173327,
"grad_norm": 0.9609375,
"learning_rate": 5.788982425288825e-05,
"loss": 0.3678,
"step": 11800
},
{
"epoch": 2.841839191141069,
"grad_norm": 0.91015625,
"learning_rate": 5.7871957358356804e-05,
"loss": 0.3665,
"step": 11805
},
{
"epoch": 2.843042850264805,
"grad_norm": 0.97265625,
"learning_rate": 5.7854224471510416e-05,
"loss": 0.3742,
"step": 11810
},
{
"epoch": 2.844246509388541,
"grad_norm": 0.9453125,
"learning_rate": 5.783662562068172e-05,
"loss": 0.3695,
"step": 11815
},
{
"epoch": 2.8454501685122775,
"grad_norm": 0.95703125,
"learning_rate": 5.7819160833989156e-05,
"loss": 0.3894,
"step": 11820
},
{
"epoch": 2.8466538276360134,
"grad_norm": 0.99609375,
"learning_rate": 5.7801830139336955e-05,
"loss": 0.4296,
"step": 11825
},
{
"epoch": 2.8478574867597497,
"grad_norm": 0.98828125,
"learning_rate": 5.778463356441515e-05,
"loss": 0.3739,
"step": 11830
},
{
"epoch": 2.8490611458834856,
"grad_norm": 0.97265625,
"learning_rate": 5.7767571136699455e-05,
"loss": 0.3604,
"step": 11835
},
{
"epoch": 2.850264805007222,
"grad_norm": 0.99609375,
"learning_rate": 5.775064288345125e-05,
"loss": 0.3687,
"step": 11840
},
{
"epoch": 2.8514684641309582,
"grad_norm": 0.88671875,
"learning_rate": 5.773384883171753e-05,
"loss": 0.3586,
"step": 11845
},
{
"epoch": 2.8526721232546945,
"grad_norm": 1.0703125,
"learning_rate": 5.771718900833093e-05,
"loss": 0.3733,
"step": 11850
},
{
"epoch": 2.8538757823784304,
"grad_norm": 1.0234375,
"learning_rate": 5.770066343990953e-05,
"loss": 0.3931,
"step": 11855
},
{
"epoch": 2.8550794415021667,
"grad_norm": 0.953125,
"learning_rate": 5.768427215285697e-05,
"loss": 0.3621,
"step": 11860
},
{
"epoch": 2.8562831006259026,
"grad_norm": 0.90234375,
"learning_rate": 5.766801517336232e-05,
"loss": 0.3703,
"step": 11865
},
{
"epoch": 2.857486759749639,
"grad_norm": 0.98046875,
"learning_rate": 5.7651892527400065e-05,
"loss": 0.3828,
"step": 11870
},
{
"epoch": 2.8586904188733753,
"grad_norm": 0.90234375,
"learning_rate": 5.763590424073006e-05,
"loss": 0.3823,
"step": 11875
},
{
"epoch": 2.859894077997111,
"grad_norm": 0.8984375,
"learning_rate": 5.7620050338897514e-05,
"loss": 0.3814,
"step": 11880
},
{
"epoch": 2.8610977371208475,
"grad_norm": 1.0390625,
"learning_rate": 5.760433084723286e-05,
"loss": 0.4181,
"step": 11885
},
{
"epoch": 2.8623013962445834,
"grad_norm": 0.9296875,
"learning_rate": 5.758874579085185e-05,
"loss": 0.3897,
"step": 11890
},
{
"epoch": 2.8635050553683197,
"grad_norm": 0.98828125,
"learning_rate": 5.757329519465538e-05,
"loss": 0.3766,
"step": 11895
},
{
"epoch": 2.864708714492056,
"grad_norm": 0.9765625,
"learning_rate": 5.755797908332955e-05,
"loss": 0.393,
"step": 11900
},
{
"epoch": 2.865912373615792,
"grad_norm": 0.9140625,
"learning_rate": 5.754279748134561e-05,
"loss": 0.3665,
"step": 11905
},
{
"epoch": 2.867116032739528,
"grad_norm": 0.87890625,
"learning_rate": 5.7527750412959805e-05,
"loss": 0.3591,
"step": 11910
},
{
"epoch": 2.868319691863264,
"grad_norm": 0.96875,
"learning_rate": 5.7512837902213556e-05,
"loss": 0.3488,
"step": 11915
},
{
"epoch": 2.8695233509870004,
"grad_norm": 1.03125,
"learning_rate": 5.749805997293318e-05,
"loss": 0.3772,
"step": 11920
},
{
"epoch": 2.8707270101107367,
"grad_norm": 0.91796875,
"learning_rate": 5.7483416648730076e-05,
"loss": 0.3487,
"step": 11925
},
{
"epoch": 2.871930669234473,
"grad_norm": 0.96875,
"learning_rate": 5.7468907953000476e-05,
"loss": 0.3724,
"step": 11930
},
{
"epoch": 2.873134328358209,
"grad_norm": 1.0234375,
"learning_rate": 5.745453390892555e-05,
"loss": 0.3805,
"step": 11935
},
{
"epoch": 2.8743379874819452,
"grad_norm": 0.88671875,
"learning_rate": 5.74402945394714e-05,
"loss": 0.3462,
"step": 11940
},
{
"epoch": 2.875541646605681,
"grad_norm": 0.94921875,
"learning_rate": 5.742618986738882e-05,
"loss": 0.3703,
"step": 11945
},
{
"epoch": 2.8767453057294174,
"grad_norm": 0.8828125,
"learning_rate": 5.741221991521349e-05,
"loss": 0.3602,
"step": 11950
},
{
"epoch": 2.8779489648531538,
"grad_norm": 0.94921875,
"learning_rate": 5.739838470526581e-05,
"loss": 0.3413,
"step": 11955
},
{
"epoch": 2.8791526239768896,
"grad_norm": 1.0078125,
"learning_rate": 5.7384684259650885e-05,
"loss": 0.3977,
"step": 11960
},
{
"epoch": 2.880356283100626,
"grad_norm": 0.93359375,
"learning_rate": 5.737111860025856e-05,
"loss": 0.3618,
"step": 11965
},
{
"epoch": 2.881559942224362,
"grad_norm": 0.99609375,
"learning_rate": 5.73576877487632e-05,
"loss": 0.4014,
"step": 11970
},
{
"epoch": 2.882763601348098,
"grad_norm": 1.0234375,
"learning_rate": 5.734439172662395e-05,
"loss": 0.3639,
"step": 11975
},
{
"epoch": 2.8839672604718345,
"grad_norm": 1.0,
"learning_rate": 5.733123055508439e-05,
"loss": 0.4067,
"step": 11980
},
{
"epoch": 2.8851709195955704,
"grad_norm": 0.828125,
"learning_rate": 5.7318204255172714e-05,
"loss": 0.3399,
"step": 11985
},
{
"epoch": 2.8863745787193067,
"grad_norm": 1.0,
"learning_rate": 5.7305312847701617e-05,
"loss": 0.3755,
"step": 11990
},
{
"epoch": 2.887578237843043,
"grad_norm": 0.94921875,
"learning_rate": 5.729255635326824e-05,
"loss": 0.3512,
"step": 11995
},
{
"epoch": 2.888781896966779,
"grad_norm": 0.984375,
"learning_rate": 5.727993479225422e-05,
"loss": 0.3536,
"step": 12000
},
{
"epoch": 2.888781896966779,
"eval_loss": 0.34957587718963623,
"eval_runtime": 2.3617,
"eval_samples_per_second": 84.685,
"eval_steps_per_second": 84.685,
"step": 12000
},
{
"epoch": 2.889985556090515,
"grad_norm": 0.8984375,
"learning_rate": 5.726744818482557e-05,
"loss": 0.3493,
"step": 12005
},
{
"epoch": 2.8911892152142515,
"grad_norm": 0.98046875,
"learning_rate": 5.7255096550932674e-05,
"loss": 0.3916,
"step": 12010
},
{
"epoch": 2.8923928743379874,
"grad_norm": 1.0078125,
"learning_rate": 5.724287991031028e-05,
"loss": 0.379,
"step": 12015
},
{
"epoch": 2.8935965334617237,
"grad_norm": 0.9375,
"learning_rate": 5.723079828247745e-05,
"loss": 0.3591,
"step": 12020
},
{
"epoch": 2.8948001925854596,
"grad_norm": 0.921875,
"learning_rate": 5.721885168673753e-05,
"loss": 0.3694,
"step": 12025
},
{
"epoch": 2.896003851709196,
"grad_norm": 1.0078125,
"learning_rate": 5.720704014217813e-05,
"loss": 0.382,
"step": 12030
},
{
"epoch": 2.8972075108329323,
"grad_norm": 0.90625,
"learning_rate": 5.719536366767105e-05,
"loss": 0.3723,
"step": 12035
},
{
"epoch": 2.898411169956668,
"grad_norm": 0.890625,
"learning_rate": 5.7183822281872304e-05,
"loss": 0.3768,
"step": 12040
},
{
"epoch": 2.8996148290804045,
"grad_norm": 1.0546875,
"learning_rate": 5.717241600322208e-05,
"loss": 0.3667,
"step": 12045
},
{
"epoch": 2.9008184882041403,
"grad_norm": 0.98046875,
"learning_rate": 5.716114484994467e-05,
"loss": 0.3876,
"step": 12050
},
{
"epoch": 2.9020221473278767,
"grad_norm": 0.94140625,
"learning_rate": 5.715000884004851e-05,
"loss": 0.393,
"step": 12055
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.921875,
"learning_rate": 5.713900799132607e-05,
"loss": 0.3597,
"step": 12060
},
{
"epoch": 2.9044294655753493,
"grad_norm": 1.0,
"learning_rate": 5.71281423213539e-05,
"loss": 0.37,
"step": 12065
},
{
"epoch": 2.905633124699085,
"grad_norm": 0.9296875,
"learning_rate": 5.7117411847492554e-05,
"loss": 0.3738,
"step": 12070
},
{
"epoch": 2.9068367838228215,
"grad_norm": 1.0078125,
"learning_rate": 5.7106816586886575e-05,
"loss": 0.3591,
"step": 12075
},
{
"epoch": 2.9080404429465574,
"grad_norm": 0.98046875,
"learning_rate": 5.709635655646446e-05,
"loss": 0.3688,
"step": 12080
},
{
"epoch": 2.9092441020702937,
"grad_norm": 1.078125,
"learning_rate": 5.708603177293868e-05,
"loss": 0.354,
"step": 12085
},
{
"epoch": 2.91044776119403,
"grad_norm": 1.0390625,
"learning_rate": 5.707584225280558e-05,
"loss": 0.3955,
"step": 12090
},
{
"epoch": 2.911651420317766,
"grad_norm": 1.015625,
"learning_rate": 5.706578801234538e-05,
"loss": 0.359,
"step": 12095
},
{
"epoch": 2.9128550794415022,
"grad_norm": 0.98046875,
"learning_rate": 5.7055869067622224e-05,
"loss": 0.3675,
"step": 12100
},
{
"epoch": 2.914058738565238,
"grad_norm": 0.93359375,
"learning_rate": 5.704608543448401e-05,
"loss": 0.3857,
"step": 12105
},
{
"epoch": 2.9152623976889744,
"grad_norm": 0.8984375,
"learning_rate": 5.703643712856248e-05,
"loss": 0.3681,
"step": 12110
},
{
"epoch": 2.9164660568127108,
"grad_norm": 0.9375,
"learning_rate": 5.702692416527318e-05,
"loss": 0.3573,
"step": 12115
},
{
"epoch": 2.9176697159364466,
"grad_norm": 1.0078125,
"learning_rate": 5.7017546559815344e-05,
"loss": 0.4083,
"step": 12120
},
{
"epoch": 2.918873375060183,
"grad_norm": 0.98046875,
"learning_rate": 5.700830432717201e-05,
"loss": 0.373,
"step": 12125
},
{
"epoch": 2.920077034183919,
"grad_norm": 1.046875,
"learning_rate": 5.6999197482109896e-05,
"loss": 0.4108,
"step": 12130
},
{
"epoch": 2.921280693307655,
"grad_norm": 0.94140625,
"learning_rate": 5.699022603917939e-05,
"loss": 0.3347,
"step": 12135
},
{
"epoch": 2.9224843524313915,
"grad_norm": 1.0625,
"learning_rate": 5.698139001271457e-05,
"loss": 0.4116,
"step": 12140
},
{
"epoch": 2.923688011555128,
"grad_norm": 0.94140625,
"learning_rate": 5.697268941683314e-05,
"loss": 0.3721,
"step": 12145
},
{
"epoch": 2.9248916706788637,
"grad_norm": 1.0625,
"learning_rate": 5.696412426543641e-05,
"loss": 0.3983,
"step": 12150
},
{
"epoch": 2.9260953298026,
"grad_norm": 0.93359375,
"learning_rate": 5.6955694572209304e-05,
"loss": 0.369,
"step": 12155
},
{
"epoch": 2.927298988926336,
"grad_norm": 1.015625,
"learning_rate": 5.6947400350620327e-05,
"loss": 0.3708,
"step": 12160
},
{
"epoch": 2.928502648050072,
"grad_norm": 1.0078125,
"learning_rate": 5.6939241613921515e-05,
"loss": 0.3619,
"step": 12165
},
{
"epoch": 2.9297063071738085,
"grad_norm": 0.90625,
"learning_rate": 5.69312183751484e-05,
"loss": 0.3736,
"step": 12170
},
{
"epoch": 2.9309099662975444,
"grad_norm": 0.9453125,
"learning_rate": 5.69233306471201e-05,
"loss": 0.386,
"step": 12175
},
{
"epoch": 2.9321136254212807,
"grad_norm": 0.9921875,
"learning_rate": 5.691557844243918e-05,
"loss": 0.3931,
"step": 12180
},
{
"epoch": 2.9333172845450166,
"grad_norm": 1.015625,
"learning_rate": 5.690796177349167e-05,
"loss": 0.3774,
"step": 12185
},
{
"epoch": 2.934520943668753,
"grad_norm": 0.91015625,
"learning_rate": 5.690048065244703e-05,
"loss": 0.3891,
"step": 12190
},
{
"epoch": 2.9357246027924893,
"grad_norm": 0.90625,
"learning_rate": 5.689313509125819e-05,
"loss": 0.3566,
"step": 12195
},
{
"epoch": 2.9369282619162256,
"grad_norm": 1.046875,
"learning_rate": 5.68859251016615e-05,
"loss": 0.3781,
"step": 12200
},
{
"epoch": 2.9381319210399615,
"grad_norm": 1.0546875,
"learning_rate": 5.687885069517665e-05,
"loss": 0.3726,
"step": 12205
},
{
"epoch": 2.939335580163698,
"grad_norm": 1.0703125,
"learning_rate": 5.687191188310672e-05,
"loss": 0.3902,
"step": 12210
},
{
"epoch": 2.9405392392874337,
"grad_norm": 0.984375,
"learning_rate": 5.686510867653818e-05,
"loss": 0.3617,
"step": 12215
},
{
"epoch": 2.94174289841117,
"grad_norm": 0.859375,
"learning_rate": 5.685844108634079e-05,
"loss": 0.3724,
"step": 12220
},
{
"epoch": 2.9429465575349063,
"grad_norm": 0.953125,
"learning_rate": 5.6851909123167645e-05,
"loss": 0.3861,
"step": 12225
},
{
"epoch": 2.944150216658642,
"grad_norm": 0.90625,
"learning_rate": 5.684551279745516e-05,
"loss": 0.3639,
"step": 12230
},
{
"epoch": 2.9453538757823785,
"grad_norm": 0.8828125,
"learning_rate": 5.6839252119423025e-05,
"loss": 0.4051,
"step": 12235
},
{
"epoch": 2.9465575349061144,
"grad_norm": 1.015625,
"learning_rate": 5.683312709907419e-05,
"loss": 0.3672,
"step": 12240
},
{
"epoch": 2.9477611940298507,
"grad_norm": 0.95703125,
"learning_rate": 5.682713774619488e-05,
"loss": 0.346,
"step": 12245
},
{
"epoch": 2.948964853153587,
"grad_norm": 0.9375,
"learning_rate": 5.682128407035453e-05,
"loss": 0.3697,
"step": 12250
},
{
"epoch": 2.950168512277323,
"grad_norm": 0.890625,
"learning_rate": 5.681556608090582e-05,
"loss": 0.3544,
"step": 12255
},
{
"epoch": 2.9513721714010592,
"grad_norm": 1.0625,
"learning_rate": 5.680998378698464e-05,
"loss": 0.3932,
"step": 12260
},
{
"epoch": 2.952575830524795,
"grad_norm": 0.92578125,
"learning_rate": 5.680453719751005e-05,
"loss": 0.365,
"step": 12265
},
{
"epoch": 2.9537794896485314,
"grad_norm": 0.95703125,
"learning_rate": 5.67992263211843e-05,
"loss": 0.374,
"step": 12270
},
{
"epoch": 2.9549831487722678,
"grad_norm": 1.046875,
"learning_rate": 5.679405116649284e-05,
"loss": 0.3662,
"step": 12275
},
{
"epoch": 2.956186807896004,
"grad_norm": 1.0,
"learning_rate": 5.67890117417042e-05,
"loss": 0.3914,
"step": 12280
},
{
"epoch": 2.95739046701974,
"grad_norm": 1.0625,
"learning_rate": 5.6784108054870116e-05,
"loss": 0.3774,
"step": 12285
},
{
"epoch": 2.9585941261434763,
"grad_norm": 0.921875,
"learning_rate": 5.677934011382542e-05,
"loss": 0.35,
"step": 12290
},
{
"epoch": 2.959797785267212,
"grad_norm": 1.046875,
"learning_rate": 5.677470792618806e-05,
"loss": 0.3669,
"step": 12295
},
{
"epoch": 2.9610014443909485,
"grad_norm": 0.9140625,
"learning_rate": 5.6770211499359076e-05,
"loss": 0.364,
"step": 12300
},
{
"epoch": 2.962205103514685,
"grad_norm": 1.0625,
"learning_rate": 5.67658508405226e-05,
"loss": 0.3492,
"step": 12305
},
{
"epoch": 2.9634087626384207,
"grad_norm": 0.8671875,
"learning_rate": 5.676162595664586e-05,
"loss": 0.3694,
"step": 12310
},
{
"epoch": 2.964612421762157,
"grad_norm": 0.94140625,
"learning_rate": 5.675753685447913e-05,
"loss": 0.3701,
"step": 12315
},
{
"epoch": 2.965816080885893,
"grad_norm": 0.87109375,
"learning_rate": 5.6753583540555744e-05,
"loss": 0.3856,
"step": 12320
},
{
"epoch": 2.967019740009629,
"grad_norm": 0.9453125,
"learning_rate": 5.6749766021192104e-05,
"loss": 0.352,
"step": 12325
},
{
"epoch": 2.9682233991333655,
"grad_norm": 0.9375,
"learning_rate": 5.674608430248761e-05,
"loss": 0.3669,
"step": 12330
},
{
"epoch": 2.969427058257102,
"grad_norm": 0.83984375,
"learning_rate": 5.67425383903247e-05,
"loss": 0.3822,
"step": 12335
},
{
"epoch": 2.9706307173808377,
"grad_norm": 0.99609375,
"learning_rate": 5.673912829036885e-05,
"loss": 0.3949,
"step": 12340
},
{
"epoch": 2.971834376504574,
"grad_norm": 1.0078125,
"learning_rate": 5.673585400806851e-05,
"loss": 0.3682,
"step": 12345
},
{
"epoch": 2.97303803562831,
"grad_norm": 0.88671875,
"learning_rate": 5.673271554865515e-05,
"loss": 0.3752,
"step": 12350
},
{
"epoch": 2.9742416947520463,
"grad_norm": 0.9921875,
"learning_rate": 5.6729712917143226e-05,
"loss": 0.368,
"step": 12355
},
{
"epoch": 2.9754453538757826,
"grad_norm": 0.99609375,
"learning_rate": 5.672684611833017e-05,
"loss": 0.3861,
"step": 12360
},
{
"epoch": 2.9766490129995185,
"grad_norm": 0.984375,
"learning_rate": 5.672411515679639e-05,
"loss": 0.3711,
"step": 12365
},
{
"epoch": 2.9778526721232548,
"grad_norm": 0.8828125,
"learning_rate": 5.672152003690527e-05,
"loss": 0.3475,
"step": 12370
},
{
"epoch": 2.9790563312469907,
"grad_norm": 1.0546875,
"learning_rate": 5.671906076280313e-05,
"loss": 0.363,
"step": 12375
},
{
"epoch": 2.980259990370727,
"grad_norm": 1.0390625,
"learning_rate": 5.67167373384193e-05,
"loss": 0.3683,
"step": 12380
},
{
"epoch": 2.9814636494944633,
"grad_norm": 0.9921875,
"learning_rate": 5.671454976746596e-05,
"loss": 0.383,
"step": 12385
},
{
"epoch": 2.982667308618199,
"grad_norm": 0.8515625,
"learning_rate": 5.6712498053438323e-05,
"loss": 0.3659,
"step": 12390
},
{
"epoch": 2.9838709677419355,
"grad_norm": 0.9296875,
"learning_rate": 5.671058219961449e-05,
"loss": 0.3937,
"step": 12395
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.90234375,
"learning_rate": 5.670880220905551e-05,
"loss": 0.3846,
"step": 12400
},
{
"epoch": 2.9862782859894077,
"grad_norm": 0.953125,
"learning_rate": 5.6707158084605354e-05,
"loss": 0.3594,
"step": 12405
},
{
"epoch": 2.987481945113144,
"grad_norm": 0.94921875,
"learning_rate": 5.670564982889091e-05,
"loss": 0.3887,
"step": 12410
},
{
"epoch": 2.9886856042368803,
"grad_norm": 0.8984375,
"learning_rate": 5.670427744432197e-05,
"loss": 0.3726,
"step": 12415
},
{
"epoch": 2.9898892633606162,
"grad_norm": 0.9453125,
"learning_rate": 5.670304093309127e-05,
"loss": 0.3825,
"step": 12420
},
{
"epoch": 2.9910929224843525,
"grad_norm": 0.9921875,
"learning_rate": 5.670194029717444e-05,
"loss": 0.3857,
"step": 12425
},
{
"epoch": 2.9922965816080884,
"grad_norm": 0.91796875,
"learning_rate": 5.670097553832999e-05,
"loss": 0.3839,
"step": 12430
},
{
"epoch": 2.9935002407318247,
"grad_norm": 0.8984375,
"learning_rate": 5.670014665809939e-05,
"loss": 0.3673,
"step": 12435
},
{
"epoch": 2.994703899855561,
"grad_norm": 1.0,
"learning_rate": 5.669945365780695e-05,
"loss": 0.3954,
"step": 12440
},
{
"epoch": 2.995907558979297,
"grad_norm": 1.015625,
"learning_rate": 5.6698896538559915e-05,
"loss": 0.3747,
"step": 12445
},
{
"epoch": 2.9971112181030333,
"grad_norm": 1.09375,
"learning_rate": 5.669847530124844e-05,
"loss": 0.3947,
"step": 12450
},
{
"epoch": 2.998314877226769,
"grad_norm": 0.9375,
"learning_rate": 5.6698189946545524e-05,
"loss": 0.359,
"step": 12455
},
{
"epoch": 2.999277804525758,
"eval_loss": 0.3474678099155426,
"eval_runtime": 2.3561,
"eval_samples_per_second": 84.886,
"eval_steps_per_second": 84.886,
"step": 12459
}
],
"logging_steps": 5,
"max_steps": 12462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.12509450944512e+17,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}