{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9950900163666123,
"eval_steps": 500,
"global_step": 1830,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008183306055646482,
"grad_norm": 2.34375,
"learning_rate": 1.142857142857143e-06,
"loss": 0.6534,
"step": 5
},
{
"epoch": 0.016366612111292964,
"grad_norm": 2.15625,
"learning_rate": 2.571428571428571e-06,
"loss": 0.6508,
"step": 10
},
{
"epoch": 0.024549918166939442,
"grad_norm": 1.84375,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6493,
"step": 15
},
{
"epoch": 0.03273322422258593,
"grad_norm": 1.4453125,
"learning_rate": 5.428571428571429e-06,
"loss": 0.6052,
"step": 20
},
{
"epoch": 0.04091653027823241,
"grad_norm": 1.0625,
"learning_rate": 6.857142857142858e-06,
"loss": 0.5644,
"step": 25
},
{
"epoch": 0.049099836333878884,
"grad_norm": 0.76953125,
"learning_rate": 8.285714285714287e-06,
"loss": 0.5195,
"step": 30
},
{
"epoch": 0.057283142389525366,
"grad_norm": 0.63671875,
"learning_rate": 9.714285714285715e-06,
"loss": 0.5033,
"step": 35
},
{
"epoch": 0.06546644844517185,
"grad_norm": 0.59375,
"learning_rate": 9.99990841172964e-06,
"loss": 0.4931,
"step": 40
},
{
"epoch": 0.07364975450081833,
"grad_norm": 0.56640625,
"learning_rate": 9.999536342048818e-06,
"loss": 0.4682,
"step": 45
},
{
"epoch": 0.08183306055646482,
"grad_norm": 0.60546875,
"learning_rate": 9.998878095066407e-06,
"loss": 0.4627,
"step": 50
},
{
"epoch": 0.09001636661211129,
"grad_norm": 0.53125,
"learning_rate": 9.997933721022044e-06,
"loss": 0.4458,
"step": 55
},
{
"epoch": 0.09819967266775777,
"grad_norm": 0.55078125,
"learning_rate": 9.996703291993557e-06,
"loss": 0.4567,
"step": 60
},
{
"epoch": 0.10638297872340426,
"grad_norm": 0.52734375,
"learning_rate": 9.995186901891448e-06,
"loss": 0.4644,
"step": 65
},
{
"epoch": 0.11456628477905073,
"grad_norm": 0.53515625,
"learning_rate": 9.993384666451743e-06,
"loss": 0.4547,
"step": 70
},
{
"epoch": 0.12274959083469722,
"grad_norm": 0.55859375,
"learning_rate": 9.991296723227148e-06,
"loss": 0.4546,
"step": 75
},
{
"epoch": 0.1309328968903437,
"grad_norm": 0.5234375,
"learning_rate": 9.988923231576558e-06,
"loss": 0.4334,
"step": 80
},
{
"epoch": 0.13911620294599017,
"grad_norm": 0.5078125,
"learning_rate": 9.986264372652883e-06,
"loss": 0.4419,
"step": 85
},
{
"epoch": 0.14729950900163666,
"grad_norm": 0.53125,
"learning_rate": 9.983320349389237e-06,
"loss": 0.4409,
"step": 90
},
{
"epoch": 0.15548281505728315,
"grad_norm": 0.51953125,
"learning_rate": 9.980091386483434e-06,
"loss": 0.4618,
"step": 95
},
{
"epoch": 0.16366612111292964,
"grad_norm": 0.515625,
"learning_rate": 9.976577730380855e-06,
"loss": 0.4488,
"step": 100
},
{
"epoch": 0.1718494271685761,
"grad_norm": 0.515625,
"learning_rate": 9.972779649255617e-06,
"loss": 0.4408,
"step": 105
},
{
"epoch": 0.18003273322422259,
"grad_norm": 0.5234375,
"learning_rate": 9.968697432990129e-06,
"loss": 0.4423,
"step": 110
},
{
"epoch": 0.18821603927986907,
"grad_norm": 0.5078125,
"learning_rate": 9.964331393152947e-06,
"loss": 0.4503,
"step": 115
},
{
"epoch": 0.19639934533551553,
"grad_norm": 0.51171875,
"learning_rate": 9.959681862975007e-06,
"loss": 0.4282,
"step": 120
},
{
"epoch": 0.20458265139116202,
"grad_norm": 0.51171875,
"learning_rate": 9.954749197324184e-06,
"loss": 0.4336,
"step": 125
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.51953125,
"learning_rate": 9.949533772678215e-06,
"loss": 0.4337,
"step": 130
},
{
"epoch": 0.220949263502455,
"grad_norm": 0.5234375,
"learning_rate": 9.944035987095955e-06,
"loss": 0.4365,
"step": 135
},
{
"epoch": 0.22913256955810146,
"grad_norm": 0.51953125,
"learning_rate": 9.938256260187002e-06,
"loss": 0.4259,
"step": 140
},
{
"epoch": 0.23731587561374795,
"grad_norm": 0.51171875,
"learning_rate": 9.932195033079677e-06,
"loss": 0.4291,
"step": 145
},
{
"epoch": 0.24549918166939444,
"grad_norm": 0.515625,
"learning_rate": 9.925852768387337e-06,
"loss": 0.4362,
"step": 150
},
{
"epoch": 0.25368248772504093,
"grad_norm": 0.5390625,
"learning_rate": 9.919229950173089e-06,
"loss": 0.4434,
"step": 155
},
{
"epoch": 0.2618657937806874,
"grad_norm": 0.51953125,
"learning_rate": 9.912327083912825e-06,
"loss": 0.4278,
"step": 160
},
{
"epoch": 0.2700490998363339,
"grad_norm": 0.53515625,
"learning_rate": 9.905144696456664e-06,
"loss": 0.4324,
"step": 165
},
{
"epoch": 0.27823240589198034,
"grad_norm": 0.51953125,
"learning_rate": 9.897683335988714e-06,
"loss": 0.4505,
"step": 170
},
{
"epoch": 0.2864157119476268,
"grad_norm": 0.515625,
"learning_rate": 9.889943571985258e-06,
"loss": 0.4329,
"step": 175
},
{
"epoch": 0.2945990180032733,
"grad_norm": 0.5234375,
"learning_rate": 9.881925995171272e-06,
"loss": 0.4439,
"step": 180
},
{
"epoch": 0.3027823240589198,
"grad_norm": 0.53125,
"learning_rate": 9.873631217475355e-06,
"loss": 0.4452,
"step": 185
},
{
"epoch": 0.3109656301145663,
"grad_norm": 0.55078125,
"learning_rate": 9.865059871983003e-06,
"loss": 0.4329,
"step": 190
},
{
"epoch": 0.3191489361702128,
"grad_norm": 0.515625,
"learning_rate": 9.856212612888312e-06,
"loss": 0.4224,
"step": 195
},
{
"epoch": 0.32733224222585927,
"grad_norm": 0.54296875,
"learning_rate": 9.847090115444032e-06,
"loss": 0.4186,
"step": 200
},
{
"epoch": 0.3355155482815057,
"grad_norm": 0.515625,
"learning_rate": 9.837693075910037e-06,
"loss": 0.4178,
"step": 205
},
{
"epoch": 0.3436988543371522,
"grad_norm": 0.53125,
"learning_rate": 9.828022211500183e-06,
"loss": 0.441,
"step": 210
},
{
"epoch": 0.3518821603927987,
"grad_norm": 0.54296875,
"learning_rate": 9.81807826032757e-06,
"loss": 0.4261,
"step": 215
},
{
"epoch": 0.36006546644844517,
"grad_norm": 0.515625,
"learning_rate": 9.807861981348196e-06,
"loss": 0.422,
"step": 220
},
{
"epoch": 0.36824877250409166,
"grad_norm": 0.498046875,
"learning_rate": 9.797374154303048e-06,
"loss": 0.4346,
"step": 225
},
{
"epoch": 0.37643207855973815,
"grad_norm": 0.53515625,
"learning_rate": 9.786615579658571e-06,
"loss": 0.4226,
"step": 230
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.53125,
"learning_rate": 9.77558707854559e-06,
"loss": 0.4136,
"step": 235
},
{
"epoch": 0.39279869067103107,
"grad_norm": 0.52734375,
"learning_rate": 9.764289492696628e-06,
"loss": 0.4292,
"step": 240
},
{
"epoch": 0.40098199672667756,
"grad_norm": 0.53125,
"learning_rate": 9.752723684381666e-06,
"loss": 0.4127,
"step": 245
},
{
"epoch": 0.40916530278232405,
"grad_norm": 0.52734375,
"learning_rate": 9.740890536342336e-06,
"loss": 0.4383,
"step": 250
},
{
"epoch": 0.41734860883797054,
"grad_norm": 0.51953125,
"learning_rate": 9.728790951724532e-06,
"loss": 0.4216,
"step": 255
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.5,
"learning_rate": 9.716425854009501e-06,
"loss": 0.4117,
"step": 260
},
{
"epoch": 0.4337152209492635,
"grad_norm": 0.515625,
"learning_rate": 9.70379618694334e-06,
"loss": 0.427,
"step": 265
},
{
"epoch": 0.44189852700491,
"grad_norm": 0.51171875,
"learning_rate": 9.690902914464977e-06,
"loss": 0.4267,
"step": 270
},
{
"epoch": 0.4500818330605565,
"grad_norm": 0.5078125,
"learning_rate": 9.677747020632595e-06,
"loss": 0.4178,
"step": 275
},
{
"epoch": 0.4582651391162029,
"grad_norm": 0.50390625,
"learning_rate": 9.664329509548534e-06,
"loss": 0.4094,
"step": 280
},
{
"epoch": 0.4664484451718494,
"grad_norm": 0.50390625,
"learning_rate": 9.650651405282638e-06,
"loss": 0.402,
"step": 285
},
{
"epoch": 0.4746317512274959,
"grad_norm": 0.5078125,
"learning_rate": 9.63671375179411e-06,
"loss": 0.4147,
"step": 290
},
{
"epoch": 0.4828150572831424,
"grad_norm": 0.494140625,
"learning_rate": 9.622517612851832e-06,
"loss": 0.4107,
"step": 295
},
{
"epoch": 0.4909983633387889,
"grad_norm": 0.5078125,
"learning_rate": 9.608064071953162e-06,
"loss": 0.4152,
"step": 300
},
{
"epoch": 0.49918166939443537,
"grad_norm": 0.5234375,
"learning_rate": 9.593354232241251e-06,
"loss": 0.4308,
"step": 305
},
{
"epoch": 0.5073649754500819,
"grad_norm": 0.51171875,
"learning_rate": 9.578389216420844e-06,
"loss": 0.4145,
"step": 310
},
{
"epoch": 0.5155482815057283,
"grad_norm": 0.515625,
"learning_rate": 9.563170166672585e-06,
"loss": 0.4062,
"step": 315
},
{
"epoch": 0.5237315875613748,
"grad_norm": 0.51953125,
"learning_rate": 9.547698244565855e-06,
"loss": 0.419,
"step": 320
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.5234375,
"learning_rate": 9.5319746309701e-06,
"loss": 0.413,
"step": 325
},
{
"epoch": 0.5400981996726678,
"grad_norm": 0.515625,
"learning_rate": 9.516000525964716e-06,
"loss": 0.4404,
"step": 330
},
{
"epoch": 0.5482815057283142,
"grad_norm": 0.51171875,
"learning_rate": 9.499777148747455e-06,
"loss": 0.4185,
"step": 335
},
{
"epoch": 0.5564648117839607,
"grad_norm": 0.5,
"learning_rate": 9.48330573754136e-06,
"loss": 0.4134,
"step": 340
},
{
"epoch": 0.5646481178396072,
"grad_norm": 0.5234375,
"learning_rate": 9.466587549500274e-06,
"loss": 0.4305,
"step": 345
},
{
"epoch": 0.5728314238952537,
"grad_norm": 0.50390625,
"learning_rate": 9.449623860612879e-06,
"loss": 0.4108,
"step": 350
},
{
"epoch": 0.5810147299509002,
"grad_norm": 0.50390625,
"learning_rate": 9.432415965605318e-06,
"loss": 0.4086,
"step": 355
},
{
"epoch": 0.5891980360065466,
"grad_norm": 0.5234375,
"learning_rate": 9.414965177842361e-06,
"loss": 0.4025,
"step": 360
},
{
"epoch": 0.5973813420621932,
"grad_norm": 0.51953125,
"learning_rate": 9.397272829227187e-06,
"loss": 0.4191,
"step": 365
},
{
"epoch": 0.6055646481178396,
"grad_norm": 0.51171875,
"learning_rate": 9.379340270099708e-06,
"loss": 0.4117,
"step": 370
},
{
"epoch": 0.613747954173486,
"grad_norm": 0.515625,
"learning_rate": 9.361168869133516e-06,
"loss": 0.418,
"step": 375
},
{
"epoch": 0.6219312602291326,
"grad_norm": 0.515625,
"learning_rate": 9.342760013231429e-06,
"loss": 0.4197,
"step": 380
},
{
"epoch": 0.630114566284779,
"grad_norm": 0.51953125,
"learning_rate": 9.324115107419616e-06,
"loss": 0.4173,
"step": 385
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.5078125,
"learning_rate": 9.305235574740386e-06,
"loss": 0.4133,
"step": 390
},
{
"epoch": 0.646481178396072,
"grad_norm": 0.53515625,
"learning_rate": 9.286122856143555e-06,
"loss": 0.4223,
"step": 395
},
{
"epoch": 0.6546644844517185,
"grad_norm": 0.494140625,
"learning_rate": 9.266778410376484e-06,
"loss": 0.4045,
"step": 400
},
{
"epoch": 0.662847790507365,
"grad_norm": 0.51171875,
"learning_rate": 9.247203713872732e-06,
"loss": 0.4196,
"step": 405
},
{
"epoch": 0.6710310965630114,
"grad_norm": 0.5234375,
"learning_rate": 9.227400260639374e-06,
"loss": 0.3995,
"step": 410
},
{
"epoch": 0.679214402618658,
"grad_norm": 0.53515625,
"learning_rate": 9.207369562142975e-06,
"loss": 0.4154,
"step": 415
},
{
"epoch": 0.6873977086743044,
"grad_norm": 0.5234375,
"learning_rate": 9.187113147194222e-06,
"loss": 0.4077,
"step": 420
},
{
"epoch": 0.6955810147299509,
"grad_norm": 0.5078125,
"learning_rate": 9.166632561831252e-06,
"loss": 0.4012,
"step": 425
},
{
"epoch": 0.7037643207855974,
"grad_norm": 0.53515625,
"learning_rate": 9.145929369201646e-06,
"loss": 0.4007,
"step": 430
},
{
"epoch": 0.7119476268412439,
"grad_norm": 0.494140625,
"learning_rate": 9.125005149443117e-06,
"loss": 0.412,
"step": 435
},
{
"epoch": 0.7201309328968903,
"grad_norm": 0.56640625,
"learning_rate": 9.103861499562925e-06,
"loss": 0.4165,
"step": 440
},
{
"epoch": 0.7283142389525368,
"grad_norm": 0.5,
"learning_rate": 9.082500033315976e-06,
"loss": 0.4114,
"step": 445
},
{
"epoch": 0.7364975450081833,
"grad_norm": 0.515625,
"learning_rate": 9.060922381081658e-06,
"loss": 0.419,
"step": 450
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.5234375,
"learning_rate": 9.039130189739405e-06,
"loss": 0.4209,
"step": 455
},
{
"epoch": 0.7528641571194763,
"grad_norm": 0.546875,
"learning_rate": 9.017125122543006e-06,
"loss": 0.4359,
"step": 460
},
{
"epoch": 0.7610474631751227,
"grad_norm": 0.5546875,
"learning_rate": 8.994908858993647e-06,
"loss": 0.4162,
"step": 465
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.515625,
"learning_rate": 8.97248309471174e-06,
"loss": 0.3991,
"step": 470
},
{
"epoch": 0.7774140752864157,
"grad_norm": 0.498046875,
"learning_rate": 8.949849541307505e-06,
"loss": 0.4087,
"step": 475
},
{
"epoch": 0.7855973813420621,
"grad_norm": 0.51953125,
"learning_rate": 8.927009926250324e-06,
"loss": 0.4053,
"step": 480
},
{
"epoch": 0.7937806873977087,
"grad_norm": 0.53125,
"learning_rate": 8.903965992736903e-06,
"loss": 0.4016,
"step": 485
},
{
"epoch": 0.8019639934533551,
"grad_norm": 0.53515625,
"learning_rate": 8.880719499558226e-06,
"loss": 0.4128,
"step": 490
},
{
"epoch": 0.8101472995090017,
"grad_norm": 0.51171875,
"learning_rate": 8.85727222096532e-06,
"loss": 0.4146,
"step": 495
},
{
"epoch": 0.8183306055646481,
"grad_norm": 0.486328125,
"learning_rate": 8.833625946533826e-06,
"loss": 0.407,
"step": 500
},
{
"epoch": 0.8183306055646481,
"eval_loss": 0.40063852071762085,
"eval_runtime": 5.182,
"eval_samples_per_second": 16.017,
"eval_steps_per_second": 16.017,
"step": 500
},
{
"epoch": 0.8265139116202946,
"grad_norm": 0.51953125,
"learning_rate": 8.809782481027425e-06,
"loss": 0.4279,
"step": 505
},
{
"epoch": 0.8346972176759411,
"grad_norm": 0.5234375,
"learning_rate": 8.785743644260087e-06,
"loss": 0.4123,
"step": 510
},
{
"epoch": 0.8428805237315876,
"grad_norm": 0.50390625,
"learning_rate": 8.761511270957179e-06,
"loss": 0.3964,
"step": 515
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.5390625,
"learning_rate": 8.737087210615434e-06,
"loss": 0.4143,
"step": 520
},
{
"epoch": 0.8592471358428805,
"grad_norm": 0.50390625,
"learning_rate": 8.71247332736178e-06,
"loss": 0.4108,
"step": 525
},
{
"epoch": 0.867430441898527,
"grad_norm": 0.5,
"learning_rate": 8.687671499811083e-06,
"loss": 0.4157,
"step": 530
},
{
"epoch": 0.8756137479541735,
"grad_norm": 0.4921875,
"learning_rate": 8.662683620922743e-06,
"loss": 0.4068,
"step": 535
},
{
"epoch": 0.88379705400982,
"grad_norm": 0.5078125,
"learning_rate": 8.637511597856234e-06,
"loss": 0.4154,
"step": 540
},
{
"epoch": 0.8919803600654664,
"grad_norm": 0.4921875,
"learning_rate": 8.612157351825536e-06,
"loss": 0.3982,
"step": 545
},
{
"epoch": 0.900163666121113,
"grad_norm": 0.515625,
"learning_rate": 8.586622817952504e-06,
"loss": 0.4016,
"step": 550
},
{
"epoch": 0.9083469721767594,
"grad_norm": 0.5,
"learning_rate": 8.560909945119162e-06,
"loss": 0.405,
"step": 555
},
{
"epoch": 0.9165302782324058,
"grad_norm": 0.53515625,
"learning_rate": 8.53502069581898e-06,
"loss": 0.4107,
"step": 560
},
{
"epoch": 0.9247135842880524,
"grad_norm": 0.4921875,
"learning_rate": 8.50895704600707e-06,
"loss": 0.398,
"step": 565
},
{
"epoch": 0.9328968903436988,
"grad_norm": 0.50390625,
"learning_rate": 8.48272098494938e-06,
"loss": 0.3961,
"step": 570
},
{
"epoch": 0.9410801963993454,
"grad_norm": 0.49609375,
"learning_rate": 8.45631451507087e-06,
"loss": 0.4021,
"step": 575
},
{
"epoch": 0.9492635024549918,
"grad_norm": 0.51953125,
"learning_rate": 8.429739651802676e-06,
"loss": 0.4104,
"step": 580
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.498046875,
"learning_rate": 8.402998423428291e-06,
"loss": 0.411,
"step": 585
},
{
"epoch": 0.9656301145662848,
"grad_norm": 0.51953125,
"learning_rate": 8.376092870928752e-06,
"loss": 0.4266,
"step": 590
},
{
"epoch": 0.9738134206219312,
"grad_norm": 0.50390625,
"learning_rate": 8.349025047826873e-06,
"loss": 0.4103,
"step": 595
},
{
"epoch": 0.9819967266775778,
"grad_norm": 0.498046875,
"learning_rate": 8.321797020030504e-06,
"loss": 0.4233,
"step": 600
},
{
"epoch": 0.9901800327332242,
"grad_norm": 0.515625,
"learning_rate": 8.294410865674864e-06,
"loss": 0.4273,
"step": 605
},
{
"epoch": 0.9983633387888707,
"grad_norm": 0.51171875,
"learning_rate": 8.266868674963924e-06,
"loss": 0.4179,
"step": 610
},
{
"epoch": 0.9983633387888707,
"eval_loss": 0.39774245023727417,
"eval_runtime": 5.08,
"eval_samples_per_second": 16.338,
"eval_steps_per_second": 16.338,
"step": 610
},
{
"epoch": 1.0065466448445173,
"grad_norm": 0.5,
"learning_rate": 8.23917255001088e-06,
"loss": 0.381,
"step": 615
},
{
"epoch": 1.0147299509001637,
"grad_norm": 0.498046875,
"learning_rate": 8.211324604677711e-06,
"loss": 0.4149,
"step": 620
},
{
"epoch": 1.0229132569558101,
"grad_norm": 0.51171875,
"learning_rate": 8.183326964413832e-06,
"loss": 0.4204,
"step": 625
},
{
"epoch": 1.0310965630114566,
"grad_norm": 0.51953125,
"learning_rate": 8.155181766093893e-06,
"loss": 0.4049,
"step": 630
},
{
"epoch": 1.039279869067103,
"grad_norm": 0.53515625,
"learning_rate": 8.12689115785467e-06,
"loss": 0.3967,
"step": 635
},
{
"epoch": 1.0474631751227497,
"grad_norm": 0.515625,
"learning_rate": 8.098457298931113e-06,
"loss": 0.3976,
"step": 640
},
{
"epoch": 1.055646481178396,
"grad_norm": 0.5234375,
"learning_rate": 8.069882359491555e-06,
"loss": 0.4098,
"step": 645
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.546875,
"learning_rate": 8.041168520472065e-06,
"loss": 0.3955,
"step": 650
},
{
"epoch": 1.072013093289689,
"grad_norm": 0.50390625,
"learning_rate": 8.012317973410001e-06,
"loss": 0.3852,
"step": 655
},
{
"epoch": 1.0801963993453354,
"grad_norm": 0.515625,
"learning_rate": 7.983332920276745e-06,
"loss": 0.4018,
"step": 660
},
{
"epoch": 1.088379705400982,
"grad_norm": 0.546875,
"learning_rate": 7.95421557330963e-06,
"loss": 0.3948,
"step": 665
},
{
"epoch": 1.0965630114566285,
"grad_norm": 0.50390625,
"learning_rate": 7.924968154843108e-06,
"loss": 0.3918,
"step": 670
},
{
"epoch": 1.104746317512275,
"grad_norm": 0.5625,
"learning_rate": 7.895592897139128e-06,
"loss": 0.4182,
"step": 675
},
{
"epoch": 1.1129296235679214,
"grad_norm": 0.51953125,
"learning_rate": 7.866092042216755e-06,
"loss": 0.3837,
"step": 680
},
{
"epoch": 1.121112929623568,
"grad_norm": 0.50390625,
"learning_rate": 7.836467841681066e-06,
"loss": 0.4191,
"step": 685
},
{
"epoch": 1.1292962356792144,
"grad_norm": 0.5234375,
"learning_rate": 7.806722556551292e-06,
"loss": 0.4124,
"step": 690
},
{
"epoch": 1.1374795417348609,
"grad_norm": 0.51171875,
"learning_rate": 7.776858457088249e-06,
"loss": 0.4093,
"step": 695
},
{
"epoch": 1.1456628477905073,
"grad_norm": 0.50390625,
"learning_rate": 7.746877822621059e-06,
"loss": 0.4067,
"step": 700
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.5078125,
"learning_rate": 7.716782941373201e-06,
"loss": 0.4061,
"step": 705
},
{
"epoch": 1.1620294599018004,
"grad_norm": 0.51953125,
"learning_rate": 7.68657611028785e-06,
"loss": 0.3947,
"step": 710
},
{
"epoch": 1.1702127659574468,
"grad_norm": 0.546875,
"learning_rate": 7.656259634852566e-06,
"loss": 0.4099,
"step": 715
},
{
"epoch": 1.1783960720130933,
"grad_norm": 0.515625,
"learning_rate": 7.625835828923344e-06,
"loss": 0.3978,
"step": 720
},
{
"epoch": 1.1865793780687397,
"grad_norm": 0.515625,
"learning_rate": 7.595307014548e-06,
"loss": 0.4063,
"step": 725
},
{
"epoch": 1.1947626841243864,
"grad_norm": 0.51953125,
"learning_rate": 7.5646755217889555e-06,
"loss": 0.3915,
"step": 730
},
{
"epoch": 1.2029459901800328,
"grad_norm": 0.5234375,
"learning_rate": 7.533943688545391e-06,
"loss": 0.4024,
"step": 735
},
{
"epoch": 1.2111292962356792,
"grad_norm": 0.51171875,
"learning_rate": 7.503113860374813e-06,
"loss": 0.4142,
"step": 740
},
{
"epoch": 1.2193126022913257,
"grad_norm": 0.53125,
"learning_rate": 7.472188390314029e-06,
"loss": 0.3906,
"step": 745
},
{
"epoch": 1.227495908346972,
"grad_norm": 0.54296875,
"learning_rate": 7.441169638699565e-06,
"loss": 0.3984,
"step": 750
},
{
"epoch": 1.2356792144026187,
"grad_norm": 0.51171875,
"learning_rate": 7.4100599729875045e-06,
"loss": 0.3901,
"step": 755
},
{
"epoch": 1.2405891980360066,
"eval_loss": 0.39578545093536377,
"eval_runtime": 5.0485,
"eval_samples_per_second": 16.441,
"eval_steps_per_second": 16.441,
"step": 758
},
{
"epoch": 1.2438625204582652,
"grad_norm": 0.53515625,
"learning_rate": 7.378861767572808e-06,
"loss": 0.3995,
"step": 760
},
{
"epoch": 1.2520458265139116,
"grad_norm": 0.54296875,
"learning_rate": 7.347577403608084e-06,
"loss": 0.3964,
"step": 765
},
{
"epoch": 1.260229132569558,
"grad_norm": 0.53515625,
"learning_rate": 7.316209268821852e-06,
"loss": 0.3901,
"step": 770
},
{
"epoch": 1.2684124386252047,
"grad_norm": 0.5546875,
"learning_rate": 7.284759757336304e-06,
"loss": 0.3886,
"step": 775
},
{
"epoch": 1.2765957446808511,
"grad_norm": 0.51171875,
"learning_rate": 7.25323126948458e-06,
"loss": 0.4042,
"step": 780
},
{
"epoch": 1.2847790507364976,
"grad_norm": 0.5234375,
"learning_rate": 7.221626211627557e-06,
"loss": 0.3994,
"step": 785
},
{
"epoch": 1.292962356792144,
"grad_norm": 0.53125,
"learning_rate": 7.1899469959702024e-06,
"loss": 0.3935,
"step": 790
},
{
"epoch": 1.3011456628477904,
"grad_norm": 0.50390625,
"learning_rate": 7.158196040377452e-06,
"loss": 0.3902,
"step": 795
},
{
"epoch": 1.3093289689034369,
"grad_norm": 0.5,
"learning_rate": 7.12637576818968e-06,
"loss": 0.3938,
"step": 800
},
{
"epoch": 1.3175122749590835,
"grad_norm": 0.5078125,
"learning_rate": 7.094488608037731e-06,
"loss": 0.3892,
"step": 805
},
{
"epoch": 1.32569558101473,
"grad_norm": 0.51171875,
"learning_rate": 7.062536993657574e-06,
"loss": 0.3957,
"step": 810
},
{
"epoch": 1.3338788870703764,
"grad_norm": 0.52734375,
"learning_rate": 7.0305233637045375e-06,
"loss": 0.4083,
"step": 815
},
{
"epoch": 1.342062193126023,
"grad_norm": 0.50390625,
"learning_rate": 6.998450161567189e-06,
"loss": 0.3917,
"step": 820
},
{
"epoch": 1.3502454991816695,
"grad_norm": 0.53125,
"learning_rate": 6.966319835180849e-06,
"loss": 0.3878,
"step": 825
},
{
"epoch": 1.358428805237316,
"grad_norm": 0.53125,
"learning_rate": 6.9341348368407505e-06,
"loss": 0.4083,
"step": 830
},
{
"epoch": 1.3666121112929623,
"grad_norm": 0.515625,
"learning_rate": 6.901897623014877e-06,
"loss": 0.3942,
"step": 835
},
{
"epoch": 1.3747954173486088,
"grad_norm": 0.5078125,
"learning_rate": 6.869610654156476e-06,
"loss": 0.3856,
"step": 840
},
{
"epoch": 1.3829787234042552,
"grad_norm": 0.51953125,
"learning_rate": 6.837276394516264e-06,
"loss": 0.4077,
"step": 845
},
{
"epoch": 1.3911620294599019,
"grad_norm": 0.52734375,
"learning_rate": 6.804897311954354e-06,
"loss": 0.3987,
"step": 850
},
{
"epoch": 1.3993453355155483,
"grad_norm": 0.53125,
"learning_rate": 6.77247587775189e-06,
"loss": 0.4003,
"step": 855
},
{
"epoch": 1.4075286415711947,
"grad_norm": 0.52734375,
"learning_rate": 6.7400145664224445e-06,
"loss": 0.4041,
"step": 860
},
{
"epoch": 1.4157119476268412,
"grad_norm": 0.5234375,
"learning_rate": 6.707515855523141e-06,
"loss": 0.4082,
"step": 865
},
{
"epoch": 1.4238952536824878,
"grad_norm": 0.5,
"learning_rate": 6.674982225465568e-06,
"loss": 0.3739,
"step": 870
},
{
"epoch": 1.4320785597381342,
"grad_norm": 0.5,
"learning_rate": 6.642416159326462e-06,
"loss": 0.3866,
"step": 875
},
{
"epoch": 1.4402618657937807,
"grad_norm": 0.5234375,
"learning_rate": 6.609820142658186e-06,
"loss": 0.4101,
"step": 880
},
{
"epoch": 1.4484451718494271,
"grad_norm": 0.5234375,
"learning_rate": 6.577196663299039e-06,
"loss": 0.3888,
"step": 885
},
{
"epoch": 1.4566284779050735,
"grad_norm": 0.50390625,
"learning_rate": 6.544548211183355e-06,
"loss": 0.3937,
"step": 890
},
{
"epoch": 1.4648117839607202,
"grad_norm": 0.53125,
"learning_rate": 6.511877278151479e-06,
"loss": 0.3989,
"step": 895
},
{
"epoch": 1.4729950900163666,
"grad_norm": 0.5078125,
"learning_rate": 6.479186357759575e-06,
"loss": 0.4157,
"step": 900
},
{
"epoch": 1.481178396072013,
"grad_norm": 0.52734375,
"learning_rate": 6.4464779450893086e-06,
"loss": 0.388,
"step": 905
},
{
"epoch": 1.4893617021276595,
"grad_norm": 0.50390625,
"learning_rate": 6.413754536557416e-06,
"loss": 0.3916,
"step": 910
},
{
"epoch": 1.4975450081833062,
"grad_norm": 0.52734375,
"learning_rate": 6.381018629725169e-06,
"loss": 0.4073,
"step": 915
},
{
"epoch": 1.5057283142389526,
"grad_norm": 0.53125,
"learning_rate": 6.34827272310775e-06,
"loss": 0.401,
"step": 920
},
{
"epoch": 1.513911620294599,
"grad_norm": 0.515625,
"learning_rate": 6.315519315983562e-06,
"loss": 0.3859,
"step": 925
},
{
"epoch": 1.5220949263502455,
"grad_norm": 0.51171875,
"learning_rate": 6.282760908203467e-06,
"loss": 0.3952,
"step": 930
},
{
"epoch": 1.530278232405892,
"grad_norm": 0.50390625,
"learning_rate": 6.25e-06,
"loss": 0.384,
"step": 935
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.53125,
"learning_rate": 6.2172390917965345e-06,
"loss": 0.3937,
"step": 940
},
{
"epoch": 1.546644844517185,
"grad_norm": 0.490234375,
"learning_rate": 6.18448068401644e-06,
"loss": 0.3811,
"step": 945
},
{
"epoch": 1.5548281505728314,
"grad_norm": 0.54296875,
"learning_rate": 6.151727276892252e-06,
"loss": 0.3966,
"step": 950
},
{
"epoch": 1.563011456628478,
"grad_norm": 0.53125,
"learning_rate": 6.118981370274833e-06,
"loss": 0.4045,
"step": 955
},
{
"epoch": 1.5711947626841245,
"grad_norm": 0.53125,
"learning_rate": 6.086245463442586e-06,
"loss": 0.3798,
"step": 960
},
{
"epoch": 1.579378068739771,
"grad_norm": 0.53515625,
"learning_rate": 6.0535220549106946e-06,
"loss": 0.4011,
"step": 965
},
{
"epoch": 1.5875613747954174,
"grad_norm": 0.53125,
"learning_rate": 6.020813642240426e-06,
"loss": 0.3887,
"step": 970
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.52734375,
"learning_rate": 5.988122721848521e-06,
"loss": 0.4012,
"step": 975
},
{
"epoch": 1.6039279869067102,
"grad_norm": 0.52734375,
"learning_rate": 5.955451788816645e-06,
"loss": 0.403,
"step": 980
},
{
"epoch": 1.6121112929623567,
"grad_norm": 0.52734375,
"learning_rate": 5.922803336700962e-06,
"loss": 0.3879,
"step": 985
},
{
"epoch": 1.6202945990180033,
"grad_norm": 0.55859375,
"learning_rate": 5.890179857341814e-06,
"loss": 0.4001,
"step": 990
},
{
"epoch": 1.6284779050736498,
"grad_norm": 0.51953125,
"learning_rate": 5.85758384067354e-06,
"loss": 0.4184,
"step": 995
},
{
"epoch": 1.6366612111292962,
"grad_norm": 0.51953125,
"learning_rate": 5.825017774534434e-06,
"loss": 0.4192,
"step": 1000
},
{
"epoch": 1.6366612111292962,
"eval_loss": 0.3935750126838684,
"eval_runtime": 5.036,
"eval_samples_per_second": 16.481,
"eval_steps_per_second": 16.481,
"step": 1000
},
{
"epoch": 1.6448445171849428,
"grad_norm": 0.53125,
"learning_rate": 5.7924841444768585e-06,
"loss": 0.3859,
"step": 1005
},
{
"epoch": 1.6530278232405893,
"grad_norm": 0.51171875,
"learning_rate": 5.759985433577557e-06,
"loss": 0.3973,
"step": 1010
},
{
"epoch": 1.6612111292962357,
"grad_norm": 0.52734375,
"learning_rate": 5.727524122248112e-06,
"loss": 0.4077,
"step": 1015
},
{
"epoch": 1.6693944353518821,
"grad_norm": 0.51171875,
"learning_rate": 5.695102688045649e-06,
"loss": 0.3916,
"step": 1020
},
{
"epoch": 1.6775777414075286,
"grad_norm": 0.50390625,
"learning_rate": 5.662723605483738e-06,
"loss": 0.3852,
"step": 1025
},
{
"epoch": 1.685761047463175,
"grad_norm": 0.546875,
"learning_rate": 5.6303893458435255e-06,
"loss": 0.4055,
"step": 1030
},
{
"epoch": 1.6939443535188214,
"grad_norm": 0.5390625,
"learning_rate": 5.598102376985124e-06,
"loss": 0.4046,
"step": 1035
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.53515625,
"learning_rate": 5.565865163159252e-06,
"loss": 0.3868,
"step": 1040
},
{
"epoch": 1.7103109656301145,
"grad_norm": 0.51171875,
"learning_rate": 5.5336801648191525e-06,
"loss": 0.3929,
"step": 1045
},
{
"epoch": 1.7184942716857612,
"grad_norm": 0.52734375,
"learning_rate": 5.501549838432812e-06,
"loss": 0.4018,
"step": 1050
},
{
"epoch": 1.7266775777414076,
"grad_norm": 0.52734375,
"learning_rate": 5.469476636295463e-06,
"loss": 0.4009,
"step": 1055
},
{
"epoch": 1.734860883797054,
"grad_norm": 0.5390625,
"learning_rate": 5.437463006342427e-06,
"loss": 0.3959,
"step": 1060
},
{
"epoch": 1.7430441898527005,
"grad_norm": 0.51953125,
"learning_rate": 5.4055113919622714e-06,
"loss": 0.3921,
"step": 1065
},
{
"epoch": 1.751227495908347,
"grad_norm": 0.53125,
"learning_rate": 5.373624231810322e-06,
"loss": 0.3869,
"step": 1070
},
{
"epoch": 1.7594108019639934,
"grad_norm": 0.52734375,
"learning_rate": 5.341803959622549e-06,
"loss": 0.3866,
"step": 1075
},
{
"epoch": 1.7675941080196398,
"grad_norm": 0.52734375,
"learning_rate": 5.310053004029798e-06,
"loss": 0.3853,
"step": 1080
},
{
"epoch": 1.7757774140752864,
"grad_norm": 0.54296875,
"learning_rate": 5.278373788372444e-06,
"loss": 0.4065,
"step": 1085
},
{
"epoch": 1.7839607201309329,
"grad_norm": 0.49609375,
"learning_rate": 5.246768730515424e-06,
"loss": 0.3839,
"step": 1090
},
{
"epoch": 1.7921440261865795,
"grad_norm": 0.515625,
"learning_rate": 5.2152402426636975e-06,
"loss": 0.3914,
"step": 1095
},
{
"epoch": 1.800327332242226,
"grad_norm": 0.53125,
"learning_rate": 5.183790731178151e-06,
"loss": 0.3846,
"step": 1100
},
{
"epoch": 1.8085106382978724,
"grad_norm": 0.51171875,
"learning_rate": 5.152422596391917e-06,
"loss": 0.3841,
"step": 1105
},
{
"epoch": 1.8166939443535188,
"grad_norm": 0.5078125,
"learning_rate": 5.121138232427193e-06,
"loss": 0.3932,
"step": 1110
},
{
"epoch": 1.8248772504091653,
"grad_norm": 0.52734375,
"learning_rate": 5.089940027012498e-06,
"loss": 0.4066,
"step": 1115
},
{
"epoch": 1.8330605564648117,
"grad_norm": 0.53515625,
"learning_rate": 5.058830361300437e-06,
"loss": 0.3977,
"step": 1120
},
{
"epoch": 1.8412438625204581,
"grad_norm": 0.54296875,
"learning_rate": 5.027811609685972e-06,
"loss": 0.4038,
"step": 1125
},
{
"epoch": 1.8494271685761048,
"grad_norm": 0.51953125,
"learning_rate": 4.9968861396251884e-06,
"loss": 0.3984,
"step": 1130
},
{
"epoch": 1.8576104746317512,
"grad_norm": 0.5390625,
"learning_rate": 4.96605631145461e-06,
"loss": 0.3976,
"step": 1135
},
{
"epoch": 1.8657937806873979,
"grad_norm": 0.53515625,
"learning_rate": 4.935324478211047e-06,
"loss": 0.4076,
"step": 1140
},
{
"epoch": 1.8739770867430443,
"grad_norm": 0.5234375,
"learning_rate": 4.9046929854520014e-06,
"loss": 0.3916,
"step": 1145
},
{
"epoch": 1.8821603927986907,
"grad_norm": 0.51171875,
"learning_rate": 4.8741641710766595e-06,
"loss": 0.3976,
"step": 1150
},
{
"epoch": 1.8903436988543372,
"grad_norm": 0.54296875,
"learning_rate": 4.843740365147435e-06,
"loss": 0.4034,
"step": 1155
},
{
"epoch": 1.8985270049099836,
"grad_norm": 0.5,
"learning_rate": 4.8134238897121515e-06,
"loss": 0.4105,
"step": 1160
},
{
"epoch": 1.90671031096563,
"grad_norm": 0.55078125,
"learning_rate": 4.783217058626799e-06,
"loss": 0.4022,
"step": 1165
},
{
"epoch": 1.9148936170212765,
"grad_norm": 0.5625,
"learning_rate": 4.753122177378941e-06,
"loss": 0.3885,
"step": 1170
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.5234375,
"learning_rate": 4.723141542911755e-06,
"loss": 0.4192,
"step": 1175
},
{
"epoch": 1.9312602291325696,
"grad_norm": 0.5,
"learning_rate": 4.693277443448709e-06,
"loss": 0.3854,
"step": 1180
},
{
"epoch": 1.939443535188216,
"grad_norm": 0.5546875,
"learning_rate": 4.663532158318936e-06,
"loss": 0.4069,
"step": 1185
},
{
"epoch": 1.9476268412438626,
"grad_norm": 0.5078125,
"learning_rate": 4.633907957783249e-06,
"loss": 0.4069,
"step": 1190
},
{
"epoch": 1.955810147299509,
"grad_norm": 0.5546875,
"learning_rate": 4.604407102860875e-06,
"loss": 0.4052,
"step": 1195
},
{
"epoch": 1.9639934533551555,
"grad_norm": 0.54296875,
"learning_rate": 4.575031845156893e-06,
"loss": 0.4004,
"step": 1200
},
{
"epoch": 1.972176759410802,
"grad_norm": 0.51953125,
"learning_rate": 4.545784426690371e-06,
"loss": 0.3926,
"step": 1205
},
{
"epoch": 1.9803600654664484,
"grad_norm": 0.5078125,
"learning_rate": 4.516667079723257e-06,
"loss": 0.3958,
"step": 1210
},
{
"epoch": 1.9885433715220948,
"grad_norm": 0.515625,
"learning_rate": 4.48768202659e-06,
"loss": 0.401,
"step": 1215
},
{
"epoch": 1.9967266775777412,
"grad_norm": 0.53125,
"learning_rate": 4.458831479527936e-06,
"loss": 0.392,
"step": 1220
},
{
"epoch": 1.9967266775777412,
"eval_loss": 0.3923807144165039,
"eval_runtime": 5.0473,
"eval_samples_per_second": 16.445,
"eval_steps_per_second": 16.445,
"step": 1220
},
{
"epoch": 2.0049099836333877,
"grad_norm": 0.49609375,
"learning_rate": 4.430117640508447e-06,
"loss": 0.3889,
"step": 1225
},
{
"epoch": 2.0130932896890346,
"grad_norm": 0.53125,
"learning_rate": 4.401542701068887e-06,
"loss": 0.3813,
"step": 1230
},
{
"epoch": 2.021276595744681,
"grad_norm": 0.51171875,
"learning_rate": 4.373108842145332e-06,
"loss": 0.3885,
"step": 1235
},
{
"epoch": 2.0294599018003274,
"grad_norm": 0.49609375,
"learning_rate": 4.344818233906108e-06,
"loss": 0.388,
"step": 1240
},
{
"epoch": 2.037643207855974,
"grad_norm": 0.49609375,
"learning_rate": 4.316673035586168e-06,
"loss": 0.3848,
"step": 1245
},
{
"epoch": 2.0458265139116203,
"grad_norm": 0.5078125,
"learning_rate": 4.288675395322291e-06,
"loss": 0.3885,
"step": 1250
},
{
"epoch": 2.0540098199672667,
"grad_norm": 0.5546875,
"learning_rate": 4.26082744998912e-06,
"loss": 0.382,
"step": 1255
},
{
"epoch": 2.062193126022913,
"grad_norm": 0.5234375,
"learning_rate": 4.233131325036077e-06,
"loss": 0.38,
"step": 1260
},
{
"epoch": 2.0703764320785596,
"grad_norm": 0.5234375,
"learning_rate": 4.205589134325138e-06,
"loss": 0.3926,
"step": 1265
},
{
"epoch": 2.078559738134206,
"grad_norm": 0.515625,
"learning_rate": 4.178202979969499e-06,
"loss": 0.3936,
"step": 1270
},
{
"epoch": 2.086743044189853,
"grad_norm": 0.53125,
"learning_rate": 4.15097495217313e-06,
"loss": 0.3862,
"step": 1275
},
{
"epoch": 2.0949263502454993,
"grad_norm": 0.5546875,
"learning_rate": 4.1239071290712485e-06,
"loss": 0.3968,
"step": 1280
},
{
"epoch": 2.1031096563011458,
"grad_norm": 0.52734375,
"learning_rate": 4.0970015765717105e-06,
"loss": 0.4087,
"step": 1285
},
{
"epoch": 2.111292962356792,
"grad_norm": 0.5078125,
"learning_rate": 4.070260348197324e-06,
"loss": 0.3784,
"step": 1290
},
{
"epoch": 2.1194762684124386,
"grad_norm": 0.51953125,
"learning_rate": 4.043685484929132e-06,
"loss": 0.3852,
"step": 1295
},
{
"epoch": 2.127659574468085,
"grad_norm": 0.50390625,
"learning_rate": 4.0172790150506215e-06,
"loss": 0.3756,
"step": 1300
},
{
"epoch": 2.1358428805237315,
"grad_norm": 0.5234375,
"learning_rate": 3.991042953992931e-06,
"loss": 0.3841,
"step": 1305
},
{
"epoch": 2.144026186579378,
"grad_norm": 0.52734375,
"learning_rate": 3.96497930418102e-06,
"loss": 0.3847,
"step": 1310
},
{
"epoch": 2.1522094926350244,
"grad_norm": 0.515625,
"learning_rate": 3.939090054880839e-06,
"loss": 0.3826,
"step": 1315
},
{
"epoch": 2.160392798690671,
"grad_norm": 0.5078125,
"learning_rate": 3.913377182047498e-06,
"loss": 0.3882,
"step": 1320
},
{
"epoch": 2.1685761047463177,
"grad_norm": 0.53515625,
"learning_rate": 3.887842648174465e-06,
"loss": 0.3998,
"step": 1325
},
{
"epoch": 2.176759410801964,
"grad_norm": 0.51171875,
"learning_rate": 3.862488402143767e-06,
"loss": 0.3782,
"step": 1330
},
{
"epoch": 2.1849427168576105,
"grad_norm": 0.53125,
"learning_rate": 3.8373163790772595e-06,
"loss": 0.3937,
"step": 1335
},
{
"epoch": 2.193126022913257,
"grad_norm": 0.51953125,
"learning_rate": 3.812328500188919e-06,
"loss": 0.3967,
"step": 1340
},
{
"epoch": 2.2013093289689034,
"grad_norm": 0.54296875,
"learning_rate": 3.78752667263822e-06,
"loss": 0.3856,
"step": 1345
},
{
"epoch": 2.20949263502455,
"grad_norm": 0.5078125,
"learning_rate": 3.762912789384568e-06,
"loss": 0.377,
"step": 1350
},
{
"epoch": 2.2176759410801963,
"grad_norm": 0.53125,
"learning_rate": 3.738488729042821e-06,
"loss": 0.3853,
"step": 1355
},
{
"epoch": 2.2258592471358427,
"grad_norm": 0.5234375,
"learning_rate": 3.7142563557399145e-06,
"loss": 0.386,
"step": 1360
},
{
"epoch": 2.2340425531914896,
"grad_norm": 0.51953125,
"learning_rate": 3.6902175189725764e-06,
"loss": 0.3881,
"step": 1365
},
{
"epoch": 2.242225859247136,
"grad_norm": 0.51953125,
"learning_rate": 3.666374053466175e-06,
"loss": 0.3934,
"step": 1370
},
{
"epoch": 2.2504091653027825,
"grad_norm": 0.53515625,
"learning_rate": 3.642727779034681e-06,
"loss": 0.391,
"step": 1375
},
{
"epoch": 2.258592471358429,
"grad_norm": 0.5078125,
"learning_rate": 3.6192805004417732e-06,
"loss": 0.3818,
"step": 1380
},
{
"epoch": 2.2667757774140753,
"grad_norm": 0.498046875,
"learning_rate": 3.5960340072630984e-06,
"loss": 0.3861,
"step": 1385
},
{
"epoch": 2.2749590834697218,
"grad_norm": 0.51171875,
"learning_rate": 3.572990073749678e-06,
"loss": 0.3989,
"step": 1390
},
{
"epoch": 2.283142389525368,
"grad_norm": 0.51171875,
"learning_rate": 3.550150458692497e-06,
"loss": 0.3973,
"step": 1395
},
{
"epoch": 2.2913256955810146,
"grad_norm": 0.51171875,
"learning_rate": 3.527516905288261e-06,
"loss": 0.3724,
"step": 1400
},
{
"epoch": 2.299509001636661,
"grad_norm": 0.51953125,
"learning_rate": 3.505091141006354e-06,
"loss": 0.3882,
"step": 1405
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.5234375,
"learning_rate": 3.4828748774569967e-06,
"loss": 0.3945,
"step": 1410
},
{
"epoch": 2.3158756137479544,
"grad_norm": 0.55078125,
"learning_rate": 3.460869810260595e-06,
"loss": 0.3919,
"step": 1415
},
{
"epoch": 2.324058919803601,
"grad_norm": 0.52734375,
"learning_rate": 3.4390776189183435e-06,
"loss": 0.3875,
"step": 1420
},
{
"epoch": 2.3322422258592472,
"grad_norm": 0.52734375,
"learning_rate": 3.4174999666840257e-06,
"loss": 0.3817,
"step": 1425
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.51171875,
"learning_rate": 3.396138500437076e-06,
"loss": 0.392,
"step": 1430
},
{
"epoch": 2.34860883797054,
"grad_norm": 0.5,
"learning_rate": 3.374994850556884e-06,
"loss": 0.3902,
"step": 1435
},
{
"epoch": 2.3567921440261865,
"grad_norm": 0.53515625,
"learning_rate": 3.354070630798355e-06,
"loss": 0.3935,
"step": 1440
},
{
"epoch": 2.364975450081833,
"grad_norm": 0.54296875,
"learning_rate": 3.3333674381687476e-06,
"loss": 0.3904,
"step": 1445
},
{
"epoch": 2.3731587561374794,
"grad_norm": 0.53125,
"learning_rate": 3.312886852805779e-06,
"loss": 0.403,
"step": 1450
},
{
"epoch": 2.381342062193126,
"grad_norm": 0.5234375,
"learning_rate": 3.292630437857026e-06,
"loss": 0.4007,
"step": 1455
},
{
"epoch": 2.3895253682487727,
"grad_norm": 0.53515625,
"learning_rate": 3.2725997393606266e-06,
"loss": 0.3833,
"step": 1460
},
{
"epoch": 2.397708674304419,
"grad_norm": 0.51953125,
"learning_rate": 3.2527962861272695e-06,
"loss": 0.3862,
"step": 1465
},
{
"epoch": 2.4058919803600656,
"grad_norm": 0.5078125,
"learning_rate": 3.2332215896235176e-06,
"loss": 0.3923,
"step": 1470
},
{
"epoch": 2.414075286415712,
"grad_norm": 0.50390625,
"learning_rate": 3.2138771438564465e-06,
"loss": 0.379,
"step": 1475
},
{
"epoch": 2.4222585924713584,
"grad_norm": 0.546875,
"learning_rate": 3.194764425259615e-06,
"loss": 0.3766,
"step": 1480
},
{
"epoch": 2.430441898527005,
"grad_norm": 0.50390625,
"learning_rate": 3.1758848925803846e-06,
"loss": 0.3904,
"step": 1485
},
{
"epoch": 2.4386252045826513,
"grad_norm": 0.51953125,
"learning_rate": 3.1572399867685727e-06,
"loss": 0.3922,
"step": 1490
},
{
"epoch": 2.4468085106382977,
"grad_norm": 0.5625,
"learning_rate": 3.138831130866484e-06,
"loss": 0.392,
"step": 1495
},
{
"epoch": 2.454991816693944,
"grad_norm": 0.53125,
"learning_rate": 3.1206597299002948e-06,
"loss": 0.3825,
"step": 1500
},
{
"epoch": 2.454991816693944,
"eval_loss": 0.39173567295074463,
"eval_runtime": 5.0609,
"eval_samples_per_second": 16.4,
"eval_steps_per_second": 16.4,
"step": 1500
},
{
"epoch": 2.4631751227495906,
"grad_norm": 0.53125,
"learning_rate": 3.1027271707728147e-06,
"loss": 0.3988,
"step": 1505
},
{
"epoch": 2.4713584288052375,
"grad_norm": 0.53125,
"learning_rate": 3.0850348221576405e-06,
"loss": 0.3944,
"step": 1510
},
{
"epoch": 2.479541734860884,
"grad_norm": 0.51953125,
"learning_rate": 3.067584034394684e-06,
"loss": 0.394,
"step": 1515
},
{
"epoch": 2.4877250409165304,
"grad_norm": 0.51953125,
"learning_rate": 3.050376139387121e-06,
"loss": 0.3959,
"step": 1520
},
{
"epoch": 2.495908346972177,
"grad_norm": 0.5078125,
"learning_rate": 3.0334124504997275e-06,
"loss": 0.384,
"step": 1525
},
{
"epoch": 2.504091653027823,
"grad_norm": 0.53125,
"learning_rate": 3.016694262458642e-06,
"loss": 0.384,
"step": 1530
},
{
"epoch": 2.5122749590834696,
"grad_norm": 0.50390625,
"learning_rate": 3.0002228512525485e-06,
"loss": 0.3892,
"step": 1535
},
{
"epoch": 2.520458265139116,
"grad_norm": 0.51953125,
"learning_rate": 2.983999474035285e-06,
"loss": 0.3921,
"step": 1540
},
{
"epoch": 2.528641571194763,
"grad_norm": 0.52734375,
"learning_rate": 2.968025369029902e-06,
"loss": 0.4082,
"step": 1545
},
{
"epoch": 2.5368248772504094,
"grad_norm": 0.52734375,
"learning_rate": 2.9523017554341465e-06,
"loss": 0.3814,
"step": 1550
},
{
"epoch": 2.545008183306056,
"grad_norm": 0.53125,
"learning_rate": 2.9368298333274148e-06,
"loss": 0.386,
"step": 1555
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.51953125,
"learning_rate": 2.921610783579157e-06,
"loss": 0.3794,
"step": 1560
},
{
"epoch": 2.5613747954173487,
"grad_norm": 0.5234375,
"learning_rate": 2.9066457677587488e-06,
"loss": 0.3805,
"step": 1565
},
{
"epoch": 2.569558101472995,
"grad_norm": 0.53515625,
"learning_rate": 2.891935928046839e-06,
"loss": 0.4039,
"step": 1570
},
{
"epoch": 2.5777414075286416,
"grad_norm": 0.5234375,
"learning_rate": 2.8774823871481695e-06,
"loss": 0.3814,
"step": 1575
},
{
"epoch": 2.585924713584288,
"grad_norm": 0.53515625,
"learning_rate": 2.86328624820589e-06,
"loss": 0.3798,
"step": 1580
},
{
"epoch": 2.5941080196399344,
"grad_norm": 0.54296875,
"learning_rate": 2.8493485947173643e-06,
"loss": 0.3909,
"step": 1585
},
{
"epoch": 2.602291325695581,
"grad_norm": 0.53125,
"learning_rate": 2.8356704904514683e-06,
"loss": 0.3958,
"step": 1590
},
{
"epoch": 2.6104746317512273,
"grad_norm": 0.54296875,
"learning_rate": 2.8222529793674055e-06,
"loss": 0.3929,
"step": 1595
},
{
"epoch": 2.6186579378068737,
"grad_norm": 0.5234375,
"learning_rate": 2.8090970855350252e-06,
"loss": 0.4019,
"step": 1600
},
{
"epoch": 2.6268412438625206,
"grad_norm": 0.51171875,
"learning_rate": 2.7962038130566616e-06,
"loss": 0.3837,
"step": 1605
},
{
"epoch": 2.635024549918167,
"grad_norm": 0.51953125,
"learning_rate": 2.783574145990501e-06,
"loss": 0.4116,
"step": 1610
},
{
"epoch": 2.6432078559738135,
"grad_norm": 0.51171875,
"learning_rate": 2.7712090482754683e-06,
"loss": 0.386,
"step": 1615
},
{
"epoch": 2.65139116202946,
"grad_norm": 0.5234375,
"learning_rate": 2.759109463657666e-06,
"loss": 0.3837,
"step": 1620
},
{
"epoch": 2.6595744680851063,
"grad_norm": 0.53515625,
"learning_rate": 2.7472763156183346e-06,
"loss": 0.4019,
"step": 1625
},
{
"epoch": 2.6677577741407528,
"grad_norm": 0.50390625,
"learning_rate": 2.735710507303374e-06,
"loss": 0.4055,
"step": 1630
},
{
"epoch": 2.675941080196399,
"grad_norm": 0.51953125,
"learning_rate": 2.7244129214544123e-06,
"loss": 0.3838,
"step": 1635
},
{
"epoch": 2.684124386252046,
"grad_norm": 0.53515625,
"learning_rate": 2.7133844203414305e-06,
"loss": 0.3838,
"step": 1640
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.55078125,
"learning_rate": 2.7026258456969538e-06,
"loss": 0.3999,
"step": 1645
},
{
"epoch": 2.700490998363339,
"grad_norm": 0.50390625,
"learning_rate": 2.6921380186518042e-06,
"loss": 0.3993,
"step": 1650
},
{
"epoch": 2.7086743044189854,
"grad_norm": 0.5078125,
"learning_rate": 2.6819217396724305e-06,
"loss": 0.3909,
"step": 1655
},
{
"epoch": 2.716857610474632,
"grad_norm": 0.52734375,
"learning_rate": 2.671977788499817e-06,
"loss": 0.3932,
"step": 1660
},
{
"epoch": 2.7250409165302782,
"grad_norm": 0.53125,
"learning_rate": 2.6623069240899642e-06,
"loss": 0.398,
"step": 1665
},
{
"epoch": 2.7332242225859247,
"grad_norm": 0.5078125,
"learning_rate": 2.6529098845559703e-06,
"loss": 0.3899,
"step": 1670
},
{
"epoch": 2.741407528641571,
"grad_norm": 0.52734375,
"learning_rate": 2.6437873871116903e-06,
"loss": 0.385,
"step": 1675
},
{
"epoch": 2.7495908346972175,
"grad_norm": 0.5234375,
"learning_rate": 2.6349401280169985e-06,
"loss": 0.3903,
"step": 1680
},
{
"epoch": 2.757774140752864,
"grad_norm": 0.51953125,
"learning_rate": 2.6263687825246463e-06,
"loss": 0.3708,
"step": 1685
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.51171875,
"learning_rate": 2.6180740048287274e-06,
"loss": 0.3821,
"step": 1690
},
{
"epoch": 2.774140752864157,
"grad_norm": 0.55859375,
"learning_rate": 2.610056428014743e-06,
"loss": 0.4001,
"step": 1695
},
{
"epoch": 2.7823240589198037,
"grad_norm": 0.546875,
"learning_rate": 2.6023166640112875e-06,
"loss": 0.4013,
"step": 1700
},
{
"epoch": 2.79050736497545,
"grad_norm": 0.52734375,
"learning_rate": 2.594855303543338e-06,
"loss": 0.3969,
"step": 1705
},
{
"epoch": 2.7986906710310966,
"grad_norm": 0.5703125,
"learning_rate": 2.587672916087175e-06,
"loss": 0.4152,
"step": 1710
},
{
"epoch": 2.806873977086743,
"grad_norm": 0.52734375,
"learning_rate": 2.5807700498269134e-06,
"loss": 0.3875,
"step": 1715
},
{
"epoch": 2.8150572831423895,
"grad_norm": 0.51171875,
"learning_rate": 2.574147231612665e-06,
"loss": 0.3951,
"step": 1720
},
{
"epoch": 2.823240589198036,
"grad_norm": 0.51953125,
"learning_rate": 2.5678049669203252e-06,
"loss": 0.404,
"step": 1725
},
{
"epoch": 2.8314238952536823,
"grad_norm": 0.5390625,
"learning_rate": 2.561743739812998e-06,
"loss": 0.3845,
"step": 1730
},
{
"epoch": 2.839607201309329,
"grad_norm": 0.5234375,
"learning_rate": 2.5559640129040464e-06,
"loss": 0.3885,
"step": 1735
},
{
"epoch": 2.8477905073649756,
"grad_norm": 0.515625,
"learning_rate": 2.550466227321786e-06,
"loss": 0.3967,
"step": 1740
},
{
"epoch": 2.855973813420622,
"grad_norm": 0.515625,
"learning_rate": 2.545250802675816e-06,
"loss": 0.3865,
"step": 1745
},
{
"epoch": 2.8641571194762685,
"grad_norm": 0.51953125,
"learning_rate": 2.540318137024994e-06,
"loss": 0.3873,
"step": 1750
},
{
"epoch": 2.872340425531915,
"grad_norm": 0.5546875,
"learning_rate": 2.5356686068470537e-06,
"loss": 0.3912,
"step": 1755
},
{
"epoch": 2.8805237315875614,
"grad_norm": 0.53515625,
"learning_rate": 2.5313025670098725e-06,
"loss": 0.3949,
"step": 1760
},
{
"epoch": 2.888707037643208,
"grad_norm": 0.546875,
"learning_rate": 2.5272203507443836e-06,
"loss": 0.3985,
"step": 1765
},
{
"epoch": 2.8968903436988542,
"grad_norm": 0.53125,
"learning_rate": 2.523422269619147e-06,
"loss": 0.3831,
"step": 1770
},
{
"epoch": 2.9050736497545007,
"grad_norm": 0.54296875,
"learning_rate": 2.5199086135165664e-06,
"loss": 0.3916,
"step": 1775
},
{
"epoch": 2.913256955810147,
"grad_norm": 0.51171875,
"learning_rate": 2.516679650610765e-06,
"loss": 0.4102,
"step": 1780
},
{
"epoch": 2.9214402618657935,
"grad_norm": 0.55859375,
"learning_rate": 2.5137356273471183e-06,
"loss": 0.407,
"step": 1785
},
{
"epoch": 2.9296235679214404,
"grad_norm": 0.53125,
"learning_rate": 2.511076768423443e-06,
"loss": 0.4131,
"step": 1790
},
{
"epoch": 2.937806873977087,
"grad_norm": 0.53125,
"learning_rate": 2.508703276772852e-06,
"loss": 0.3914,
"step": 1795
},
{
"epoch": 2.9459901800327333,
"grad_norm": 0.53125,
"learning_rate": 2.506615333548257e-06,
"loss": 0.4093,
"step": 1800
},
{
"epoch": 2.9541734860883797,
"grad_norm": 0.5,
"learning_rate": 2.5048130981085524e-06,
"loss": 0.3938,
"step": 1805
},
{
"epoch": 2.962356792144026,
"grad_norm": 0.53125,
"learning_rate": 2.5032967080064435e-06,
"loss": 0.4007,
"step": 1810
},
{
"epoch": 2.9705400981996726,
"grad_norm": 0.50390625,
"learning_rate": 2.5020662789779555e-06,
"loss": 0.3872,
"step": 1815
},
{
"epoch": 2.978723404255319,
"grad_norm": 0.55078125,
"learning_rate": 2.501121904933595e-06,
"loss": 0.3835,
"step": 1820
},
{
"epoch": 2.986906710310966,
"grad_norm": 0.53125,
"learning_rate": 2.5004636579511843e-06,
"loss": 0.4001,
"step": 1825
},
{
"epoch": 2.9950900163666123,
"grad_norm": 0.53125,
"learning_rate": 2.5000915882703615e-06,
"loss": 0.3819,
"step": 1830
},
{
"epoch": 2.9950900163666123,
"eval_loss": 0.3913484513759613,
"eval_runtime": 5.0457,
"eval_samples_per_second": 16.45,
"eval_steps_per_second": 16.45,
"step": 1830
}
],
"logging_steps": 5,
"max_steps": 1833,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0422166028387615e+18,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}