{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9995586406130883,
"eval_steps": 500,
"global_step": 4671,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003209886450266822,
"grad_norm": 4.5,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.969,
"step": 5
},
{
"epoch": 0.006419772900533644,
"grad_norm": 2.703125,
"learning_rate": 2.5714285714285714e-05,
"loss": 1.9039,
"step": 10
},
{
"epoch": 0.009629659350800466,
"grad_norm": 2.515625,
"learning_rate": 4e-05,
"loss": 1.7918,
"step": 15
},
{
"epoch": 0.012839545801067288,
"grad_norm": 2.15625,
"learning_rate": 5.428571428571428e-05,
"loss": 1.6624,
"step": 20
},
{
"epoch": 0.01604943225133411,
"grad_norm": 1.9296875,
"learning_rate": 6.857142857142858e-05,
"loss": 1.5578,
"step": 25
},
{
"epoch": 0.01925931870160093,
"grad_norm": 1.921875,
"learning_rate": 8.285714285714287e-05,
"loss": 1.4779,
"step": 30
},
{
"epoch": 0.022469205151867754,
"grad_norm": 1.875,
"learning_rate": 9.714285714285715e-05,
"loss": 1.4165,
"step": 35
},
{
"epoch": 0.025679091602134576,
"grad_norm": 1.8671875,
"learning_rate": 9.999986223659144e-05,
"loss": 1.3396,
"step": 40
},
{
"epoch": 0.028888978052401395,
"grad_norm": 1.765625,
"learning_rate": 9.999930257447894e-05,
"loss": 1.3222,
"step": 45
},
{
"epoch": 0.03209886450266822,
"grad_norm": 1.7578125,
"learning_rate": 9.99983124098696e-05,
"loss": 1.271,
"step": 50
},
{
"epoch": 0.03530875095293504,
"grad_norm": 1.8046875,
"learning_rate": 9.99968917541308e-05,
"loss": 1.2353,
"step": 55
},
{
"epoch": 0.03851863740320186,
"grad_norm": 1.8125,
"learning_rate": 9.999504062357203e-05,
"loss": 1.2284,
"step": 60
},
{
"epoch": 0.04172852385346868,
"grad_norm": 1.640625,
"learning_rate": 9.999275903944482e-05,
"loss": 1.2037,
"step": 65
},
{
"epoch": 0.04493841030373551,
"grad_norm": 1.765625,
"learning_rate": 9.99900470279424e-05,
"loss": 1.1832,
"step": 70
},
{
"epoch": 0.048148296754002326,
"grad_norm": 1.7265625,
"learning_rate": 9.998690462019939e-05,
"loss": 1.1533,
"step": 75
},
{
"epoch": 0.05135818320426915,
"grad_norm": 1.7734375,
"learning_rate": 9.998333185229152e-05,
"loss": 1.1481,
"step": 80
},
{
"epoch": 0.05456806965453597,
"grad_norm": 1.90625,
"learning_rate": 9.99793287652352e-05,
"loss": 1.1369,
"step": 85
},
{
"epoch": 0.05777795610480279,
"grad_norm": 1.765625,
"learning_rate": 9.997489540498695e-05,
"loss": 1.1191,
"step": 90
},
{
"epoch": 0.060987842555069616,
"grad_norm": 1.7421875,
"learning_rate": 9.9970031822443e-05,
"loss": 1.1189,
"step": 95
},
{
"epoch": 0.06419772900533643,
"grad_norm": 1.625,
"learning_rate": 9.996473807343865e-05,
"loss": 1.0978,
"step": 100
},
{
"epoch": 0.06740761545560325,
"grad_norm": 1.9375,
"learning_rate": 9.995901421874761e-05,
"loss": 1.0831,
"step": 105
},
{
"epoch": 0.07061750190587009,
"grad_norm": 1.9609375,
"learning_rate": 9.995286032408134e-05,
"loss": 1.0734,
"step": 110
},
{
"epoch": 0.0738273883561369,
"grad_norm": 1.7890625,
"learning_rate": 9.994627646008827e-05,
"loss": 1.0588,
"step": 115
},
{
"epoch": 0.07703727480640372,
"grad_norm": 2.0625,
"learning_rate": 9.993926270235301e-05,
"loss": 1.0553,
"step": 120
},
{
"epoch": 0.08024716125667054,
"grad_norm": 1.5625,
"learning_rate": 9.993181913139545e-05,
"loss": 1.0605,
"step": 125
},
{
"epoch": 0.08345704770693736,
"grad_norm": 1.6796875,
"learning_rate": 9.992394583266989e-05,
"loss": 1.0296,
"step": 130
},
{
"epoch": 0.0866669341572042,
"grad_norm": 1.65625,
"learning_rate": 9.991564289656398e-05,
"loss": 1.0441,
"step": 135
},
{
"epoch": 0.08987682060747101,
"grad_norm": 1.4609375,
"learning_rate": 9.990691041839778e-05,
"loss": 1.0367,
"step": 140
},
{
"epoch": 0.09308670705773783,
"grad_norm": 1.625,
"learning_rate": 9.989774849842257e-05,
"loss": 1.0188,
"step": 145
},
{
"epoch": 0.09629659350800465,
"grad_norm": 1.6328125,
"learning_rate": 9.988815724181975e-05,
"loss": 1.0121,
"step": 150
},
{
"epoch": 0.09950647995827147,
"grad_norm": 1.6953125,
"learning_rate": 9.987813675869966e-05,
"loss": 1.0097,
"step": 155
},
{
"epoch": 0.1027163664085383,
"grad_norm": 1.8984375,
"learning_rate": 9.98676871641002e-05,
"loss": 1.0222,
"step": 160
},
{
"epoch": 0.10592625285880512,
"grad_norm": 1.921875,
"learning_rate": 9.98568085779857e-05,
"loss": 0.9847,
"step": 165
},
{
"epoch": 0.10913613930907194,
"grad_norm": 1.6484375,
"learning_rate": 9.984550112524535e-05,
"loss": 1.0177,
"step": 170
},
{
"epoch": 0.11234602575933876,
"grad_norm": 1.7109375,
"learning_rate": 9.983376493569186e-05,
"loss": 0.986,
"step": 175
},
{
"epoch": 0.11555591220960558,
"grad_norm": 1.6875,
"learning_rate": 9.982160014406001e-05,
"loss": 0.996,
"step": 180
},
{
"epoch": 0.11876579865987241,
"grad_norm": 1.6015625,
"learning_rate": 9.980900689000498e-05,
"loss": 0.9572,
"step": 185
},
{
"epoch": 0.12197568511013923,
"grad_norm": 1.5703125,
"learning_rate": 9.979598531810088e-05,
"loss": 0.9589,
"step": 190
},
{
"epoch": 0.12518557156040605,
"grad_norm": 1.7109375,
"learning_rate": 9.978253557783898e-05,
"loss": 0.9885,
"step": 195
},
{
"epoch": 0.12839545801067287,
"grad_norm": 1.5390625,
"learning_rate": 9.97686578236261e-05,
"loss": 0.9701,
"step": 200
},
{
"epoch": 0.1316053444609397,
"grad_norm": 1.6484375,
"learning_rate": 9.97543522147827e-05,
"loss": 0.96,
"step": 205
},
{
"epoch": 0.1348152309112065,
"grad_norm": 1.6484375,
"learning_rate": 9.97396189155412e-05,
"loss": 0.9497,
"step": 210
},
{
"epoch": 0.13802511736147333,
"grad_norm": 1.7734375,
"learning_rate": 9.9724458095044e-05,
"loss": 0.9269,
"step": 215
},
{
"epoch": 0.14123500381174017,
"grad_norm": 1.6015625,
"learning_rate": 9.970886992734156e-05,
"loss": 0.9376,
"step": 220
},
{
"epoch": 0.144444890262007,
"grad_norm": 1.59375,
"learning_rate": 9.969285459139044e-05,
"loss": 0.9344,
"step": 225
},
{
"epoch": 0.1476547767122738,
"grad_norm": 1.5625,
"learning_rate": 9.967641227105115e-05,
"loss": 0.9316,
"step": 230
},
{
"epoch": 0.15086466316254063,
"grad_norm": 1.6875,
"learning_rate": 9.965954315508615e-05,
"loss": 0.9611,
"step": 235
},
{
"epoch": 0.15407454961280745,
"grad_norm": 1.5859375,
"learning_rate": 9.964224743715759e-05,
"loss": 0.9371,
"step": 240
},
{
"epoch": 0.15728443606307427,
"grad_norm": 1.7265625,
"learning_rate": 9.962452531582519e-05,
"loss": 0.9436,
"step": 245
},
{
"epoch": 0.1604943225133411,
"grad_norm": 1.65625,
"learning_rate": 9.960637699454385e-05,
"loss": 0.9463,
"step": 250
},
{
"epoch": 0.1637042089636079,
"grad_norm": 1.6875,
"learning_rate": 9.95878026816614e-05,
"loss": 0.9082,
"step": 255
},
{
"epoch": 0.16691409541387472,
"grad_norm": 1.7578125,
"learning_rate": 9.95688025904161e-05,
"loss": 0.9109,
"step": 260
},
{
"epoch": 0.17012398186414154,
"grad_norm": 1.6328125,
"learning_rate": 9.954937693893438e-05,
"loss": 0.9137,
"step": 265
},
{
"epoch": 0.1733338683144084,
"grad_norm": 1.703125,
"learning_rate": 9.952952595022813e-05,
"loss": 0.9238,
"step": 270
},
{
"epoch": 0.1765437547646752,
"grad_norm": 1.84375,
"learning_rate": 9.950924985219228e-05,
"loss": 0.9301,
"step": 275
},
{
"epoch": 0.17975364121494203,
"grad_norm": 1.7265625,
"learning_rate": 9.94885488776021e-05,
"loss": 0.8841,
"step": 280
},
{
"epoch": 0.18296352766520885,
"grad_norm": 1.7734375,
"learning_rate": 9.946742326411057e-05,
"loss": 0.8775,
"step": 285
},
{
"epoch": 0.18617341411547567,
"grad_norm": 1.703125,
"learning_rate": 9.944587325424566e-05,
"loss": 0.8849,
"step": 290
},
{
"epoch": 0.18938330056574249,
"grad_norm": 1.5546875,
"learning_rate": 9.942389909540753e-05,
"loss": 0.9084,
"step": 295
},
{
"epoch": 0.1925931870160093,
"grad_norm": 1.65625,
"learning_rate": 9.940150103986565e-05,
"loss": 0.8777,
"step": 300
},
{
"epoch": 0.19580307346627612,
"grad_norm": 1.7734375,
"learning_rate": 9.9378679344756e-05,
"loss": 0.8883,
"step": 305
},
{
"epoch": 0.19901295991654294,
"grad_norm": 1.5625,
"learning_rate": 9.935543427207801e-05,
"loss": 0.8874,
"step": 310
},
{
"epoch": 0.20222284636680976,
"grad_norm": 1.7890625,
"learning_rate": 9.933176608869166e-05,
"loss": 0.8846,
"step": 315
},
{
"epoch": 0.2054327328170766,
"grad_norm": 1.75,
"learning_rate": 9.930767506631427e-05,
"loss": 0.9083,
"step": 320
},
{
"epoch": 0.20864261926734343,
"grad_norm": 1.6796875,
"learning_rate": 9.928316148151756e-05,
"loss": 0.9058,
"step": 325
},
{
"epoch": 0.21185250571761025,
"grad_norm": 1.5625,
"learning_rate": 9.925822561572435e-05,
"loss": 0.8871,
"step": 330
},
{
"epoch": 0.21506239216787706,
"grad_norm": 1.625,
"learning_rate": 9.923286775520537e-05,
"loss": 0.8707,
"step": 335
},
{
"epoch": 0.21827227861814388,
"grad_norm": 1.6953125,
"learning_rate": 9.920708819107593e-05,
"loss": 0.8788,
"step": 340
},
{
"epoch": 0.2214821650684107,
"grad_norm": 1.625,
"learning_rate": 9.918088721929266e-05,
"loss": 0.867,
"step": 345
},
{
"epoch": 0.22469205151867752,
"grad_norm": 1.59375,
"learning_rate": 9.915426514065007e-05,
"loss": 0.8763,
"step": 350
},
{
"epoch": 0.22790193796894434,
"grad_norm": 1.6875,
"learning_rate": 9.912722226077709e-05,
"loss": 0.8843,
"step": 355
},
{
"epoch": 0.23111182441921116,
"grad_norm": 1.5703125,
"learning_rate": 9.90997588901335e-05,
"loss": 0.8689,
"step": 360
},
{
"epoch": 0.234321710869478,
"grad_norm": 1.6953125,
"learning_rate": 9.907187534400655e-05,
"loss": 0.8666,
"step": 365
},
{
"epoch": 0.23753159731974482,
"grad_norm": 1.6171875,
"learning_rate": 9.90435719425071e-05,
"loss": 0.8511,
"step": 370
},
{
"epoch": 0.24074148377001164,
"grad_norm": 1.6953125,
"learning_rate": 9.90148490105662e-05,
"loss": 0.8491,
"step": 375
},
{
"epoch": 0.24395137022027846,
"grad_norm": 1.8359375,
"learning_rate": 9.898570687793107e-05,
"loss": 0.8691,
"step": 380
},
{
"epoch": 0.24716125667054528,
"grad_norm": 1.46875,
"learning_rate": 9.895614587916162e-05,
"loss": 0.8243,
"step": 385
},
{
"epoch": 0.2503711431208121,
"grad_norm": 1.40625,
"learning_rate": 9.892616635362637e-05,
"loss": 0.8645,
"step": 390
},
{
"epoch": 0.2535810295710789,
"grad_norm": 1.6171875,
"learning_rate": 9.889576864549867e-05,
"loss": 0.8191,
"step": 395
},
{
"epoch": 0.25679091602134574,
"grad_norm": 1.5703125,
"learning_rate": 9.886495310375275e-05,
"loss": 0.8665,
"step": 400
},
{
"epoch": 0.26000080247161256,
"grad_norm": 1.421875,
"learning_rate": 9.883372008215962e-05,
"loss": 0.8695,
"step": 405
},
{
"epoch": 0.2632106889218794,
"grad_norm": 1.5,
"learning_rate": 9.880206993928313e-05,
"loss": 0.8283,
"step": 410
},
{
"epoch": 0.2664205753721462,
"grad_norm": 1.421875,
"learning_rate": 9.87700030384758e-05,
"loss": 0.823,
"step": 415
},
{
"epoch": 0.269630461822413,
"grad_norm": 1.5390625,
"learning_rate": 9.873751974787461e-05,
"loss": 0.8196,
"step": 420
},
{
"epoch": 0.27284034827267983,
"grad_norm": 1.546875,
"learning_rate": 9.870462044039685e-05,
"loss": 0.8504,
"step": 425
},
{
"epoch": 0.27605023472294665,
"grad_norm": 1.625,
"learning_rate": 9.867130549373578e-05,
"loss": 0.8519,
"step": 430
},
{
"epoch": 0.27926012117321347,
"grad_norm": 1.6171875,
"learning_rate": 9.863757529035633e-05,
"loss": 0.8589,
"step": 435
},
{
"epoch": 0.28247000762348035,
"grad_norm": 1.5859375,
"learning_rate": 9.860343021749065e-05,
"loss": 0.8209,
"step": 440
},
{
"epoch": 0.28567989407374716,
"grad_norm": 1.53125,
"learning_rate": 9.856887066713378e-05,
"loss": 0.8453,
"step": 445
},
{
"epoch": 0.288889780524014,
"grad_norm": 1.6484375,
"learning_rate": 9.853389703603901e-05,
"loss": 0.8433,
"step": 450
},
{
"epoch": 0.2920996669742808,
"grad_norm": 1.640625,
"learning_rate": 9.849850972571344e-05,
"loss": 0.8281,
"step": 455
},
{
"epoch": 0.2953095534245476,
"grad_norm": 1.5703125,
"learning_rate": 9.84627091424133e-05,
"loss": 0.8292,
"step": 460
},
{
"epoch": 0.29851943987481444,
"grad_norm": 1.40625,
"learning_rate": 9.84264956971393e-05,
"loss": 0.8199,
"step": 465
},
{
"epoch": 0.30172932632508126,
"grad_norm": 1.4765625,
"learning_rate": 9.838986980563193e-05,
"loss": 0.8263,
"step": 470
},
{
"epoch": 0.3049392127753481,
"grad_norm": 1.5546875,
"learning_rate": 9.835283188836673e-05,
"loss": 0.8324,
"step": 475
},
{
"epoch": 0.3081490992256149,
"grad_norm": 1.515625,
"learning_rate": 9.831538237054931e-05,
"loss": 0.8085,
"step": 480
},
{
"epoch": 0.3113589856758817,
"grad_norm": 1.640625,
"learning_rate": 9.827752168211064e-05,
"loss": 0.8375,
"step": 485
},
{
"epoch": 0.31456887212614854,
"grad_norm": 1.6015625,
"learning_rate": 9.823925025770206e-05,
"loss": 0.8027,
"step": 490
},
{
"epoch": 0.31777875857641535,
"grad_norm": 1.6953125,
"learning_rate": 9.82005685366902e-05,
"loss": 0.8309,
"step": 495
},
{
"epoch": 0.3209886450266822,
"grad_norm": 1.5546875,
"learning_rate": 9.816147696315206e-05,
"loss": 0.8218,
"step": 500
},
{
"epoch": 0.3209886450266822,
"eval_loss": 0.7136461138725281,
"eval_runtime": 2.3986,
"eval_samples_per_second": 83.382,
"eval_steps_per_second": 83.382,
"step": 500
},
{
"epoch": 0.324198531476949,
"grad_norm": 1.5703125,
"learning_rate": 9.812197598586987e-05,
"loss": 0.7931,
"step": 505
},
{
"epoch": 0.3274084179272158,
"grad_norm": 1.6953125,
"learning_rate": 9.808206605832591e-05,
"loss": 0.8032,
"step": 510
},
{
"epoch": 0.33061830437748263,
"grad_norm": 1.4921875,
"learning_rate": 9.80417476386973e-05,
"loss": 0.8131,
"step": 515
},
{
"epoch": 0.33382819082774945,
"grad_norm": 1.625,
"learning_rate": 9.800102118985082e-05,
"loss": 0.7943,
"step": 520
},
{
"epoch": 0.33703807727801627,
"grad_norm": 1.703125,
"learning_rate": 9.795988717933751e-05,
"loss": 0.8233,
"step": 525
},
{
"epoch": 0.3402479637282831,
"grad_norm": 1.5234375,
"learning_rate": 9.79183460793873e-05,
"loss": 0.8013,
"step": 530
},
{
"epoch": 0.3434578501785499,
"grad_norm": 1.7578125,
"learning_rate": 9.78763983669037e-05,
"loss": 0.8121,
"step": 535
},
{
"epoch": 0.3466677366288168,
"grad_norm": 1.5546875,
"learning_rate": 9.783404452345815e-05,
"loss": 0.8053,
"step": 540
},
{
"epoch": 0.3498776230790836,
"grad_norm": 1.640625,
"learning_rate": 9.779128503528468e-05,
"loss": 0.7825,
"step": 545
},
{
"epoch": 0.3530875095293504,
"grad_norm": 1.5,
"learning_rate": 9.774812039327415e-05,
"loss": 0.7883,
"step": 550
},
{
"epoch": 0.35629739597961724,
"grad_norm": 1.515625,
"learning_rate": 9.770455109296878e-05,
"loss": 0.8132,
"step": 555
},
{
"epoch": 0.35950728242988406,
"grad_norm": 1.6484375,
"learning_rate": 9.76605776345563e-05,
"loss": 0.7793,
"step": 560
},
{
"epoch": 0.3627171688801509,
"grad_norm": 1.5703125,
"learning_rate": 9.761620052286438e-05,
"loss": 0.7936,
"step": 565
},
{
"epoch": 0.3659270553304177,
"grad_norm": 1.5078125,
"learning_rate": 9.757142026735464e-05,
"loss": 0.782,
"step": 570
},
{
"epoch": 0.3691369417806845,
"grad_norm": 1.390625,
"learning_rate": 9.752623738211698e-05,
"loss": 0.7888,
"step": 575
},
{
"epoch": 0.37234682823095133,
"grad_norm": 1.46875,
"learning_rate": 9.748065238586357e-05,
"loss": 0.8042,
"step": 580
},
{
"epoch": 0.37555671468121815,
"grad_norm": 1.453125,
"learning_rate": 9.743466580192297e-05,
"loss": 0.7862,
"step": 585
},
{
"epoch": 0.37876660113148497,
"grad_norm": 1.5234375,
"learning_rate": 9.738827815823399e-05,
"loss": 0.7994,
"step": 590
},
{
"epoch": 0.3819764875817518,
"grad_norm": 1.5546875,
"learning_rate": 9.734148998733981e-05,
"loss": 0.7933,
"step": 595
},
{
"epoch": 0.3851863740320186,
"grad_norm": 1.5078125,
"learning_rate": 9.729430182638173e-05,
"loss": 0.7957,
"step": 600
},
{
"epoch": 0.3883962604822854,
"grad_norm": 1.53125,
"learning_rate": 9.724671421709304e-05,
"loss": 0.788,
"step": 605
},
{
"epoch": 0.39160614693255225,
"grad_norm": 1.5625,
"learning_rate": 9.719872770579284e-05,
"loss": 0.7994,
"step": 610
},
{
"epoch": 0.39481603338281906,
"grad_norm": 1.625,
"learning_rate": 9.71503428433797e-05,
"loss": 0.7882,
"step": 615
},
{
"epoch": 0.3980259198330859,
"grad_norm": 1.4375,
"learning_rate": 9.710156018532542e-05,
"loss": 0.7768,
"step": 620
},
{
"epoch": 0.4012358062833527,
"grad_norm": 1.5859375,
"learning_rate": 9.705238029166855e-05,
"loss": 0.7844,
"step": 625
},
{
"epoch": 0.4044456927336195,
"grad_norm": 1.390625,
"learning_rate": 9.700280372700807e-05,
"loss": 0.7825,
"step": 630
},
{
"epoch": 0.4076555791838864,
"grad_norm": 1.3515625,
"learning_rate": 9.695283106049682e-05,
"loss": 0.7749,
"step": 635
},
{
"epoch": 0.4108654656341532,
"grad_norm": 1.578125,
"learning_rate": 9.6902462865835e-05,
"loss": 0.7849,
"step": 640
},
{
"epoch": 0.41407535208442003,
"grad_norm": 1.5234375,
"learning_rate": 9.68516997212636e-05,
"loss": 0.7684,
"step": 645
},
{
"epoch": 0.41728523853468685,
"grad_norm": 1.2890625,
"learning_rate": 9.680054220955774e-05,
"loss": 0.763,
"step": 650
},
{
"epoch": 0.42049512498495367,
"grad_norm": 1.5859375,
"learning_rate": 9.674899091801996e-05,
"loss": 0.7771,
"step": 655
},
{
"epoch": 0.4237050114352205,
"grad_norm": 1.46875,
"learning_rate": 9.669704643847358e-05,
"loss": 0.7729,
"step": 660
},
{
"epoch": 0.4269148978854873,
"grad_norm": 1.4609375,
"learning_rate": 9.664470936725571e-05,
"loss": 0.7644,
"step": 665
},
{
"epoch": 0.43012478433575413,
"grad_norm": 1.4609375,
"learning_rate": 9.659198030521063e-05,
"loss": 0.7702,
"step": 670
},
{
"epoch": 0.43333467078602095,
"grad_norm": 1.3671875,
"learning_rate": 9.653885985768273e-05,
"loss": 0.7859,
"step": 675
},
{
"epoch": 0.43654455723628777,
"grad_norm": 1.5078125,
"learning_rate": 9.648534863450962e-05,
"loss": 0.7817,
"step": 680
},
{
"epoch": 0.4397544436865546,
"grad_norm": 1.625,
"learning_rate": 9.643144725001514e-05,
"loss": 0.7604,
"step": 685
},
{
"epoch": 0.4429643301368214,
"grad_norm": 1.5625,
"learning_rate": 9.637715632300229e-05,
"loss": 0.7772,
"step": 690
},
{
"epoch": 0.4461742165870882,
"grad_norm": 1.65625,
"learning_rate": 9.632247647674606e-05,
"loss": 0.7653,
"step": 695
},
{
"epoch": 0.44938410303735504,
"grad_norm": 1.609375,
"learning_rate": 9.626740833898648e-05,
"loss": 0.7522,
"step": 700
},
{
"epoch": 0.45259398948762186,
"grad_norm": 1.4453125,
"learning_rate": 9.621195254192114e-05,
"loss": 0.7729,
"step": 705
},
{
"epoch": 0.4558038759378887,
"grad_norm": 1.421875,
"learning_rate": 9.615610972219816e-05,
"loss": 0.7425,
"step": 710
},
{
"epoch": 0.4590137623881555,
"grad_norm": 1.5078125,
"learning_rate": 9.609988052090872e-05,
"loss": 0.7838,
"step": 715
},
{
"epoch": 0.4622236488384223,
"grad_norm": 1.546875,
"learning_rate": 9.604326558357983e-05,
"loss": 0.7653,
"step": 720
},
{
"epoch": 0.46543353528868914,
"grad_norm": 1.578125,
"learning_rate": 9.598626556016682e-05,
"loss": 0.7702,
"step": 725
},
{
"epoch": 0.468643421738956,
"grad_norm": 1.4296875,
"learning_rate": 9.59288811050459e-05,
"loss": 0.7565,
"step": 730
},
{
"epoch": 0.47185330818922283,
"grad_norm": 1.6015625,
"learning_rate": 9.587111287700672e-05,
"loss": 0.7352,
"step": 735
},
{
"epoch": 0.47506319463948965,
"grad_norm": 1.3671875,
"learning_rate": 9.581296153924468e-05,
"loss": 0.7715,
"step": 740
},
{
"epoch": 0.47827308108975647,
"grad_norm": 1.5078125,
"learning_rate": 9.575442775935348e-05,
"loss": 0.7536,
"step": 745
},
{
"epoch": 0.4814829675400233,
"grad_norm": 1.4296875,
"learning_rate": 9.569551220931725e-05,
"loss": 0.7404,
"step": 750
},
{
"epoch": 0.4846928539902901,
"grad_norm": 1.5546875,
"learning_rate": 9.563621556550306e-05,
"loss": 0.7383,
"step": 755
},
{
"epoch": 0.4879027404405569,
"grad_norm": 1.5,
"learning_rate": 9.557653850865293e-05,
"loss": 0.7391,
"step": 760
},
{
"epoch": 0.49111262689082374,
"grad_norm": 1.4140625,
"learning_rate": 9.551648172387624e-05,
"loss": 0.751,
"step": 765
},
{
"epoch": 0.49432251334109056,
"grad_norm": 1.3125,
"learning_rate": 9.545604590064167e-05,
"loss": 0.7483,
"step": 770
},
{
"epoch": 0.4975323997913574,
"grad_norm": 1.5234375,
"learning_rate": 9.539523173276942e-05,
"loss": 0.7284,
"step": 775
},
{
"epoch": 0.5007422862416242,
"grad_norm": 1.5390625,
"learning_rate": 9.533403991842317e-05,
"loss": 0.7356,
"step": 780
},
{
"epoch": 0.5039521726918911,
"grad_norm": 1.609375,
"learning_rate": 9.527247116010207e-05,
"loss": 0.7591,
"step": 785
},
{
"epoch": 0.5071620591421578,
"grad_norm": 1.421875,
"learning_rate": 9.521052616463272e-05,
"loss": 0.7411,
"step": 790
},
{
"epoch": 0.5103719455924247,
"grad_norm": 1.546875,
"learning_rate": 9.5148205643161e-05,
"loss": 0.7574,
"step": 795
},
{
"epoch": 0.5135818320426915,
"grad_norm": 1.4609375,
"learning_rate": 9.5085510311144e-05,
"loss": 0.7262,
"step": 800
},
{
"epoch": 0.5167917184929584,
"grad_norm": 1.4921875,
"learning_rate": 9.502244088834164e-05,
"loss": 0.7584,
"step": 805
},
{
"epoch": 0.5200016049432251,
"grad_norm": 1.421875,
"learning_rate": 9.495899809880858e-05,
"loss": 0.7261,
"step": 810
},
{
"epoch": 0.523211491393492,
"grad_norm": 1.6328125,
"learning_rate": 9.489518267088583e-05,
"loss": 0.7463,
"step": 815
},
{
"epoch": 0.5264213778437588,
"grad_norm": 1.4609375,
"learning_rate": 9.483099533719234e-05,
"loss": 0.7477,
"step": 820
},
{
"epoch": 0.5296312642940256,
"grad_norm": 1.453125,
"learning_rate": 9.476643683461672e-05,
"loss": 0.7441,
"step": 825
},
{
"epoch": 0.5328411507442924,
"grad_norm": 1.53125,
"learning_rate": 9.470150790430863e-05,
"loss": 0.7433,
"step": 830
},
{
"epoch": 0.5360510371945593,
"grad_norm": 1.4609375,
"learning_rate": 9.463620929167039e-05,
"loss": 0.7414,
"step": 835
},
{
"epoch": 0.539260923644826,
"grad_norm": 1.4140625,
"learning_rate": 9.457054174634837e-05,
"loss": 0.7412,
"step": 840
},
{
"epoch": 0.5424708100950929,
"grad_norm": 1.640625,
"learning_rate": 9.450450602222435e-05,
"loss": 0.7164,
"step": 845
},
{
"epoch": 0.5456806965453597,
"grad_norm": 1.53125,
"learning_rate": 9.443810287740697e-05,
"loss": 0.755,
"step": 850
},
{
"epoch": 0.5488905829956265,
"grad_norm": 1.4765625,
"learning_rate": 9.437133307422294e-05,
"loss": 0.7512,
"step": 855
},
{
"epoch": 0.5521004694458933,
"grad_norm": 1.5625,
"learning_rate": 9.430419737920828e-05,
"loss": 0.7385,
"step": 860
},
{
"epoch": 0.5553103558961602,
"grad_norm": 1.515625,
"learning_rate": 9.42366965630996e-05,
"loss": 0.7316,
"step": 865
},
{
"epoch": 0.5585202423464269,
"grad_norm": 1.390625,
"learning_rate": 9.416883140082512e-05,
"loss": 0.7297,
"step": 870
},
{
"epoch": 0.5617301287966938,
"grad_norm": 1.5,
"learning_rate": 9.410060267149596e-05,
"loss": 0.7208,
"step": 875
},
{
"epoch": 0.5649400152469607,
"grad_norm": 1.359375,
"learning_rate": 9.403201115839704e-05,
"loss": 0.7288,
"step": 880
},
{
"epoch": 0.5681499016972275,
"grad_norm": 1.4375,
"learning_rate": 9.396305764897813e-05,
"loss": 0.7133,
"step": 885
},
{
"epoch": 0.5713597881474943,
"grad_norm": 1.4921875,
"learning_rate": 9.389374293484483e-05,
"loss": 0.7036,
"step": 890
},
{
"epoch": 0.5745696745977611,
"grad_norm": 1.5234375,
"learning_rate": 9.382406781174949e-05,
"loss": 0.7332,
"step": 895
},
{
"epoch": 0.577779561048028,
"grad_norm": 1.484375,
"learning_rate": 9.3754033079582e-05,
"loss": 0.7343,
"step": 900
},
{
"epoch": 0.5809894474982947,
"grad_norm": 1.546875,
"learning_rate": 9.368363954236075e-05,
"loss": 0.7119,
"step": 905
},
{
"epoch": 0.5841993339485616,
"grad_norm": 1.5703125,
"learning_rate": 9.361288800822321e-05,
"loss": 0.7339,
"step": 910
},
{
"epoch": 0.5874092203988284,
"grad_norm": 1.453125,
"learning_rate": 9.354177928941687e-05,
"loss": 0.7163,
"step": 915
},
{
"epoch": 0.5906191068490952,
"grad_norm": 1.4453125,
"learning_rate": 9.347031420228969e-05,
"loss": 0.7281,
"step": 920
},
{
"epoch": 0.593828993299362,
"grad_norm": 1.5,
"learning_rate": 9.339849356728092e-05,
"loss": 0.7072,
"step": 925
},
{
"epoch": 0.5970388797496289,
"grad_norm": 1.46875,
"learning_rate": 9.332631820891154e-05,
"loss": 0.729,
"step": 930
},
{
"epoch": 0.6002487661998956,
"grad_norm": 1.4609375,
"learning_rate": 9.325378895577491e-05,
"loss": 0.7341,
"step": 935
},
{
"epoch": 0.6034586526501625,
"grad_norm": 1.4140625,
"learning_rate": 9.318090664052713e-05,
"loss": 0.708,
"step": 940
},
{
"epoch": 0.6066685391004293,
"grad_norm": 1.3515625,
"learning_rate": 9.310767209987763e-05,
"loss": 0.7191,
"step": 945
},
{
"epoch": 0.6098784255506962,
"grad_norm": 1.6796875,
"learning_rate": 9.303408617457943e-05,
"loss": 0.7114,
"step": 950
},
{
"epoch": 0.6130883120009629,
"grad_norm": 1.484375,
"learning_rate": 9.296014970941958e-05,
"loss": 0.704,
"step": 955
},
{
"epoch": 0.6162981984512298,
"grad_norm": 1.3359375,
"learning_rate": 9.288586355320938e-05,
"loss": 0.704,
"step": 960
},
{
"epoch": 0.6195080849014966,
"grad_norm": 1.359375,
"learning_rate": 9.281122855877473e-05,
"loss": 0.7112,
"step": 965
},
{
"epoch": 0.6227179713517634,
"grad_norm": 1.4765625,
"learning_rate": 9.273624558294627e-05,
"loss": 0.6998,
"step": 970
},
{
"epoch": 0.6259278578020303,
"grad_norm": 1.421875,
"learning_rate": 9.266091548654958e-05,
"loss": 0.7114,
"step": 975
},
{
"epoch": 0.6291377442522971,
"grad_norm": 1.5625,
"learning_rate": 9.258523913439522e-05,
"loss": 0.7307,
"step": 980
},
{
"epoch": 0.632347630702564,
"grad_norm": 1.53125,
"learning_rate": 9.250921739526896e-05,
"loss": 0.7257,
"step": 985
},
{
"epoch": 0.6355575171528307,
"grad_norm": 1.578125,
"learning_rate": 9.243285114192163e-05,
"loss": 0.7261,
"step": 990
},
{
"epoch": 0.6387674036030976,
"grad_norm": 1.40625,
"learning_rate": 9.235614125105922e-05,
"loss": 0.7139,
"step": 995
},
{
"epoch": 0.6419772900533643,
"grad_norm": 1.390625,
"learning_rate": 9.227908860333275e-05,
"loss": 0.7136,
"step": 1000
},
{
"epoch": 0.6419772900533643,
"eval_loss": 0.6108266711235046,
"eval_runtime": 2.3924,
"eval_samples_per_second": 83.597,
"eval_steps_per_second": 83.597,
"step": 1000
},
{
"epoch": 0.6451871765036312,
"grad_norm": 1.5,
"learning_rate": 9.220169408332821e-05,
"loss": 0.6998,
"step": 1005
},
{
"epoch": 0.648397062953898,
"grad_norm": 1.4375,
"learning_rate": 9.212395857955637e-05,
"loss": 0.7121,
"step": 1010
},
{
"epoch": 0.6516069494041649,
"grad_norm": 1.390625,
"learning_rate": 9.204588298444257e-05,
"loss": 0.7275,
"step": 1015
},
{
"epoch": 0.6548168358544316,
"grad_norm": 1.40625,
"learning_rate": 9.196746819431652e-05,
"loss": 0.7063,
"step": 1020
},
{
"epoch": 0.6580267223046985,
"grad_norm": 1.6171875,
"learning_rate": 9.188871510940198e-05,
"loss": 0.7275,
"step": 1025
},
{
"epoch": 0.6612366087549653,
"grad_norm": 1.375,
"learning_rate": 9.180962463380642e-05,
"loss": 0.6942,
"step": 1030
},
{
"epoch": 0.6644464952052321,
"grad_norm": 1.3828125,
"learning_rate": 9.173019767551064e-05,
"loss": 0.7184,
"step": 1035
},
{
"epoch": 0.6676563816554989,
"grad_norm": 1.515625,
"learning_rate": 9.165043514635836e-05,
"loss": 0.7054,
"step": 1040
},
{
"epoch": 0.6708662681057658,
"grad_norm": 1.53125,
"learning_rate": 9.157033796204579e-05,
"loss": 0.7166,
"step": 1045
},
{
"epoch": 0.6740761545560325,
"grad_norm": 1.53125,
"learning_rate": 9.148990704211103e-05,
"loss": 0.7031,
"step": 1050
},
{
"epoch": 0.6772860410062994,
"grad_norm": 1.453125,
"learning_rate": 9.140914330992356e-05,
"loss": 0.7071,
"step": 1055
},
{
"epoch": 0.6804959274565662,
"grad_norm": 1.5,
"learning_rate": 9.132804769267364e-05,
"loss": 0.7117,
"step": 1060
},
{
"epoch": 0.683705813906833,
"grad_norm": 1.703125,
"learning_rate": 9.124662112136169e-05,
"loss": 0.7063,
"step": 1065
},
{
"epoch": 0.6869157003570998,
"grad_norm": 1.7265625,
"learning_rate": 9.116486453078755e-05,
"loss": 0.7007,
"step": 1070
},
{
"epoch": 0.6901255868073667,
"grad_norm": 1.5703125,
"learning_rate": 9.108277885953975e-05,
"loss": 0.6956,
"step": 1075
},
{
"epoch": 0.6933354732576336,
"grad_norm": 1.4140625,
"learning_rate": 9.100036504998483e-05,
"loss": 0.6968,
"step": 1080
},
{
"epoch": 0.6965453597079003,
"grad_norm": 1.3203125,
"learning_rate": 9.091762404825639e-05,
"loss": 0.7131,
"step": 1085
},
{
"epoch": 0.6997552461581672,
"grad_norm": 1.46875,
"learning_rate": 9.08345568042443e-05,
"loss": 0.6982,
"step": 1090
},
{
"epoch": 0.702965132608434,
"grad_norm": 1.5546875,
"learning_rate": 9.075116427158379e-05,
"loss": 0.6743,
"step": 1095
},
{
"epoch": 0.7061750190587008,
"grad_norm": 1.359375,
"learning_rate": 9.06674474076445e-05,
"loss": 0.6925,
"step": 1100
},
{
"epoch": 0.7093849055089676,
"grad_norm": 1.4375,
"learning_rate": 9.058340717351948e-05,
"loss": 0.6849,
"step": 1105
},
{
"epoch": 0.7125947919592345,
"grad_norm": 1.3828125,
"learning_rate": 9.049904453401412e-05,
"loss": 0.6815,
"step": 1110
},
{
"epoch": 0.7158046784095012,
"grad_norm": 1.4453125,
"learning_rate": 9.04143604576352e-05,
"loss": 0.6905,
"step": 1115
},
{
"epoch": 0.7190145648597681,
"grad_norm": 1.4453125,
"learning_rate": 9.032935591657961e-05,
"loss": 0.69,
"step": 1120
},
{
"epoch": 0.7222244513100349,
"grad_norm": 1.3828125,
"learning_rate": 9.02440318867233e-05,
"loss": 0.6861,
"step": 1125
},
{
"epoch": 0.7254343377603018,
"grad_norm": 1.484375,
"learning_rate": 9.015838934761003e-05,
"loss": 0.7338,
"step": 1130
},
{
"epoch": 0.7286442242105685,
"grad_norm": 1.3515625,
"learning_rate": 9.007242928244014e-05,
"loss": 0.6787,
"step": 1135
},
{
"epoch": 0.7318541106608354,
"grad_norm": 1.421875,
"learning_rate": 8.998615267805922e-05,
"loss": 0.6793,
"step": 1140
},
{
"epoch": 0.7350639971111022,
"grad_norm": 1.3671875,
"learning_rate": 8.98995605249469e-05,
"loss": 0.6791,
"step": 1145
},
{
"epoch": 0.738273883561369,
"grad_norm": 1.4140625,
"learning_rate": 8.981265381720533e-05,
"loss": 0.7028,
"step": 1150
},
{
"epoch": 0.7414837700116358,
"grad_norm": 1.453125,
"learning_rate": 8.972543355254785e-05,
"loss": 0.712,
"step": 1155
},
{
"epoch": 0.7446936564619027,
"grad_norm": 1.4453125,
"learning_rate": 8.963790073228757e-05,
"loss": 0.6749,
"step": 1160
},
{
"epoch": 0.7479035429121694,
"grad_norm": 1.53125,
"learning_rate": 8.955005636132573e-05,
"loss": 0.6844,
"step": 1165
},
{
"epoch": 0.7511134293624363,
"grad_norm": 1.296875,
"learning_rate": 8.946190144814034e-05,
"loss": 0.6753,
"step": 1170
},
{
"epoch": 0.7543233158127032,
"grad_norm": 1.4296875,
"learning_rate": 8.937343700477449e-05,
"loss": 0.6809,
"step": 1175
},
{
"epoch": 0.7575332022629699,
"grad_norm": 1.3515625,
"learning_rate": 8.928466404682478e-05,
"loss": 0.7046,
"step": 1180
},
{
"epoch": 0.7607430887132368,
"grad_norm": 1.3515625,
"learning_rate": 8.91955835934296e-05,
"loss": 0.6763,
"step": 1185
},
{
"epoch": 0.7639529751635036,
"grad_norm": 1.359375,
"learning_rate": 8.910619666725755e-05,
"loss": 0.6788,
"step": 1190
},
{
"epoch": 0.7671628616137705,
"grad_norm": 1.40625,
"learning_rate": 8.901650429449553e-05,
"loss": 0.6874,
"step": 1195
},
{
"epoch": 0.7703727480640372,
"grad_norm": 1.3125,
"learning_rate": 8.892650750483715e-05,
"loss": 0.7008,
"step": 1200
},
{
"epoch": 0.7735826345143041,
"grad_norm": 1.3203125,
"learning_rate": 8.883620733147073e-05,
"loss": 0.6946,
"step": 1205
},
{
"epoch": 0.7767925209645709,
"grad_norm": 1.3671875,
"learning_rate": 8.874560481106758e-05,
"loss": 0.6845,
"step": 1210
},
{
"epoch": 0.7800024074148377,
"grad_norm": 1.3359375,
"learning_rate": 8.865470098376995e-05,
"loss": 0.7019,
"step": 1215
},
{
"epoch": 0.7832122938651045,
"grad_norm": 1.4140625,
"learning_rate": 8.856349689317933e-05,
"loss": 0.6611,
"step": 1220
},
{
"epoch": 0.7864221803153714,
"grad_norm": 1.4453125,
"learning_rate": 8.847199358634415e-05,
"loss": 0.6769,
"step": 1225
},
{
"epoch": 0.7896320667656381,
"grad_norm": 1.3359375,
"learning_rate": 8.838019211374804e-05,
"loss": 0.6684,
"step": 1230
},
{
"epoch": 0.792841953215905,
"grad_norm": 1.3046875,
"learning_rate": 8.828809352929762e-05,
"loss": 0.6799,
"step": 1235
},
{
"epoch": 0.7960518396661718,
"grad_norm": 1.8125,
"learning_rate": 8.81956988903104e-05,
"loss": 0.685,
"step": 1240
},
{
"epoch": 0.7992617261164386,
"grad_norm": 1.265625,
"learning_rate": 8.810300925750277e-05,
"loss": 0.6874,
"step": 1245
},
{
"epoch": 0.8024716125667054,
"grad_norm": 1.5625,
"learning_rate": 8.801002569497763e-05,
"loss": 0.6856,
"step": 1250
},
{
"epoch": 0.8056814990169723,
"grad_norm": 1.3359375,
"learning_rate": 8.791674927021234e-05,
"loss": 0.68,
"step": 1255
},
{
"epoch": 0.808891385467239,
"grad_norm": 1.2734375,
"learning_rate": 8.782318105404636e-05,
"loss": 0.6473,
"step": 1260
},
{
"epoch": 0.8121012719175059,
"grad_norm": 1.46875,
"learning_rate": 8.772932212066906e-05,
"loss": 0.6721,
"step": 1265
},
{
"epoch": 0.8153111583677728,
"grad_norm": 1.484375,
"learning_rate": 8.763517354760726e-05,
"loss": 0.6675,
"step": 1270
},
{
"epoch": 0.8185210448180396,
"grad_norm": 1.3125,
"learning_rate": 8.754073641571295e-05,
"loss": 0.6856,
"step": 1275
},
{
"epoch": 0.8217309312683064,
"grad_norm": 1.3515625,
"learning_rate": 8.744601180915087e-05,
"loss": 0.6938,
"step": 1280
},
{
"epoch": 0.8249408177185732,
"grad_norm": 1.296875,
"learning_rate": 8.7351000815386e-05,
"loss": 0.6785,
"step": 1285
},
{
"epoch": 0.8281507041688401,
"grad_norm": 1.3515625,
"learning_rate": 8.72557045251712e-05,
"loss": 0.6697,
"step": 1290
},
{
"epoch": 0.8313605906191068,
"grad_norm": 1.40625,
"learning_rate": 8.716012403253455e-05,
"loss": 0.6647,
"step": 1295
},
{
"epoch": 0.8345704770693737,
"grad_norm": 1.3125,
"learning_rate": 8.706426043476687e-05,
"loss": 0.6776,
"step": 1300
},
{
"epoch": 0.8377803635196405,
"grad_norm": 1.40625,
"learning_rate": 8.696811483240915e-05,
"loss": 0.6689,
"step": 1305
},
{
"epoch": 0.8409902499699073,
"grad_norm": 1.515625,
"learning_rate": 8.687168832923981e-05,
"loss": 0.6667,
"step": 1310
},
{
"epoch": 0.8442001364201741,
"grad_norm": 1.3828125,
"learning_rate": 8.67749820322621e-05,
"loss": 0.694,
"step": 1315
},
{
"epoch": 0.847410022870441,
"grad_norm": 1.3828125,
"learning_rate": 8.667799705169142e-05,
"loss": 0.6682,
"step": 1320
},
{
"epoch": 0.8506199093207077,
"grad_norm": 1.4296875,
"learning_rate": 8.65807345009425e-05,
"loss": 0.6942,
"step": 1325
},
{
"epoch": 0.8538297957709746,
"grad_norm": 1.40625,
"learning_rate": 8.648319549661668e-05,
"loss": 0.6832,
"step": 1330
},
{
"epoch": 0.8570396822212414,
"grad_norm": 1.3515625,
"learning_rate": 8.638538115848902e-05,
"loss": 0.673,
"step": 1335
},
{
"epoch": 0.8602495686715083,
"grad_norm": 1.4921875,
"learning_rate": 8.628729260949555e-05,
"loss": 0.6954,
"step": 1340
},
{
"epoch": 0.863459455121775,
"grad_norm": 1.59375,
"learning_rate": 8.618893097572027e-05,
"loss": 0.6585,
"step": 1345
},
{
"epoch": 0.8666693415720419,
"grad_norm": 1.4296875,
"learning_rate": 8.60902973863823e-05,
"loss": 0.6733,
"step": 1350
},
{
"epoch": 0.8698792280223087,
"grad_norm": 1.484375,
"learning_rate": 8.599139297382286e-05,
"loss": 0.6714,
"step": 1355
},
{
"epoch": 0.8730891144725755,
"grad_norm": 1.46875,
"learning_rate": 8.58922188734923e-05,
"loss": 0.6733,
"step": 1360
},
{
"epoch": 0.8762990009228424,
"grad_norm": 1.4453125,
"learning_rate": 8.579277622393708e-05,
"loss": 0.6771,
"step": 1365
},
{
"epoch": 0.8795088873731092,
"grad_norm": 1.5703125,
"learning_rate": 8.569306616678667e-05,
"loss": 0.6702,
"step": 1370
},
{
"epoch": 0.882718773823376,
"grad_norm": 1.421875,
"learning_rate": 8.559308984674047e-05,
"loss": 0.6461,
"step": 1375
},
{
"epoch": 0.8859286602736428,
"grad_norm": 1.4609375,
"learning_rate": 8.549284841155461e-05,
"loss": 0.6836,
"step": 1380
},
{
"epoch": 0.8891385467239097,
"grad_norm": 1.390625,
"learning_rate": 8.539234301202885e-05,
"loss": 0.6547,
"step": 1385
},
{
"epoch": 0.8923484331741764,
"grad_norm": 1.515625,
"learning_rate": 8.529157480199335e-05,
"loss": 0.664,
"step": 1390
},
{
"epoch": 0.8955583196244433,
"grad_norm": 1.2890625,
"learning_rate": 8.519054493829535e-05,
"loss": 0.6625,
"step": 1395
},
{
"epoch": 0.8987682060747101,
"grad_norm": 1.3125,
"learning_rate": 8.508925458078599e-05,
"loss": 0.6582,
"step": 1400
},
{
"epoch": 0.901978092524977,
"grad_norm": 1.3515625,
"learning_rate": 8.498770489230699e-05,
"loss": 0.6432,
"step": 1405
},
{
"epoch": 0.9051879789752437,
"grad_norm": 1.25,
"learning_rate": 8.488589703867714e-05,
"loss": 0.6775,
"step": 1410
},
{
"epoch": 0.9083978654255106,
"grad_norm": 1.3203125,
"learning_rate": 8.478383218867918e-05,
"loss": 0.6847,
"step": 1415
},
{
"epoch": 0.9116077518757774,
"grad_norm": 1.3515625,
"learning_rate": 8.468151151404616e-05,
"loss": 0.6691,
"step": 1420
},
{
"epoch": 0.9148176383260442,
"grad_norm": 1.3828125,
"learning_rate": 8.457893618944808e-05,
"loss": 0.6618,
"step": 1425
},
{
"epoch": 0.918027524776311,
"grad_norm": 1.421875,
"learning_rate": 8.447610739247838e-05,
"loss": 0.6755,
"step": 1430
},
{
"epoch": 0.9212374112265779,
"grad_norm": 1.25,
"learning_rate": 8.437302630364046e-05,
"loss": 0.6673,
"step": 1435
},
{
"epoch": 0.9244472976768446,
"grad_norm": 1.359375,
"learning_rate": 8.426969410633411e-05,
"loss": 0.6582,
"step": 1440
},
{
"epoch": 0.9276571841271115,
"grad_norm": 1.296875,
"learning_rate": 8.416611198684187e-05,
"loss": 0.6667,
"step": 1445
},
{
"epoch": 0.9308670705773783,
"grad_norm": 1.3828125,
"learning_rate": 8.406228113431552e-05,
"loss": 0.6716,
"step": 1450
},
{
"epoch": 0.9340769570276451,
"grad_norm": 1.2890625,
"learning_rate": 8.395820274076229e-05,
"loss": 0.6746,
"step": 1455
},
{
"epoch": 0.937286843477912,
"grad_norm": 1.3359375,
"learning_rate": 8.385387800103132e-05,
"loss": 0.6511,
"step": 1460
},
{
"epoch": 0.9404967299281788,
"grad_norm": 1.4453125,
"learning_rate": 8.374930811279983e-05,
"loss": 0.667,
"step": 1465
},
{
"epoch": 0.9437066163784457,
"grad_norm": 1.296875,
"learning_rate": 8.364449427655942e-05,
"loss": 0.6766,
"step": 1470
},
{
"epoch": 0.9469165028287124,
"grad_norm": 1.4453125,
"learning_rate": 8.353943769560228e-05,
"loss": 0.6468,
"step": 1475
},
{
"epoch": 0.9501263892789793,
"grad_norm": 1.359375,
"learning_rate": 8.343413957600744e-05,
"loss": 0.6427,
"step": 1480
},
{
"epoch": 0.9533362757292461,
"grad_norm": 1.5546875,
"learning_rate": 8.332860112662673e-05,
"loss": 0.6207,
"step": 1485
},
{
"epoch": 0.9565461621795129,
"grad_norm": 1.2890625,
"learning_rate": 8.322282355907117e-05,
"loss": 0.6548,
"step": 1490
},
{
"epoch": 0.9597560486297797,
"grad_norm": 1.546875,
"learning_rate": 8.311680808769682e-05,
"loss": 0.6662,
"step": 1495
},
{
"epoch": 0.9629659350800466,
"grad_norm": 1.3828125,
"learning_rate": 8.301055592959101e-05,
"loss": 0.6488,
"step": 1500
},
{
"epoch": 0.9629659350800466,
"eval_loss": 0.5586946606636047,
"eval_runtime": 2.4022,
"eval_samples_per_second": 83.255,
"eval_steps_per_second": 83.255,
"step": 1500
},
{
"epoch": 0.9661758215303133,
"grad_norm": 1.3203125,
"learning_rate": 8.290406830455828e-05,
"loss": 0.6723,
"step": 1505
},
{
"epoch": 0.9693857079805802,
"grad_norm": 1.359375,
"learning_rate": 8.279734643510636e-05,
"loss": 0.653,
"step": 1510
},
{
"epoch": 0.972595594430847,
"grad_norm": 1.359375,
"learning_rate": 8.269039154643224e-05,
"loss": 0.6535,
"step": 1515
},
{
"epoch": 0.9758054808811139,
"grad_norm": 1.3828125,
"learning_rate": 8.258320486640798e-05,
"loss": 0.6498,
"step": 1520
},
{
"epoch": 0.9790153673313806,
"grad_norm": 1.3515625,
"learning_rate": 8.24757876255667e-05,
"loss": 0.6531,
"step": 1525
},
{
"epoch": 0.9822252537816475,
"grad_norm": 1.40625,
"learning_rate": 8.23681410570884e-05,
"loss": 0.6698,
"step": 1530
},
{
"epoch": 0.9854351402319143,
"grad_norm": 1.4765625,
"learning_rate": 8.226026639678582e-05,
"loss": 0.658,
"step": 1535
},
{
"epoch": 0.9886450266821811,
"grad_norm": 1.2578125,
"learning_rate": 8.215216488309032e-05,
"loss": 0.6606,
"step": 1540
},
{
"epoch": 0.9918549131324479,
"grad_norm": 1.3515625,
"learning_rate": 8.204383775703752e-05,
"loss": 0.6519,
"step": 1545
},
{
"epoch": 0.9950647995827148,
"grad_norm": 1.40625,
"learning_rate": 8.19352862622532e-05,
"loss": 0.6452,
"step": 1550
},
{
"epoch": 0.9982746860329815,
"grad_norm": 1.3515625,
"learning_rate": 8.182651164493889e-05,
"loss": 0.6567,
"step": 1555
},
{
"epoch": 0.9995586406130883,
"eval_loss": 0.5523168444633484,
"eval_runtime": 2.4204,
"eval_samples_per_second": 82.63,
"eval_steps_per_second": 82.63,
"step": 1557
},
{
"epoch": 1.00192593187016,
"grad_norm": 1.3125,
"learning_rate": 8.171751515385769e-05,
"loss": 0.7609,
"step": 1560
},
{
"epoch": 1.0051358183204269,
"grad_norm": 1.2265625,
"learning_rate": 8.160829804031982e-05,
"loss": 0.615,
"step": 1565
},
{
"epoch": 1.0083457047706936,
"grad_norm": 1.3671875,
"learning_rate": 8.149886155816835e-05,
"loss": 0.6382,
"step": 1570
},
{
"epoch": 1.0115555912209606,
"grad_norm": 1.40625,
"learning_rate": 8.138920696376476e-05,
"loss": 0.6391,
"step": 1575
},
{
"epoch": 1.0147654776712274,
"grad_norm": 1.390625,
"learning_rate": 8.127933551597449e-05,
"loss": 0.6365,
"step": 1580
},
{
"epoch": 1.0179753641214941,
"grad_norm": 1.34375,
"learning_rate": 8.116924847615254e-05,
"loss": 0.6269,
"step": 1585
},
{
"epoch": 1.0211852505717611,
"grad_norm": 1.28125,
"learning_rate": 8.105894710812897e-05,
"loss": 0.6414,
"step": 1590
},
{
"epoch": 1.024395137022028,
"grad_norm": 1.421875,
"learning_rate": 8.094843267819438e-05,
"loss": 0.6218,
"step": 1595
},
{
"epoch": 1.0276050234722947,
"grad_norm": 1.4453125,
"learning_rate": 8.083770645508535e-05,
"loss": 0.6456,
"step": 1600
},
{
"epoch": 1.0308149099225614,
"grad_norm": 1.453125,
"learning_rate": 8.072676970996997e-05,
"loss": 0.6349,
"step": 1605
},
{
"epoch": 1.0340247963728284,
"grad_norm": 1.3046875,
"learning_rate": 8.061562371643312e-05,
"loss": 0.5872,
"step": 1610
},
{
"epoch": 1.0372346828230952,
"grad_norm": 1.375,
"learning_rate": 8.050426975046196e-05,
"loss": 0.6129,
"step": 1615
},
{
"epoch": 1.040444569273362,
"grad_norm": 1.34375,
"learning_rate": 8.039270909043119e-05,
"loss": 0.6275,
"step": 1620
},
{
"epoch": 1.0436544557236287,
"grad_norm": 1.40625,
"learning_rate": 8.028094301708843e-05,
"loss": 0.6198,
"step": 1625
},
{
"epoch": 1.0468643421738957,
"grad_norm": 1.4609375,
"learning_rate": 8.016897281353954e-05,
"loss": 0.6125,
"step": 1630
},
{
"epoch": 1.0500742286241624,
"grad_norm": 1.3828125,
"learning_rate": 8.00567997652338e-05,
"loss": 0.6076,
"step": 1635
},
{
"epoch": 1.0532841150744292,
"grad_norm": 1.5625,
"learning_rate": 7.994442515994922e-05,
"loss": 0.6153,
"step": 1640
},
{
"epoch": 1.056494001524696,
"grad_norm": 1.28125,
"learning_rate": 7.983185028777773e-05,
"loss": 0.614,
"step": 1645
},
{
"epoch": 1.059703887974963,
"grad_norm": 1.3828125,
"learning_rate": 7.971907644111043e-05,
"loss": 0.6287,
"step": 1650
},
{
"epoch": 1.0629137744252297,
"grad_norm": 1.4375,
"learning_rate": 7.960610491462265e-05,
"loss": 0.6234,
"step": 1655
},
{
"epoch": 1.0661236608754965,
"grad_norm": 1.390625,
"learning_rate": 7.949293700525914e-05,
"loss": 0.6352,
"step": 1660
},
{
"epoch": 1.0693335473257632,
"grad_norm": 1.328125,
"learning_rate": 7.93795740122192e-05,
"loss": 0.6275,
"step": 1665
},
{
"epoch": 1.0725434337760302,
"grad_norm": 1.2734375,
"learning_rate": 7.926601723694178e-05,
"loss": 0.6266,
"step": 1670
},
{
"epoch": 1.075753320226297,
"grad_norm": 1.40625,
"learning_rate": 7.915226798309042e-05,
"loss": 0.6111,
"step": 1675
},
{
"epoch": 1.0789632066765638,
"grad_norm": 1.3828125,
"learning_rate": 7.903832755653844e-05,
"loss": 0.6032,
"step": 1680
},
{
"epoch": 1.0821730931268307,
"grad_norm": 1.5859375,
"learning_rate": 7.892419726535385e-05,
"loss": 0.6113,
"step": 1685
},
{
"epoch": 1.0853829795770975,
"grad_norm": 1.359375,
"learning_rate": 7.880987841978435e-05,
"loss": 0.6332,
"step": 1690
},
{
"epoch": 1.0885928660273643,
"grad_norm": 1.46875,
"learning_rate": 7.86953723322423e-05,
"loss": 0.6419,
"step": 1695
},
{
"epoch": 1.091802752477631,
"grad_norm": 1.3203125,
"learning_rate": 7.858068031728968e-05,
"loss": 0.6249,
"step": 1700
},
{
"epoch": 1.095012638927898,
"grad_norm": 1.3203125,
"learning_rate": 7.846580369162293e-05,
"loss": 0.6075,
"step": 1705
},
{
"epoch": 1.0982225253781648,
"grad_norm": 1.3828125,
"learning_rate": 7.83507437740579e-05,
"loss": 0.6379,
"step": 1710
},
{
"epoch": 1.1014324118284315,
"grad_norm": 1.390625,
"learning_rate": 7.823550188551466e-05,
"loss": 0.6165,
"step": 1715
},
{
"epoch": 1.1046422982786983,
"grad_norm": 1.328125,
"learning_rate": 7.812007934900238e-05,
"loss": 0.6106,
"step": 1720
},
{
"epoch": 1.1078521847289653,
"grad_norm": 1.25,
"learning_rate": 7.800447748960408e-05,
"loss": 0.6132,
"step": 1725
},
{
"epoch": 1.111062071179232,
"grad_norm": 1.390625,
"learning_rate": 7.788869763446154e-05,
"loss": 0.6224,
"step": 1730
},
{
"epoch": 1.1142719576294988,
"grad_norm": 1.4609375,
"learning_rate": 7.777274111275988e-05,
"loss": 0.6353,
"step": 1735
},
{
"epoch": 1.1174818440797656,
"grad_norm": 1.2421875,
"learning_rate": 7.765660925571245e-05,
"loss": 0.6289,
"step": 1740
},
{
"epoch": 1.1206917305300326,
"grad_norm": 1.2890625,
"learning_rate": 7.754030339654552e-05,
"loss": 0.6091,
"step": 1745
},
{
"epoch": 1.1239016169802993,
"grad_norm": 1.2578125,
"learning_rate": 7.74238248704829e-05,
"loss": 0.6119,
"step": 1750
},
{
"epoch": 1.127111503430566,
"grad_norm": 1.2421875,
"learning_rate": 7.730717501473073e-05,
"loss": 0.6173,
"step": 1755
},
{
"epoch": 1.1303213898808329,
"grad_norm": 1.34375,
"learning_rate": 7.719035516846201e-05,
"loss": 0.6184,
"step": 1760
},
{
"epoch": 1.1335312763310998,
"grad_norm": 1.390625,
"learning_rate": 7.707336667280128e-05,
"loss": 0.6061,
"step": 1765
},
{
"epoch": 1.1367411627813666,
"grad_norm": 1.4296875,
"learning_rate": 7.695621087080924e-05,
"loss": 0.6265,
"step": 1770
},
{
"epoch": 1.1399510492316334,
"grad_norm": 1.328125,
"learning_rate": 7.683888910746735e-05,
"loss": 0.6272,
"step": 1775
},
{
"epoch": 1.1431609356819004,
"grad_norm": 1.359375,
"learning_rate": 7.672140272966227e-05,
"loss": 0.6162,
"step": 1780
},
{
"epoch": 1.1463708221321671,
"grad_norm": 1.40625,
"learning_rate": 7.660375308617054e-05,
"loss": 0.6165,
"step": 1785
},
{
"epoch": 1.1495807085824339,
"grad_norm": 1.328125,
"learning_rate": 7.648594152764304e-05,
"loss": 0.5994,
"step": 1790
},
{
"epoch": 1.1527905950327006,
"grad_norm": 1.328125,
"learning_rate": 7.636796940658942e-05,
"loss": 0.6298,
"step": 1795
},
{
"epoch": 1.1560004814829676,
"grad_norm": 1.4375,
"learning_rate": 7.62498380773627e-05,
"loss": 0.6124,
"step": 1800
},
{
"epoch": 1.1592103679332344,
"grad_norm": 1.34375,
"learning_rate": 7.613154889614362e-05,
"loss": 0.6236,
"step": 1805
},
{
"epoch": 1.1624202543835012,
"grad_norm": 1.3046875,
"learning_rate": 7.601310322092511e-05,
"loss": 0.6148,
"step": 1810
},
{
"epoch": 1.165630140833768,
"grad_norm": 1.3671875,
"learning_rate": 7.589450241149671e-05,
"loss": 0.6119,
"step": 1815
},
{
"epoch": 1.168840027284035,
"grad_norm": 1.4375,
"learning_rate": 7.577574782942893e-05,
"loss": 0.6034,
"step": 1820
},
{
"epoch": 1.1720499137343017,
"grad_norm": 1.4375,
"learning_rate": 7.565684083805762e-05,
"loss": 0.6049,
"step": 1825
},
{
"epoch": 1.1752598001845684,
"grad_norm": 1.3359375,
"learning_rate": 7.553778280246835e-05,
"loss": 0.6314,
"step": 1830
},
{
"epoch": 1.1784696866348352,
"grad_norm": 1.359375,
"learning_rate": 7.541857508948072e-05,
"loss": 0.6015,
"step": 1835
},
{
"epoch": 1.1816795730851022,
"grad_norm": 1.34375,
"learning_rate": 7.529921906763266e-05,
"loss": 0.6085,
"step": 1840
},
{
"epoch": 1.184889459535369,
"grad_norm": 1.3671875,
"learning_rate": 7.517971610716473e-05,
"loss": 0.6071,
"step": 1845
},
{
"epoch": 1.1880993459856357,
"grad_norm": 1.296875,
"learning_rate": 7.50600675800044e-05,
"loss": 0.6237,
"step": 1850
},
{
"epoch": 1.1913092324359025,
"grad_norm": 1.3984375,
"learning_rate": 7.494027485975027e-05,
"loss": 0.6062,
"step": 1855
},
{
"epoch": 1.1945191188861695,
"grad_norm": 1.359375,
"learning_rate": 7.482033932165631e-05,
"loss": 0.6111,
"step": 1860
},
{
"epoch": 1.1977290053364362,
"grad_norm": 1.265625,
"learning_rate": 7.470026234261611e-05,
"loss": 0.5957,
"step": 1865
},
{
"epoch": 1.200938891786703,
"grad_norm": 1.390625,
"learning_rate": 7.4580045301147e-05,
"loss": 0.6054,
"step": 1870
},
{
"epoch": 1.20414877823697,
"grad_norm": 1.3828125,
"learning_rate": 7.44596895773743e-05,
"loss": 0.6264,
"step": 1875
},
{
"epoch": 1.2073586646872367,
"grad_norm": 1.2578125,
"learning_rate": 7.433919655301543e-05,
"loss": 0.5918,
"step": 1880
},
{
"epoch": 1.2105685511375035,
"grad_norm": 1.40625,
"learning_rate": 7.421856761136405e-05,
"loss": 0.6138,
"step": 1885
},
{
"epoch": 1.2137784375877703,
"grad_norm": 1.3515625,
"learning_rate": 7.409780413727423e-05,
"loss": 0.623,
"step": 1890
},
{
"epoch": 1.2169883240380373,
"grad_norm": 1.234375,
"learning_rate": 7.397690751714444e-05,
"loss": 0.6118,
"step": 1895
},
{
"epoch": 1.220198210488304,
"grad_norm": 1.3515625,
"learning_rate": 7.385587913890175e-05,
"loss": 0.5957,
"step": 1900
},
{
"epoch": 1.2234080969385708,
"grad_norm": 1.53125,
"learning_rate": 7.373472039198583e-05,
"loss": 0.6201,
"step": 1905
},
{
"epoch": 1.2266179833888375,
"grad_norm": 1.328125,
"learning_rate": 7.361343266733307e-05,
"loss": 0.6029,
"step": 1910
},
{
"epoch": 1.2298278698391045,
"grad_norm": 1.2265625,
"learning_rate": 7.34920173573605e-05,
"loss": 0.6052,
"step": 1915
},
{
"epoch": 1.2330377562893713,
"grad_norm": 1.34375,
"learning_rate": 7.337047585594987e-05,
"loss": 0.6155,
"step": 1920
},
{
"epoch": 1.236247642739638,
"grad_norm": 1.2890625,
"learning_rate": 7.324880955843167e-05,
"loss": 0.5776,
"step": 1925
},
{
"epoch": 1.2394575291899048,
"grad_norm": 1.3984375,
"learning_rate": 7.312701986156909e-05,
"loss": 0.6156,
"step": 1930
},
{
"epoch": 1.2426674156401718,
"grad_norm": 1.578125,
"learning_rate": 7.300510816354194e-05,
"loss": 0.6011,
"step": 1935
},
{
"epoch": 1.2458773020904386,
"grad_norm": 1.671875,
"learning_rate": 7.288307586393066e-05,
"loss": 0.6094,
"step": 1940
},
{
"epoch": 1.2490871885407053,
"grad_norm": 1.390625,
"learning_rate": 7.276092436370024e-05,
"loss": 0.6074,
"step": 1945
},
{
"epoch": 1.252297074990972,
"grad_norm": 1.328125,
"learning_rate": 7.263865506518411e-05,
"loss": 0.6002,
"step": 1950
},
{
"epoch": 1.255506961441239,
"grad_norm": 1.359375,
"learning_rate": 7.251626937206806e-05,
"loss": 0.5956,
"step": 1955
},
{
"epoch": 1.2587168478915058,
"grad_norm": 1.375,
"learning_rate": 7.239376868937415e-05,
"loss": 0.6026,
"step": 1960
},
{
"epoch": 1.2619267343417726,
"grad_norm": 1.4453125,
"learning_rate": 7.227115442344452e-05,
"loss": 0.6136,
"step": 1965
},
{
"epoch": 1.2651366207920396,
"grad_norm": 1.3515625,
"learning_rate": 7.214842798192526e-05,
"loss": 0.6092,
"step": 1970
},
{
"epoch": 1.2683465072423064,
"grad_norm": 1.453125,
"learning_rate": 7.202559077375033e-05,
"loss": 0.6232,
"step": 1975
},
{
"epoch": 1.2715563936925731,
"grad_norm": 1.28125,
"learning_rate": 7.190264420912526e-05,
"loss": 0.6139,
"step": 1980
},
{
"epoch": 1.2747662801428399,
"grad_norm": 1.2890625,
"learning_rate": 7.177958969951104e-05,
"loss": 0.6085,
"step": 1985
},
{
"epoch": 1.2779761665931066,
"grad_norm": 1.484375,
"learning_rate": 7.165642865760794e-05,
"loss": 0.631,
"step": 1990
},
{
"epoch": 1.2811860530433736,
"grad_norm": 1.4765625,
"learning_rate": 7.15331624973392e-05,
"loss": 0.6131,
"step": 1995
},
{
"epoch": 1.2843959394936404,
"grad_norm": 1.4453125,
"learning_rate": 7.140979263383488e-05,
"loss": 0.6102,
"step": 2000
},
{
"epoch": 1.2843959394936404,
"eval_loss": 0.5290513038635254,
"eval_runtime": 2.3691,
"eval_samples_per_second": 84.421,
"eval_steps_per_second": 84.421,
"step": 2000
},
{
"epoch": 1.2876058259439072,
"grad_norm": 1.34375,
"learning_rate": 7.128632048341553e-05,
"loss": 0.6014,
"step": 2005
},
{
"epoch": 1.2908157123941741,
"grad_norm": 1.25,
"learning_rate": 7.116274746357605e-05,
"loss": 0.6291,
"step": 2010
},
{
"epoch": 1.294025598844441,
"grad_norm": 1.265625,
"learning_rate": 7.103907499296934e-05,
"loss": 0.5853,
"step": 2015
},
{
"epoch": 1.2972354852947077,
"grad_norm": 1.2578125,
"learning_rate": 7.091530449138994e-05,
"loss": 0.6215,
"step": 2020
},
{
"epoch": 1.3004453717449747,
"grad_norm": 1.203125,
"learning_rate": 7.079143737975795e-05,
"loss": 0.5965,
"step": 2025
},
{
"epoch": 1.3036552581952414,
"grad_norm": 1.3984375,
"learning_rate": 7.066747508010243e-05,
"loss": 0.6179,
"step": 2030
},
{
"epoch": 1.3068651446455082,
"grad_norm": 1.265625,
"learning_rate": 7.054341901554537e-05,
"loss": 0.5941,
"step": 2035
},
{
"epoch": 1.310075031095775,
"grad_norm": 1.390625,
"learning_rate": 7.04192706102851e-05,
"loss": 0.6157,
"step": 2040
},
{
"epoch": 1.3132849175460417,
"grad_norm": 1.3828125,
"learning_rate": 7.029503128958009e-05,
"loss": 0.6025,
"step": 2045
},
{
"epoch": 1.3164948039963087,
"grad_norm": 1.2421875,
"learning_rate": 7.017070247973255e-05,
"loss": 0.5932,
"step": 2050
},
{
"epoch": 1.3197046904465755,
"grad_norm": 1.4921875,
"learning_rate": 7.004628560807202e-05,
"loss": 0.5958,
"step": 2055
},
{
"epoch": 1.3229145768968422,
"grad_norm": 1.34375,
"learning_rate": 6.992178210293905e-05,
"loss": 0.6041,
"step": 2060
},
{
"epoch": 1.3261244633471092,
"grad_norm": 1.3828125,
"learning_rate": 6.979719339366876e-05,
"loss": 0.6126,
"step": 2065
},
{
"epoch": 1.329334349797376,
"grad_norm": 1.4921875,
"learning_rate": 6.96725209105744e-05,
"loss": 0.5878,
"step": 2070
},
{
"epoch": 1.3325442362476427,
"grad_norm": 1.3203125,
"learning_rate": 6.954776608493104e-05,
"loss": 0.6037,
"step": 2075
},
{
"epoch": 1.3357541226979095,
"grad_norm": 1.234375,
"learning_rate": 6.942293034895899e-05,
"loss": 0.5986,
"step": 2080
},
{
"epoch": 1.3389640091481763,
"grad_norm": 1.4140625,
"learning_rate": 6.929801513580747e-05,
"loss": 0.6124,
"step": 2085
},
{
"epoch": 1.3421738955984432,
"grad_norm": 1.265625,
"learning_rate": 6.917302187953811e-05,
"loss": 0.613,
"step": 2090
},
{
"epoch": 1.34538378204871,
"grad_norm": 1.2578125,
"learning_rate": 6.904795201510852e-05,
"loss": 0.5869,
"step": 2095
},
{
"epoch": 1.3485936684989768,
"grad_norm": 1.3671875,
"learning_rate": 6.892280697835576e-05,
"loss": 0.6194,
"step": 2100
},
{
"epoch": 1.3518035549492438,
"grad_norm": 1.3828125,
"learning_rate": 6.879758820597991e-05,
"loss": 0.5933,
"step": 2105
},
{
"epoch": 1.3550134413995105,
"grad_norm": 1.2421875,
"learning_rate": 6.867229713552754e-05,
"loss": 0.6055,
"step": 2110
},
{
"epoch": 1.3582233278497773,
"grad_norm": 1.2578125,
"learning_rate": 6.854693520537524e-05,
"loss": 0.6052,
"step": 2115
},
{
"epoch": 1.3614332143000443,
"grad_norm": 1.3046875,
"learning_rate": 6.842150385471307e-05,
"loss": 0.6174,
"step": 2120
},
{
"epoch": 1.364643100750311,
"grad_norm": 1.3828125,
"learning_rate": 6.829600452352806e-05,
"loss": 0.595,
"step": 2125
},
{
"epoch": 1.3678529872005778,
"grad_norm": 1.2421875,
"learning_rate": 6.817043865258774e-05,
"loss": 0.5939,
"step": 2130
},
{
"epoch": 1.3710628736508446,
"grad_norm": 1.3359375,
"learning_rate": 6.804480768342341e-05,
"loss": 0.6006,
"step": 2135
},
{
"epoch": 1.3742727601011113,
"grad_norm": 1.3515625,
"learning_rate": 6.791911305831382e-05,
"loss": 0.5961,
"step": 2140
},
{
"epoch": 1.3774826465513783,
"grad_norm": 1.3046875,
"learning_rate": 6.779335622026847e-05,
"loss": 0.6069,
"step": 2145
},
{
"epoch": 1.380692533001645,
"grad_norm": 1.2734375,
"learning_rate": 6.76675386130111e-05,
"loss": 0.6059,
"step": 2150
},
{
"epoch": 1.3839024194519118,
"grad_norm": 1.3828125,
"learning_rate": 6.754166168096306e-05,
"loss": 0.5894,
"step": 2155
},
{
"epoch": 1.3871123059021788,
"grad_norm": 1.4296875,
"learning_rate": 6.741572686922676e-05,
"loss": 0.6092,
"step": 2160
},
{
"epoch": 1.3903221923524456,
"grad_norm": 1.328125,
"learning_rate": 6.728973562356917e-05,
"loss": 0.5937,
"step": 2165
},
{
"epoch": 1.3935320788027123,
"grad_norm": 1.34375,
"learning_rate": 6.716368939040503e-05,
"loss": 0.5971,
"step": 2170
},
{
"epoch": 1.3967419652529791,
"grad_norm": 1.296875,
"learning_rate": 6.703758961678041e-05,
"loss": 0.5985,
"step": 2175
},
{
"epoch": 1.3999518517032459,
"grad_norm": 1.3125,
"learning_rate": 6.691143775035606e-05,
"loss": 0.6064,
"step": 2180
},
{
"epoch": 1.4031617381535129,
"grad_norm": 1.3515625,
"learning_rate": 6.678523523939074e-05,
"loss": 0.6034,
"step": 2185
},
{
"epoch": 1.4063716246037796,
"grad_norm": 1.296875,
"learning_rate": 6.66589835327246e-05,
"loss": 0.5948,
"step": 2190
},
{
"epoch": 1.4095815110540464,
"grad_norm": 1.28125,
"learning_rate": 6.653268407976258e-05,
"loss": 0.5751,
"step": 2195
},
{
"epoch": 1.4127913975043134,
"grad_norm": 1.265625,
"learning_rate": 6.640633833045783e-05,
"loss": 0.5678,
"step": 2200
},
{
"epoch": 1.4160012839545801,
"grad_norm": 1.28125,
"learning_rate": 6.627994773529489e-05,
"loss": 0.5837,
"step": 2205
},
{
"epoch": 1.419211170404847,
"grad_norm": 1.375,
"learning_rate": 6.615351374527323e-05,
"loss": 0.5856,
"step": 2210
},
{
"epoch": 1.4224210568551139,
"grad_norm": 1.3828125,
"learning_rate": 6.602703781189043e-05,
"loss": 0.5824,
"step": 2215
},
{
"epoch": 1.4256309433053806,
"grad_norm": 1.296875,
"learning_rate": 6.590052138712567e-05,
"loss": 0.6043,
"step": 2220
},
{
"epoch": 1.4288408297556474,
"grad_norm": 1.3515625,
"learning_rate": 6.57739659234229e-05,
"loss": 0.5831,
"step": 2225
},
{
"epoch": 1.4320507162059142,
"grad_norm": 1.3671875,
"learning_rate": 6.564737287367434e-05,
"loss": 0.6001,
"step": 2230
},
{
"epoch": 1.435260602656181,
"grad_norm": 1.25,
"learning_rate": 6.552074369120363e-05,
"loss": 0.6059,
"step": 2235
},
{
"epoch": 1.438470489106448,
"grad_norm": 1.3671875,
"learning_rate": 6.539407982974925e-05,
"loss": 0.5936,
"step": 2240
},
{
"epoch": 1.4416803755567147,
"grad_norm": 1.2890625,
"learning_rate": 6.52673827434478e-05,
"loss": 0.6078,
"step": 2245
},
{
"epoch": 1.4448902620069815,
"grad_norm": 1.3359375,
"learning_rate": 6.514065388681736e-05,
"loss": 0.6106,
"step": 2250
},
{
"epoch": 1.4481001484572484,
"grad_norm": 1.3515625,
"learning_rate": 6.501389471474066e-05,
"loss": 0.5819,
"step": 2255
},
{
"epoch": 1.4513100349075152,
"grad_norm": 1.1953125,
"learning_rate": 6.48871066824485e-05,
"loss": 0.5873,
"step": 2260
},
{
"epoch": 1.454519921357782,
"grad_norm": 1.328125,
"learning_rate": 6.476029124550303e-05,
"loss": 0.586,
"step": 2265
},
{
"epoch": 1.4577298078080487,
"grad_norm": 1.2578125,
"learning_rate": 6.463344985978095e-05,
"loss": 0.6004,
"step": 2270
},
{
"epoch": 1.4609396942583155,
"grad_norm": 1.3046875,
"learning_rate": 6.450658398145692e-05,
"loss": 0.5848,
"step": 2275
},
{
"epoch": 1.4641495807085825,
"grad_norm": 1.453125,
"learning_rate": 6.437969506698678e-05,
"loss": 0.6111,
"step": 2280
},
{
"epoch": 1.4673594671588492,
"grad_norm": 1.375,
"learning_rate": 6.425278457309075e-05,
"loss": 0.5844,
"step": 2285
},
{
"epoch": 1.470569353609116,
"grad_norm": 1.328125,
"learning_rate": 6.41258539567369e-05,
"loss": 0.5919,
"step": 2290
},
{
"epoch": 1.473779240059383,
"grad_norm": 1.3046875,
"learning_rate": 6.399890467512422e-05,
"loss": 0.5992,
"step": 2295
},
{
"epoch": 1.4769891265096498,
"grad_norm": 1.4453125,
"learning_rate": 6.387193818566605e-05,
"loss": 0.5969,
"step": 2300
},
{
"epoch": 1.4801990129599165,
"grad_norm": 1.28125,
"learning_rate": 6.374495594597322e-05,
"loss": 0.6171,
"step": 2305
},
{
"epoch": 1.4834088994101835,
"grad_norm": 1.21875,
"learning_rate": 6.361795941383746e-05,
"loss": 0.5789,
"step": 2310
},
{
"epoch": 1.4866187858604503,
"grad_norm": 1.34375,
"learning_rate": 6.349095004721447e-05,
"loss": 0.6131,
"step": 2315
},
{
"epoch": 1.489828672310717,
"grad_norm": 1.3203125,
"learning_rate": 6.336392930420738e-05,
"loss": 0.5972,
"step": 2320
},
{
"epoch": 1.4930385587609838,
"grad_norm": 1.3984375,
"learning_rate": 6.323689864304991e-05,
"loss": 0.5947,
"step": 2325
},
{
"epoch": 1.4962484452112506,
"grad_norm": 1.2421875,
"learning_rate": 6.31098595220896e-05,
"loss": 0.5936,
"step": 2330
},
{
"epoch": 1.4994583316615175,
"grad_norm": 1.328125,
"learning_rate": 6.298281339977119e-05,
"loss": 0.5879,
"step": 2335
},
{
"epoch": 1.5026682181117843,
"grad_norm": 1.3671875,
"learning_rate": 6.28557617346197e-05,
"loss": 0.5841,
"step": 2340
},
{
"epoch": 1.505878104562051,
"grad_norm": 1.34375,
"learning_rate": 6.272870598522385e-05,
"loss": 0.5699,
"step": 2345
},
{
"epoch": 1.509087991012318,
"grad_norm": 1.3046875,
"learning_rate": 6.260164761021923e-05,
"loss": 0.6094,
"step": 2350
},
{
"epoch": 1.5122978774625848,
"grad_norm": 1.2890625,
"learning_rate": 6.247458806827157e-05,
"loss": 0.5969,
"step": 2355
},
{
"epoch": 1.5155077639128516,
"grad_norm": 1.2421875,
"learning_rate": 6.234752881806001e-05,
"loss": 0.5865,
"step": 2360
},
{
"epoch": 1.5187176503631186,
"grad_norm": 1.3671875,
"learning_rate": 6.222047131826032e-05,
"loss": 0.5898,
"step": 2365
},
{
"epoch": 1.521927536813385,
"grad_norm": 1.3359375,
"learning_rate": 6.20934170275282e-05,
"loss": 0.6127,
"step": 2370
},
{
"epoch": 1.525137423263652,
"grad_norm": 1.3671875,
"learning_rate": 6.196636740448247e-05,
"loss": 0.5926,
"step": 2375
},
{
"epoch": 1.5283473097139189,
"grad_norm": 1.3046875,
"learning_rate": 6.183932390768842e-05,
"loss": 0.582,
"step": 2380
},
{
"epoch": 1.5315571961641856,
"grad_norm": 1.2734375,
"learning_rate": 6.171228799564095e-05,
"loss": 0.57,
"step": 2385
},
{
"epoch": 1.5347670826144526,
"grad_norm": 1.2890625,
"learning_rate": 6.158526112674792e-05,
"loss": 0.5735,
"step": 2390
},
{
"epoch": 1.5379769690647194,
"grad_norm": 1.2890625,
"learning_rate": 6.145824475931338e-05,
"loss": 0.5763,
"step": 2395
},
{
"epoch": 1.5411868555149861,
"grad_norm": 1.2890625,
"learning_rate": 6.133124035152078e-05,
"loss": 0.595,
"step": 2400
},
{
"epoch": 1.5443967419652531,
"grad_norm": 1.1953125,
"learning_rate": 6.120424936141631e-05,
"loss": 0.5876,
"step": 2405
},
{
"epoch": 1.5476066284155197,
"grad_norm": 1.203125,
"learning_rate": 6.10772732468921e-05,
"loss": 0.597,
"step": 2410
},
{
"epoch": 1.5508165148657866,
"grad_norm": 1.3125,
"learning_rate": 6.095031346566951e-05,
"loss": 0.5945,
"step": 2415
},
{
"epoch": 1.5540264013160534,
"grad_norm": 1.390625,
"learning_rate": 6.082337147528239e-05,
"loss": 0.5841,
"step": 2420
},
{
"epoch": 1.5572362877663202,
"grad_norm": 1.25,
"learning_rate": 6.069644873306034e-05,
"loss": 0.5778,
"step": 2425
},
{
"epoch": 1.5604461742165872,
"grad_norm": 1.375,
"learning_rate": 6.0569546696112014e-05,
"loss": 0.5909,
"step": 2430
},
{
"epoch": 1.563656060666854,
"grad_norm": 1.3984375,
"learning_rate": 6.04426668213083e-05,
"loss": 0.6037,
"step": 2435
},
{
"epoch": 1.5668659471171207,
"grad_norm": 1.328125,
"learning_rate": 6.031581056526574e-05,
"loss": 0.6011,
"step": 2440
},
{
"epoch": 1.5700758335673877,
"grad_norm": 1.21875,
"learning_rate": 6.018897938432966e-05,
"loss": 0.5872,
"step": 2445
},
{
"epoch": 1.5732857200176542,
"grad_norm": 1.296875,
"learning_rate": 6.0062174734557554e-05,
"loss": 0.5904,
"step": 2450
},
{
"epoch": 1.5764956064679212,
"grad_norm": 1.203125,
"learning_rate": 5.99353980717023e-05,
"loss": 0.5861,
"step": 2455
},
{
"epoch": 1.5797054929181882,
"grad_norm": 1.3984375,
"learning_rate": 5.9808650851195517e-05,
"loss": 0.5767,
"step": 2460
},
{
"epoch": 1.5829153793684547,
"grad_norm": 1.5,
"learning_rate": 5.968193452813079e-05,
"loss": 0.6083,
"step": 2465
},
{
"epoch": 1.5861252658187217,
"grad_norm": 1.421875,
"learning_rate": 5.9555250557247e-05,
"loss": 0.5851,
"step": 2470
},
{
"epoch": 1.5893351522689885,
"grad_norm": 1.3984375,
"learning_rate": 5.9428600392911624e-05,
"loss": 0.5828,
"step": 2475
},
{
"epoch": 1.5925450387192552,
"grad_norm": 1.3046875,
"learning_rate": 5.9301985489103984e-05,
"loss": 0.5983,
"step": 2480
},
{
"epoch": 1.5957549251695222,
"grad_norm": 1.2734375,
"learning_rate": 5.917540729939869e-05,
"loss": 0.5621,
"step": 2485
},
{
"epoch": 1.598964811619789,
"grad_norm": 1.3671875,
"learning_rate": 5.904886727694879e-05,
"loss": 0.5646,
"step": 2490
},
{
"epoch": 1.6021746980700557,
"grad_norm": 1.296875,
"learning_rate": 5.8922366874469195e-05,
"loss": 0.596,
"step": 2495
},
{
"epoch": 1.6053845845203227,
"grad_norm": 1.2890625,
"learning_rate": 5.879590754421995e-05,
"loss": 0.6159,
"step": 2500
},
{
"epoch": 1.6053845845203227,
"eval_loss": 0.4981262981891632,
"eval_runtime": 2.3761,
"eval_samples_per_second": 84.173,
"eval_steps_per_second": 84.173,
"step": 2500
},
{
"epoch": 1.6085944709705893,
"grad_norm": 1.3828125,
"learning_rate": 5.866949073798958e-05,
"loss": 0.6173,
"step": 2505
},
{
"epoch": 1.6118043574208563,
"grad_norm": 1.265625,
"learning_rate": 5.854311790707845e-05,
"loss": 0.5769,
"step": 2510
},
{
"epoch": 1.615014243871123,
"grad_norm": 1.3203125,
"learning_rate": 5.8416790502282026e-05,
"loss": 0.5856,
"step": 2515
},
{
"epoch": 1.6182241303213898,
"grad_norm": 1.2890625,
"learning_rate": 5.829050997387432e-05,
"loss": 0.5743,
"step": 2520
},
{
"epoch": 1.6214340167716568,
"grad_norm": 1.3203125,
"learning_rate": 5.816427777159117e-05,
"loss": 0.5854,
"step": 2525
},
{
"epoch": 1.6246439032219235,
"grad_norm": 1.3515625,
"learning_rate": 5.8038095344613595e-05,
"loss": 0.5837,
"step": 2530
},
{
"epoch": 1.6278537896721903,
"grad_norm": 1.2109375,
"learning_rate": 5.791196414155121e-05,
"loss": 0.6061,
"step": 2535
},
{
"epoch": 1.6310636761224573,
"grad_norm": 1.1796875,
"learning_rate": 5.778588561042556e-05,
"loss": 0.5856,
"step": 2540
},
{
"epoch": 1.6342735625727238,
"grad_norm": 1.296875,
"learning_rate": 5.76598611986535e-05,
"loss": 0.5721,
"step": 2545
},
{
"epoch": 1.6374834490229908,
"grad_norm": 1.3046875,
"learning_rate": 5.753389235303055e-05,
"loss": 0.5907,
"step": 2550
},
{
"epoch": 1.6406933354732578,
"grad_norm": 1.2578125,
"learning_rate": 5.7407980519714346e-05,
"loss": 0.5801,
"step": 2555
},
{
"epoch": 1.6439032219235243,
"grad_norm": 1.5,
"learning_rate": 5.728212714420804e-05,
"loss": 0.5794,
"step": 2560
},
{
"epoch": 1.6471131083737913,
"grad_norm": 1.3515625,
"learning_rate": 5.71563336713436e-05,
"loss": 0.5779,
"step": 2565
},
{
"epoch": 1.650322994824058,
"grad_norm": 1.4765625,
"learning_rate": 5.7030601545265336e-05,
"loss": 0.5851,
"step": 2570
},
{
"epoch": 1.6535328812743249,
"grad_norm": 1.3359375,
"learning_rate": 5.6904932209413276e-05,
"loss": 0.5868,
"step": 2575
},
{
"epoch": 1.6567427677245918,
"grad_norm": 1.3984375,
"learning_rate": 5.6779327106506594e-05,
"loss": 0.5722,
"step": 2580
},
{
"epoch": 1.6599526541748586,
"grad_norm": 1.328125,
"learning_rate": 5.665378767852704e-05,
"loss": 0.5988,
"step": 2585
},
{
"epoch": 1.6631625406251254,
"grad_norm": 1.25,
"learning_rate": 5.652831536670242e-05,
"loss": 0.5766,
"step": 2590
},
{
"epoch": 1.6663724270753923,
"grad_norm": 1.3984375,
"learning_rate": 5.640291161149e-05,
"loss": 0.592,
"step": 2595
},
{
"epoch": 1.669582313525659,
"grad_norm": 1.34375,
"learning_rate": 5.627757785256006e-05,
"loss": 0.5893,
"step": 2600
},
{
"epoch": 1.6727921999759259,
"grad_norm": 1.359375,
"learning_rate": 5.615231552877921e-05,
"loss": 0.5747,
"step": 2605
},
{
"epoch": 1.6760020864261926,
"grad_norm": 1.3125,
"learning_rate": 5.602712607819404e-05,
"loss": 0.5804,
"step": 2610
},
{
"epoch": 1.6792119728764594,
"grad_norm": 1.3515625,
"learning_rate": 5.590201093801449e-05,
"loss": 0.5734,
"step": 2615
},
{
"epoch": 1.6824218593267264,
"grad_norm": 1.21875,
"learning_rate": 5.577697154459742e-05,
"loss": 0.5708,
"step": 2620
},
{
"epoch": 1.6856317457769932,
"grad_norm": 1.3359375,
"learning_rate": 5.565200933343009e-05,
"loss": 0.5863,
"step": 2625
},
{
"epoch": 1.68884163222726,
"grad_norm": 1.2734375,
"learning_rate": 5.5527125739113686e-05,
"loss": 0.5846,
"step": 2630
},
{
"epoch": 1.692051518677527,
"grad_norm": 1.28125,
"learning_rate": 5.540232219534685e-05,
"loss": 0.5533,
"step": 2635
},
{
"epoch": 1.6952614051277934,
"grad_norm": 1.2890625,
"learning_rate": 5.527760013490922e-05,
"loss": 0.5916,
"step": 2640
},
{
"epoch": 1.6984712915780604,
"grad_norm": 1.328125,
"learning_rate": 5.515296098964499e-05,
"loss": 0.5641,
"step": 2645
},
{
"epoch": 1.7016811780283274,
"grad_norm": 1.2265625,
"learning_rate": 5.502840619044645e-05,
"loss": 0.5737,
"step": 2650
},
{
"epoch": 1.704891064478594,
"grad_norm": 1.28125,
"learning_rate": 5.490393716723757e-05,
"loss": 0.5728,
"step": 2655
},
{
"epoch": 1.708100950928861,
"grad_norm": 1.2265625,
"learning_rate": 5.477955534895762e-05,
"loss": 0.5614,
"step": 2660
},
{
"epoch": 1.7113108373791277,
"grad_norm": 1.34375,
"learning_rate": 5.465526216354471e-05,
"loss": 0.5819,
"step": 2665
},
{
"epoch": 1.7145207238293945,
"grad_norm": 1.203125,
"learning_rate": 5.453105903791942e-05,
"loss": 0.5709,
"step": 2670
},
{
"epoch": 1.7177306102796615,
"grad_norm": 1.234375,
"learning_rate": 5.44069473979684e-05,
"loss": 0.5951,
"step": 2675
},
{
"epoch": 1.7209404967299282,
"grad_norm": 1.2265625,
"learning_rate": 5.428292866852808e-05,
"loss": 0.5705,
"step": 2680
},
{
"epoch": 1.724150383180195,
"grad_norm": 1.265625,
"learning_rate": 5.4159004273368166e-05,
"loss": 0.5787,
"step": 2685
},
{
"epoch": 1.727360269630462,
"grad_norm": 1.2578125,
"learning_rate": 5.4035175635175464e-05,
"loss": 0.5832,
"step": 2690
},
{
"epoch": 1.7305701560807285,
"grad_norm": 1.3046875,
"learning_rate": 5.3911444175537394e-05,
"loss": 0.5888,
"step": 2695
},
{
"epoch": 1.7337800425309955,
"grad_norm": 1.3046875,
"learning_rate": 5.3787811314925776e-05,
"loss": 0.5695,
"step": 2700
},
{
"epoch": 1.7369899289812623,
"grad_norm": 1.21875,
"learning_rate": 5.3664278472680496e-05,
"loss": 0.569,
"step": 2705
},
{
"epoch": 1.740199815431529,
"grad_norm": 1.234375,
"learning_rate": 5.3540847066993173e-05,
"loss": 0.5853,
"step": 2710
},
{
"epoch": 1.743409701881796,
"grad_norm": 1.25,
"learning_rate": 5.341751851489091e-05,
"loss": 0.589,
"step": 2715
},
{
"epoch": 1.7466195883320628,
"grad_norm": 1.203125,
"learning_rate": 5.329429423222003e-05,
"loss": 0.5679,
"step": 2720
},
{
"epoch": 1.7498294747823295,
"grad_norm": 1.2890625,
"learning_rate": 5.3171175633629835e-05,
"loss": 0.5823,
"step": 2725
},
{
"epoch": 1.7530393612325965,
"grad_norm": 1.2265625,
"learning_rate": 5.3048164132556285e-05,
"loss": 0.5561,
"step": 2730
},
{
"epoch": 1.756249247682863,
"grad_norm": 1.265625,
"learning_rate": 5.292526114120589e-05,
"loss": 0.5701,
"step": 2735
},
{
"epoch": 1.75945913413313,
"grad_norm": 1.1953125,
"learning_rate": 5.28024680705394e-05,
"loss": 0.5779,
"step": 2740
},
{
"epoch": 1.762669020583397,
"grad_norm": 1.2421875,
"learning_rate": 5.267978633025568e-05,
"loss": 0.5607,
"step": 2745
},
{
"epoch": 1.7658789070336636,
"grad_norm": 1.171875,
"learning_rate": 5.255721732877546e-05,
"loss": 0.5862,
"step": 2750
},
{
"epoch": 1.7690887934839306,
"grad_norm": 1.296875,
"learning_rate": 5.243476247322521e-05,
"loss": 0.5764,
"step": 2755
},
{
"epoch": 1.7722986799341973,
"grad_norm": 1.296875,
"learning_rate": 5.2312423169420955e-05,
"loss": 0.5814,
"step": 2760
},
{
"epoch": 1.775508566384464,
"grad_norm": 1.2890625,
"learning_rate": 5.219020082185219e-05,
"loss": 0.5808,
"step": 2765
},
{
"epoch": 1.778718452834731,
"grad_norm": 1.2265625,
"learning_rate": 5.206809683366569e-05,
"loss": 0.58,
"step": 2770
},
{
"epoch": 1.7819283392849978,
"grad_norm": 1.2265625,
"learning_rate": 5.1946112606649435e-05,
"loss": 0.5723,
"step": 2775
},
{
"epoch": 1.7851382257352646,
"grad_norm": 1.2265625,
"learning_rate": 5.182424954121652e-05,
"loss": 0.5789,
"step": 2780
},
{
"epoch": 1.7883481121855316,
"grad_norm": 1.2890625,
"learning_rate": 5.170250903638909e-05,
"loss": 0.5726,
"step": 2785
},
{
"epoch": 1.7915579986357981,
"grad_norm": 1.1875,
"learning_rate": 5.158089248978221e-05,
"loss": 0.5718,
"step": 2790
},
{
"epoch": 1.794767885086065,
"grad_norm": 1.28125,
"learning_rate": 5.1459401297587916e-05,
"loss": 0.5845,
"step": 2795
},
{
"epoch": 1.7979777715363319,
"grad_norm": 1.203125,
"learning_rate": 5.1338036854559113e-05,
"loss": 0.563,
"step": 2800
},
{
"epoch": 1.8011876579865986,
"grad_norm": 1.2421875,
"learning_rate": 5.1216800553993606e-05,
"loss": 0.5841,
"step": 2805
},
{
"epoch": 1.8043975444368656,
"grad_norm": 1.2734375,
"learning_rate": 5.109569378771808e-05,
"loss": 0.5648,
"step": 2810
},
{
"epoch": 1.8076074308871324,
"grad_norm": 1.2421875,
"learning_rate": 5.097471794607214e-05,
"loss": 0.5768,
"step": 2815
},
{
"epoch": 1.8108173173373991,
"grad_norm": 1.2421875,
"learning_rate": 5.0853874417892324e-05,
"loss": 0.5596,
"step": 2820
},
{
"epoch": 1.8140272037876661,
"grad_norm": 1.3671875,
"learning_rate": 5.07331645904962e-05,
"loss": 0.5873,
"step": 2825
},
{
"epoch": 1.8172370902379327,
"grad_norm": 1.3125,
"learning_rate": 5.061258984966636e-05,
"loss": 0.5807,
"step": 2830
},
{
"epoch": 1.8204469766881997,
"grad_norm": 1.3359375,
"learning_rate": 5.049215157963464e-05,
"loss": 0.5814,
"step": 2835
},
{
"epoch": 1.8236568631384664,
"grad_norm": 1.4453125,
"learning_rate": 5.03718511630661e-05,
"loss": 0.5727,
"step": 2840
},
{
"epoch": 1.8268667495887332,
"grad_norm": 1.2890625,
"learning_rate": 5.025168998104322e-05,
"loss": 0.5731,
"step": 2845
},
{
"epoch": 1.8300766360390002,
"grad_norm": 1.25,
"learning_rate": 5.013166941304999e-05,
"loss": 0.5664,
"step": 2850
},
{
"epoch": 1.833286522489267,
"grad_norm": 1.3046875,
"learning_rate": 5.0011790836956197e-05,
"loss": 0.5812,
"step": 2855
},
{
"epoch": 1.8364964089395337,
"grad_norm": 1.21875,
"learning_rate": 4.989205562900144e-05,
"loss": 0.5715,
"step": 2860
},
{
"epoch": 1.8397062953898007,
"grad_norm": 1.2265625,
"learning_rate": 4.9772465163779474e-05,
"loss": 0.5785,
"step": 2865
},
{
"epoch": 1.8429161818400674,
"grad_norm": 1.1796875,
"learning_rate": 4.9653020814222315e-05,
"loss": 0.5813,
"step": 2870
},
{
"epoch": 1.8461260682903342,
"grad_norm": 1.2265625,
"learning_rate": 4.9533723951584554e-05,
"loss": 0.59,
"step": 2875
},
{
"epoch": 1.8493359547406012,
"grad_norm": 1.21875,
"learning_rate": 4.94145759454276e-05,
"loss": 0.565,
"step": 2880
},
{
"epoch": 1.8525458411908677,
"grad_norm": 1.3046875,
"learning_rate": 4.929557816360391e-05,
"loss": 0.5839,
"step": 2885
},
{
"epoch": 1.8557557276411347,
"grad_norm": 1.234375,
"learning_rate": 4.9176731972241376e-05,
"loss": 0.5755,
"step": 2890
},
{
"epoch": 1.8589656140914015,
"grad_norm": 1.3125,
"learning_rate": 4.905803873572755e-05,
"loss": 0.571,
"step": 2895
},
{
"epoch": 1.8621755005416682,
"grad_norm": 1.203125,
"learning_rate": 4.8939499816694035e-05,
"loss": 0.572,
"step": 2900
},
{
"epoch": 1.8653853869919352,
"grad_norm": 1.234375,
"learning_rate": 4.882111657600081e-05,
"loss": 0.5559,
"step": 2905
},
{
"epoch": 1.868595273442202,
"grad_norm": 1.234375,
"learning_rate": 4.8702890372720664e-05,
"loss": 0.5792,
"step": 2910
},
{
"epoch": 1.8718051598924688,
"grad_norm": 1.2265625,
"learning_rate": 4.85848225641235e-05,
"loss": 0.5611,
"step": 2915
},
{
"epoch": 1.8750150463427357,
"grad_norm": 1.2421875,
"learning_rate": 4.8466914505660834e-05,
"loss": 0.5663,
"step": 2920
},
{
"epoch": 1.8782249327930023,
"grad_norm": 1.3828125,
"learning_rate": 4.834916755095022e-05,
"loss": 0.5914,
"step": 2925
},
{
"epoch": 1.8814348192432693,
"grad_norm": 1.203125,
"learning_rate": 4.823158305175967e-05,
"loss": 0.5712,
"step": 2930
},
{
"epoch": 1.884644705693536,
"grad_norm": 1.265625,
"learning_rate": 4.811416235799216e-05,
"loss": 0.5957,
"step": 2935
},
{
"epoch": 1.8878545921438028,
"grad_norm": 1.375,
"learning_rate": 4.7996906817670155e-05,
"loss": 0.5872,
"step": 2940
},
{
"epoch": 1.8910644785940698,
"grad_norm": 1.34375,
"learning_rate": 4.78798177769201e-05,
"loss": 0.5604,
"step": 2945
},
{
"epoch": 1.8942743650443365,
"grad_norm": 1.359375,
"learning_rate": 4.7762896579956966e-05,
"loss": 0.556,
"step": 2950
},
{
"epoch": 1.8974842514946033,
"grad_norm": 1.2734375,
"learning_rate": 4.764614456906886e-05,
"loss": 0.5577,
"step": 2955
},
{
"epoch": 1.9006941379448703,
"grad_norm": 1.1484375,
"learning_rate": 4.752956308460155e-05,
"loss": 0.584,
"step": 2960
},
{
"epoch": 1.903904024395137,
"grad_norm": 1.296875,
"learning_rate": 4.741315346494314e-05,
"loss": 0.5625,
"step": 2965
},
{
"epoch": 1.9071139108454038,
"grad_norm": 1.2734375,
"learning_rate": 4.729691704650867e-05,
"loss": 0.5684,
"step": 2970
},
{
"epoch": 1.9103237972956708,
"grad_norm": 1.3359375,
"learning_rate": 4.718085516372478e-05,
"loss": 0.5851,
"step": 2975
},
{
"epoch": 1.9135336837459374,
"grad_norm": 1.1875,
"learning_rate": 4.70649691490144e-05,
"loss": 0.5637,
"step": 2980
},
{
"epoch": 1.9167435701962043,
"grad_norm": 1.296875,
"learning_rate": 4.694926033278142e-05,
"loss": 0.5792,
"step": 2985
},
{
"epoch": 1.919953456646471,
"grad_norm": 1.203125,
"learning_rate": 4.683373004339547e-05,
"loss": 0.5406,
"step": 2990
},
{
"epoch": 1.9231633430967379,
"grad_norm": 1.34375,
"learning_rate": 4.6718379607176634e-05,
"loss": 0.5777,
"step": 2995
},
{
"epoch": 1.9263732295470049,
"grad_norm": 1.25,
"learning_rate": 4.6603210348380235e-05,
"loss": 0.5742,
"step": 3000
},
{
"epoch": 1.9263732295470049,
"eval_loss": 0.48648878931999207,
"eval_runtime": 2.4037,
"eval_samples_per_second": 83.204,
"eval_steps_per_second": 83.204,
"step": 3000
},
{
"epoch": 1.9295831159972716,
"grad_norm": 1.2265625,
"learning_rate": 4.64882235891816e-05,
"loss": 0.5662,
"step": 3005
},
{
"epoch": 1.9327930024475384,
"grad_norm": 1.2890625,
"learning_rate": 4.637342064966095e-05,
"loss": 0.5972,
"step": 3010
},
{
"epoch": 1.9360028888978054,
"grad_norm": 1.2265625,
"learning_rate": 4.625880284778818e-05,
"loss": 0.5682,
"step": 3015
},
{
"epoch": 1.939212775348072,
"grad_norm": 1.2578125,
"learning_rate": 4.614437149940776e-05,
"loss": 0.5703,
"step": 3020
},
{
"epoch": 1.942422661798339,
"grad_norm": 1.3203125,
"learning_rate": 4.603012791822362e-05,
"loss": 0.5611,
"step": 3025
},
{
"epoch": 1.9456325482486057,
"grad_norm": 1.21875,
"learning_rate": 4.591607341578407e-05,
"loss": 0.5471,
"step": 3030
},
{
"epoch": 1.9488424346988724,
"grad_norm": 1.1640625,
"learning_rate": 4.580220930146675e-05,
"loss": 0.5398,
"step": 3035
},
{
"epoch": 1.9520523211491394,
"grad_norm": 1.3203125,
"learning_rate": 4.568853688246357e-05,
"loss": 0.5864,
"step": 3040
},
{
"epoch": 1.9552622075994062,
"grad_norm": 1.234375,
"learning_rate": 4.557505746376576e-05,
"loss": 0.5662,
"step": 3045
},
{
"epoch": 1.958472094049673,
"grad_norm": 1.25,
"learning_rate": 4.546177234814881e-05,
"loss": 0.5745,
"step": 3050
},
{
"epoch": 1.96168198049994,
"grad_norm": 1.1875,
"learning_rate": 4.53486828361576e-05,
"loss": 0.5486,
"step": 3055
},
{
"epoch": 1.9648918669502067,
"grad_norm": 1.2265625,
"learning_rate": 4.523579022609139e-05,
"loss": 0.5703,
"step": 3060
},
{
"epoch": 1.9681017534004734,
"grad_norm": 1.3125,
"learning_rate": 4.512309581398896e-05,
"loss": 0.5627,
"step": 3065
},
{
"epoch": 1.9713116398507404,
"grad_norm": 1.296875,
"learning_rate": 4.5010600893613714e-05,
"loss": 0.5839,
"step": 3070
},
{
"epoch": 1.974521526301007,
"grad_norm": 1.2421875,
"learning_rate": 4.489830675643888e-05,
"loss": 0.5638,
"step": 3075
},
{
"epoch": 1.977731412751274,
"grad_norm": 1.2578125,
"learning_rate": 4.478621469163259e-05,
"loss": 0.5709,
"step": 3080
},
{
"epoch": 1.9809412992015407,
"grad_norm": 1.2421875,
"learning_rate": 4.4674325986043145e-05,
"loss": 0.558,
"step": 3085
},
{
"epoch": 1.9841511856518075,
"grad_norm": 1.1953125,
"learning_rate": 4.456264192418422e-05,
"loss": 0.5639,
"step": 3090
},
{
"epoch": 1.9873610721020745,
"grad_norm": 1.25,
"learning_rate": 4.445116378822014e-05,
"loss": 0.5742,
"step": 3095
},
{
"epoch": 1.9905709585523412,
"grad_norm": 1.25,
"learning_rate": 4.433989285795112e-05,
"loss": 0.5653,
"step": 3100
},
{
"epoch": 1.993780845002608,
"grad_norm": 1.234375,
"learning_rate": 4.4228830410798594e-05,
"loss": 0.581,
"step": 3105
},
{
"epoch": 1.996990731452875,
"grad_norm": 1.1640625,
"learning_rate": 4.411797772179059e-05,
"loss": 0.5658,
"step": 3110
},
{
"epoch": 1.9995586406130883,
"eval_loss": 0.48290687799453735,
"eval_runtime": 2.4097,
"eval_samples_per_second": 82.996,
"eval_steps_per_second": 82.996,
"step": 3114
},
{
"epoch": 2.000641977290053,
"grad_norm": 3.296875,
"learning_rate": 4.4007336063547e-05,
"loss": 0.6695,
"step": 3115
},
{
"epoch": 2.00385186374032,
"grad_norm": 1.1796875,
"learning_rate": 4.389690670626507e-05,
"loss": 0.5518,
"step": 3120
},
{
"epoch": 2.007061750190587,
"grad_norm": 1.25,
"learning_rate": 4.378669091770474e-05,
"loss": 0.5527,
"step": 3125
},
{
"epoch": 2.0102716366408537,
"grad_norm": 1.34375,
"learning_rate": 4.367668996317413e-05,
"loss": 0.5517,
"step": 3130
},
{
"epoch": 2.0134815230911207,
"grad_norm": 1.3046875,
"learning_rate": 4.3566905105515035e-05,
"loss": 0.5451,
"step": 3135
},
{
"epoch": 2.0166914095413873,
"grad_norm": 1.25,
"learning_rate": 4.345733760508832e-05,
"loss": 0.5342,
"step": 3140
},
{
"epoch": 2.0199012959916542,
"grad_norm": 1.21875,
"learning_rate": 4.334798871975963e-05,
"loss": 0.5445,
"step": 3145
},
{
"epoch": 2.0231111824419212,
"grad_norm": 1.15625,
"learning_rate": 4.3238859704884784e-05,
"loss": 0.5442,
"step": 3150
},
{
"epoch": 2.0263210688921878,
"grad_norm": 1.203125,
"learning_rate": 4.312995181329543e-05,
"loss": 0.5367,
"step": 3155
},
{
"epoch": 2.0295309553424548,
"grad_norm": 1.2265625,
"learning_rate": 4.3021266295284665e-05,
"loss": 0.5466,
"step": 3160
},
{
"epoch": 2.0327408417927217,
"grad_norm": 1.1953125,
"learning_rate": 4.291280439859269e-05,
"loss": 0.5709,
"step": 3165
},
{
"epoch": 2.0359507282429883,
"grad_norm": 1.2421875,
"learning_rate": 4.280456736839245e-05,
"loss": 0.5409,
"step": 3170
},
{
"epoch": 2.0391606146932553,
"grad_norm": 1.2265625,
"learning_rate": 4.269655644727536e-05,
"loss": 0.5526,
"step": 3175
},
{
"epoch": 2.0423705011435223,
"grad_norm": 1.2578125,
"learning_rate": 4.258877287523707e-05,
"loss": 0.539,
"step": 3180
},
{
"epoch": 2.045580387593789,
"grad_norm": 1.1796875,
"learning_rate": 4.2481217889663156e-05,
"loss": 0.5503,
"step": 3185
},
{
"epoch": 2.048790274044056,
"grad_norm": 1.1875,
"learning_rate": 4.237389272531499e-05,
"loss": 0.5537,
"step": 3190
},
{
"epoch": 2.0520001604943223,
"grad_norm": 1.2578125,
"learning_rate": 4.2266798614315505e-05,
"loss": 0.544,
"step": 3195
},
{
"epoch": 2.0552100469445893,
"grad_norm": 1.3671875,
"learning_rate": 4.2159936786135115e-05,
"loss": 0.5358,
"step": 3200
},
{
"epoch": 2.0584199333948563,
"grad_norm": 1.2578125,
"learning_rate": 4.2053308467577516e-05,
"loss": 0.5185,
"step": 3205
},
{
"epoch": 2.061629819845123,
"grad_norm": 1.2109375,
"learning_rate": 4.1946914882765684e-05,
"loss": 0.5666,
"step": 3210
},
{
"epoch": 2.06483970629539,
"grad_norm": 1.1953125,
"learning_rate": 4.184075725312776e-05,
"loss": 0.5325,
"step": 3215
},
{
"epoch": 2.068049592745657,
"grad_norm": 1.25,
"learning_rate": 4.173483679738309e-05,
"loss": 0.5484,
"step": 3220
},
{
"epoch": 2.0712594791959233,
"grad_norm": 1.2890625,
"learning_rate": 4.162915473152816e-05,
"loss": 0.5483,
"step": 3225
},
{
"epoch": 2.0744693656461903,
"grad_norm": 1.28125,
"learning_rate": 4.152371226882268e-05,
"loss": 0.5411,
"step": 3230
},
{
"epoch": 2.077679252096457,
"grad_norm": 1.2578125,
"learning_rate": 4.141851061977565e-05,
"loss": 0.5503,
"step": 3235
},
{
"epoch": 2.080889138546724,
"grad_norm": 1.140625,
"learning_rate": 4.131355099213149e-05,
"loss": 0.552,
"step": 3240
},
{
"epoch": 2.084099024996991,
"grad_norm": 1.203125,
"learning_rate": 4.120883459085611e-05,
"loss": 0.5297,
"step": 3245
},
{
"epoch": 2.0873089114472574,
"grad_norm": 1.2421875,
"learning_rate": 4.110436261812313e-05,
"loss": 0.5324,
"step": 3250
},
{
"epoch": 2.0905187978975244,
"grad_norm": 1.1484375,
"learning_rate": 4.100013627330006e-05,
"loss": 0.5355,
"step": 3255
},
{
"epoch": 2.0937286843477914,
"grad_norm": 1.234375,
"learning_rate": 4.089615675293452e-05,
"loss": 0.5508,
"step": 3260
},
{
"epoch": 2.096938570798058,
"grad_norm": 1.25,
"learning_rate": 4.0792425250740544e-05,
"loss": 0.5185,
"step": 3265
},
{
"epoch": 2.100148457248325,
"grad_norm": 1.2890625,
"learning_rate": 4.0688942957584825e-05,
"loss": 0.5783,
"step": 3270
},
{
"epoch": 2.103358343698592,
"grad_norm": 1.2578125,
"learning_rate": 4.058571106147307e-05,
"loss": 0.5403,
"step": 3275
},
{
"epoch": 2.1065682301488584,
"grad_norm": 1.3359375,
"learning_rate": 4.048273074753637e-05,
"loss": 0.5358,
"step": 3280
},
{
"epoch": 2.1097781165991254,
"grad_norm": 1.296875,
"learning_rate": 4.038000319801756e-05,
"loss": 0.5203,
"step": 3285
},
{
"epoch": 2.112988003049392,
"grad_norm": 1.234375,
"learning_rate": 4.0277529592257676e-05,
"loss": 0.5501,
"step": 3290
},
{
"epoch": 2.116197889499659,
"grad_norm": 1.234375,
"learning_rate": 4.017531110668244e-05,
"loss": 0.5677,
"step": 3295
},
{
"epoch": 2.119407775949926,
"grad_norm": 1.2265625,
"learning_rate": 4.0073348914788684e-05,
"loss": 0.536,
"step": 3300
},
{
"epoch": 2.1226176624001925,
"grad_norm": 1.265625,
"learning_rate": 3.997164418713093e-05,
"loss": 0.553,
"step": 3305
},
{
"epoch": 2.1258275488504594,
"grad_norm": 1.3515625,
"learning_rate": 3.987019809130794e-05,
"loss": 0.5614,
"step": 3310
},
{
"epoch": 2.1290374353007264,
"grad_norm": 1.25,
"learning_rate": 3.9769011791949305e-05,
"loss": 0.5337,
"step": 3315
},
{
"epoch": 2.132247321750993,
"grad_norm": 1.1640625,
"learning_rate": 3.9668086450702086e-05,
"loss": 0.5257,
"step": 3320
},
{
"epoch": 2.13545720820126,
"grad_norm": 1.2109375,
"learning_rate": 3.956742322621747e-05,
"loss": 0.5379,
"step": 3325
},
{
"epoch": 2.1386670946515265,
"grad_norm": 1.2578125,
"learning_rate": 3.946702327413746e-05,
"loss": 0.5356,
"step": 3330
},
{
"epoch": 2.1418769811017935,
"grad_norm": 1.3203125,
"learning_rate": 3.936688774708163e-05,
"loss": 0.5343,
"step": 3335
},
{
"epoch": 2.1450868675520605,
"grad_norm": 1.265625,
"learning_rate": 3.926701779463389e-05,
"loss": 0.5452,
"step": 3340
},
{
"epoch": 2.148296754002327,
"grad_norm": 1.171875,
"learning_rate": 3.916741456332926e-05,
"loss": 0.5443,
"step": 3345
},
{
"epoch": 2.151506640452594,
"grad_norm": 1.28125,
"learning_rate": 3.906807919664073e-05,
"loss": 0.5368,
"step": 3350
},
{
"epoch": 2.154716526902861,
"grad_norm": 1.1953125,
"learning_rate": 3.8969012834966135e-05,
"loss": 0.5436,
"step": 3355
},
{
"epoch": 2.1579264133531275,
"grad_norm": 1.21875,
"learning_rate": 3.8870216615615045e-05,
"loss": 0.5238,
"step": 3360
},
{
"epoch": 2.1611362998033945,
"grad_norm": 1.2734375,
"learning_rate": 3.877169167279575e-05,
"loss": 0.5483,
"step": 3365
},
{
"epoch": 2.1643461862536615,
"grad_norm": 1.296875,
"learning_rate": 3.867343913760218e-05,
"loss": 0.5313,
"step": 3370
},
{
"epoch": 2.167556072703928,
"grad_norm": 1.2109375,
"learning_rate": 3.857546013800095e-05,
"loss": 0.539,
"step": 3375
},
{
"epoch": 2.170765959154195,
"grad_norm": 1.328125,
"learning_rate": 3.847775579881844e-05,
"loss": 0.5385,
"step": 3380
},
{
"epoch": 2.1739758456044616,
"grad_norm": 1.2578125,
"learning_rate": 3.8380327241727804e-05,
"loss": 0.5496,
"step": 3385
},
{
"epoch": 2.1771857320547285,
"grad_norm": 1.203125,
"learning_rate": 3.828317558523619e-05,
"loss": 0.545,
"step": 3390
},
{
"epoch": 2.1803956185049955,
"grad_norm": 1.2265625,
"learning_rate": 3.818630194467181e-05,
"loss": 0.5343,
"step": 3395
},
{
"epoch": 2.183605504955262,
"grad_norm": 1.3828125,
"learning_rate": 3.8089707432171193e-05,
"loss": 0.5325,
"step": 3400
},
{
"epoch": 2.186815391405529,
"grad_norm": 1.28125,
"learning_rate": 3.799339315666641e-05,
"loss": 0.547,
"step": 3405
},
{
"epoch": 2.190025277855796,
"grad_norm": 1.296875,
"learning_rate": 3.789736022387231e-05,
"loss": 0.5448,
"step": 3410
},
{
"epoch": 2.1932351643060626,
"grad_norm": 1.2734375,
"learning_rate": 3.780160973627386e-05,
"loss": 0.5431,
"step": 3415
},
{
"epoch": 2.1964450507563296,
"grad_norm": 1.21875,
"learning_rate": 3.770614279311348e-05,
"loss": 0.5599,
"step": 3420
},
{
"epoch": 2.1996549372065965,
"grad_norm": 1.203125,
"learning_rate": 3.7610960490378415e-05,
"loss": 0.5474,
"step": 3425
},
{
"epoch": 2.202864823656863,
"grad_norm": 1.28125,
"learning_rate": 3.751606392078816e-05,
"loss": 0.5688,
"step": 3430
},
{
"epoch": 2.20607471010713,
"grad_norm": 1.2578125,
"learning_rate": 3.74214541737819e-05,
"loss": 0.5326,
"step": 3435
},
{
"epoch": 2.2092845965573966,
"grad_norm": 1.2421875,
"learning_rate": 3.732713233550606e-05,
"loss": 0.5303,
"step": 3440
},
{
"epoch": 2.2124944830076636,
"grad_norm": 1.2734375,
"learning_rate": 3.723309948880176e-05,
"loss": 0.5402,
"step": 3445
},
{
"epoch": 2.2157043694579306,
"grad_norm": 1.265625,
"learning_rate": 3.713935671319239e-05,
"loss": 0.5268,
"step": 3450
},
{
"epoch": 2.218914255908197,
"grad_norm": 1.2734375,
"learning_rate": 3.704590508487129e-05,
"loss": 0.5613,
"step": 3455
},
{
"epoch": 2.222124142358464,
"grad_norm": 1.3125,
"learning_rate": 3.695274567668933e-05,
"loss": 0.5533,
"step": 3460
},
{
"epoch": 2.2253340288087307,
"grad_norm": 1.2109375,
"learning_rate": 3.6859879558142594e-05,
"loss": 0.5403,
"step": 3465
},
{
"epoch": 2.2285439152589976,
"grad_norm": 1.234375,
"learning_rate": 3.6767307795360145e-05,
"loss": 0.5304,
"step": 3470
},
{
"epoch": 2.2317538017092646,
"grad_norm": 1.1953125,
"learning_rate": 3.6675031451091755e-05,
"loss": 0.5323,
"step": 3475
},
{
"epoch": 2.234963688159531,
"grad_norm": 1.2578125,
"learning_rate": 3.65830515846957e-05,
"loss": 0.5299,
"step": 3480
},
{
"epoch": 2.238173574609798,
"grad_norm": 1.1875,
"learning_rate": 3.64913692521266e-05,
"loss": 0.5645,
"step": 3485
},
{
"epoch": 2.241383461060065,
"grad_norm": 1.375,
"learning_rate": 3.6399985505923295e-05,
"loss": 0.5453,
"step": 3490
},
{
"epoch": 2.2445933475103317,
"grad_norm": 1.25,
"learning_rate": 3.6308901395196825e-05,
"loss": 0.5387,
"step": 3495
},
{
"epoch": 2.2478032339605987,
"grad_norm": 1.21875,
"learning_rate": 3.621811796561827e-05,
"loss": 0.5512,
"step": 3500
},
{
"epoch": 2.2478032339605987,
"eval_loss": 0.4768131375312805,
"eval_runtime": 2.3764,
"eval_samples_per_second": 84.16,
"eval_steps_per_second": 84.16,
"step": 3500
},
{
"epoch": 2.2510131204108657,
"grad_norm": 1.2578125,
"learning_rate": 3.6127636259406837e-05,
"loss": 0.555,
"step": 3505
},
{
"epoch": 2.254223006861132,
"grad_norm": 1.2109375,
"learning_rate": 3.6037457315317844e-05,
"loss": 0.5454,
"step": 3510
},
{
"epoch": 2.257432893311399,
"grad_norm": 1.25,
"learning_rate": 3.5947582168630855e-05,
"loss": 0.535,
"step": 3515
},
{
"epoch": 2.2606427797616657,
"grad_norm": 1.21875,
"learning_rate": 3.585801185113771e-05,
"loss": 0.5461,
"step": 3520
},
{
"epoch": 2.2638526662119327,
"grad_norm": 1.1328125,
"learning_rate": 3.576874739113073e-05,
"loss": 0.527,
"step": 3525
},
{
"epoch": 2.2670625526621997,
"grad_norm": 1.34375,
"learning_rate": 3.567978981339095e-05,
"loss": 0.5364,
"step": 3530
},
{
"epoch": 2.2702724391124662,
"grad_norm": 1.234375,
"learning_rate": 3.559114013917624e-05,
"loss": 0.5366,
"step": 3535
},
{
"epoch": 2.273482325562733,
"grad_norm": 1.2890625,
"learning_rate": 3.5502799386209726e-05,
"loss": 0.5386,
"step": 3540
},
{
"epoch": 2.276692212013,
"grad_norm": 1.125,
"learning_rate": 3.5414768568667974e-05,
"loss": 0.5391,
"step": 3545
},
{
"epoch": 2.2799020984632667,
"grad_norm": 1.2109375,
"learning_rate": 3.532704869716943e-05,
"loss": 0.5342,
"step": 3550
},
{
"epoch": 2.2831119849135337,
"grad_norm": 1.21875,
"learning_rate": 3.523964077876279e-05,
"loss": 0.5506,
"step": 3555
},
{
"epoch": 2.2863218713638007,
"grad_norm": 1.2578125,
"learning_rate": 3.5152545816915446e-05,
"loss": 0.561,
"step": 3560
},
{
"epoch": 2.2895317578140673,
"grad_norm": 1.2734375,
"learning_rate": 3.506576481150194e-05,
"loss": 0.5429,
"step": 3565
},
{
"epoch": 2.2927416442643342,
"grad_norm": 1.2109375,
"learning_rate": 3.497929875879254e-05,
"loss": 0.5374,
"step": 3570
},
{
"epoch": 2.295951530714601,
"grad_norm": 1.2265625,
"learning_rate": 3.4893148651441735e-05,
"loss": 0.5634,
"step": 3575
},
{
"epoch": 2.2991614171648678,
"grad_norm": 1.2734375,
"learning_rate": 3.480731547847688e-05,
"loss": 0.5394,
"step": 3580
},
{
"epoch": 2.3023713036151348,
"grad_norm": 1.2109375,
"learning_rate": 3.472180022528686e-05,
"loss": 0.5342,
"step": 3585
},
{
"epoch": 2.3055811900654013,
"grad_norm": 1.1953125,
"learning_rate": 3.4636603873610735e-05,
"loss": 0.547,
"step": 3590
},
{
"epoch": 2.3087910765156683,
"grad_norm": 1.1953125,
"learning_rate": 3.455172740152648e-05,
"loss": 0.5421,
"step": 3595
},
{
"epoch": 2.3120009629659353,
"grad_norm": 1.2890625,
"learning_rate": 3.446717178343976e-05,
"loss": 0.5562,
"step": 3600
},
{
"epoch": 2.315210849416202,
"grad_norm": 1.40625,
"learning_rate": 3.438293799007276e-05,
"loss": 0.5358,
"step": 3605
},
{
"epoch": 2.318420735866469,
"grad_norm": 1.2578125,
"learning_rate": 3.429902698845302e-05,
"loss": 0.5555,
"step": 3610
},
{
"epoch": 2.321630622316736,
"grad_norm": 1.1953125,
"learning_rate": 3.421543974190234e-05,
"loss": 0.5414,
"step": 3615
},
{
"epoch": 2.3248405087670023,
"grad_norm": 1.2734375,
"learning_rate": 3.4132177210025724e-05,
"loss": 0.5336,
"step": 3620
},
{
"epoch": 2.3280503952172693,
"grad_norm": 1.3359375,
"learning_rate": 3.404924034870036e-05,
"loss": 0.5351,
"step": 3625
},
{
"epoch": 2.331260281667536,
"grad_norm": 1.28125,
"learning_rate": 3.396663011006465e-05,
"loss": 0.5523,
"step": 3630
},
{
"epoch": 2.334470168117803,
"grad_norm": 1.203125,
"learning_rate": 3.388434744250726e-05,
"loss": 0.5347,
"step": 3635
},
{
"epoch": 2.33768005456807,
"grad_norm": 1.203125,
"learning_rate": 3.3802393290656274e-05,
"loss": 0.5387,
"step": 3640
},
{
"epoch": 2.3408899410183364,
"grad_norm": 1.2109375,
"learning_rate": 3.372076859536831e-05,
"loss": 0.5309,
"step": 3645
},
{
"epoch": 2.3440998274686033,
"grad_norm": 1.296875,
"learning_rate": 3.363947429371772e-05,
"loss": 0.5531,
"step": 3650
},
{
"epoch": 2.34730971391887,
"grad_norm": 1.2109375,
"learning_rate": 3.355851131898585e-05,
"loss": 0.5437,
"step": 3655
},
{
"epoch": 2.350519600369137,
"grad_norm": 1.1953125,
"learning_rate": 3.347788060065036e-05,
"loss": 0.5143,
"step": 3660
},
{
"epoch": 2.353729486819404,
"grad_norm": 1.234375,
"learning_rate": 3.339758306437445e-05,
"loss": 0.532,
"step": 3665
},
{
"epoch": 2.3569393732696704,
"grad_norm": 1.234375,
"learning_rate": 3.331761963199634e-05,
"loss": 0.5535,
"step": 3670
},
{
"epoch": 2.3601492597199374,
"grad_norm": 1.15625,
"learning_rate": 3.3237991221518636e-05,
"loss": 0.5384,
"step": 3675
},
{
"epoch": 2.3633591461702044,
"grad_norm": 1.296875,
"learning_rate": 3.3158698747097784e-05,
"loss": 0.5444,
"step": 3680
},
{
"epoch": 2.366569032620471,
"grad_norm": 1.21875,
"learning_rate": 3.30797431190336e-05,
"loss": 0.5392,
"step": 3685
},
{
"epoch": 2.369778919070738,
"grad_norm": 1.2265625,
"learning_rate": 3.300112524375881e-05,
"loss": 0.5505,
"step": 3690
},
{
"epoch": 2.372988805521005,
"grad_norm": 1.2578125,
"learning_rate": 3.2922846023828645e-05,
"loss": 0.5432,
"step": 3695
},
{
"epoch": 2.3761986919712714,
"grad_norm": 1.265625,
"learning_rate": 3.2844906357910476e-05,
"loss": 0.5294,
"step": 3700
},
{
"epoch": 2.3794085784215384,
"grad_norm": 1.2578125,
"learning_rate": 3.2767307140773494e-05,
"loss": 0.5619,
"step": 3705
},
{
"epoch": 2.382618464871805,
"grad_norm": 1.1875,
"learning_rate": 3.2690049263278455e-05,
"loss": 0.5422,
"step": 3710
},
{
"epoch": 2.385828351322072,
"grad_norm": 1.3203125,
"learning_rate": 3.261313361236743e-05,
"loss": 0.5413,
"step": 3715
},
{
"epoch": 2.389038237772339,
"grad_norm": 1.234375,
"learning_rate": 3.253656107105362e-05,
"loss": 0.535,
"step": 3720
},
{
"epoch": 2.3922481242226055,
"grad_norm": 1.1953125,
"learning_rate": 3.246033251841126e-05,
"loss": 0.5228,
"step": 3725
},
{
"epoch": 2.3954580106728725,
"grad_norm": 1.21875,
"learning_rate": 3.238444882956548e-05,
"loss": 0.5378,
"step": 3730
},
{
"epoch": 2.3986678971231394,
"grad_norm": 1.28125,
"learning_rate": 3.230891087568229e-05,
"loss": 0.5469,
"step": 3735
},
{
"epoch": 2.401877783573406,
"grad_norm": 1.21875,
"learning_rate": 3.2233719523958563e-05,
"loss": 0.5509,
"step": 3740
},
{
"epoch": 2.405087670023673,
"grad_norm": 1.2109375,
"learning_rate": 3.2158875637612053e-05,
"loss": 0.5212,
"step": 3745
},
{
"epoch": 2.40829755647394,
"grad_norm": 1.1640625,
"learning_rate": 3.208438007587156e-05,
"loss": 0.5221,
"step": 3750
},
{
"epoch": 2.4115074429242065,
"grad_norm": 1.2109375,
"learning_rate": 3.201023369396699e-05,
"loss": 0.5311,
"step": 3755
},
{
"epoch": 2.4147173293744735,
"grad_norm": 1.15625,
"learning_rate": 3.193643734311958e-05,
"loss": 0.5403,
"step": 3760
},
{
"epoch": 2.41792721582474,
"grad_norm": 1.234375,
"learning_rate": 3.1862991870532106e-05,
"loss": 0.548,
"step": 3765
},
{
"epoch": 2.421137102275007,
"grad_norm": 1.2734375,
"learning_rate": 3.1789898119379156e-05,
"loss": 0.5466,
"step": 3770
},
{
"epoch": 2.424346988725274,
"grad_norm": 1.2578125,
"learning_rate": 3.171715692879748e-05,
"loss": 0.5336,
"step": 3775
},
{
"epoch": 2.4275568751755405,
"grad_norm": 1.1875,
"learning_rate": 3.164476913387631e-05,
"loss": 0.5341,
"step": 3780
},
{
"epoch": 2.4307667616258075,
"grad_norm": 1.2578125,
"learning_rate": 3.1572735565647815e-05,
"loss": 0.5335,
"step": 3785
},
{
"epoch": 2.4339766480760745,
"grad_norm": 1.1640625,
"learning_rate": 3.1501057051077535e-05,
"loss": 0.5309,
"step": 3790
},
{
"epoch": 2.437186534526341,
"grad_norm": 1.328125,
"learning_rate": 3.142973441305488e-05,
"loss": 0.5451,
"step": 3795
},
{
"epoch": 2.440396420976608,
"grad_norm": 1.1484375,
"learning_rate": 3.135876847038371e-05,
"loss": 0.5381,
"step": 3800
},
{
"epoch": 2.443606307426875,
"grad_norm": 1.2109375,
"learning_rate": 3.1288160037772953e-05,
"loss": 0.5474,
"step": 3805
},
{
"epoch": 2.4468161938771416,
"grad_norm": 1.28125,
"learning_rate": 3.121790992582717e-05,
"loss": 0.5424,
"step": 3810
},
{
"epoch": 2.4500260803274085,
"grad_norm": 1.2578125,
"learning_rate": 3.1148018941037324e-05,
"loss": 0.5475,
"step": 3815
},
{
"epoch": 2.453235966777675,
"grad_norm": 1.2265625,
"learning_rate": 3.10784878857715e-05,
"loss": 0.5341,
"step": 3820
},
{
"epoch": 2.456445853227942,
"grad_norm": 1.203125,
"learning_rate": 3.100931755826569e-05,
"loss": 0.5365,
"step": 3825
},
{
"epoch": 2.459655739678209,
"grad_norm": 1.234375,
"learning_rate": 3.094050875261462e-05,
"loss": 0.5628,
"step": 3830
},
{
"epoch": 2.4628656261284756,
"grad_norm": 1.1875,
"learning_rate": 3.087206225876266e-05,
"loss": 0.54,
"step": 3835
},
{
"epoch": 2.4660755125787426,
"grad_norm": 1.296875,
"learning_rate": 3.080397886249472e-05,
"loss": 0.5375,
"step": 3840
},
{
"epoch": 2.469285399029009,
"grad_norm": 1.2109375,
"learning_rate": 3.073625934542727e-05,
"loss": 0.5427,
"step": 3845
},
{
"epoch": 2.472495285479276,
"grad_norm": 1.3828125,
"learning_rate": 3.0668904484999334e-05,
"loss": 0.5511,
"step": 3850
},
{
"epoch": 2.475705171929543,
"grad_norm": 1.2421875,
"learning_rate": 3.060191505446357e-05,
"loss": 0.5377,
"step": 3855
},
{
"epoch": 2.4789150583798096,
"grad_norm": 1.2265625,
"learning_rate": 3.0535291822877405e-05,
"loss": 0.533,
"step": 3860
},
{
"epoch": 2.4821249448300766,
"grad_norm": 1.1640625,
"learning_rate": 3.0469035555094194e-05,
"loss": 0.5372,
"step": 3865
},
{
"epoch": 2.4853348312803436,
"grad_norm": 1.21875,
"learning_rate": 3.040314701175445e-05,
"loss": 0.544,
"step": 3870
},
{
"epoch": 2.48854471773061,
"grad_norm": 1.25,
"learning_rate": 3.0337626949277105e-05,
"loss": 0.5307,
"step": 3875
},
{
"epoch": 2.491754604180877,
"grad_norm": 1.2265625,
"learning_rate": 3.0272476119850835e-05,
"loss": 0.5482,
"step": 3880
},
{
"epoch": 2.494964490631144,
"grad_norm": 1.3046875,
"learning_rate": 3.020769527142541e-05,
"loss": 0.5412,
"step": 3885
},
{
"epoch": 2.4981743770814107,
"grad_norm": 1.2265625,
"learning_rate": 3.0143285147703114e-05,
"loss": 0.5554,
"step": 3890
},
{
"epoch": 2.5013842635316776,
"grad_norm": 1.3046875,
"learning_rate": 3.0079246488130197e-05,
"loss": 0.5369,
"step": 3895
},
{
"epoch": 2.504594149981944,
"grad_norm": 1.28125,
"learning_rate": 3.0015580027888424e-05,
"loss": 0.5504,
"step": 3900
},
{
"epoch": 2.507804036432211,
"grad_norm": 1.2578125,
"learning_rate": 2.9952286497886572e-05,
"loss": 0.5287,
"step": 3905
},
{
"epoch": 2.511013922882478,
"grad_norm": 1.234375,
"learning_rate": 2.9889366624752118e-05,
"loss": 0.5553,
"step": 3910
},
{
"epoch": 2.5142238093327447,
"grad_norm": 1.2578125,
"learning_rate": 2.9826821130822807e-05,
"loss": 0.5343,
"step": 3915
},
{
"epoch": 2.5174336957830117,
"grad_norm": 1.25,
"learning_rate": 2.9764650734138434e-05,
"loss": 0.5326,
"step": 3920
},
{
"epoch": 2.5206435822332782,
"grad_norm": 1.234375,
"learning_rate": 2.9702856148432573e-05,
"loss": 0.5366,
"step": 3925
},
{
"epoch": 2.523853468683545,
"grad_norm": 1.3671875,
"learning_rate": 2.9641438083124372e-05,
"loss": 0.5335,
"step": 3930
},
{
"epoch": 2.527063355133812,
"grad_norm": 1.1484375,
"learning_rate": 2.958039724331042e-05,
"loss": 0.518,
"step": 3935
},
{
"epoch": 2.530273241584079,
"grad_norm": 1.296875,
"learning_rate": 2.9519734329756666e-05,
"loss": 0.5379,
"step": 3940
},
{
"epoch": 2.5334831280343457,
"grad_norm": 1.203125,
"learning_rate": 2.9459450038890333e-05,
"loss": 0.5287,
"step": 3945
},
{
"epoch": 2.5366930144846127,
"grad_norm": 1.234375,
"learning_rate": 2.9399545062791967e-05,
"loss": 0.5245,
"step": 3950
},
{
"epoch": 2.5399029009348792,
"grad_norm": 1.171875,
"learning_rate": 2.9340020089187492e-05,
"loss": 0.541,
"step": 3955
},
{
"epoch": 2.5431127873851462,
"grad_norm": 1.25,
"learning_rate": 2.928087580144026e-05,
"loss": 0.5299,
"step": 3960
},
{
"epoch": 2.546322673835413,
"grad_norm": 1.1875,
"learning_rate": 2.9222112878543273e-05,
"loss": 0.527,
"step": 3965
},
{
"epoch": 2.5495325602856798,
"grad_norm": 1.234375,
"learning_rate": 2.9163731995111333e-05,
"loss": 0.5581,
"step": 3970
},
{
"epoch": 2.5527424467359467,
"grad_norm": 1.2109375,
"learning_rate": 2.9105733821373333e-05,
"loss": 0.5499,
"step": 3975
},
{
"epoch": 2.5559523331862133,
"grad_norm": 1.25,
"learning_rate": 2.9048119023164555e-05,
"loss": 0.5265,
"step": 3980
},
{
"epoch": 2.5591622196364803,
"grad_norm": 1.1640625,
"learning_rate": 2.8990888261919024e-05,
"loss": 0.5433,
"step": 3985
},
{
"epoch": 2.5623721060867473,
"grad_norm": 1.2265625,
"learning_rate": 2.8934042194661913e-05,
"loss": 0.5503,
"step": 3990
},
{
"epoch": 2.5655819925370142,
"grad_norm": 1.265625,
"learning_rate": 2.8877581474001986e-05,
"loss": 0.5327,
"step": 3995
},
{
"epoch": 2.568791878987281,
"grad_norm": 1.2578125,
"learning_rate": 2.8821506748124132e-05,
"loss": 0.5499,
"step": 4000
},
{
"epoch": 2.568791878987281,
"eval_loss": 0.4683253765106201,
"eval_runtime": 2.4022,
"eval_samples_per_second": 83.257,
"eval_steps_per_second": 83.257,
"step": 4000
},
{
"epoch": 2.5720017654375478,
"grad_norm": 1.34375,
"learning_rate": 2.8765818660781912e-05,
"loss": 0.5244,
"step": 4005
},
{
"epoch": 2.5752116518878143,
"grad_norm": 1.296875,
"learning_rate": 2.8710517851290174e-05,
"loss": 0.5457,
"step": 4010
},
{
"epoch": 2.5784215383380813,
"grad_norm": 1.28125,
"learning_rate": 2.865560495451769e-05,
"loss": 0.539,
"step": 4015
},
{
"epoch": 2.5816314247883483,
"grad_norm": 1.21875,
"learning_rate": 2.8601080600879892e-05,
"loss": 0.5469,
"step": 4020
},
{
"epoch": 2.584841311238615,
"grad_norm": 1.265625,
"learning_rate": 2.854694541633165e-05,
"loss": 0.5536,
"step": 4025
},
{
"epoch": 2.588051197688882,
"grad_norm": 1.2421875,
"learning_rate": 2.8493200022360027e-05,
"loss": 0.5324,
"step": 4030
},
{
"epoch": 2.5912610841391484,
"grad_norm": 1.2890625,
"learning_rate": 2.8439845035977214e-05,
"loss": 0.519,
"step": 4035
},
{
"epoch": 2.5944709705894153,
"grad_norm": 1.2734375,
"learning_rate": 2.838688106971339e-05,
"loss": 0.534,
"step": 4040
},
{
"epoch": 2.5976808570396823,
"grad_norm": 1.21875,
"learning_rate": 2.8334308731609722e-05,
"loss": 0.5333,
"step": 4045
},
{
"epoch": 2.6008907434899493,
"grad_norm": 1.2734375,
"learning_rate": 2.8282128625211378e-05,
"loss": 0.5319,
"step": 4050
},
{
"epoch": 2.604100629940216,
"grad_norm": 1.203125,
"learning_rate": 2.8230341349560603e-05,
"loss": 0.5411,
"step": 4055
},
{
"epoch": 2.607310516390483,
"grad_norm": 1.25,
"learning_rate": 2.8178947499189812e-05,
"loss": 0.5493,
"step": 4060
},
{
"epoch": 2.6105204028407494,
"grad_norm": 1.2109375,
"learning_rate": 2.812794766411481e-05,
"loss": 0.5491,
"step": 4065
},
{
"epoch": 2.6137302892910164,
"grad_norm": 1.2109375,
"learning_rate": 2.8077342429827992e-05,
"loss": 0.5423,
"step": 4070
},
{
"epoch": 2.6169401757412833,
"grad_norm": 1.2421875,
"learning_rate": 2.802713237729162e-05,
"loss": 0.5493,
"step": 4075
},
{
"epoch": 2.62015006219155,
"grad_norm": 1.1953125,
"learning_rate": 2.797731808293116e-05,
"loss": 0.5503,
"step": 4080
},
{
"epoch": 2.623359948641817,
"grad_norm": 1.203125,
"learning_rate": 2.7927900118628652e-05,
"loss": 0.5297,
"step": 4085
},
{
"epoch": 2.6265698350920834,
"grad_norm": 1.28125,
"learning_rate": 2.787887905171619e-05,
"loss": 0.5406,
"step": 4090
},
{
"epoch": 2.6297797215423504,
"grad_norm": 1.2109375,
"learning_rate": 2.7830255444969332e-05,
"loss": 0.531,
"step": 4095
},
{
"epoch": 2.6329896079926174,
"grad_norm": 1.40625,
"learning_rate": 2.7782029856600715e-05,
"loss": 0.5403,
"step": 4100
},
{
"epoch": 2.636199494442884,
"grad_norm": 1.2578125,
"learning_rate": 2.77342028402536e-05,
"loss": 0.5568,
"step": 4105
},
{
"epoch": 2.639409380893151,
"grad_norm": 1.1796875,
"learning_rate": 2.7686774944995526e-05,
"loss": 0.5364,
"step": 4110
},
{
"epoch": 2.6426192673434175,
"grad_norm": 1.25,
"learning_rate": 2.763974671531201e-05,
"loss": 0.5501,
"step": 4115
},
{
"epoch": 2.6458291537936844,
"grad_norm": 1.3671875,
"learning_rate": 2.759311869110032e-05,
"loss": 0.5469,
"step": 4120
},
{
"epoch": 2.6490390402439514,
"grad_norm": 1.1328125,
"learning_rate": 2.7546891407663216e-05,
"loss": 0.5401,
"step": 4125
},
{
"epoch": 2.6522489266942184,
"grad_norm": 1.2890625,
"learning_rate": 2.7501065395702864e-05,
"loss": 0.5465,
"step": 4130
},
{
"epoch": 2.655458813144485,
"grad_norm": 1.203125,
"learning_rate": 2.745564118131472e-05,
"loss": 0.5332,
"step": 4135
},
{
"epoch": 2.658668699594752,
"grad_norm": 1.2578125,
"learning_rate": 2.741061928598149e-05,
"loss": 0.5376,
"step": 4140
},
{
"epoch": 2.6618785860450185,
"grad_norm": 1.25,
"learning_rate": 2.736600022656714e-05,
"loss": 0.5382,
"step": 4145
},
{
"epoch": 2.6650884724952855,
"grad_norm": 1.203125,
"learning_rate": 2.7321784515310965e-05,
"loss": 0.5494,
"step": 4150
},
{
"epoch": 2.6682983589455525,
"grad_norm": 1.2421875,
"learning_rate": 2.7277972659821727e-05,
"loss": 0.5511,
"step": 4155
},
{
"epoch": 2.671508245395819,
"grad_norm": 1.1875,
"learning_rate": 2.723456516307178e-05,
"loss": 0.552,
"step": 4160
},
{
"epoch": 2.674718131846086,
"grad_norm": 1.2109375,
"learning_rate": 2.7191562523391363e-05,
"loss": 0.5295,
"step": 4165
},
{
"epoch": 2.6779280182963525,
"grad_norm": 1.203125,
"learning_rate": 2.7148965234462807e-05,
"loss": 0.5491,
"step": 4170
},
{
"epoch": 2.6811379047466195,
"grad_norm": 1.203125,
"learning_rate": 2.7106773785314937e-05,
"loss": 0.5218,
"step": 4175
},
{
"epoch": 2.6843477911968865,
"grad_norm": 1.1953125,
"learning_rate": 2.70649886603174e-05,
"loss": 0.5303,
"step": 4180
},
{
"epoch": 2.6875576776471535,
"grad_norm": 1.25,
"learning_rate": 2.7023610339175127e-05,
"loss": 0.5344,
"step": 4185
},
{
"epoch": 2.69076756409742,
"grad_norm": 1.1640625,
"learning_rate": 2.698263929692285e-05,
"loss": 0.5482,
"step": 4190
},
{
"epoch": 2.693977450547687,
"grad_norm": 1.1796875,
"learning_rate": 2.6942076003919596e-05,
"loss": 0.5198,
"step": 4195
},
{
"epoch": 2.6971873369979535,
"grad_norm": 1.1796875,
"learning_rate": 2.6901920925843338e-05,
"loss": 0.5366,
"step": 4200
},
{
"epoch": 2.7003972234482205,
"grad_norm": 1.1875,
"learning_rate": 2.6862174523685618e-05,
"loss": 0.5151,
"step": 4205
},
{
"epoch": 2.7036071098984875,
"grad_norm": 1.2734375,
"learning_rate": 2.6822837253746258e-05,
"loss": 0.5174,
"step": 4210
},
{
"epoch": 2.706816996348754,
"grad_norm": 1.2578125,
"learning_rate": 2.6783909567628153e-05,
"loss": 0.5391,
"step": 4215
},
{
"epoch": 2.710026882799021,
"grad_norm": 1.28125,
"learning_rate": 2.674539191223202e-05,
"loss": 0.5445,
"step": 4220
},
{
"epoch": 2.7132367692492876,
"grad_norm": 1.2578125,
"learning_rate": 2.6707284729751346e-05,
"loss": 0.5197,
"step": 4225
},
{
"epoch": 2.7164466556995546,
"grad_norm": 1.2265625,
"learning_rate": 2.666958845766726e-05,
"loss": 0.5375,
"step": 4230
},
{
"epoch": 2.7196565421498216,
"grad_norm": 1.140625,
"learning_rate": 2.663230352874352e-05,
"loss": 0.5285,
"step": 4235
},
{
"epoch": 2.7228664286000885,
"grad_norm": 1.2421875,
"learning_rate": 2.659543037102154e-05,
"loss": 0.5429,
"step": 4240
},
{
"epoch": 2.726076315050355,
"grad_norm": 1.1953125,
"learning_rate": 2.6558969407815525e-05,
"loss": 0.5288,
"step": 4245
},
{
"epoch": 2.729286201500622,
"grad_norm": 1.265625,
"learning_rate": 2.652292105770753e-05,
"loss": 0.527,
"step": 4250
},
{
"epoch": 2.7324960879508886,
"grad_norm": 1.1484375,
"learning_rate": 2.648728573454271e-05,
"loss": 0.5219,
"step": 4255
},
{
"epoch": 2.7357059744011556,
"grad_norm": 1.2890625,
"learning_rate": 2.6452063847424564e-05,
"loss": 0.5412,
"step": 4260
},
{
"epoch": 2.7389158608514226,
"grad_norm": 1.2265625,
"learning_rate": 2.6417255800710215e-05,
"loss": 0.5495,
"step": 4265
},
{
"epoch": 2.742125747301689,
"grad_norm": 1.3671875,
"learning_rate": 2.6382861994005792e-05,
"loss": 0.5353,
"step": 4270
},
{
"epoch": 2.745335633751956,
"grad_norm": 1.2421875,
"learning_rate": 2.6348882822161826e-05,
"loss": 0.5386,
"step": 4275
},
{
"epoch": 2.7485455202022226,
"grad_norm": 1.234375,
"learning_rate": 2.6315318675268724e-05,
"loss": 0.55,
"step": 4280
},
{
"epoch": 2.7517554066524896,
"grad_norm": 1.25,
"learning_rate": 2.6282169938652306e-05,
"loss": 0.5401,
"step": 4285
},
{
"epoch": 2.7549652931027566,
"grad_norm": 1.15625,
"learning_rate": 2.6249436992869342e-05,
"loss": 0.5289,
"step": 4290
},
{
"epoch": 2.758175179553023,
"grad_norm": 1.203125,
"learning_rate": 2.6217120213703222e-05,
"loss": 0.541,
"step": 4295
},
{
"epoch": 2.76138506600329,
"grad_norm": 1.21875,
"learning_rate": 2.6185219972159626e-05,
"loss": 0.5263,
"step": 4300
},
{
"epoch": 2.7645949524535567,
"grad_norm": 1.2109375,
"learning_rate": 2.6153736634462252e-05,
"loss": 0.5247,
"step": 4305
},
{
"epoch": 2.7678048389038237,
"grad_norm": 1.1640625,
"learning_rate": 2.6122670562048645e-05,
"loss": 0.5476,
"step": 4310
},
{
"epoch": 2.7710147253540907,
"grad_norm": 1.2578125,
"learning_rate": 2.6092022111566007e-05,
"loss": 0.5246,
"step": 4315
},
{
"epoch": 2.7742246118043576,
"grad_norm": 1.1953125,
"learning_rate": 2.6061791634867146e-05,
"loss": 0.5191,
"step": 4320
},
{
"epoch": 2.777434498254624,
"grad_norm": 1.2265625,
"learning_rate": 2.6031979479006395e-05,
"loss": 0.5341,
"step": 4325
},
{
"epoch": 2.780644384704891,
"grad_norm": 1.21875,
"learning_rate": 2.6002585986235656e-05,
"loss": 0.5375,
"step": 4330
},
{
"epoch": 2.7838542711551577,
"grad_norm": 1.2734375,
"learning_rate": 2.5973611494000462e-05,
"loss": 0.5502,
"step": 4335
},
{
"epoch": 2.7870641576054247,
"grad_norm": 1.375,
"learning_rate": 2.5945056334936092e-05,
"loss": 0.5263,
"step": 4340
},
{
"epoch": 2.7902740440556917,
"grad_norm": 1.2265625,
"learning_rate": 2.5916920836863772e-05,
"loss": 0.5388,
"step": 4345
},
{
"epoch": 2.7934839305059582,
"grad_norm": 1.390625,
"learning_rate": 2.58892053227869e-05,
"loss": 0.5378,
"step": 4350
},
{
"epoch": 2.796693816956225,
"grad_norm": 1.2890625,
"learning_rate": 2.5861910110887344e-05,
"loss": 0.5333,
"step": 4355
},
{
"epoch": 2.7999037034064918,
"grad_norm": 1.1484375,
"learning_rate": 2.5835035514521776e-05,
"loss": 0.5295,
"step": 4360
},
{
"epoch": 2.8031135898567587,
"grad_norm": 1.2265625,
"learning_rate": 2.58085818422181e-05,
"loss": 0.5308,
"step": 4365
},
{
"epoch": 2.8063234763070257,
"grad_norm": 1.1875,
"learning_rate": 2.5782549397671872e-05,
"loss": 0.5339,
"step": 4370
},
{
"epoch": 2.8095333627572927,
"grad_norm": 1.28125,
"learning_rate": 2.575693847974286e-05,
"loss": 0.543,
"step": 4375
},
{
"epoch": 2.8127432492075592,
"grad_norm": 1.1796875,
"learning_rate": 2.5731749382451565e-05,
"loss": 0.5417,
"step": 4380
},
{
"epoch": 2.8159531356578262,
"grad_norm": 1.2265625,
"learning_rate": 2.5706982394975875e-05,
"loss": 0.5473,
"step": 4385
},
{
"epoch": 2.8191630221080928,
"grad_norm": 1.21875,
"learning_rate": 2.568263780164775e-05,
"loss": 0.536,
"step": 4390
},
{
"epoch": 2.8223729085583598,
"grad_norm": 1.3125,
"learning_rate": 2.5658715881949946e-05,
"loss": 0.5271,
"step": 4395
},
{
"epoch": 2.8255827950086267,
"grad_norm": 1.2265625,
"learning_rate": 2.5635216910512793e-05,
"loss": 0.5437,
"step": 4400
},
{
"epoch": 2.8287926814588933,
"grad_norm": 1.2109375,
"learning_rate": 2.561214115711107e-05,
"loss": 0.5294,
"step": 4405
},
{
"epoch": 2.8320025679091603,
"grad_norm": 1.3046875,
"learning_rate": 2.558948888666088e-05,
"loss": 0.5353,
"step": 4410
},
{
"epoch": 2.835212454359427,
"grad_norm": 1.2578125,
"learning_rate": 2.556726035921665e-05,
"loss": 0.544,
"step": 4415
},
{
"epoch": 2.838422340809694,
"grad_norm": 1.2421875,
"learning_rate": 2.5545455829968078e-05,
"loss": 0.5282,
"step": 4420
},
{
"epoch": 2.841632227259961,
"grad_norm": 1.234375,
"learning_rate": 2.552407554923729e-05,
"loss": 0.5423,
"step": 4425
},
{
"epoch": 2.8448421137102278,
"grad_norm": 1.296875,
"learning_rate": 2.550311976247588e-05,
"loss": 0.5348,
"step": 4430
},
{
"epoch": 2.8480520001604943,
"grad_norm": 1.25,
"learning_rate": 2.548258871026216e-05,
"loss": 0.5591,
"step": 4435
},
{
"epoch": 2.8512618866107613,
"grad_norm": 1.2734375,
"learning_rate": 2.5462482628298357e-05,
"loss": 0.5325,
"step": 4440
},
{
"epoch": 2.854471773061028,
"grad_norm": 1.203125,
"learning_rate": 2.544280174740792e-05,
"loss": 0.534,
"step": 4445
},
{
"epoch": 2.857681659511295,
"grad_norm": 1.2421875,
"learning_rate": 2.542354629353288e-05,
"loss": 0.534,
"step": 4450
},
{
"epoch": 2.860891545961562,
"grad_norm": 1.140625,
"learning_rate": 2.540471648773124e-05,
"loss": 0.5599,
"step": 4455
},
{
"epoch": 2.8641014324118284,
"grad_norm": 1.375,
"learning_rate": 2.5386312546174434e-05,
"loss": 0.5492,
"step": 4460
},
{
"epoch": 2.8673113188620953,
"grad_norm": 1.15625,
"learning_rate": 2.5368334680144884e-05,
"loss": 0.5301,
"step": 4465
},
{
"epoch": 2.870521205312362,
"grad_norm": 1.15625,
"learning_rate": 2.535078309603351e-05,
"loss": 0.5193,
"step": 4470
},
{
"epoch": 2.873731091762629,
"grad_norm": 1.2421875,
"learning_rate": 2.5333657995337422e-05,
"loss": 0.5296,
"step": 4475
},
{
"epoch": 2.876940978212896,
"grad_norm": 1.1875,
"learning_rate": 2.5316959574657583e-05,
"loss": 0.5139,
"step": 4480
},
{
"epoch": 2.8801508646631624,
"grad_norm": 1.296875,
"learning_rate": 2.5300688025696517e-05,
"loss": 0.5349,
"step": 4485
},
{
"epoch": 2.8833607511134294,
"grad_norm": 1.203125,
"learning_rate": 2.5284843535256182e-05,
"loss": 0.5442,
"step": 4490
},
{
"epoch": 2.886570637563696,
"grad_norm": 1.28125,
"learning_rate": 2.5269426285235753e-05,
"loss": 0.5328,
"step": 4495
},
{
"epoch": 2.889780524013963,
"grad_norm": 1.171875,
"learning_rate": 2.5254436452629594e-05,
"loss": 0.5126,
"step": 4500
},
{
"epoch": 2.889780524013963,
"eval_loss": 0.4651297628879547,
"eval_runtime": 2.403,
"eval_samples_per_second": 83.23,
"eval_steps_per_second": 83.23,
"step": 4500
},
{
"epoch": 2.89299041046423,
"grad_norm": 1.1875,
"learning_rate": 2.523987420952516e-05,
"loss": 0.5352,
"step": 4505
},
{
"epoch": 2.896200296914497,
"grad_norm": 1.1484375,
"learning_rate": 2.5225739723101105e-05,
"loss": 0.5321,
"step": 4510
},
{
"epoch": 2.8994101833647634,
"grad_norm": 1.1796875,
"learning_rate": 2.521203315562528e-05,
"loss": 0.5323,
"step": 4515
},
{
"epoch": 2.9026200698150304,
"grad_norm": 1.21875,
"learning_rate": 2.5198754664452913e-05,
"loss": 0.5468,
"step": 4520
},
{
"epoch": 2.905829956265297,
"grad_norm": 1.296875,
"learning_rate": 2.5185904402024808e-05,
"loss": 0.53,
"step": 4525
},
{
"epoch": 2.909039842715564,
"grad_norm": 1.21875,
"learning_rate": 2.5173482515865582e-05,
"loss": 0.5181,
"step": 4530
},
{
"epoch": 2.912249729165831,
"grad_norm": 1.1875,
"learning_rate": 2.5161489148581962e-05,
"loss": 0.5294,
"step": 4535
},
{
"epoch": 2.9154596156160975,
"grad_norm": 1.1796875,
"learning_rate": 2.514992443786116e-05,
"loss": 0.5339,
"step": 4540
},
{
"epoch": 2.9186695020663644,
"grad_norm": 1.1953125,
"learning_rate": 2.51387885164693e-05,
"loss": 0.5416,
"step": 4545
},
{
"epoch": 2.921879388516631,
"grad_norm": 1.1875,
"learning_rate": 2.512808151224988e-05,
"loss": 0.546,
"step": 4550
},
{
"epoch": 2.925089274966898,
"grad_norm": 1.28125,
"learning_rate": 2.5117803548122305e-05,
"loss": 0.552,
"step": 4555
},
{
"epoch": 2.928299161417165,
"grad_norm": 1.1953125,
"learning_rate": 2.510795474208048e-05,
"loss": 0.5195,
"step": 4560
},
{
"epoch": 2.931509047867432,
"grad_norm": 1.1640625,
"learning_rate": 2.5098535207191458e-05,
"loss": 0.5446,
"step": 4565
},
{
"epoch": 2.9347189343176985,
"grad_norm": 1.125,
"learning_rate": 2.5089545051594136e-05,
"loss": 0.5417,
"step": 4570
},
{
"epoch": 2.9379288207679655,
"grad_norm": 1.234375,
"learning_rate": 2.5080984378498023e-05,
"loss": 0.5301,
"step": 4575
},
{
"epoch": 2.941138707218232,
"grad_norm": 1.3203125,
"learning_rate": 2.507285328618204e-05,
"loss": 0.5464,
"step": 4580
},
{
"epoch": 2.944348593668499,
"grad_norm": 1.1875,
"learning_rate": 2.506515186799341e-05,
"loss": 0.5348,
"step": 4585
},
{
"epoch": 2.947558480118766,
"grad_norm": 1.171875,
"learning_rate": 2.5057880212346564e-05,
"loss": 0.5296,
"step": 4590
},
{
"epoch": 2.9507683665690325,
"grad_norm": 1.21875,
"learning_rate": 2.505103840272215e-05,
"loss": 0.5267,
"step": 4595
},
{
"epoch": 2.9539782530192995,
"grad_norm": 1.2421875,
"learning_rate": 2.5044626517666054e-05,
"loss": 0.5286,
"step": 4600
},
{
"epoch": 2.957188139469566,
"grad_norm": 1.15625,
"learning_rate": 2.5038644630788517e-05,
"loss": 0.5401,
"step": 4605
},
{
"epoch": 2.960398025919833,
"grad_norm": 1.28125,
"learning_rate": 2.5033092810763275e-05,
"loss": 0.5278,
"step": 4610
},
{
"epoch": 2.9636079123701,
"grad_norm": 1.171875,
"learning_rate": 2.5027971121326776e-05,
"loss": 0.5218,
"step": 4615
},
{
"epoch": 2.966817798820367,
"grad_norm": 1.21875,
"learning_rate": 2.5023279621277444e-05,
"loss": 0.5288,
"step": 4620
},
{
"epoch": 2.9700276852706335,
"grad_norm": 1.203125,
"learning_rate": 2.5019018364475026e-05,
"loss": 0.5382,
"step": 4625
},
{
"epoch": 2.9732375717209005,
"grad_norm": 1.1171875,
"learning_rate": 2.5015187399839936e-05,
"loss": 0.5431,
"step": 4630
},
{
"epoch": 2.976447458171167,
"grad_norm": 1.21875,
"learning_rate": 2.501178677135272e-05,
"loss": 0.5417,
"step": 4635
},
{
"epoch": 2.979657344621434,
"grad_norm": 1.2421875,
"learning_rate": 2.5008816518053547e-05,
"loss": 0.5141,
"step": 4640
},
{
"epoch": 2.982867231071701,
"grad_norm": 1.15625,
"learning_rate": 2.500627667404176e-05,
"loss": 0.5438,
"step": 4645
},
{
"epoch": 2.9860771175219676,
"grad_norm": 1.2265625,
"learning_rate": 2.5004167268475475e-05,
"loss": 0.5386,
"step": 4650
},
{
"epoch": 2.9892870039722346,
"grad_norm": 1.2421875,
"learning_rate": 2.500248832557126e-05,
"loss": 0.5358,
"step": 4655
},
{
"epoch": 2.992496890422501,
"grad_norm": 1.21875,
"learning_rate": 2.5001239864603847e-05,
"loss": 0.5446,
"step": 4660
},
{
"epoch": 2.995706776872768,
"grad_norm": 1.1640625,
"learning_rate": 2.500042189990593e-05,
"loss": 0.5492,
"step": 4665
},
{
"epoch": 2.998916663323035,
"grad_norm": 1.2734375,
"learning_rate": 2.5000034440867958e-05,
"loss": 0.5393,
"step": 4670
},
{
"epoch": 2.9995586406130883,
"eval_loss": 0.4636688232421875,
"eval_runtime": 2.4088,
"eval_samples_per_second": 83.028,
"eval_steps_per_second": 83.028,
"step": 4671
}
],
"logging_steps": 5,
"max_steps": 4671,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.12480186236928e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}