{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9995586406130883, "eval_steps": 500, "global_step": 4671, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003209886450266822, "grad_norm": 4.5, "learning_rate": 1.1428571428571429e-05, "loss": 1.969, "step": 5 }, { "epoch": 0.006419772900533644, "grad_norm": 2.703125, "learning_rate": 2.5714285714285714e-05, "loss": 1.9039, "step": 10 }, { "epoch": 0.009629659350800466, "grad_norm": 2.515625, "learning_rate": 4e-05, "loss": 1.7918, "step": 15 }, { "epoch": 0.012839545801067288, "grad_norm": 2.15625, "learning_rate": 5.428571428571428e-05, "loss": 1.6624, "step": 20 }, { "epoch": 0.01604943225133411, "grad_norm": 1.9296875, "learning_rate": 6.857142857142858e-05, "loss": 1.5578, "step": 25 }, { "epoch": 0.01925931870160093, "grad_norm": 1.921875, "learning_rate": 8.285714285714287e-05, "loss": 1.4779, "step": 30 }, { "epoch": 0.022469205151867754, "grad_norm": 1.875, "learning_rate": 9.714285714285715e-05, "loss": 1.4165, "step": 35 }, { "epoch": 0.025679091602134576, "grad_norm": 1.8671875, "learning_rate": 9.999986223659144e-05, "loss": 1.3396, "step": 40 }, { "epoch": 0.028888978052401395, "grad_norm": 1.765625, "learning_rate": 9.999930257447894e-05, "loss": 1.3222, "step": 45 }, { "epoch": 0.03209886450266822, "grad_norm": 1.7578125, "learning_rate": 9.99983124098696e-05, "loss": 1.271, "step": 50 }, { "epoch": 0.03530875095293504, "grad_norm": 1.8046875, "learning_rate": 9.99968917541308e-05, "loss": 1.2353, "step": 55 }, { "epoch": 0.03851863740320186, "grad_norm": 1.8125, "learning_rate": 9.999504062357203e-05, "loss": 1.2284, "step": 60 }, { "epoch": 0.04172852385346868, "grad_norm": 1.640625, "learning_rate": 9.999275903944482e-05, "loss": 1.2037, "step": 65 }, { "epoch": 0.04493841030373551, "grad_norm": 1.765625, "learning_rate": 9.99900470279424e-05, "loss": 1.1832, "step": 70 }, { "epoch": 0.048148296754002326, "grad_norm": 1.7265625, "learning_rate": 9.998690462019939e-05, "loss": 1.1533, "step": 75 }, { "epoch": 0.05135818320426915, "grad_norm": 1.7734375, "learning_rate": 9.998333185229152e-05, "loss": 1.1481, "step": 80 }, { "epoch": 0.05456806965453597, "grad_norm": 1.90625, "learning_rate": 9.99793287652352e-05, "loss": 1.1369, "step": 85 }, { "epoch": 0.05777795610480279, "grad_norm": 1.765625, "learning_rate": 9.997489540498695e-05, "loss": 1.1191, "step": 90 }, { "epoch": 0.060987842555069616, "grad_norm": 1.7421875, "learning_rate": 9.9970031822443e-05, "loss": 1.1189, "step": 95 }, { "epoch": 0.06419772900533643, "grad_norm": 1.625, "learning_rate": 9.996473807343865e-05, "loss": 1.0978, "step": 100 }, { "epoch": 0.06740761545560325, "grad_norm": 1.9375, "learning_rate": 9.995901421874761e-05, "loss": 1.0831, "step": 105 }, { "epoch": 0.07061750190587009, "grad_norm": 1.9609375, "learning_rate": 9.995286032408134e-05, "loss": 1.0734, "step": 110 }, { "epoch": 0.0738273883561369, "grad_norm": 1.7890625, "learning_rate": 9.994627646008827e-05, "loss": 1.0588, "step": 115 }, { "epoch": 0.07703727480640372, "grad_norm": 2.0625, "learning_rate": 9.993926270235301e-05, "loss": 1.0553, "step": 120 }, { "epoch": 0.08024716125667054, "grad_norm": 1.5625, "learning_rate": 9.993181913139545e-05, "loss": 1.0605, "step": 125 }, { "epoch": 0.08345704770693736, "grad_norm": 1.6796875, "learning_rate": 9.992394583266989e-05, "loss": 1.0296, "step": 130 }, { "epoch": 0.0866669341572042, "grad_norm": 1.65625, "learning_rate": 9.991564289656398e-05, "loss": 1.0441, "step": 135 }, { "epoch": 0.08987682060747101, "grad_norm": 1.4609375, "learning_rate": 9.990691041839778e-05, "loss": 1.0367, "step": 140 }, { "epoch": 0.09308670705773783, "grad_norm": 1.625, "learning_rate": 9.989774849842257e-05, "loss": 1.0188, "step": 145 }, { "epoch": 0.09629659350800465, "grad_norm": 1.6328125, "learning_rate": 9.988815724181975e-05, "loss": 1.0121, "step": 150 }, { "epoch": 0.09950647995827147, "grad_norm": 1.6953125, "learning_rate": 9.987813675869966e-05, "loss": 1.0097, "step": 155 }, { "epoch": 0.1027163664085383, "grad_norm": 1.8984375, "learning_rate": 9.98676871641002e-05, "loss": 1.0222, "step": 160 }, { "epoch": 0.10592625285880512, "grad_norm": 1.921875, "learning_rate": 9.98568085779857e-05, "loss": 0.9847, "step": 165 }, { "epoch": 0.10913613930907194, "grad_norm": 1.6484375, "learning_rate": 9.984550112524535e-05, "loss": 1.0177, "step": 170 }, { "epoch": 0.11234602575933876, "grad_norm": 1.7109375, "learning_rate": 9.983376493569186e-05, "loss": 0.986, "step": 175 }, { "epoch": 0.11555591220960558, "grad_norm": 1.6875, "learning_rate": 9.982160014406001e-05, "loss": 0.996, "step": 180 }, { "epoch": 0.11876579865987241, "grad_norm": 1.6015625, "learning_rate": 9.980900689000498e-05, "loss": 0.9572, "step": 185 }, { "epoch": 0.12197568511013923, "grad_norm": 1.5703125, "learning_rate": 9.979598531810088e-05, "loss": 0.9589, "step": 190 }, { "epoch": 0.12518557156040605, "grad_norm": 1.7109375, "learning_rate": 9.978253557783898e-05, "loss": 0.9885, "step": 195 }, { "epoch": 0.12839545801067287, "grad_norm": 1.5390625, "learning_rate": 9.97686578236261e-05, "loss": 0.9701, "step": 200 }, { "epoch": 0.1316053444609397, "grad_norm": 1.6484375, "learning_rate": 9.97543522147827e-05, "loss": 0.96, "step": 205 }, { "epoch": 0.1348152309112065, "grad_norm": 1.6484375, "learning_rate": 9.97396189155412e-05, "loss": 0.9497, "step": 210 }, { "epoch": 0.13802511736147333, "grad_norm": 1.7734375, "learning_rate": 9.9724458095044e-05, "loss": 0.9269, "step": 215 }, { "epoch": 0.14123500381174017, "grad_norm": 1.6015625, "learning_rate": 9.970886992734156e-05, "loss": 0.9376, "step": 220 }, { "epoch": 0.144444890262007, "grad_norm": 1.59375, "learning_rate": 9.969285459139044e-05, "loss": 0.9344, "step": 225 }, { "epoch": 0.1476547767122738, "grad_norm": 1.5625, "learning_rate": 9.967641227105115e-05, "loss": 0.9316, "step": 230 }, { "epoch": 0.15086466316254063, "grad_norm": 1.6875, "learning_rate": 9.965954315508615e-05, "loss": 0.9611, "step": 235 }, { "epoch": 0.15407454961280745, "grad_norm": 1.5859375, "learning_rate": 9.964224743715759e-05, "loss": 0.9371, "step": 240 }, { "epoch": 0.15728443606307427, "grad_norm": 1.7265625, "learning_rate": 9.962452531582519e-05, "loss": 0.9436, "step": 245 }, { "epoch": 0.1604943225133411, "grad_norm": 1.65625, "learning_rate": 9.960637699454385e-05, "loss": 0.9463, "step": 250 }, { "epoch": 0.1637042089636079, "grad_norm": 1.6875, "learning_rate": 9.95878026816614e-05, "loss": 0.9082, "step": 255 }, { "epoch": 0.16691409541387472, "grad_norm": 1.7578125, "learning_rate": 9.95688025904161e-05, "loss": 0.9109, "step": 260 }, { "epoch": 0.17012398186414154, "grad_norm": 1.6328125, "learning_rate": 9.954937693893438e-05, "loss": 0.9137, "step": 265 }, { "epoch": 0.1733338683144084, "grad_norm": 1.703125, "learning_rate": 9.952952595022813e-05, "loss": 0.9238, "step": 270 }, { "epoch": 0.1765437547646752, "grad_norm": 1.84375, "learning_rate": 9.950924985219228e-05, "loss": 0.9301, "step": 275 }, { "epoch": 0.17975364121494203, "grad_norm": 1.7265625, "learning_rate": 9.94885488776021e-05, "loss": 0.8841, "step": 280 }, { "epoch": 0.18296352766520885, "grad_norm": 1.7734375, "learning_rate": 9.946742326411057e-05, "loss": 0.8775, "step": 285 }, { "epoch": 0.18617341411547567, "grad_norm": 1.703125, "learning_rate": 9.944587325424566e-05, "loss": 0.8849, "step": 290 }, { "epoch": 0.18938330056574249, "grad_norm": 1.5546875, "learning_rate": 9.942389909540753e-05, "loss": 0.9084, "step": 295 }, { "epoch": 0.1925931870160093, "grad_norm": 1.65625, "learning_rate": 9.940150103986565e-05, "loss": 0.8777, "step": 300 }, { "epoch": 0.19580307346627612, "grad_norm": 1.7734375, "learning_rate": 9.9378679344756e-05, "loss": 0.8883, "step": 305 }, { "epoch": 0.19901295991654294, "grad_norm": 1.5625, "learning_rate": 9.935543427207801e-05, "loss": 0.8874, "step": 310 }, { "epoch": 0.20222284636680976, "grad_norm": 1.7890625, "learning_rate": 9.933176608869166e-05, "loss": 0.8846, "step": 315 }, { "epoch": 0.2054327328170766, "grad_norm": 1.75, "learning_rate": 9.930767506631427e-05, "loss": 0.9083, "step": 320 }, { "epoch": 0.20864261926734343, "grad_norm": 1.6796875, "learning_rate": 9.928316148151756e-05, "loss": 0.9058, "step": 325 }, { "epoch": 0.21185250571761025, "grad_norm": 1.5625, "learning_rate": 9.925822561572435e-05, "loss": 0.8871, "step": 330 }, { "epoch": 0.21506239216787706, "grad_norm": 1.625, "learning_rate": 9.923286775520537e-05, "loss": 0.8707, "step": 335 }, { "epoch": 0.21827227861814388, "grad_norm": 1.6953125, "learning_rate": 9.920708819107593e-05, "loss": 0.8788, "step": 340 }, { "epoch": 0.2214821650684107, "grad_norm": 1.625, "learning_rate": 9.918088721929266e-05, "loss": 0.867, "step": 345 }, { "epoch": 0.22469205151867752, "grad_norm": 1.59375, "learning_rate": 9.915426514065007e-05, "loss": 0.8763, "step": 350 }, { "epoch": 0.22790193796894434, "grad_norm": 1.6875, "learning_rate": 9.912722226077709e-05, "loss": 0.8843, "step": 355 }, { "epoch": 0.23111182441921116, "grad_norm": 1.5703125, "learning_rate": 9.90997588901335e-05, "loss": 0.8689, "step": 360 }, { "epoch": 0.234321710869478, "grad_norm": 1.6953125, "learning_rate": 9.907187534400655e-05, "loss": 0.8666, "step": 365 }, { "epoch": 0.23753159731974482, "grad_norm": 1.6171875, "learning_rate": 9.90435719425071e-05, "loss": 0.8511, "step": 370 }, { "epoch": 0.24074148377001164, "grad_norm": 1.6953125, "learning_rate": 9.90148490105662e-05, "loss": 0.8491, "step": 375 }, { "epoch": 0.24395137022027846, "grad_norm": 1.8359375, "learning_rate": 9.898570687793107e-05, "loss": 0.8691, "step": 380 }, { "epoch": 0.24716125667054528, "grad_norm": 1.46875, "learning_rate": 9.895614587916162e-05, "loss": 0.8243, "step": 385 }, { "epoch": 0.2503711431208121, "grad_norm": 1.40625, "learning_rate": 9.892616635362637e-05, "loss": 0.8645, "step": 390 }, { "epoch": 0.2535810295710789, "grad_norm": 1.6171875, "learning_rate": 9.889576864549867e-05, "loss": 0.8191, "step": 395 }, { "epoch": 0.25679091602134574, "grad_norm": 1.5703125, "learning_rate": 9.886495310375275e-05, "loss": 0.8665, "step": 400 }, { "epoch": 0.26000080247161256, "grad_norm": 1.421875, "learning_rate": 9.883372008215962e-05, "loss": 0.8695, "step": 405 }, { "epoch": 0.2632106889218794, "grad_norm": 1.5, "learning_rate": 9.880206993928313e-05, "loss": 0.8283, "step": 410 }, { "epoch": 0.2664205753721462, "grad_norm": 1.421875, "learning_rate": 9.87700030384758e-05, "loss": 0.823, "step": 415 }, { "epoch": 0.269630461822413, "grad_norm": 1.5390625, "learning_rate": 9.873751974787461e-05, "loss": 0.8196, "step": 420 }, { "epoch": 0.27284034827267983, "grad_norm": 1.546875, "learning_rate": 9.870462044039685e-05, "loss": 0.8504, "step": 425 }, { "epoch": 0.27605023472294665, "grad_norm": 1.625, "learning_rate": 9.867130549373578e-05, "loss": 0.8519, "step": 430 }, { "epoch": 0.27926012117321347, "grad_norm": 1.6171875, "learning_rate": 9.863757529035633e-05, "loss": 0.8589, "step": 435 }, { "epoch": 0.28247000762348035, "grad_norm": 1.5859375, "learning_rate": 9.860343021749065e-05, "loss": 0.8209, "step": 440 }, { "epoch": 0.28567989407374716, "grad_norm": 1.53125, "learning_rate": 9.856887066713378e-05, "loss": 0.8453, "step": 445 }, { "epoch": 0.288889780524014, "grad_norm": 1.6484375, "learning_rate": 9.853389703603901e-05, "loss": 0.8433, "step": 450 }, { "epoch": 0.2920996669742808, "grad_norm": 1.640625, "learning_rate": 9.849850972571344e-05, "loss": 0.8281, "step": 455 }, { "epoch": 0.2953095534245476, "grad_norm": 1.5703125, "learning_rate": 9.84627091424133e-05, "loss": 0.8292, "step": 460 }, { "epoch": 0.29851943987481444, "grad_norm": 1.40625, "learning_rate": 9.84264956971393e-05, "loss": 0.8199, "step": 465 }, { "epoch": 0.30172932632508126, "grad_norm": 1.4765625, "learning_rate": 9.838986980563193e-05, "loss": 0.8263, "step": 470 }, { "epoch": 0.3049392127753481, "grad_norm": 1.5546875, "learning_rate": 9.835283188836673e-05, "loss": 0.8324, "step": 475 }, { "epoch": 0.3081490992256149, "grad_norm": 1.515625, "learning_rate": 9.831538237054931e-05, "loss": 0.8085, "step": 480 }, { "epoch": 0.3113589856758817, "grad_norm": 1.640625, "learning_rate": 9.827752168211064e-05, "loss": 0.8375, "step": 485 }, { "epoch": 0.31456887212614854, "grad_norm": 1.6015625, "learning_rate": 9.823925025770206e-05, "loss": 0.8027, "step": 490 }, { "epoch": 0.31777875857641535, "grad_norm": 1.6953125, "learning_rate": 9.82005685366902e-05, "loss": 0.8309, "step": 495 }, { "epoch": 0.3209886450266822, "grad_norm": 1.5546875, "learning_rate": 9.816147696315206e-05, "loss": 0.8218, "step": 500 }, { "epoch": 0.3209886450266822, "eval_loss": 0.7136461138725281, "eval_runtime": 2.3986, "eval_samples_per_second": 83.382, "eval_steps_per_second": 83.382, "step": 500 }, { "epoch": 0.324198531476949, "grad_norm": 1.5703125, "learning_rate": 9.812197598586987e-05, "loss": 0.7931, "step": 505 }, { "epoch": 0.3274084179272158, "grad_norm": 1.6953125, "learning_rate": 9.808206605832591e-05, "loss": 0.8032, "step": 510 }, { "epoch": 0.33061830437748263, "grad_norm": 1.4921875, "learning_rate": 9.80417476386973e-05, "loss": 0.8131, "step": 515 }, { "epoch": 0.33382819082774945, "grad_norm": 1.625, "learning_rate": 9.800102118985082e-05, "loss": 0.7943, "step": 520 }, { "epoch": 0.33703807727801627, "grad_norm": 1.703125, "learning_rate": 9.795988717933751e-05, "loss": 0.8233, "step": 525 }, { "epoch": 0.3402479637282831, "grad_norm": 1.5234375, "learning_rate": 9.79183460793873e-05, "loss": 0.8013, "step": 530 }, { "epoch": 0.3434578501785499, "grad_norm": 1.7578125, "learning_rate": 9.78763983669037e-05, "loss": 0.8121, "step": 535 }, { "epoch": 0.3466677366288168, "grad_norm": 1.5546875, "learning_rate": 9.783404452345815e-05, "loss": 0.8053, "step": 540 }, { "epoch": 0.3498776230790836, "grad_norm": 1.640625, "learning_rate": 9.779128503528468e-05, "loss": 0.7825, "step": 545 }, { "epoch": 0.3530875095293504, "grad_norm": 1.5, "learning_rate": 9.774812039327415e-05, "loss": 0.7883, "step": 550 }, { "epoch": 0.35629739597961724, "grad_norm": 1.515625, "learning_rate": 9.770455109296878e-05, "loss": 0.8132, "step": 555 }, { "epoch": 0.35950728242988406, "grad_norm": 1.6484375, "learning_rate": 9.76605776345563e-05, "loss": 0.7793, "step": 560 }, { "epoch": 0.3627171688801509, "grad_norm": 1.5703125, "learning_rate": 9.761620052286438e-05, "loss": 0.7936, "step": 565 }, { "epoch": 0.3659270553304177, "grad_norm": 1.5078125, "learning_rate": 9.757142026735464e-05, "loss": 0.782, "step": 570 }, { "epoch": 0.3691369417806845, "grad_norm": 1.390625, "learning_rate": 9.752623738211698e-05, "loss": 0.7888, "step": 575 }, { "epoch": 0.37234682823095133, "grad_norm": 1.46875, "learning_rate": 9.748065238586357e-05, "loss": 0.8042, "step": 580 }, { "epoch": 0.37555671468121815, "grad_norm": 1.453125, "learning_rate": 9.743466580192297e-05, "loss": 0.7862, "step": 585 }, { "epoch": 0.37876660113148497, "grad_norm": 1.5234375, "learning_rate": 9.738827815823399e-05, "loss": 0.7994, "step": 590 }, { "epoch": 0.3819764875817518, "grad_norm": 1.5546875, "learning_rate": 9.734148998733981e-05, "loss": 0.7933, "step": 595 }, { "epoch": 0.3851863740320186, "grad_norm": 1.5078125, "learning_rate": 9.729430182638173e-05, "loss": 0.7957, "step": 600 }, { "epoch": 0.3883962604822854, "grad_norm": 1.53125, "learning_rate": 9.724671421709304e-05, "loss": 0.788, "step": 605 }, { "epoch": 0.39160614693255225, "grad_norm": 1.5625, "learning_rate": 9.719872770579284e-05, "loss": 0.7994, "step": 610 }, { "epoch": 0.39481603338281906, "grad_norm": 1.625, "learning_rate": 9.71503428433797e-05, "loss": 0.7882, "step": 615 }, { "epoch": 0.3980259198330859, "grad_norm": 1.4375, "learning_rate": 9.710156018532542e-05, "loss": 0.7768, "step": 620 }, { "epoch": 0.4012358062833527, "grad_norm": 1.5859375, "learning_rate": 9.705238029166855e-05, "loss": 0.7844, "step": 625 }, { "epoch": 0.4044456927336195, "grad_norm": 1.390625, "learning_rate": 9.700280372700807e-05, "loss": 0.7825, "step": 630 }, { "epoch": 0.4076555791838864, "grad_norm": 1.3515625, "learning_rate": 9.695283106049682e-05, "loss": 0.7749, "step": 635 }, { "epoch": 0.4108654656341532, "grad_norm": 1.578125, "learning_rate": 9.6902462865835e-05, "loss": 0.7849, "step": 640 }, { "epoch": 0.41407535208442003, "grad_norm": 1.5234375, "learning_rate": 9.68516997212636e-05, "loss": 0.7684, "step": 645 }, { "epoch": 0.41728523853468685, "grad_norm": 1.2890625, "learning_rate": 9.680054220955774e-05, "loss": 0.763, "step": 650 }, { "epoch": 0.42049512498495367, "grad_norm": 1.5859375, "learning_rate": 9.674899091801996e-05, "loss": 0.7771, "step": 655 }, { "epoch": 0.4237050114352205, "grad_norm": 1.46875, "learning_rate": 9.669704643847358e-05, "loss": 0.7729, "step": 660 }, { "epoch": 0.4269148978854873, "grad_norm": 1.4609375, "learning_rate": 9.664470936725571e-05, "loss": 0.7644, "step": 665 }, { "epoch": 0.43012478433575413, "grad_norm": 1.4609375, "learning_rate": 9.659198030521063e-05, "loss": 0.7702, "step": 670 }, { "epoch": 0.43333467078602095, "grad_norm": 1.3671875, "learning_rate": 9.653885985768273e-05, "loss": 0.7859, "step": 675 }, { "epoch": 0.43654455723628777, "grad_norm": 1.5078125, "learning_rate": 9.648534863450962e-05, "loss": 0.7817, "step": 680 }, { "epoch": 0.4397544436865546, "grad_norm": 1.625, "learning_rate": 9.643144725001514e-05, "loss": 0.7604, "step": 685 }, { "epoch": 0.4429643301368214, "grad_norm": 1.5625, "learning_rate": 9.637715632300229e-05, "loss": 0.7772, "step": 690 }, { "epoch": 0.4461742165870882, "grad_norm": 1.65625, "learning_rate": 9.632247647674606e-05, "loss": 0.7653, "step": 695 }, { "epoch": 0.44938410303735504, "grad_norm": 1.609375, "learning_rate": 9.626740833898648e-05, "loss": 0.7522, "step": 700 }, { "epoch": 0.45259398948762186, "grad_norm": 1.4453125, "learning_rate": 9.621195254192114e-05, "loss": 0.7729, "step": 705 }, { "epoch": 0.4558038759378887, "grad_norm": 1.421875, "learning_rate": 9.615610972219816e-05, "loss": 0.7425, "step": 710 }, { "epoch": 0.4590137623881555, "grad_norm": 1.5078125, "learning_rate": 9.609988052090872e-05, "loss": 0.7838, "step": 715 }, { "epoch": 0.4622236488384223, "grad_norm": 1.546875, "learning_rate": 9.604326558357983e-05, "loss": 0.7653, "step": 720 }, { "epoch": 0.46543353528868914, "grad_norm": 1.578125, "learning_rate": 9.598626556016682e-05, "loss": 0.7702, "step": 725 }, { "epoch": 0.468643421738956, "grad_norm": 1.4296875, "learning_rate": 9.59288811050459e-05, "loss": 0.7565, "step": 730 }, { "epoch": 0.47185330818922283, "grad_norm": 1.6015625, "learning_rate": 9.587111287700672e-05, "loss": 0.7352, "step": 735 }, { "epoch": 0.47506319463948965, "grad_norm": 1.3671875, "learning_rate": 9.581296153924468e-05, "loss": 0.7715, "step": 740 }, { "epoch": 0.47827308108975647, "grad_norm": 1.5078125, "learning_rate": 9.575442775935348e-05, "loss": 0.7536, "step": 745 }, { "epoch": 0.4814829675400233, "grad_norm": 1.4296875, "learning_rate": 9.569551220931725e-05, "loss": 0.7404, "step": 750 }, { "epoch": 0.4846928539902901, "grad_norm": 1.5546875, "learning_rate": 9.563621556550306e-05, "loss": 0.7383, "step": 755 }, { "epoch": 0.4879027404405569, "grad_norm": 1.5, "learning_rate": 9.557653850865293e-05, "loss": 0.7391, "step": 760 }, { "epoch": 0.49111262689082374, "grad_norm": 1.4140625, "learning_rate": 9.551648172387624e-05, "loss": 0.751, "step": 765 }, { "epoch": 0.49432251334109056, "grad_norm": 1.3125, "learning_rate": 9.545604590064167e-05, "loss": 0.7483, "step": 770 }, { "epoch": 0.4975323997913574, "grad_norm": 1.5234375, "learning_rate": 9.539523173276942e-05, "loss": 0.7284, "step": 775 }, { "epoch": 0.5007422862416242, "grad_norm": 1.5390625, "learning_rate": 9.533403991842317e-05, "loss": 0.7356, "step": 780 }, { "epoch": 0.5039521726918911, "grad_norm": 1.609375, "learning_rate": 9.527247116010207e-05, "loss": 0.7591, "step": 785 }, { "epoch": 0.5071620591421578, "grad_norm": 1.421875, "learning_rate": 9.521052616463272e-05, "loss": 0.7411, "step": 790 }, { "epoch": 0.5103719455924247, "grad_norm": 1.546875, "learning_rate": 9.5148205643161e-05, "loss": 0.7574, "step": 795 }, { "epoch": 0.5135818320426915, "grad_norm": 1.4609375, "learning_rate": 9.5085510311144e-05, "loss": 0.7262, "step": 800 }, { "epoch": 0.5167917184929584, "grad_norm": 1.4921875, "learning_rate": 9.502244088834164e-05, "loss": 0.7584, "step": 805 }, { "epoch": 0.5200016049432251, "grad_norm": 1.421875, "learning_rate": 9.495899809880858e-05, "loss": 0.7261, "step": 810 }, { "epoch": 0.523211491393492, "grad_norm": 1.6328125, "learning_rate": 9.489518267088583e-05, "loss": 0.7463, "step": 815 }, { "epoch": 0.5264213778437588, "grad_norm": 1.4609375, "learning_rate": 9.483099533719234e-05, "loss": 0.7477, "step": 820 }, { "epoch": 0.5296312642940256, "grad_norm": 1.453125, "learning_rate": 9.476643683461672e-05, "loss": 0.7441, "step": 825 }, { "epoch": 0.5328411507442924, "grad_norm": 1.53125, "learning_rate": 9.470150790430863e-05, "loss": 0.7433, "step": 830 }, { "epoch": 0.5360510371945593, "grad_norm": 1.4609375, "learning_rate": 9.463620929167039e-05, "loss": 0.7414, "step": 835 }, { "epoch": 0.539260923644826, "grad_norm": 1.4140625, "learning_rate": 9.457054174634837e-05, "loss": 0.7412, "step": 840 }, { "epoch": 0.5424708100950929, "grad_norm": 1.640625, "learning_rate": 9.450450602222435e-05, "loss": 0.7164, "step": 845 }, { "epoch": 0.5456806965453597, "grad_norm": 1.53125, "learning_rate": 9.443810287740697e-05, "loss": 0.755, "step": 850 }, { "epoch": 0.5488905829956265, "grad_norm": 1.4765625, "learning_rate": 9.437133307422294e-05, "loss": 0.7512, "step": 855 }, { "epoch": 0.5521004694458933, "grad_norm": 1.5625, "learning_rate": 9.430419737920828e-05, "loss": 0.7385, "step": 860 }, { "epoch": 0.5553103558961602, "grad_norm": 1.515625, "learning_rate": 9.42366965630996e-05, "loss": 0.7316, "step": 865 }, { "epoch": 0.5585202423464269, "grad_norm": 1.390625, "learning_rate": 9.416883140082512e-05, "loss": 0.7297, "step": 870 }, { "epoch": 0.5617301287966938, "grad_norm": 1.5, "learning_rate": 9.410060267149596e-05, "loss": 0.7208, "step": 875 }, { "epoch": 0.5649400152469607, "grad_norm": 1.359375, "learning_rate": 9.403201115839704e-05, "loss": 0.7288, "step": 880 }, { "epoch": 0.5681499016972275, "grad_norm": 1.4375, "learning_rate": 9.396305764897813e-05, "loss": 0.7133, "step": 885 }, { "epoch": 0.5713597881474943, "grad_norm": 1.4921875, "learning_rate": 9.389374293484483e-05, "loss": 0.7036, "step": 890 }, { "epoch": 0.5745696745977611, "grad_norm": 1.5234375, "learning_rate": 9.382406781174949e-05, "loss": 0.7332, "step": 895 }, { "epoch": 0.577779561048028, "grad_norm": 1.484375, "learning_rate": 9.3754033079582e-05, "loss": 0.7343, "step": 900 }, { "epoch": 0.5809894474982947, "grad_norm": 1.546875, "learning_rate": 9.368363954236075e-05, "loss": 0.7119, "step": 905 }, { "epoch": 0.5841993339485616, "grad_norm": 1.5703125, "learning_rate": 9.361288800822321e-05, "loss": 0.7339, "step": 910 }, { "epoch": 0.5874092203988284, "grad_norm": 1.453125, "learning_rate": 9.354177928941687e-05, "loss": 0.7163, "step": 915 }, { "epoch": 0.5906191068490952, "grad_norm": 1.4453125, "learning_rate": 9.347031420228969e-05, "loss": 0.7281, "step": 920 }, { "epoch": 0.593828993299362, "grad_norm": 1.5, "learning_rate": 9.339849356728092e-05, "loss": 0.7072, "step": 925 }, { "epoch": 0.5970388797496289, "grad_norm": 1.46875, "learning_rate": 9.332631820891154e-05, "loss": 0.729, "step": 930 }, { "epoch": 0.6002487661998956, "grad_norm": 1.4609375, "learning_rate": 9.325378895577491e-05, "loss": 0.7341, "step": 935 }, { "epoch": 0.6034586526501625, "grad_norm": 1.4140625, "learning_rate": 9.318090664052713e-05, "loss": 0.708, "step": 940 }, { "epoch": 0.6066685391004293, "grad_norm": 1.3515625, "learning_rate": 9.310767209987763e-05, "loss": 0.7191, "step": 945 }, { "epoch": 0.6098784255506962, "grad_norm": 1.6796875, "learning_rate": 9.303408617457943e-05, "loss": 0.7114, "step": 950 }, { "epoch": 0.6130883120009629, "grad_norm": 1.484375, "learning_rate": 9.296014970941958e-05, "loss": 0.704, "step": 955 }, { "epoch": 0.6162981984512298, "grad_norm": 1.3359375, "learning_rate": 9.288586355320938e-05, "loss": 0.704, "step": 960 }, { "epoch": 0.6195080849014966, "grad_norm": 1.359375, "learning_rate": 9.281122855877473e-05, "loss": 0.7112, "step": 965 }, { "epoch": 0.6227179713517634, "grad_norm": 1.4765625, "learning_rate": 9.273624558294627e-05, "loss": 0.6998, "step": 970 }, { "epoch": 0.6259278578020303, "grad_norm": 1.421875, "learning_rate": 9.266091548654958e-05, "loss": 0.7114, "step": 975 }, { "epoch": 0.6291377442522971, "grad_norm": 1.5625, "learning_rate": 9.258523913439522e-05, "loss": 0.7307, "step": 980 }, { "epoch": 0.632347630702564, "grad_norm": 1.53125, "learning_rate": 9.250921739526896e-05, "loss": 0.7257, "step": 985 }, { "epoch": 0.6355575171528307, "grad_norm": 1.578125, "learning_rate": 9.243285114192163e-05, "loss": 0.7261, "step": 990 }, { "epoch": 0.6387674036030976, "grad_norm": 1.40625, "learning_rate": 9.235614125105922e-05, "loss": 0.7139, "step": 995 }, { "epoch": 0.6419772900533643, "grad_norm": 1.390625, "learning_rate": 9.227908860333275e-05, "loss": 0.7136, "step": 1000 }, { "epoch": 0.6419772900533643, "eval_loss": 0.6108266711235046, "eval_runtime": 2.3924, "eval_samples_per_second": 83.597, "eval_steps_per_second": 83.597, "step": 1000 }, { "epoch": 0.6451871765036312, "grad_norm": 1.5, "learning_rate": 9.220169408332821e-05, "loss": 0.6998, "step": 1005 }, { "epoch": 0.648397062953898, "grad_norm": 1.4375, "learning_rate": 9.212395857955637e-05, "loss": 0.7121, "step": 1010 }, { "epoch": 0.6516069494041649, "grad_norm": 1.390625, "learning_rate": 9.204588298444257e-05, "loss": 0.7275, "step": 1015 }, { "epoch": 0.6548168358544316, "grad_norm": 1.40625, "learning_rate": 9.196746819431652e-05, "loss": 0.7063, "step": 1020 }, { "epoch": 0.6580267223046985, "grad_norm": 1.6171875, "learning_rate": 9.188871510940198e-05, "loss": 0.7275, "step": 1025 }, { "epoch": 0.6612366087549653, "grad_norm": 1.375, "learning_rate": 9.180962463380642e-05, "loss": 0.6942, "step": 1030 }, { "epoch": 0.6644464952052321, "grad_norm": 1.3828125, "learning_rate": 9.173019767551064e-05, "loss": 0.7184, "step": 1035 }, { "epoch": 0.6676563816554989, "grad_norm": 1.515625, "learning_rate": 9.165043514635836e-05, "loss": 0.7054, "step": 1040 }, { "epoch": 0.6708662681057658, "grad_norm": 1.53125, "learning_rate": 9.157033796204579e-05, "loss": 0.7166, "step": 1045 }, { "epoch": 0.6740761545560325, "grad_norm": 1.53125, "learning_rate": 9.148990704211103e-05, "loss": 0.7031, "step": 1050 }, { "epoch": 0.6772860410062994, "grad_norm": 1.453125, "learning_rate": 9.140914330992356e-05, "loss": 0.7071, "step": 1055 }, { "epoch": 0.6804959274565662, "grad_norm": 1.5, "learning_rate": 9.132804769267364e-05, "loss": 0.7117, "step": 1060 }, { "epoch": 0.683705813906833, "grad_norm": 1.703125, "learning_rate": 9.124662112136169e-05, "loss": 0.7063, "step": 1065 }, { "epoch": 0.6869157003570998, "grad_norm": 1.7265625, "learning_rate": 9.116486453078755e-05, "loss": 0.7007, "step": 1070 }, { "epoch": 0.6901255868073667, "grad_norm": 1.5703125, "learning_rate": 9.108277885953975e-05, "loss": 0.6956, "step": 1075 }, { "epoch": 0.6933354732576336, "grad_norm": 1.4140625, "learning_rate": 9.100036504998483e-05, "loss": 0.6968, "step": 1080 }, { "epoch": 0.6965453597079003, "grad_norm": 1.3203125, "learning_rate": 9.091762404825639e-05, "loss": 0.7131, "step": 1085 }, { "epoch": 0.6997552461581672, "grad_norm": 1.46875, "learning_rate": 9.08345568042443e-05, "loss": 0.6982, "step": 1090 }, { "epoch": 0.702965132608434, "grad_norm": 1.5546875, "learning_rate": 9.075116427158379e-05, "loss": 0.6743, "step": 1095 }, { "epoch": 0.7061750190587008, "grad_norm": 1.359375, "learning_rate": 9.06674474076445e-05, "loss": 0.6925, "step": 1100 }, { "epoch": 0.7093849055089676, "grad_norm": 1.4375, "learning_rate": 9.058340717351948e-05, "loss": 0.6849, "step": 1105 }, { "epoch": 0.7125947919592345, "grad_norm": 1.3828125, "learning_rate": 9.049904453401412e-05, "loss": 0.6815, "step": 1110 }, { "epoch": 0.7158046784095012, "grad_norm": 1.4453125, "learning_rate": 9.04143604576352e-05, "loss": 0.6905, "step": 1115 }, { "epoch": 0.7190145648597681, "grad_norm": 1.4453125, "learning_rate": 9.032935591657961e-05, "loss": 0.69, "step": 1120 }, { "epoch": 0.7222244513100349, "grad_norm": 1.3828125, "learning_rate": 9.02440318867233e-05, "loss": 0.6861, "step": 1125 }, { "epoch": 0.7254343377603018, "grad_norm": 1.484375, "learning_rate": 9.015838934761003e-05, "loss": 0.7338, "step": 1130 }, { "epoch": 0.7286442242105685, "grad_norm": 1.3515625, "learning_rate": 9.007242928244014e-05, "loss": 0.6787, "step": 1135 }, { "epoch": 0.7318541106608354, "grad_norm": 1.421875, "learning_rate": 8.998615267805922e-05, "loss": 0.6793, "step": 1140 }, { "epoch": 0.7350639971111022, "grad_norm": 1.3671875, "learning_rate": 8.98995605249469e-05, "loss": 0.6791, "step": 1145 }, { "epoch": 0.738273883561369, "grad_norm": 1.4140625, "learning_rate": 8.981265381720533e-05, "loss": 0.7028, "step": 1150 }, { "epoch": 0.7414837700116358, "grad_norm": 1.453125, "learning_rate": 8.972543355254785e-05, "loss": 0.712, "step": 1155 }, { "epoch": 0.7446936564619027, "grad_norm": 1.4453125, "learning_rate": 8.963790073228757e-05, "loss": 0.6749, "step": 1160 }, { "epoch": 0.7479035429121694, "grad_norm": 1.53125, "learning_rate": 8.955005636132573e-05, "loss": 0.6844, "step": 1165 }, { "epoch": 0.7511134293624363, "grad_norm": 1.296875, "learning_rate": 8.946190144814034e-05, "loss": 0.6753, "step": 1170 }, { "epoch": 0.7543233158127032, "grad_norm": 1.4296875, "learning_rate": 8.937343700477449e-05, "loss": 0.6809, "step": 1175 }, { "epoch": 0.7575332022629699, "grad_norm": 1.3515625, "learning_rate": 8.928466404682478e-05, "loss": 0.7046, "step": 1180 }, { "epoch": 0.7607430887132368, "grad_norm": 1.3515625, "learning_rate": 8.91955835934296e-05, "loss": 0.6763, "step": 1185 }, { "epoch": 0.7639529751635036, "grad_norm": 1.359375, "learning_rate": 8.910619666725755e-05, "loss": 0.6788, "step": 1190 }, { "epoch": 0.7671628616137705, "grad_norm": 1.40625, "learning_rate": 8.901650429449553e-05, "loss": 0.6874, "step": 1195 }, { "epoch": 0.7703727480640372, "grad_norm": 1.3125, "learning_rate": 8.892650750483715e-05, "loss": 0.7008, "step": 1200 }, { "epoch": 0.7735826345143041, "grad_norm": 1.3203125, "learning_rate": 8.883620733147073e-05, "loss": 0.6946, "step": 1205 }, { "epoch": 0.7767925209645709, "grad_norm": 1.3671875, "learning_rate": 8.874560481106758e-05, "loss": 0.6845, "step": 1210 }, { "epoch": 0.7800024074148377, "grad_norm": 1.3359375, "learning_rate": 8.865470098376995e-05, "loss": 0.7019, "step": 1215 }, { "epoch": 0.7832122938651045, "grad_norm": 1.4140625, "learning_rate": 8.856349689317933e-05, "loss": 0.6611, "step": 1220 }, { "epoch": 0.7864221803153714, "grad_norm": 1.4453125, "learning_rate": 8.847199358634415e-05, "loss": 0.6769, "step": 1225 }, { "epoch": 0.7896320667656381, "grad_norm": 1.3359375, "learning_rate": 8.838019211374804e-05, "loss": 0.6684, "step": 1230 }, { "epoch": 0.792841953215905, "grad_norm": 1.3046875, "learning_rate": 8.828809352929762e-05, "loss": 0.6799, "step": 1235 }, { "epoch": 0.7960518396661718, "grad_norm": 1.8125, "learning_rate": 8.81956988903104e-05, "loss": 0.685, "step": 1240 }, { "epoch": 0.7992617261164386, "grad_norm": 1.265625, "learning_rate": 8.810300925750277e-05, "loss": 0.6874, "step": 1245 }, { "epoch": 0.8024716125667054, "grad_norm": 1.5625, "learning_rate": 8.801002569497763e-05, "loss": 0.6856, "step": 1250 }, { "epoch": 0.8056814990169723, "grad_norm": 1.3359375, "learning_rate": 8.791674927021234e-05, "loss": 0.68, "step": 1255 }, { "epoch": 0.808891385467239, "grad_norm": 1.2734375, "learning_rate": 8.782318105404636e-05, "loss": 0.6473, "step": 1260 }, { "epoch": 0.8121012719175059, "grad_norm": 1.46875, "learning_rate": 8.772932212066906e-05, "loss": 0.6721, "step": 1265 }, { "epoch": 0.8153111583677728, "grad_norm": 1.484375, "learning_rate": 8.763517354760726e-05, "loss": 0.6675, "step": 1270 }, { "epoch": 0.8185210448180396, "grad_norm": 1.3125, "learning_rate": 8.754073641571295e-05, "loss": 0.6856, "step": 1275 }, { "epoch": 0.8217309312683064, "grad_norm": 1.3515625, "learning_rate": 8.744601180915087e-05, "loss": 0.6938, "step": 1280 }, { "epoch": 0.8249408177185732, "grad_norm": 1.296875, "learning_rate": 8.7351000815386e-05, "loss": 0.6785, "step": 1285 }, { "epoch": 0.8281507041688401, "grad_norm": 1.3515625, "learning_rate": 8.72557045251712e-05, "loss": 0.6697, "step": 1290 }, { "epoch": 0.8313605906191068, "grad_norm": 1.40625, "learning_rate": 8.716012403253455e-05, "loss": 0.6647, "step": 1295 }, { "epoch": 0.8345704770693737, "grad_norm": 1.3125, "learning_rate": 8.706426043476687e-05, "loss": 0.6776, "step": 1300 }, { "epoch": 0.8377803635196405, "grad_norm": 1.40625, "learning_rate": 8.696811483240915e-05, "loss": 0.6689, "step": 1305 }, { "epoch": 0.8409902499699073, "grad_norm": 1.515625, "learning_rate": 8.687168832923981e-05, "loss": 0.6667, "step": 1310 }, { "epoch": 0.8442001364201741, "grad_norm": 1.3828125, "learning_rate": 8.67749820322621e-05, "loss": 0.694, "step": 1315 }, { "epoch": 0.847410022870441, "grad_norm": 1.3828125, "learning_rate": 8.667799705169142e-05, "loss": 0.6682, "step": 1320 }, { "epoch": 0.8506199093207077, "grad_norm": 1.4296875, "learning_rate": 8.65807345009425e-05, "loss": 0.6942, "step": 1325 }, { "epoch": 0.8538297957709746, "grad_norm": 1.40625, "learning_rate": 8.648319549661668e-05, "loss": 0.6832, "step": 1330 }, { "epoch": 0.8570396822212414, "grad_norm": 1.3515625, "learning_rate": 8.638538115848902e-05, "loss": 0.673, "step": 1335 }, { "epoch": 0.8602495686715083, "grad_norm": 1.4921875, "learning_rate": 8.628729260949555e-05, "loss": 0.6954, "step": 1340 }, { "epoch": 0.863459455121775, "grad_norm": 1.59375, "learning_rate": 8.618893097572027e-05, "loss": 0.6585, "step": 1345 }, { "epoch": 0.8666693415720419, "grad_norm": 1.4296875, "learning_rate": 8.60902973863823e-05, "loss": 0.6733, "step": 1350 }, { "epoch": 0.8698792280223087, "grad_norm": 1.484375, "learning_rate": 8.599139297382286e-05, "loss": 0.6714, "step": 1355 }, { "epoch": 0.8730891144725755, "grad_norm": 1.46875, "learning_rate": 8.58922188734923e-05, "loss": 0.6733, "step": 1360 }, { "epoch": 0.8762990009228424, "grad_norm": 1.4453125, "learning_rate": 8.579277622393708e-05, "loss": 0.6771, "step": 1365 }, { "epoch": 0.8795088873731092, "grad_norm": 1.5703125, "learning_rate": 8.569306616678667e-05, "loss": 0.6702, "step": 1370 }, { "epoch": 0.882718773823376, "grad_norm": 1.421875, "learning_rate": 8.559308984674047e-05, "loss": 0.6461, "step": 1375 }, { "epoch": 0.8859286602736428, "grad_norm": 1.4609375, "learning_rate": 8.549284841155461e-05, "loss": 0.6836, "step": 1380 }, { "epoch": 0.8891385467239097, "grad_norm": 1.390625, "learning_rate": 8.539234301202885e-05, "loss": 0.6547, "step": 1385 }, { "epoch": 0.8923484331741764, "grad_norm": 1.515625, "learning_rate": 8.529157480199335e-05, "loss": 0.664, "step": 1390 }, { "epoch": 0.8955583196244433, "grad_norm": 1.2890625, "learning_rate": 8.519054493829535e-05, "loss": 0.6625, "step": 1395 }, { "epoch": 0.8987682060747101, "grad_norm": 1.3125, "learning_rate": 8.508925458078599e-05, "loss": 0.6582, "step": 1400 }, { "epoch": 0.901978092524977, "grad_norm": 1.3515625, "learning_rate": 8.498770489230699e-05, "loss": 0.6432, "step": 1405 }, { "epoch": 0.9051879789752437, "grad_norm": 1.25, "learning_rate": 8.488589703867714e-05, "loss": 0.6775, "step": 1410 }, { "epoch": 0.9083978654255106, "grad_norm": 1.3203125, "learning_rate": 8.478383218867918e-05, "loss": 0.6847, "step": 1415 }, { "epoch": 0.9116077518757774, "grad_norm": 1.3515625, "learning_rate": 8.468151151404616e-05, "loss": 0.6691, "step": 1420 }, { "epoch": 0.9148176383260442, "grad_norm": 1.3828125, "learning_rate": 8.457893618944808e-05, "loss": 0.6618, "step": 1425 }, { "epoch": 0.918027524776311, "grad_norm": 1.421875, "learning_rate": 8.447610739247838e-05, "loss": 0.6755, "step": 1430 }, { "epoch": 0.9212374112265779, "grad_norm": 1.25, "learning_rate": 8.437302630364046e-05, "loss": 0.6673, "step": 1435 }, { "epoch": 0.9244472976768446, "grad_norm": 1.359375, "learning_rate": 8.426969410633411e-05, "loss": 0.6582, "step": 1440 }, { "epoch": 0.9276571841271115, "grad_norm": 1.296875, "learning_rate": 8.416611198684187e-05, "loss": 0.6667, "step": 1445 }, { "epoch": 0.9308670705773783, "grad_norm": 1.3828125, "learning_rate": 8.406228113431552e-05, "loss": 0.6716, "step": 1450 }, { "epoch": 0.9340769570276451, "grad_norm": 1.2890625, "learning_rate": 8.395820274076229e-05, "loss": 0.6746, "step": 1455 }, { "epoch": 0.937286843477912, "grad_norm": 1.3359375, "learning_rate": 8.385387800103132e-05, "loss": 0.6511, "step": 1460 }, { "epoch": 0.9404967299281788, "grad_norm": 1.4453125, "learning_rate": 8.374930811279983e-05, "loss": 0.667, "step": 1465 }, { "epoch": 0.9437066163784457, "grad_norm": 1.296875, "learning_rate": 8.364449427655942e-05, "loss": 0.6766, "step": 1470 }, { "epoch": 0.9469165028287124, "grad_norm": 1.4453125, "learning_rate": 8.353943769560228e-05, "loss": 0.6468, "step": 1475 }, { "epoch": 0.9501263892789793, "grad_norm": 1.359375, "learning_rate": 8.343413957600744e-05, "loss": 0.6427, "step": 1480 }, { "epoch": 0.9533362757292461, "grad_norm": 1.5546875, "learning_rate": 8.332860112662673e-05, "loss": 0.6207, "step": 1485 }, { "epoch": 0.9565461621795129, "grad_norm": 1.2890625, "learning_rate": 8.322282355907117e-05, "loss": 0.6548, "step": 1490 }, { "epoch": 0.9597560486297797, "grad_norm": 1.546875, "learning_rate": 8.311680808769682e-05, "loss": 0.6662, "step": 1495 }, { "epoch": 0.9629659350800466, "grad_norm": 1.3828125, "learning_rate": 8.301055592959101e-05, "loss": 0.6488, "step": 1500 }, { "epoch": 0.9629659350800466, "eval_loss": 0.5586946606636047, "eval_runtime": 2.4022, "eval_samples_per_second": 83.255, "eval_steps_per_second": 83.255, "step": 1500 }, { "epoch": 0.9661758215303133, "grad_norm": 1.3203125, "learning_rate": 8.290406830455828e-05, "loss": 0.6723, "step": 1505 }, { "epoch": 0.9693857079805802, "grad_norm": 1.359375, "learning_rate": 8.279734643510636e-05, "loss": 0.653, "step": 1510 }, { "epoch": 0.972595594430847, "grad_norm": 1.359375, "learning_rate": 8.269039154643224e-05, "loss": 0.6535, "step": 1515 }, { "epoch": 0.9758054808811139, "grad_norm": 1.3828125, "learning_rate": 8.258320486640798e-05, "loss": 0.6498, "step": 1520 }, { "epoch": 0.9790153673313806, "grad_norm": 1.3515625, "learning_rate": 8.24757876255667e-05, "loss": 0.6531, "step": 1525 }, { "epoch": 0.9822252537816475, "grad_norm": 1.40625, "learning_rate": 8.23681410570884e-05, "loss": 0.6698, "step": 1530 }, { "epoch": 0.9854351402319143, "grad_norm": 1.4765625, "learning_rate": 8.226026639678582e-05, "loss": 0.658, "step": 1535 }, { "epoch": 0.9886450266821811, "grad_norm": 1.2578125, "learning_rate": 8.215216488309032e-05, "loss": 0.6606, "step": 1540 }, { "epoch": 0.9918549131324479, "grad_norm": 1.3515625, "learning_rate": 8.204383775703752e-05, "loss": 0.6519, "step": 1545 }, { "epoch": 0.9950647995827148, "grad_norm": 1.40625, "learning_rate": 8.19352862622532e-05, "loss": 0.6452, "step": 1550 }, { "epoch": 0.9982746860329815, "grad_norm": 1.3515625, "learning_rate": 8.182651164493889e-05, "loss": 0.6567, "step": 1555 }, { "epoch": 0.9995586406130883, "eval_loss": 0.5523168444633484, "eval_runtime": 2.4204, "eval_samples_per_second": 82.63, "eval_steps_per_second": 82.63, "step": 1557 }, { "epoch": 1.00192593187016, "grad_norm": 1.3125, "learning_rate": 8.171751515385769e-05, "loss": 0.7609, "step": 1560 }, { "epoch": 1.0051358183204269, "grad_norm": 1.2265625, "learning_rate": 8.160829804031982e-05, "loss": 0.615, "step": 1565 }, { "epoch": 1.0083457047706936, "grad_norm": 1.3671875, "learning_rate": 8.149886155816835e-05, "loss": 0.6382, "step": 1570 }, { "epoch": 1.0115555912209606, "grad_norm": 1.40625, "learning_rate": 8.138920696376476e-05, "loss": 0.6391, "step": 1575 }, { "epoch": 1.0147654776712274, "grad_norm": 1.390625, "learning_rate": 8.127933551597449e-05, "loss": 0.6365, "step": 1580 }, { "epoch": 1.0179753641214941, "grad_norm": 1.34375, "learning_rate": 8.116924847615254e-05, "loss": 0.6269, "step": 1585 }, { "epoch": 1.0211852505717611, "grad_norm": 1.28125, "learning_rate": 8.105894710812897e-05, "loss": 0.6414, "step": 1590 }, { "epoch": 1.024395137022028, "grad_norm": 1.421875, "learning_rate": 8.094843267819438e-05, "loss": 0.6218, "step": 1595 }, { "epoch": 1.0276050234722947, "grad_norm": 1.4453125, "learning_rate": 8.083770645508535e-05, "loss": 0.6456, "step": 1600 }, { "epoch": 1.0308149099225614, "grad_norm": 1.453125, "learning_rate": 8.072676970996997e-05, "loss": 0.6349, "step": 1605 }, { "epoch": 1.0340247963728284, "grad_norm": 1.3046875, "learning_rate": 8.061562371643312e-05, "loss": 0.5872, "step": 1610 }, { "epoch": 1.0372346828230952, "grad_norm": 1.375, "learning_rate": 8.050426975046196e-05, "loss": 0.6129, "step": 1615 }, { "epoch": 1.040444569273362, "grad_norm": 1.34375, "learning_rate": 8.039270909043119e-05, "loss": 0.6275, "step": 1620 }, { "epoch": 1.0436544557236287, "grad_norm": 1.40625, "learning_rate": 8.028094301708843e-05, "loss": 0.6198, "step": 1625 }, { "epoch": 1.0468643421738957, "grad_norm": 1.4609375, "learning_rate": 8.016897281353954e-05, "loss": 0.6125, "step": 1630 }, { "epoch": 1.0500742286241624, "grad_norm": 1.3828125, "learning_rate": 8.00567997652338e-05, "loss": 0.6076, "step": 1635 }, { "epoch": 1.0532841150744292, "grad_norm": 1.5625, "learning_rate": 7.994442515994922e-05, "loss": 0.6153, "step": 1640 }, { "epoch": 1.056494001524696, "grad_norm": 1.28125, "learning_rate": 7.983185028777773e-05, "loss": 0.614, "step": 1645 }, { "epoch": 1.059703887974963, "grad_norm": 1.3828125, "learning_rate": 7.971907644111043e-05, "loss": 0.6287, "step": 1650 }, { "epoch": 1.0629137744252297, "grad_norm": 1.4375, "learning_rate": 7.960610491462265e-05, "loss": 0.6234, "step": 1655 }, { "epoch": 1.0661236608754965, "grad_norm": 1.390625, "learning_rate": 7.949293700525914e-05, "loss": 0.6352, "step": 1660 }, { "epoch": 1.0693335473257632, "grad_norm": 1.328125, "learning_rate": 7.93795740122192e-05, "loss": 0.6275, "step": 1665 }, { "epoch": 1.0725434337760302, "grad_norm": 1.2734375, "learning_rate": 7.926601723694178e-05, "loss": 0.6266, "step": 1670 }, { "epoch": 1.075753320226297, "grad_norm": 1.40625, "learning_rate": 7.915226798309042e-05, "loss": 0.6111, "step": 1675 }, { "epoch": 1.0789632066765638, "grad_norm": 1.3828125, "learning_rate": 7.903832755653844e-05, "loss": 0.6032, "step": 1680 }, { "epoch": 1.0821730931268307, "grad_norm": 1.5859375, "learning_rate": 7.892419726535385e-05, "loss": 0.6113, "step": 1685 }, { "epoch": 1.0853829795770975, "grad_norm": 1.359375, "learning_rate": 7.880987841978435e-05, "loss": 0.6332, "step": 1690 }, { "epoch": 1.0885928660273643, "grad_norm": 1.46875, "learning_rate": 7.86953723322423e-05, "loss": 0.6419, "step": 1695 }, { "epoch": 1.091802752477631, "grad_norm": 1.3203125, "learning_rate": 7.858068031728968e-05, "loss": 0.6249, "step": 1700 }, { "epoch": 1.095012638927898, "grad_norm": 1.3203125, "learning_rate": 7.846580369162293e-05, "loss": 0.6075, "step": 1705 }, { "epoch": 1.0982225253781648, "grad_norm": 1.3828125, "learning_rate": 7.83507437740579e-05, "loss": 0.6379, "step": 1710 }, { "epoch": 1.1014324118284315, "grad_norm": 1.390625, "learning_rate": 7.823550188551466e-05, "loss": 0.6165, "step": 1715 }, { "epoch": 1.1046422982786983, "grad_norm": 1.328125, "learning_rate": 7.812007934900238e-05, "loss": 0.6106, "step": 1720 }, { "epoch": 1.1078521847289653, "grad_norm": 1.25, "learning_rate": 7.800447748960408e-05, "loss": 0.6132, "step": 1725 }, { "epoch": 1.111062071179232, "grad_norm": 1.390625, "learning_rate": 7.788869763446154e-05, "loss": 0.6224, "step": 1730 }, { "epoch": 1.1142719576294988, "grad_norm": 1.4609375, "learning_rate": 7.777274111275988e-05, "loss": 0.6353, "step": 1735 }, { "epoch": 1.1174818440797656, "grad_norm": 1.2421875, "learning_rate": 7.765660925571245e-05, "loss": 0.6289, "step": 1740 }, { "epoch": 1.1206917305300326, "grad_norm": 1.2890625, "learning_rate": 7.754030339654552e-05, "loss": 0.6091, "step": 1745 }, { "epoch": 1.1239016169802993, "grad_norm": 1.2578125, "learning_rate": 7.74238248704829e-05, "loss": 0.6119, "step": 1750 }, { "epoch": 1.127111503430566, "grad_norm": 1.2421875, "learning_rate": 7.730717501473073e-05, "loss": 0.6173, "step": 1755 }, { "epoch": 1.1303213898808329, "grad_norm": 1.34375, "learning_rate": 7.719035516846201e-05, "loss": 0.6184, "step": 1760 }, { "epoch": 1.1335312763310998, "grad_norm": 1.390625, "learning_rate": 7.707336667280128e-05, "loss": 0.6061, "step": 1765 }, { "epoch": 1.1367411627813666, "grad_norm": 1.4296875, "learning_rate": 7.695621087080924e-05, "loss": 0.6265, "step": 1770 }, { "epoch": 1.1399510492316334, "grad_norm": 1.328125, "learning_rate": 7.683888910746735e-05, "loss": 0.6272, "step": 1775 }, { "epoch": 1.1431609356819004, "grad_norm": 1.359375, "learning_rate": 7.672140272966227e-05, "loss": 0.6162, "step": 1780 }, { "epoch": 1.1463708221321671, "grad_norm": 1.40625, "learning_rate": 7.660375308617054e-05, "loss": 0.6165, "step": 1785 }, { "epoch": 1.1495807085824339, "grad_norm": 1.328125, "learning_rate": 7.648594152764304e-05, "loss": 0.5994, "step": 1790 }, { "epoch": 1.1527905950327006, "grad_norm": 1.328125, "learning_rate": 7.636796940658942e-05, "loss": 0.6298, "step": 1795 }, { "epoch": 1.1560004814829676, "grad_norm": 1.4375, "learning_rate": 7.62498380773627e-05, "loss": 0.6124, "step": 1800 }, { "epoch": 1.1592103679332344, "grad_norm": 1.34375, "learning_rate": 7.613154889614362e-05, "loss": 0.6236, "step": 1805 }, { "epoch": 1.1624202543835012, "grad_norm": 1.3046875, "learning_rate": 7.601310322092511e-05, "loss": 0.6148, "step": 1810 }, { "epoch": 1.165630140833768, "grad_norm": 1.3671875, "learning_rate": 7.589450241149671e-05, "loss": 0.6119, "step": 1815 }, { "epoch": 1.168840027284035, "grad_norm": 1.4375, "learning_rate": 7.577574782942893e-05, "loss": 0.6034, "step": 1820 }, { "epoch": 1.1720499137343017, "grad_norm": 1.4375, "learning_rate": 7.565684083805762e-05, "loss": 0.6049, "step": 1825 }, { "epoch": 1.1752598001845684, "grad_norm": 1.3359375, "learning_rate": 7.553778280246835e-05, "loss": 0.6314, "step": 1830 }, { "epoch": 1.1784696866348352, "grad_norm": 1.359375, "learning_rate": 7.541857508948072e-05, "loss": 0.6015, "step": 1835 }, { "epoch": 1.1816795730851022, "grad_norm": 1.34375, "learning_rate": 7.529921906763266e-05, "loss": 0.6085, "step": 1840 }, { "epoch": 1.184889459535369, "grad_norm": 1.3671875, "learning_rate": 7.517971610716473e-05, "loss": 0.6071, "step": 1845 }, { "epoch": 1.1880993459856357, "grad_norm": 1.296875, "learning_rate": 7.50600675800044e-05, "loss": 0.6237, "step": 1850 }, { "epoch": 1.1913092324359025, "grad_norm": 1.3984375, "learning_rate": 7.494027485975027e-05, "loss": 0.6062, "step": 1855 }, { "epoch": 1.1945191188861695, "grad_norm": 1.359375, "learning_rate": 7.482033932165631e-05, "loss": 0.6111, "step": 1860 }, { "epoch": 1.1977290053364362, "grad_norm": 1.265625, "learning_rate": 7.470026234261611e-05, "loss": 0.5957, "step": 1865 }, { "epoch": 1.200938891786703, "grad_norm": 1.390625, "learning_rate": 7.4580045301147e-05, "loss": 0.6054, "step": 1870 }, { "epoch": 1.20414877823697, "grad_norm": 1.3828125, "learning_rate": 7.44596895773743e-05, "loss": 0.6264, "step": 1875 }, { "epoch": 1.2073586646872367, "grad_norm": 1.2578125, "learning_rate": 7.433919655301543e-05, "loss": 0.5918, "step": 1880 }, { "epoch": 1.2105685511375035, "grad_norm": 1.40625, "learning_rate": 7.421856761136405e-05, "loss": 0.6138, "step": 1885 }, { "epoch": 1.2137784375877703, "grad_norm": 1.3515625, "learning_rate": 7.409780413727423e-05, "loss": 0.623, "step": 1890 }, { "epoch": 1.2169883240380373, "grad_norm": 1.234375, "learning_rate": 7.397690751714444e-05, "loss": 0.6118, "step": 1895 }, { "epoch": 1.220198210488304, "grad_norm": 1.3515625, "learning_rate": 7.385587913890175e-05, "loss": 0.5957, "step": 1900 }, { "epoch": 1.2234080969385708, "grad_norm": 1.53125, "learning_rate": 7.373472039198583e-05, "loss": 0.6201, "step": 1905 }, { "epoch": 1.2266179833888375, "grad_norm": 1.328125, "learning_rate": 7.361343266733307e-05, "loss": 0.6029, "step": 1910 }, { "epoch": 1.2298278698391045, "grad_norm": 1.2265625, "learning_rate": 7.34920173573605e-05, "loss": 0.6052, "step": 1915 }, { "epoch": 1.2330377562893713, "grad_norm": 1.34375, "learning_rate": 7.337047585594987e-05, "loss": 0.6155, "step": 1920 }, { "epoch": 1.236247642739638, "grad_norm": 1.2890625, "learning_rate": 7.324880955843167e-05, "loss": 0.5776, "step": 1925 }, { "epoch": 1.2394575291899048, "grad_norm": 1.3984375, "learning_rate": 7.312701986156909e-05, "loss": 0.6156, "step": 1930 }, { "epoch": 1.2426674156401718, "grad_norm": 1.578125, "learning_rate": 7.300510816354194e-05, "loss": 0.6011, "step": 1935 }, { "epoch": 1.2458773020904386, "grad_norm": 1.671875, "learning_rate": 7.288307586393066e-05, "loss": 0.6094, "step": 1940 }, { "epoch": 1.2490871885407053, "grad_norm": 1.390625, "learning_rate": 7.276092436370024e-05, "loss": 0.6074, "step": 1945 }, { "epoch": 1.252297074990972, "grad_norm": 1.328125, "learning_rate": 7.263865506518411e-05, "loss": 0.6002, "step": 1950 }, { "epoch": 1.255506961441239, "grad_norm": 1.359375, "learning_rate": 7.251626937206806e-05, "loss": 0.5956, "step": 1955 }, { "epoch": 1.2587168478915058, "grad_norm": 1.375, "learning_rate": 7.239376868937415e-05, "loss": 0.6026, "step": 1960 }, { "epoch": 1.2619267343417726, "grad_norm": 1.4453125, "learning_rate": 7.227115442344452e-05, "loss": 0.6136, "step": 1965 }, { "epoch": 1.2651366207920396, "grad_norm": 1.3515625, "learning_rate": 7.214842798192526e-05, "loss": 0.6092, "step": 1970 }, { "epoch": 1.2683465072423064, "grad_norm": 1.453125, "learning_rate": 7.202559077375033e-05, "loss": 0.6232, "step": 1975 }, { "epoch": 1.2715563936925731, "grad_norm": 1.28125, "learning_rate": 7.190264420912526e-05, "loss": 0.6139, "step": 1980 }, { "epoch": 1.2747662801428399, "grad_norm": 1.2890625, "learning_rate": 7.177958969951104e-05, "loss": 0.6085, "step": 1985 }, { "epoch": 1.2779761665931066, "grad_norm": 1.484375, "learning_rate": 7.165642865760794e-05, "loss": 0.631, "step": 1990 }, { "epoch": 1.2811860530433736, "grad_norm": 1.4765625, "learning_rate": 7.15331624973392e-05, "loss": 0.6131, "step": 1995 }, { "epoch": 1.2843959394936404, "grad_norm": 1.4453125, "learning_rate": 7.140979263383488e-05, "loss": 0.6102, "step": 2000 }, { "epoch": 1.2843959394936404, "eval_loss": 0.5290513038635254, "eval_runtime": 2.3691, "eval_samples_per_second": 84.421, "eval_steps_per_second": 84.421, "step": 2000 }, { "epoch": 1.2876058259439072, "grad_norm": 1.34375, "learning_rate": 7.128632048341553e-05, "loss": 0.6014, "step": 2005 }, { "epoch": 1.2908157123941741, "grad_norm": 1.25, "learning_rate": 7.116274746357605e-05, "loss": 0.6291, "step": 2010 }, { "epoch": 1.294025598844441, "grad_norm": 1.265625, "learning_rate": 7.103907499296934e-05, "loss": 0.5853, "step": 2015 }, { "epoch": 1.2972354852947077, "grad_norm": 1.2578125, "learning_rate": 7.091530449138994e-05, "loss": 0.6215, "step": 2020 }, { "epoch": 1.3004453717449747, "grad_norm": 1.203125, "learning_rate": 7.079143737975795e-05, "loss": 0.5965, "step": 2025 }, { "epoch": 1.3036552581952414, "grad_norm": 1.3984375, "learning_rate": 7.066747508010243e-05, "loss": 0.6179, "step": 2030 }, { "epoch": 1.3068651446455082, "grad_norm": 1.265625, "learning_rate": 7.054341901554537e-05, "loss": 0.5941, "step": 2035 }, { "epoch": 1.310075031095775, "grad_norm": 1.390625, "learning_rate": 7.04192706102851e-05, "loss": 0.6157, "step": 2040 }, { "epoch": 1.3132849175460417, "grad_norm": 1.3828125, "learning_rate": 7.029503128958009e-05, "loss": 0.6025, "step": 2045 }, { "epoch": 1.3164948039963087, "grad_norm": 1.2421875, "learning_rate": 7.017070247973255e-05, "loss": 0.5932, "step": 2050 }, { "epoch": 1.3197046904465755, "grad_norm": 1.4921875, "learning_rate": 7.004628560807202e-05, "loss": 0.5958, "step": 2055 }, { "epoch": 1.3229145768968422, "grad_norm": 1.34375, "learning_rate": 6.992178210293905e-05, "loss": 0.6041, "step": 2060 }, { "epoch": 1.3261244633471092, "grad_norm": 1.3828125, "learning_rate": 6.979719339366876e-05, "loss": 0.6126, "step": 2065 }, { "epoch": 1.329334349797376, "grad_norm": 1.4921875, "learning_rate": 6.96725209105744e-05, "loss": 0.5878, "step": 2070 }, { "epoch": 1.3325442362476427, "grad_norm": 1.3203125, "learning_rate": 6.954776608493104e-05, "loss": 0.6037, "step": 2075 }, { "epoch": 1.3357541226979095, "grad_norm": 1.234375, "learning_rate": 6.942293034895899e-05, "loss": 0.5986, "step": 2080 }, { "epoch": 1.3389640091481763, "grad_norm": 1.4140625, "learning_rate": 6.929801513580747e-05, "loss": 0.6124, "step": 2085 }, { "epoch": 1.3421738955984432, "grad_norm": 1.265625, "learning_rate": 6.917302187953811e-05, "loss": 0.613, "step": 2090 }, { "epoch": 1.34538378204871, "grad_norm": 1.2578125, "learning_rate": 6.904795201510852e-05, "loss": 0.5869, "step": 2095 }, { "epoch": 1.3485936684989768, "grad_norm": 1.3671875, "learning_rate": 6.892280697835576e-05, "loss": 0.6194, "step": 2100 }, { "epoch": 1.3518035549492438, "grad_norm": 1.3828125, "learning_rate": 6.879758820597991e-05, "loss": 0.5933, "step": 2105 }, { "epoch": 1.3550134413995105, "grad_norm": 1.2421875, "learning_rate": 6.867229713552754e-05, "loss": 0.6055, "step": 2110 }, { "epoch": 1.3582233278497773, "grad_norm": 1.2578125, "learning_rate": 6.854693520537524e-05, "loss": 0.6052, "step": 2115 }, { "epoch": 1.3614332143000443, "grad_norm": 1.3046875, "learning_rate": 6.842150385471307e-05, "loss": 0.6174, "step": 2120 }, { "epoch": 1.364643100750311, "grad_norm": 1.3828125, "learning_rate": 6.829600452352806e-05, "loss": 0.595, "step": 2125 }, { "epoch": 1.3678529872005778, "grad_norm": 1.2421875, "learning_rate": 6.817043865258774e-05, "loss": 0.5939, "step": 2130 }, { "epoch": 1.3710628736508446, "grad_norm": 1.3359375, "learning_rate": 6.804480768342341e-05, "loss": 0.6006, "step": 2135 }, { "epoch": 1.3742727601011113, "grad_norm": 1.3515625, "learning_rate": 6.791911305831382e-05, "loss": 0.5961, "step": 2140 }, { "epoch": 1.3774826465513783, "grad_norm": 1.3046875, "learning_rate": 6.779335622026847e-05, "loss": 0.6069, "step": 2145 }, { "epoch": 1.380692533001645, "grad_norm": 1.2734375, "learning_rate": 6.76675386130111e-05, "loss": 0.6059, "step": 2150 }, { "epoch": 1.3839024194519118, "grad_norm": 1.3828125, "learning_rate": 6.754166168096306e-05, "loss": 0.5894, "step": 2155 }, { "epoch": 1.3871123059021788, "grad_norm": 1.4296875, "learning_rate": 6.741572686922676e-05, "loss": 0.6092, "step": 2160 }, { "epoch": 1.3903221923524456, "grad_norm": 1.328125, "learning_rate": 6.728973562356917e-05, "loss": 0.5937, "step": 2165 }, { "epoch": 1.3935320788027123, "grad_norm": 1.34375, "learning_rate": 6.716368939040503e-05, "loss": 0.5971, "step": 2170 }, { "epoch": 1.3967419652529791, "grad_norm": 1.296875, "learning_rate": 6.703758961678041e-05, "loss": 0.5985, "step": 2175 }, { "epoch": 1.3999518517032459, "grad_norm": 1.3125, "learning_rate": 6.691143775035606e-05, "loss": 0.6064, "step": 2180 }, { "epoch": 1.4031617381535129, "grad_norm": 1.3515625, "learning_rate": 6.678523523939074e-05, "loss": 0.6034, "step": 2185 }, { "epoch": 1.4063716246037796, "grad_norm": 1.296875, "learning_rate": 6.66589835327246e-05, "loss": 0.5948, "step": 2190 }, { "epoch": 1.4095815110540464, "grad_norm": 1.28125, "learning_rate": 6.653268407976258e-05, "loss": 0.5751, "step": 2195 }, { "epoch": 1.4127913975043134, "grad_norm": 1.265625, "learning_rate": 6.640633833045783e-05, "loss": 0.5678, "step": 2200 }, { "epoch": 1.4160012839545801, "grad_norm": 1.28125, "learning_rate": 6.627994773529489e-05, "loss": 0.5837, "step": 2205 }, { "epoch": 1.419211170404847, "grad_norm": 1.375, "learning_rate": 6.615351374527323e-05, "loss": 0.5856, "step": 2210 }, { "epoch": 1.4224210568551139, "grad_norm": 1.3828125, "learning_rate": 6.602703781189043e-05, "loss": 0.5824, "step": 2215 }, { "epoch": 1.4256309433053806, "grad_norm": 1.296875, "learning_rate": 6.590052138712567e-05, "loss": 0.6043, "step": 2220 }, { "epoch": 1.4288408297556474, "grad_norm": 1.3515625, "learning_rate": 6.57739659234229e-05, "loss": 0.5831, "step": 2225 }, { "epoch": 1.4320507162059142, "grad_norm": 1.3671875, "learning_rate": 6.564737287367434e-05, "loss": 0.6001, "step": 2230 }, { "epoch": 1.435260602656181, "grad_norm": 1.25, "learning_rate": 6.552074369120363e-05, "loss": 0.6059, "step": 2235 }, { "epoch": 1.438470489106448, "grad_norm": 1.3671875, "learning_rate": 6.539407982974925e-05, "loss": 0.5936, "step": 2240 }, { "epoch": 1.4416803755567147, "grad_norm": 1.2890625, "learning_rate": 6.52673827434478e-05, "loss": 0.6078, "step": 2245 }, { "epoch": 1.4448902620069815, "grad_norm": 1.3359375, "learning_rate": 6.514065388681736e-05, "loss": 0.6106, "step": 2250 }, { "epoch": 1.4481001484572484, "grad_norm": 1.3515625, "learning_rate": 6.501389471474066e-05, "loss": 0.5819, "step": 2255 }, { "epoch": 1.4513100349075152, "grad_norm": 1.1953125, "learning_rate": 6.48871066824485e-05, "loss": 0.5873, "step": 2260 }, { "epoch": 1.454519921357782, "grad_norm": 1.328125, "learning_rate": 6.476029124550303e-05, "loss": 0.586, "step": 2265 }, { "epoch": 1.4577298078080487, "grad_norm": 1.2578125, "learning_rate": 6.463344985978095e-05, "loss": 0.6004, "step": 2270 }, { "epoch": 1.4609396942583155, "grad_norm": 1.3046875, "learning_rate": 6.450658398145692e-05, "loss": 0.5848, "step": 2275 }, { "epoch": 1.4641495807085825, "grad_norm": 1.453125, "learning_rate": 6.437969506698678e-05, "loss": 0.6111, "step": 2280 }, { "epoch": 1.4673594671588492, "grad_norm": 1.375, "learning_rate": 6.425278457309075e-05, "loss": 0.5844, "step": 2285 }, { "epoch": 1.470569353609116, "grad_norm": 1.328125, "learning_rate": 6.41258539567369e-05, "loss": 0.5919, "step": 2290 }, { "epoch": 1.473779240059383, "grad_norm": 1.3046875, "learning_rate": 6.399890467512422e-05, "loss": 0.5992, "step": 2295 }, { "epoch": 1.4769891265096498, "grad_norm": 1.4453125, "learning_rate": 6.387193818566605e-05, "loss": 0.5969, "step": 2300 }, { "epoch": 1.4801990129599165, "grad_norm": 1.28125, "learning_rate": 6.374495594597322e-05, "loss": 0.6171, "step": 2305 }, { "epoch": 1.4834088994101835, "grad_norm": 1.21875, "learning_rate": 6.361795941383746e-05, "loss": 0.5789, "step": 2310 }, { "epoch": 1.4866187858604503, "grad_norm": 1.34375, "learning_rate": 6.349095004721447e-05, "loss": 0.6131, "step": 2315 }, { "epoch": 1.489828672310717, "grad_norm": 1.3203125, "learning_rate": 6.336392930420738e-05, "loss": 0.5972, "step": 2320 }, { "epoch": 1.4930385587609838, "grad_norm": 1.3984375, "learning_rate": 6.323689864304991e-05, "loss": 0.5947, "step": 2325 }, { "epoch": 1.4962484452112506, "grad_norm": 1.2421875, "learning_rate": 6.31098595220896e-05, "loss": 0.5936, "step": 2330 }, { "epoch": 1.4994583316615175, "grad_norm": 1.328125, "learning_rate": 6.298281339977119e-05, "loss": 0.5879, "step": 2335 }, { "epoch": 1.5026682181117843, "grad_norm": 1.3671875, "learning_rate": 6.28557617346197e-05, "loss": 0.5841, "step": 2340 }, { "epoch": 1.505878104562051, "grad_norm": 1.34375, "learning_rate": 6.272870598522385e-05, "loss": 0.5699, "step": 2345 }, { "epoch": 1.509087991012318, "grad_norm": 1.3046875, "learning_rate": 6.260164761021923e-05, "loss": 0.6094, "step": 2350 }, { "epoch": 1.5122978774625848, "grad_norm": 1.2890625, "learning_rate": 6.247458806827157e-05, "loss": 0.5969, "step": 2355 }, { "epoch": 1.5155077639128516, "grad_norm": 1.2421875, "learning_rate": 6.234752881806001e-05, "loss": 0.5865, "step": 2360 }, { "epoch": 1.5187176503631186, "grad_norm": 1.3671875, "learning_rate": 6.222047131826032e-05, "loss": 0.5898, "step": 2365 }, { "epoch": 1.521927536813385, "grad_norm": 1.3359375, "learning_rate": 6.20934170275282e-05, "loss": 0.6127, "step": 2370 }, { "epoch": 1.525137423263652, "grad_norm": 1.3671875, "learning_rate": 6.196636740448247e-05, "loss": 0.5926, "step": 2375 }, { "epoch": 1.5283473097139189, "grad_norm": 1.3046875, "learning_rate": 6.183932390768842e-05, "loss": 0.582, "step": 2380 }, { "epoch": 1.5315571961641856, "grad_norm": 1.2734375, "learning_rate": 6.171228799564095e-05, "loss": 0.57, "step": 2385 }, { "epoch": 1.5347670826144526, "grad_norm": 1.2890625, "learning_rate": 6.158526112674792e-05, "loss": 0.5735, "step": 2390 }, { "epoch": 1.5379769690647194, "grad_norm": 1.2890625, "learning_rate": 6.145824475931338e-05, "loss": 0.5763, "step": 2395 }, { "epoch": 1.5411868555149861, "grad_norm": 1.2890625, "learning_rate": 6.133124035152078e-05, "loss": 0.595, "step": 2400 }, { "epoch": 1.5443967419652531, "grad_norm": 1.1953125, "learning_rate": 6.120424936141631e-05, "loss": 0.5876, "step": 2405 }, { "epoch": 1.5476066284155197, "grad_norm": 1.203125, "learning_rate": 6.10772732468921e-05, "loss": 0.597, "step": 2410 }, { "epoch": 1.5508165148657866, "grad_norm": 1.3125, "learning_rate": 6.095031346566951e-05, "loss": 0.5945, "step": 2415 }, { "epoch": 1.5540264013160534, "grad_norm": 1.390625, "learning_rate": 6.082337147528239e-05, "loss": 0.5841, "step": 2420 }, { "epoch": 1.5572362877663202, "grad_norm": 1.25, "learning_rate": 6.069644873306034e-05, "loss": 0.5778, "step": 2425 }, { "epoch": 1.5604461742165872, "grad_norm": 1.375, "learning_rate": 6.0569546696112014e-05, "loss": 0.5909, "step": 2430 }, { "epoch": 1.563656060666854, "grad_norm": 1.3984375, "learning_rate": 6.04426668213083e-05, "loss": 0.6037, "step": 2435 }, { "epoch": 1.5668659471171207, "grad_norm": 1.328125, "learning_rate": 6.031581056526574e-05, "loss": 0.6011, "step": 2440 }, { "epoch": 1.5700758335673877, "grad_norm": 1.21875, "learning_rate": 6.018897938432966e-05, "loss": 0.5872, "step": 2445 }, { "epoch": 1.5732857200176542, "grad_norm": 1.296875, "learning_rate": 6.0062174734557554e-05, "loss": 0.5904, "step": 2450 }, { "epoch": 1.5764956064679212, "grad_norm": 1.203125, "learning_rate": 5.99353980717023e-05, "loss": 0.5861, "step": 2455 }, { "epoch": 1.5797054929181882, "grad_norm": 1.3984375, "learning_rate": 5.9808650851195517e-05, "loss": 0.5767, "step": 2460 }, { "epoch": 1.5829153793684547, "grad_norm": 1.5, "learning_rate": 5.968193452813079e-05, "loss": 0.6083, "step": 2465 }, { "epoch": 1.5861252658187217, "grad_norm": 1.421875, "learning_rate": 5.9555250557247e-05, "loss": 0.5851, "step": 2470 }, { "epoch": 1.5893351522689885, "grad_norm": 1.3984375, "learning_rate": 5.9428600392911624e-05, "loss": 0.5828, "step": 2475 }, { "epoch": 1.5925450387192552, "grad_norm": 1.3046875, "learning_rate": 5.9301985489103984e-05, "loss": 0.5983, "step": 2480 }, { "epoch": 1.5957549251695222, "grad_norm": 1.2734375, "learning_rate": 5.917540729939869e-05, "loss": 0.5621, "step": 2485 }, { "epoch": 1.598964811619789, "grad_norm": 1.3671875, "learning_rate": 5.904886727694879e-05, "loss": 0.5646, "step": 2490 }, { "epoch": 1.6021746980700557, "grad_norm": 1.296875, "learning_rate": 5.8922366874469195e-05, "loss": 0.596, "step": 2495 }, { "epoch": 1.6053845845203227, "grad_norm": 1.2890625, "learning_rate": 5.879590754421995e-05, "loss": 0.6159, "step": 2500 }, { "epoch": 1.6053845845203227, "eval_loss": 0.4981262981891632, "eval_runtime": 2.3761, "eval_samples_per_second": 84.173, "eval_steps_per_second": 84.173, "step": 2500 }, { "epoch": 1.6085944709705893, "grad_norm": 1.3828125, "learning_rate": 5.866949073798958e-05, "loss": 0.6173, "step": 2505 }, { "epoch": 1.6118043574208563, "grad_norm": 1.265625, "learning_rate": 5.854311790707845e-05, "loss": 0.5769, "step": 2510 }, { "epoch": 1.615014243871123, "grad_norm": 1.3203125, "learning_rate": 5.8416790502282026e-05, "loss": 0.5856, "step": 2515 }, { "epoch": 1.6182241303213898, "grad_norm": 1.2890625, "learning_rate": 5.829050997387432e-05, "loss": 0.5743, "step": 2520 }, { "epoch": 1.6214340167716568, "grad_norm": 1.3203125, "learning_rate": 5.816427777159117e-05, "loss": 0.5854, "step": 2525 }, { "epoch": 1.6246439032219235, "grad_norm": 1.3515625, "learning_rate": 5.8038095344613595e-05, "loss": 0.5837, "step": 2530 }, { "epoch": 1.6278537896721903, "grad_norm": 1.2109375, "learning_rate": 5.791196414155121e-05, "loss": 0.6061, "step": 2535 }, { "epoch": 1.6310636761224573, "grad_norm": 1.1796875, "learning_rate": 5.778588561042556e-05, "loss": 0.5856, "step": 2540 }, { "epoch": 1.6342735625727238, "grad_norm": 1.296875, "learning_rate": 5.76598611986535e-05, "loss": 0.5721, "step": 2545 }, { "epoch": 1.6374834490229908, "grad_norm": 1.3046875, "learning_rate": 5.753389235303055e-05, "loss": 0.5907, "step": 2550 }, { "epoch": 1.6406933354732578, "grad_norm": 1.2578125, "learning_rate": 5.7407980519714346e-05, "loss": 0.5801, "step": 2555 }, { "epoch": 1.6439032219235243, "grad_norm": 1.5, "learning_rate": 5.728212714420804e-05, "loss": 0.5794, "step": 2560 }, { "epoch": 1.6471131083737913, "grad_norm": 1.3515625, "learning_rate": 5.71563336713436e-05, "loss": 0.5779, "step": 2565 }, { "epoch": 1.650322994824058, "grad_norm": 1.4765625, "learning_rate": 5.7030601545265336e-05, "loss": 0.5851, "step": 2570 }, { "epoch": 1.6535328812743249, "grad_norm": 1.3359375, "learning_rate": 5.6904932209413276e-05, "loss": 0.5868, "step": 2575 }, { "epoch": 1.6567427677245918, "grad_norm": 1.3984375, "learning_rate": 5.6779327106506594e-05, "loss": 0.5722, "step": 2580 }, { "epoch": 1.6599526541748586, "grad_norm": 1.328125, "learning_rate": 5.665378767852704e-05, "loss": 0.5988, "step": 2585 }, { "epoch": 1.6631625406251254, "grad_norm": 1.25, "learning_rate": 5.652831536670242e-05, "loss": 0.5766, "step": 2590 }, { "epoch": 1.6663724270753923, "grad_norm": 1.3984375, "learning_rate": 5.640291161149e-05, "loss": 0.592, "step": 2595 }, { "epoch": 1.669582313525659, "grad_norm": 1.34375, "learning_rate": 5.627757785256006e-05, "loss": 0.5893, "step": 2600 }, { "epoch": 1.6727921999759259, "grad_norm": 1.359375, "learning_rate": 5.615231552877921e-05, "loss": 0.5747, "step": 2605 }, { "epoch": 1.6760020864261926, "grad_norm": 1.3125, "learning_rate": 5.602712607819404e-05, "loss": 0.5804, "step": 2610 }, { "epoch": 1.6792119728764594, "grad_norm": 1.3515625, "learning_rate": 5.590201093801449e-05, "loss": 0.5734, "step": 2615 }, { "epoch": 1.6824218593267264, "grad_norm": 1.21875, "learning_rate": 5.577697154459742e-05, "loss": 0.5708, "step": 2620 }, { "epoch": 1.6856317457769932, "grad_norm": 1.3359375, "learning_rate": 5.565200933343009e-05, "loss": 0.5863, "step": 2625 }, { "epoch": 1.68884163222726, "grad_norm": 1.2734375, "learning_rate": 5.5527125739113686e-05, "loss": 0.5846, "step": 2630 }, { "epoch": 1.692051518677527, "grad_norm": 1.28125, "learning_rate": 5.540232219534685e-05, "loss": 0.5533, "step": 2635 }, { "epoch": 1.6952614051277934, "grad_norm": 1.2890625, "learning_rate": 5.527760013490922e-05, "loss": 0.5916, "step": 2640 }, { "epoch": 1.6984712915780604, "grad_norm": 1.328125, "learning_rate": 5.515296098964499e-05, "loss": 0.5641, "step": 2645 }, { "epoch": 1.7016811780283274, "grad_norm": 1.2265625, "learning_rate": 5.502840619044645e-05, "loss": 0.5737, "step": 2650 }, { "epoch": 1.704891064478594, "grad_norm": 1.28125, "learning_rate": 5.490393716723757e-05, "loss": 0.5728, "step": 2655 }, { "epoch": 1.708100950928861, "grad_norm": 1.2265625, "learning_rate": 5.477955534895762e-05, "loss": 0.5614, "step": 2660 }, { "epoch": 1.7113108373791277, "grad_norm": 1.34375, "learning_rate": 5.465526216354471e-05, "loss": 0.5819, "step": 2665 }, { "epoch": 1.7145207238293945, "grad_norm": 1.203125, "learning_rate": 5.453105903791942e-05, "loss": 0.5709, "step": 2670 }, { "epoch": 1.7177306102796615, "grad_norm": 1.234375, "learning_rate": 5.44069473979684e-05, "loss": 0.5951, "step": 2675 }, { "epoch": 1.7209404967299282, "grad_norm": 1.2265625, "learning_rate": 5.428292866852808e-05, "loss": 0.5705, "step": 2680 }, { "epoch": 1.724150383180195, "grad_norm": 1.265625, "learning_rate": 5.4159004273368166e-05, "loss": 0.5787, "step": 2685 }, { "epoch": 1.727360269630462, "grad_norm": 1.2578125, "learning_rate": 5.4035175635175464e-05, "loss": 0.5832, "step": 2690 }, { "epoch": 1.7305701560807285, "grad_norm": 1.3046875, "learning_rate": 5.3911444175537394e-05, "loss": 0.5888, "step": 2695 }, { "epoch": 1.7337800425309955, "grad_norm": 1.3046875, "learning_rate": 5.3787811314925776e-05, "loss": 0.5695, "step": 2700 }, { "epoch": 1.7369899289812623, "grad_norm": 1.21875, "learning_rate": 5.3664278472680496e-05, "loss": 0.569, "step": 2705 }, { "epoch": 1.740199815431529, "grad_norm": 1.234375, "learning_rate": 5.3540847066993173e-05, "loss": 0.5853, "step": 2710 }, { "epoch": 1.743409701881796, "grad_norm": 1.25, "learning_rate": 5.341751851489091e-05, "loss": 0.589, "step": 2715 }, { "epoch": 1.7466195883320628, "grad_norm": 1.203125, "learning_rate": 5.329429423222003e-05, "loss": 0.5679, "step": 2720 }, { "epoch": 1.7498294747823295, "grad_norm": 1.2890625, "learning_rate": 5.3171175633629835e-05, "loss": 0.5823, "step": 2725 }, { "epoch": 1.7530393612325965, "grad_norm": 1.2265625, "learning_rate": 5.3048164132556285e-05, "loss": 0.5561, "step": 2730 }, { "epoch": 1.756249247682863, "grad_norm": 1.265625, "learning_rate": 5.292526114120589e-05, "loss": 0.5701, "step": 2735 }, { "epoch": 1.75945913413313, "grad_norm": 1.1953125, "learning_rate": 5.28024680705394e-05, "loss": 0.5779, "step": 2740 }, { "epoch": 1.762669020583397, "grad_norm": 1.2421875, "learning_rate": 5.267978633025568e-05, "loss": 0.5607, "step": 2745 }, { "epoch": 1.7658789070336636, "grad_norm": 1.171875, "learning_rate": 5.255721732877546e-05, "loss": 0.5862, "step": 2750 }, { "epoch": 1.7690887934839306, "grad_norm": 1.296875, "learning_rate": 5.243476247322521e-05, "loss": 0.5764, "step": 2755 }, { "epoch": 1.7722986799341973, "grad_norm": 1.296875, "learning_rate": 5.2312423169420955e-05, "loss": 0.5814, "step": 2760 }, { "epoch": 1.775508566384464, "grad_norm": 1.2890625, "learning_rate": 5.219020082185219e-05, "loss": 0.5808, "step": 2765 }, { "epoch": 1.778718452834731, "grad_norm": 1.2265625, "learning_rate": 5.206809683366569e-05, "loss": 0.58, "step": 2770 }, { "epoch": 1.7819283392849978, "grad_norm": 1.2265625, "learning_rate": 5.1946112606649435e-05, "loss": 0.5723, "step": 2775 }, { "epoch": 1.7851382257352646, "grad_norm": 1.2265625, "learning_rate": 5.182424954121652e-05, "loss": 0.5789, "step": 2780 }, { "epoch": 1.7883481121855316, "grad_norm": 1.2890625, "learning_rate": 5.170250903638909e-05, "loss": 0.5726, "step": 2785 }, { "epoch": 1.7915579986357981, "grad_norm": 1.1875, "learning_rate": 5.158089248978221e-05, "loss": 0.5718, "step": 2790 }, { "epoch": 1.794767885086065, "grad_norm": 1.28125, "learning_rate": 5.1459401297587916e-05, "loss": 0.5845, "step": 2795 }, { "epoch": 1.7979777715363319, "grad_norm": 1.203125, "learning_rate": 5.1338036854559113e-05, "loss": 0.563, "step": 2800 }, { "epoch": 1.8011876579865986, "grad_norm": 1.2421875, "learning_rate": 5.1216800553993606e-05, "loss": 0.5841, "step": 2805 }, { "epoch": 1.8043975444368656, "grad_norm": 1.2734375, "learning_rate": 5.109569378771808e-05, "loss": 0.5648, "step": 2810 }, { "epoch": 1.8076074308871324, "grad_norm": 1.2421875, "learning_rate": 5.097471794607214e-05, "loss": 0.5768, "step": 2815 }, { "epoch": 1.8108173173373991, "grad_norm": 1.2421875, "learning_rate": 5.0853874417892324e-05, "loss": 0.5596, "step": 2820 }, { "epoch": 1.8140272037876661, "grad_norm": 1.3671875, "learning_rate": 5.07331645904962e-05, "loss": 0.5873, "step": 2825 }, { "epoch": 1.8172370902379327, "grad_norm": 1.3125, "learning_rate": 5.061258984966636e-05, "loss": 0.5807, "step": 2830 }, { "epoch": 1.8204469766881997, "grad_norm": 1.3359375, "learning_rate": 5.049215157963464e-05, "loss": 0.5814, "step": 2835 }, { "epoch": 1.8236568631384664, "grad_norm": 1.4453125, "learning_rate": 5.03718511630661e-05, "loss": 0.5727, "step": 2840 }, { "epoch": 1.8268667495887332, "grad_norm": 1.2890625, "learning_rate": 5.025168998104322e-05, "loss": 0.5731, "step": 2845 }, { "epoch": 1.8300766360390002, "grad_norm": 1.25, "learning_rate": 5.013166941304999e-05, "loss": 0.5664, "step": 2850 }, { "epoch": 1.833286522489267, "grad_norm": 1.3046875, "learning_rate": 5.0011790836956197e-05, "loss": 0.5812, "step": 2855 }, { "epoch": 1.8364964089395337, "grad_norm": 1.21875, "learning_rate": 4.989205562900144e-05, "loss": 0.5715, "step": 2860 }, { "epoch": 1.8397062953898007, "grad_norm": 1.2265625, "learning_rate": 4.9772465163779474e-05, "loss": 0.5785, "step": 2865 }, { "epoch": 1.8429161818400674, "grad_norm": 1.1796875, "learning_rate": 4.9653020814222315e-05, "loss": 0.5813, "step": 2870 }, { "epoch": 1.8461260682903342, "grad_norm": 1.2265625, "learning_rate": 4.9533723951584554e-05, "loss": 0.59, "step": 2875 }, { "epoch": 1.8493359547406012, "grad_norm": 1.21875, "learning_rate": 4.94145759454276e-05, "loss": 0.565, "step": 2880 }, { "epoch": 1.8525458411908677, "grad_norm": 1.3046875, "learning_rate": 4.929557816360391e-05, "loss": 0.5839, "step": 2885 }, { "epoch": 1.8557557276411347, "grad_norm": 1.234375, "learning_rate": 4.9176731972241376e-05, "loss": 0.5755, "step": 2890 }, { "epoch": 1.8589656140914015, "grad_norm": 1.3125, "learning_rate": 4.905803873572755e-05, "loss": 0.571, "step": 2895 }, { "epoch": 1.8621755005416682, "grad_norm": 1.203125, "learning_rate": 4.8939499816694035e-05, "loss": 0.572, "step": 2900 }, { "epoch": 1.8653853869919352, "grad_norm": 1.234375, "learning_rate": 4.882111657600081e-05, "loss": 0.5559, "step": 2905 }, { "epoch": 1.868595273442202, "grad_norm": 1.234375, "learning_rate": 4.8702890372720664e-05, "loss": 0.5792, "step": 2910 }, { "epoch": 1.8718051598924688, "grad_norm": 1.2265625, "learning_rate": 4.85848225641235e-05, "loss": 0.5611, "step": 2915 }, { "epoch": 1.8750150463427357, "grad_norm": 1.2421875, "learning_rate": 4.8466914505660834e-05, "loss": 0.5663, "step": 2920 }, { "epoch": 1.8782249327930023, "grad_norm": 1.3828125, "learning_rate": 4.834916755095022e-05, "loss": 0.5914, "step": 2925 }, { "epoch": 1.8814348192432693, "grad_norm": 1.203125, "learning_rate": 4.823158305175967e-05, "loss": 0.5712, "step": 2930 }, { "epoch": 1.884644705693536, "grad_norm": 1.265625, "learning_rate": 4.811416235799216e-05, "loss": 0.5957, "step": 2935 }, { "epoch": 1.8878545921438028, "grad_norm": 1.375, "learning_rate": 4.7996906817670155e-05, "loss": 0.5872, "step": 2940 }, { "epoch": 1.8910644785940698, "grad_norm": 1.34375, "learning_rate": 4.78798177769201e-05, "loss": 0.5604, "step": 2945 }, { "epoch": 1.8942743650443365, "grad_norm": 1.359375, "learning_rate": 4.7762896579956966e-05, "loss": 0.556, "step": 2950 }, { "epoch": 1.8974842514946033, "grad_norm": 1.2734375, "learning_rate": 4.764614456906886e-05, "loss": 0.5577, "step": 2955 }, { "epoch": 1.9006941379448703, "grad_norm": 1.1484375, "learning_rate": 4.752956308460155e-05, "loss": 0.584, "step": 2960 }, { "epoch": 1.903904024395137, "grad_norm": 1.296875, "learning_rate": 4.741315346494314e-05, "loss": 0.5625, "step": 2965 }, { "epoch": 1.9071139108454038, "grad_norm": 1.2734375, "learning_rate": 4.729691704650867e-05, "loss": 0.5684, "step": 2970 }, { "epoch": 1.9103237972956708, "grad_norm": 1.3359375, "learning_rate": 4.718085516372478e-05, "loss": 0.5851, "step": 2975 }, { "epoch": 1.9135336837459374, "grad_norm": 1.1875, "learning_rate": 4.70649691490144e-05, "loss": 0.5637, "step": 2980 }, { "epoch": 1.9167435701962043, "grad_norm": 1.296875, "learning_rate": 4.694926033278142e-05, "loss": 0.5792, "step": 2985 }, { "epoch": 1.919953456646471, "grad_norm": 1.203125, "learning_rate": 4.683373004339547e-05, "loss": 0.5406, "step": 2990 }, { "epoch": 1.9231633430967379, "grad_norm": 1.34375, "learning_rate": 4.6718379607176634e-05, "loss": 0.5777, "step": 2995 }, { "epoch": 1.9263732295470049, "grad_norm": 1.25, "learning_rate": 4.6603210348380235e-05, "loss": 0.5742, "step": 3000 }, { "epoch": 1.9263732295470049, "eval_loss": 0.48648878931999207, "eval_runtime": 2.4037, "eval_samples_per_second": 83.204, "eval_steps_per_second": 83.204, "step": 3000 }, { "epoch": 1.9295831159972716, "grad_norm": 1.2265625, "learning_rate": 4.64882235891816e-05, "loss": 0.5662, "step": 3005 }, { "epoch": 1.9327930024475384, "grad_norm": 1.2890625, "learning_rate": 4.637342064966095e-05, "loss": 0.5972, "step": 3010 }, { "epoch": 1.9360028888978054, "grad_norm": 1.2265625, "learning_rate": 4.625880284778818e-05, "loss": 0.5682, "step": 3015 }, { "epoch": 1.939212775348072, "grad_norm": 1.2578125, "learning_rate": 4.614437149940776e-05, "loss": 0.5703, "step": 3020 }, { "epoch": 1.942422661798339, "grad_norm": 1.3203125, "learning_rate": 4.603012791822362e-05, "loss": 0.5611, "step": 3025 }, { "epoch": 1.9456325482486057, "grad_norm": 1.21875, "learning_rate": 4.591607341578407e-05, "loss": 0.5471, "step": 3030 }, { "epoch": 1.9488424346988724, "grad_norm": 1.1640625, "learning_rate": 4.580220930146675e-05, "loss": 0.5398, "step": 3035 }, { "epoch": 1.9520523211491394, "grad_norm": 1.3203125, "learning_rate": 4.568853688246357e-05, "loss": 0.5864, "step": 3040 }, { "epoch": 1.9552622075994062, "grad_norm": 1.234375, "learning_rate": 4.557505746376576e-05, "loss": 0.5662, "step": 3045 }, { "epoch": 1.958472094049673, "grad_norm": 1.25, "learning_rate": 4.546177234814881e-05, "loss": 0.5745, "step": 3050 }, { "epoch": 1.96168198049994, "grad_norm": 1.1875, "learning_rate": 4.53486828361576e-05, "loss": 0.5486, "step": 3055 }, { "epoch": 1.9648918669502067, "grad_norm": 1.2265625, "learning_rate": 4.523579022609139e-05, "loss": 0.5703, "step": 3060 }, { "epoch": 1.9681017534004734, "grad_norm": 1.3125, "learning_rate": 4.512309581398896e-05, "loss": 0.5627, "step": 3065 }, { "epoch": 1.9713116398507404, "grad_norm": 1.296875, "learning_rate": 4.5010600893613714e-05, "loss": 0.5839, "step": 3070 }, { "epoch": 1.974521526301007, "grad_norm": 1.2421875, "learning_rate": 4.489830675643888e-05, "loss": 0.5638, "step": 3075 }, { "epoch": 1.977731412751274, "grad_norm": 1.2578125, "learning_rate": 4.478621469163259e-05, "loss": 0.5709, "step": 3080 }, { "epoch": 1.9809412992015407, "grad_norm": 1.2421875, "learning_rate": 4.4674325986043145e-05, "loss": 0.558, "step": 3085 }, { "epoch": 1.9841511856518075, "grad_norm": 1.1953125, "learning_rate": 4.456264192418422e-05, "loss": 0.5639, "step": 3090 }, { "epoch": 1.9873610721020745, "grad_norm": 1.25, "learning_rate": 4.445116378822014e-05, "loss": 0.5742, "step": 3095 }, { "epoch": 1.9905709585523412, "grad_norm": 1.25, "learning_rate": 4.433989285795112e-05, "loss": 0.5653, "step": 3100 }, { "epoch": 1.993780845002608, "grad_norm": 1.234375, "learning_rate": 4.4228830410798594e-05, "loss": 0.581, "step": 3105 }, { "epoch": 1.996990731452875, "grad_norm": 1.1640625, "learning_rate": 4.411797772179059e-05, "loss": 0.5658, "step": 3110 }, { "epoch": 1.9995586406130883, "eval_loss": 0.48290687799453735, "eval_runtime": 2.4097, "eval_samples_per_second": 82.996, "eval_steps_per_second": 82.996, "step": 3114 }, { "epoch": 2.000641977290053, "grad_norm": 3.296875, "learning_rate": 4.4007336063547e-05, "loss": 0.6695, "step": 3115 }, { "epoch": 2.00385186374032, "grad_norm": 1.1796875, "learning_rate": 4.389690670626507e-05, "loss": 0.5518, "step": 3120 }, { "epoch": 2.007061750190587, "grad_norm": 1.25, "learning_rate": 4.378669091770474e-05, "loss": 0.5527, "step": 3125 }, { "epoch": 2.0102716366408537, "grad_norm": 1.34375, "learning_rate": 4.367668996317413e-05, "loss": 0.5517, "step": 3130 }, { "epoch": 2.0134815230911207, "grad_norm": 1.3046875, "learning_rate": 4.3566905105515035e-05, "loss": 0.5451, "step": 3135 }, { "epoch": 2.0166914095413873, "grad_norm": 1.25, "learning_rate": 4.345733760508832e-05, "loss": 0.5342, "step": 3140 }, { "epoch": 2.0199012959916542, "grad_norm": 1.21875, "learning_rate": 4.334798871975963e-05, "loss": 0.5445, "step": 3145 }, { "epoch": 2.0231111824419212, "grad_norm": 1.15625, "learning_rate": 4.3238859704884784e-05, "loss": 0.5442, "step": 3150 }, { "epoch": 2.0263210688921878, "grad_norm": 1.203125, "learning_rate": 4.312995181329543e-05, "loss": 0.5367, "step": 3155 }, { "epoch": 2.0295309553424548, "grad_norm": 1.2265625, "learning_rate": 4.3021266295284665e-05, "loss": 0.5466, "step": 3160 }, { "epoch": 2.0327408417927217, "grad_norm": 1.1953125, "learning_rate": 4.291280439859269e-05, "loss": 0.5709, "step": 3165 }, { "epoch": 2.0359507282429883, "grad_norm": 1.2421875, "learning_rate": 4.280456736839245e-05, "loss": 0.5409, "step": 3170 }, { "epoch": 2.0391606146932553, "grad_norm": 1.2265625, "learning_rate": 4.269655644727536e-05, "loss": 0.5526, "step": 3175 }, { "epoch": 2.0423705011435223, "grad_norm": 1.2578125, "learning_rate": 4.258877287523707e-05, "loss": 0.539, "step": 3180 }, { "epoch": 2.045580387593789, "grad_norm": 1.1796875, "learning_rate": 4.2481217889663156e-05, "loss": 0.5503, "step": 3185 }, { "epoch": 2.048790274044056, "grad_norm": 1.1875, "learning_rate": 4.237389272531499e-05, "loss": 0.5537, "step": 3190 }, { "epoch": 2.0520001604943223, "grad_norm": 1.2578125, "learning_rate": 4.2266798614315505e-05, "loss": 0.544, "step": 3195 }, { "epoch": 2.0552100469445893, "grad_norm": 1.3671875, "learning_rate": 4.2159936786135115e-05, "loss": 0.5358, "step": 3200 }, { "epoch": 2.0584199333948563, "grad_norm": 1.2578125, "learning_rate": 4.2053308467577516e-05, "loss": 0.5185, "step": 3205 }, { "epoch": 2.061629819845123, "grad_norm": 1.2109375, "learning_rate": 4.1946914882765684e-05, "loss": 0.5666, "step": 3210 }, { "epoch": 2.06483970629539, "grad_norm": 1.1953125, "learning_rate": 4.184075725312776e-05, "loss": 0.5325, "step": 3215 }, { "epoch": 2.068049592745657, "grad_norm": 1.25, "learning_rate": 4.173483679738309e-05, "loss": 0.5484, "step": 3220 }, { "epoch": 2.0712594791959233, "grad_norm": 1.2890625, "learning_rate": 4.162915473152816e-05, "loss": 0.5483, "step": 3225 }, { "epoch": 2.0744693656461903, "grad_norm": 1.28125, "learning_rate": 4.152371226882268e-05, "loss": 0.5411, "step": 3230 }, { "epoch": 2.077679252096457, "grad_norm": 1.2578125, "learning_rate": 4.141851061977565e-05, "loss": 0.5503, "step": 3235 }, { "epoch": 2.080889138546724, "grad_norm": 1.140625, "learning_rate": 4.131355099213149e-05, "loss": 0.552, "step": 3240 }, { "epoch": 2.084099024996991, "grad_norm": 1.203125, "learning_rate": 4.120883459085611e-05, "loss": 0.5297, "step": 3245 }, { "epoch": 2.0873089114472574, "grad_norm": 1.2421875, "learning_rate": 4.110436261812313e-05, "loss": 0.5324, "step": 3250 }, { "epoch": 2.0905187978975244, "grad_norm": 1.1484375, "learning_rate": 4.100013627330006e-05, "loss": 0.5355, "step": 3255 }, { "epoch": 2.0937286843477914, "grad_norm": 1.234375, "learning_rate": 4.089615675293452e-05, "loss": 0.5508, "step": 3260 }, { "epoch": 2.096938570798058, "grad_norm": 1.25, "learning_rate": 4.0792425250740544e-05, "loss": 0.5185, "step": 3265 }, { "epoch": 2.100148457248325, "grad_norm": 1.2890625, "learning_rate": 4.0688942957584825e-05, "loss": 0.5783, "step": 3270 }, { "epoch": 2.103358343698592, "grad_norm": 1.2578125, "learning_rate": 4.058571106147307e-05, "loss": 0.5403, "step": 3275 }, { "epoch": 2.1065682301488584, "grad_norm": 1.3359375, "learning_rate": 4.048273074753637e-05, "loss": 0.5358, "step": 3280 }, { "epoch": 2.1097781165991254, "grad_norm": 1.296875, "learning_rate": 4.038000319801756e-05, "loss": 0.5203, "step": 3285 }, { "epoch": 2.112988003049392, "grad_norm": 1.234375, "learning_rate": 4.0277529592257676e-05, "loss": 0.5501, "step": 3290 }, { "epoch": 2.116197889499659, "grad_norm": 1.234375, "learning_rate": 4.017531110668244e-05, "loss": 0.5677, "step": 3295 }, { "epoch": 2.119407775949926, "grad_norm": 1.2265625, "learning_rate": 4.0073348914788684e-05, "loss": 0.536, "step": 3300 }, { "epoch": 2.1226176624001925, "grad_norm": 1.265625, "learning_rate": 3.997164418713093e-05, "loss": 0.553, "step": 3305 }, { "epoch": 2.1258275488504594, "grad_norm": 1.3515625, "learning_rate": 3.987019809130794e-05, "loss": 0.5614, "step": 3310 }, { "epoch": 2.1290374353007264, "grad_norm": 1.25, "learning_rate": 3.9769011791949305e-05, "loss": 0.5337, "step": 3315 }, { "epoch": 2.132247321750993, "grad_norm": 1.1640625, "learning_rate": 3.9668086450702086e-05, "loss": 0.5257, "step": 3320 }, { "epoch": 2.13545720820126, "grad_norm": 1.2109375, "learning_rate": 3.956742322621747e-05, "loss": 0.5379, "step": 3325 }, { "epoch": 2.1386670946515265, "grad_norm": 1.2578125, "learning_rate": 3.946702327413746e-05, "loss": 0.5356, "step": 3330 }, { "epoch": 2.1418769811017935, "grad_norm": 1.3203125, "learning_rate": 3.936688774708163e-05, "loss": 0.5343, "step": 3335 }, { "epoch": 2.1450868675520605, "grad_norm": 1.265625, "learning_rate": 3.926701779463389e-05, "loss": 0.5452, "step": 3340 }, { "epoch": 2.148296754002327, "grad_norm": 1.171875, "learning_rate": 3.916741456332926e-05, "loss": 0.5443, "step": 3345 }, { "epoch": 2.151506640452594, "grad_norm": 1.28125, "learning_rate": 3.906807919664073e-05, "loss": 0.5368, "step": 3350 }, { "epoch": 2.154716526902861, "grad_norm": 1.1953125, "learning_rate": 3.8969012834966135e-05, "loss": 0.5436, "step": 3355 }, { "epoch": 2.1579264133531275, "grad_norm": 1.21875, "learning_rate": 3.8870216615615045e-05, "loss": 0.5238, "step": 3360 }, { "epoch": 2.1611362998033945, "grad_norm": 1.2734375, "learning_rate": 3.877169167279575e-05, "loss": 0.5483, "step": 3365 }, { "epoch": 2.1643461862536615, "grad_norm": 1.296875, "learning_rate": 3.867343913760218e-05, "loss": 0.5313, "step": 3370 }, { "epoch": 2.167556072703928, "grad_norm": 1.2109375, "learning_rate": 3.857546013800095e-05, "loss": 0.539, "step": 3375 }, { "epoch": 2.170765959154195, "grad_norm": 1.328125, "learning_rate": 3.847775579881844e-05, "loss": 0.5385, "step": 3380 }, { "epoch": 2.1739758456044616, "grad_norm": 1.2578125, "learning_rate": 3.8380327241727804e-05, "loss": 0.5496, "step": 3385 }, { "epoch": 2.1771857320547285, "grad_norm": 1.203125, "learning_rate": 3.828317558523619e-05, "loss": 0.545, "step": 3390 }, { "epoch": 2.1803956185049955, "grad_norm": 1.2265625, "learning_rate": 3.818630194467181e-05, "loss": 0.5343, "step": 3395 }, { "epoch": 2.183605504955262, "grad_norm": 1.3828125, "learning_rate": 3.8089707432171193e-05, "loss": 0.5325, "step": 3400 }, { "epoch": 2.186815391405529, "grad_norm": 1.28125, "learning_rate": 3.799339315666641e-05, "loss": 0.547, "step": 3405 }, { "epoch": 2.190025277855796, "grad_norm": 1.296875, "learning_rate": 3.789736022387231e-05, "loss": 0.5448, "step": 3410 }, { "epoch": 2.1932351643060626, "grad_norm": 1.2734375, "learning_rate": 3.780160973627386e-05, "loss": 0.5431, "step": 3415 }, { "epoch": 2.1964450507563296, "grad_norm": 1.21875, "learning_rate": 3.770614279311348e-05, "loss": 0.5599, "step": 3420 }, { "epoch": 2.1996549372065965, "grad_norm": 1.203125, "learning_rate": 3.7610960490378415e-05, "loss": 0.5474, "step": 3425 }, { "epoch": 2.202864823656863, "grad_norm": 1.28125, "learning_rate": 3.751606392078816e-05, "loss": 0.5688, "step": 3430 }, { "epoch": 2.20607471010713, "grad_norm": 1.2578125, "learning_rate": 3.74214541737819e-05, "loss": 0.5326, "step": 3435 }, { "epoch": 2.2092845965573966, "grad_norm": 1.2421875, "learning_rate": 3.732713233550606e-05, "loss": 0.5303, "step": 3440 }, { "epoch": 2.2124944830076636, "grad_norm": 1.2734375, "learning_rate": 3.723309948880176e-05, "loss": 0.5402, "step": 3445 }, { "epoch": 2.2157043694579306, "grad_norm": 1.265625, "learning_rate": 3.713935671319239e-05, "loss": 0.5268, "step": 3450 }, { "epoch": 2.218914255908197, "grad_norm": 1.2734375, "learning_rate": 3.704590508487129e-05, "loss": 0.5613, "step": 3455 }, { "epoch": 2.222124142358464, "grad_norm": 1.3125, "learning_rate": 3.695274567668933e-05, "loss": 0.5533, "step": 3460 }, { "epoch": 2.2253340288087307, "grad_norm": 1.2109375, "learning_rate": 3.6859879558142594e-05, "loss": 0.5403, "step": 3465 }, { "epoch": 2.2285439152589976, "grad_norm": 1.234375, "learning_rate": 3.6767307795360145e-05, "loss": 0.5304, "step": 3470 }, { "epoch": 2.2317538017092646, "grad_norm": 1.1953125, "learning_rate": 3.6675031451091755e-05, "loss": 0.5323, "step": 3475 }, { "epoch": 2.234963688159531, "grad_norm": 1.2578125, "learning_rate": 3.65830515846957e-05, "loss": 0.5299, "step": 3480 }, { "epoch": 2.238173574609798, "grad_norm": 1.1875, "learning_rate": 3.64913692521266e-05, "loss": 0.5645, "step": 3485 }, { "epoch": 2.241383461060065, "grad_norm": 1.375, "learning_rate": 3.6399985505923295e-05, "loss": 0.5453, "step": 3490 }, { "epoch": 2.2445933475103317, "grad_norm": 1.25, "learning_rate": 3.6308901395196825e-05, "loss": 0.5387, "step": 3495 }, { "epoch": 2.2478032339605987, "grad_norm": 1.21875, "learning_rate": 3.621811796561827e-05, "loss": 0.5512, "step": 3500 }, { "epoch": 2.2478032339605987, "eval_loss": 0.4768131375312805, "eval_runtime": 2.3764, "eval_samples_per_second": 84.16, "eval_steps_per_second": 84.16, "step": 3500 }, { "epoch": 2.2510131204108657, "grad_norm": 1.2578125, "learning_rate": 3.6127636259406837e-05, "loss": 0.555, "step": 3505 }, { "epoch": 2.254223006861132, "grad_norm": 1.2109375, "learning_rate": 3.6037457315317844e-05, "loss": 0.5454, "step": 3510 }, { "epoch": 2.257432893311399, "grad_norm": 1.25, "learning_rate": 3.5947582168630855e-05, "loss": 0.535, "step": 3515 }, { "epoch": 2.2606427797616657, "grad_norm": 1.21875, "learning_rate": 3.585801185113771e-05, "loss": 0.5461, "step": 3520 }, { "epoch": 2.2638526662119327, "grad_norm": 1.1328125, "learning_rate": 3.576874739113073e-05, "loss": 0.527, "step": 3525 }, { "epoch": 2.2670625526621997, "grad_norm": 1.34375, "learning_rate": 3.567978981339095e-05, "loss": 0.5364, "step": 3530 }, { "epoch": 2.2702724391124662, "grad_norm": 1.234375, "learning_rate": 3.559114013917624e-05, "loss": 0.5366, "step": 3535 }, { "epoch": 2.273482325562733, "grad_norm": 1.2890625, "learning_rate": 3.5502799386209726e-05, "loss": 0.5386, "step": 3540 }, { "epoch": 2.276692212013, "grad_norm": 1.125, "learning_rate": 3.5414768568667974e-05, "loss": 0.5391, "step": 3545 }, { "epoch": 2.2799020984632667, "grad_norm": 1.2109375, "learning_rate": 3.532704869716943e-05, "loss": 0.5342, "step": 3550 }, { "epoch": 2.2831119849135337, "grad_norm": 1.21875, "learning_rate": 3.523964077876279e-05, "loss": 0.5506, "step": 3555 }, { "epoch": 2.2863218713638007, "grad_norm": 1.2578125, "learning_rate": 3.5152545816915446e-05, "loss": 0.561, "step": 3560 }, { "epoch": 2.2895317578140673, "grad_norm": 1.2734375, "learning_rate": 3.506576481150194e-05, "loss": 0.5429, "step": 3565 }, { "epoch": 2.2927416442643342, "grad_norm": 1.2109375, "learning_rate": 3.497929875879254e-05, "loss": 0.5374, "step": 3570 }, { "epoch": 2.295951530714601, "grad_norm": 1.2265625, "learning_rate": 3.4893148651441735e-05, "loss": 0.5634, "step": 3575 }, { "epoch": 2.2991614171648678, "grad_norm": 1.2734375, "learning_rate": 3.480731547847688e-05, "loss": 0.5394, "step": 3580 }, { "epoch": 2.3023713036151348, "grad_norm": 1.2109375, "learning_rate": 3.472180022528686e-05, "loss": 0.5342, "step": 3585 }, { "epoch": 2.3055811900654013, "grad_norm": 1.1953125, "learning_rate": 3.4636603873610735e-05, "loss": 0.547, "step": 3590 }, { "epoch": 2.3087910765156683, "grad_norm": 1.1953125, "learning_rate": 3.455172740152648e-05, "loss": 0.5421, "step": 3595 }, { "epoch": 2.3120009629659353, "grad_norm": 1.2890625, "learning_rate": 3.446717178343976e-05, "loss": 0.5562, "step": 3600 }, { "epoch": 2.315210849416202, "grad_norm": 1.40625, "learning_rate": 3.438293799007276e-05, "loss": 0.5358, "step": 3605 }, { "epoch": 2.318420735866469, "grad_norm": 1.2578125, "learning_rate": 3.429902698845302e-05, "loss": 0.5555, "step": 3610 }, { "epoch": 2.321630622316736, "grad_norm": 1.1953125, "learning_rate": 3.421543974190234e-05, "loss": 0.5414, "step": 3615 }, { "epoch": 2.3248405087670023, "grad_norm": 1.2734375, "learning_rate": 3.4132177210025724e-05, "loss": 0.5336, "step": 3620 }, { "epoch": 2.3280503952172693, "grad_norm": 1.3359375, "learning_rate": 3.404924034870036e-05, "loss": 0.5351, "step": 3625 }, { "epoch": 2.331260281667536, "grad_norm": 1.28125, "learning_rate": 3.396663011006465e-05, "loss": 0.5523, "step": 3630 }, { "epoch": 2.334470168117803, "grad_norm": 1.203125, "learning_rate": 3.388434744250726e-05, "loss": 0.5347, "step": 3635 }, { "epoch": 2.33768005456807, "grad_norm": 1.203125, "learning_rate": 3.3802393290656274e-05, "loss": 0.5387, "step": 3640 }, { "epoch": 2.3408899410183364, "grad_norm": 1.2109375, "learning_rate": 3.372076859536831e-05, "loss": 0.5309, "step": 3645 }, { "epoch": 2.3440998274686033, "grad_norm": 1.296875, "learning_rate": 3.363947429371772e-05, "loss": 0.5531, "step": 3650 }, { "epoch": 2.34730971391887, "grad_norm": 1.2109375, "learning_rate": 3.355851131898585e-05, "loss": 0.5437, "step": 3655 }, { "epoch": 2.350519600369137, "grad_norm": 1.1953125, "learning_rate": 3.347788060065036e-05, "loss": 0.5143, "step": 3660 }, { "epoch": 2.353729486819404, "grad_norm": 1.234375, "learning_rate": 3.339758306437445e-05, "loss": 0.532, "step": 3665 }, { "epoch": 2.3569393732696704, "grad_norm": 1.234375, "learning_rate": 3.331761963199634e-05, "loss": 0.5535, "step": 3670 }, { "epoch": 2.3601492597199374, "grad_norm": 1.15625, "learning_rate": 3.3237991221518636e-05, "loss": 0.5384, "step": 3675 }, { "epoch": 2.3633591461702044, "grad_norm": 1.296875, "learning_rate": 3.3158698747097784e-05, "loss": 0.5444, "step": 3680 }, { "epoch": 2.366569032620471, "grad_norm": 1.21875, "learning_rate": 3.30797431190336e-05, "loss": 0.5392, "step": 3685 }, { "epoch": 2.369778919070738, "grad_norm": 1.2265625, "learning_rate": 3.300112524375881e-05, "loss": 0.5505, "step": 3690 }, { "epoch": 2.372988805521005, "grad_norm": 1.2578125, "learning_rate": 3.2922846023828645e-05, "loss": 0.5432, "step": 3695 }, { "epoch": 2.3761986919712714, "grad_norm": 1.265625, "learning_rate": 3.2844906357910476e-05, "loss": 0.5294, "step": 3700 }, { "epoch": 2.3794085784215384, "grad_norm": 1.2578125, "learning_rate": 3.2767307140773494e-05, "loss": 0.5619, "step": 3705 }, { "epoch": 2.382618464871805, "grad_norm": 1.1875, "learning_rate": 3.2690049263278455e-05, "loss": 0.5422, "step": 3710 }, { "epoch": 2.385828351322072, "grad_norm": 1.3203125, "learning_rate": 3.261313361236743e-05, "loss": 0.5413, "step": 3715 }, { "epoch": 2.389038237772339, "grad_norm": 1.234375, "learning_rate": 3.253656107105362e-05, "loss": 0.535, "step": 3720 }, { "epoch": 2.3922481242226055, "grad_norm": 1.1953125, "learning_rate": 3.246033251841126e-05, "loss": 0.5228, "step": 3725 }, { "epoch": 2.3954580106728725, "grad_norm": 1.21875, "learning_rate": 3.238444882956548e-05, "loss": 0.5378, "step": 3730 }, { "epoch": 2.3986678971231394, "grad_norm": 1.28125, "learning_rate": 3.230891087568229e-05, "loss": 0.5469, "step": 3735 }, { "epoch": 2.401877783573406, "grad_norm": 1.21875, "learning_rate": 3.2233719523958563e-05, "loss": 0.5509, "step": 3740 }, { "epoch": 2.405087670023673, "grad_norm": 1.2109375, "learning_rate": 3.2158875637612053e-05, "loss": 0.5212, "step": 3745 }, { "epoch": 2.40829755647394, "grad_norm": 1.1640625, "learning_rate": 3.208438007587156e-05, "loss": 0.5221, "step": 3750 }, { "epoch": 2.4115074429242065, "grad_norm": 1.2109375, "learning_rate": 3.201023369396699e-05, "loss": 0.5311, "step": 3755 }, { "epoch": 2.4147173293744735, "grad_norm": 1.15625, "learning_rate": 3.193643734311958e-05, "loss": 0.5403, "step": 3760 }, { "epoch": 2.41792721582474, "grad_norm": 1.234375, "learning_rate": 3.1862991870532106e-05, "loss": 0.548, "step": 3765 }, { "epoch": 2.421137102275007, "grad_norm": 1.2734375, "learning_rate": 3.1789898119379156e-05, "loss": 0.5466, "step": 3770 }, { "epoch": 2.424346988725274, "grad_norm": 1.2578125, "learning_rate": 3.171715692879748e-05, "loss": 0.5336, "step": 3775 }, { "epoch": 2.4275568751755405, "grad_norm": 1.1875, "learning_rate": 3.164476913387631e-05, "loss": 0.5341, "step": 3780 }, { "epoch": 2.4307667616258075, "grad_norm": 1.2578125, "learning_rate": 3.1572735565647815e-05, "loss": 0.5335, "step": 3785 }, { "epoch": 2.4339766480760745, "grad_norm": 1.1640625, "learning_rate": 3.1501057051077535e-05, "loss": 0.5309, "step": 3790 }, { "epoch": 2.437186534526341, "grad_norm": 1.328125, "learning_rate": 3.142973441305488e-05, "loss": 0.5451, "step": 3795 }, { "epoch": 2.440396420976608, "grad_norm": 1.1484375, "learning_rate": 3.135876847038371e-05, "loss": 0.5381, "step": 3800 }, { "epoch": 2.443606307426875, "grad_norm": 1.2109375, "learning_rate": 3.1288160037772953e-05, "loss": 0.5474, "step": 3805 }, { "epoch": 2.4468161938771416, "grad_norm": 1.28125, "learning_rate": 3.121790992582717e-05, "loss": 0.5424, "step": 3810 }, { "epoch": 2.4500260803274085, "grad_norm": 1.2578125, "learning_rate": 3.1148018941037324e-05, "loss": 0.5475, "step": 3815 }, { "epoch": 2.453235966777675, "grad_norm": 1.2265625, "learning_rate": 3.10784878857715e-05, "loss": 0.5341, "step": 3820 }, { "epoch": 2.456445853227942, "grad_norm": 1.203125, "learning_rate": 3.100931755826569e-05, "loss": 0.5365, "step": 3825 }, { "epoch": 2.459655739678209, "grad_norm": 1.234375, "learning_rate": 3.094050875261462e-05, "loss": 0.5628, "step": 3830 }, { "epoch": 2.4628656261284756, "grad_norm": 1.1875, "learning_rate": 3.087206225876266e-05, "loss": 0.54, "step": 3835 }, { "epoch": 2.4660755125787426, "grad_norm": 1.296875, "learning_rate": 3.080397886249472e-05, "loss": 0.5375, "step": 3840 }, { "epoch": 2.469285399029009, "grad_norm": 1.2109375, "learning_rate": 3.073625934542727e-05, "loss": 0.5427, "step": 3845 }, { "epoch": 2.472495285479276, "grad_norm": 1.3828125, "learning_rate": 3.0668904484999334e-05, "loss": 0.5511, "step": 3850 }, { "epoch": 2.475705171929543, "grad_norm": 1.2421875, "learning_rate": 3.060191505446357e-05, "loss": 0.5377, "step": 3855 }, { "epoch": 2.4789150583798096, "grad_norm": 1.2265625, "learning_rate": 3.0535291822877405e-05, "loss": 0.533, "step": 3860 }, { "epoch": 2.4821249448300766, "grad_norm": 1.1640625, "learning_rate": 3.0469035555094194e-05, "loss": 0.5372, "step": 3865 }, { "epoch": 2.4853348312803436, "grad_norm": 1.21875, "learning_rate": 3.040314701175445e-05, "loss": 0.544, "step": 3870 }, { "epoch": 2.48854471773061, "grad_norm": 1.25, "learning_rate": 3.0337626949277105e-05, "loss": 0.5307, "step": 3875 }, { "epoch": 2.491754604180877, "grad_norm": 1.2265625, "learning_rate": 3.0272476119850835e-05, "loss": 0.5482, "step": 3880 }, { "epoch": 2.494964490631144, "grad_norm": 1.3046875, "learning_rate": 3.020769527142541e-05, "loss": 0.5412, "step": 3885 }, { "epoch": 2.4981743770814107, "grad_norm": 1.2265625, "learning_rate": 3.0143285147703114e-05, "loss": 0.5554, "step": 3890 }, { "epoch": 2.5013842635316776, "grad_norm": 1.3046875, "learning_rate": 3.0079246488130197e-05, "loss": 0.5369, "step": 3895 }, { "epoch": 2.504594149981944, "grad_norm": 1.28125, "learning_rate": 3.0015580027888424e-05, "loss": 0.5504, "step": 3900 }, { "epoch": 2.507804036432211, "grad_norm": 1.2578125, "learning_rate": 2.9952286497886572e-05, "loss": 0.5287, "step": 3905 }, { "epoch": 2.511013922882478, "grad_norm": 1.234375, "learning_rate": 2.9889366624752118e-05, "loss": 0.5553, "step": 3910 }, { "epoch": 2.5142238093327447, "grad_norm": 1.2578125, "learning_rate": 2.9826821130822807e-05, "loss": 0.5343, "step": 3915 }, { "epoch": 2.5174336957830117, "grad_norm": 1.25, "learning_rate": 2.9764650734138434e-05, "loss": 0.5326, "step": 3920 }, { "epoch": 2.5206435822332782, "grad_norm": 1.234375, "learning_rate": 2.9702856148432573e-05, "loss": 0.5366, "step": 3925 }, { "epoch": 2.523853468683545, "grad_norm": 1.3671875, "learning_rate": 2.9641438083124372e-05, "loss": 0.5335, "step": 3930 }, { "epoch": 2.527063355133812, "grad_norm": 1.1484375, "learning_rate": 2.958039724331042e-05, "loss": 0.518, "step": 3935 }, { "epoch": 2.530273241584079, "grad_norm": 1.296875, "learning_rate": 2.9519734329756666e-05, "loss": 0.5379, "step": 3940 }, { "epoch": 2.5334831280343457, "grad_norm": 1.203125, "learning_rate": 2.9459450038890333e-05, "loss": 0.5287, "step": 3945 }, { "epoch": 2.5366930144846127, "grad_norm": 1.234375, "learning_rate": 2.9399545062791967e-05, "loss": 0.5245, "step": 3950 }, { "epoch": 2.5399029009348792, "grad_norm": 1.171875, "learning_rate": 2.9340020089187492e-05, "loss": 0.541, "step": 3955 }, { "epoch": 2.5431127873851462, "grad_norm": 1.25, "learning_rate": 2.928087580144026e-05, "loss": 0.5299, "step": 3960 }, { "epoch": 2.546322673835413, "grad_norm": 1.1875, "learning_rate": 2.9222112878543273e-05, "loss": 0.527, "step": 3965 }, { "epoch": 2.5495325602856798, "grad_norm": 1.234375, "learning_rate": 2.9163731995111333e-05, "loss": 0.5581, "step": 3970 }, { "epoch": 2.5527424467359467, "grad_norm": 1.2109375, "learning_rate": 2.9105733821373333e-05, "loss": 0.5499, "step": 3975 }, { "epoch": 2.5559523331862133, "grad_norm": 1.25, "learning_rate": 2.9048119023164555e-05, "loss": 0.5265, "step": 3980 }, { "epoch": 2.5591622196364803, "grad_norm": 1.1640625, "learning_rate": 2.8990888261919024e-05, "loss": 0.5433, "step": 3985 }, { "epoch": 2.5623721060867473, "grad_norm": 1.2265625, "learning_rate": 2.8934042194661913e-05, "loss": 0.5503, "step": 3990 }, { "epoch": 2.5655819925370142, "grad_norm": 1.265625, "learning_rate": 2.8877581474001986e-05, "loss": 0.5327, "step": 3995 }, { "epoch": 2.568791878987281, "grad_norm": 1.2578125, "learning_rate": 2.8821506748124132e-05, "loss": 0.5499, "step": 4000 }, { "epoch": 2.568791878987281, "eval_loss": 0.4683253765106201, "eval_runtime": 2.4022, "eval_samples_per_second": 83.257, "eval_steps_per_second": 83.257, "step": 4000 }, { "epoch": 2.5720017654375478, "grad_norm": 1.34375, "learning_rate": 2.8765818660781912e-05, "loss": 0.5244, "step": 4005 }, { "epoch": 2.5752116518878143, "grad_norm": 1.296875, "learning_rate": 2.8710517851290174e-05, "loss": 0.5457, "step": 4010 }, { "epoch": 2.5784215383380813, "grad_norm": 1.28125, "learning_rate": 2.865560495451769e-05, "loss": 0.539, "step": 4015 }, { "epoch": 2.5816314247883483, "grad_norm": 1.21875, "learning_rate": 2.8601080600879892e-05, "loss": 0.5469, "step": 4020 }, { "epoch": 2.584841311238615, "grad_norm": 1.265625, "learning_rate": 2.854694541633165e-05, "loss": 0.5536, "step": 4025 }, { "epoch": 2.588051197688882, "grad_norm": 1.2421875, "learning_rate": 2.8493200022360027e-05, "loss": 0.5324, "step": 4030 }, { "epoch": 2.5912610841391484, "grad_norm": 1.2890625, "learning_rate": 2.8439845035977214e-05, "loss": 0.519, "step": 4035 }, { "epoch": 2.5944709705894153, "grad_norm": 1.2734375, "learning_rate": 2.838688106971339e-05, "loss": 0.534, "step": 4040 }, { "epoch": 2.5976808570396823, "grad_norm": 1.21875, "learning_rate": 2.8334308731609722e-05, "loss": 0.5333, "step": 4045 }, { "epoch": 2.6008907434899493, "grad_norm": 1.2734375, "learning_rate": 2.8282128625211378e-05, "loss": 0.5319, "step": 4050 }, { "epoch": 2.604100629940216, "grad_norm": 1.203125, "learning_rate": 2.8230341349560603e-05, "loss": 0.5411, "step": 4055 }, { "epoch": 2.607310516390483, "grad_norm": 1.25, "learning_rate": 2.8178947499189812e-05, "loss": 0.5493, "step": 4060 }, { "epoch": 2.6105204028407494, "grad_norm": 1.2109375, "learning_rate": 2.812794766411481e-05, "loss": 0.5491, "step": 4065 }, { "epoch": 2.6137302892910164, "grad_norm": 1.2109375, "learning_rate": 2.8077342429827992e-05, "loss": 0.5423, "step": 4070 }, { "epoch": 2.6169401757412833, "grad_norm": 1.2421875, "learning_rate": 2.802713237729162e-05, "loss": 0.5493, "step": 4075 }, { "epoch": 2.62015006219155, "grad_norm": 1.1953125, "learning_rate": 2.797731808293116e-05, "loss": 0.5503, "step": 4080 }, { "epoch": 2.623359948641817, "grad_norm": 1.203125, "learning_rate": 2.7927900118628652e-05, "loss": 0.5297, "step": 4085 }, { "epoch": 2.6265698350920834, "grad_norm": 1.28125, "learning_rate": 2.787887905171619e-05, "loss": 0.5406, "step": 4090 }, { "epoch": 2.6297797215423504, "grad_norm": 1.2109375, "learning_rate": 2.7830255444969332e-05, "loss": 0.531, "step": 4095 }, { "epoch": 2.6329896079926174, "grad_norm": 1.40625, "learning_rate": 2.7782029856600715e-05, "loss": 0.5403, "step": 4100 }, { "epoch": 2.636199494442884, "grad_norm": 1.2578125, "learning_rate": 2.77342028402536e-05, "loss": 0.5568, "step": 4105 }, { "epoch": 2.639409380893151, "grad_norm": 1.1796875, "learning_rate": 2.7686774944995526e-05, "loss": 0.5364, "step": 4110 }, { "epoch": 2.6426192673434175, "grad_norm": 1.25, "learning_rate": 2.763974671531201e-05, "loss": 0.5501, "step": 4115 }, { "epoch": 2.6458291537936844, "grad_norm": 1.3671875, "learning_rate": 2.759311869110032e-05, "loss": 0.5469, "step": 4120 }, { "epoch": 2.6490390402439514, "grad_norm": 1.1328125, "learning_rate": 2.7546891407663216e-05, "loss": 0.5401, "step": 4125 }, { "epoch": 2.6522489266942184, "grad_norm": 1.2890625, "learning_rate": 2.7501065395702864e-05, "loss": 0.5465, "step": 4130 }, { "epoch": 2.655458813144485, "grad_norm": 1.203125, "learning_rate": 2.745564118131472e-05, "loss": 0.5332, "step": 4135 }, { "epoch": 2.658668699594752, "grad_norm": 1.2578125, "learning_rate": 2.741061928598149e-05, "loss": 0.5376, "step": 4140 }, { "epoch": 2.6618785860450185, "grad_norm": 1.25, "learning_rate": 2.736600022656714e-05, "loss": 0.5382, "step": 4145 }, { "epoch": 2.6650884724952855, "grad_norm": 1.203125, "learning_rate": 2.7321784515310965e-05, "loss": 0.5494, "step": 4150 }, { "epoch": 2.6682983589455525, "grad_norm": 1.2421875, "learning_rate": 2.7277972659821727e-05, "loss": 0.5511, "step": 4155 }, { "epoch": 2.671508245395819, "grad_norm": 1.1875, "learning_rate": 2.723456516307178e-05, "loss": 0.552, "step": 4160 }, { "epoch": 2.674718131846086, "grad_norm": 1.2109375, "learning_rate": 2.7191562523391363e-05, "loss": 0.5295, "step": 4165 }, { "epoch": 2.6779280182963525, "grad_norm": 1.203125, "learning_rate": 2.7148965234462807e-05, "loss": 0.5491, "step": 4170 }, { "epoch": 2.6811379047466195, "grad_norm": 1.203125, "learning_rate": 2.7106773785314937e-05, "loss": 0.5218, "step": 4175 }, { "epoch": 2.6843477911968865, "grad_norm": 1.1953125, "learning_rate": 2.70649886603174e-05, "loss": 0.5303, "step": 4180 }, { "epoch": 2.6875576776471535, "grad_norm": 1.25, "learning_rate": 2.7023610339175127e-05, "loss": 0.5344, "step": 4185 }, { "epoch": 2.69076756409742, "grad_norm": 1.1640625, "learning_rate": 2.698263929692285e-05, "loss": 0.5482, "step": 4190 }, { "epoch": 2.693977450547687, "grad_norm": 1.1796875, "learning_rate": 2.6942076003919596e-05, "loss": 0.5198, "step": 4195 }, { "epoch": 2.6971873369979535, "grad_norm": 1.1796875, "learning_rate": 2.6901920925843338e-05, "loss": 0.5366, "step": 4200 }, { "epoch": 2.7003972234482205, "grad_norm": 1.1875, "learning_rate": 2.6862174523685618e-05, "loss": 0.5151, "step": 4205 }, { "epoch": 2.7036071098984875, "grad_norm": 1.2734375, "learning_rate": 2.6822837253746258e-05, "loss": 0.5174, "step": 4210 }, { "epoch": 2.706816996348754, "grad_norm": 1.2578125, "learning_rate": 2.6783909567628153e-05, "loss": 0.5391, "step": 4215 }, { "epoch": 2.710026882799021, "grad_norm": 1.28125, "learning_rate": 2.674539191223202e-05, "loss": 0.5445, "step": 4220 }, { "epoch": 2.7132367692492876, "grad_norm": 1.2578125, "learning_rate": 2.6707284729751346e-05, "loss": 0.5197, "step": 4225 }, { "epoch": 2.7164466556995546, "grad_norm": 1.2265625, "learning_rate": 2.666958845766726e-05, "loss": 0.5375, "step": 4230 }, { "epoch": 2.7196565421498216, "grad_norm": 1.140625, "learning_rate": 2.663230352874352e-05, "loss": 0.5285, "step": 4235 }, { "epoch": 2.7228664286000885, "grad_norm": 1.2421875, "learning_rate": 2.659543037102154e-05, "loss": 0.5429, "step": 4240 }, { "epoch": 2.726076315050355, "grad_norm": 1.1953125, "learning_rate": 2.6558969407815525e-05, "loss": 0.5288, "step": 4245 }, { "epoch": 2.729286201500622, "grad_norm": 1.265625, "learning_rate": 2.652292105770753e-05, "loss": 0.527, "step": 4250 }, { "epoch": 2.7324960879508886, "grad_norm": 1.1484375, "learning_rate": 2.648728573454271e-05, "loss": 0.5219, "step": 4255 }, { "epoch": 2.7357059744011556, "grad_norm": 1.2890625, "learning_rate": 2.6452063847424564e-05, "loss": 0.5412, "step": 4260 }, { "epoch": 2.7389158608514226, "grad_norm": 1.2265625, "learning_rate": 2.6417255800710215e-05, "loss": 0.5495, "step": 4265 }, { "epoch": 2.742125747301689, "grad_norm": 1.3671875, "learning_rate": 2.6382861994005792e-05, "loss": 0.5353, "step": 4270 }, { "epoch": 2.745335633751956, "grad_norm": 1.2421875, "learning_rate": 2.6348882822161826e-05, "loss": 0.5386, "step": 4275 }, { "epoch": 2.7485455202022226, "grad_norm": 1.234375, "learning_rate": 2.6315318675268724e-05, "loss": 0.55, "step": 4280 }, { "epoch": 2.7517554066524896, "grad_norm": 1.25, "learning_rate": 2.6282169938652306e-05, "loss": 0.5401, "step": 4285 }, { "epoch": 2.7549652931027566, "grad_norm": 1.15625, "learning_rate": 2.6249436992869342e-05, "loss": 0.5289, "step": 4290 }, { "epoch": 2.758175179553023, "grad_norm": 1.203125, "learning_rate": 2.6217120213703222e-05, "loss": 0.541, "step": 4295 }, { "epoch": 2.76138506600329, "grad_norm": 1.21875, "learning_rate": 2.6185219972159626e-05, "loss": 0.5263, "step": 4300 }, { "epoch": 2.7645949524535567, "grad_norm": 1.2109375, "learning_rate": 2.6153736634462252e-05, "loss": 0.5247, "step": 4305 }, { "epoch": 2.7678048389038237, "grad_norm": 1.1640625, "learning_rate": 2.6122670562048645e-05, "loss": 0.5476, "step": 4310 }, { "epoch": 2.7710147253540907, "grad_norm": 1.2578125, "learning_rate": 2.6092022111566007e-05, "loss": 0.5246, "step": 4315 }, { "epoch": 2.7742246118043576, "grad_norm": 1.1953125, "learning_rate": 2.6061791634867146e-05, "loss": 0.5191, "step": 4320 }, { "epoch": 2.777434498254624, "grad_norm": 1.2265625, "learning_rate": 2.6031979479006395e-05, "loss": 0.5341, "step": 4325 }, { "epoch": 2.780644384704891, "grad_norm": 1.21875, "learning_rate": 2.6002585986235656e-05, "loss": 0.5375, "step": 4330 }, { "epoch": 2.7838542711551577, "grad_norm": 1.2734375, "learning_rate": 2.5973611494000462e-05, "loss": 0.5502, "step": 4335 }, { "epoch": 2.7870641576054247, "grad_norm": 1.375, "learning_rate": 2.5945056334936092e-05, "loss": 0.5263, "step": 4340 }, { "epoch": 2.7902740440556917, "grad_norm": 1.2265625, "learning_rate": 2.5916920836863772e-05, "loss": 0.5388, "step": 4345 }, { "epoch": 2.7934839305059582, "grad_norm": 1.390625, "learning_rate": 2.58892053227869e-05, "loss": 0.5378, "step": 4350 }, { "epoch": 2.796693816956225, "grad_norm": 1.2890625, "learning_rate": 2.5861910110887344e-05, "loss": 0.5333, "step": 4355 }, { "epoch": 2.7999037034064918, "grad_norm": 1.1484375, "learning_rate": 2.5835035514521776e-05, "loss": 0.5295, "step": 4360 }, { "epoch": 2.8031135898567587, "grad_norm": 1.2265625, "learning_rate": 2.58085818422181e-05, "loss": 0.5308, "step": 4365 }, { "epoch": 2.8063234763070257, "grad_norm": 1.1875, "learning_rate": 2.5782549397671872e-05, "loss": 0.5339, "step": 4370 }, { "epoch": 2.8095333627572927, "grad_norm": 1.28125, "learning_rate": 2.575693847974286e-05, "loss": 0.543, "step": 4375 }, { "epoch": 2.8127432492075592, "grad_norm": 1.1796875, "learning_rate": 2.5731749382451565e-05, "loss": 0.5417, "step": 4380 }, { "epoch": 2.8159531356578262, "grad_norm": 1.2265625, "learning_rate": 2.5706982394975875e-05, "loss": 0.5473, "step": 4385 }, { "epoch": 2.8191630221080928, "grad_norm": 1.21875, "learning_rate": 2.568263780164775e-05, "loss": 0.536, "step": 4390 }, { "epoch": 2.8223729085583598, "grad_norm": 1.3125, "learning_rate": 2.5658715881949946e-05, "loss": 0.5271, "step": 4395 }, { "epoch": 2.8255827950086267, "grad_norm": 1.2265625, "learning_rate": 2.5635216910512793e-05, "loss": 0.5437, "step": 4400 }, { "epoch": 2.8287926814588933, "grad_norm": 1.2109375, "learning_rate": 2.561214115711107e-05, "loss": 0.5294, "step": 4405 }, { "epoch": 2.8320025679091603, "grad_norm": 1.3046875, "learning_rate": 2.558948888666088e-05, "loss": 0.5353, "step": 4410 }, { "epoch": 2.835212454359427, "grad_norm": 1.2578125, "learning_rate": 2.556726035921665e-05, "loss": 0.544, "step": 4415 }, { "epoch": 2.838422340809694, "grad_norm": 1.2421875, "learning_rate": 2.5545455829968078e-05, "loss": 0.5282, "step": 4420 }, { "epoch": 2.841632227259961, "grad_norm": 1.234375, "learning_rate": 2.552407554923729e-05, "loss": 0.5423, "step": 4425 }, { "epoch": 2.8448421137102278, "grad_norm": 1.296875, "learning_rate": 2.550311976247588e-05, "loss": 0.5348, "step": 4430 }, { "epoch": 2.8480520001604943, "grad_norm": 1.25, "learning_rate": 2.548258871026216e-05, "loss": 0.5591, "step": 4435 }, { "epoch": 2.8512618866107613, "grad_norm": 1.2734375, "learning_rate": 2.5462482628298357e-05, "loss": 0.5325, "step": 4440 }, { "epoch": 2.854471773061028, "grad_norm": 1.203125, "learning_rate": 2.544280174740792e-05, "loss": 0.534, "step": 4445 }, { "epoch": 2.857681659511295, "grad_norm": 1.2421875, "learning_rate": 2.542354629353288e-05, "loss": 0.534, "step": 4450 }, { "epoch": 2.860891545961562, "grad_norm": 1.140625, "learning_rate": 2.540471648773124e-05, "loss": 0.5599, "step": 4455 }, { "epoch": 2.8641014324118284, "grad_norm": 1.375, "learning_rate": 2.5386312546174434e-05, "loss": 0.5492, "step": 4460 }, { "epoch": 2.8673113188620953, "grad_norm": 1.15625, "learning_rate": 2.5368334680144884e-05, "loss": 0.5301, "step": 4465 }, { "epoch": 2.870521205312362, "grad_norm": 1.15625, "learning_rate": 2.535078309603351e-05, "loss": 0.5193, "step": 4470 }, { "epoch": 2.873731091762629, "grad_norm": 1.2421875, "learning_rate": 2.5333657995337422e-05, "loss": 0.5296, "step": 4475 }, { "epoch": 2.876940978212896, "grad_norm": 1.1875, "learning_rate": 2.5316959574657583e-05, "loss": 0.5139, "step": 4480 }, { "epoch": 2.8801508646631624, "grad_norm": 1.296875, "learning_rate": 2.5300688025696517e-05, "loss": 0.5349, "step": 4485 }, { "epoch": 2.8833607511134294, "grad_norm": 1.203125, "learning_rate": 2.5284843535256182e-05, "loss": 0.5442, "step": 4490 }, { "epoch": 2.886570637563696, "grad_norm": 1.28125, "learning_rate": 2.5269426285235753e-05, "loss": 0.5328, "step": 4495 }, { "epoch": 2.889780524013963, "grad_norm": 1.171875, "learning_rate": 2.5254436452629594e-05, "loss": 0.5126, "step": 4500 }, { "epoch": 2.889780524013963, "eval_loss": 0.4651297628879547, "eval_runtime": 2.403, "eval_samples_per_second": 83.23, "eval_steps_per_second": 83.23, "step": 4500 }, { "epoch": 2.89299041046423, "grad_norm": 1.1875, "learning_rate": 2.523987420952516e-05, "loss": 0.5352, "step": 4505 }, { "epoch": 2.896200296914497, "grad_norm": 1.1484375, "learning_rate": 2.5225739723101105e-05, "loss": 0.5321, "step": 4510 }, { "epoch": 2.8994101833647634, "grad_norm": 1.1796875, "learning_rate": 2.521203315562528e-05, "loss": 0.5323, "step": 4515 }, { "epoch": 2.9026200698150304, "grad_norm": 1.21875, "learning_rate": 2.5198754664452913e-05, "loss": 0.5468, "step": 4520 }, { "epoch": 2.905829956265297, "grad_norm": 1.296875, "learning_rate": 2.5185904402024808e-05, "loss": 0.53, "step": 4525 }, { "epoch": 2.909039842715564, "grad_norm": 1.21875, "learning_rate": 2.5173482515865582e-05, "loss": 0.5181, "step": 4530 }, { "epoch": 2.912249729165831, "grad_norm": 1.1875, "learning_rate": 2.5161489148581962e-05, "loss": 0.5294, "step": 4535 }, { "epoch": 2.9154596156160975, "grad_norm": 1.1796875, "learning_rate": 2.514992443786116e-05, "loss": 0.5339, "step": 4540 }, { "epoch": 2.9186695020663644, "grad_norm": 1.1953125, "learning_rate": 2.51387885164693e-05, "loss": 0.5416, "step": 4545 }, { "epoch": 2.921879388516631, "grad_norm": 1.1875, "learning_rate": 2.512808151224988e-05, "loss": 0.546, "step": 4550 }, { "epoch": 2.925089274966898, "grad_norm": 1.28125, "learning_rate": 2.5117803548122305e-05, "loss": 0.552, "step": 4555 }, { "epoch": 2.928299161417165, "grad_norm": 1.1953125, "learning_rate": 2.510795474208048e-05, "loss": 0.5195, "step": 4560 }, { "epoch": 2.931509047867432, "grad_norm": 1.1640625, "learning_rate": 2.5098535207191458e-05, "loss": 0.5446, "step": 4565 }, { "epoch": 2.9347189343176985, "grad_norm": 1.125, "learning_rate": 2.5089545051594136e-05, "loss": 0.5417, "step": 4570 }, { "epoch": 2.9379288207679655, "grad_norm": 1.234375, "learning_rate": 2.5080984378498023e-05, "loss": 0.5301, "step": 4575 }, { "epoch": 2.941138707218232, "grad_norm": 1.3203125, "learning_rate": 2.507285328618204e-05, "loss": 0.5464, "step": 4580 }, { "epoch": 2.944348593668499, "grad_norm": 1.1875, "learning_rate": 2.506515186799341e-05, "loss": 0.5348, "step": 4585 }, { "epoch": 2.947558480118766, "grad_norm": 1.171875, "learning_rate": 2.5057880212346564e-05, "loss": 0.5296, "step": 4590 }, { "epoch": 2.9507683665690325, "grad_norm": 1.21875, "learning_rate": 2.505103840272215e-05, "loss": 0.5267, "step": 4595 }, { "epoch": 2.9539782530192995, "grad_norm": 1.2421875, "learning_rate": 2.5044626517666054e-05, "loss": 0.5286, "step": 4600 }, { "epoch": 2.957188139469566, "grad_norm": 1.15625, "learning_rate": 2.5038644630788517e-05, "loss": 0.5401, "step": 4605 }, { "epoch": 2.960398025919833, "grad_norm": 1.28125, "learning_rate": 2.5033092810763275e-05, "loss": 0.5278, "step": 4610 }, { "epoch": 2.9636079123701, "grad_norm": 1.171875, "learning_rate": 2.5027971121326776e-05, "loss": 0.5218, "step": 4615 }, { "epoch": 2.966817798820367, "grad_norm": 1.21875, "learning_rate": 2.5023279621277444e-05, "loss": 0.5288, "step": 4620 }, { "epoch": 2.9700276852706335, "grad_norm": 1.203125, "learning_rate": 2.5019018364475026e-05, "loss": 0.5382, "step": 4625 }, { "epoch": 2.9732375717209005, "grad_norm": 1.1171875, "learning_rate": 2.5015187399839936e-05, "loss": 0.5431, "step": 4630 }, { "epoch": 2.976447458171167, "grad_norm": 1.21875, "learning_rate": 2.501178677135272e-05, "loss": 0.5417, "step": 4635 }, { "epoch": 2.979657344621434, "grad_norm": 1.2421875, "learning_rate": 2.5008816518053547e-05, "loss": 0.5141, "step": 4640 }, { "epoch": 2.982867231071701, "grad_norm": 1.15625, "learning_rate": 2.500627667404176e-05, "loss": 0.5438, "step": 4645 }, { "epoch": 2.9860771175219676, "grad_norm": 1.2265625, "learning_rate": 2.5004167268475475e-05, "loss": 0.5386, "step": 4650 }, { "epoch": 2.9892870039722346, "grad_norm": 1.2421875, "learning_rate": 2.500248832557126e-05, "loss": 0.5358, "step": 4655 }, { "epoch": 2.992496890422501, "grad_norm": 1.21875, "learning_rate": 2.5001239864603847e-05, "loss": 0.5446, "step": 4660 }, { "epoch": 2.995706776872768, "grad_norm": 1.1640625, "learning_rate": 2.500042189990593e-05, "loss": 0.5492, "step": 4665 }, { "epoch": 2.998916663323035, "grad_norm": 1.2734375, "learning_rate": 2.5000034440867958e-05, "loss": 0.5393, "step": 4670 }, { "epoch": 2.9995586406130883, "eval_loss": 0.4636688232421875, "eval_runtime": 2.4088, "eval_samples_per_second": 83.028, "eval_steps_per_second": 83.028, "step": 4671 } ], "logging_steps": 5, "max_steps": 4671, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.12480186236928e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }