{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992103185048697,
  "eval_steps": 500,
  "global_step": 949,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010529086601737299,
      "grad_norm": 4.026114469450001,
      "learning_rate": 2.105263157894737e-06,
      "loss": 1.3755,
      "step": 1
    },
    {
      "epoch": 0.0052645433008686494,
      "grad_norm": 1.1251945428540173,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 1.3102,
      "step": 5
    },
    {
      "epoch": 0.010529086601737299,
      "grad_norm": 0.563945023031292,
      "learning_rate": 2.105263157894737e-05,
      "loss": 1.2626,
      "step": 10
    },
    {
      "epoch": 0.01579362990260595,
      "grad_norm": 0.4336837922055097,
      "learning_rate": 3.157894736842105e-05,
      "loss": 1.2133,
      "step": 15
    },
    {
      "epoch": 0.021058173203474598,
      "grad_norm": 0.33707427690007363,
      "learning_rate": 4.210526315789474e-05,
      "loss": 1.1636,
      "step": 20
    },
    {
      "epoch": 0.026322716504343247,
      "grad_norm": 0.2412443925117212,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 1.1845,
      "step": 25
    },
    {
      "epoch": 0.0315872598052119,
      "grad_norm": 0.2414606698915264,
      "learning_rate": 6.31578947368421e-05,
      "loss": 1.1401,
      "step": 30
    },
    {
      "epoch": 0.03685180310608055,
      "grad_norm": 0.22628642837844729,
      "learning_rate": 7.368421052631579e-05,
      "loss": 1.1591,
      "step": 35
    },
    {
      "epoch": 0.042116346406949196,
      "grad_norm": 0.2208133931977146,
      "learning_rate": 8.421052631578948e-05,
      "loss": 1.1397,
      "step": 40
    },
    {
      "epoch": 0.04738088970781785,
      "grad_norm": 0.21905054641153135,
      "learning_rate": 9.473684210526316e-05,
      "loss": 1.1295,
      "step": 45
    },
    {
      "epoch": 0.052645433008686494,
      "grad_norm": 0.1951061692287286,
      "learning_rate": 0.00010526315789473685,
      "loss": 1.141,
      "step": 50
    },
    {
      "epoch": 0.05790997630955515,
      "grad_norm": 0.18792729468212402,
      "learning_rate": 0.00011578947368421053,
      "loss": 1.121,
      "step": 55
    },
    {
      "epoch": 0.0631745196104238,
      "grad_norm": 0.189919407852987,
      "learning_rate": 0.0001263157894736842,
      "loss": 1.1347,
      "step": 60
    },
    {
      "epoch": 0.06843906291129244,
      "grad_norm": 0.18793881851552416,
      "learning_rate": 0.0001368421052631579,
      "loss": 1.0961,
      "step": 65
    },
    {
      "epoch": 0.0737036062121611,
      "grad_norm": 0.18263188517758086,
      "learning_rate": 0.00014736842105263158,
      "loss": 1.0937,
      "step": 70
    },
    {
      "epoch": 0.07896814951302975,
      "grad_norm": 0.18520098125405152,
      "learning_rate": 0.00015789473684210527,
      "loss": 1.1419,
      "step": 75
    },
    {
      "epoch": 0.08423269281389839,
      "grad_norm": 0.18675524775004465,
      "learning_rate": 0.00016842105263157895,
      "loss": 1.1094,
      "step": 80
    },
    {
      "epoch": 0.08949723611476705,
      "grad_norm": 0.18469057661828525,
      "learning_rate": 0.00017894736842105264,
      "loss": 1.0952,
      "step": 85
    },
    {
      "epoch": 0.0947617794156357,
      "grad_norm": 0.17860571701450936,
      "learning_rate": 0.00018947368421052632,
      "loss": 1.1035,
      "step": 90
    },
    {
      "epoch": 0.10002632271650434,
      "grad_norm": 0.2032976356381528,
      "learning_rate": 0.0002,
      "loss": 1.1329,
      "step": 95
    },
    {
      "epoch": 0.10529086601737299,
      "grad_norm": 0.18932762375677964,
      "learning_rate": 0.0001999830846194422,
      "loss": 1.0902,
      "step": 100
    },
    {
      "epoch": 0.11055540931824165,
      "grad_norm": 0.17823414518835126,
      "learning_rate": 0.00019993234420037073,
      "loss": 1.0951,
      "step": 105
    },
    {
      "epoch": 0.1158199526191103,
      "grad_norm": 0.19033211796864122,
      "learning_rate": 0.00019984779590865556,
      "loss": 1.11,
      "step": 110
    },
    {
      "epoch": 0.12108449591997894,
      "grad_norm": 0.1781090004150184,
      "learning_rate": 0.0001997294683476273,
      "loss": 1.1216,
      "step": 115
    },
    {
      "epoch": 0.1263490392208476,
      "grad_norm": 0.20142566240295628,
      "learning_rate": 0.0001995774015484005,
      "loss": 1.088,
      "step": 120
    },
    {
      "epoch": 0.13161358252171623,
      "grad_norm": 0.16738672746077932,
      "learning_rate": 0.00019939164695633067,
      "loss": 1.1069,
      "step": 125
    },
    {
      "epoch": 0.13687812582258488,
      "grad_norm": 0.17141306079033702,
      "learning_rate": 0.00019917226741361015,
      "loss": 1.1178,
      "step": 130
    },
    {
      "epoch": 0.14214266912345355,
      "grad_norm": 0.18242919862111662,
      "learning_rate": 0.00019891933713800798,
      "loss": 1.115,
      "step": 135
    },
    {
      "epoch": 0.1474072124243222,
      "grad_norm": 0.18858703761293544,
      "learning_rate": 0.00019863294169776148,
      "loss": 1.092,
      "step": 140
    },
    {
      "epoch": 0.15267175572519084,
      "grad_norm": 0.1851910906506613,
      "learning_rate": 0.00019831317798262786,
      "loss": 1.1015,
      "step": 145
    },
    {
      "epoch": 0.1579362990260595,
      "grad_norm": 0.17061718616532065,
      "learning_rate": 0.00019796015417110577,
      "loss": 1.0834,
      "step": 150
    },
    {
      "epoch": 0.16320084232692814,
      "grad_norm": 0.19083175263550564,
      "learning_rate": 0.0001975739896938375,
      "loss": 1.0915,
      "step": 155
    },
    {
      "epoch": 0.16846538562779678,
      "grad_norm": 0.17041981222039004,
      "learning_rate": 0.00019715481519320496,
      "loss": 1.1045,
      "step": 160
    },
    {
      "epoch": 0.17372992892866543,
      "grad_norm": 0.17539080495334333,
      "learning_rate": 0.00019670277247913205,
      "loss": 1.0822,
      "step": 165
    },
    {
      "epoch": 0.1789944722295341,
      "grad_norm": 0.16847918243353582,
      "learning_rate": 0.00019621801448110952,
      "loss": 1.1113,
      "step": 170
    },
    {
      "epoch": 0.18425901553040275,
      "grad_norm": 0.16577520965121645,
      "learning_rate": 0.00019570070519645767,
      "loss": 1.0726,
      "step": 175
    },
    {
      "epoch": 0.1895235588312714,
      "grad_norm": 0.17216940817918563,
      "learning_rate": 0.00019515101963484485,
      "loss": 1.1214,
      "step": 180
    },
    {
      "epoch": 0.19478810213214004,
      "grad_norm": 0.16717603329959776,
      "learning_rate": 0.00019456914375908023,
      "loss": 1.0749,
      "step": 185
    },
    {
      "epoch": 0.20005264543300869,
      "grad_norm": 0.16743311436795275,
      "learning_rate": 0.0001939552744222014,
      "loss": 1.0856,
      "step": 190
    },
    {
      "epoch": 0.20531718873387733,
      "grad_norm": 0.16628473925396028,
      "learning_rate": 0.00019330961930087725,
      "loss": 1.1088,
      "step": 195
    },
    {
      "epoch": 0.21058173203474598,
      "grad_norm": 0.1735673710306468,
      "learning_rate": 0.00019263239682514952,
      "loss": 1.094,
      "step": 200
    },
    {
      "epoch": 0.21584627533561462,
      "grad_norm": 0.16463208491106188,
      "learning_rate": 0.00019192383610453618,
      "loss": 1.1191,
      "step": 205
    },
    {
      "epoch": 0.2211108186364833,
      "grad_norm": 0.1697201646634803,
      "learning_rate": 0.00019118417685052194,
      "loss": 1.1188,
      "step": 210
    },
    {
      "epoch": 0.22637536193735194,
      "grad_norm": 0.15930348674889006,
      "learning_rate": 0.00019041366929546219,
      "loss": 1.1132,
      "step": 215
    },
    {
      "epoch": 0.2316399052382206,
      "grad_norm": 0.16154587528605638,
      "learning_rate": 0.0001896125741079272,
      "loss": 1.1029,
      "step": 220
    },
    {
      "epoch": 0.23690444853908924,
      "grad_norm": 0.1689593754891321,
      "learning_rate": 0.00018878116230451613,
      "loss": 1.1196,
      "step": 225
    },
    {
      "epoch": 0.24216899183995788,
      "grad_norm": 0.158620581331537,
      "learning_rate": 0.0001879197151581702,
      "loss": 1.0786,
      "step": 230
    },
    {
      "epoch": 0.24743353514082653,
      "grad_norm": 0.1591976970503649,
      "learning_rate": 0.00018702852410301554,
      "loss": 1.0861,
      "step": 235
    },
    {
      "epoch": 0.2526980784416952,
      "grad_norm": 0.16271741933565712,
      "learning_rate": 0.00018610789063576913,
      "loss": 1.077,
      "step": 240
    },
    {
      "epoch": 0.25796262174256385,
      "grad_norm": 0.16691583214883504,
      "learning_rate": 0.00018515812621373997,
      "loss": 1.0931,
      "step": 245
    },
    {
      "epoch": 0.26322716504343247,
      "grad_norm": 0.15795453416798677,
      "learning_rate": 0.00018417955214946092,
      "loss": 1.0929,
      "step": 250
    },
    {
      "epoch": 0.26849170834430114,
      "grad_norm": 0.15734119895920037,
      "learning_rate": 0.00018317249950198597,
      "loss": 1.086,
      "step": 255
    },
    {
      "epoch": 0.27375625164516976,
      "grad_norm": 0.15815121783491273,
      "learning_rate": 0.0001821373089648906,
      "loss": 1.1142,
      "step": 260
    },
    {
      "epoch": 0.27902079494603843,
      "grad_norm": 0.15790684873329372,
      "learning_rate": 0.00018107433075101252,
      "loss": 1.0907,
      "step": 265
    },
    {
      "epoch": 0.2842853382469071,
      "grad_norm": 0.1603612919235376,
      "learning_rate": 0.00017998392447397197,
      "loss": 1.103,
      "step": 270
    },
    {
      "epoch": 0.2895498815477757,
      "grad_norm": 0.1935643212000403,
      "learning_rate": 0.00017886645902651167,
      "loss": 1.1207,
      "step": 275
    },
    {
      "epoch": 0.2948144248486444,
      "grad_norm": 0.16197395404790052,
      "learning_rate": 0.0001777223124556978,
      "loss": 1.1036,
      "step": 280
    },
    {
      "epoch": 0.300078968149513,
      "grad_norm": 0.16503760296000086,
      "learning_rate": 0.00017655187183502344,
      "loss": 1.0647,
      "step": 285
    },
    {
      "epoch": 0.3053435114503817,
      "grad_norm": 0.1772283442967409,
      "learning_rate": 0.00017535553313345904,
      "loss": 1.1075,
      "step": 290
    },
    {
      "epoch": 0.3106080547512503,
      "grad_norm": 0.16282645295325013,
      "learning_rate": 0.00017413370108149286,
      "loss": 1.1094,
      "step": 295
    },
    {
      "epoch": 0.315872598052119,
      "grad_norm": 0.15561402718068354,
      "learning_rate": 0.00017288678903420762,
      "loss": 1.0776,
      "step": 300
    },
    {
      "epoch": 0.32113714135298765,
      "grad_norm": 0.15594031474920508,
      "learning_rate": 0.00017161521883143934,
      "loss": 1.1078,
      "step": 305
    },
    {
      "epoch": 0.32640168465385627,
      "grad_norm": 0.1570395383591175,
      "learning_rate": 0.00017031942065506576,
      "loss": 1.1124,
      "step": 310
    },
    {
      "epoch": 0.33166622795472495,
      "grad_norm": 0.15773152315944608,
      "learning_rate": 0.00016899983288347248,
      "loss": 1.0913,
      "step": 315
    },
    {
      "epoch": 0.33693077125559356,
      "grad_norm": 0.15310806808595664,
      "learning_rate": 0.00016765690194324616,
      "loss": 1.0845,
      "step": 320
    },
    {
      "epoch": 0.34219531455646224,
      "grad_norm": 0.16384678369715433,
      "learning_rate": 0.00016629108215814525,
      "loss": 1.1173,
      "step": 325
    },
    {
      "epoch": 0.34745985785733086,
      "grad_norm": 0.165818325184464,
      "learning_rate": 0.00016490283559539838,
      "loss": 1.1014,
      "step": 330
    },
    {
      "epoch": 0.35272440115819953,
      "grad_norm": 0.15456003221800826,
      "learning_rate": 0.000163492631909384,
      "loss": 1.0915,
      "step": 335
    },
    {
      "epoch": 0.3579889444590682,
      "grad_norm": 0.16059867173233644,
      "learning_rate": 0.00016206094818274229,
      "loss": 1.0969,
      "step": 340
    },
    {
      "epoch": 0.3632534877599368,
      "grad_norm": 0.17415674474557066,
      "learning_rate": 0.00016060826876497478,
      "loss": 1.1145,
      "step": 345
    },
    {
      "epoch": 0.3685180310608055,
      "grad_norm": 0.16440403677512835,
      "learning_rate": 0.0001591350851085851,
      "loss": 1.0683,
      "step": 350
    },
    {
      "epoch": 0.3737825743616741,
      "grad_norm": 0.15901493438320982,
      "learning_rate": 0.00015764189560281677,
      "loss": 1.1199,
      "step": 355
    },
    {
      "epoch": 0.3790471176625428,
      "grad_norm": 0.15988293404570103,
      "learning_rate": 0.00015612920540504453,
      "loss": 1.0709,
      "step": 360
    },
    {
      "epoch": 0.3843116609634114,
      "grad_norm": 0.1616109424204681,
      "learning_rate": 0.00015459752626987563,
      "loss": 1.1027,
      "step": 365
    },
    {
      "epoch": 0.3895762042642801,
      "grad_norm": 0.1513607201651111,
      "learning_rate": 0.00015304737637601926,
      "loss": 1.0956,
      "step": 370
    },
    {
      "epoch": 0.3948407475651487,
      "grad_norm": 0.15452619863423803,
      "learning_rate": 0.0001514792801509831,
      "loss": 1.0952,
      "step": 375
    },
    {
      "epoch": 0.40010529086601737,
      "grad_norm": 0.15418975657555584,
      "learning_rate": 0.00014989376809365493,
      "loss": 1.0934,
      "step": 380
    },
    {
      "epoch": 0.40536983416688604,
      "grad_norm": 0.15158447263390024,
      "learning_rate": 0.00014829137659483143,
      "loss": 1.0981,
      "step": 385
    },
    {
      "epoch": 0.41063437746775466,
      "grad_norm": 0.15420702474431047,
      "learning_rate": 0.0001466726477557527,
      "loss": 1.1013,
      "step": 390
    },
    {
      "epoch": 0.41589892076862334,
      "grad_norm": 0.1513401762569788,
      "learning_rate": 0.00014503812920470534,
      "loss": 1.1128,
      "step": 395
    },
    {
      "epoch": 0.42116346406949196,
      "grad_norm": 0.1759021276212348,
      "learning_rate": 0.00014338837391175582,
      "loss": 1.0793,
      "step": 400
    },
    {
      "epoch": 0.42642800737036063,
      "grad_norm": 0.15639002655528358,
      "learning_rate": 0.00014172394000167623,
      "loss": 1.1126,
      "step": 405
    },
    {
      "epoch": 0.43169255067122925,
      "grad_norm": 0.1558922751326013,
      "learning_rate": 0.00014004539056512667,
      "loss": 1.0864,
      "step": 410
    },
    {
      "epoch": 0.4369570939720979,
      "grad_norm": 0.15449223519766864,
      "learning_rate": 0.00013835329346815716,
      "loss": 1.1161,
      "step": 415
    },
    {
      "epoch": 0.4422216372729666,
      "grad_norm": 0.15398779214531882,
      "learning_rate": 0.0001366482211600945,
      "loss": 1.113,
      "step": 420
    },
    {
      "epoch": 0.4474861805738352,
      "grad_norm": 0.15902962443654645,
      "learning_rate": 0.000134930750479878,
      "loss": 1.0783,
      "step": 425
    },
    {
      "epoch": 0.4527507238747039,
      "grad_norm": 0.15614703146804315,
      "learning_rate": 0.00013320146246091074,
      "loss": 1.0891,
      "step": 430
    },
    {
      "epoch": 0.4580152671755725,
      "grad_norm": 0.151735228198923,
      "learning_rate": 0.00013146094213449148,
      "loss": 1.1006,
      "step": 435
    },
    {
      "epoch": 0.4632798104764412,
      "grad_norm": 0.1633743946888902,
      "learning_rate": 0.00012970977833189393,
      "loss": 1.0717,
      "step": 440
    },
    {
      "epoch": 0.4685443537773098,
      "grad_norm": 0.16534257355481496,
      "learning_rate": 0.00012794856348516095,
      "loss": 1.0778,
      "step": 445
    },
    {
      "epoch": 0.47380889707817847,
      "grad_norm": 0.1856142828881669,
      "learning_rate": 0.00012617789342668004,
      "loss": 1.0859,
      "step": 450
    },
    {
      "epoch": 0.47907344037904714,
      "grad_norm": 0.15229515578033356,
      "learning_rate": 0.00012439836718760886,
      "loss": 1.0761,
      "step": 455
    },
    {
      "epoch": 0.48433798367991576,
      "grad_norm": 0.15984985984562605,
      "learning_rate": 0.00012261058679521834,
      "loss": 1.0926,
      "step": 460
    },
    {
      "epoch": 0.48960252698078444,
      "grad_norm": 0.14896040772758903,
      "learning_rate": 0.00012081515706922227,
      "loss": 1.0834,
      "step": 465
    },
    {
      "epoch": 0.49486707028165305,
      "grad_norm": 0.1514924492192347,
      "learning_rate": 0.00011901268541716224,
      "loss": 1.0885,
      "step": 470
    },
    {
      "epoch": 0.5001316135825217,
      "grad_norm": 0.1513889418015892,
      "learning_rate": 0.00011720378162891708,
      "loss": 1.1001,
      "step": 475
    },
    {
      "epoch": 0.5053961568833903,
      "grad_norm": 0.15159825336613816,
      "learning_rate": 0.0001153890576704062,
      "loss": 1.1082,
      "step": 480
    },
    {
      "epoch": 0.510660700184259,
      "grad_norm": 0.15427722774659086,
      "learning_rate": 0.00011356912747655685,
      "loss": 1.0843,
      "step": 485
    },
    {
      "epoch": 0.5159252434851277,
      "grad_norm": 0.14639500931900093,
      "learning_rate": 0.00011174460674360549,
      "loss": 1.1058,
      "step": 490
    },
    {
      "epoch": 0.5211897867859964,
      "grad_norm": 0.15320269723203808,
      "learning_rate": 0.00010991611272080269,
      "loss": 1.1125,
      "step": 495
    },
    {
      "epoch": 0.5264543300868649,
      "grad_norm": 0.15092814943890553,
      "learning_rate": 0.00010808426400159338,
      "loss": 1.0898,
      "step": 500
    },
    {
      "epoch": 0.5317188733877336,
      "grad_norm": 0.14712598563479434,
      "learning_rate": 0.00010624968031434173,
      "loss": 1.0975,
      "step": 505
    },
    {
      "epoch": 0.5369834166886023,
      "grad_norm": 0.1506174008648404,
      "learning_rate": 0.00010441298231267242,
      "loss": 1.0789,
      "step": 510
    },
    {
      "epoch": 0.542247959989471,
      "grad_norm": 0.14915164476738402,
      "learning_rate": 0.00010257479136549889,
      "loss": 1.088,
      "step": 515
    },
    {
      "epoch": 0.5475125032903395,
      "grad_norm": 0.14933216158522156,
      "learning_rate": 0.00010073572934680919,
      "loss": 1.1012,
      "step": 520
    },
    {
      "epoch": 0.5527770465912082,
      "grad_norm": 0.1623395783916047,
      "learning_rate": 9.889641842528178e-05,
      "loss": 1.0992,
      "step": 525
    },
    {
      "epoch": 0.5580415898920769,
      "grad_norm": 0.15524883773019818,
      "learning_rate": 9.70574808538006e-05,
      "loss": 1.0558,
      "step": 530
    },
    {
      "epoch": 0.5633061331929455,
      "grad_norm": 0.14879516385003932,
      "learning_rate": 9.521953875894257e-05,
      "loss": 1.0634,
      "step": 535
    },
    {
      "epoch": 0.5685706764938142,
      "grad_norm": 0.14856407933911947,
      "learning_rate": 9.338321393050719e-05,
      "loss": 1.0513,
      "step": 540
    },
    {
      "epoch": 0.5738352197946828,
      "grad_norm": 0.1514919636398635,
      "learning_rate": 9.154912761116056e-05,
      "loss": 1.0899,
      "step": 545
    },
    {
      "epoch": 0.5790997630955514,
      "grad_norm": 0.15005939408454377,
      "learning_rate": 8.971790028626395e-05,
      "loss": 1.09,
      "step": 550
    },
    {
      "epoch": 0.5843643063964201,
      "grad_norm": 0.1541140355049706,
      "learning_rate": 8.789015147395919e-05,
      "loss": 1.072,
      "step": 555
    },
    {
      "epoch": 0.5896288496972888,
      "grad_norm": 0.14756189100480177,
      "learning_rate": 8.606649951558073e-05,
      "loss": 1.0548,
      "step": 560
    },
    {
      "epoch": 0.5948933929981574,
      "grad_norm": 0.14468591274130843,
      "learning_rate": 8.424756136646623e-05,
      "loss": 1.056,
      "step": 565
    },
    {
      "epoch": 0.600157936299026,
      "grad_norm": 0.1510683202100121,
      "learning_rate": 8.243395238723571e-05,
      "loss": 1.0999,
      "step": 570
    },
    {
      "epoch": 0.6054224795998947,
      "grad_norm": 0.14942489035639112,
      "learning_rate": 8.062628613561051e-05,
      "loss": 1.08,
      "step": 575
    },
    {
      "epoch": 0.6106870229007634,
      "grad_norm": 0.14792710995590722,
      "learning_rate": 7.8825174158842e-05,
      "loss": 1.0916,
      "step": 580
    },
    {
      "epoch": 0.615951566201632,
      "grad_norm": 0.14543568608581728,
      "learning_rate": 7.703122578682046e-05,
      "loss": 1.061,
      "step": 585
    },
    {
      "epoch": 0.6212161095025006,
      "grad_norm": 0.14792849899325772,
      "learning_rate": 7.524504792593419e-05,
      "loss": 1.1101,
      "step": 590
    },
    {
      "epoch": 0.6264806528033693,
      "grad_norm": 0.14574924924348462,
      "learning_rate": 7.346724485374837e-05,
      "loss": 1.0687,
      "step": 595
    },
    {
      "epoch": 0.631745196104238,
      "grad_norm": 0.1434166906369258,
      "learning_rate": 7.169841801457347e-05,
      "loss": 1.0825,
      "step": 600
    },
    {
      "epoch": 0.6370097394051066,
      "grad_norm": 0.14254720323207454,
      "learning_rate": 6.993916581599202e-05,
      "loss": 1.0896,
      "step": 605
    },
    {
      "epoch": 0.6422742827059753,
      "grad_norm": 0.14534591022474969,
      "learning_rate": 6.819008342641273e-05,
      "loss": 1.0805,
      "step": 610
    },
    {
      "epoch": 0.6475388260068439,
      "grad_norm": 0.1471482502229213,
      "learning_rate": 6.645176257372055e-05,
      "loss": 1.0933,
      "step": 615
    },
    {
      "epoch": 0.6528033693077125,
      "grad_norm": 0.14967562406928056,
      "learning_rate": 6.472479134509052e-05,
      "loss": 1.0987,
      "step": 620
    },
    {
      "epoch": 0.6580679126085812,
      "grad_norm": 0.14756218985788289,
      "learning_rate": 6.300975398803362e-05,
      "loss": 1.0862,
      "step": 625
    },
    {
      "epoch": 0.6633324559094499,
      "grad_norm": 0.14358810278632364,
      "learning_rate": 6.130723071274107e-05,
      "loss": 1.0736,
      "step": 630
    },
    {
      "epoch": 0.6685969992103185,
      "grad_norm": 0.14508119820046267,
      "learning_rate": 5.961779749579516e-05,
      "loss": 1.077,
      "step": 635
    },
    {
      "epoch": 0.6738615425111871,
      "grad_norm": 0.14868475648668983,
      "learning_rate": 5.794202588531166e-05,
      "loss": 1.0921,
      "step": 640
    },
    {
      "epoch": 0.6791260858120558,
      "grad_norm": 0.14136660751737096,
      "learning_rate": 5.628048280758096e-05,
      "loss": 1.0967,
      "step": 645
    },
    {
      "epoch": 0.6843906291129245,
      "grad_norm": 0.14429824406995242,
      "learning_rate": 5.4633730375272594e-05,
      "loss": 1.094,
      "step": 650
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.1435583500936634,
      "learning_rate": 5.300232569726804e-05,
      "loss": 1.0796,
      "step": 655
    },
    {
      "epoch": 0.6949197157146617,
      "grad_norm": 0.14917594264214823,
      "learning_rate": 5.13868206901867e-05,
      "loss": 1.0813,
      "step": 660
    },
    {
      "epoch": 0.7001842590155304,
      "grad_norm": 0.14484547003342338,
      "learning_rate": 4.9787761891668397e-05,
      "loss": 1.0833,
      "step": 665
    },
    {
      "epoch": 0.7054488023163991,
      "grad_norm": 0.14125281408090304,
      "learning_rate": 4.820569027547533e-05,
      "loss": 1.0813,
      "step": 670
    },
    {
      "epoch": 0.7107133456172677,
      "grad_norm": 0.1408995053360923,
      "learning_rate": 4.6641141068476666e-05,
      "loss": 1.0752,
      "step": 675
    },
    {
      "epoch": 0.7159778889181364,
      "grad_norm": 0.1414179653044325,
      "learning_rate": 4.5094643569577186e-05,
      "loss": 1.054,
      "step": 680
    },
    {
      "epoch": 0.721242432219005,
      "grad_norm": 0.14582058548503438,
      "learning_rate": 4.356672097065134e-05,
      "loss": 1.1048,
      "step": 685
    },
    {
      "epoch": 0.7265069755198736,
      "grad_norm": 0.14009606861616825,
      "learning_rate": 4.205789017954364e-05,
      "loss": 1.0683,
      "step": 690
    },
    {
      "epoch": 0.7317715188207423,
      "grad_norm": 0.14586506040118713,
      "learning_rate": 4.056866164519465e-05,
      "loss": 1.0728,
      "step": 695
    },
    {
      "epoch": 0.737036062121611,
      "grad_norm": 0.14168474565307407,
      "learning_rate": 3.909953918495234e-05,
      "loss": 1.0476,
      "step": 700
    },
    {
      "epoch": 0.7423006054224796,
      "grad_norm": 0.14476382479542646,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 1.05,
      "step": 705
    },
    {
      "epoch": 0.7475651487233482,
      "grad_norm": 0.14528550784733454,
      "learning_rate": 3.622359357784569e-05,
      "loss": 1.0611,
      "step": 710
    },
    {
      "epoch": 0.7528296920242169,
      "grad_norm": 0.14781069746763306,
      "learning_rate": 3.481774338526954e-05,
      "loss": 1.0952,
      "step": 715
    },
    {
      "epoch": 0.7580942353250856,
      "grad_norm": 0.15618197530507127,
      "learning_rate": 3.343394484621855e-05,
      "loss": 1.0836,
      "step": 720
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 0.22087793925041818,
      "learning_rate": 3.207266611027069e-05,
      "loss": 1.0727,
      "step": 725
    },
    {
      "epoch": 0.7686233219268228,
      "grad_norm": 0.14674869869141435,
      "learning_rate": 3.0734367708383294e-05,
      "loss": 1.0712,
      "step": 730
    },
    {
      "epoch": 0.7738878652276915,
      "grad_norm": 0.14673826341334423,
      "learning_rate": 2.9419502397091713e-05,
      "loss": 1.0852,
      "step": 735
    },
    {
      "epoch": 0.7791524085285602,
      "grad_norm": 0.1426087824509766,
      "learning_rate": 2.812851500533843e-05,
      "loss": 1.0604,
      "step": 740
    },
    {
      "epoch": 0.7844169518294288,
      "grad_norm": 0.1446320144127932,
      "learning_rate": 2.6861842283983953e-05,
      "loss": 1.0537,
      "step": 745
    },
    {
      "epoch": 0.7896814951302974,
      "grad_norm": 0.14326111319394175,
      "learning_rate": 2.5619912758050725e-05,
      "loss": 1.0942,
      "step": 750
    },
    {
      "epoch": 0.7949460384311661,
      "grad_norm": 0.14149919988871043,
      "learning_rate": 2.4403146581749925e-05,
      "loss": 1.0578,
      "step": 755
    },
    {
      "epoch": 0.8002105817320347,
      "grad_norm": 0.14034086298796508,
      "learning_rate": 2.3211955396340002e-05,
      "loss": 1.0808,
      "step": 760
    },
    {
      "epoch": 0.8054751250329034,
      "grad_norm": 0.1433790314655123,
      "learning_rate": 2.204674219086531e-05,
      "loss": 1.0906,
      "step": 765
    },
    {
      "epoch": 0.8107396683337721,
      "grad_norm": 0.138618618401559,
      "learning_rate": 2.090790116582191e-05,
      "loss": 1.0559,
      "step": 770
    },
    {
      "epoch": 0.8160042116346407,
      "grad_norm": 0.1429827381187093,
      "learning_rate": 1.9795817599796418e-05,
      "loss": 1.0792,
      "step": 775
    },
    {
      "epoch": 0.8212687549355093,
      "grad_norm": 0.14200271718072968,
      "learning_rate": 1.871086771912348e-05,
      "loss": 1.0702,
      "step": 780
    },
    {
      "epoch": 0.826533298236378,
      "grad_norm": 0.1429932480295589,
      "learning_rate": 1.7653418570605475e-05,
      "loss": 1.0715,
      "step": 785
    },
    {
      "epoch": 0.8317978415372467,
      "grad_norm": 0.14431467515210814,
      "learning_rate": 1.6623827897337762e-05,
      "loss": 1.0713,
      "step": 790
    },
    {
      "epoch": 0.8370623848381153,
      "grad_norm": 0.15238820455432608,
      "learning_rate": 1.562244401768144e-05,
      "loss": 1.0824,
      "step": 795
    },
    {
      "epoch": 0.8423269281389839,
      "grad_norm": 0.14830242766673976,
      "learning_rate": 1.4649605707424707e-05,
      "loss": 1.0787,
      "step": 800
    },
    {
      "epoch": 0.8475914714398526,
      "grad_norm": 0.14468170557092047,
      "learning_rate": 1.3705642085172366e-05,
      "loss": 1.0737,
      "step": 805
    },
    {
      "epoch": 0.8528560147407213,
      "grad_norm": 0.14674968769736463,
      "learning_rate": 1.2790872501002472e-05,
      "loss": 1.0577,
      "step": 810
    },
    {
      "epoch": 0.8581205580415899,
      "grad_norm": 0.14311627432536864,
      "learning_rate": 1.1905606428427774e-05,
      "loss": 1.0692,
      "step": 815
    },
    {
      "epoch": 0.8633851013424585,
      "grad_norm": 0.14558376197107287,
      "learning_rate": 1.105014335969855e-05,
      "loss": 1.0934,
      "step": 820
    },
    {
      "epoch": 0.8686496446433272,
      "grad_norm": 0.14414555681497093,
      "learning_rate": 1.0224772704482033e-05,
      "loss": 1.0875,
      "step": 825
    },
    {
      "epoch": 0.8739141879441958,
      "grad_norm": 0.1399627142514978,
      "learning_rate": 9.429773691952858e-06,
      "loss": 1.082,
      "step": 830
    },
    {
      "epoch": 0.8791787312450645,
      "grad_norm": 0.1392001373823857,
      "learning_rate": 8.665415276327871e-06,
      "loss": 1.0573,
      "step": 835
    },
    {
      "epoch": 0.8844432745459332,
      "grad_norm": 0.13993969105859186,
      "learning_rate": 7.931956045876688e-06,
      "loss": 1.0448,
      "step": 840
    },
    {
      "epoch": 0.8897078178468018,
      "grad_norm": 0.16741517197447736,
      "learning_rate": 7.229644135439473e-06,
      "loss": 1.104,
      "step": 845
    },
    {
      "epoch": 0.8949723611476704,
      "grad_norm": 0.14123729142229655,
      "learning_rate": 6.558717142480919e-06,
      "loss": 1.0808,
      "step": 850
    },
    {
      "epoch": 0.9002369044485391,
      "grad_norm": 0.1424278055064695,
      "learning_rate": 5.919402046709288e-06,
      "loss": 1.0709,
      "step": 855
    },
    {
      "epoch": 0.9055014477494078,
      "grad_norm": 0.13993993967003346,
      "learning_rate": 5.311915133287415e-06,
      "loss": 1.0941,
      "step": 860
    },
    {
      "epoch": 0.9107659910502763,
      "grad_norm": 0.14557850289664284,
      "learning_rate": 4.7364619196617495e-06,
      "loss": 1.0492,
      "step": 865
    },
    {
      "epoch": 0.916030534351145,
      "grad_norm": 0.1450177459066908,
      "learning_rate": 4.193237086034351e-06,
      "loss": 1.0972,
      "step": 870
    },
    {
      "epoch": 0.9212950776520137,
      "grad_norm": 0.1570091074884799,
      "learning_rate": 3.6824244095010065e-06,
      "loss": 1.0695,
      "step": 875
    },
    {
      "epoch": 0.9265596209528824,
      "grad_norm": 0.14097561405495265,
      "learning_rate": 3.2041967018780707e-06,
      "loss": 1.0948,
      "step": 880
    },
    {
      "epoch": 0.931824164253751,
      "grad_norm": 0.1420984285291773,
      "learning_rate": 2.7587157512388718e-06,
      "loss": 1.0573,
      "step": 885
    },
    {
      "epoch": 0.9370887075546196,
      "grad_norm": 0.1545471738706476,
      "learning_rate": 2.346132267179646e-06,
      "loss": 1.0786,
      "step": 890
    },
    {
      "epoch": 0.9423532508554883,
      "grad_norm": 0.14481364480205125,
      "learning_rate": 1.9665858298333005e-06,
      "loss": 1.0939,
      "step": 895
    },
    {
      "epoch": 0.9476177941563569,
      "grad_norm": 0.1446556897144525,
      "learning_rate": 1.6202048426483651e-06,
      "loss": 1.0752,
      "step": 900
    },
    {
      "epoch": 0.9528823374572256,
      "grad_norm": 0.13840641658264988,
      "learning_rate": 1.3071064889491724e-06,
      "loss": 1.0757,
      "step": 905
    },
    {
      "epoch": 0.9581468807580943,
      "grad_norm": 0.1405867091258211,
      "learning_rate": 1.0273966922918155e-06,
      "loss": 1.0886,
      "step": 910
    },
    {
      "epoch": 0.9634114240589629,
      "grad_norm": 0.15143973079201015,
      "learning_rate": 7.81170080629412e-07,
      "loss": 1.0337,
      "step": 915
    },
    {
      "epoch": 0.9686759673598315,
      "grad_norm": 0.15113893856195346,
      "learning_rate": 5.68509954298757e-07,
      "loss": 1.099,
      "step": 920
    },
    {
      "epoch": 0.9739405106607002,
      "grad_norm": 0.1436446854214333,
      "learning_rate": 3.8948825783918784e-07,
      "loss": 1.0595,
      "step": 925
    },
    {
      "epoch": 0.9792050539615689,
      "grad_norm": 0.14373165990559605,
      "learning_rate": 2.4416555565318635e-07,
      "loss": 1.0815,
      "step": 930
    },
    {
      "epoch": 0.9844695972624374,
      "grad_norm": 0.14233020784379563,
      "learning_rate": 1.3259101151694708e-07,
      "loss": 1.0569,
      "step": 935
    },
    {
      "epoch": 0.9897341405633061,
      "grad_norm": 0.13823967108377017,
      "learning_rate": 5.480237194799287e-08,
      "loss": 1.0689,
      "step": 940
    },
    {
      "epoch": 0.9949986838641748,
      "grad_norm": 0.1431568671824589,
      "learning_rate": 1.0825953435122938e-08,
      "loss": 1.0709,
      "step": 945
    },
    {
      "epoch": 0.9992103185048697,
      "eval_loss": 1.07915198802948,
      "eval_runtime": 3821.2872,
      "eval_samples_per_second": 3.522,
      "eval_steps_per_second": 0.881,
      "step": 949
    },
    {
      "epoch": 0.9992103185048697,
      "step": 949,
      "total_flos": 1959448100732928.0,
      "train_loss": 1.0930153100081064,
      "train_runtime": 22340.3866,
      "train_samples_per_second": 2.72,
      "train_steps_per_second": 0.042
    }
  ],
  "logging_steps": 5,
  "max_steps": 949,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1959448100732928.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}