{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999162712810494,
  "eval_steps": 500,
  "global_step": 895,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0011163829193413341,
      "grad_norm": 0.3974737008844776,
      "learning_rate": 2.2222222222222225e-06,
      "loss": 1.607,
      "step": 1
    },
    {
      "epoch": 0.0055819145967066705,
      "grad_norm": 0.4252789938746273,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 1.5942,
      "step": 5
    },
    {
      "epoch": 0.011163829193413341,
      "grad_norm": 0.4658525758416883,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.5877,
      "step": 10
    },
    {
      "epoch": 0.01674574379012001,
      "grad_norm": 0.27282017063503095,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.5695,
      "step": 15
    },
    {
      "epoch": 0.022327658386826682,
      "grad_norm": 0.24165395076839943,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.558,
      "step": 20
    },
    {
      "epoch": 0.027909572983533353,
      "grad_norm": 0.1767403193777301,
      "learning_rate": 5.555555555555556e-05,
      "loss": 1.4678,
      "step": 25
    },
    {
      "epoch": 0.03349148758024002,
      "grad_norm": 0.16356442786177314,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.467,
      "step": 30
    },
    {
      "epoch": 0.039073402176946694,
      "grad_norm": 0.15556520577978836,
      "learning_rate": 7.777777777777778e-05,
      "loss": 1.429,
      "step": 35
    },
    {
      "epoch": 0.044655316773653364,
      "grad_norm": 0.1263609432879071,
      "learning_rate": 8.888888888888889e-05,
      "loss": 1.4253,
      "step": 40
    },
    {
      "epoch": 0.050237231370360035,
      "grad_norm": 0.1696978939183065,
      "learning_rate": 0.0001,
      "loss": 1.3895,
      "step": 45
    },
    {
      "epoch": 0.055819145967066705,
      "grad_norm": 0.10830406775154863,
      "learning_rate": 0.00011111111111111112,
      "loss": 1.3645,
      "step": 50
    },
    {
      "epoch": 0.061401060563773376,
      "grad_norm": 0.08414898733986972,
      "learning_rate": 0.00012222222222222224,
      "loss": 1.3082,
      "step": 55
    },
    {
      "epoch": 0.06698297516048005,
      "grad_norm": 0.07973185533121883,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.2962,
      "step": 60
    },
    {
      "epoch": 0.07256488975718671,
      "grad_norm": 0.09811845100733502,
      "learning_rate": 0.00014444444444444444,
      "loss": 1.3061,
      "step": 65
    },
    {
      "epoch": 0.07814680435389339,
      "grad_norm": 0.08298371354138047,
      "learning_rate": 0.00015555555555555556,
      "loss": 1.3017,
      "step": 70
    },
    {
      "epoch": 0.08372871895060005,
      "grad_norm": 0.07510078793315819,
      "learning_rate": 0.0001666666666666667,
      "loss": 1.2989,
      "step": 75
    },
    {
      "epoch": 0.08931063354730673,
      "grad_norm": 0.07085309149624731,
      "learning_rate": 0.00017777777777777779,
      "loss": 1.2787,
      "step": 80
    },
    {
      "epoch": 0.09489254814401339,
      "grad_norm": 0.09400917029194135,
      "learning_rate": 0.00018888888888888888,
      "loss": 1.2843,
      "step": 85
    },
    {
      "epoch": 0.10047446274072007,
      "grad_norm": 0.09230059652672952,
      "learning_rate": 0.0002,
      "loss": 1.262,
      "step": 90
    },
    {
      "epoch": 0.10605637733742673,
      "grad_norm": 0.10009657676945562,
      "learning_rate": 0.00019998096274980728,
      "loss": 1.2821,
      "step": 95
    },
    {
      "epoch": 0.11163829193413341,
      "grad_norm": 0.12201167887174731,
      "learning_rate": 0.000199923858247567,
      "loss": 1.2668,
      "step": 100
    },
    {
      "epoch": 0.11722020653084007,
      "grad_norm": 0.09628889966493127,
      "learning_rate": 0.00019982870823553308,
      "loss": 1.2503,
      "step": 105
    },
    {
      "epoch": 0.12280212112754675,
      "grad_norm": 0.10028621820088561,
      "learning_rate": 0.00019969554894159723,
      "loss": 1.2632,
      "step": 110
    },
    {
      "epoch": 0.12838403572425341,
      "grad_norm": 0.08593461106683208,
      "learning_rate": 0.00019952443106549533,
      "loss": 1.2396,
      "step": 115
    },
    {
      "epoch": 0.1339659503209601,
      "grad_norm": 0.08827739693201113,
      "learning_rate": 0.00019931541975950378,
      "loss": 1.2784,
      "step": 120
    },
    {
      "epoch": 0.13954786491766677,
      "grad_norm": 0.0911508607290428,
      "learning_rate": 0.00019906859460363307,
      "loss": 1.2689,
      "step": 125
    },
    {
      "epoch": 0.14512977951437342,
      "grad_norm": 0.12157025851983183,
      "learning_rate": 0.00019878404957532814,
      "loss": 1.2563,
      "step": 130
    },
    {
      "epoch": 0.1507116941110801,
      "grad_norm": 0.10772740664174668,
      "learning_rate": 0.0001984618930136869,
      "loss": 1.2853,
      "step": 135
    },
    {
      "epoch": 0.15629360870778677,
      "grad_norm": 0.09940063564218579,
      "learning_rate": 0.00019810224757821064,
      "loss": 1.241,
      "step": 140
    },
    {
      "epoch": 0.16187552330449345,
      "grad_norm": 0.09118466185918958,
      "learning_rate": 0.00019770525020210204,
      "loss": 1.2746,
      "step": 145
    },
    {
      "epoch": 0.1674574379012001,
      "grad_norm": 0.09674538853934604,
      "learning_rate": 0.0001972710520401287,
      "loss": 1.2561,
      "step": 150
    },
    {
      "epoch": 0.17303935249790678,
      "grad_norm": 0.1126652956332537,
      "learning_rate": 0.0001967998184110713,
      "loss": 1.257,
      "step": 155
    },
    {
      "epoch": 0.17862126709461346,
      "grad_norm": 0.0869341846350413,
      "learning_rate": 0.00019629172873477995,
      "loss": 1.2529,
      "step": 160
    },
    {
      "epoch": 0.18420318169132013,
      "grad_norm": 0.09888626799953022,
      "learning_rate": 0.00019574697646386027,
      "loss": 1.244,
      "step": 165
    },
    {
      "epoch": 0.18978509628802678,
      "grad_norm": 0.09785278620381999,
      "learning_rate": 0.0001951657690100178,
      "loss": 1.2334,
      "step": 170
    },
    {
      "epoch": 0.19536701088473346,
      "grad_norm": 0.07378537831469305,
      "learning_rate": 0.0001945483276650868,
      "loss": 1.2415,
      "step": 175
    },
    {
      "epoch": 0.20094892548144014,
      "grad_norm": 0.08814263560160436,
      "learning_rate": 0.0001938948875167745,
      "loss": 1.2512,
      "step": 180
    },
    {
      "epoch": 0.20653084007814682,
      "grad_norm": 0.09775538276417937,
      "learning_rate": 0.00019320569735915271,
      "loss": 1.2213,
      "step": 185
    },
    {
      "epoch": 0.21211275467485347,
      "grad_norm": 0.09538626874304115,
      "learning_rate": 0.00019248101959793066,
      "loss": 1.2354,
      "step": 190
    },
    {
      "epoch": 0.21769466927156014,
      "grad_norm": 0.08332625788355251,
      "learning_rate": 0.00019172113015054532,
      "loss": 1.2444,
      "step": 195
    },
    {
      "epoch": 0.22327658386826682,
      "grad_norm": 0.08309090570657847,
      "learning_rate": 0.00019092631834110723,
      "loss": 1.2316,
      "step": 200
    },
    {
      "epoch": 0.2288584984649735,
      "grad_norm": 0.09054323693110126,
      "learning_rate": 0.0001900968867902419,
      "loss": 1.27,
      "step": 205
    },
    {
      "epoch": 0.23444041306168015,
      "grad_norm": 0.08549436898181585,
      "learning_rate": 0.00018923315129986835,
      "loss": 1.2348,
      "step": 210
    },
    {
      "epoch": 0.24002232765838682,
      "grad_norm": 0.086610993256363,
      "learning_rate": 0.00018833544073295917,
      "loss": 1.2461,
      "step": 215
    },
    {
      "epoch": 0.2456042422550935,
      "grad_norm": 0.08146109722648563,
      "learning_rate": 0.00018740409688832764,
      "loss": 1.2431,
      "step": 220
    },
    {
      "epoch": 0.2511861568518002,
      "grad_norm": 0.08232534290451142,
      "learning_rate": 0.00018643947437048944,
      "loss": 1.2408,
      "step": 225
    },
    {
      "epoch": 0.25676807144850683,
      "grad_norm": 0.08507739560575232,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.243,
      "step": 230
    },
    {
      "epoch": 0.26234998604521353,
      "grad_norm": 0.09782665661618925,
      "learning_rate": 0.00018441187494686053,
      "loss": 1.2426,
      "step": 235
    },
    {
      "epoch": 0.2679319006419202,
      "grad_norm": 0.0809973818897895,
      "learning_rate": 0.0001833496700394202,
      "loss": 1.2345,
      "step": 240
    },
    {
      "epoch": 0.27351381523862683,
      "grad_norm": 0.09269081567542259,
      "learning_rate": 0.00018225573016153945,
      "loss": 1.2343,
      "step": 245
    },
    {
      "epoch": 0.27909572983533354,
      "grad_norm": 0.09671785308848269,
      "learning_rate": 0.00018113047182536127,
      "loss": 1.2327,
      "step": 250
    },
    {
      "epoch": 0.2846776444320402,
      "grad_norm": 0.0906432644454991,
      "learning_rate": 0.00017997432346737524,
      "loss": 1.2532,
      "step": 255
    },
    {
      "epoch": 0.29025955902874684,
      "grad_norm": 0.08371586611488784,
      "learning_rate": 0.00017878772528529232,
      "loss": 1.2384,
      "step": 260
    },
    {
      "epoch": 0.29584147362545354,
      "grad_norm": 0.08640773776491195,
      "learning_rate": 0.000177571129070442,
      "loss": 1.2193,
      "step": 265
    },
    {
      "epoch": 0.3014233882221602,
      "grad_norm": 0.08164649256677078,
      "learning_rate": 0.00017632499803575474,
      "loss": 1.2327,
      "step": 270
    },
    {
      "epoch": 0.3070053028188669,
      "grad_norm": 0.09156690890905773,
      "learning_rate": 0.00017504980663939613,
      "loss": 1.2534,
      "step": 275
    },
    {
      "epoch": 0.31258721741557355,
      "grad_norm": 0.08393163680296412,
      "learning_rate": 0.00017374604040411935,
      "loss": 1.2411,
      "step": 280
    },
    {
      "epoch": 0.3181691320122802,
      "grad_norm": 0.08340859881557235,
      "learning_rate": 0.00017241419573240462,
      "loss": 1.2398,
      "step": 285
    },
    {
      "epoch": 0.3237510466089869,
      "grad_norm": 0.08622506272483123,
      "learning_rate": 0.00017105477971745666,
      "loss": 1.2321,
      "step": 290
    },
    {
      "epoch": 0.32933296120569355,
      "grad_norm": 0.08338497396964428,
      "learning_rate": 0.00016966830995013133,
      "loss": 1.2453,
      "step": 295
    },
    {
      "epoch": 0.3349148758024002,
      "grad_norm": 0.08718794446584939,
      "learning_rate": 0.00016825531432186543,
      "loss": 1.2134,
      "step": 300
    },
    {
      "epoch": 0.3404967903991069,
      "grad_norm": 0.09158015865602193,
      "learning_rate": 0.00016681633082368498,
      "loss": 1.223,
      "step": 305
    },
    {
      "epoch": 0.34607870499581356,
      "grad_norm": 0.08768121171152027,
      "learning_rate": 0.0001653519073413675,
      "loss": 1.235,
      "step": 310
    },
    {
      "epoch": 0.3516606195925202,
      "grad_norm": 0.08907125432704804,
      "learning_rate": 0.00016386260144683745,
      "loss": 1.2169,
      "step": 315
    },
    {
      "epoch": 0.3572425341892269,
      "grad_norm": 0.08767993008424768,
      "learning_rate": 0.00016234898018587337,
      "loss": 1.2435,
      "step": 320
    },
    {
      "epoch": 0.36282444878593356,
      "grad_norm": 0.08991663909567185,
      "learning_rate": 0.00016081161986220807,
      "loss": 1.2371,
      "step": 325
    },
    {
      "epoch": 0.36840636338264027,
      "grad_norm": 0.07876061570647706,
      "learning_rate": 0.00015925110581810394,
      "loss": 1.2118,
      "step": 330
    },
    {
      "epoch": 0.3739882779793469,
      "grad_norm": 0.09088539514665886,
      "learning_rate": 0.00015766803221148673,
      "loss": 1.2333,
      "step": 335
    },
    {
      "epoch": 0.37957019257605357,
      "grad_norm": 0.09371191064756335,
      "learning_rate": 0.00015606300178972287,
      "loss": 1.2192,
      "step": 340
    },
    {
      "epoch": 0.38515210717276027,
      "grad_norm": 0.0988524027231739,
      "learning_rate": 0.00015443662566012645,
      "loss": 1.2201,
      "step": 345
    },
    {
      "epoch": 0.3907340217694669,
      "grad_norm": 0.08068655015289312,
      "learning_rate": 0.00015278952305728324,
      "loss": 1.2312,
      "step": 350
    },
    {
      "epoch": 0.39631593636617357,
      "grad_norm": 0.08530580419429784,
      "learning_rate": 0.00015112232110728015,
      "loss": 1.2103,
      "step": 355
    },
    {
      "epoch": 0.4018978509628803,
      "grad_norm": 0.0832856621155852,
      "learning_rate": 0.00014943565458893,
      "loss": 1.2049,
      "step": 360
    },
    {
      "epoch": 0.4074797655595869,
      "grad_norm": 0.10112900442930213,
      "learning_rate": 0.00014773016569208283,
      "loss": 1.2381,
      "step": 365
    },
    {
      "epoch": 0.41306168015629363,
      "grad_norm": 0.08250019530921109,
      "learning_rate": 0.00014600650377311522,
      "loss": 1.2185,
      "step": 370
    },
    {
      "epoch": 0.4186435947530003,
      "grad_norm": 0.0987578329954232,
      "learning_rate": 0.0001442653251076912,
      "loss": 1.2222,
      "step": 375
    },
    {
      "epoch": 0.42422550934970693,
      "grad_norm": 0.08530899013880136,
      "learning_rate": 0.00014250729264088843,
      "loss": 1.2556,
      "step": 380
    },
    {
      "epoch": 0.42980742394641364,
      "grad_norm": 0.10267562745822716,
      "learning_rate": 0.00014073307573478526,
      "loss": 1.2146,
      "step": 385
    },
    {
      "epoch": 0.4353893385431203,
      "grad_norm": 0.09189285950155643,
      "learning_rate": 0.00013894334991360448,
      "loss": 1.2206,
      "step": 390
    },
    {
      "epoch": 0.44097125313982694,
      "grad_norm": 0.08370196846674145,
      "learning_rate": 0.00013713879660651068,
      "loss": 1.2076,
      "step": 395
    },
    {
      "epoch": 0.44655316773653364,
      "grad_norm": 0.08423557906306067,
      "learning_rate": 0.0001353201028881598,
      "loss": 1.2223,
      "step": 400
    },
    {
      "epoch": 0.4521350823332403,
      "grad_norm": 0.08292081122541138,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.2294,
      "step": 405
    },
    {
      "epoch": 0.457716996929947,
      "grad_norm": 0.08767079524531268,
      "learning_rate": 0.00013164306917211476,
      "loss": 1.2229,
      "step": 410
    },
    {
      "epoch": 0.46329891152665365,
      "grad_norm": 0.0865942463810843,
      "learning_rate": 0.000129786129186637,
      "loss": 1.2163,
      "step": 415
    },
    {
      "epoch": 0.4688808261233603,
      "grad_norm": 0.08101515714055764,
      "learning_rate": 0.00012791784828128724,
      "loss": 1.2337,
      "step": 420
    },
    {
      "epoch": 0.474462740720067,
      "grad_norm": 0.09009147490161429,
      "learning_rate": 0.00012603893779468604,
      "loss": 1.2148,
      "step": 425
    },
    {
      "epoch": 0.48004465531677365,
      "grad_norm": 0.08757351279515291,
      "learning_rate": 0.0001241501131126138,
      "loss": 1.2056,
      "step": 430
    },
    {
      "epoch": 0.4856265699134803,
      "grad_norm": 0.08418609867162384,
      "learning_rate": 0.00012225209339563145,
      "loss": 1.2419,
      "step": 435
    },
    {
      "epoch": 0.491208484510187,
      "grad_norm": 0.08790367723325618,
      "learning_rate": 0.0001203456013052634,
      "loss": 1.2115,
      "step": 440
    },
    {
      "epoch": 0.49679039910689365,
      "grad_norm": 0.08071789319204539,
      "learning_rate": 0.00011843136272884794,
      "loss": 1.2072,
      "step": 445
    },
    {
      "epoch": 0.5023723137036004,
      "grad_norm": 0.0879278395825441,
      "learning_rate": 0.00011651010650315923,
      "loss": 1.2194,
      "step": 450
    },
    {
      "epoch": 0.507954228300307,
      "grad_norm": 0.08506166782358492,
      "learning_rate": 0.00011458256413690633,
      "loss": 1.2077,
      "step": 455
    },
    {
      "epoch": 0.5135361428970137,
      "grad_norm": 0.08984730610411729,
      "learning_rate": 0.00011264946953221496,
      "loss": 1.2484,
      "step": 460
    },
    {
      "epoch": 0.5191180574937203,
      "grad_norm": 0.2978083078661545,
      "learning_rate": 0.00011071155870519777,
      "loss": 1.2491,
      "step": 465
    },
    {
      "epoch": 0.5246999720904271,
      "grad_norm": 0.08504227931172395,
      "learning_rate": 0.00010876956950572006,
      "loss": 1.2268,
      "step": 470
    },
    {
      "epoch": 0.5302818866871337,
      "grad_norm": 0.08620167875904892,
      "learning_rate": 0.0001068242413364671,
      "loss": 1.2252,
      "step": 475
    },
    {
      "epoch": 0.5358638012838404,
      "grad_norm": 0.08669957736640198,
      "learning_rate": 0.00010487631487142017,
      "loss": 1.217,
      "step": 480
    },
    {
      "epoch": 0.541445715880547,
      "grad_norm": 0.08577871896034497,
      "learning_rate": 0.00010292653177384876,
      "loss": 1.2169,
      "step": 485
    },
    {
      "epoch": 0.5470276304772537,
      "grad_norm": 0.08417260057895289,
      "learning_rate": 0.00010097563441392581,
      "loss": 1.2354,
      "step": 490
    },
    {
      "epoch": 0.5526095450739603,
      "grad_norm": 0.08676422431924583,
      "learning_rate": 9.90243655860742e-05,
      "loss": 1.2039,
      "step": 495
    },
    {
      "epoch": 0.5581914596706671,
      "grad_norm": 0.09103906295111437,
      "learning_rate": 9.707346822615128e-05,
      "loss": 1.2194,
      "step": 500
    },
    {
      "epoch": 0.5637733742673737,
      "grad_norm": 0.08594537537719427,
      "learning_rate": 9.512368512857984e-05,
      "loss": 1.1949,
      "step": 505
    },
    {
      "epoch": 0.5693552888640804,
      "grad_norm": 0.08392759057088481,
      "learning_rate": 9.317575866353292e-05,
      "loss": 1.2196,
      "step": 510
    },
    {
      "epoch": 0.574937203460787,
      "grad_norm": 0.08201912454761111,
      "learning_rate": 9.123043049427995e-05,
      "loss": 1.2131,
      "step": 515
    },
    {
      "epoch": 0.5805191180574937,
      "grad_norm": 0.08925291750313868,
      "learning_rate": 8.928844129480227e-05,
      "loss": 1.2369,
      "step": 520
    },
    {
      "epoch": 0.5861010326542004,
      "grad_norm": 0.08954980070951671,
      "learning_rate": 8.735053046778506e-05,
      "loss": 1.2175,
      "step": 525
    },
    {
      "epoch": 0.5916829472509071,
      "grad_norm": 0.08574100993825345,
      "learning_rate": 8.541743586309365e-05,
      "loss": 1.2166,
      "step": 530
    },
    {
      "epoch": 0.5972648618476137,
      "grad_norm": 0.08840883290578404,
      "learning_rate": 8.348989349684076e-05,
      "loss": 1.2271,
      "step": 535
    },
    {
      "epoch": 0.6028467764443204,
      "grad_norm": 0.08443946017557556,
      "learning_rate": 8.156863727115211e-05,
      "loss": 1.2329,
      "step": 540
    },
    {
      "epoch": 0.608428691041027,
      "grad_norm": 0.0902640782545258,
      "learning_rate": 7.965439869473664e-05,
      "loss": 1.2253,
      "step": 545
    },
    {
      "epoch": 0.6140106056377338,
      "grad_norm": 0.08988630625422679,
      "learning_rate": 7.774790660436858e-05,
      "loss": 1.1785,
      "step": 550
    },
    {
      "epoch": 0.6195925202344404,
      "grad_norm": 0.08134808753957644,
      "learning_rate": 7.584988688738622e-05,
      "loss": 1.2261,
      "step": 555
    },
    {
      "epoch": 0.6251744348311471,
      "grad_norm": 0.08768193779762151,
      "learning_rate": 7.396106220531398e-05,
      "loss": 1.2463,
      "step": 560
    },
    {
      "epoch": 0.6307563494278537,
      "grad_norm": 0.0885816930556393,
      "learning_rate": 7.208215171871277e-05,
      "loss": 1.2141,
      "step": 565
    },
    {
      "epoch": 0.6363382640245604,
      "grad_norm": 0.08553683878588977,
      "learning_rate": 7.021387081336301e-05,
      "loss": 1.2026,
      "step": 570
    },
    {
      "epoch": 0.641920178621267,
      "grad_norm": 0.09505838067263224,
      "learning_rate": 6.835693082788525e-05,
      "loss": 1.2168,
      "step": 575
    },
    {
      "epoch": 0.6475020932179738,
      "grad_norm": 0.08769224685329463,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.2493,
      "step": 580
    },
    {
      "epoch": 0.6530840078146805,
      "grad_norm": 0.07990213288377576,
      "learning_rate": 6.46798971118402e-05,
      "loss": 1.2308,
      "step": 585
    },
    {
      "epoch": 0.6586659224113871,
      "grad_norm": 0.08133261350163556,
      "learning_rate": 6.286120339348935e-05,
      "loss": 1.2014,
      "step": 590
    },
    {
      "epoch": 0.6642478370080938,
      "grad_norm": 0.09363089434544866,
      "learning_rate": 6.105665008639557e-05,
      "loss": 1.2238,
      "step": 595
    },
    {
      "epoch": 0.6698297516048004,
      "grad_norm": 0.07910287951552411,
      "learning_rate": 5.926692426521474e-05,
      "loss": 1.2473,
      "step": 600
    },
    {
      "epoch": 0.6754116662015072,
      "grad_norm": 0.0801209902764544,
      "learning_rate": 5.749270735911158e-05,
      "loss": 1.1975,
      "step": 605
    },
    {
      "epoch": 0.6809935807982138,
      "grad_norm": 0.08087293360533905,
      "learning_rate": 5.573467489230879e-05,
      "loss": 1.1966,
      "step": 610
    },
    {
      "epoch": 0.6865754953949205,
      "grad_norm": 0.08220997258417966,
      "learning_rate": 5.399349622688479e-05,
      "loss": 1.2345,
      "step": 615
    },
    {
      "epoch": 0.6921574099916271,
      "grad_norm": 0.0825575277760057,
      "learning_rate": 5.226983430791722e-05,
      "loss": 1.2289,
      "step": 620
    },
    {
      "epoch": 0.6977393245883338,
      "grad_norm": 0.08305460425818378,
      "learning_rate": 5.0564345411070025e-05,
      "loss": 1.204,
      "step": 625
    },
    {
      "epoch": 0.7033212391850404,
      "grad_norm": 0.08011105262542664,
      "learning_rate": 4.8877678892719866e-05,
      "loss": 1.1946,
      "step": 630
    },
    {
      "epoch": 0.7089031537817472,
      "grad_norm": 0.08686069747720479,
      "learning_rate": 4.721047694271676e-05,
      "loss": 1.2,
      "step": 635
    },
    {
      "epoch": 0.7144850683784538,
      "grad_norm": 0.08537977661965272,
      "learning_rate": 4.556337433987359e-05,
      "loss": 1.2054,
      "step": 640
    },
    {
      "epoch": 0.7200669829751605,
      "grad_norm": 0.08857193949478791,
      "learning_rate": 4.393699821027716e-05,
      "loss": 1.1988,
      "step": 645
    },
    {
      "epoch": 0.7256488975718671,
      "grad_norm": 0.09608004999262602,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 1.2226,
      "step": 650
    },
    {
      "epoch": 0.7312308121685738,
      "grad_norm": 0.08235757922811432,
      "learning_rate": 4.074889418189608e-05,
      "loss": 1.2202,
      "step": 655
    },
    {
      "epoch": 0.7368127267652805,
      "grad_norm": 0.08660069823512372,
      "learning_rate": 3.9188380137791936e-05,
      "loss": 1.215,
      "step": 660
    },
    {
      "epoch": 0.7423946413619872,
      "grad_norm": 0.08090639704744831,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 1.2255,
      "step": 665
    },
    {
      "epoch": 0.7479765559586938,
      "grad_norm": 0.08082821477995833,
      "learning_rate": 3.613739855316257e-05,
      "loss": 1.2176,
      "step": 670
    },
    {
      "epoch": 0.7535584705554005,
      "grad_norm": 0.08469395080984878,
      "learning_rate": 3.46480926586325e-05,
      "loss": 1.2275,
      "step": 675
    },
    {
      "epoch": 0.7591403851521071,
      "grad_norm": 0.0871555466504494,
      "learning_rate": 3.3183669176315045e-05,
      "loss": 1.2351,
      "step": 680
    },
    {
      "epoch": 0.7647222997488139,
      "grad_norm": 0.08170223557553191,
      "learning_rate": 3.174468567813461e-05,
      "loss": 1.2074,
      "step": 685
    },
    {
      "epoch": 0.7703042143455205,
      "grad_norm": 0.0838318843856818,
      "learning_rate": 3.033169004986873e-05,
      "loss": 1.2396,
      "step": 690
    },
    {
      "epoch": 0.7758861289422272,
      "grad_norm": 0.08831381148889993,
      "learning_rate": 2.894522028254334e-05,
      "loss": 1.1947,
      "step": 695
    },
    {
      "epoch": 0.7814680435389338,
      "grad_norm": 0.08158536981215994,
      "learning_rate": 2.7585804267595384e-05,
      "loss": 1.208,
      "step": 700
    },
    {
      "epoch": 0.7870499581356405,
      "grad_norm": 0.08116519613000232,
      "learning_rate": 2.6253959595880673e-05,
      "loss": 1.2191,
      "step": 705
    },
    {
      "epoch": 0.7926318727323471,
      "grad_norm": 0.08294169676184929,
      "learning_rate": 2.495019336060387e-05,
      "loss": 1.195,
      "step": 710
    },
    {
      "epoch": 0.7982137873290539,
      "grad_norm": 0.08406756837278591,
      "learning_rate": 2.367500196424529e-05,
      "loss": 1.2203,
      "step": 715
    },
    {
      "epoch": 0.8037957019257606,
      "grad_norm": 0.08211403607563178,
      "learning_rate": 2.242887092955801e-05,
      "loss": 1.2041,
      "step": 720
    },
    {
      "epoch": 0.8093776165224672,
      "grad_norm": 0.07980978787138238,
      "learning_rate": 2.121227471470768e-05,
      "loss": 1.2394,
      "step": 725
    },
    {
      "epoch": 0.8149595311191739,
      "grad_norm": 0.08416184610807921,
      "learning_rate": 2.002567653262479e-05,
      "loss": 1.2228,
      "step": 730
    },
    {
      "epoch": 0.8205414457158805,
      "grad_norm": 0.08256062792318115,
      "learning_rate": 1.8869528174638752e-05,
      "loss": 1.203,
      "step": 735
    },
    {
      "epoch": 0.8261233603125873,
      "grad_norm": 0.09043351264554417,
      "learning_rate": 1.774426983846058e-05,
      "loss": 1.2275,
      "step": 740
    },
    {
      "epoch": 0.8317052749092939,
      "grad_norm": 0.08486147964302236,
      "learning_rate": 1.6650329960579792e-05,
      "loss": 1.2208,
      "step": 745
    },
    {
      "epoch": 0.8372871895060006,
      "grad_norm": 0.0935945466460169,
      "learning_rate": 1.5588125053139468e-05,
      "loss": 1.2131,
      "step": 750
    },
    {
      "epoch": 0.8428691041027072,
      "grad_norm": 0.08282716353976063,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 1.2284,
      "step": 755
    },
    {
      "epoch": 0.8484510186994139,
      "grad_norm": 0.08286515378820142,
      "learning_rate": 1.3560525629510568e-05,
      "loss": 1.2086,
      "step": 760
    },
    {
      "epoch": 0.8540329332961206,
      "grad_norm": 0.08295259360853054,
      "learning_rate": 1.259590311167238e-05,
      "loss": 1.2061,
      "step": 765
    },
    {
      "epoch": 0.8596148478928273,
      "grad_norm": 0.08358389042910293,
      "learning_rate": 1.166455926704082e-05,
      "loss": 1.222,
      "step": 770
    },
    {
      "epoch": 0.8651967624895339,
      "grad_norm": 0.08388863476839661,
      "learning_rate": 1.0766848700131648e-05,
      "loss": 1.2143,
      "step": 775
    },
    {
      "epoch": 0.8707786770862406,
      "grad_norm": 0.08277339984932784,
      "learning_rate": 9.903113209758096e-06,
      "loss": 1.2192,
      "step": 780
    },
    {
      "epoch": 0.8763605916829472,
      "grad_norm": 0.08938310164317657,
      "learning_rate": 9.073681658892775e-06,
      "loss": 1.2191,
      "step": 785
    },
    {
      "epoch": 0.8819425062796539,
      "grad_norm": 0.07910593096708422,
      "learning_rate": 8.278869849454718e-06,
      "loss": 1.2269,
      "step": 790
    },
    {
      "epoch": 0.8875244208763606,
      "grad_norm": 0.08295037453317607,
      "learning_rate": 7.5189804020693536e-06,
      "loss": 1.2021,
      "step": 795
    },
    {
      "epoch": 0.8931063354730673,
      "grad_norm": 0.08199446080472911,
      "learning_rate": 6.794302640847294e-06,
      "loss": 1.1961,
      "step": 800
    },
    {
      "epoch": 0.8986882500697739,
      "grad_norm": 0.08481342663212112,
      "learning_rate": 6.1051124832254944e-06,
      "loss": 1.2069,
      "step": 805
    },
    {
      "epoch": 0.9042701646664806,
      "grad_norm": 0.08217551850800063,
      "learning_rate": 5.451672334913216e-06,
      "loss": 1.2055,
      "step": 810
    },
    {
      "epoch": 0.9098520792631872,
      "grad_norm": 0.08322503504827561,
      "learning_rate": 4.834230989982213e-06,
      "loss": 1.2156,
      "step": 815
    },
    {
      "epoch": 0.915433993859894,
      "grad_norm": 0.08125961805104615,
      "learning_rate": 4.253023536139733e-06,
      "loss": 1.2005,
      "step": 820
    },
    {
      "epoch": 0.9210159084566006,
      "grad_norm": 0.09037682759604541,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 1.2079,
      "step": 825
    },
    {
      "epoch": 0.9265978230533073,
      "grad_norm": 0.08711894287392291,
      "learning_rate": 3.2001815889286856e-06,
      "loss": 1.232,
      "step": 830
    },
    {
      "epoch": 0.9321797376500139,
      "grad_norm": 0.08367132801462379,
      "learning_rate": 2.728947959871353e-06,
      "loss": 1.1858,
      "step": 835
    },
    {
      "epoch": 0.9377616522467206,
      "grad_norm": 0.0809801248589102,
      "learning_rate": 2.294749797897955e-06,
      "loss": 1.1871,
      "step": 840
    },
    {
      "epoch": 0.9433435668434274,
      "grad_norm": 0.08412969109149288,
      "learning_rate": 1.8977524217893783e-06,
      "loss": 1.2248,
      "step": 845
    },
    {
      "epoch": 0.948925481440134,
      "grad_norm": 0.08014128153610968,
      "learning_rate": 1.5381069863131037e-06,
      "loss": 1.2312,
      "step": 850
    },
    {
      "epoch": 0.9545073960368406,
      "grad_norm": 0.08040835492341503,
      "learning_rate": 1.2159504246718522e-06,
      "loss": 1.2213,
      "step": 855
    },
    {
      "epoch": 0.9600893106335473,
      "grad_norm": 0.08170226749481643,
      "learning_rate": 9.314053963669245e-07,
      "loss": 1.2114,
      "step": 860
    },
    {
      "epoch": 0.965671225230254,
      "grad_norm": 0.08123838559159317,
      "learning_rate": 6.845802404962243e-07,
      "loss": 1.2455,
      "step": 865
    },
    {
      "epoch": 0.9712531398269606,
      "grad_norm": 0.08532355248950987,
      "learning_rate": 4.7556893450466653e-07,
      "loss": 1.2017,
      "step": 870
    },
    {
      "epoch": 0.9768350544236674,
      "grad_norm": 0.07935413274906811,
      "learning_rate": 3.044510584027771e-07,
      "loss": 1.203,
      "step": 875
    },
    {
      "epoch": 0.982416969020374,
      "grad_norm": 0.07922680701516337,
      "learning_rate": 1.7129176446692984e-07,
      "loss": 1.1993,
      "step": 880
    },
    {
      "epoch": 0.9879988836170807,
      "grad_norm": 0.08007277288266887,
      "learning_rate": 7.614175243301213e-08,
      "loss": 1.221,
      "step": 885
    },
    {
      "epoch": 0.9935807982137873,
      "grad_norm": 0.08190648675567455,
      "learning_rate": 1.9037250192732726e-08,
      "loss": 1.2245,
      "step": 890
    },
    {
      "epoch": 0.999162712810494,
      "grad_norm": 0.07884795604109555,
      "learning_rate": 0.0,
      "loss": 1.2359,
      "step": 895
    },
    {
      "epoch": 0.999162712810494,
      "eval_loss": 1.1748292446136475,
      "eval_runtime": 1569.4225,
      "eval_samples_per_second": 8.524,
      "eval_steps_per_second": 0.533,
      "step": 895
    },
    {
      "epoch": 0.999162712810494,
      "step": 895,
      "total_flos": 1.1254972268150784e+16,
      "train_loss": 1.2433469767011078,
      "train_runtime": 20318.3129,
      "train_samples_per_second": 2.821,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 895,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1254972268150784e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}