{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 850,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05917159763313609,
      "grad_norm": 1.1964364051818848,
      "learning_rate": 8e-05,
      "loss": 2.1543,
      "step": 5
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 1.1647157669067383,
      "learning_rate": 0.00018,
      "loss": 1.6346,
      "step": 10
    },
    {
      "epoch": 0.17751479289940827,
      "grad_norm": 0.6181473135948181,
      "learning_rate": 0.00019904761904761907,
      "loss": 0.9978,
      "step": 15
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 0.5694869160652161,
      "learning_rate": 0.00019785714285714288,
      "loss": 0.9807,
      "step": 20
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 0.6708640456199646,
      "learning_rate": 0.00019666666666666666,
      "loss": 1.2485,
      "step": 25
    },
    {
      "epoch": 0.35502958579881655,
      "grad_norm": 0.7021521925926208,
      "learning_rate": 0.00019547619047619047,
      "loss": 1.1107,
      "step": 30
    },
    {
      "epoch": 0.41420118343195267,
      "grad_norm": 0.5067740082740784,
      "learning_rate": 0.0001942857142857143,
      "loss": 1.1293,
      "step": 35
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 0.5455656051635742,
      "learning_rate": 0.0001930952380952381,
      "loss": 1.1371,
      "step": 40
    },
    {
      "epoch": 0.5325443786982249,
      "grad_norm": 0.6190764307975769,
      "learning_rate": 0.00019190476190476192,
      "loss": 1.0131,
      "step": 45
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.544291615486145,
      "learning_rate": 0.00019071428571428573,
      "loss": 0.969,
      "step": 50
    },
    {
      "epoch": 0.650887573964497,
      "grad_norm": 0.600204348564148,
      "learning_rate": 0.0001895238095238095,
      "loss": 1.0509,
      "step": 55
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 0.5397897958755493,
      "learning_rate": 0.00018833333333333335,
      "loss": 0.9839,
      "step": 60
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.571107804775238,
      "learning_rate": 0.00018714285714285716,
      "loss": 0.9281,
      "step": 65
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 0.5789744257926941,
      "learning_rate": 0.00018595238095238097,
      "loss": 0.9543,
      "step": 70
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.6334110498428345,
      "learning_rate": 0.00018476190476190478,
      "loss": 1.0625,
      "step": 75
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 0.5225853323936462,
      "learning_rate": 0.00018357142857142858,
      "loss": 0.9734,
      "step": 80
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.7854479551315308,
      "learning_rate": 0.0001823809523809524,
      "loss": 0.9452,
      "step": 85
    },
    {
      "epoch": 1.0591715976331362,
      "grad_norm": 0.47304806113243103,
      "learning_rate": 0.0001811904761904762,
      "loss": 0.8473,
      "step": 90
    },
    {
      "epoch": 1.1183431952662721,
      "grad_norm": 0.6840282678604126,
      "learning_rate": 0.00018,
      "loss": 0.6189,
      "step": 95
    },
    {
      "epoch": 1.1775147928994083,
      "grad_norm": 0.5173171162605286,
      "learning_rate": 0.00017880952380952382,
      "loss": 0.9301,
      "step": 100
    },
    {
      "epoch": 1.2366863905325443,
      "grad_norm": 0.4892285466194153,
      "learning_rate": 0.00017761904761904763,
      "loss": 0.7659,
      "step": 105
    },
    {
      "epoch": 1.2958579881656804,
      "grad_norm": 0.5754849910736084,
      "learning_rate": 0.00017642857142857144,
      "loss": 0.7756,
      "step": 110
    },
    {
      "epoch": 1.3550295857988166,
      "grad_norm": 0.47277289628982544,
      "learning_rate": 0.00017523809523809525,
      "loss": 0.7844,
      "step": 115
    },
    {
      "epoch": 1.4142011834319526,
      "grad_norm": 0.8198840022087097,
      "learning_rate": 0.00017404761904761906,
      "loss": 0.8969,
      "step": 120
    },
    {
      "epoch": 1.4733727810650887,
      "grad_norm": 0.5040334463119507,
      "learning_rate": 0.00017285714285714287,
      "loss": 0.8727,
      "step": 125
    },
    {
      "epoch": 1.532544378698225,
      "grad_norm": 0.5382494926452637,
      "learning_rate": 0.00017166666666666667,
      "loss": 0.7677,
      "step": 130
    },
    {
      "epoch": 1.5917159763313609,
      "grad_norm": 0.631537914276123,
      "learning_rate": 0.00017047619047619048,
      "loss": 0.8364,
      "step": 135
    },
    {
      "epoch": 1.650887573964497,
      "grad_norm": 0.5718739628791809,
      "learning_rate": 0.0001692857142857143,
      "loss": 0.7447,
      "step": 140
    },
    {
      "epoch": 1.7100591715976332,
      "grad_norm": 0.557224452495575,
      "learning_rate": 0.0001680952380952381,
      "loss": 0.8828,
      "step": 145
    },
    {
      "epoch": 1.7692307692307692,
      "grad_norm": 0.6206871271133423,
      "learning_rate": 0.0001669047619047619,
      "loss": 0.5674,
      "step": 150
    },
    {
      "epoch": 1.8284023668639053,
      "grad_norm": 0.6297276616096497,
      "learning_rate": 0.00016571428571428575,
      "loss": 0.7956,
      "step": 155
    },
    {
      "epoch": 1.8875739644970415,
      "grad_norm": 0.6178033351898193,
      "learning_rate": 0.00016452380952380953,
      "loss": 0.5527,
      "step": 160
    },
    {
      "epoch": 1.9467455621301775,
      "grad_norm": 0.6269710063934326,
      "learning_rate": 0.00016333333333333334,
      "loss": 0.9635,
      "step": 165
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7290076613426208,
      "learning_rate": 0.00016214285714285715,
      "loss": 0.8048,
      "step": 170
    },
    {
      "epoch": 2.059171597633136,
      "grad_norm": 0.5376546382904053,
      "learning_rate": 0.00016095238095238096,
      "loss": 0.602,
      "step": 175
    },
    {
      "epoch": 2.1183431952662723,
      "grad_norm": 0.720078706741333,
      "learning_rate": 0.0001597619047619048,
      "loss": 0.5738,
      "step": 180
    },
    {
      "epoch": 2.1775147928994083,
      "grad_norm": 0.5647716522216797,
      "learning_rate": 0.00015857142857142857,
      "loss": 0.5445,
      "step": 185
    },
    {
      "epoch": 2.2366863905325443,
      "grad_norm": 0.7397224307060242,
      "learning_rate": 0.00015738095238095238,
      "loss": 0.5383,
      "step": 190
    },
    {
      "epoch": 2.2958579881656807,
      "grad_norm": 0.8834079504013062,
      "learning_rate": 0.0001561904761904762,
      "loss": 0.6575,
      "step": 195
    },
    {
      "epoch": 2.3550295857988166,
      "grad_norm": 0.6497870683670044,
      "learning_rate": 0.000155,
      "loss": 0.6677,
      "step": 200
    },
    {
      "epoch": 2.4142011834319526,
      "grad_norm": 0.686392605304718,
      "learning_rate": 0.00015380952380952384,
      "loss": 0.493,
      "step": 205
    },
    {
      "epoch": 2.4733727810650885,
      "grad_norm": 0.719688892364502,
      "learning_rate": 0.00015261904761904762,
      "loss": 0.5356,
      "step": 210
    },
    {
      "epoch": 2.532544378698225,
      "grad_norm": 0.6884217262268066,
      "learning_rate": 0.00015142857142857143,
      "loss": 0.7167,
      "step": 215
    },
    {
      "epoch": 2.591715976331361,
      "grad_norm": 0.7767056822776794,
      "learning_rate": 0.00015023809523809524,
      "loss": 0.7346,
      "step": 220
    },
    {
      "epoch": 2.6508875739644973,
      "grad_norm": 0.6508312225341797,
      "learning_rate": 0.00014904761904761904,
      "loss": 0.547,
      "step": 225
    },
    {
      "epoch": 2.710059171597633,
      "grad_norm": 0.6159693598747253,
      "learning_rate": 0.00014785714285714288,
      "loss": 0.5539,
      "step": 230
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.7028509378433228,
      "learning_rate": 0.00014666666666666666,
      "loss": 0.609,
      "step": 235
    },
    {
      "epoch": 2.828402366863905,
      "grad_norm": 0.6096014976501465,
      "learning_rate": 0.00014547619047619047,
      "loss": 0.5913,
      "step": 240
    },
    {
      "epoch": 2.8875739644970415,
      "grad_norm": 0.8518397212028503,
      "learning_rate": 0.00014428571428571428,
      "loss": 0.6356,
      "step": 245
    },
    {
      "epoch": 2.9467455621301775,
      "grad_norm": 0.6462046504020691,
      "learning_rate": 0.00014309523809523812,
      "loss": 0.527,
      "step": 250
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.8931583762168884,
      "learning_rate": 0.00014190476190476193,
      "loss": 0.6505,
      "step": 255
    },
    {
      "epoch": 3.059171597633136,
      "grad_norm": 0.6089041233062744,
      "learning_rate": 0.00014071428571428573,
      "loss": 0.4721,
      "step": 260
    },
    {
      "epoch": 3.1183431952662723,
      "grad_norm": 0.9845924973487854,
      "learning_rate": 0.00013952380952380952,
      "loss": 0.4478,
      "step": 265
    },
    {
      "epoch": 3.1775147928994083,
      "grad_norm": 0.6962316036224365,
      "learning_rate": 0.00013833333333333333,
      "loss": 0.3862,
      "step": 270
    },
    {
      "epoch": 3.2366863905325443,
      "grad_norm": 0.6963745951652527,
      "learning_rate": 0.00013714285714285716,
      "loss": 0.5446,
      "step": 275
    },
    {
      "epoch": 3.2958579881656807,
      "grad_norm": 0.9289587736129761,
      "learning_rate": 0.00013595238095238097,
      "loss": 0.4985,
      "step": 280
    },
    {
      "epoch": 3.3550295857988166,
      "grad_norm": 0.7913327813148499,
      "learning_rate": 0.00013476190476190478,
      "loss": 0.4291,
      "step": 285
    },
    {
      "epoch": 3.4142011834319526,
      "grad_norm": 0.7623841166496277,
      "learning_rate": 0.00013357142857142856,
      "loss": 0.4198,
      "step": 290
    },
    {
      "epoch": 3.4733727810650885,
      "grad_norm": 1.1334826946258545,
      "learning_rate": 0.00013238095238095237,
      "loss": 0.4442,
      "step": 295
    },
    {
      "epoch": 3.532544378698225,
      "grad_norm": 0.8162091374397278,
      "learning_rate": 0.0001311904761904762,
      "loss": 0.4249,
      "step": 300
    },
    {
      "epoch": 3.591715976331361,
      "grad_norm": 0.7582007646560669,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.4166,
      "step": 305
    },
    {
      "epoch": 3.6508875739644973,
      "grad_norm": 0.8337474465370178,
      "learning_rate": 0.00012880952380952382,
      "loss": 0.3552,
      "step": 310
    },
    {
      "epoch": 3.710059171597633,
      "grad_norm": 0.7497977018356323,
      "learning_rate": 0.0001276190476190476,
      "loss": 0.3778,
      "step": 315
    },
    {
      "epoch": 3.769230769230769,
      "grad_norm": 0.9030293226242065,
      "learning_rate": 0.00012642857142857144,
      "loss": 0.4048,
      "step": 320
    },
    {
      "epoch": 3.828402366863905,
      "grad_norm": 0.8548532128334045,
      "learning_rate": 0.00012523809523809525,
      "loss": 0.5433,
      "step": 325
    },
    {
      "epoch": 3.8875739644970415,
      "grad_norm": 1.1865911483764648,
      "learning_rate": 0.00012404761904761906,
      "loss": 0.4465,
      "step": 330
    },
    {
      "epoch": 3.9467455621301775,
      "grad_norm": 0.6329714059829712,
      "learning_rate": 0.00012285714285714287,
      "loss": 0.499,
      "step": 335
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.8335389494895935,
      "learning_rate": 0.00012166666666666667,
      "loss": 0.3774,
      "step": 340
    },
    {
      "epoch": 4.059171597633136,
      "grad_norm": 0.7739379405975342,
      "learning_rate": 0.00012047619047619047,
      "loss": 0.2472,
      "step": 345
    },
    {
      "epoch": 4.118343195266272,
      "grad_norm": 1.0731953382492065,
      "learning_rate": 0.00011928571428571428,
      "loss": 0.3058,
      "step": 350
    },
    {
      "epoch": 4.177514792899408,
      "grad_norm": 1.051379680633545,
      "learning_rate": 0.0001180952380952381,
      "loss": 0.3446,
      "step": 355
    },
    {
      "epoch": 4.236686390532545,
      "grad_norm": 0.6324198842048645,
      "learning_rate": 0.00011690476190476191,
      "loss": 0.2351,
      "step": 360
    },
    {
      "epoch": 4.295857988165681,
      "grad_norm": 0.9921632409095764,
      "learning_rate": 0.00011571428571428574,
      "loss": 0.2795,
      "step": 365
    },
    {
      "epoch": 4.355029585798817,
      "grad_norm": 0.9360544085502625,
      "learning_rate": 0.00011452380952380952,
      "loss": 0.4056,
      "step": 370
    },
    {
      "epoch": 4.414201183431953,
      "grad_norm": 0.956781268119812,
      "learning_rate": 0.00011333333333333334,
      "loss": 0.3001,
      "step": 375
    },
    {
      "epoch": 4.4733727810650885,
      "grad_norm": 1.0604465007781982,
      "learning_rate": 0.00011214285714285715,
      "loss": 0.3972,
      "step": 380
    },
    {
      "epoch": 4.5325443786982245,
      "grad_norm": 0.8613020181655884,
      "learning_rate": 0.00011095238095238096,
      "loss": 0.2828,
      "step": 385
    },
    {
      "epoch": 4.591715976331361,
      "grad_norm": 0.666599690914154,
      "learning_rate": 0.00010976190476190478,
      "loss": 0.3187,
      "step": 390
    },
    {
      "epoch": 4.650887573964497,
      "grad_norm": 0.7497467398643494,
      "learning_rate": 0.00010857142857142856,
      "loss": 0.4278,
      "step": 395
    },
    {
      "epoch": 4.710059171597633,
      "grad_norm": 0.733259916305542,
      "learning_rate": 0.00010738095238095239,
      "loss": 0.2384,
      "step": 400
    },
    {
      "epoch": 4.769230769230769,
      "grad_norm": 0.7570552229881287,
      "learning_rate": 0.0001061904761904762,
      "loss": 0.3938,
      "step": 405
    },
    {
      "epoch": 4.828402366863905,
      "grad_norm": 0.8109162449836731,
      "learning_rate": 0.000105,
      "loss": 0.2729,
      "step": 410
    },
    {
      "epoch": 4.887573964497041,
      "grad_norm": 0.7985107898712158,
      "learning_rate": 0.00010380952380952383,
      "loss": 0.3805,
      "step": 415
    },
    {
      "epoch": 4.946745562130177,
      "grad_norm": 0.704366147518158,
      "learning_rate": 0.00010261904761904761,
      "loss": 0.2468,
      "step": 420
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.054413080215454,
      "learning_rate": 0.00010142857142857143,
      "loss": 0.2892,
      "step": 425
    },
    {
      "epoch": 5.059171597633136,
      "grad_norm": 0.7603329420089722,
      "learning_rate": 0.00010023809523809524,
      "loss": 0.2865,
      "step": 430
    },
    {
      "epoch": 5.118343195266272,
      "grad_norm": 0.9612646102905273,
      "learning_rate": 9.904761904761905e-05,
      "loss": 0.2215,
      "step": 435
    },
    {
      "epoch": 5.177514792899408,
      "grad_norm": 0.8669071793556213,
      "learning_rate": 9.785714285714286e-05,
      "loss": 0.2078,
      "step": 440
    },
    {
      "epoch": 5.236686390532545,
      "grad_norm": 0.7441051006317139,
      "learning_rate": 9.666666666666667e-05,
      "loss": 0.233,
      "step": 445
    },
    {
      "epoch": 5.295857988165681,
      "grad_norm": 0.5900620818138123,
      "learning_rate": 9.547619047619049e-05,
      "loss": 0.2581,
      "step": 450
    },
    {
      "epoch": 5.355029585798817,
      "grad_norm": 0.990178644657135,
      "learning_rate": 9.428571428571429e-05,
      "loss": 0.2603,
      "step": 455
    },
    {
      "epoch": 5.414201183431953,
      "grad_norm": 0.7644340991973877,
      "learning_rate": 9.309523809523811e-05,
      "loss": 0.2021,
      "step": 460
    },
    {
      "epoch": 5.4733727810650885,
      "grad_norm": 0.5087964534759521,
      "learning_rate": 9.19047619047619e-05,
      "loss": 0.1126,
      "step": 465
    },
    {
      "epoch": 5.5325443786982245,
      "grad_norm": 0.7896738052368164,
      "learning_rate": 9.071428571428571e-05,
      "loss": 0.2084,
      "step": 470
    },
    {
      "epoch": 5.591715976331361,
      "grad_norm": 0.71749347448349,
      "learning_rate": 8.952380952380953e-05,
      "loss": 0.1618,
      "step": 475
    },
    {
      "epoch": 5.650887573964497,
      "grad_norm": 0.8466284871101379,
      "learning_rate": 8.833333333333333e-05,
      "loss": 0.2574,
      "step": 480
    },
    {
      "epoch": 5.710059171597633,
      "grad_norm": 1.0023925304412842,
      "learning_rate": 8.714285714285715e-05,
      "loss": 0.1985,
      "step": 485
    },
    {
      "epoch": 5.769230769230769,
      "grad_norm": 0.8096638321876526,
      "learning_rate": 8.595238095238096e-05,
      "loss": 0.285,
      "step": 490
    },
    {
      "epoch": 5.828402366863905,
      "grad_norm": 0.9154897332191467,
      "learning_rate": 8.476190476190477e-05,
      "loss": 0.2051,
      "step": 495
    },
    {
      "epoch": 5.887573964497041,
      "grad_norm": 0.9506188035011292,
      "learning_rate": 8.357142857142858e-05,
      "loss": 0.2664,
      "step": 500
    },
    {
      "epoch": 5.946745562130177,
      "grad_norm": 0.8935821056365967,
      "learning_rate": 8.238095238095238e-05,
      "loss": 0.1717,
      "step": 505
    },
    {
      "epoch": 6.0,
      "grad_norm": 1.2274423837661743,
      "learning_rate": 8.11904761904762e-05,
      "loss": 0.2529,
      "step": 510
    },
    {
      "epoch": 6.059171597633136,
      "grad_norm": 0.6341638565063477,
      "learning_rate": 8e-05,
      "loss": 0.1795,
      "step": 515
    },
    {
      "epoch": 6.118343195266272,
      "grad_norm": 1.273710012435913,
      "learning_rate": 7.880952380952382e-05,
      "loss": 0.1612,
      "step": 520
    },
    {
      "epoch": 6.177514792899408,
      "grad_norm": 1.065499186515808,
      "learning_rate": 7.761904761904762e-05,
      "loss": 0.1839,
      "step": 525
    },
    {
      "epoch": 6.236686390532545,
      "grad_norm": 0.5382740497589111,
      "learning_rate": 7.642857142857143e-05,
      "loss": 0.1628,
      "step": 530
    },
    {
      "epoch": 6.295857988165681,
      "grad_norm": 0.6181464791297913,
      "learning_rate": 7.523809523809524e-05,
      "loss": 0.141,
      "step": 535
    },
    {
      "epoch": 6.355029585798817,
      "grad_norm": 0.7450206875801086,
      "learning_rate": 7.404761904761905e-05,
      "loss": 0.1453,
      "step": 540
    },
    {
      "epoch": 6.414201183431953,
      "grad_norm": 0.9426142573356628,
      "learning_rate": 7.285714285714286e-05,
      "loss": 0.1403,
      "step": 545
    },
    {
      "epoch": 6.4733727810650885,
      "grad_norm": 0.9675353169441223,
      "learning_rate": 7.166666666666667e-05,
      "loss": 0.1216,
      "step": 550
    },
    {
      "epoch": 6.5325443786982245,
      "grad_norm": 0.5108327269554138,
      "learning_rate": 7.047619047619048e-05,
      "loss": 0.1484,
      "step": 555
    },
    {
      "epoch": 6.591715976331361,
      "grad_norm": 0.6549590229988098,
      "learning_rate": 6.928571428571429e-05,
      "loss": 0.1338,
      "step": 560
    },
    {
      "epoch": 6.650887573964497,
      "grad_norm": 0.843664288520813,
      "learning_rate": 6.80952380952381e-05,
      "loss": 0.1586,
      "step": 565
    },
    {
      "epoch": 6.710059171597633,
      "grad_norm": 0.8650611639022827,
      "learning_rate": 6.69047619047619e-05,
      "loss": 0.1517,
      "step": 570
    },
    {
      "epoch": 6.769230769230769,
      "grad_norm": 0.7471471428871155,
      "learning_rate": 6.571428571428571e-05,
      "loss": 0.1966,
      "step": 575
    },
    {
      "epoch": 6.828402366863905,
      "grad_norm": 0.7219163775444031,
      "learning_rate": 6.452380952380954e-05,
      "loss": 0.1368,
      "step": 580
    },
    {
      "epoch": 6.887573964497041,
      "grad_norm": 0.911191463470459,
      "learning_rate": 6.333333333333333e-05,
      "loss": 0.1681,
      "step": 585
    },
    {
      "epoch": 6.946745562130177,
      "grad_norm": 0.6017201542854309,
      "learning_rate": 6.214285714285714e-05,
      "loss": 0.1675,
      "step": 590
    },
    {
      "epoch": 7.0,
      "grad_norm": 1.0746937990188599,
      "learning_rate": 6.0952380952380964e-05,
      "loss": 0.1944,
      "step": 595
    },
    {
      "epoch": 7.059171597633136,
      "grad_norm": 0.8227368593215942,
      "learning_rate": 5.9761904761904766e-05,
      "loss": 0.076,
      "step": 600
    },
    {
      "epoch": 7.118343195266272,
      "grad_norm": 0.6826525330543518,
      "learning_rate": 5.8571428571428575e-05,
      "loss": 0.1048,
      "step": 605
    },
    {
      "epoch": 7.177514792899408,
      "grad_norm": 1.0038989782333374,
      "learning_rate": 5.738095238095238e-05,
      "loss": 0.0998,
      "step": 610
    },
    {
      "epoch": 7.236686390532545,
      "grad_norm": 0.8537135720252991,
      "learning_rate": 5.619047619047619e-05,
      "loss": 0.1035,
      "step": 615
    },
    {
      "epoch": 7.295857988165681,
      "grad_norm": 0.7067388892173767,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.1186,
      "step": 620
    },
    {
      "epoch": 7.355029585798817,
      "grad_norm": 0.7003813982009888,
      "learning_rate": 5.380952380952381e-05,
      "loss": 0.0803,
      "step": 625
    },
    {
      "epoch": 7.414201183431953,
      "grad_norm": 0.9923582673072815,
      "learning_rate": 5.261904761904763e-05,
      "loss": 0.1121,
      "step": 630
    },
    {
      "epoch": 7.4733727810650885,
      "grad_norm": 0.4991615116596222,
      "learning_rate": 5.142857142857143e-05,
      "loss": 0.1056,
      "step": 635
    },
    {
      "epoch": 7.5325443786982245,
      "grad_norm": 0.7406235337257385,
      "learning_rate": 5.023809523809524e-05,
      "loss": 0.1135,
      "step": 640
    },
    {
      "epoch": 7.591715976331361,
      "grad_norm": 0.5871267318725586,
      "learning_rate": 4.904761904761905e-05,
      "loss": 0.1141,
      "step": 645
    },
    {
      "epoch": 7.650887573964497,
      "grad_norm": 0.5495700836181641,
      "learning_rate": 4.785714285714286e-05,
      "loss": 0.07,
      "step": 650
    },
    {
      "epoch": 7.710059171597633,
      "grad_norm": 0.9295830130577087,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.1293,
      "step": 655
    },
    {
      "epoch": 7.769230769230769,
      "grad_norm": 0.9041563272476196,
      "learning_rate": 4.547619047619048e-05,
      "loss": 0.1034,
      "step": 660
    },
    {
      "epoch": 7.828402366863905,
      "grad_norm": 0.6490697264671326,
      "learning_rate": 4.428571428571428e-05,
      "loss": 0.0898,
      "step": 665
    },
    {
      "epoch": 7.887573964497041,
      "grad_norm": 0.5583420991897583,
      "learning_rate": 4.30952380952381e-05,
      "loss": 0.0891,
      "step": 670
    },
    {
      "epoch": 7.946745562130177,
      "grad_norm": 0.7829737663269043,
      "learning_rate": 4.190476190476191e-05,
      "loss": 0.112,
      "step": 675
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.48943546414375305,
      "learning_rate": 4.0714285714285717e-05,
      "loss": 0.115,
      "step": 680
    },
    {
      "epoch": 8.059171597633137,
      "grad_norm": 0.4989611506462097,
      "learning_rate": 3.9523809523809526e-05,
      "loss": 0.0645,
      "step": 685
    },
    {
      "epoch": 8.118343195266272,
      "grad_norm": 0.5723634362220764,
      "learning_rate": 3.8333333333333334e-05,
      "loss": 0.0603,
      "step": 690
    },
    {
      "epoch": 8.177514792899409,
      "grad_norm": 0.5361748933792114,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 0.0626,
      "step": 695
    },
    {
      "epoch": 8.236686390532544,
      "grad_norm": 0.8497764468193054,
      "learning_rate": 3.595238095238095e-05,
      "loss": 0.0856,
      "step": 700
    },
    {
      "epoch": 8.29585798816568,
      "grad_norm": 0.48423126339912415,
      "learning_rate": 3.476190476190476e-05,
      "loss": 0.0568,
      "step": 705
    },
    {
      "epoch": 8.355029585798816,
      "grad_norm": 0.30722182989120483,
      "learning_rate": 3.357142857142857e-05,
      "loss": 0.0564,
      "step": 710
    },
    {
      "epoch": 8.414201183431953,
      "grad_norm": 0.637298047542572,
      "learning_rate": 3.2380952380952386e-05,
      "loss": 0.0792,
      "step": 715
    },
    {
      "epoch": 8.47337278106509,
      "grad_norm": 1.1620301008224487,
      "learning_rate": 3.1190476190476195e-05,
      "loss": 0.0848,
      "step": 720
    },
    {
      "epoch": 8.532544378698224,
      "grad_norm": 0.9842550158500671,
      "learning_rate": 3e-05,
      "loss": 0.0695,
      "step": 725
    },
    {
      "epoch": 8.591715976331361,
      "grad_norm": 0.5429280996322632,
      "learning_rate": 2.880952380952381e-05,
      "loss": 0.0691,
      "step": 730
    },
    {
      "epoch": 8.650887573964496,
      "grad_norm": 0.4466063380241394,
      "learning_rate": 2.7619047619047622e-05,
      "loss": 0.0427,
      "step": 735
    },
    {
      "epoch": 8.710059171597633,
      "grad_norm": 0.7048435807228088,
      "learning_rate": 2.642857142857143e-05,
      "loss": 0.066,
      "step": 740
    },
    {
      "epoch": 8.76923076923077,
      "grad_norm": 0.71544349193573,
      "learning_rate": 2.523809523809524e-05,
      "loss": 0.048,
      "step": 745
    },
    {
      "epoch": 8.828402366863905,
      "grad_norm": 0.27939409017562866,
      "learning_rate": 2.404761904761905e-05,
      "loss": 0.047,
      "step": 750
    },
    {
      "epoch": 8.887573964497042,
      "grad_norm": 0.30130356550216675,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 0.0715,
      "step": 755
    },
    {
      "epoch": 8.946745562130177,
      "grad_norm": 0.6010681986808777,
      "learning_rate": 2.1666666666666667e-05,
      "loss": 0.0478,
      "step": 760
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.5269991159439087,
      "learning_rate": 2.0476190476190476e-05,
      "loss": 0.048,
      "step": 765
    },
    {
      "epoch": 9.059171597633137,
      "grad_norm": 0.30882084369659424,
      "learning_rate": 1.928571428571429e-05,
      "loss": 0.0508,
      "step": 770
    },
    {
      "epoch": 9.118343195266272,
      "grad_norm": 0.3261071741580963,
      "learning_rate": 1.8095238095238094e-05,
      "loss": 0.0439,
      "step": 775
    },
    {
      "epoch": 9.177514792899409,
      "grad_norm": 0.33494994044303894,
      "learning_rate": 1.6904761904761906e-05,
      "loss": 0.0394,
      "step": 780
    },
    {
      "epoch": 9.236686390532544,
      "grad_norm": 0.470735639333725,
      "learning_rate": 1.5714285714285715e-05,
      "loss": 0.0465,
      "step": 785
    },
    {
      "epoch": 9.29585798816568,
      "grad_norm": 0.4816909432411194,
      "learning_rate": 1.4523809523809526e-05,
      "loss": 0.0461,
      "step": 790
    },
    {
      "epoch": 9.355029585798816,
      "grad_norm": 0.332380086183548,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.0438,
      "step": 795
    },
    {
      "epoch": 9.414201183431953,
      "grad_norm": 1.0364826917648315,
      "learning_rate": 1.2142857142857144e-05,
      "loss": 0.0575,
      "step": 800
    },
    {
      "epoch": 9.47337278106509,
      "grad_norm": 0.31686267256736755,
      "learning_rate": 1.0952380952380953e-05,
      "loss": 0.0376,
      "step": 805
    },
    {
      "epoch": 9.532544378698224,
      "grad_norm": 0.19316697120666504,
      "learning_rate": 9.761904761904762e-06,
      "loss": 0.0546,
      "step": 810
    },
    {
      "epoch": 9.591715976331361,
      "grad_norm": 0.3316756784915924,
      "learning_rate": 8.571428571428573e-06,
      "loss": 0.0303,
      "step": 815
    },
    {
      "epoch": 9.650887573964496,
      "grad_norm": 0.38312825560569763,
      "learning_rate": 7.380952380952382e-06,
      "loss": 0.0407,
      "step": 820
    },
    {
      "epoch": 9.710059171597633,
      "grad_norm": 0.1672104001045227,
      "learning_rate": 6.190476190476191e-06,
      "loss": 0.0406,
      "step": 825
    },
    {
      "epoch": 9.76923076923077,
      "grad_norm": 0.30679717659950256,
      "learning_rate": 5e-06,
      "loss": 0.0316,
      "step": 830
    },
    {
      "epoch": 9.828402366863905,
      "grad_norm": 0.3245919346809387,
      "learning_rate": 3.8095238095238102e-06,
      "loss": 0.0361,
      "step": 835
    },
    {
      "epoch": 9.887573964497042,
      "grad_norm": 0.25631895661354065,
      "learning_rate": 2.6190476190476192e-06,
      "loss": 0.0351,
      "step": 840
    },
    {
      "epoch": 9.946745562130177,
      "grad_norm": 0.2532467842102051,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.0397,
      "step": 845
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.7708420157432556,
      "learning_rate": 2.3809523809523814e-07,
      "loss": 0.0416,
      "step": 850
    },
    {
      "epoch": 10.0,
      "step": 850,
      "total_flos": 5.971841440128e+16,
      "train_loss": 0.3865027742876726,
      "train_runtime": 1732.5271,
      "train_samples_per_second": 3.902,
      "train_steps_per_second": 0.491
    }
  ],
  "logging_steps": 5,
  "max_steps": 850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.971841440128e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}