{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3876,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015479876160990712,
      "grad_norm": 1.2312519550323486,
      "learning_rate": 1.9587628865979382e-05,
      "loss": 35.56,
      "step": 20
    },
    {
      "epoch": 0.030959752321981424,
      "grad_norm": 3.618231773376465,
      "learning_rate": 4.020618556701031e-05,
      "loss": 35.4506,
      "step": 40
    },
    {
      "epoch": 0.04643962848297214,
      "grad_norm": 7.444742679595947,
      "learning_rate": 6.0824742268041234e-05,
      "loss": 33.9665,
      "step": 60
    },
    {
      "epoch": 0.06191950464396285,
      "grad_norm": 10.143885612487793,
      "learning_rate": 8.144329896907217e-05,
      "loss": 30.007,
      "step": 80
    },
    {
      "epoch": 0.07739938080495357,
      "grad_norm": 8.995733261108398,
      "learning_rate": 0.00010206185567010309,
      "loss": 22.9033,
      "step": 100
    },
    {
      "epoch": 0.09287925696594428,
      "grad_norm": 6.135839939117432,
      "learning_rate": 0.00012268041237113402,
      "loss": 14.5223,
      "step": 120
    },
    {
      "epoch": 0.10835913312693499,
      "grad_norm": 1.3609894514083862,
      "learning_rate": 0.00014329896907216494,
      "loss": 8.502,
      "step": 140
    },
    {
      "epoch": 0.1238390092879257,
      "grad_norm": 0.6270231604576111,
      "learning_rate": 0.0001639175257731959,
      "loss": 6.1607,
      "step": 160
    },
    {
      "epoch": 0.1393188854489164,
      "grad_norm": 1.1116787195205688,
      "learning_rate": 0.0001845360824742268,
      "loss": 5.2469,
      "step": 180
    },
    {
      "epoch": 0.15479876160990713,
      "grad_norm": 19.375608444213867,
      "learning_rate": 0.00019999908999970863,
      "loss": 4.6304,
      "step": 200
    },
    {
      "epoch": 0.17027863777089783,
      "grad_norm": 2.622767925262451,
      "learning_rate": 0.0001999772508208056,
      "loss": 4.0212,
      "step": 220
    },
    {
      "epoch": 0.18575851393188855,
      "grad_norm": 1.9569060802459717,
      "learning_rate": 0.00019992629891946655,
      "loss": 3.5883,
      "step": 240
    },
    {
      "epoch": 0.20123839009287925,
      "grad_norm": 2.5778486728668213,
      "learning_rate": 0.00019984624913255234,
      "loss": 3.2739,
      "step": 260
    },
    {
      "epoch": 0.21671826625386997,
      "grad_norm": 3.097757339477539,
      "learning_rate": 0.00019973712477003812,
      "loss": 3.2694,
      "step": 280
    },
    {
      "epoch": 0.23219814241486067,
      "grad_norm": 4.823343753814697,
      "learning_rate": 0.00019959895760822546,
      "loss": 2.9338,
      "step": 300
    },
    {
      "epoch": 0.2476780185758514,
      "grad_norm": 4.312209606170654,
      "learning_rate": 0.00019943178788048947,
      "loss": 2.8247,
      "step": 320
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 4.690278053283691,
      "learning_rate": 0.00019923566426556296,
      "loss": 2.7036,
      "step": 340
    },
    {
      "epoch": 0.2786377708978328,
      "grad_norm": 3.4156758785247803,
      "learning_rate": 0.00019901064387336164,
      "loss": 2.6405,
      "step": 360
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 15.009350776672363,
      "learning_rate": 0.00019875679222835398,
      "loss": 2.7523,
      "step": 380
    },
    {
      "epoch": 0.30959752321981426,
      "grad_norm": 3.018602132797241,
      "learning_rate": 0.000198474183250481,
      "loss": 2.5967,
      "step": 400
    },
    {
      "epoch": 0.32507739938080493,
      "grad_norm": 61.5975227355957,
      "learning_rate": 0.00019816289923363115,
      "loss": 2.4831,
      "step": 420
    },
    {
      "epoch": 0.34055727554179566,
      "grad_norm": 5.313195705413818,
      "learning_rate": 0.00019782303082167704,
      "loss": 2.5759,
      "step": 440
    },
    {
      "epoch": 0.3560371517027864,
      "grad_norm": 7.81691312789917,
      "learning_rate": 0.0001974546769820803,
      "loss": 2.4488,
      "step": 460
    },
    {
      "epoch": 0.3715170278637771,
      "grad_norm": 2.6861789226531982,
      "learning_rate": 0.00019705794497707312,
      "loss": 2.5337,
      "step": 480
    },
    {
      "epoch": 0.38699690402476783,
      "grad_norm": 2.6320855617523193,
      "learning_rate": 0.00019663295033242416,
      "loss": 2.4594,
      "step": 500
    },
    {
      "epoch": 0.4024767801857585,
      "grad_norm": 4.0807294845581055,
      "learning_rate": 0.00019617981680379804,
      "loss": 2.5519,
      "step": 520
    },
    {
      "epoch": 0.4179566563467492,
      "grad_norm": 1.9054477214813232,
      "learning_rate": 0.00019569867634071866,
      "loss": 2.5392,
      "step": 540
    },
    {
      "epoch": 0.43343653250773995,
      "grad_norm": 3.3869524002075195,
      "learning_rate": 0.00019518966904814625,
      "loss": 2.4402,
      "step": 560
    },
    {
      "epoch": 0.44891640866873067,
      "grad_norm": 3.94587779045105,
      "learning_rate": 0.00019465294314567987,
      "loss": 2.3743,
      "step": 580
    },
    {
      "epoch": 0.46439628482972134,
      "grad_norm": 1.9025462865829468,
      "learning_rate": 0.00019408865492439667,
      "loss": 2.4514,
      "step": 600
    },
    {
      "epoch": 0.47987616099071206,
      "grad_norm": 2.714090585708618,
      "learning_rate": 0.00019349696870134104,
      "loss": 2.4145,
      "step": 620
    },
    {
      "epoch": 0.4953560371517028,
      "grad_norm": 1.989125370979309,
      "learning_rate": 0.0001928780567716765,
      "loss": 2.4118,
      "step": 640
    },
    {
      "epoch": 0.5108359133126935,
      "grad_norm": 1.9810141324996948,
      "learning_rate": 0.00019223209935851455,
      "loss": 2.3477,
      "step": 660
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 2.902862310409546,
      "learning_rate": 0.0001915592845604348,
      "loss": 2.3803,
      "step": 680
    },
    {
      "epoch": 0.541795665634675,
      "grad_norm": 4.101681232452393,
      "learning_rate": 0.00019085980829671202,
      "loss": 2.2595,
      "step": 700
    },
    {
      "epoch": 0.5572755417956656,
      "grad_norm": 3.3641412258148193,
      "learning_rate": 0.0001901338742502655,
      "loss": 2.1746,
      "step": 720
    },
    {
      "epoch": 0.5727554179566563,
      "grad_norm": 4.526272296905518,
      "learning_rate": 0.0001893816938083481,
      "loss": 2.2116,
      "step": 740
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 2.6758034229278564,
      "learning_rate": 0.00018860348600099167,
      "loss": 2.2205,
      "step": 760
    },
    {
      "epoch": 0.6037151702786377,
      "grad_norm": 2.3024699687957764,
      "learning_rate": 0.00018779947743722685,
      "loss": 2.2901,
      "step": 780
    },
    {
      "epoch": 0.6191950464396285,
      "grad_norm": 3.218477487564087,
      "learning_rate": 0.00018696990223909595,
      "loss": 2.3473,
      "step": 800
    },
    {
      "epoch": 0.6346749226006192,
      "grad_norm": 2.0516302585601807,
      "learning_rate": 0.00018611500197347836,
      "loss": 2.3375,
      "step": 820
    },
    {
      "epoch": 0.6501547987616099,
      "grad_norm": 2.359009265899658,
      "learning_rate": 0.0001852350255817476,
      "loss": 2.2232,
      "step": 840
    },
    {
      "epoch": 0.6656346749226006,
      "grad_norm": 2.1015849113464355,
      "learning_rate": 0.00018433022930728133,
      "loss": 2.0797,
      "step": 860
    },
    {
      "epoch": 0.6811145510835913,
      "grad_norm": 2.1213488578796387,
      "learning_rate": 0.000183400876620845,
      "loss": 2.0405,
      "step": 880
    },
    {
      "epoch": 0.6965944272445821,
      "grad_norm": 3.6181552410125732,
      "learning_rate": 0.00018244723814387083,
      "loss": 2.283,
      "step": 900
    },
    {
      "epoch": 0.7120743034055728,
      "grad_norm": 7.133749961853027,
      "learning_rate": 0.0001814695915696546,
      "loss": 2.2591,
      "step": 920
    },
    {
      "epoch": 0.7275541795665634,
      "grad_norm": 5.1819987297058105,
      "learning_rate": 0.00018046822158249325,
      "loss": 2.2727,
      "step": 940
    },
    {
      "epoch": 0.7430340557275542,
      "grad_norm": 1.5060796737670898,
      "learning_rate": 0.00017944341977478654,
      "loss": 2.1029,
      "step": 960
    },
    {
      "epoch": 0.7585139318885449,
      "grad_norm": 1.9794375896453857,
      "learning_rate": 0.00017839548456212735,
      "loss": 2.126,
      "step": 980
    },
    {
      "epoch": 0.7739938080495357,
      "grad_norm": 5.905936241149902,
      "learning_rate": 0.00017732472109640503,
      "loss": 2.0345,
      "step": 1000
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 1.8466005325317383,
      "learning_rate": 0.00017623144117694708,
      "loss": 2.0384,
      "step": 1020
    },
    {
      "epoch": 0.804953560371517,
      "grad_norm": 1.9260845184326172,
      "learning_rate": 0.00017511596315972525,
      "loss": 2.0789,
      "step": 1040
    },
    {
      "epoch": 0.8204334365325078,
      "grad_norm": 39.37140655517578,
      "learning_rate": 0.00017397861186465243,
      "loss": 2.1383,
      "step": 1060
    },
    {
      "epoch": 0.8359133126934984,
      "grad_norm": 2.01326322555542,
      "learning_rate": 0.00017281971848099708,
      "loss": 2.0823,
      "step": 1080
    },
    {
      "epoch": 0.8513931888544891,
      "grad_norm": 6.53413724899292,
      "learning_rate": 0.00017163962047094328,
      "loss": 2.0473,
      "step": 1100
    },
    {
      "epoch": 0.8668730650154799,
      "grad_norm": 2.0091307163238525,
      "learning_rate": 0.0001704386614713236,
      "loss": 2.0851,
      "step": 1120
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 2.5082335472106934,
      "learning_rate": 0.00016921719119355468,
      "loss": 1.9664,
      "step": 1140
    },
    {
      "epoch": 0.8978328173374613,
      "grad_norm": 2.0674970149993896,
      "learning_rate": 0.0001679755653218034,
      "loss": 2.053,
      "step": 1160
    },
    {
      "epoch": 0.913312693498452,
      "grad_norm": 1.7843290567398071,
      "learning_rate": 0.0001667141454094139,
      "loss": 2.0598,
      "step": 1180
    },
    {
      "epoch": 0.9287925696594427,
      "grad_norm": 3.198160171508789,
      "learning_rate": 0.00016543329877362567,
      "loss": 2.0459,
      "step": 1200
    },
    {
      "epoch": 0.9442724458204335,
      "grad_norm": 2.996542453765869,
      "learning_rate": 0.0001641333983886132,
      "loss": 1.9699,
      "step": 1220
    },
    {
      "epoch": 0.9597523219814241,
      "grad_norm": 1.5705294609069824,
      "learning_rate": 0.00016281482277687826,
      "loss": 1.9142,
      "step": 1240
    },
    {
      "epoch": 0.9752321981424149,
      "grad_norm": 2.064488172531128,
      "learning_rate": 0.00016147795589902675,
      "loss": 2.0485,
      "step": 1260
    },
    {
      "epoch": 0.9907120743034056,
      "grad_norm": 3.425995111465454,
      "learning_rate": 0.00016012318704196164,
      "loss": 2.0223,
      "step": 1280
    },
    {
      "epoch": 1.0061919504643964,
      "grad_norm": 3.16005539894104,
      "learning_rate": 0.0001587509107055255,
      "loss": 2.1468,
      "step": 1300
    },
    {
      "epoch": 1.021671826625387,
      "grad_norm": 2.574629545211792,
      "learning_rate": 0.00015736152648762434,
      "loss": 1.9561,
      "step": 1320
    },
    {
      "epoch": 1.0371517027863777,
      "grad_norm": 2.4031307697296143,
      "learning_rate": 0.00015595543896786777,
      "loss": 2.0086,
      "step": 1340
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 2.7825675010681152,
      "learning_rate": 0.00015453305758975758,
      "loss": 1.9428,
      "step": 1360
    },
    {
      "epoch": 1.068111455108359,
      "grad_norm": 2.7439022064208984,
      "learning_rate": 0.0001530947965414608,
      "loss": 1.9599,
      "step": 1380
    },
    {
      "epoch": 1.08359133126935,
      "grad_norm": 3.3258135318756104,
      "learning_rate": 0.0001516410746352006,
      "loss": 1.9358,
      "step": 1400
    },
    {
      "epoch": 1.0990712074303406,
      "grad_norm": 2.5647904872894287,
      "learning_rate": 0.00015017231518530118,
      "loss": 1.9641,
      "step": 1420
    },
    {
      "epoch": 1.1145510835913313,
      "grad_norm": 2.749521017074585,
      "learning_rate": 0.00014868894588492104,
      "loss": 2.0731,
      "step": 1440
    },
    {
      "epoch": 1.130030959752322,
      "grad_norm": 4.515990257263184,
      "learning_rate": 0.00014719139868151184,
      "loss": 2.0112,
      "step": 1460
    },
    {
      "epoch": 1.1455108359133126,
      "grad_norm": 2.183593273162842,
      "learning_rate": 0.00014568010965103795,
      "loss": 1.8446,
      "step": 1480
    },
    {
      "epoch": 1.1609907120743035,
      "grad_norm": 6.230432033538818,
      "learning_rate": 0.00014415551887099405,
      "loss": 1.8628,
      "step": 1500
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 2.1607792377471924,
      "learning_rate": 0.0001426180702922574,
      "loss": 1.9779,
      "step": 1520
    },
    {
      "epoch": 1.1919504643962848,
      "grad_norm": 2.69480299949646,
      "learning_rate": 0.00014106821160981222,
      "loss": 2.0428,
      "step": 1540
    },
    {
      "epoch": 1.2074303405572755,
      "grad_norm": 2.5969278812408447,
      "learning_rate": 0.00013950639413238394,
      "loss": 1.9777,
      "step": 1560
    },
    {
      "epoch": 1.2229102167182662,
      "grad_norm": 2.709635019302368,
      "learning_rate": 0.00013793307265102096,
      "loss": 1.8938,
      "step": 1580
    },
    {
      "epoch": 1.238390092879257,
      "grad_norm": 2.4569203853607178,
      "learning_rate": 0.00013634870530666247,
      "loss": 1.7336,
      "step": 1600
    },
    {
      "epoch": 1.2538699690402477,
      "grad_norm": 2.850924253463745,
      "learning_rate": 0.00013475375345673083,
      "loss": 1.8305,
      "step": 1620
    },
    {
      "epoch": 1.2693498452012384,
      "grad_norm": 3.0461461544036865,
      "learning_rate": 0.00013314868154078725,
      "loss": 1.8139,
      "step": 1640
    },
    {
      "epoch": 1.284829721362229,
      "grad_norm": 2.8259775638580322,
      "learning_rate": 0.00013153395694529016,
      "loss": 1.8382,
      "step": 1660
    },
    {
      "epoch": 1.3003095975232197,
      "grad_norm": 2.21964693069458,
      "learning_rate": 0.00012991004986749515,
      "loss": 1.7279,
      "step": 1680
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 3.952878713607788,
      "learning_rate": 0.00012827743317853665,
      "loss": 1.8069,
      "step": 1700
    },
    {
      "epoch": 1.3312693498452013,
      "grad_norm": 2.4886014461517334,
      "learning_rate": 0.00012663658228573112,
      "loss": 1.8024,
      "step": 1720
    },
    {
      "epoch": 1.346749226006192,
      "grad_norm": 2.1773338317871094,
      "learning_rate": 0.0001249879749941412,
      "loss": 1.8586,
      "step": 1740
    },
    {
      "epoch": 1.3622291021671826,
      "grad_norm": 2.7643444538116455,
      "learning_rate": 0.00012333209136744237,
      "loss": 1.7066,
      "step": 1760
    },
    {
      "epoch": 1.3777089783281733,
      "grad_norm": 2.801255702972412,
      "learning_rate": 0.00012166941358813125,
      "loss": 1.8759,
      "step": 1780
    },
    {
      "epoch": 1.3931888544891642,
      "grad_norm": 3.2398030757904053,
      "learning_rate": 0.00012000042581711737,
      "loss": 1.8216,
      "step": 1800
    },
    {
      "epoch": 1.4086687306501549,
      "grad_norm": 2.008368492126465,
      "learning_rate": 0.00011832561405273867,
      "loss": 1.634,
      "step": 1820
    },
    {
      "epoch": 1.4241486068111455,
      "grad_norm": 2.6406748294830322,
      "learning_rate": 0.00011664546598924184,
      "loss": 1.8042,
      "step": 1840
    },
    {
      "epoch": 1.4396284829721362,
      "grad_norm": 1.6120656728744507,
      "learning_rate": 0.00011496047087476906,
      "loss": 1.7964,
      "step": 1860
    },
    {
      "epoch": 1.4551083591331269,
      "grad_norm": 3.240208864212036,
      "learning_rate": 0.00011327111936889212,
      "loss": 1.7457,
      "step": 1880
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 2.6874380111694336,
      "learning_rate": 0.00011157790339973546,
      "loss": 1.7428,
      "step": 1900
    },
    {
      "epoch": 1.4860681114551084,
      "grad_norm": 3.631455421447754,
      "learning_rate": 0.00010988131602073008,
      "loss": 1.8673,
      "step": 1920
    },
    {
      "epoch": 1.501547987616099,
      "grad_norm": 5.774441242218018,
      "learning_rate": 0.00010818185126703943,
      "loss": 1.7561,
      "step": 1940
    },
    {
      "epoch": 1.5170278637770898,
      "grad_norm": 2.388491153717041,
      "learning_rate": 0.0001064800040116997,
      "loss": 1.7853,
      "step": 1960
    },
    {
      "epoch": 1.5325077399380804,
      "grad_norm": 3.2117481231689453,
      "learning_rate": 0.00010477626982151603,
      "loss": 1.8202,
      "step": 1980
    },
    {
      "epoch": 1.5479876160990713,
      "grad_norm": 4.212379455566406,
      "learning_rate": 0.0001030711448127566,
      "loss": 1.7177,
      "step": 2000
    },
    {
      "epoch": 1.5634674922600618,
      "grad_norm": 1.7437323331832886,
      "learning_rate": 0.00010136512550668693,
      "loss": 1.5671,
      "step": 2020
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 5.586548805236816,
      "learning_rate": 9.965870868498605e-05,
      "loss": 1.7472,
      "step": 2040
    },
    {
      "epoch": 1.5944272445820433,
      "grad_norm": 4.029956340789795,
      "learning_rate": 9.795239124508695e-05,
      "loss": 1.8359,
      "step": 2060
    },
    {
      "epoch": 1.609907120743034,
      "grad_norm": 3.604818105697632,
      "learning_rate": 9.62466700554833e-05,
      "loss": 1.7598,
      "step": 2080
    },
    {
      "epoch": 1.6253869969040249,
      "grad_norm": 2.304302215576172,
      "learning_rate": 9.454204181104455e-05,
      "loss": 1.747,
      "step": 2100
    },
    {
      "epoch": 1.6408668730650153,
      "grad_norm": 3.2332301139831543,
      "learning_rate": 9.28390028883817e-05,
      "loss": 1.7388,
      "step": 2120
    },
    {
      "epoch": 1.6563467492260062,
      "grad_norm": 4.4512763023376465,
      "learning_rate": 9.113804920130558e-05,
      "loss": 1.6419,
      "step": 2140
    },
    {
      "epoch": 1.671826625386997,
      "grad_norm": 4.963044166564941,
      "learning_rate": 8.943967605642006e-05,
      "loss": 1.6978,
      "step": 2160
    },
    {
      "epoch": 1.6873065015479876,
      "grad_norm": 24.24378776550293,
      "learning_rate": 8.774437800889198e-05,
      "loss": 1.7875,
      "step": 2180
    },
    {
      "epoch": 1.7027863777089784,
      "grad_norm": 2.131967306137085,
      "learning_rate": 8.605264871843994e-05,
      "loss": 1.8406,
      "step": 2200
    },
    {
      "epoch": 1.718266253869969,
      "grad_norm": 2.9940407276153564,
      "learning_rate": 8.436498080558373e-05,
      "loss": 1.7843,
      "step": 2220
    },
    {
      "epoch": 1.7337461300309598,
      "grad_norm": 2.6848549842834473,
      "learning_rate": 8.268186570819657e-05,
      "loss": 1.7385,
      "step": 2240
    },
    {
      "epoch": 1.7492260061919505,
      "grad_norm": 2.7113239765167236,
      "learning_rate": 8.10037935384015e-05,
      "loss": 1.6555,
      "step": 2260
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 2.9977896213531494,
      "learning_rate": 7.933125293985404e-05,
      "loss": 1.7137,
      "step": 2280
    },
    {
      "epoch": 1.780185758513932,
      "grad_norm": 2.588730573654175,
      "learning_rate": 7.766473094545223e-05,
      "loss": 1.6741,
      "step": 2300
    },
    {
      "epoch": 1.7956656346749225,
      "grad_norm": 2.706368923187256,
      "learning_rate": 7.600471283551596e-05,
      "loss": 1.6268,
      "step": 2320
    },
    {
      "epoch": 1.8111455108359134,
      "grad_norm": 12.232383728027344,
      "learning_rate": 7.435168199647638e-05,
      "loss": 1.6722,
      "step": 2340
    },
    {
      "epoch": 1.826625386996904,
      "grad_norm": 2.679602861404419,
      "learning_rate": 7.270611978011702e-05,
      "loss": 1.6203,
      "step": 2360
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 2.306474208831787,
      "learning_rate": 7.10685053634073e-05,
      "loss": 1.6872,
      "step": 2380
    },
    {
      "epoch": 1.8575851393188856,
      "grad_norm": 1.9884538650512695,
      "learning_rate": 6.943931560896921e-05,
      "loss": 1.6097,
      "step": 2400
    },
    {
      "epoch": 1.873065015479876,
      "grad_norm": 2.9608068466186523,
      "learning_rate": 6.781902492621822e-05,
      "loss": 1.6861,
      "step": 2420
    },
    {
      "epoch": 1.888544891640867,
      "grad_norm": 2.4801182746887207,
      "learning_rate": 6.620810513321816e-05,
      "loss": 1.5144,
      "step": 2440
    },
    {
      "epoch": 1.9040247678018576,
      "grad_norm": 2.943125009536743,
      "learning_rate": 6.460702531929099e-05,
      "loss": 1.5874,
      "step": 2460
    },
    {
      "epoch": 1.9195046439628483,
      "grad_norm": 2.4978630542755127,
      "learning_rate": 6.30162517084211e-05,
      "loss": 1.6379,
      "step": 2480
    },
    {
      "epoch": 1.9349845201238391,
      "grad_norm": 2.3578176498413086,
      "learning_rate": 6.143624752349373e-05,
      "loss": 1.5288,
      "step": 2500
    },
    {
      "epoch": 1.9504643962848296,
      "grad_norm": 2.534557819366455,
      "learning_rate": 5.986747285140779e-05,
      "loss": 1.5616,
      "step": 2520
    },
    {
      "epoch": 1.9659442724458205,
      "grad_norm": 2.966055154800415,
      "learning_rate": 5.83103845091013e-05,
      "loss": 1.5537,
      "step": 2540
    },
    {
      "epoch": 1.9814241486068112,
      "grad_norm": 2.5695040225982666,
      "learning_rate": 5.676543591052934e-05,
      "loss": 1.6248,
      "step": 2560
    },
    {
      "epoch": 1.9969040247678018,
      "grad_norm": 2.0038466453552246,
      "learning_rate": 5.523307693463303e-05,
      "loss": 1.5773,
      "step": 2580
    },
    {
      "epoch": 2.0123839009287927,
      "grad_norm": 2.372335195541382,
      "learning_rate": 5.3713753794337454e-05,
      "loss": 1.4961,
      "step": 2600
    },
    {
      "epoch": 2.027863777089783,
      "grad_norm": 2.7436954975128174,
      "learning_rate": 5.2207908906617596e-05,
      "loss": 1.5137,
      "step": 2620
    },
    {
      "epoch": 2.043343653250774,
      "grad_norm": 1.831878423690796,
      "learning_rate": 5.0715980763669346e-05,
      "loss": 1.5333,
      "step": 2640
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 2.895763635635376,
      "learning_rate": 4.923840380522341e-05,
      "loss": 1.59,
      "step": 2660
    },
    {
      "epoch": 2.0743034055727554,
      "grad_norm": 2.4093143939971924,
      "learning_rate": 4.777560829203918e-05,
      "loss": 1.4739,
      "step": 2680
    },
    {
      "epoch": 2.0897832817337463,
      "grad_norm": 2.4006857872009277,
      "learning_rate": 4.632802018061588e-05,
      "loss": 1.4223,
      "step": 2700
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 2.8316566944122314,
      "learning_rate": 4.4896060999156584e-05,
      "loss": 1.5238,
      "step": 2720
    },
    {
      "epoch": 2.1207430340557276,
      "grad_norm": 2.2058160305023193,
      "learning_rate": 4.348014772482212e-05,
      "loss": 1.5024,
      "step": 2740
    },
    {
      "epoch": 2.136222910216718,
      "grad_norm": 3.6439034938812256,
      "learning_rate": 4.208069266230983e-05,
      "loss": 1.6302,
      "step": 2760
    },
    {
      "epoch": 2.151702786377709,
      "grad_norm": 2.221247911453247,
      "learning_rate": 4.069810332379343e-05,
      "loss": 1.4838,
      "step": 2780
    },
    {
      "epoch": 2.1671826625387,
      "grad_norm": 2.868905544281006,
      "learning_rate": 3.933278231025784e-05,
      "loss": 1.3948,
      "step": 2800
    },
    {
      "epoch": 2.1826625386996903,
      "grad_norm": 3.4799671173095703,
      "learning_rate": 3.7985127194264645e-05,
      "loss": 1.4766,
      "step": 2820
    },
    {
      "epoch": 2.198142414860681,
      "grad_norm": 2.0027518272399902,
      "learning_rate": 3.665553040418132e-05,
      "loss": 1.477,
      "step": 2840
    },
    {
      "epoch": 2.2136222910216716,
      "grad_norm": 2.957260847091675,
      "learning_rate": 3.534437910990891e-05,
      "loss": 1.5907,
      "step": 2860
    },
    {
      "epoch": 2.2291021671826625,
      "grad_norm": 2.046623468399048,
      "learning_rate": 3.4052055110140455e-05,
      "loss": 1.5678,
      "step": 2880
    },
    {
      "epoch": 2.2445820433436534,
      "grad_norm": 2.445232391357422,
      "learning_rate": 3.277893472118392e-05,
      "loss": 1.5119,
      "step": 2900
    },
    {
      "epoch": 2.260061919504644,
      "grad_norm": 3.0307981967926025,
      "learning_rate": 3.152538866738108e-05,
      "loss": 1.418,
      "step": 2920
    },
    {
      "epoch": 2.2755417956656347,
      "grad_norm": 2.7826693058013916,
      "learning_rate": 3.029178197315533e-05,
      "loss": 1.561,
      "step": 2940
    },
    {
      "epoch": 2.291021671826625,
      "grad_norm": 3.2509384155273438,
      "learning_rate": 2.9078473856718636e-05,
      "loss": 1.639,
      "step": 2960
    },
    {
      "epoch": 2.306501547987616,
      "grad_norm": 1.799000859260559,
      "learning_rate": 2.7885817625469813e-05,
      "loss": 1.6101,
      "step": 2980
    },
    {
      "epoch": 2.321981424148607,
      "grad_norm": 2.011575222015381,
      "learning_rate": 2.67141605731135e-05,
      "loss": 1.4611,
      "step": 3000
    },
    {
      "epoch": 2.3374613003095974,
      "grad_norm": 2.495401382446289,
      "learning_rate": 2.5563843878530713e-05,
      "loss": 1.5235,
      "step": 3020
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 2.8658440113067627,
      "learning_rate": 2.4435202506429522e-05,
      "loss": 1.4874,
      "step": 3040
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 2.25388240814209,
      "learning_rate": 2.332856510980582e-05,
      "loss": 1.5087,
      "step": 3060
    },
    {
      "epoch": 2.3839009287925697,
      "grad_norm": 1.8696151971817017,
      "learning_rate": 2.224425393424142e-05,
      "loss": 1.4936,
      "step": 3080
    },
    {
      "epoch": 2.3993808049535605,
      "grad_norm": 1.8870784044265747,
      "learning_rate": 2.118258472406851e-05,
      "loss": 1.5062,
      "step": 3100
    },
    {
      "epoch": 2.414860681114551,
      "grad_norm": 2.0219244956970215,
      "learning_rate": 2.0143866630426733e-05,
      "loss": 1.5437,
      "step": 3120
    },
    {
      "epoch": 2.430340557275542,
      "grad_norm": 3.397005558013916,
      "learning_rate": 1.9128402121240586e-05,
      "loss": 1.4137,
      "step": 3140
    },
    {
      "epoch": 2.4458204334365323,
      "grad_norm": 6.5130133628845215,
      "learning_rate": 1.8136486893142592e-05,
      "loss": 1.4625,
      "step": 3160
    },
    {
      "epoch": 2.461300309597523,
      "grad_norm": 2.9240591526031494,
      "learning_rate": 1.7168409785368513e-05,
      "loss": 1.4246,
      "step": 3180
    },
    {
      "epoch": 2.476780185758514,
      "grad_norm": 2.079099655151367,
      "learning_rate": 1.622445269564905e-05,
      "loss": 1.4998,
      "step": 3200
    },
    {
      "epoch": 2.4922600619195046,
      "grad_norm": 2.2761752605438232,
      "learning_rate": 1.5304890498123338e-05,
      "loss": 1.4858,
      "step": 3220
    },
    {
      "epoch": 2.5077399380804954,
      "grad_norm": 3.006927728652954,
      "learning_rate": 1.4409990963297093e-05,
      "loss": 1.4152,
      "step": 3240
    },
    {
      "epoch": 2.523219814241486,
      "grad_norm": 2.3741161823272705,
      "learning_rate": 1.3540014680069857e-05,
      "loss": 1.5054,
      "step": 3260
    },
    {
      "epoch": 2.538699690402477,
      "grad_norm": 3.471731424331665,
      "learning_rate": 1.2695214979852987e-05,
      "loss": 1.4098,
      "step": 3280
    },
    {
      "epoch": 2.5541795665634677,
      "grad_norm": 2.505537509918213,
      "learning_rate": 1.1875837862801431e-05,
      "loss": 1.4786,
      "step": 3300
    },
    {
      "epoch": 2.569659442724458,
      "grad_norm": 2.0300590991973877,
      "learning_rate": 1.1082121926179844e-05,
      "loss": 1.4354,
      "step": 3320
    },
    {
      "epoch": 2.585139318885449,
      "grad_norm": 2.6981465816497803,
      "learning_rate": 1.0314298294884839e-05,
      "loss": 1.4713,
      "step": 3340
    },
    {
      "epoch": 2.6006191950464395,
      "grad_norm": 2.780489444732666,
      "learning_rate": 9.572590554142757e-06,
      "loss": 1.4439,
      "step": 3360
    },
    {
      "epoch": 2.6160990712074303,
      "grad_norm": 2.1285483837127686,
      "learning_rate": 8.85721468440327e-06,
      "loss": 1.5715,
      "step": 3380
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 2.0249032974243164,
      "learning_rate": 8.168378998447123e-06,
      "loss": 1.3899,
      "step": 3400
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 2.097409725189209,
      "learning_rate": 7.506284080726955e-06,
      "loss": 1.5343,
      "step": 3420
    },
    {
      "epoch": 2.6625386996904026,
      "grad_norm": 1.9598788022994995,
      "learning_rate": 6.87112272895829e-06,
      "loss": 1.4559,
      "step": 3440
    },
    {
      "epoch": 2.678018575851393,
      "grad_norm": 2.149622917175293,
      "learning_rate": 6.26307989797823e-06,
      "loss": 1.5203,
      "step": 3460
    },
    {
      "epoch": 2.693498452012384,
      "grad_norm": 1.9326039552688599,
      "learning_rate": 5.682332645887689e-06,
      "loss": 1.373,
      "step": 3480
    },
    {
      "epoch": 2.708978328173375,
      "grad_norm": 2.489595890045166,
      "learning_rate": 5.129050082493336e-06,
      "loss": 1.4674,
      "step": 3500
    },
    {
      "epoch": 2.7244582043343653,
      "grad_norm": 2.2312111854553223,
      "learning_rate": 4.603393320063831e-06,
      "loss": 1.4345,
      "step": 3520
    },
    {
      "epoch": 2.739938080495356,
      "grad_norm": 2.5505943298339844,
      "learning_rate": 4.105515426415074e-06,
      "loss": 1.3763,
      "step": 3540
    },
    {
      "epoch": 2.7554179566563466,
      "grad_norm": 2.39302659034729,
      "learning_rate": 3.6355613803378154e-06,
      "loss": 1.4481,
      "step": 3560
    },
    {
      "epoch": 2.7708978328173375,
      "grad_norm": 2.956669569015503,
      "learning_rate": 3.193668029380725e-06,
      "loss": 1.3423,
      "step": 3580
    },
    {
      "epoch": 2.7863777089783284,
      "grad_norm": 2.6478095054626465,
      "learning_rate": 2.7799640500014047e-06,
      "loss": 1.5363,
      "step": 3600
    },
    {
      "epoch": 2.801857585139319,
      "grad_norm": 2.2021002769470215,
      "learning_rate": 2.3945699100965e-06,
      "loss": 1.4281,
      "step": 3620
    },
    {
      "epoch": 2.8173374613003097,
      "grad_norm": 2.756779909133911,
      "learning_rate": 2.0375978339223776e-06,
      "loss": 1.3747,
      "step": 3640
    },
    {
      "epoch": 2.8328173374613,
      "grad_norm": 2.004383087158203,
      "learning_rate": 1.7091517694160286e-06,
      "loss": 1.3276,
      "step": 3660
    },
    {
      "epoch": 2.848297213622291,
      "grad_norm": 2.8317830562591553,
      "learning_rate": 1.4093273579261935e-06,
      "loss": 1.3893,
      "step": 3680
    },
    {
      "epoch": 2.863777089783282,
      "grad_norm": 2.2364342212677,
      "learning_rate": 1.1382119063631736e-06,
      "loss": 1.4623,
      "step": 3700
    },
    {
      "epoch": 2.8792569659442724,
      "grad_norm": 2.155151128768921,
      "learning_rate": 8.958843617757007e-07,
      "loss": 1.4898,
      "step": 3720
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 2.1057004928588867,
      "learning_rate": 6.824152883619705e-07,
      "loss": 1.4417,
      "step": 3740
    },
    {
      "epoch": 2.9102167182662537,
      "grad_norm": 2.36039662361145,
      "learning_rate": 4.978668469218906e-07,
      "loss": 1.4101,
      "step": 3760
    },
    {
      "epoch": 2.9256965944272446,
      "grad_norm": 2.6042068004608154,
      "learning_rate": 3.422927767562256e-07,
      "loss": 1.3303,
      "step": 3780
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 2.136204957962036,
      "learning_rate": 2.1573838001808232e-07,
      "loss": 1.4015,
      "step": 3800
    },
    {
      "epoch": 2.956656346749226,
      "grad_norm": 2.2292158603668213,
      "learning_rate": 1.182405085211724e-07,
      "loss": 1.4547,
      "step": 3820
    },
    {
      "epoch": 2.972136222910217,
      "grad_norm": 2.508124589920044,
      "learning_rate": 4.982755300889652e-08,
      "loss": 1.3741,
      "step": 3840
    },
    {
      "epoch": 2.9876160990712073,
      "grad_norm": 2.2655694484710693,
      "learning_rate": 1.0519434887057422e-08,
      "loss": 1.4247,
      "step": 3860
    }
  ],
  "logging_steps": 20,
  "max_steps": 3876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4761450366959616e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}