{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.997163925127623,
  "eval_steps": 1000,
  "global_step": 4405,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011344299489506523,
      "grad_norm": 2.3206300735473633,
      "learning_rate": 4.535147392290249e-06,
      "loss": 1.5929,
      "step": 10
    },
    {
      "epoch": 0.022688598979013045,
      "grad_norm": 1.2386493682861328,
      "learning_rate": 9.070294784580499e-06,
      "loss": 1.6159,
      "step": 20
    },
    {
      "epoch": 0.03403289846851957,
      "grad_norm": 1.1790252923965454,
      "learning_rate": 1.360544217687075e-05,
      "loss": 1.538,
      "step": 30
    },
    {
      "epoch": 0.04537719795802609,
      "grad_norm": 1.021796703338623,
      "learning_rate": 1.8140589569160997e-05,
      "loss": 1.4132,
      "step": 40
    },
    {
      "epoch": 0.05672149744753262,
      "grad_norm": 1.3392266035079956,
      "learning_rate": 2.267573696145125e-05,
      "loss": 1.2604,
      "step": 50
    },
    {
      "epoch": 0.06806579693703914,
      "grad_norm": 0.9446895122528076,
      "learning_rate": 2.72108843537415e-05,
      "loss": 1.1644,
      "step": 60
    },
    {
      "epoch": 0.07941009642654566,
      "grad_norm": 1.3290923833847046,
      "learning_rate": 3.1746031746031745e-05,
      "loss": 1.1082,
      "step": 70
    },
    {
      "epoch": 0.09075439591605218,
      "grad_norm": 1.5161434412002563,
      "learning_rate": 3.6281179138321995e-05,
      "loss": 1.0389,
      "step": 80
    },
    {
      "epoch": 0.1020986954055587,
      "grad_norm": 0.6483525633811951,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 1.0542,
      "step": 90
    },
    {
      "epoch": 0.11344299489506524,
      "grad_norm": 0.8814989924430847,
      "learning_rate": 4.53514739229025e-05,
      "loss": 0.9847,
      "step": 100
    },
    {
      "epoch": 0.12478729438457176,
      "grad_norm": 0.7316718101501465,
      "learning_rate": 4.9886621315192745e-05,
      "loss": 1.0585,
      "step": 110
    },
    {
      "epoch": 0.13613159387407828,
      "grad_norm": 0.7645348310470581,
      "learning_rate": 5.4421768707483e-05,
      "loss": 0.9713,
      "step": 120
    },
    {
      "epoch": 0.1474758933635848,
      "grad_norm": 0.6830883622169495,
      "learning_rate": 5.895691609977324e-05,
      "loss": 0.9823,
      "step": 130
    },
    {
      "epoch": 0.15882019285309132,
      "grad_norm": 1.3199207782745361,
      "learning_rate": 6.349206349206349e-05,
      "loss": 0.9992,
      "step": 140
    },
    {
      "epoch": 0.17016449234259784,
      "grad_norm": 0.7770159840583801,
      "learning_rate": 6.802721088435374e-05,
      "loss": 1.0085,
      "step": 150
    },
    {
      "epoch": 0.18150879183210436,
      "grad_norm": 1.623410940170288,
      "learning_rate": 7.256235827664399e-05,
      "loss": 1.0491,
      "step": 160
    },
    {
      "epoch": 0.19285309132161088,
      "grad_norm": 2.8830106258392334,
      "learning_rate": 7.709750566893424e-05,
      "loss": 1.0686,
      "step": 170
    },
    {
      "epoch": 0.2041973908111174,
      "grad_norm": 1.3428577184677124,
      "learning_rate": 8.163265306122449e-05,
      "loss": 1.0359,
      "step": 180
    },
    {
      "epoch": 0.21554169030062392,
      "grad_norm": 0.8043076395988464,
      "learning_rate": 8.616780045351474e-05,
      "loss": 1.0496,
      "step": 190
    },
    {
      "epoch": 0.22688598979013047,
      "grad_norm": 1.8799352645874023,
      "learning_rate": 9.0702947845805e-05,
      "loss": 1.0284,
      "step": 200
    },
    {
      "epoch": 0.238230289279637,
      "grad_norm": 0.6667978167533875,
      "learning_rate": 9.523809523809524e-05,
      "loss": 1.0162,
      "step": 210
    },
    {
      "epoch": 0.2495745887691435,
      "grad_norm": 0.815127968788147,
      "learning_rate": 9.977324263038549e-05,
      "loss": 1.0009,
      "step": 220
    },
    {
      "epoch": 0.26091888825865,
      "grad_norm": 0.6558067798614502,
      "learning_rate": 0.00010430839002267574,
      "loss": 1.004,
      "step": 230
    },
    {
      "epoch": 0.27226318774815655,
      "grad_norm": 0.6002511382102966,
      "learning_rate": 0.000108843537414966,
      "loss": 0.9702,
      "step": 240
    },
    {
      "epoch": 0.28360748723766305,
      "grad_norm": 0.7007895708084106,
      "learning_rate": 0.00011337868480725624,
      "loss": 1.0266,
      "step": 250
    },
    {
      "epoch": 0.2949517867271696,
      "grad_norm": 0.7985921502113342,
      "learning_rate": 0.00011791383219954648,
      "loss": 0.9753,
      "step": 260
    },
    {
      "epoch": 0.30629608621667614,
      "grad_norm": 0.5343239903450012,
      "learning_rate": 0.00012244897959183676,
      "loss": 1.036,
      "step": 270
    },
    {
      "epoch": 0.31764038570618264,
      "grad_norm": 0.7095124125480652,
      "learning_rate": 0.00012698412698412698,
      "loss": 1.0061,
      "step": 280
    },
    {
      "epoch": 0.3289846851956892,
      "grad_norm": 0.8570685386657715,
      "learning_rate": 0.00013151927437641726,
      "loss": 0.9458,
      "step": 290
    },
    {
      "epoch": 0.3403289846851957,
      "grad_norm": 0.6379779577255249,
      "learning_rate": 0.00013605442176870748,
      "loss": 0.9965,
      "step": 300
    },
    {
      "epoch": 0.3516732841747022,
      "grad_norm": 0.9263567328453064,
      "learning_rate": 0.00014058956916099776,
      "loss": 0.9601,
      "step": 310
    },
    {
      "epoch": 0.3630175836642087,
      "grad_norm": 0.7343761920928955,
      "learning_rate": 0.00014512471655328798,
      "loss": 1.0182,
      "step": 320
    },
    {
      "epoch": 0.37436188315371527,
      "grad_norm": 0.588762640953064,
      "learning_rate": 0.00014965986394557826,
      "loss": 0.9762,
      "step": 330
    },
    {
      "epoch": 0.38570618264322176,
      "grad_norm": 0.6719630360603333,
      "learning_rate": 0.00015419501133786848,
      "loss": 0.989,
      "step": 340
    },
    {
      "epoch": 0.3970504821327283,
      "grad_norm": 1.641836166381836,
      "learning_rate": 0.00015873015873015873,
      "loss": 0.9611,
      "step": 350
    },
    {
      "epoch": 0.4083947816222348,
      "grad_norm": 0.9340532422065735,
      "learning_rate": 0.00016326530612244898,
      "loss": 0.9861,
      "step": 360
    },
    {
      "epoch": 0.41973908111174135,
      "grad_norm": 0.737554669380188,
      "learning_rate": 0.00016780045351473923,
      "loss": 1.0,
      "step": 370
    },
    {
      "epoch": 0.43108338060124785,
      "grad_norm": 1.1190237998962402,
      "learning_rate": 0.00017233560090702948,
      "loss": 1.016,
      "step": 380
    },
    {
      "epoch": 0.4424276800907544,
      "grad_norm": 0.7501509785652161,
      "learning_rate": 0.00017687074829931973,
      "loss": 0.9743,
      "step": 390
    },
    {
      "epoch": 0.45377197958026094,
      "grad_norm": 0.5105754733085632,
      "learning_rate": 0.00018140589569161,
      "loss": 1.0182,
      "step": 400
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 0.7148075699806213,
      "learning_rate": 0.00018594104308390023,
      "loss": 0.9673,
      "step": 410
    },
    {
      "epoch": 0.476460578559274,
      "grad_norm": 0.49944302439689636,
      "learning_rate": 0.00019047619047619048,
      "loss": 1.0083,
      "step": 420
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.5624661445617676,
      "learning_rate": 0.00019501133786848073,
      "loss": 1.0201,
      "step": 430
    },
    {
      "epoch": 0.499149177538287,
      "grad_norm": 0.5779452919960022,
      "learning_rate": 0.00019954648526077098,
      "loss": 1.0165,
      "step": 440
    },
    {
      "epoch": 0.5104934770277936,
      "grad_norm": 0.8505494594573975,
      "learning_rate": 0.0001999974561843451,
      "loss": 0.9527,
      "step": 450
    },
    {
      "epoch": 0.5218377765173,
      "grad_norm": 0.7141993641853333,
      "learning_rate": 0.00019998866291366877,
      "loss": 0.9927,
      "step": 460
    },
    {
      "epoch": 0.5331820760068066,
      "grad_norm": 0.5913094282150269,
      "learning_rate": 0.0001999735893350151,
      "loss": 1.0054,
      "step": 470
    },
    {
      "epoch": 0.5445263754963131,
      "grad_norm": 0.5813531279563904,
      "learning_rate": 0.00019995223639515864,
      "loss": 0.9511,
      "step": 480
    },
    {
      "epoch": 0.5558706749858197,
      "grad_norm": 0.9083317518234253,
      "learning_rate": 0.0001999246054352818,
      "loss": 0.9596,
      "step": 490
    },
    {
      "epoch": 0.5672149744753261,
      "grad_norm": 0.8444753885269165,
      "learning_rate": 0.00019989069819089067,
      "loss": 1.0163,
      "step": 500
    },
    {
      "epoch": 0.5785592739648326,
      "grad_norm": 0.6896610856056213,
      "learning_rate": 0.0001998505167917061,
      "loss": 0.9606,
      "step": 510
    },
    {
      "epoch": 0.5899035734543392,
      "grad_norm": 0.7446523308753967,
      "learning_rate": 0.00019980406376152984,
      "loss": 0.9748,
      "step": 520
    },
    {
      "epoch": 0.6012478729438457,
      "grad_norm": 0.5111407041549683,
      "learning_rate": 0.00019975134201808605,
      "loss": 0.9364,
      "step": 530
    },
    {
      "epoch": 0.6125921724333523,
      "grad_norm": 0.6797256469726562,
      "learning_rate": 0.000199692354872838,
      "loss": 0.9766,
      "step": 540
    },
    {
      "epoch": 0.6239364719228587,
      "grad_norm": 0.9774245619773865,
      "learning_rate": 0.00019962710603078007,
      "loss": 0.9669,
      "step": 550
    },
    {
      "epoch": 0.6352807714123653,
      "grad_norm": 0.7039481997489929,
      "learning_rate": 0.0001995555995902052,
      "loss": 0.9371,
      "step": 560
    },
    {
      "epoch": 0.6466250709018718,
      "grad_norm": 0.7363829016685486,
      "learning_rate": 0.0001994778400424472,
      "loss": 0.9809,
      "step": 570
    },
    {
      "epoch": 0.6579693703913784,
      "grad_norm": 0.7072857022285461,
      "learning_rate": 0.0001993938322715989,
      "loss": 0.9825,
      "step": 580
    },
    {
      "epoch": 0.6693136698808848,
      "grad_norm": 0.5628974437713623,
      "learning_rate": 0.00019930358155420525,
      "loss": 0.9101,
      "step": 590
    },
    {
      "epoch": 0.6806579693703914,
      "grad_norm": 0.6564317345619202,
      "learning_rate": 0.0001992070935589319,
      "loss": 1.0374,
      "step": 600
    },
    {
      "epoch": 0.6920022688598979,
      "grad_norm": 0.5805884599685669,
      "learning_rate": 0.0001991043743462092,
      "loss": 0.9695,
      "step": 610
    },
    {
      "epoch": 0.7033465683494045,
      "grad_norm": 0.5671830773353577,
      "learning_rate": 0.00019899543036785145,
      "loss": 0.9598,
      "step": 620
    },
    {
      "epoch": 0.7146908678389109,
      "grad_norm": 0.54367595911026,
      "learning_rate": 0.0001988802684666519,
      "loss": 0.962,
      "step": 630
    },
    {
      "epoch": 0.7260351673284174,
      "grad_norm": 0.6982467770576477,
      "learning_rate": 0.00019875889587595252,
      "loss": 0.9633,
      "step": 640
    },
    {
      "epoch": 0.737379466817924,
      "grad_norm": 0.6268488764762878,
      "learning_rate": 0.00019863132021919025,
      "loss": 0.9684,
      "step": 650
    },
    {
      "epoch": 0.7487237663074305,
      "grad_norm": 1.2111632823944092,
      "learning_rate": 0.00019849754950941758,
      "loss": 1.0044,
      "step": 660
    },
    {
      "epoch": 0.7600680657969371,
      "grad_norm": 0.6442829370498657,
      "learning_rate": 0.00019835759214879964,
      "loss": 0.9533,
      "step": 670
    },
    {
      "epoch": 0.7714123652864435,
      "grad_norm": 0.5263229608535767,
      "learning_rate": 0.00019821145692808633,
      "loss": 0.959,
      "step": 680
    },
    {
      "epoch": 0.7827566647759501,
      "grad_norm": 0.572928786277771,
      "learning_rate": 0.00019805915302606016,
      "loss": 0.9473,
      "step": 690
    },
    {
      "epoch": 0.7941009642654566,
      "grad_norm": 0.6176092624664307,
      "learning_rate": 0.00019790069000895987,
      "loss": 0.9164,
      "step": 700
    },
    {
      "epoch": 0.8054452637549632,
      "grad_norm": 0.5628384351730347,
      "learning_rate": 0.00019773607782987924,
      "loss": 0.9705,
      "step": 710
    },
    {
      "epoch": 0.8167895632444696,
      "grad_norm": 0.8331648111343384,
      "learning_rate": 0.00019756532682814232,
      "loss": 0.9497,
      "step": 720
    },
    {
      "epoch": 0.8281338627339762,
      "grad_norm": 0.5843848586082458,
      "learning_rate": 0.00019738844772865377,
      "loss": 0.9828,
      "step": 730
    },
    {
      "epoch": 0.8394781622234827,
      "grad_norm": 0.6603434681892395,
      "learning_rate": 0.0001972054516412253,
      "loss": 0.9717,
      "step": 740
    },
    {
      "epoch": 0.8508224617129893,
      "grad_norm": 0.5622076988220215,
      "learning_rate": 0.00019701635005987792,
      "loss": 0.9392,
      "step": 750
    },
    {
      "epoch": 0.8621667612024957,
      "grad_norm": 0.8947564959526062,
      "learning_rate": 0.00019682115486211984,
      "loss": 0.9917,
      "step": 760
    },
    {
      "epoch": 0.8735110606920022,
      "grad_norm": 0.5935038328170776,
      "learning_rate": 0.00019661987830820065,
      "loss": 0.9749,
      "step": 770
    },
    {
      "epoch": 0.8848553601815088,
      "grad_norm": 0.8751797676086426,
      "learning_rate": 0.000196412533040341,
      "loss": 0.9828,
      "step": 780
    },
    {
      "epoch": 0.8961996596710153,
      "grad_norm": 0.5279515981674194,
      "learning_rate": 0.00019619913208193882,
      "loss": 0.9685,
      "step": 790
    },
    {
      "epoch": 0.9075439591605219,
      "grad_norm": 0.643695056438446,
      "learning_rate": 0.00019597968883675116,
      "loss": 0.9547,
      "step": 800
    },
    {
      "epoch": 0.9188882586500283,
      "grad_norm": 0.7370747923851013,
      "learning_rate": 0.00019575421708805215,
      "loss": 0.9129,
      "step": 810
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 0.7514728307723999,
      "learning_rate": 0.0001955227309977677,
      "loss": 0.9929,
      "step": 820
    },
    {
      "epoch": 0.9415768576290414,
      "grad_norm": 0.6589088439941406,
      "learning_rate": 0.00019528524510558547,
      "loss": 0.9627,
      "step": 830
    },
    {
      "epoch": 0.952921157118548,
      "grad_norm": 0.548102617263794,
      "learning_rate": 0.00019504177432804203,
      "loss": 0.9307,
      "step": 840
    },
    {
      "epoch": 0.9642654566080544,
      "grad_norm": 0.458879679441452,
      "learning_rate": 0.00019479233395758576,
      "loss": 0.9838,
      "step": 850
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.9955594539642334,
      "learning_rate": 0.0001945369396616164,
      "loss": 0.9246,
      "step": 860
    },
    {
      "epoch": 0.9869540555870675,
      "grad_norm": 0.5781052708625793,
      "learning_rate": 0.0001942756074815009,
      "loss": 1.0076,
      "step": 870
    },
    {
      "epoch": 0.998298355076574,
      "grad_norm": 0.7370733022689819,
      "learning_rate": 0.00019400835383156592,
      "loss": 0.9618,
      "step": 880
    },
    {
      "epoch": 1.0096426545660806,
      "grad_norm": 0.6173350214958191,
      "learning_rate": 0.00019373519549806682,
      "loss": 0.872,
      "step": 890
    },
    {
      "epoch": 1.0209869540555871,
      "grad_norm": 0.6110262274742126,
      "learning_rate": 0.00019345614963813334,
      "loss": 0.8953,
      "step": 900
    },
    {
      "epoch": 1.0323312535450937,
      "grad_norm": 0.8880902528762817,
      "learning_rate": 0.00019317123377869192,
      "loss": 0.8847,
      "step": 910
    },
    {
      "epoch": 1.0436755530346,
      "grad_norm": 0.6907595992088318,
      "learning_rate": 0.00019288046581536486,
      "loss": 0.8878,
      "step": 920
    },
    {
      "epoch": 1.0550198525241066,
      "grad_norm": 0.7469139695167542,
      "learning_rate": 0.00019258386401134624,
      "loss": 0.9018,
      "step": 930
    },
    {
      "epoch": 1.0663641520136131,
      "grad_norm": 0.8650104403495789,
      "learning_rate": 0.0001922814469962549,
      "loss": 0.8825,
      "step": 940
    },
    {
      "epoch": 1.0777084515031197,
      "grad_norm": 1.1437135934829712,
      "learning_rate": 0.00019197323376496427,
      "loss": 0.8977,
      "step": 950
    },
    {
      "epoch": 1.0890527509926262,
      "grad_norm": 0.6191611289978027,
      "learning_rate": 0.00019165924367640916,
      "loss": 0.9059,
      "step": 960
    },
    {
      "epoch": 1.1003970504821328,
      "grad_norm": 0.7402692437171936,
      "learning_rate": 0.00019133949645237005,
      "loss": 0.8778,
      "step": 970
    },
    {
      "epoch": 1.1117413499716393,
      "grad_norm": 0.7002813220024109,
      "learning_rate": 0.00019101401217623426,
      "loss": 0.9281,
      "step": 980
    },
    {
      "epoch": 1.1230856494611459,
      "grad_norm": 0.9000174403190613,
      "learning_rate": 0.00019068281129173444,
      "loss": 0.8795,
      "step": 990
    },
    {
      "epoch": 1.1344299489506522,
      "grad_norm": 0.6749204993247986,
      "learning_rate": 0.00019034591460166463,
      "loss": 0.9091,
      "step": 1000
    },
    {
      "epoch": 1.1344299489506522,
      "eval_loss": 0.8940885663032532,
      "eval_runtime": 15.7869,
      "eval_samples_per_second": 94.065,
      "eval_steps_per_second": 11.782,
      "step": 1000
    },
    {
      "epoch": 1.1457742484401587,
      "grad_norm": 0.7294667959213257,
      "learning_rate": 0.00019000334326657345,
      "loss": 0.879,
      "step": 1010
    },
    {
      "epoch": 1.1571185479296653,
      "grad_norm": 0.9591787457466125,
      "learning_rate": 0.00018965511880343527,
      "loss": 0.9264,
      "step": 1020
    },
    {
      "epoch": 1.1684628474191718,
      "grad_norm": 0.9575808644294739,
      "learning_rate": 0.00018930126308429844,
      "loss": 0.8825,
      "step": 1030
    },
    {
      "epoch": 1.1798071469086784,
      "grad_norm": 0.49267736077308655,
      "learning_rate": 0.00018894179833491164,
      "loss": 0.9321,
      "step": 1040
    },
    {
      "epoch": 1.191151446398185,
      "grad_norm": 0.848102867603302,
      "learning_rate": 0.00018857674713332795,
      "loss": 0.8543,
      "step": 1050
    },
    {
      "epoch": 1.2024957458876915,
      "grad_norm": 0.7710912227630615,
      "learning_rate": 0.00018820613240848655,
      "loss": 0.9468,
      "step": 1060
    },
    {
      "epoch": 1.213840045377198,
      "grad_norm": 0.6399308443069458,
      "learning_rate": 0.00018782997743877264,
      "loss": 0.9081,
      "step": 1070
    },
    {
      "epoch": 1.2251843448667046,
      "grad_norm": 0.9124737977981567,
      "learning_rate": 0.00018744830585055538,
      "loss": 0.9288,
      "step": 1080
    },
    {
      "epoch": 1.236528644356211,
      "grad_norm": 0.6313666105270386,
      "learning_rate": 0.00018706114161670377,
      "loss": 0.8197,
      "step": 1090
    },
    {
      "epoch": 1.2478729438457175,
      "grad_norm": 0.7220073938369751,
      "learning_rate": 0.000186668509055081,
      "loss": 0.8576,
      "step": 1100
    },
    {
      "epoch": 1.259217243335224,
      "grad_norm": 1.1808422803878784,
      "learning_rate": 0.00018627043282701703,
      "loss": 0.9044,
      "step": 1110
    },
    {
      "epoch": 1.2705615428247305,
      "grad_norm": 0.6578934788703918,
      "learning_rate": 0.00018586693793575966,
      "loss": 0.9015,
      "step": 1120
    },
    {
      "epoch": 1.281905842314237,
      "grad_norm": 0.9080325961112976,
      "learning_rate": 0.0001854580497249039,
      "loss": 0.8919,
      "step": 1130
    },
    {
      "epoch": 1.2932501418037436,
      "grad_norm": 0.6446923017501831,
      "learning_rate": 0.00018504379387680034,
      "loss": 0.9033,
      "step": 1140
    },
    {
      "epoch": 1.3045944412932502,
      "grad_norm": 0.6877492070198059,
      "learning_rate": 0.00018462419641094189,
      "loss": 0.8843,
      "step": 1150
    },
    {
      "epoch": 1.3159387407827567,
      "grad_norm": 0.6565636396408081,
      "learning_rate": 0.00018419928368232957,
      "loss": 0.8925,
      "step": 1160
    },
    {
      "epoch": 1.3272830402722633,
      "grad_norm": 0.8198230862617493,
      "learning_rate": 0.0001837690823798171,
      "loss": 0.8495,
      "step": 1170
    },
    {
      "epoch": 1.3386273397617696,
      "grad_norm": 0.7579399347305298,
      "learning_rate": 0.00018333361952443462,
      "loss": 0.9051,
      "step": 1180
    },
    {
      "epoch": 1.3499716392512762,
      "grad_norm": 0.8067922592163086,
      "learning_rate": 0.0001828929224676914,
      "loss": 0.8677,
      "step": 1190
    },
    {
      "epoch": 1.3613159387407827,
      "grad_norm": 0.7077610492706299,
      "learning_rate": 0.00018244701888985802,
      "loss": 0.942,
      "step": 1200
    },
    {
      "epoch": 1.3726602382302893,
      "grad_norm": 1.2009291648864746,
      "learning_rate": 0.00018199593679822765,
      "loss": 0.9034,
      "step": 1210
    },
    {
      "epoch": 1.3840045377197958,
      "grad_norm": 0.8162534832954407,
      "learning_rate": 0.00018153970452535698,
      "loss": 0.8904,
      "step": 1220
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 0.6332406401634216,
      "learning_rate": 0.00018107835072728656,
      "loss": 0.8637,
      "step": 1230
    },
    {
      "epoch": 1.406693136698809,
      "grad_norm": 0.6449089050292969,
      "learning_rate": 0.00018061190438174105,
      "loss": 0.9463,
      "step": 1240
    },
    {
      "epoch": 1.4180374361883152,
      "grad_norm": 0.6543394327163696,
      "learning_rate": 0.00018014039478630894,
      "loss": 0.8497,
      "step": 1250
    },
    {
      "epoch": 1.429381735677822,
      "grad_norm": 0.7993437647819519,
      "learning_rate": 0.0001796638515566025,
      "loss": 0.9415,
      "step": 1260
    },
    {
      "epoch": 1.4407260351673283,
      "grad_norm": 0.878514289855957,
      "learning_rate": 0.0001791823046243977,
      "loss": 0.9143,
      "step": 1270
    },
    {
      "epoch": 1.4520703346568349,
      "grad_norm": 0.6794580817222595,
      "learning_rate": 0.00017869578423575387,
      "loss": 0.9041,
      "step": 1280
    },
    {
      "epoch": 1.4634146341463414,
      "grad_norm": 0.9009565711021423,
      "learning_rate": 0.00017820432094911427,
      "loss": 0.8773,
      "step": 1290
    },
    {
      "epoch": 1.474758933635848,
      "grad_norm": 0.6419825553894043,
      "learning_rate": 0.00017770794563338647,
      "loss": 0.9027,
      "step": 1300
    },
    {
      "epoch": 1.4861032331253545,
      "grad_norm": 0.7277469635009766,
      "learning_rate": 0.0001772066894660037,
      "loss": 0.9123,
      "step": 1310
    },
    {
      "epoch": 1.497447532614861,
      "grad_norm": 0.7514845132827759,
      "learning_rate": 0.00017670058393096634,
      "loss": 0.9095,
      "step": 1320
    },
    {
      "epoch": 1.5087918321043676,
      "grad_norm": 0.5530194044113159,
      "learning_rate": 0.0001761896608168646,
      "loss": 0.855,
      "step": 1330
    },
    {
      "epoch": 1.520136131593874,
      "grad_norm": 0.6379088759422302,
      "learning_rate": 0.0001756739522148818,
      "loss": 0.9485,
      "step": 1340
    },
    {
      "epoch": 1.5314804310833807,
      "grad_norm": 0.5411556959152222,
      "learning_rate": 0.0001751534905167787,
      "loss": 0.951,
      "step": 1350
    },
    {
      "epoch": 1.542824730572887,
      "grad_norm": 0.9241764545440674,
      "learning_rate": 0.00017462830841285894,
      "loss": 0.8459,
      "step": 1360
    },
    {
      "epoch": 1.5541690300623936,
      "grad_norm": 0.9029989242553711,
      "learning_rate": 0.00017409843888991584,
      "loss": 0.9045,
      "step": 1370
    },
    {
      "epoch": 1.5655133295519001,
      "grad_norm": 0.9002951979637146,
      "learning_rate": 0.00017356391522916042,
      "loss": 0.8388,
      "step": 1380
    },
    {
      "epoch": 1.5768576290414067,
      "grad_norm": 0.6322818994522095,
      "learning_rate": 0.0001730247710041311,
      "loss": 0.8937,
      "step": 1390
    },
    {
      "epoch": 1.5882019285309132,
      "grad_norm": 0.9197801351547241,
      "learning_rate": 0.00017248104007858476,
      "loss": 0.8656,
      "step": 1400
    },
    {
      "epoch": 1.5995462280204198,
      "grad_norm": 0.7498595714569092,
      "learning_rate": 0.00017193275660436997,
      "loss": 0.8848,
      "step": 1410
    },
    {
      "epoch": 1.6108905275099263,
      "grad_norm": 1.0003221035003662,
      "learning_rate": 0.00017137995501928166,
      "loss": 0.8494,
      "step": 1420
    },
    {
      "epoch": 1.6222348269994327,
      "grad_norm": 0.6622512340545654,
      "learning_rate": 0.00017082267004489842,
      "loss": 0.9158,
      "step": 1430
    },
    {
      "epoch": 1.6335791264889394,
      "grad_norm": 1.2562657594680786,
      "learning_rate": 0.00017026093668440114,
      "loss": 0.8899,
      "step": 1440
    },
    {
      "epoch": 1.6449234259784458,
      "grad_norm": 0.5380372405052185,
      "learning_rate": 0.00016969479022037502,
      "loss": 0.9082,
      "step": 1450
    },
    {
      "epoch": 1.6562677254679523,
      "grad_norm": 0.7120011448860168,
      "learning_rate": 0.00016912426621259297,
      "loss": 0.8456,
      "step": 1460
    },
    {
      "epoch": 1.6676120249574589,
      "grad_norm": 0.580111026763916,
      "learning_rate": 0.0001685494004957824,
      "loss": 0.9272,
      "step": 1470
    },
    {
      "epoch": 1.6789563244469654,
      "grad_norm": 0.9516561627388,
      "learning_rate": 0.0001679702291773743,
      "loss": 0.906,
      "step": 1480
    },
    {
      "epoch": 1.690300623936472,
      "grad_norm": 0.5973901152610779,
      "learning_rate": 0.0001673867886352354,
      "loss": 0.931,
      "step": 1490
    },
    {
      "epoch": 1.7016449234259783,
      "grad_norm": 0.7292883992195129,
      "learning_rate": 0.00016679911551538317,
      "loss": 0.8848,
      "step": 1500
    },
    {
      "epoch": 1.712989222915485,
      "grad_norm": 0.6363751888275146,
      "learning_rate": 0.0001662072467296842,
      "loss": 0.9059,
      "step": 1510
    },
    {
      "epoch": 1.7243335224049914,
      "grad_norm": 0.9236806631088257,
      "learning_rate": 0.00016561121945353566,
      "loss": 0.8557,
      "step": 1520
    },
    {
      "epoch": 1.7356778218944982,
      "grad_norm": 0.6865366697311401,
      "learning_rate": 0.00016501107112353028,
      "loss": 0.9264,
      "step": 1530
    },
    {
      "epoch": 1.7470221213840045,
      "grad_norm": 0.6749486923217773,
      "learning_rate": 0.00016440683943510516,
      "loss": 0.9224,
      "step": 1540
    },
    {
      "epoch": 1.758366420873511,
      "grad_norm": 0.7539329528808594,
      "learning_rate": 0.00016379856234017382,
      "loss": 0.8594,
      "step": 1550
    },
    {
      "epoch": 1.7697107203630176,
      "grad_norm": 0.6702885031700134,
      "learning_rate": 0.0001631862780447426,
      "loss": 0.8896,
      "step": 1560
    },
    {
      "epoch": 1.7810550198525241,
      "grad_norm": 0.6152791976928711,
      "learning_rate": 0.00016257002500651098,
      "loss": 0.8738,
      "step": 1570
    },
    {
      "epoch": 1.7923993193420307,
      "grad_norm": 0.5736550688743591,
      "learning_rate": 0.00016194984193245587,
      "loss": 0.9018,
      "step": 1580
    },
    {
      "epoch": 1.803743618831537,
      "grad_norm": 0.751157820224762,
      "learning_rate": 0.00016132576777640067,
      "loss": 0.8605,
      "step": 1590
    },
    {
      "epoch": 1.8150879183210438,
      "grad_norm": 0.6626732349395752,
      "learning_rate": 0.0001606978417365682,
      "loss": 0.8857,
      "step": 1600
    },
    {
      "epoch": 1.82643221781055,
      "grad_norm": 0.584065318107605,
      "learning_rate": 0.00016006610325311908,
      "loss": 0.9104,
      "step": 1610
    },
    {
      "epoch": 1.8377765173000569,
      "grad_norm": 0.5933496356010437,
      "learning_rate": 0.0001594305920056742,
      "loss": 0.8167,
      "step": 1620
    },
    {
      "epoch": 1.8491208167895632,
      "grad_norm": 0.5618401765823364,
      "learning_rate": 0.00015879134791082247,
      "loss": 0.8907,
      "step": 1630
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 0.9804329872131348,
      "learning_rate": 0.00015814841111961374,
      "loss": 0.9494,
      "step": 1640
    },
    {
      "epoch": 1.8718094157685763,
      "grad_norm": 0.937347412109375,
      "learning_rate": 0.00015750182201503682,
      "loss": 0.9045,
      "step": 1650
    },
    {
      "epoch": 1.8831537152580828,
      "grad_norm": 0.8898664712905884,
      "learning_rate": 0.00015685162120948317,
      "loss": 0.9346,
      "step": 1660
    },
    {
      "epoch": 1.8944980147475894,
      "grad_norm": 0.8580901622772217,
      "learning_rate": 0.00015619784954219577,
      "loss": 0.9412,
      "step": 1670
    },
    {
      "epoch": 1.9058423142370957,
      "grad_norm": 0.6913225054740906,
      "learning_rate": 0.00015554054807670418,
      "loss": 0.9006,
      "step": 1680
    },
    {
      "epoch": 1.9171866137266025,
      "grad_norm": 0.7101637125015259,
      "learning_rate": 0.00015487975809824539,
      "loss": 0.8857,
      "step": 1690
    },
    {
      "epoch": 1.9285309132161088,
      "grad_norm": 0.8228437900543213,
      "learning_rate": 0.00015421552111117044,
      "loss": 0.8607,
      "step": 1700
    },
    {
      "epoch": 1.9398752127056156,
      "grad_norm": 0.5591906905174255,
      "learning_rate": 0.00015354787883633782,
      "loss": 0.8674,
      "step": 1710
    },
    {
      "epoch": 1.951219512195122,
      "grad_norm": 0.6841379404067993,
      "learning_rate": 0.00015287687320849271,
      "loss": 0.8387,
      "step": 1720
    },
    {
      "epoch": 1.9625638116846285,
      "grad_norm": 0.8344857096672058,
      "learning_rate": 0.00015220254637363318,
      "loss": 0.9227,
      "step": 1730
    },
    {
      "epoch": 1.973908111174135,
      "grad_norm": 0.8986241221427917,
      "learning_rate": 0.00015152494068636308,
      "loss": 0.8917,
      "step": 1740
    },
    {
      "epoch": 1.9852524106636416,
      "grad_norm": 0.5783970952033997,
      "learning_rate": 0.00015084409870723154,
      "loss": 0.872,
      "step": 1750
    },
    {
      "epoch": 1.996596710153148,
      "grad_norm": 0.6369901895523071,
      "learning_rate": 0.00015016006320005986,
      "loss": 0.9132,
      "step": 1760
    },
    {
      "epoch": 2.0079410096426544,
      "grad_norm": 0.5906355381011963,
      "learning_rate": 0.00014947287712925545,
      "loss": 0.8074,
      "step": 1770
    },
    {
      "epoch": 2.019285309132161,
      "grad_norm": 0.6774492263793945,
      "learning_rate": 0.00014878258365711334,
      "loss": 0.759,
      "step": 1780
    },
    {
      "epoch": 2.0306296086216675,
      "grad_norm": 0.8353272676467896,
      "learning_rate": 0.00014808922614110493,
      "loss": 0.8028,
      "step": 1790
    },
    {
      "epoch": 2.0419739081111743,
      "grad_norm": 0.8876771926879883,
      "learning_rate": 0.00014739284813115498,
      "loss": 0.7302,
      "step": 1800
    },
    {
      "epoch": 2.0533182076006806,
      "grad_norm": 0.6215524673461914,
      "learning_rate": 0.00014669349336690594,
      "loss": 0.7759,
      "step": 1810
    },
    {
      "epoch": 2.0646625070901874,
      "grad_norm": 0.5663015246391296,
      "learning_rate": 0.00014599120577497087,
      "loss": 0.7834,
      "step": 1820
    },
    {
      "epoch": 2.0760068065796937,
      "grad_norm": 0.6096060872077942,
      "learning_rate": 0.00014528602946617432,
      "loss": 0.8364,
      "step": 1830
    },
    {
      "epoch": 2.0873511060692,
      "grad_norm": 0.7625316977500916,
      "learning_rate": 0.00014457800873278172,
      "loss": 0.7558,
      "step": 1840
    },
    {
      "epoch": 2.098695405558707,
      "grad_norm": 0.6301640272140503,
      "learning_rate": 0.0001438671880457174,
      "loss": 0.8297,
      "step": 1850
    },
    {
      "epoch": 2.110039705048213,
      "grad_norm": 0.6493074297904968,
      "learning_rate": 0.00014315361205177127,
      "loss": 0.7764,
      "step": 1860
    },
    {
      "epoch": 2.12138400453772,
      "grad_norm": 0.8326807618141174,
      "learning_rate": 0.0001424373255707947,
      "loss": 0.7895,
      "step": 1870
    },
    {
      "epoch": 2.1327283040272262,
      "grad_norm": 1.0578484535217285,
      "learning_rate": 0.00014171837359288524,
      "loss": 0.7889,
      "step": 1880
    },
    {
      "epoch": 2.144072603516733,
      "grad_norm": 0.6812543272972107,
      "learning_rate": 0.0001409968012755609,
      "loss": 0.7643,
      "step": 1890
    },
    {
      "epoch": 2.1554169030062393,
      "grad_norm": 0.8412303924560547,
      "learning_rate": 0.00014027265394092364,
      "loss": 0.7402,
      "step": 1900
    },
    {
      "epoch": 2.1667612024957457,
      "grad_norm": 0.947846531867981,
      "learning_rate": 0.00013954597707281288,
      "loss": 0.7763,
      "step": 1910
    },
    {
      "epoch": 2.1781055019852524,
      "grad_norm": 0.7577157616615295,
      "learning_rate": 0.00013881681631394842,
      "loss": 0.8334,
      "step": 1920
    },
    {
      "epoch": 2.1894498014747588,
      "grad_norm": 0.6362768411636353,
      "learning_rate": 0.0001380852174630639,
      "loss": 0.7484,
      "step": 1930
    },
    {
      "epoch": 2.2007941009642655,
      "grad_norm": 0.7967275381088257,
      "learning_rate": 0.00013735122647202984,
      "loss": 0.7302,
      "step": 1940
    },
    {
      "epoch": 2.212138400453772,
      "grad_norm": 0.7726805210113525,
      "learning_rate": 0.0001366148894429677,
      "loss": 0.7836,
      "step": 1950
    },
    {
      "epoch": 2.2234826999432786,
      "grad_norm": 0.7741623520851135,
      "learning_rate": 0.00013587625262535396,
      "loss": 0.7925,
      "step": 1960
    },
    {
      "epoch": 2.234826999432785,
      "grad_norm": 0.7582458257675171,
      "learning_rate": 0.0001351353624131153,
      "loss": 0.7765,
      "step": 1970
    },
    {
      "epoch": 2.2461712989222917,
      "grad_norm": 0.8276723027229309,
      "learning_rate": 0.00013439226534171463,
      "loss": 0.81,
      "step": 1980
    },
    {
      "epoch": 2.257515598411798,
      "grad_norm": 0.8419069051742554,
      "learning_rate": 0.00013364700808522807,
      "loss": 0.7464,
      "step": 1990
    },
    {
      "epoch": 2.2688598979013044,
      "grad_norm": 0.7446946501731873,
      "learning_rate": 0.00013289963745341345,
      "loss": 0.7524,
      "step": 2000
    },
    {
      "epoch": 2.2688598979013044,
      "eval_loss": 0.9066722989082336,
      "eval_runtime": 15.6396,
      "eval_samples_per_second": 94.951,
      "eval_steps_per_second": 11.893,
      "step": 2000
    },
    {
      "epoch": 2.280204197390811,
      "grad_norm": 0.7091513872146606,
      "learning_rate": 0.00013215020038877002,
      "loss": 0.7806,
      "step": 2010
    },
    {
      "epoch": 2.2915484968803175,
      "grad_norm": 0.5853792428970337,
      "learning_rate": 0.0001313987439635902,
      "loss": 0.7625,
      "step": 2020
    },
    {
      "epoch": 2.3028927963698242,
      "grad_norm": 0.7464004158973694,
      "learning_rate": 0.00013064531537700284,
      "loss": 0.7313,
      "step": 2030
    },
    {
      "epoch": 2.3142370958593306,
      "grad_norm": 0.6370956301689148,
      "learning_rate": 0.00012988996195200858,
      "loss": 0.7903,
      "step": 2040
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 0.8973234295845032,
      "learning_rate": 0.0001291327311325076,
      "loss": 0.7537,
      "step": 2050
    },
    {
      "epoch": 2.3369256948383437,
      "grad_norm": 1.206678032875061,
      "learning_rate": 0.00012837367048031955,
      "loss": 0.8081,
      "step": 2060
    },
    {
      "epoch": 2.3482699943278504,
      "grad_norm": 0.9258993864059448,
      "learning_rate": 0.0001276128276721963,
      "loss": 0.7754,
      "step": 2070
    },
    {
      "epoch": 2.3596142938173568,
      "grad_norm": 0.8008835315704346,
      "learning_rate": 0.00012685025049682732,
      "loss": 0.8119,
      "step": 2080
    },
    {
      "epoch": 2.370958593306863,
      "grad_norm": 0.8094901442527771,
      "learning_rate": 0.0001260859868518379,
      "loss": 0.7889,
      "step": 2090
    },
    {
      "epoch": 2.38230289279637,
      "grad_norm": 0.7824433445930481,
      "learning_rate": 0.00012532008474078093,
      "loss": 0.8443,
      "step": 2100
    },
    {
      "epoch": 2.393647192285876,
      "grad_norm": 0.8314623236656189,
      "learning_rate": 0.00012455259227012172,
      "loss": 0.8009,
      "step": 2110
    },
    {
      "epoch": 2.404991491775383,
      "grad_norm": 0.993483304977417,
      "learning_rate": 0.0001237835576462163,
      "loss": 0.803,
      "step": 2120
    },
    {
      "epoch": 2.4163357912648893,
      "grad_norm": 0.7922090291976929,
      "learning_rate": 0.00012301302917228364,
      "loss": 0.7785,
      "step": 2130
    },
    {
      "epoch": 2.427680090754396,
      "grad_norm": 0.8681336045265198,
      "learning_rate": 0.00012224105524537176,
      "loss": 0.7427,
      "step": 2140
    },
    {
      "epoch": 2.4390243902439024,
      "grad_norm": 0.868011474609375,
      "learning_rate": 0.00012146768435331797,
      "loss": 0.7841,
      "step": 2150
    },
    {
      "epoch": 2.450368689733409,
      "grad_norm": 0.8300703763961792,
      "learning_rate": 0.00012069296507170307,
      "loss": 0.7113,
      "step": 2160
    },
    {
      "epoch": 2.4617129892229155,
      "grad_norm": 1.0211178064346313,
      "learning_rate": 0.00011991694606080062,
      "loss": 0.7927,
      "step": 2170
    },
    {
      "epoch": 2.473057288712422,
      "grad_norm": 1.1126124858856201,
      "learning_rate": 0.00011913967606252035,
      "loss": 0.798,
      "step": 2180
    },
    {
      "epoch": 2.4844015882019286,
      "grad_norm": 1.331468939781189,
      "learning_rate": 0.00011836120389734677,
      "loss": 0.7868,
      "step": 2190
    },
    {
      "epoch": 2.495745887691435,
      "grad_norm": 0.7289639115333557,
      "learning_rate": 0.00011758157846127278,
      "loss": 0.7501,
      "step": 2200
    },
    {
      "epoch": 2.5070901871809417,
      "grad_norm": 0.6862948536872864,
      "learning_rate": 0.00011680084872272843,
      "loss": 0.8113,
      "step": 2210
    },
    {
      "epoch": 2.518434486670448,
      "grad_norm": 0.6838523745536804,
      "learning_rate": 0.00011601906371950523,
      "loss": 0.7794,
      "step": 2220
    },
    {
      "epoch": 2.5297787861599548,
      "grad_norm": 0.8923412561416626,
      "learning_rate": 0.00011523627255567606,
      "loss": 0.7532,
      "step": 2230
    },
    {
      "epoch": 2.541123085649461,
      "grad_norm": 0.7864569425582886,
      "learning_rate": 0.00011445252439851092,
      "loss": 0.8044,
      "step": 2240
    },
    {
      "epoch": 2.552467385138968,
      "grad_norm": 0.9186776280403137,
      "learning_rate": 0.0001136678684753889,
      "loss": 0.7861,
      "step": 2250
    },
    {
      "epoch": 2.563811684628474,
      "grad_norm": 0.9502933025360107,
      "learning_rate": 0.00011288235407070588,
      "loss": 0.7441,
      "step": 2260
    },
    {
      "epoch": 2.5751559841179805,
      "grad_norm": 0.9764688014984131,
      "learning_rate": 0.00011209603052277924,
      "loss": 0.7519,
      "step": 2270
    },
    {
      "epoch": 2.5865002836074873,
      "grad_norm": 0.8480959534645081,
      "learning_rate": 0.00011130894722074874,
      "loss": 0.7743,
      "step": 2280
    },
    {
      "epoch": 2.5978445830969936,
      "grad_norm": 0.8660979866981506,
      "learning_rate": 0.00011052115360147448,
      "loss": 0.7989,
      "step": 2290
    },
    {
      "epoch": 2.6091888825865004,
      "grad_norm": 0.6586043238639832,
      "learning_rate": 0.0001097326991464318,
      "loss": 0.7676,
      "step": 2300
    },
    {
      "epoch": 2.6205331820760067,
      "grad_norm": 0.7315343618392944,
      "learning_rate": 0.00010894363337860314,
      "loss": 0.7699,
      "step": 2310
    },
    {
      "epoch": 2.6318774815655135,
      "grad_norm": 0.7257770895957947,
      "learning_rate": 0.0001081540058593677,
      "loss": 0.7773,
      "step": 2320
    },
    {
      "epoch": 2.64322178105502,
      "grad_norm": 0.6760928630828857,
      "learning_rate": 0.00010736386618538838,
      "loss": 0.7902,
      "step": 2330
    },
    {
      "epoch": 2.6545660805445266,
      "grad_norm": 0.6824659705162048,
      "learning_rate": 0.00010657326398549661,
      "loss": 0.7759,
      "step": 2340
    },
    {
      "epoch": 2.665910380034033,
      "grad_norm": 0.972321629524231,
      "learning_rate": 0.0001057822489175752,
      "loss": 0.7926,
      "step": 2350
    },
    {
      "epoch": 2.6772546795235392,
      "grad_norm": 0.9526649713516235,
      "learning_rate": 0.00010499087066543922,
      "loss": 0.7648,
      "step": 2360
    },
    {
      "epoch": 2.688598979013046,
      "grad_norm": 0.7266947031021118,
      "learning_rate": 0.0001041991789357155,
      "loss": 0.776,
      "step": 2370
    },
    {
      "epoch": 2.6999432785025523,
      "grad_norm": 0.808121383190155,
      "learning_rate": 0.00010340722345472037,
      "loss": 0.7852,
      "step": 2380
    },
    {
      "epoch": 2.711287577992059,
      "grad_norm": 1.1124972105026245,
      "learning_rate": 0.00010261505396533648,
      "loss": 0.717,
      "step": 2390
    },
    {
      "epoch": 2.7226318774815654,
      "grad_norm": 0.7241740226745605,
      "learning_rate": 0.00010182272022388841,
      "loss": 0.8335,
      "step": 2400
    },
    {
      "epoch": 2.733976176971072,
      "grad_norm": 1.0944820642471313,
      "learning_rate": 0.0001010302719970174,
      "loss": 0.7874,
      "step": 2410
    },
    {
      "epoch": 2.7453204764605785,
      "grad_norm": 0.735615611076355,
      "learning_rate": 0.00010023775905855559,
      "loss": 0.7198,
      "step": 2420
    },
    {
      "epoch": 2.7566647759500853,
      "grad_norm": 0.8080368041992188,
      "learning_rate": 9.944523118639958e-05,
      "loss": 0.8275,
      "step": 2430
    },
    {
      "epoch": 2.7680090754395916,
      "grad_norm": 1.0709086656570435,
      "learning_rate": 9.865273815938403e-05,
      "loss": 0.841,
      "step": 2440
    },
    {
      "epoch": 2.779353374929098,
      "grad_norm": 0.8561082482337952,
      "learning_rate": 9.786032975415503e-05,
      "loss": 0.7393,
      "step": 2450
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 0.6831649541854858,
      "learning_rate": 9.706805574204341e-05,
      "loss": 0.7904,
      "step": 2460
    },
    {
      "epoch": 2.802041973908111,
      "grad_norm": 0.9404779672622681,
      "learning_rate": 9.627596588593884e-05,
      "loss": 0.7651,
      "step": 2470
    },
    {
      "epoch": 2.813386273397618,
      "grad_norm": 1.1059134006500244,
      "learning_rate": 9.54841099371641e-05,
      "loss": 0.7792,
      "step": 2480
    },
    {
      "epoch": 2.824730572887124,
      "grad_norm": 0.8339388966560364,
      "learning_rate": 9.469253763235015e-05,
      "loss": 0.8037,
      "step": 2490
    },
    {
      "epoch": 2.8360748723766305,
      "grad_norm": 0.691879153251648,
      "learning_rate": 9.390129869031232e-05,
      "loss": 0.7882,
      "step": 2500
    },
    {
      "epoch": 2.8474191718661372,
      "grad_norm": 0.8173119425773621,
      "learning_rate": 9.311044280892728e-05,
      "loss": 0.7723,
      "step": 2510
    },
    {
      "epoch": 2.858763471355644,
      "grad_norm": 1.2163662910461426,
      "learning_rate": 9.232001966201159e-05,
      "loss": 0.8332,
      "step": 2520
    },
    {
      "epoch": 2.8701077708451503,
      "grad_norm": 0.7762579917907715,
      "learning_rate": 9.153007889620169e-05,
      "loss": 0.8017,
      "step": 2530
    },
    {
      "epoch": 2.8814520703346567,
      "grad_norm": 0.7560020089149475,
      "learning_rate": 9.074067012783551e-05,
      "loss": 0.7645,
      "step": 2540
    },
    {
      "epoch": 2.8927963698241634,
      "grad_norm": 0.7039526104927063,
      "learning_rate": 8.995184293983627e-05,
      "loss": 0.7496,
      "step": 2550
    },
    {
      "epoch": 2.9041406693136698,
      "grad_norm": 0.8188515305519104,
      "learning_rate": 8.916364687859782e-05,
      "loss": 0.7941,
      "step": 2560
    },
    {
      "epoch": 2.9154849688031765,
      "grad_norm": 0.8847174048423767,
      "learning_rate": 8.837613145087289e-05,
      "loss": 0.7462,
      "step": 2570
    },
    {
      "epoch": 2.926829268292683,
      "grad_norm": 1.4302834272384644,
      "learning_rate": 8.758934612066353e-05,
      "loss": 0.7659,
      "step": 2580
    },
    {
      "epoch": 2.938173567782189,
      "grad_norm": 0.8293200135231018,
      "learning_rate": 8.680334030611414e-05,
      "loss": 0.7464,
      "step": 2590
    },
    {
      "epoch": 2.949517867271696,
      "grad_norm": 0.9347418546676636,
      "learning_rate": 8.601816337640767e-05,
      "loss": 0.7907,
      "step": 2600
    },
    {
      "epoch": 2.9608621667612027,
      "grad_norm": 0.8685625195503235,
      "learning_rate": 8.523386464866452e-05,
      "loss": 0.7881,
      "step": 2610
    },
    {
      "epoch": 2.972206466250709,
      "grad_norm": 1.0375618934631348,
      "learning_rate": 8.44504933848452e-05,
      "loss": 0.7415,
      "step": 2620
    },
    {
      "epoch": 2.9835507657402154,
      "grad_norm": 1.1286613941192627,
      "learning_rate": 8.366809878865594e-05,
      "loss": 0.759,
      "step": 2630
    },
    {
      "epoch": 2.994895065229722,
      "grad_norm": 0.9496249556541443,
      "learning_rate": 8.28867300024582e-05,
      "loss": 0.8122,
      "step": 2640
    },
    {
      "epoch": 3.0062393647192285,
      "grad_norm": 0.6161667108535767,
      "learning_rate": 8.210643610418232e-05,
      "loss": 0.7363,
      "step": 2650
    },
    {
      "epoch": 3.0175836642087353,
      "grad_norm": 1.1362223625183105,
      "learning_rate": 8.132726610424453e-05,
      "loss": 0.6957,
      "step": 2660
    },
    {
      "epoch": 3.0289279636982416,
      "grad_norm": 0.9549693465232849,
      "learning_rate": 8.054926894246887e-05,
      "loss": 0.6598,
      "step": 2670
    },
    {
      "epoch": 3.0402722631877483,
      "grad_norm": 0.7844473719596863,
      "learning_rate": 7.977249348501314e-05,
      "loss": 0.7104,
      "step": 2680
    },
    {
      "epoch": 3.0516165626772547,
      "grad_norm": 0.9754497408866882,
      "learning_rate": 7.899698852129962e-05,
      "loss": 0.7109,
      "step": 2690
    },
    {
      "epoch": 3.062960862166761,
      "grad_norm": 0.8465747237205505,
      "learning_rate": 7.822280276095073e-05,
      "loss": 0.6208,
      "step": 2700
    },
    {
      "epoch": 3.0743051616562678,
      "grad_norm": 0.7896714806556702,
      "learning_rate": 7.744998483072936e-05,
      "loss": 0.6417,
      "step": 2710
    },
    {
      "epoch": 3.085649461145774,
      "grad_norm": 0.8668105006217957,
      "learning_rate": 7.667858327148475e-05,
      "loss": 0.6525,
      "step": 2720
    },
    {
      "epoch": 3.096993760635281,
      "grad_norm": 1.0019567012786865,
      "learning_rate": 7.590864653510359e-05,
      "loss": 0.6604,
      "step": 2730
    },
    {
      "epoch": 3.108338060124787,
      "grad_norm": 0.7561362981796265,
      "learning_rate": 7.514022298146679e-05,
      "loss": 0.6912,
      "step": 2740
    },
    {
      "epoch": 3.119682359614294,
      "grad_norm": 0.9435575604438782,
      "learning_rate": 7.437336087541187e-05,
      "loss": 0.6993,
      "step": 2750
    },
    {
      "epoch": 3.1310266591038003,
      "grad_norm": 1.041034460067749,
      "learning_rate": 7.360810838370161e-05,
      "loss": 0.6562,
      "step": 2760
    },
    {
      "epoch": 3.142370958593307,
      "grad_norm": 0.8745769262313843,
      "learning_rate": 7.284451357199851e-05,
      "loss": 0.6035,
      "step": 2770
    },
    {
      "epoch": 3.1537152580828134,
      "grad_norm": 0.9436658620834351,
      "learning_rate": 7.208262440184584e-05,
      "loss": 0.6591,
      "step": 2780
    },
    {
      "epoch": 3.1650595575723197,
      "grad_norm": 0.9558268785476685,
      "learning_rate": 7.13224887276553e-05,
      "loss": 0.7548,
      "step": 2790
    },
    {
      "epoch": 3.1764038570618265,
      "grad_norm": 1.3072495460510254,
      "learning_rate": 7.056415429370106e-05,
      "loss": 0.648,
      "step": 2800
    },
    {
      "epoch": 3.187748156551333,
      "grad_norm": 1.0742169618606567,
      "learning_rate": 6.980766873112106e-05,
      "loss": 0.6646,
      "step": 2810
    },
    {
      "epoch": 3.1990924560408396,
      "grad_norm": 0.8391577005386353,
      "learning_rate": 6.905307955492523e-05,
      "loss": 0.6844,
      "step": 2820
    },
    {
      "epoch": 3.210436755530346,
      "grad_norm": 0.9172285795211792,
      "learning_rate": 6.83004341610111e-05,
      "loss": 0.6671,
      "step": 2830
    },
    {
      "epoch": 3.2217810550198527,
      "grad_norm": 1.0791727304458618,
      "learning_rate": 6.754977982318693e-05,
      "loss": 0.6619,
      "step": 2840
    },
    {
      "epoch": 3.233125354509359,
      "grad_norm": 0.8881738781929016,
      "learning_rate": 6.68011636902022e-05,
      "loss": 0.678,
      "step": 2850
    },
    {
      "epoch": 3.2444696539988658,
      "grad_norm": 0.8353477120399475,
      "learning_rate": 6.605463278278646e-05,
      "loss": 0.7061,
      "step": 2860
    },
    {
      "epoch": 3.255813953488372,
      "grad_norm": 0.9251864552497864,
      "learning_rate": 6.531023399069574e-05,
      "loss": 0.6658,
      "step": 2870
    },
    {
      "epoch": 3.2671582529778784,
      "grad_norm": 0.7780378460884094,
      "learning_rate": 6.45680140697675e-05,
      "loss": 0.6327,
      "step": 2880
    },
    {
      "epoch": 3.278502552467385,
      "grad_norm": 1.3496202230453491,
      "learning_rate": 6.38280196389839e-05,
      "loss": 0.6658,
      "step": 2890
    },
    {
      "epoch": 3.2898468519568915,
      "grad_norm": 1.0429950952529907,
      "learning_rate": 6.309029717754362e-05,
      "loss": 0.7013,
      "step": 2900
    },
    {
      "epoch": 3.3011911514463983,
      "grad_norm": 0.7141017317771912,
      "learning_rate": 6.235489302194247e-05,
      "loss": 0.6969,
      "step": 2910
    },
    {
      "epoch": 3.3125354509359046,
      "grad_norm": 1.2669309377670288,
      "learning_rate": 6.162185336306294e-05,
      "loss": 0.6468,
      "step": 2920
    },
    {
      "epoch": 3.3238797504254114,
      "grad_norm": 0.8476207852363586,
      "learning_rate": 6.089122424327307e-05,
      "loss": 0.6501,
      "step": 2930
    },
    {
      "epoch": 3.3352240499149177,
      "grad_norm": 0.9521162509918213,
      "learning_rate": 6.01630515535345e-05,
      "loss": 0.6546,
      "step": 2940
    },
    {
      "epoch": 3.346568349404424,
      "grad_norm": 0.7817677855491638,
      "learning_rate": 5.943738103051997e-05,
      "loss": 0.6919,
      "step": 2950
    },
    {
      "epoch": 3.357912648893931,
      "grad_norm": 0.776945948600769,
      "learning_rate": 5.8714258253740564e-05,
      "loss": 0.6897,
      "step": 2960
    },
    {
      "epoch": 3.369256948383437,
      "grad_norm": 0.9761963486671448,
      "learning_rate": 5.7993728642683e-05,
      "loss": 0.6299,
      "step": 2970
    },
    {
      "epoch": 3.380601247872944,
      "grad_norm": 0.7887254953384399,
      "learning_rate": 5.7275837453956614e-05,
      "loss": 0.6773,
      "step": 2980
    },
    {
      "epoch": 3.3919455473624502,
      "grad_norm": 0.860835611820221,
      "learning_rate": 5.656062977845116e-05,
      "loss": 0.6239,
      "step": 2990
    },
    {
      "epoch": 3.403289846851957,
      "grad_norm": 0.9700385928153992,
      "learning_rate": 5.584815053850407e-05,
      "loss": 0.7148,
      "step": 3000
    },
    {
      "epoch": 3.403289846851957,
      "eval_loss": 0.9692808389663696,
      "eval_runtime": 15.7325,
      "eval_samples_per_second": 94.39,
      "eval_steps_per_second": 11.823,
      "step": 3000
    },
    {
      "epoch": 3.4146341463414633,
      "grad_norm": 1.335462212562561,
      "learning_rate": 5.51384444850794e-05,
      "loss": 0.6387,
      "step": 3010
    },
    {
      "epoch": 3.42597844583097,
      "grad_norm": 0.8788994550704956,
      "learning_rate": 5.443155619495679e-05,
      "loss": 0.6809,
      "step": 3020
    },
    {
      "epoch": 3.4373227453204764,
      "grad_norm": 0.9188012480735779,
      "learning_rate": 5.372753006793143e-05,
      "loss": 0.6724,
      "step": 3030
    },
    {
      "epoch": 3.4486670448099828,
      "grad_norm": 0.9619457125663757,
      "learning_rate": 5.302641032402578e-05,
      "loss": 0.6789,
      "step": 3040
    },
    {
      "epoch": 3.4600113442994895,
      "grad_norm": 0.9403857588768005,
      "learning_rate": 5.2328241000711464e-05,
      "loss": 0.6274,
      "step": 3050
    },
    {
      "epoch": 3.471355643788996,
      "grad_norm": 0.9259539246559143,
      "learning_rate": 5.16330659501438e-05,
      "loss": 0.6551,
      "step": 3060
    },
    {
      "epoch": 3.4826999432785026,
      "grad_norm": 1.07770574092865,
      "learning_rate": 5.094092883640718e-05,
      "loss": 0.6593,
      "step": 3070
    },
    {
      "epoch": 3.494044242768009,
      "grad_norm": 0.7347473502159119,
      "learning_rate": 5.0251873132772576e-05,
      "loss": 0.6847,
      "step": 3080
    },
    {
      "epoch": 3.5053885422575157,
      "grad_norm": 0.9838495254516602,
      "learning_rate": 4.956594211896701e-05,
      "loss": 0.6667,
      "step": 3090
    },
    {
      "epoch": 3.516732841747022,
      "grad_norm": 1.1671929359436035,
      "learning_rate": 4.8883178878454996e-05,
      "loss": 0.683,
      "step": 3100
    },
    {
      "epoch": 3.528077141236529,
      "grad_norm": 0.6510323882102966,
      "learning_rate": 4.8203626295732675e-05,
      "loss": 0.6946,
      "step": 3110
    },
    {
      "epoch": 3.539421440726035,
      "grad_norm": 0.7871556282043457,
      "learning_rate": 4.7527327053634094e-05,
      "loss": 0.6652,
      "step": 3120
    },
    {
      "epoch": 3.5507657402155415,
      "grad_norm": 0.8053673505783081,
      "learning_rate": 4.685432363065036e-05,
      "loss": 0.6431,
      "step": 3130
    },
    {
      "epoch": 3.5621100397050482,
      "grad_norm": 0.8162011504173279,
      "learning_rate": 4.618465829826145e-05,
      "loss": 0.6089,
      "step": 3140
    },
    {
      "epoch": 3.5734543391945546,
      "grad_norm": 1.0298821926116943,
      "learning_rate": 4.551837311828131e-05,
      "loss": 0.6645,
      "step": 3150
    },
    {
      "epoch": 3.5847986386840613,
      "grad_norm": 1.0996955633163452,
      "learning_rate": 4.485550994021567e-05,
      "loss": 0.6872,
      "step": 3160
    },
    {
      "epoch": 3.5961429381735677,
      "grad_norm": 0.9979953765869141,
      "learning_rate": 4.419611039863377e-05,
      "loss": 0.628,
      "step": 3170
    },
    {
      "epoch": 3.6074872376630744,
      "grad_norm": 1.0593342781066895,
      "learning_rate": 4.354021591055311e-05,
      "loss": 0.6864,
      "step": 3180
    },
    {
      "epoch": 3.6188315371525808,
      "grad_norm": 1.6677913665771484,
      "learning_rate": 4.2887867672838056e-05,
      "loss": 0.6232,
      "step": 3190
    },
    {
      "epoch": 3.6301758366420875,
      "grad_norm": 0.8164204359054565,
      "learning_rate": 4.223910665961235e-05,
      "loss": 0.6786,
      "step": 3200
    },
    {
      "epoch": 3.641520136131594,
      "grad_norm": 0.8163765072822571,
      "learning_rate": 4.15939736196853e-05,
      "loss": 0.6763,
      "step": 3210
    },
    {
      "epoch": 3.6528644356211,
      "grad_norm": 0.9765521883964539,
      "learning_rate": 4.095250907399262e-05,
      "loss": 0.6719,
      "step": 3220
    },
    {
      "epoch": 3.664208735110607,
      "grad_norm": 0.9238688349723816,
      "learning_rate": 4.03147533130511e-05,
      "loss": 0.68,
      "step": 3230
    },
    {
      "epoch": 3.6755530346001133,
      "grad_norm": 0.9760640859603882,
      "learning_rate": 3.968074639442805e-05,
      "loss": 0.6542,
      "step": 3240
    },
    {
      "epoch": 3.68689733408962,
      "grad_norm": 0.9406284689903259,
      "learning_rate": 3.905052814022523e-05,
      "loss": 0.653,
      "step": 3250
    },
    {
      "epoch": 3.6982416335791264,
      "grad_norm": 0.9423522353172302,
      "learning_rate": 3.842413813457758e-05,
      "loss": 0.706,
      "step": 3260
    },
    {
      "epoch": 3.709585933068633,
      "grad_norm": 0.8088165521621704,
      "learning_rate": 3.780161572116704e-05,
      "loss": 0.7161,
      "step": 3270
    },
    {
      "epoch": 3.7209302325581395,
      "grad_norm": 0.9071544408798218,
      "learning_rate": 3.718300000075129e-05,
      "loss": 0.7193,
      "step": 3280
    },
    {
      "epoch": 3.7322745320476463,
      "grad_norm": 0.8792480230331421,
      "learning_rate": 3.6568329828707836e-05,
      "loss": 0.6381,
      "step": 3290
    },
    {
      "epoch": 3.7436188315371526,
      "grad_norm": 1.0307759046554565,
      "learning_rate": 3.5957643812593543e-05,
      "loss": 0.6668,
      "step": 3300
    },
    {
      "epoch": 3.754963131026659,
      "grad_norm": 1.0883175134658813,
      "learning_rate": 3.5350980309719514e-05,
      "loss": 0.6978,
      "step": 3310
    },
    {
      "epoch": 3.7663074305161657,
      "grad_norm": 1.0448516607284546,
      "learning_rate": 3.4748377424742115e-05,
      "loss": 0.6756,
      "step": 3320
    },
    {
      "epoch": 3.777651730005672,
      "grad_norm": 0.8772532939910889,
      "learning_rate": 3.414987300726945e-05,
      "loss": 0.6714,
      "step": 3330
    },
    {
      "epoch": 3.7889960294951788,
      "grad_norm": 1.0115753412246704,
      "learning_rate": 3.3555504649484046e-05,
      "loss": 0.6773,
      "step": 3340
    },
    {
      "epoch": 3.800340328984685,
      "grad_norm": 1.1093175411224365,
      "learning_rate": 3.296530968378173e-05,
      "loss": 0.6916,
      "step": 3350
    },
    {
      "epoch": 3.811684628474192,
      "grad_norm": 0.8998281359672546,
      "learning_rate": 3.237932518042664e-05,
      "loss": 0.6801,
      "step": 3360
    },
    {
      "epoch": 3.823028927963698,
      "grad_norm": 1.0179048776626587,
      "learning_rate": 3.1797587945223026e-05,
      "loss": 0.6702,
      "step": 3370
    },
    {
      "epoch": 3.834373227453205,
      "grad_norm": 0.9240026473999023,
      "learning_rate": 3.1220134517203335e-05,
      "loss": 0.671,
      "step": 3380
    },
    {
      "epoch": 3.8457175269427113,
      "grad_norm": 0.7641962766647339,
      "learning_rate": 3.0647001166333245e-05,
      "loss": 0.7147,
      "step": 3390
    },
    {
      "epoch": 3.8570618264322176,
      "grad_norm": 0.9078419804573059,
      "learning_rate": 3.0078223891233514e-05,
      "loss": 0.7155,
      "step": 3400
    },
    {
      "epoch": 3.8684061259217244,
      "grad_norm": 0.962393045425415,
      "learning_rate": 2.9513838416918815e-05,
      "loss": 0.6866,
      "step": 3410
    },
    {
      "epoch": 3.8797504254112307,
      "grad_norm": 1.5198420286178589,
      "learning_rate": 2.8953880192554105e-05,
      "loss": 0.6741,
      "step": 3420
    },
    {
      "epoch": 3.8910947249007375,
      "grad_norm": 1.1129947900772095,
      "learning_rate": 2.8398384389227816e-05,
      "loss": 0.6542,
      "step": 3430
    },
    {
      "epoch": 3.902439024390244,
      "grad_norm": 0.8633179664611816,
      "learning_rate": 2.7847385897742705e-05,
      "loss": 0.6768,
      "step": 3440
    },
    {
      "epoch": 3.9137833238797506,
      "grad_norm": 1.062277913093567,
      "learning_rate": 2.7300919326424658e-05,
      "loss": 0.6709,
      "step": 3450
    },
    {
      "epoch": 3.925127623369257,
      "grad_norm": 0.7949813604354858,
      "learning_rate": 2.675901899894854e-05,
      "loss": 0.6166,
      "step": 3460
    },
    {
      "epoch": 3.9364719228587637,
      "grad_norm": 0.9200356006622314,
      "learning_rate": 2.622171895218273e-05,
      "loss": 0.6718,
      "step": 3470
    },
    {
      "epoch": 3.94781622234827,
      "grad_norm": 0.9637920260429382,
      "learning_rate": 2.568905293405095e-05,
      "loss": 0.619,
      "step": 3480
    },
    {
      "epoch": 3.9591605218377763,
      "grad_norm": 1.157073974609375,
      "learning_rate": 2.516105440141262e-05,
      "loss": 0.6961,
      "step": 3490
    },
    {
      "epoch": 3.970504821327283,
      "grad_norm": 0.8323079347610474,
      "learning_rate": 2.4637756517961517e-05,
      "loss": 0.677,
      "step": 3500
    },
    {
      "epoch": 3.9818491208167894,
      "grad_norm": 0.9369989037513733,
      "learning_rate": 2.41191921521427e-05,
      "loss": 0.6619,
      "step": 3510
    },
    {
      "epoch": 3.993193420306296,
      "grad_norm": 0.8290889263153076,
      "learning_rate": 2.360539387508801e-05,
      "loss": 0.6534,
      "step": 3520
    },
    {
      "epoch": 4.0045377197958025,
      "grad_norm": 0.8619610071182251,
      "learning_rate": 2.309639395857033e-05,
      "loss": 0.6531,
      "step": 3530
    },
    {
      "epoch": 4.015882019285309,
      "grad_norm": 0.7406215071678162,
      "learning_rate": 2.259222437297649e-05,
      "loss": 0.5811,
      "step": 3540
    },
    {
      "epoch": 4.027226318774816,
      "grad_norm": 1.3408113718032837,
      "learning_rate": 2.2092916785299323e-05,
      "loss": 0.6163,
      "step": 3550
    },
    {
      "epoch": 4.038570618264322,
      "grad_norm": 0.9652060866355896,
      "learning_rate": 2.159850255714859e-05,
      "loss": 0.6345,
      "step": 3560
    },
    {
      "epoch": 4.049914917753829,
      "grad_norm": 1.2307026386260986,
      "learning_rate": 2.1109012742781142e-05,
      "loss": 0.5568,
      "step": 3570
    },
    {
      "epoch": 4.061259217243335,
      "grad_norm": 1.101637363433838,
      "learning_rate": 2.0624478087150456e-05,
      "loss": 0.608,
      "step": 3580
    },
    {
      "epoch": 4.072603516732841,
      "grad_norm": 2.5598561763763428,
      "learning_rate": 2.0144929023975413e-05,
      "loss": 0.5294,
      "step": 3590
    },
    {
      "epoch": 4.083947816222349,
      "grad_norm": 0.9463273286819458,
      "learning_rate": 1.967039567382888e-05,
      "loss": 0.5482,
      "step": 3600
    },
    {
      "epoch": 4.095292115711855,
      "grad_norm": 0.9838125109672546,
      "learning_rate": 1.920090784224581e-05,
      "loss": 0.6254,
      "step": 3610
    },
    {
      "epoch": 4.106636415201361,
      "grad_norm": 0.85828697681427,
      "learning_rate": 1.8736495017851062e-05,
      "loss": 0.5443,
      "step": 3620
    },
    {
      "epoch": 4.117980714690868,
      "grad_norm": 0.8922297954559326,
      "learning_rate": 1.827718637050736e-05,
      "loss": 0.6068,
      "step": 3630
    },
    {
      "epoch": 4.129325014180375,
      "grad_norm": 0.7973962426185608,
      "learning_rate": 1.7823010749482927e-05,
      "loss": 0.6179,
      "step": 3640
    },
    {
      "epoch": 4.140669313669881,
      "grad_norm": 0.8686882257461548,
      "learning_rate": 1.737399668163966e-05,
      "loss": 0.6186,
      "step": 3650
    },
    {
      "epoch": 4.152013613159387,
      "grad_norm": 1.4338245391845703,
      "learning_rate": 1.693017236964125e-05,
      "loss": 0.5784,
      "step": 3660
    },
    {
      "epoch": 4.163357912648894,
      "grad_norm": 0.9958694577217102,
      "learning_rate": 1.6491565690181765e-05,
      "loss": 0.6388,
      "step": 3670
    },
    {
      "epoch": 4.1747022121384,
      "grad_norm": 0.9962863922119141,
      "learning_rate": 1.605820419223476e-05,
      "loss": 0.6541,
      "step": 3680
    },
    {
      "epoch": 4.186046511627907,
      "grad_norm": 1.1754194498062134,
      "learning_rate": 1.5630115095322827e-05,
      "loss": 0.6037,
      "step": 3690
    },
    {
      "epoch": 4.197390811117414,
      "grad_norm": 1.1034218072891235,
      "learning_rate": 1.5207325287808027e-05,
      "loss": 0.5844,
      "step": 3700
    },
    {
      "epoch": 4.20873511060692,
      "grad_norm": 1.0171332359313965,
      "learning_rate": 1.4789861325203013e-05,
      "loss": 0.6724,
      "step": 3710
    },
    {
      "epoch": 4.220079410096426,
      "grad_norm": 0.9791539907455444,
      "learning_rate": 1.4377749428503006e-05,
      "loss": 0.5989,
      "step": 3720
    },
    {
      "epoch": 4.231423709585933,
      "grad_norm": 0.9501050710678101,
      "learning_rate": 1.3971015482538963e-05,
      "loss": 0.5911,
      "step": 3730
    },
    {
      "epoch": 4.24276800907544,
      "grad_norm": 1.2614890336990356,
      "learning_rate": 1.3569685034351554e-05,
      "loss": 0.5849,
      "step": 3740
    },
    {
      "epoch": 4.254112308564946,
      "grad_norm": 1.0194411277770996,
      "learning_rate": 1.3173783291586772e-05,
      "loss": 0.5976,
      "step": 3750
    },
    {
      "epoch": 4.2654566080544525,
      "grad_norm": 1.0711522102355957,
      "learning_rate": 1.2783335120912565e-05,
      "loss": 0.5931,
      "step": 3760
    },
    {
      "epoch": 4.276800907543959,
      "grad_norm": 0.8650385141372681,
      "learning_rate": 1.2398365046456783e-05,
      "loss": 0.6078,
      "step": 3770
    },
    {
      "epoch": 4.288145207033466,
      "grad_norm": 0.823208749294281,
      "learning_rate": 1.2018897248267103e-05,
      "loss": 0.5961,
      "step": 3780
    },
    {
      "epoch": 4.299489506522972,
      "grad_norm": 0.9447870850563049,
      "learning_rate": 1.1644955560791993e-05,
      "loss": 0.6468,
      "step": 3790
    },
    {
      "epoch": 4.310833806012479,
      "grad_norm": 1.102318525314331,
      "learning_rate": 1.1276563471383883e-05,
      "loss": 0.588,
      "step": 3800
    },
    {
      "epoch": 4.322178105501985,
      "grad_norm": 0.9916651248931885,
      "learning_rate": 1.0913744118823866e-05,
      "loss": 0.6188,
      "step": 3810
    },
    {
      "epoch": 4.333522404991491,
      "grad_norm": 1.1987171173095703,
      "learning_rate": 1.05565202918682e-05,
      "loss": 0.5841,
      "step": 3820
    },
    {
      "epoch": 4.3448667044809985,
      "grad_norm": 0.9708378911018372,
      "learning_rate": 1.0204914427817158e-05,
      "loss": 0.6023,
      "step": 3830
    },
    {
      "epoch": 4.356211003970505,
      "grad_norm": 1.0048896074295044,
      "learning_rate": 9.8589486111056e-06,
      "loss": 0.5705,
      "step": 3840
    },
    {
      "epoch": 4.367555303460011,
      "grad_norm": 0.8364105820655823,
      "learning_rate": 9.518644571915847e-06,
      "loss": 0.5872,
      "step": 3850
    },
    {
      "epoch": 4.3788996029495175,
      "grad_norm": 1.5254448652267456,
      "learning_rate": 9.184023684812926e-06,
      "loss": 0.6063,
      "step": 3860
    },
    {
      "epoch": 4.390243902439025,
      "grad_norm": 0.993635356426239,
      "learning_rate": 8.855106967401839e-06,
      "loss": 0.5311,
      "step": 3870
    },
    {
      "epoch": 4.401588201928531,
      "grad_norm": 0.8678284883499146,
      "learning_rate": 8.531915079007625e-06,
      "loss": 0.5894,
      "step": 3880
    },
    {
      "epoch": 4.412932501418037,
      "grad_norm": 1.081127643585205,
      "learning_rate": 8.214468319377633e-06,
      "loss": 0.5906,
      "step": 3890
    },
    {
      "epoch": 4.424276800907544,
      "grad_norm": 0.9130728840827942,
      "learning_rate": 7.902786627406477e-06,
      "loss": 0.5764,
      "step": 3900
    },
    {
      "epoch": 4.43562110039705,
      "grad_norm": 0.9263814091682434,
      "learning_rate": 7.596889579883826e-06,
      "loss": 0.5812,
      "step": 3910
    },
    {
      "epoch": 4.446965399886557,
      "grad_norm": 1.095747947692871,
      "learning_rate": 7.296796390264549e-06,
      "loss": 0.5721,
      "step": 3920
    },
    {
      "epoch": 4.458309699376064,
      "grad_norm": 0.8003553152084351,
      "learning_rate": 7.002525907462121e-06,
      "loss": 0.5882,
      "step": 3930
    },
    {
      "epoch": 4.46965399886557,
      "grad_norm": 0.8841357231140137,
      "learning_rate": 6.7140966146646e-06,
      "loss": 0.5543,
      "step": 3940
    },
    {
      "epoch": 4.480998298355076,
      "grad_norm": 0.8580918312072754,
      "learning_rate": 6.431526628173701e-06,
      "loss": 0.6549,
      "step": 3950
    },
    {
      "epoch": 4.4923425978445835,
      "grad_norm": 0.9447335004806519,
      "learning_rate": 6.154833696267015e-06,
      "loss": 0.6516,
      "step": 3960
    },
    {
      "epoch": 4.50368689733409,
      "grad_norm": 1.0485211610794067,
      "learning_rate": 5.884035198083071e-06,
      "loss": 0.579,
      "step": 3970
    },
    {
      "epoch": 4.515031196823596,
      "grad_norm": 0.9394044876098633,
      "learning_rate": 5.619148142529873e-06,
      "loss": 0.6396,
      "step": 3980
    },
    {
      "epoch": 4.526375496313102,
      "grad_norm": 0.93062824010849,
      "learning_rate": 5.360189167216545e-06,
      "loss": 0.6005,
      "step": 3990
    },
    {
      "epoch": 4.537719795802609,
      "grad_norm": 0.9513915777206421,
      "learning_rate": 5.107174537408233e-06,
      "loss": 0.5743,
      "step": 4000
    },
    {
      "epoch": 4.537719795802609,
      "eval_loss": 1.0443100929260254,
      "eval_runtime": 15.6805,
      "eval_samples_per_second": 94.704,
      "eval_steps_per_second": 11.862,
      "step": 4000
    },
    {
      "epoch": 4.549064095292116,
      "grad_norm": 0.9627020359039307,
      "learning_rate": 4.8601201450046316e-06,
      "loss": 0.6077,
      "step": 4010
    },
    {
      "epoch": 4.560408394781622,
      "grad_norm": 0.8539467453956604,
      "learning_rate": 4.619041507541688e-06,
      "loss": 0.5812,
      "step": 4020
    },
    {
      "epoch": 4.571752694271129,
      "grad_norm": 0.9446848630905151,
      "learning_rate": 4.383953767216964e-06,
      "loss": 0.624,
      "step": 4030
    },
    {
      "epoch": 4.583096993760635,
      "grad_norm": 1.188366174697876,
      "learning_rate": 4.154871689938633e-06,
      "loss": 0.6437,
      "step": 4040
    },
    {
      "epoch": 4.594441293250142,
      "grad_norm": 1.0908474922180176,
      "learning_rate": 3.931809664397867e-06,
      "loss": 0.6323,
      "step": 4050
    },
    {
      "epoch": 4.6057855927396485,
      "grad_norm": 0.9742168188095093,
      "learning_rate": 3.714781701165304e-06,
      "loss": 0.6132,
      "step": 4060
    },
    {
      "epoch": 4.617129892229155,
      "grad_norm": 0.8761405348777771,
      "learning_rate": 3.503801431810816e-06,
      "loss": 0.624,
      "step": 4070
    },
    {
      "epoch": 4.628474191718661,
      "grad_norm": 0.996088445186615,
      "learning_rate": 3.298882108047463e-06,
      "loss": 0.6009,
      "step": 4080
    },
    {
      "epoch": 4.6398184912081675,
      "grad_norm": 0.9667827486991882,
      "learning_rate": 3.10003660089907e-06,
      "loss": 0.5988,
      "step": 4090
    },
    {
      "epoch": 4.651162790697675,
      "grad_norm": 0.9298661351203918,
      "learning_rate": 2.9072773998918503e-06,
      "loss": 0.6453,
      "step": 4100
    },
    {
      "epoch": 4.662507090187181,
      "grad_norm": 0.9182038307189941,
      "learning_rate": 2.7206166122698774e-06,
      "loss": 0.5915,
      "step": 4110
    },
    {
      "epoch": 4.673851389676687,
      "grad_norm": 0.835645318031311,
      "learning_rate": 2.540065962234683e-06,
      "loss": 0.6515,
      "step": 4120
    },
    {
      "epoch": 4.685195689166194,
      "grad_norm": 0.8575255274772644,
      "learning_rate": 2.3656367902088026e-06,
      "loss": 0.6169,
      "step": 4130
    },
    {
      "epoch": 4.696539988655701,
      "grad_norm": 0.9075832962989807,
      "learning_rate": 2.19734005212352e-06,
      "loss": 0.6166,
      "step": 4140
    },
    {
      "epoch": 4.707884288145207,
      "grad_norm": 2.0740888118743896,
      "learning_rate": 2.035186318730742e-06,
      "loss": 0.5779,
      "step": 4150
    },
    {
      "epoch": 4.7192285876347135,
      "grad_norm": 1.0293558835983276,
      "learning_rate": 1.8791857749389741e-06,
      "loss": 0.6414,
      "step": 4160
    },
    {
      "epoch": 4.73057288712422,
      "grad_norm": 0.9525774121284485,
      "learning_rate": 1.7293482191736877e-06,
      "loss": 0.5802,
      "step": 4170
    },
    {
      "epoch": 4.741917186613726,
      "grad_norm": 0.9085150957107544,
      "learning_rate": 1.5856830627618001e-06,
      "loss": 0.6331,
      "step": 4180
    },
    {
      "epoch": 4.753261486103233,
      "grad_norm": 0.9908912777900696,
      "learning_rate": 1.4481993293406048e-06,
      "loss": 0.5844,
      "step": 4190
    },
    {
      "epoch": 4.76460578559274,
      "grad_norm": 0.7421241998672485,
      "learning_rate": 1.316905654291012e-06,
      "loss": 0.6653,
      "step": 4200
    },
    {
      "epoch": 4.775950085082246,
      "grad_norm": 0.857502281665802,
      "learning_rate": 1.1918102841950607e-06,
      "loss": 0.5693,
      "step": 4210
    },
    {
      "epoch": 4.787294384571752,
      "grad_norm": 0.9300210475921631,
      "learning_rate": 1.0729210763180564e-06,
      "loss": 0.5755,
      "step": 4220
    },
    {
      "epoch": 4.79863868406126,
      "grad_norm": 1.2351378202438354,
      "learning_rate": 9.602454981149977e-07,
      "loss": 0.618,
      "step": 4230
    },
    {
      "epoch": 4.809982983550766,
      "grad_norm": 1.24778151512146,
      "learning_rate": 8.537906267615415e-07,
      "loss": 0.5896,
      "step": 4240
    },
    {
      "epoch": 4.821327283040272,
      "grad_norm": 1.3560271263122559,
      "learning_rate": 7.535631487095352e-07,
      "loss": 0.5879,
      "step": 4250
    },
    {
      "epoch": 4.832671582529779,
      "grad_norm": 1.8108911514282227,
      "learning_rate": 6.59569359266976e-07,
      "loss": 0.5943,
      "step": 4260
    },
    {
      "epoch": 4.844015882019285,
      "grad_norm": 0.9743121862411499,
      "learning_rate": 5.718151622026379e-07,
      "loss": 0.6104,
      "step": 4270
    },
    {
      "epoch": 4.855360181508792,
      "grad_norm": 1.2035831212997437,
      "learning_rate": 4.903060693752348e-07,
      "loss": 0.608,
      "step": 4280
    },
    {
      "epoch": 4.866704480998298,
      "grad_norm": 0.9681785106658936,
      "learning_rate": 4.1504720038724187e-07,
      "loss": 0.5773,
      "step": 4290
    },
    {
      "epoch": 4.878048780487805,
      "grad_norm": 1.0151753425598145,
      "learning_rate": 3.4604328226333083e-07,
      "loss": 0.5609,
      "step": 4300
    },
    {
      "epoch": 4.889393079977311,
      "grad_norm": 1.0577515363693237,
      "learning_rate": 2.832986491534295e-07,
      "loss": 0.6435,
      "step": 4310
    },
    {
      "epoch": 4.900737379466818,
      "grad_norm": 0.8938112854957581,
      "learning_rate": 2.2681724206052857e-07,
      "loss": 0.6398,
      "step": 4320
    },
    {
      "epoch": 4.912081678956325,
      "grad_norm": 0.997191846370697,
      "learning_rate": 1.7660260859315713e-07,
      "loss": 0.628,
      "step": 4330
    },
    {
      "epoch": 4.923425978445831,
      "grad_norm": 0.8382704257965088,
      "learning_rate": 1.3265790274249456e-07,
      "loss": 0.6105,
      "step": 4340
    },
    {
      "epoch": 4.934770277935337,
      "grad_norm": 0.8330470323562622,
      "learning_rate": 9.498588468433989e-08,
      "loss": 0.5982,
      "step": 4350
    },
    {
      "epoch": 4.946114577424844,
      "grad_norm": 1.2183622121810913,
      "learning_rate": 6.35889206057172e-08,
      "loss": 0.5876,
      "step": 4360
    },
    {
      "epoch": 4.957458876914351,
      "grad_norm": 1.131373405456543,
      "learning_rate": 3.846898255622788e-08,
      "loss": 0.6113,
      "step": 4370
    },
    {
      "epoch": 4.968803176403857,
      "grad_norm": 1.1781286001205444,
      "learning_rate": 1.9627648324227476e-08,
      "loss": 0.5522,
      "step": 4380
    },
    {
      "epoch": 4.9801474758933635,
      "grad_norm": 1.2726503610610962,
      "learning_rate": 7.066101337682707e-09,
      "loss": 0.6312,
      "step": 4390
    },
    {
      "epoch": 4.99149177538287,
      "grad_norm": 1.1971274614334106,
      "learning_rate": 7.85130589897598e-10,
      "loss": 0.6052,
      "step": 4400
    },
    {
      "epoch": 4.997163925127623,
      "step": 4405,
      "total_flos": 9.40234358432727e+17,
      "train_loss": 0.7921485962039632,
      "train_runtime": 4193.8899,
      "train_samples_per_second": 33.618,
      "train_steps_per_second": 1.05
    }
  ],
  "logging_steps": 10,
  "max_steps": 4405,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.40234358432727e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}