{
  "best_global_step": 6000,
  "best_metric": 0.20116083323955536,
  "best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000",
  "epoch": 2.0,
  "eval_steps": 2000,
  "global_step": 14628,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006836905616517964,
      "grad_norm": 1.572303056716919,
      "learning_rate": 2.232346241457859e-05,
      "loss": 2.3604,
      "step": 50
    },
    {
      "epoch": 0.013673811233035928,
      "grad_norm": 5.201236248016357,
      "learning_rate": 4.510250569476082e-05,
      "loss": 2.1118,
      "step": 100
    },
    {
      "epoch": 0.02051071684955389,
      "grad_norm": 9.312570571899414,
      "learning_rate": 6.788154897494306e-05,
      "loss": 1.8332,
      "step": 150
    },
    {
      "epoch": 0.027347622466071857,
      "grad_norm": 8.565587043762207,
      "learning_rate": 9.066059225512529e-05,
      "loss": 1.9173,
      "step": 200
    },
    {
      "epoch": 0.03418452808258982,
      "grad_norm": 3.824556350708008,
      "learning_rate": 0.00011343963553530752,
      "loss": 1.6633,
      "step": 250
    },
    {
      "epoch": 0.04102143369910778,
      "grad_norm": 5.49424934387207,
      "learning_rate": 0.00013621867881548976,
      "loss": 1.6122,
      "step": 300
    },
    {
      "epoch": 0.04785833931562575,
      "grad_norm": 6.3185038566589355,
      "learning_rate": 0.000158997722095672,
      "loss": 1.5782,
      "step": 350
    },
    {
      "epoch": 0.05469524493214371,
      "grad_norm": 3.980173349380493,
      "learning_rate": 0.00018177676537585422,
      "loss": 1.444,
      "step": 400
    },
    {
      "epoch": 0.06153215054866167,
      "grad_norm": 5.797272682189941,
      "learning_rate": 0.00019999975488719786,
      "loss": 1.5752,
      "step": 450
    },
    {
      "epoch": 0.06836905616517965,
      "grad_norm": 11.263846397399902,
      "learning_rate": 0.0001999911760652904,
      "loss": 1.3607,
      "step": 500
    },
    {
      "epoch": 0.0752059617816976,
      "grad_norm": 4.273462772369385,
      "learning_rate": 0.0001999703428048544,
      "loss": 1.5023,
      "step": 550
    },
    {
      "epoch": 0.08204286739821556,
      "grad_norm": 2.9854705333709717,
      "learning_rate": 0.00019993725765911436,
      "loss": 1.3747,
      "step": 600
    },
    {
      "epoch": 0.08887977301473353,
      "grad_norm": 2.9444832801818848,
      "learning_rate": 0.0001998919246828268,
      "loss": 1.4708,
      "step": 650
    },
    {
      "epoch": 0.0957166786312515,
      "grad_norm": 3.348857879638672,
      "learning_rate": 0.00019983434943178372,
      "loss": 1.439,
      "step": 700
    },
    {
      "epoch": 0.10255358424776946,
      "grad_norm": 5.90728759765625,
      "learning_rate": 0.00019976453896213152,
      "loss": 1.5048,
      "step": 750
    },
    {
      "epoch": 0.10939048986428743,
      "grad_norm": 2.6572535037994385,
      "learning_rate": 0.0001996825018295062,
      "loss": 1.5023,
      "step": 800
    },
    {
      "epoch": 0.11622739548080539,
      "grad_norm": 4.219803810119629,
      "learning_rate": 0.00019958824808798494,
      "loss": 1.5814,
      "step": 850
    },
    {
      "epoch": 0.12306430109732334,
      "grad_norm": 5.457417964935303,
      "learning_rate": 0.00019948178928885378,
      "loss": 1.4203,
      "step": 900
    },
    {
      "epoch": 0.1299012067138413,
      "grad_norm": 5.302417278289795,
      "learning_rate": 0.00019936313847919218,
      "loss": 1.3299,
      "step": 950
    },
    {
      "epoch": 0.1367381123303593,
      "grad_norm": 4.385361194610596,
      "learning_rate": 0.00019923231020027368,
      "loss": 1.3468,
      "step": 1000
    },
    {
      "epoch": 0.14357501794687724,
      "grad_norm": 4.836021423339844,
      "learning_rate": 0.00019908932048578416,
      "loss": 1.2813,
      "step": 1050
    },
    {
      "epoch": 0.1504119235633952,
      "grad_norm": 4.949122905731201,
      "learning_rate": 0.00019893418685985658,
      "loss": 1.311,
      "step": 1100
    },
    {
      "epoch": 0.15724882917991317,
      "grad_norm": 6.123111248016357,
      "learning_rate": 0.00019876692833492343,
      "loss": 1.342,
      "step": 1150
    },
    {
      "epoch": 0.16408573479643113,
      "grad_norm": 5.803433418273926,
      "learning_rate": 0.0001985875654093866,
      "loss": 1.2384,
      "step": 1200
    },
    {
      "epoch": 0.1709226404129491,
      "grad_norm": 3.196314811706543,
      "learning_rate": 0.00019839612006510517,
      "loss": 1.3117,
      "step": 1250
    },
    {
      "epoch": 0.17775954602946706,
      "grad_norm": 6.21234130859375,
      "learning_rate": 0.00019819261576470152,
      "loss": 1.2307,
      "step": 1300
    },
    {
      "epoch": 0.18459645164598504,
      "grad_norm": 3.274829149246216,
      "learning_rate": 0.00019797707744868582,
      "loss": 1.2408,
      "step": 1350
    },
    {
      "epoch": 0.191433357262503,
      "grad_norm": 5.5120320320129395,
      "learning_rate": 0.0001977495315323993,
      "loss": 1.324,
      "step": 1400
    },
    {
      "epoch": 0.19827026287902094,
      "grad_norm": 7.289828777313232,
      "learning_rate": 0.0001975100059027772,
      "loss": 1.2039,
      "step": 1450
    },
    {
      "epoch": 0.20510716849553892,
      "grad_norm": 4.040754795074463,
      "learning_rate": 0.00019725852991493083,
      "loss": 1.3405,
      "step": 1500
    },
    {
      "epoch": 0.21194407411205687,
      "grad_norm": 52.13080596923828,
      "learning_rate": 0.00019699513438854995,
      "loss": 1.2005,
      "step": 1550
    },
    {
      "epoch": 0.21878097972857485,
      "grad_norm": 5.0520429611206055,
      "learning_rate": 0.00019671985160412593,
      "loss": 1.0046,
      "step": 1600
    },
    {
      "epoch": 0.2256178853450928,
      "grad_norm": 1.7626160383224487,
      "learning_rate": 0.00019643271529899532,
      "loss": 1.1398,
      "step": 1650
    },
    {
      "epoch": 0.23245479096161079,
      "grad_norm": 2.1751222610473633,
      "learning_rate": 0.00019613376066320525,
      "loss": 1.1519,
      "step": 1700
    },
    {
      "epoch": 0.23929169657812874,
      "grad_norm": 4.483262062072754,
      "learning_rate": 0.00019582302433520074,
      "loss": 1.144,
      "step": 1750
    },
    {
      "epoch": 0.2461286021946467,
      "grad_norm": 2.494478702545166,
      "learning_rate": 0.00019550054439733449,
      "loss": 1.1908,
      "step": 1800
    },
    {
      "epoch": 0.25296550781116467,
      "grad_norm": 14.6198091506958,
      "learning_rate": 0.00019516636037119952,
      "loss": 1.0791,
      "step": 1850
    },
    {
      "epoch": 0.2598024134276826,
      "grad_norm": 1.5368318557739258,
      "learning_rate": 0.00019482051321278592,
      "loss": 1.1994,
      "step": 1900
    },
    {
      "epoch": 0.2666393190442006,
      "grad_norm": 6.854203701019287,
      "learning_rate": 0.00019446304530746112,
      "loss": 1.1871,
      "step": 1950
    },
    {
      "epoch": 0.2734762246607186,
      "grad_norm": 3.686593770980835,
      "learning_rate": 0.00019409400046477559,
      "loss": 1.0619,
      "step": 2000
    },
    {
      "epoch": 0.2734762246607186,
      "eval_loss": 0.3232106864452362,
      "eval_runtime": 301.3298,
      "eval_samples_per_second": 26.801,
      "eval_steps_per_second": 3.352,
      "step": 2000
    },
    {
      "epoch": 0.28031313027723653,
      "grad_norm": 2.84173321723938,
      "learning_rate": 0.00019371342391309363,
      "loss": 1.1769,
      "step": 2050
    },
    {
      "epoch": 0.2871500358937545,
      "grad_norm": 6.158025741577148,
      "learning_rate": 0.00019332136229405043,
      "loss": 1.1985,
      "step": 2100
    },
    {
      "epoch": 0.29398694151027244,
      "grad_norm": 1.3917083740234375,
      "learning_rate": 0.00019291786365683599,
      "loss": 1.2915,
      "step": 2150
    },
    {
      "epoch": 0.3008238471267904,
      "grad_norm": 6.717157363891602,
      "learning_rate": 0.00019250297745230615,
      "loss": 0.9168,
      "step": 2200
    },
    {
      "epoch": 0.3076607527433084,
      "grad_norm": 7.835381507873535,
      "learning_rate": 0.00019207675452692259,
      "loss": 1.0267,
      "step": 2250
    },
    {
      "epoch": 0.31449765835982635,
      "grad_norm": 4.236868858337402,
      "learning_rate": 0.00019163924711652092,
      "loss": 1.1836,
      "step": 2300
    },
    {
      "epoch": 0.3213345639763443,
      "grad_norm": 4.367033004760742,
      "learning_rate": 0.00019119050883990903,
      "loss": 1.1023,
      "step": 2350
    },
    {
      "epoch": 0.32817146959286225,
      "grad_norm": 8.43916130065918,
      "learning_rate": 0.00019073059469229602,
      "loss": 1.1884,
      "step": 2400
    },
    {
      "epoch": 0.33500837520938026,
      "grad_norm": 7.896825790405273,
      "learning_rate": 0.0001902595610385519,
      "loss": 1.1764,
      "step": 2450
    },
    {
      "epoch": 0.3418452808258982,
      "grad_norm": 3.5363454818725586,
      "learning_rate": 0.00018977746560630012,
      "loss": 1.1172,
      "step": 2500
    },
    {
      "epoch": 0.34868218644241616,
      "grad_norm": 12.307855606079102,
      "learning_rate": 0.00018928436747884253,
      "loss": 1.078,
      "step": 2550
    },
    {
      "epoch": 0.3555190920589341,
      "grad_norm": 8.765337944030762,
      "learning_rate": 0.00018878032708791854,
      "loss": 1.1449,
      "step": 2600
    },
    {
      "epoch": 0.36235599767545207,
      "grad_norm": 11.366116523742676,
      "learning_rate": 0.00018826540620629873,
      "loss": 1.1117,
      "step": 2650
    },
    {
      "epoch": 0.3691929032919701,
      "grad_norm": 3.603243112564087,
      "learning_rate": 0.0001877396679402145,
      "loss": 1.1138,
      "step": 2700
    },
    {
      "epoch": 0.37602980890848803,
      "grad_norm": 8.020549774169922,
      "learning_rate": 0.00018720317672162392,
      "loss": 1.0474,
      "step": 2750
    },
    {
      "epoch": 0.382866714525006,
      "grad_norm": 4.786285877227783,
      "learning_rate": 0.00018665599830031533,
      "loss": 1.1041,
      "step": 2800
    },
    {
      "epoch": 0.38970362014152393,
      "grad_norm": 7.1555633544921875,
      "learning_rate": 0.00018609819973584924,
      "loss": 1.0623,
      "step": 2850
    },
    {
      "epoch": 0.3965405257580419,
      "grad_norm": 6.989715576171875,
      "learning_rate": 0.00018552984938934006,
      "loss": 0.9318,
      "step": 2900
    },
    {
      "epoch": 0.4033774313745599,
      "grad_norm": 7.150449752807617,
      "learning_rate": 0.00018495101691507783,
      "loss": 1.132,
      "step": 2950
    },
    {
      "epoch": 0.41021433699107784,
      "grad_norm": 4.584231853485107,
      "learning_rate": 0.00018436177325199192,
      "loss": 1.1382,
      "step": 3000
    },
    {
      "epoch": 0.4170512426075958,
      "grad_norm": 5.139730930328369,
      "learning_rate": 0.00018376219061495694,
      "loss": 1.0452,
      "step": 3050
    },
    {
      "epoch": 0.42388814822411375,
      "grad_norm": 15.497014999389648,
      "learning_rate": 0.00018315234248594264,
      "loss": 1.0451,
      "step": 3100
    },
    {
      "epoch": 0.43072505384063176,
      "grad_norm": 3.4872303009033203,
      "learning_rate": 0.0001825323036050081,
      "loss": 1.131,
      "step": 3150
    },
    {
      "epoch": 0.4375619594571497,
      "grad_norm": 11.307365417480469,
      "learning_rate": 0.00018190214996114206,
      "loss": 1.1382,
      "step": 3200
    },
    {
      "epoch": 0.44439886507366766,
      "grad_norm": 5.577065467834473,
      "learning_rate": 0.00018126195878295006,
      "loss": 1.1045,
      "step": 3250
    },
    {
      "epoch": 0.4512357706901856,
      "grad_norm": 14.33316421508789,
      "learning_rate": 0.0001806118085291896,
      "loss": 1.0887,
      "step": 3300
    },
    {
      "epoch": 0.45807267630670356,
      "grad_norm": 15.240452766418457,
      "learning_rate": 0.00017995177887915475,
      "loss": 1.0171,
      "step": 3350
    },
    {
      "epoch": 0.46490958192322157,
      "grad_norm": 10.07467269897461,
      "learning_rate": 0.00017928195072291093,
      "loss": 1.0966,
      "step": 3400
    },
    {
      "epoch": 0.4717464875397395,
      "grad_norm": 2.930840253829956,
      "learning_rate": 0.00017860240615138142,
      "loss": 1.0418,
      "step": 3450
    },
    {
      "epoch": 0.4785833931562575,
      "grad_norm": 30.01850700378418,
      "learning_rate": 0.00017791322844628677,
      "loss": 0.9635,
      "step": 3500
    },
    {
      "epoch": 0.4854202987727754,
      "grad_norm": 5.433286666870117,
      "learning_rate": 0.0001772145020699381,
      "loss": 1.0108,
      "step": 3550
    },
    {
      "epoch": 0.4922572043892934,
      "grad_norm": 3.0814309120178223,
      "learning_rate": 0.0001765063126548858,
      "loss": 1.1257,
      "step": 3600
    },
    {
      "epoch": 0.4990941100058114,
      "grad_norm": 79.82017517089844,
      "learning_rate": 0.00017578874699342493,
      "loss": 1.1214,
      "step": 3650
    },
    {
      "epoch": 0.5059310156223293,
      "grad_norm": 8.51614761352539,
      "learning_rate": 0.00017506189302695827,
      "loss": 0.8635,
      "step": 3700
    },
    {
      "epoch": 0.5127679212388473,
      "grad_norm": 8.251550674438477,
      "learning_rate": 0.0001743258398352187,
      "loss": 0.9361,
      "step": 3750
    },
    {
      "epoch": 0.5196048268553652,
      "grad_norm": 3.81523060798645,
      "learning_rate": 0.00017358067762535186,
      "loss": 1.066,
      "step": 3800
    },
    {
      "epoch": 0.5264417324718832,
      "grad_norm": 15.210460662841797,
      "learning_rate": 0.00017282649772086114,
      "loss": 0.9778,
      "step": 3850
    },
    {
      "epoch": 0.5332786380884011,
      "grad_norm": 5.145527362823486,
      "learning_rate": 0.0001720633925504151,
      "loss": 1.0966,
      "step": 3900
    },
    {
      "epoch": 0.5401155437049191,
      "grad_norm": 3.485656261444092,
      "learning_rate": 0.00017129145563652014,
      "loss": 0.6889,
      "step": 3950
    },
    {
      "epoch": 0.5469524493214372,
      "grad_norm": 7.915320873260498,
      "learning_rate": 0.00017051078158405872,
      "loss": 0.9154,
      "step": 4000
    },
    {
      "epoch": 0.5469524493214372,
      "eval_loss": 0.24666446447372437,
      "eval_runtime": 301.8017,
      "eval_samples_per_second": 26.759,
      "eval_steps_per_second": 3.347,
      "step": 4000
    },
    {
      "epoch": 0.5537893549379551,
      "grad_norm": 12.610590934753418,
      "learning_rate": 0.00016972146606869507,
      "loss": 0.8612,
      "step": 4050
    },
    {
      "epoch": 0.5606262605544731,
      "grad_norm": 34.93125915527344,
      "learning_rate": 0.00016892360582514967,
      "loss": 1.0867,
      "step": 4100
    },
    {
      "epoch": 0.567463166170991,
      "grad_norm": 7.39677095413208,
      "learning_rate": 0.00016811729863534377,
      "loss": 1.1106,
      "step": 4150
    },
    {
      "epoch": 0.574300071787509,
      "grad_norm": 2.4880149364471436,
      "learning_rate": 0.00016730264331641585,
      "loss": 0.9142,
      "step": 4200
    },
    {
      "epoch": 0.5811369774040269,
      "grad_norm": 19.268964767456055,
      "learning_rate": 0.00016647973970861104,
      "loss": 0.9408,
      "step": 4250
    },
    {
      "epoch": 0.5879738830205449,
      "grad_norm": 62.558837890625,
      "learning_rate": 0.00016564868866304517,
      "loss": 1.1798,
      "step": 4300
    },
    {
      "epoch": 0.5948107886370628,
      "grad_norm": 12.449636459350586,
      "learning_rate": 0.00016480959202934487,
      "loss": 0.9386,
      "step": 4350
    },
    {
      "epoch": 0.6016476942535808,
      "grad_norm": 9.708828926086426,
      "learning_rate": 0.00016396255264316547,
      "loss": 1.0766,
      "step": 4400
    },
    {
      "epoch": 0.6084845998700988,
      "grad_norm": 4.00963020324707,
      "learning_rate": 0.0001631076743135879,
      "loss": 0.9953,
      "step": 4450
    },
    {
      "epoch": 0.6153215054866168,
      "grad_norm": 14.70906925201416,
      "learning_rate": 0.0001622450618103964,
      "loss": 1.1006,
      "step": 4500
    },
    {
      "epoch": 0.6221584111031347,
      "grad_norm": 2.471301317214966,
      "learning_rate": 0.00016137482085123832,
      "loss": 0.7397,
      "step": 4550
    },
    {
      "epoch": 0.6289953167196527,
      "grad_norm": 0.671847939491272,
      "learning_rate": 0.00016049705808866805,
      "loss": 1.1298,
      "step": 4600
    },
    {
      "epoch": 0.6358322223361706,
      "grad_norm": 11.712217330932617,
      "learning_rate": 0.000159611881097076,
      "loss": 0.8828,
      "step": 4650
    },
    {
      "epoch": 0.6426691279526886,
      "grad_norm": 90.13214111328125,
      "learning_rate": 0.00015871939835950503,
      "loss": 1.085,
      "step": 4700
    },
    {
      "epoch": 0.6495060335692066,
      "grad_norm": 2.1299564838409424,
      "learning_rate": 0.00015781971925435498,
      "loss": 1.0104,
      "step": 4750
    },
    {
      "epoch": 0.6563429391857245,
      "grad_norm": 44.118778228759766,
      "learning_rate": 0.0001569129540419781,
      "loss": 0.8905,
      "step": 4800
    },
    {
      "epoch": 0.6631798448022425,
      "grad_norm": 20.966922760009766,
      "learning_rate": 0.00015599921385116582,
      "loss": 0.9239,
      "step": 4850
    },
    {
      "epoch": 0.6700167504187605,
      "grad_norm": 13.358034133911133,
      "learning_rate": 0.00015507861066552955,
      "loss": 0.8589,
      "step": 4900
    },
    {
      "epoch": 0.6768536560352785,
      "grad_norm": 5.739938259124756,
      "learning_rate": 0.00015415125730977626,
      "loss": 1.0661,
      "step": 4950
    },
    {
      "epoch": 0.6836905616517964,
      "grad_norm": 25.265790939331055,
      "learning_rate": 0.00015321726743588155,
      "loss": 0.9046,
      "step": 5000
    },
    {
      "epoch": 0.6905274672683144,
      "grad_norm": 22.772367477416992,
      "learning_rate": 0.00015227675550916073,
      "loss": 1.0174,
      "step": 5050
    },
    {
      "epoch": 0.6973643728848323,
      "grad_norm": 4.18620491027832,
      "learning_rate": 0.0001513298367942405,
      "loss": 0.9916,
      "step": 5100
    },
    {
      "epoch": 0.7042012785013503,
      "grad_norm": 10.113117218017578,
      "learning_rate": 0.00015037662734093286,
      "loss": 0.9635,
      "step": 5150
    },
    {
      "epoch": 0.7110381841178682,
      "grad_norm": 1.7103244066238403,
      "learning_rate": 0.0001494172439700126,
      "loss": 0.8927,
      "step": 5200
    },
    {
      "epoch": 0.7178750897343862,
      "grad_norm": 24.236433029174805,
      "learning_rate": 0.0001484518042589,
      "loss": 0.9438,
      "step": 5250
    },
    {
      "epoch": 0.7247119953509041,
      "grad_norm": 2.4070262908935547,
      "learning_rate": 0.00014748042652725152,
      "loss": 1.095,
      "step": 5300
    },
    {
      "epoch": 0.7315489009674222,
      "grad_norm": 4.471241474151611,
      "learning_rate": 0.0001465032298224588,
      "loss": 0.8205,
      "step": 5350
    },
    {
      "epoch": 0.7383858065839402,
      "grad_norm": 1.757636547088623,
      "learning_rate": 0.0001455203339050589,
      "loss": 0.9177,
      "step": 5400
    },
    {
      "epoch": 0.7452227122004581,
      "grad_norm": 1.5365773439407349,
      "learning_rate": 0.0001445318592340571,
      "loss": 0.7696,
      "step": 5450
    },
    {
      "epoch": 0.7520596178169761,
      "grad_norm": 1.7077670097351074,
      "learning_rate": 0.00014353792695216382,
      "loss": 0.9342,
      "step": 5500
    },
    {
      "epoch": 0.758896523433494,
      "grad_norm": 28.525236129760742,
      "learning_rate": 0.00014253865887094817,
      "loss": 0.9897,
      "step": 5550
    },
    {
      "epoch": 0.765733429050012,
      "grad_norm": 15.281404495239258,
      "learning_rate": 0.00014153417745590914,
      "loss": 0.8873,
      "step": 5600
    },
    {
      "epoch": 0.7725703346665299,
      "grad_norm": 1.1002103090286255,
      "learning_rate": 0.00014052460581146696,
      "loss": 0.7727,
      "step": 5650
    },
    {
      "epoch": 0.7794072402830479,
      "grad_norm": 4.395946025848389,
      "learning_rate": 0.00013951006766587586,
      "loss": 0.8922,
      "step": 5700
    },
    {
      "epoch": 0.7862441458995658,
      "grad_norm": 5.225406169891357,
      "learning_rate": 0.0001384906873560607,
      "loss": 0.9766,
      "step": 5750
    },
    {
      "epoch": 0.7930810515160838,
      "grad_norm": 6.0966315269470215,
      "learning_rate": 0.00013746658981237867,
      "loss": 1.1373,
      "step": 5800
    },
    {
      "epoch": 0.7999179571326018,
      "grad_norm": 14.155887603759766,
      "learning_rate": 0.00013643790054330846,
      "loss": 0.8954,
      "step": 5850
    },
    {
      "epoch": 0.8067548627491198,
      "grad_norm": 2.6549534797668457,
      "learning_rate": 0.0001354047456200687,
      "loss": 1.0428,
      "step": 5900
    },
    {
      "epoch": 0.8135917683656377,
      "grad_norm": 7.79277229309082,
      "learning_rate": 0.0001343672516611671,
      "loss": 0.8715,
      "step": 5950
    },
    {
      "epoch": 0.8204286739821557,
      "grad_norm": 17.183149337768555,
      "learning_rate": 0.00013332554581688271,
      "loss": 1.0601,
      "step": 6000
    },
    {
      "epoch": 0.8204286739821557,
      "eval_loss": 0.20116083323955536,
      "eval_runtime": 301.512,
      "eval_samples_per_second": 26.785,
      "eval_steps_per_second": 3.35,
      "step": 6000
    },
    {
      "epoch": 0.8272655795986736,
      "grad_norm": 10.275203704833984,
      "learning_rate": 0.00013227975575368312,
      "loss": 0.8782,
      "step": 6050
    },
    {
      "epoch": 0.8341024852151916,
      "grad_norm": 3.2849924564361572,
      "learning_rate": 0.0001312300096385781,
      "loss": 0.7405,
      "step": 6100
    },
    {
      "epoch": 0.8409393908317095,
      "grad_norm": 5.1770853996276855,
      "learning_rate": 0.0001301764361234122,
      "loss": 1.0901,
      "step": 6150
    },
    {
      "epoch": 0.8477762964482275,
      "grad_norm": 13.282193183898926,
      "learning_rate": 0.0001291191643290977,
      "loss": 0.9054,
      "step": 6200
    },
    {
      "epoch": 0.8546132020647454,
      "grad_norm": 9.424989700317383,
      "learning_rate": 0.0001280583238297903,
      "loss": 0.9861,
      "step": 6250
    },
    {
      "epoch": 0.8614501076812635,
      "grad_norm": 2.5506229400634766,
      "learning_rate": 0.000126994044637009,
      "loss": 1.0244,
      "step": 6300
    },
    {
      "epoch": 0.8682870132977815,
      "grad_norm": 21.7524471282959,
      "learning_rate": 0.00012592645718370252,
      "loss": 0.9079,
      "step": 6350
    },
    {
      "epoch": 0.8751239189142994,
      "grad_norm": 2.2379355430603027,
      "learning_rate": 0.00012485569230826423,
      "loss": 1.0235,
      "step": 6400
    },
    {
      "epoch": 0.8819608245308174,
      "grad_norm": 18.936904907226562,
      "learning_rate": 0.000123781881238497,
      "loss": 0.8275,
      "step": 6450
    },
    {
      "epoch": 0.8887977301473353,
      "grad_norm": 1.508329153060913,
      "learning_rate": 0.00012270515557553065,
      "loss": 0.9872,
      "step": 6500
    },
    {
      "epoch": 0.8956346357638533,
      "grad_norm": 30.93293571472168,
      "learning_rate": 0.00012162564727769359,
      "loss": 1.0287,
      "step": 6550
    },
    {
      "epoch": 0.9024715413803712,
      "grad_norm": 29.230403900146484,
      "learning_rate": 0.00012054348864434066,
      "loss": 0.627,
      "step": 6600
    },
    {
      "epoch": 0.9093084469968892,
      "grad_norm": 14.68487548828125,
      "learning_rate": 0.00011945881229963898,
      "loss": 0.9562,
      "step": 6650
    },
    {
      "epoch": 0.9161453526134071,
      "grad_norm": 2.035444736480713,
      "learning_rate": 0.00011837175117631436,
      "loss": 0.8726,
      "step": 6700
    },
    {
      "epoch": 0.9229822582299252,
      "grad_norm": 12.931522369384766,
      "learning_rate": 0.0001172824384993596,
      "loss": 0.8823,
      "step": 6750
    },
    {
      "epoch": 0.9298191638464431,
      "grad_norm": 8.330245971679688,
      "learning_rate": 0.00011619100776970713,
      "loss": 0.7179,
      "step": 6800
    },
    {
      "epoch": 0.9366560694629611,
      "grad_norm": 51.09445571899414,
      "learning_rate": 0.00011509759274786776,
      "loss": 0.8627,
      "step": 6850
    },
    {
      "epoch": 0.943492975079479,
      "grad_norm": 26.371118545532227,
      "learning_rate": 0.00011400232743753752,
      "loss": 0.7334,
      "step": 6900
    },
    {
      "epoch": 0.950329880695997,
      "grad_norm": 1.3464198112487793,
      "learning_rate": 0.00011290534606917508,
      "loss": 1.0389,
      "step": 6950
    },
    {
      "epoch": 0.957166786312515,
      "grad_norm": 0.732755184173584,
      "learning_rate": 0.00011180678308355081,
      "loss": 0.8343,
      "step": 7000
    },
    {
      "epoch": 0.9640036919290329,
      "grad_norm": 0.9582768082618713,
      "learning_rate": 0.00011070677311527058,
      "loss": 1.0705,
      "step": 7050
    },
    {
      "epoch": 0.9708405975455509,
      "grad_norm": 0.7923704385757446,
      "learning_rate": 0.00010960545097627548,
      "loss": 0.9725,
      "step": 7100
    },
    {
      "epoch": 0.9776775031620688,
      "grad_norm": 39.650177001953125,
      "learning_rate": 0.00010850295163931992,
      "loss": 0.8721,
      "step": 7150
    },
    {
      "epoch": 0.9845144087785868,
      "grad_norm": 9.212077140808105,
      "learning_rate": 0.00010739941022143007,
      "loss": 0.8079,
      "step": 7200
    },
    {
      "epoch": 0.9913513143951048,
      "grad_norm": 2.591902494430542,
      "learning_rate": 0.00010629496196734452,
      "loss": 1.1336,
      "step": 7250
    },
    {
      "epoch": 0.9981882200116228,
      "grad_norm": 18.618799209594727,
      "learning_rate": 0.00010518974223293936,
      "loss": 1.0463,
      "step": 7300
    },
    {
      "epoch": 1.004922572043893,
      "grad_norm": 8.480158805847168,
      "learning_rate": 0.00010408388646863965,
      "loss": 0.7236,
      "step": 7350
    },
    {
      "epoch": 1.0117594776604109,
      "grad_norm": 3.5370821952819824,
      "learning_rate": 0.00010297753020281911,
      "loss": 0.813,
      "step": 7400
    },
    {
      "epoch": 1.018596383276929,
      "grad_norm": 0.5842294096946716,
      "learning_rate": 0.00010187080902519064,
      "loss": 0.589,
      "step": 7450
    },
    {
      "epoch": 1.0254332888934468,
      "grad_norm": 11.063470840454102,
      "learning_rate": 0.00010076385857018889,
      "loss": 0.9893,
      "step": 7500
    },
    {
      "epoch": 1.0322701945099648,
      "grad_norm": 8.910834312438965,
      "learning_rate": 9.965681450034771e-05,
      "loss": 0.6532,
      "step": 7550
    },
    {
      "epoch": 1.0391071001264827,
      "grad_norm": 0.8395630121231079,
      "learning_rate": 9.854981248967388e-05,
      "loss": 0.6934,
      "step": 7600
    },
    {
      "epoch": 1.0459440057430007,
      "grad_norm": 3.7071163654327393,
      "learning_rate": 9.744298820701968e-05,
      "loss": 0.7911,
      "step": 7650
    },
    {
      "epoch": 1.0527809113595188,
      "grad_norm": 14.003477096557617,
      "learning_rate": 9.633647729945581e-05,
      "loss": 0.7611,
      "step": 7700
    },
    {
      "epoch": 1.0596178169760366,
      "grad_norm": 19.04654884338379,
      "learning_rate": 9.523041537564726e-05,
      "loss": 0.6596,
      "step": 7750
    },
    {
      "epoch": 1.0664547225925547,
      "grad_norm": 52.79182815551758,
      "learning_rate": 9.412493798923383e-05,
      "loss": 0.763,
      "step": 7800
    },
    {
      "epoch": 1.0732916282090725,
      "grad_norm": 1.4399851560592651,
      "learning_rate": 9.3020180622217e-05,
      "loss": 0.667,
      "step": 7850
    },
    {
      "epoch": 1.0801285338255906,
      "grad_norm": 1.6162464618682861,
      "learning_rate": 9.19162786683564e-05,
      "loss": 0.813,
      "step": 7900
    },
    {
      "epoch": 1.0869654394421084,
      "grad_norm": 6.91720724105835,
      "learning_rate": 9.081336741657603e-05,
      "loss": 0.6394,
      "step": 7950
    },
    {
      "epoch": 1.0938023450586265,
      "grad_norm": 7.005824089050293,
      "learning_rate": 8.971158203438443e-05,
      "loss": 0.6949,
      "step": 8000
    },
    {
      "epoch": 1.0938023450586265,
      "eval_loss": 0.22489766776561737,
      "eval_runtime": 301.6603,
      "eval_samples_per_second": 26.772,
      "eval_steps_per_second": 3.348,
      "step": 8000
    },
    {
      "epoch": 1.1006392506751443,
      "grad_norm": 12.64887523651123,
      "learning_rate": 8.861105755130896e-05,
      "loss": 0.6777,
      "step": 8050
    },
    {
      "epoch": 1.1074761562916624,
      "grad_norm": 99.47157287597656,
      "learning_rate": 8.751192884234704e-05,
      "loss": 0.5242,
      "step": 8100
    },
    {
      "epoch": 1.1143130619081805,
      "grad_norm": 2.9147791862487793,
      "learning_rate": 8.641433061143698e-05,
      "loss": 0.6589,
      "step": 8150
    },
    {
      "epoch": 1.1211499675246983,
      "grad_norm": 0.4020586311817169,
      "learning_rate": 8.531839737494878e-05,
      "loss": 0.9058,
      "step": 8200
    },
    {
      "epoch": 1.1279868731412164,
      "grad_norm": 41.31173324584961,
      "learning_rate": 8.422426344519898e-05,
      "loss": 0.5999,
      "step": 8250
    },
    {
      "epoch": 1.1348237787577342,
      "grad_norm": 0.19233907759189606,
      "learning_rate": 8.313206291398948e-05,
      "loss": 0.8461,
      "step": 8300
    },
    {
      "epoch": 1.1416606843742523,
      "grad_norm": 0.5941385626792908,
      "learning_rate": 8.20419296361743e-05,
      "loss": 0.5353,
      "step": 8350
    },
    {
      "epoch": 1.1484975899907701,
      "grad_norm": 6.670557022094727,
      "learning_rate": 8.095399721325481e-05,
      "loss": 0.6484,
      "step": 8400
    },
    {
      "epoch": 1.1553344956072882,
      "grad_norm": 3.8168182373046875,
      "learning_rate": 7.9868398977006e-05,
      "loss": 0.8318,
      "step": 8450
    },
    {
      "epoch": 1.162171401223806,
      "grad_norm": 17.14653778076172,
      "learning_rate": 7.87852679731364e-05,
      "loss": 0.5694,
      "step": 8500
    },
    {
      "epoch": 1.169008306840324,
      "grad_norm": 58.7053108215332,
      "learning_rate": 7.77047369449821e-05,
      "loss": 0.7256,
      "step": 8550
    },
    {
      "epoch": 1.1758452124568421,
      "grad_norm": 0.4155759811401367,
      "learning_rate": 7.66269383172389e-05,
      "loss": 0.604,
      "step": 8600
    },
    {
      "epoch": 1.18268211807336,
      "grad_norm": 1.1354832649230957,
      "learning_rate": 7.555200417973261e-05,
      "loss": 0.7761,
      "step": 8650
    },
    {
      "epoch": 1.189519023689878,
      "grad_norm": 1.1315326690673828,
      "learning_rate": 7.448006627123083e-05,
      "loss": 0.6569,
      "step": 8700
    },
    {
      "epoch": 1.196355929306396,
      "grad_norm": 0.9931478500366211,
      "learning_rate": 7.341125596329783e-05,
      "loss": 0.8456,
      "step": 8750
    },
    {
      "epoch": 1.203192834922914,
      "grad_norm": 2.132953643798828,
      "learning_rate": 7.2345704244194e-05,
      "loss": 0.7142,
      "step": 8800
    },
    {
      "epoch": 1.2100297405394318,
      "grad_norm": 10.148101806640625,
      "learning_rate": 7.12835417028229e-05,
      "loss": 0.7284,
      "step": 8850
    },
    {
      "epoch": 1.2168666461559499,
      "grad_norm": 41.58332824707031,
      "learning_rate": 7.022489851272668e-05,
      "loss": 0.5779,
      "step": 8900
    },
    {
      "epoch": 1.2237035517724677,
      "grad_norm": 4.843736171722412,
      "learning_rate": 6.91699044161326e-05,
      "loss": 0.6783,
      "step": 8950
    },
    {
      "epoch": 1.2305404573889858,
      "grad_norm": 0.4043326675891876,
      "learning_rate": 6.811868870805269e-05,
      "loss": 0.7656,
      "step": 9000
    },
    {
      "epoch": 1.2373773630055038,
      "grad_norm": 3.8934195041656494,
      "learning_rate": 6.70713802204377e-05,
      "loss": 0.5857,
      "step": 9050
    },
    {
      "epoch": 1.2442142686220217,
      "grad_norm": 0.23483966290950775,
      "learning_rate": 6.602810730638829e-05,
      "loss": 0.6388,
      "step": 9100
    },
    {
      "epoch": 1.2510511742385395,
      "grad_norm": 2.1649527549743652,
      "learning_rate": 6.498899782442444e-05,
      "loss": 0.6986,
      "step": 9150
    },
    {
      "epoch": 1.2578880798550576,
      "grad_norm": 82.96743774414062,
      "learning_rate": 6.39541791228161e-05,
      "loss": 0.5563,
      "step": 9200
    },
    {
      "epoch": 1.2647249854715756,
      "grad_norm": 1.8622783422470093,
      "learning_rate": 6.292377802397564e-05,
      "loss": 0.6941,
      "step": 9250
    },
    {
      "epoch": 1.2715618910880935,
      "grad_norm": 1.1985386610031128,
      "learning_rate": 6.189792080891525e-05,
      "loss": 0.6195,
      "step": 9300
    },
    {
      "epoch": 1.2783987967046115,
      "grad_norm": 1.1333106756210327,
      "learning_rate": 6.087673320177058e-05,
      "loss": 0.5675,
      "step": 9350
    },
    {
      "epoch": 1.2852357023211294,
      "grad_norm": 13.326946258544922,
      "learning_rate": 5.9860340354392496e-05,
      "loss": 0.8214,
      "step": 9400
    },
    {
      "epoch": 1.2920726079376474,
      "grad_norm": 10.754223823547363,
      "learning_rate": 5.8848866831009156e-05,
      "loss": 0.663,
      "step": 9450
    },
    {
      "epoch": 1.2989095135541655,
      "grad_norm": 0.07592844218015671,
      "learning_rate": 5.784243659296001e-05,
      "loss": 0.6661,
      "step": 9500
    },
    {
      "epoch": 1.3057464191706833,
      "grad_norm": 4.361905097961426,
      "learning_rate": 5.6841172983503634e-05,
      "loss": 0.6757,
      "step": 9550
    },
    {
      "epoch": 1.3125833247872012,
      "grad_norm": 6.464013576507568,
      "learning_rate": 5.5845198712701396e-05,
      "loss": 0.8568,
      "step": 9600
    },
    {
      "epoch": 1.3194202304037193,
      "grad_norm": 13.971973419189453,
      "learning_rate": 5.485463584237871e-05,
      "loss": 0.5852,
      "step": 9650
    },
    {
      "epoch": 1.3262571360202373,
      "grad_norm": 25.48811149597168,
      "learning_rate": 5.3869605771165755e-05,
      "loss": 0.652,
      "step": 9700
    },
    {
      "epoch": 1.3330940416367552,
      "grad_norm": 5.14886474609375,
      "learning_rate": 5.289022921961948e-05,
      "loss": 0.8247,
      "step": 9750
    },
    {
      "epoch": 1.3399309472532732,
      "grad_norm": 0.6628409028053284,
      "learning_rate": 5.1916626215428385e-05,
      "loss": 0.5708,
      "step": 9800
    },
    {
      "epoch": 1.346767852869791,
      "grad_norm": 81.61123657226562,
      "learning_rate": 5.094891607870296e-05,
      "loss": 0.7523,
      "step": 9850
    },
    {
      "epoch": 1.3536047584863091,
      "grad_norm": 0.597465455532074,
      "learning_rate": 4.998721740735197e-05,
      "loss": 0.7701,
      "step": 9900
    },
    {
      "epoch": 1.3604416641028272,
      "grad_norm": 1.8627650737762451,
      "learning_rate": 4.903164806254804e-05,
      "loss": 0.6589,
      "step": 9950
    },
    {
      "epoch": 1.367278569719345,
      "grad_norm": 0.427298903465271,
      "learning_rate": 4.808232515428268e-05,
      "loss": 0.6476,
      "step": 10000
    },
    {
      "epoch": 1.367278569719345,
      "eval_loss": 0.25095975399017334,
      "eval_runtime": 301.6273,
      "eval_samples_per_second": 26.775,
      "eval_steps_per_second": 3.349,
      "step": 10000
    },
    {
      "epoch": 1.3741154753358629,
      "grad_norm": 0.5417049527168274,
      "learning_rate": 4.713936502701435e-05,
      "loss": 0.7344,
      "step": 10050
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.30379384756088257,
      "learning_rate": 4.620288324540962e-05,
      "loss": 0.5764,
      "step": 10100
    },
    {
      "epoch": 1.387789286568899,
      "grad_norm": 0.258468359708786,
      "learning_rate": 4.5272994580179895e-05,
      "loss": 0.6794,
      "step": 10150
    },
    {
      "epoch": 1.3946261921854168,
      "grad_norm": 1.2032103538513184,
      "learning_rate": 4.434981299401615e-05,
      "loss": 0.5931,
      "step": 10200
    },
    {
      "epoch": 1.401463097801935,
      "grad_norm": 4.064381122589111,
      "learning_rate": 4.3433451627621743e-05,
      "loss": 0.4061,
      "step": 10250
    },
    {
      "epoch": 1.4083000034184527,
      "grad_norm": 1.0236620903015137,
      "learning_rate": 4.2524022785846806e-05,
      "loss": 0.5935,
      "step": 10300
    },
    {
      "epoch": 1.4151369090349708,
      "grad_norm": 0.42589133977890015,
      "learning_rate": 4.1621637923924405e-05,
      "loss": 0.8298,
      "step": 10350
    },
    {
      "epoch": 1.4219738146514889,
      "grad_norm": 9.088717460632324,
      "learning_rate": 4.072640763381127e-05,
      "loss": 0.5821,
      "step": 10400
    },
    {
      "epoch": 1.4288107202680067,
      "grad_norm": 2.854710102081299,
      "learning_rate": 3.983844163063429e-05,
      "loss": 0.6541,
      "step": 10450
    },
    {
      "epoch": 1.4356476258845245,
      "grad_norm": 6.076037406921387,
      "learning_rate": 3.895784873924397e-05,
      "loss": 0.6669,
      "step": 10500
    },
    {
      "epoch": 1.4424845315010426,
      "grad_norm": 0.36614227294921875,
      "learning_rate": 3.8084736880877846e-05,
      "loss": 0.5883,
      "step": 10550
    },
    {
      "epoch": 1.4493214371175607,
      "grad_norm": 82.49917602539062,
      "learning_rate": 3.721921305993391e-05,
      "loss": 0.8045,
      "step": 10600
    },
    {
      "epoch": 1.4561583427340785,
      "grad_norm": 45.616859436035156,
      "learning_rate": 3.636138335085666e-05,
      "loss": 0.4991,
      "step": 10650
    },
    {
      "epoch": 1.4629952483505966,
      "grad_norm": 0.26663124561309814,
      "learning_rate": 3.5511352885137194e-05,
      "loss": 0.4815,
      "step": 10700
    },
    {
      "epoch": 1.4698321539671144,
      "grad_norm": 1.6303415298461914,
      "learning_rate": 3.4669225838428785e-05,
      "loss": 0.4746,
      "step": 10750
    },
    {
      "epoch": 1.4766690595836325,
      "grad_norm": 14.5377779006958,
      "learning_rate": 3.3835105417779687e-05,
      "loss": 0.7877,
      "step": 10800
    },
    {
      "epoch": 1.4835059652001505,
      "grad_norm": 0.08112337440252304,
      "learning_rate": 3.30090938489844e-05,
      "loss": 0.6687,
      "step": 10850
    },
    {
      "epoch": 1.4903428708166684,
      "grad_norm": 7.454471588134766,
      "learning_rate": 3.219129236405548e-05,
      "loss": 0.8063,
      "step": 10900
    },
    {
      "epoch": 1.4971797764331862,
      "grad_norm": 5.5912275314331055,
      "learning_rate": 3.13818011888171e-05,
      "loss": 0.6337,
      "step": 10950
    },
    {
      "epoch": 1.5040166820497043,
      "grad_norm": 7.555117130279541,
      "learning_rate": 3.0580719530621705e-05,
      "loss": 0.6513,
      "step": 11000
    },
    {
      "epoch": 1.5108535876662224,
      "grad_norm": 0.4277037978172302,
      "learning_rate": 2.9788145566191693e-05,
      "loss": 0.603,
      "step": 11050
    },
    {
      "epoch": 1.5176904932827402,
      "grad_norm": 0.3563739061355591,
      "learning_rate": 2.900417642958734e-05,
      "loss": 0.5695,
      "step": 11100
    },
    {
      "epoch": 1.524527398899258,
      "grad_norm": 0.8669344782829285,
      "learning_rate": 2.822890820030264e-05,
      "loss": 0.7372,
      "step": 11150
    },
    {
      "epoch": 1.531364304515776,
      "grad_norm": 10.977109909057617,
      "learning_rate": 2.7462435891490036e-05,
      "loss": 0.6573,
      "step": 11200
    },
    {
      "epoch": 1.5382012101322942,
      "grad_norm": 0.33039143681526184,
      "learning_rate": 2.6704853438316213e-05,
      "loss": 0.4278,
      "step": 11250
    },
    {
      "epoch": 1.5450381157488122,
      "grad_norm": 3.340820550918579,
      "learning_rate": 2.5956253686449882e-05,
      "loss": 0.6281,
      "step": 11300
    },
    {
      "epoch": 1.55187502136533,
      "grad_norm": 6.152026176452637,
      "learning_rate": 2.521672838068295e-05,
      "loss": 0.6859,
      "step": 11350
    },
    {
      "epoch": 1.558711926981848,
      "grad_norm": 0.9645776152610779,
      "learning_rate": 2.4486368153686734e-05,
      "loss": 0.578,
      "step": 11400
    },
    {
      "epoch": 1.565548832598366,
      "grad_norm": 3.5073535442352295,
      "learning_rate": 2.3765262514904617e-05,
      "loss": 0.6756,
      "step": 11450
    },
    {
      "epoch": 1.572385738214884,
      "grad_norm": 1.3473198413848877,
      "learning_rate": 2.305349983958196e-05,
      "loss": 0.6288,
      "step": 11500
    },
    {
      "epoch": 1.5792226438314019,
      "grad_norm": 6.039999961853027,
      "learning_rate": 2.2351167357935422e-05,
      "loss": 0.6274,
      "step": 11550
    },
    {
      "epoch": 1.5860595494479197,
      "grad_norm": 0.9115678668022156,
      "learning_rate": 2.1658351144462362e-05,
      "loss": 0.6303,
      "step": 11600
    },
    {
      "epoch": 1.5928964550644378,
      "grad_norm": 37.31045150756836,
      "learning_rate": 2.097513610739209e-05,
      "loss": 0.7243,
      "step": 11650
    },
    {
      "epoch": 1.5997333606809558,
      "grad_norm": 0.5089764595031738,
      "learning_rate": 2.0301605978279702e-05,
      "loss": 0.507,
      "step": 11700
    },
    {
      "epoch": 1.606570266297474,
      "grad_norm": 16.424047470092773,
      "learning_rate": 1.9637843301744528e-05,
      "loss": 0.6387,
      "step": 11750
    },
    {
      "epoch": 1.6134071719139917,
      "grad_norm": 0.6381849646568298,
      "learning_rate": 1.898392942535383e-05,
      "loss": 0.7143,
      "step": 11800
    },
    {
      "epoch": 1.6202440775305096,
      "grad_norm": 7.240786075592041,
      "learning_rate": 1.833994448965315e-05,
      "loss": 0.7644,
      "step": 11850
    },
    {
      "epoch": 1.6270809831470276,
      "grad_norm": 0.6397457122802734,
      "learning_rate": 1.7705967418344737e-05,
      "loss": 0.5355,
      "step": 11900
    },
    {
      "epoch": 1.6339178887635457,
      "grad_norm": 0.49821093678474426,
      "learning_rate": 1.7082075908615013e-05,
      "loss": 0.7372,
      "step": 11950
    },
    {
      "epoch": 1.6407547943800636,
      "grad_norm": 0.550399124622345,
      "learning_rate": 1.6468346421612447e-05,
      "loss": 0.7474,
      "step": 12000
    },
    {
      "epoch": 1.6407547943800636,
      "eval_loss": 0.26388460397720337,
      "eval_runtime": 300.1264,
      "eval_samples_per_second": 26.909,
      "eval_steps_per_second": 3.365,
      "step": 12000
    },
    {
      "epoch": 1.6475916999965814,
      "grad_norm": 0.1512337028980255,
      "learning_rate": 1.5864854173076714e-05,
      "loss": 0.6831,
      "step": 12050
    },
    {
      "epoch": 1.6544286056130995,
      "grad_norm": 40.49404525756836,
      "learning_rate": 1.52716731241207e-05,
      "loss": 0.7483,
      "step": 12100
    },
    {
      "epoch": 1.6612655112296175,
      "grad_norm": 0.5297091007232666,
      "learning_rate": 1.4688875972166227e-05,
      "loss": 0.5595,
      "step": 12150
    },
    {
      "epoch": 1.6681024168461356,
      "grad_norm": 12.922277450561523,
      "learning_rate": 1.4116534142034488e-05,
      "loss": 0.5817,
      "step": 12200
    },
    {
      "epoch": 1.6749393224626534,
      "grad_norm": 0.4216732382774353,
      "learning_rate": 1.3554717777192605e-05,
      "loss": 0.8905,
      "step": 12250
    },
    {
      "epoch": 1.6817762280791713,
      "grad_norm": 1.1882590055465698,
      "learning_rate": 1.3003495731157312e-05,
      "loss": 0.5435,
      "step": 12300
    },
    {
      "epoch": 1.6886131336956893,
      "grad_norm": 15.241290092468262,
      "learning_rate": 1.2462935559056366e-05,
      "loss": 0.5636,
      "step": 12350
    },
    {
      "epoch": 1.6954500393122074,
      "grad_norm": 1.281235933303833,
      "learning_rate": 1.1933103509349508e-05,
      "loss": 0.4771,
      "step": 12400
    },
    {
      "epoch": 1.7022869449287252,
      "grad_norm": 30.664819717407227,
      "learning_rate": 1.1414064515709255e-05,
      "loss": 0.5598,
      "step": 12450
    },
    {
      "epoch": 1.709123850545243,
      "grad_norm": 3.1145246028900146,
      "learning_rate": 1.0905882189063032e-05,
      "loss": 0.5779,
      "step": 12500
    },
    {
      "epoch": 1.7159607561617611,
      "grad_norm": 4.802779674530029,
      "learning_rate": 1.0408618809797255e-05,
      "loss": 0.5402,
      "step": 12550
    },
    {
      "epoch": 1.7227976617782792,
      "grad_norm": 3.566648006439209,
      "learning_rate": 9.92233532012452e-06,
      "loss": 0.816,
      "step": 12600
    },
    {
      "epoch": 1.7296345673947973,
      "grad_norm": 0.9611634016036987,
      "learning_rate": 9.447091316614965e-06,
      "loss": 0.5813,
      "step": 12650
    },
    {
      "epoch": 1.736471473011315,
      "grad_norm": 2.433220148086548,
      "learning_rate": 8.9829450428922e-06,
      "loss": 0.5628,
      "step": 12700
    },
    {
      "epoch": 1.743308378627833,
      "grad_norm": 0.1846768856048584,
      "learning_rate": 8.529953382495404e-06,
      "loss": 0.7646,
      "step": 12750
    },
    {
      "epoch": 1.750145284244351,
      "grad_norm": 1.4401239156723022,
      "learning_rate": 8.088171851907855e-06,
      "loss": 0.5705,
      "step": 12800
    },
    {
      "epoch": 1.756982189860869,
      "grad_norm": 25.80792236328125,
      "learning_rate": 7.657654593753195e-06,
      "loss": 0.6362,
      "step": 12850
    },
    {
      "epoch": 1.763819095477387,
      "grad_norm": 0.8399425148963928,
      "learning_rate": 7.2384543701598416e-06,
      "loss": 0.7085,
      "step": 12900
    },
    {
      "epoch": 1.7706560010939048,
      "grad_norm": 0.8096999526023865,
      "learning_rate": 6.83062255629483e-06,
      "loss": 0.5368,
      "step": 12950
    },
    {
      "epoch": 1.7774929067104228,
      "grad_norm": 8.902669906616211,
      "learning_rate": 6.43420913406747e-06,
      "loss": 0.5753,
      "step": 13000
    },
    {
      "epoch": 1.7843298123269409,
      "grad_norm": 0.15432903170585632,
      "learning_rate": 6.049262686003787e-06,
      "loss": 0.6055,
      "step": 13050
    },
    {
      "epoch": 1.791166717943459,
      "grad_norm": 14.938940048217773,
      "learning_rate": 5.6758303892925025e-06,
      "loss": 0.7965,
      "step": 13100
    },
    {
      "epoch": 1.7980036235599768,
      "grad_norm": 0.20640145242214203,
      "learning_rate": 5.313958010003261e-06,
      "loss": 0.5362,
      "step": 13150
    },
    {
      "epoch": 1.8048405291764946,
      "grad_norm": 0.42624762654304504,
      "learning_rate": 4.963689897477664e-06,
      "loss": 0.6298,
      "step": 13200
    },
    {
      "epoch": 1.8116774347930127,
      "grad_norm": 14.088078498840332,
      "learning_rate": 4.625068978894131e-06,
      "loss": 0.5166,
      "step": 13250
    },
    {
      "epoch": 1.8185143404095307,
      "grad_norm": 8.906865119934082,
      "learning_rate": 4.298136754006854e-06,
      "loss": 0.6144,
      "step": 13300
    },
    {
      "epoch": 1.8253512460260486,
      "grad_norm": 0.16211865842342377,
      "learning_rate": 3.982933290059887e-06,
      "loss": 0.446,
      "step": 13350
    },
    {
      "epoch": 1.8321881516425664,
      "grad_norm": 25.307283401489258,
      "learning_rate": 3.6794972168766594e-06,
      "loss": 0.525,
      "step": 13400
    },
    {
      "epoch": 1.8390250572590845,
      "grad_norm": 41.81796646118164,
      "learning_rate": 3.387865722125594e-06,
      "loss": 0.7377,
      "step": 13450
    },
    {
      "epoch": 1.8458619628756026,
      "grad_norm": 0.09296048432588577,
      "learning_rate": 3.10807454676274e-06,
      "loss": 0.5175,
      "step": 13500
    },
    {
      "epoch": 1.8526988684921206,
      "grad_norm": 113.21685791015625,
      "learning_rate": 2.8401579806514035e-06,
      "loss": 0.7324,
      "step": 13550
    },
    {
      "epoch": 1.8595357741086385,
      "grad_norm": 13.23887825012207,
      "learning_rate": 2.5841488583597696e-06,
      "loss": 0.4255,
      "step": 13600
    },
    {
      "epoch": 1.8663726797251563,
      "grad_norm": 0.3335596024990082,
      "learning_rate": 2.3400785551369043e-06,
      "loss": 0.4865,
      "step": 13650
    },
    {
      "epoch": 1.8732095853416744,
      "grad_norm": 1.1101493835449219,
      "learning_rate": 2.1079769830674836e-06,
      "loss": 0.5834,
      "step": 13700
    },
    {
      "epoch": 1.8800464909581924,
      "grad_norm": 0.44824355840682983,
      "learning_rate": 1.8878725874060144e-06,
      "loss": 0.6434,
      "step": 13750
    },
    {
      "epoch": 1.8868833965747103,
      "grad_norm": 0.7179256081581116,
      "learning_rate": 1.6797923430905583e-06,
      "loss": 0.5649,
      "step": 13800
    },
    {
      "epoch": 1.893720302191228,
      "grad_norm": 0.6279736757278442,
      "learning_rate": 1.4837617514370073e-06,
      "loss": 0.6663,
      "step": 13850
    },
    {
      "epoch": 1.9005572078077462,
      "grad_norm": 2.146757125854492,
      "learning_rate": 1.2998048370135963e-06,
      "loss": 0.5003,
      "step": 13900
    },
    {
      "epoch": 1.9073941134242642,
      "grad_norm": 0.2452065795660019,
      "learning_rate": 1.127944144696691e-06,
      "loss": 0.7167,
      "step": 13950
    },
    {
      "epoch": 1.9142310190407823,
      "grad_norm": 0.2389650195837021,
      "learning_rate": 9.682007369077095e-07,
      "loss": 0.5836,
      "step": 14000
    },
    {
      "epoch": 1.9142310190407823,
      "eval_loss": 0.2555805742740631,
      "eval_runtime": 299.5823,
      "eval_samples_per_second": 26.958,
      "eval_steps_per_second": 3.371,
      "step": 14000
    },
    {
      "epoch": 1.9210679246573001,
      "grad_norm": 20.409788131713867,
      "learning_rate": 8.205941910318426e-07,
      "loss": 0.5573,
      "step": 14050
    },
    {
      "epoch": 1.927904830273818,
      "grad_norm": 0.6842173933982849,
      "learning_rate": 6.851425970187952e-07,
      "loss": 0.5594,
      "step": 14100
    },
    {
      "epoch": 1.934741735890336,
      "grad_norm": 11.089654922485352,
      "learning_rate": 5.618625551656708e-07,
      "loss": 0.6967,
      "step": 14150
    },
    {
      "epoch": 1.941578641506854,
      "grad_norm": 12.126336097717285,
      "learning_rate": 4.507691740825881e-07,
      "loss": 0.677,
      "step": 14200
    },
    {
      "epoch": 1.948415547123372,
      "grad_norm": 0.44369152188301086,
      "learning_rate": 3.518760688410283e-07,
      "loss": 0.6566,
      "step": 14250
    },
    {
      "epoch": 1.9552524527398898,
      "grad_norm": 11.187239646911621,
      "learning_rate": 2.651953593052481e-07,
      "loss": 0.5174,
      "step": 14300
    },
    {
      "epoch": 1.9620893583564079,
      "grad_norm": 15.362393379211426,
      "learning_rate": 1.907376686468787e-07,
      "loss": 0.5426,
      "step": 14350
    },
    {
      "epoch": 1.968926263972926,
      "grad_norm": 0.2329702377319336,
      "learning_rate": 1.2851212204304518e-07,
      "loss": 0.6944,
      "step": 14400
    },
    {
      "epoch": 1.975763169589444,
      "grad_norm": 0.7811570763587952,
      "learning_rate": 7.852634555803873e-08,
      "loss": 0.5647,
      "step": 14450
    },
    {
      "epoch": 1.9826000752059618,
      "grad_norm": 1.2399488687515259,
      "learning_rate": 4.078646520866425e-08,
      "loss": 0.6162,
      "step": 14500
    },
    {
      "epoch": 1.9894369808224797,
      "grad_norm": 0.4023188352584839,
      "learning_rate": 1.5297106213485458e-08,
      "loss": 0.4718,
      "step": 14550
    },
    {
      "epoch": 1.9962738864389977,
      "grad_norm": 0.1795218139886856,
      "learning_rate": 2.061392425978248e-09,
      "loss": 0.5667,
      "step": 14600
    }
  ],
  "logging_steps": 50,
  "max_steps": 14628,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8538290358499676e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}