prm-qwen3-8b-bf16-full / trainer_state.json

Upload PRM LoRA adapter + head + tokenizer from checkpoint

26ba3ce verified 4 months ago

53.3 kB

	{
	"best_global_step": 6000,
	"best_metric": 0.20116083323955536,
	"best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000",
	"epoch": 2.0,
	"eval_steps": 2000,
	"global_step": 14628,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.006836905616517964,
	"grad_norm": 1.572303056716919,
	"learning_rate": 2.232346241457859e-05,
	"loss": 2.3604,
	"step": 50
	},
	{
	"epoch": 0.013673811233035928,
	"grad_norm": 5.201236248016357,
	"learning_rate": 4.510250569476082e-05,
	"loss": 2.1118,
	"step": 100
	},
	{
	"epoch": 0.02051071684955389,
	"grad_norm": 9.312570571899414,
	"learning_rate": 6.788154897494306e-05,
	"loss": 1.8332,
	"step": 150
	},
	{
	"epoch": 0.027347622466071857,
	"grad_norm": 8.565587043762207,
	"learning_rate": 9.066059225512529e-05,
	"loss": 1.9173,
	"step": 200
	},
	{
	"epoch": 0.03418452808258982,
	"grad_norm": 3.824556350708008,
	"learning_rate": 0.00011343963553530752,
	"loss": 1.6633,
	"step": 250
	},
	{
	"epoch": 0.04102143369910778,
	"grad_norm": 5.49424934387207,
	"learning_rate": 0.00013621867881548976,
	"loss": 1.6122,
	"step": 300
	},
	{
	"epoch": 0.04785833931562575,
	"grad_norm": 6.3185038566589355,
	"learning_rate": 0.000158997722095672,
	"loss": 1.5782,
	"step": 350
	},
	{
	"epoch": 0.05469524493214371,
	"grad_norm": 3.980173349380493,
	"learning_rate": 0.00018177676537585422,
	"loss": 1.444,
	"step": 400
	},
	{
	"epoch": 0.06153215054866167,
	"grad_norm": 5.797272682189941,
	"learning_rate": 0.00019999975488719786,
	"loss": 1.5752,
	"step": 450
	},
	{
	"epoch": 0.06836905616517965,
	"grad_norm": 11.263846397399902,
	"learning_rate": 0.0001999911760652904,
	"loss": 1.3607,
	"step": 500
	},
	{
	"epoch": 0.0752059617816976,
	"grad_norm": 4.273462772369385,
	"learning_rate": 0.0001999703428048544,
	"loss": 1.5023,
	"step": 550
	},
	{
	"epoch": 0.08204286739821556,
	"grad_norm": 2.9854705333709717,
	"learning_rate": 0.00019993725765911436,
	"loss": 1.3747,
	"step": 600
	},
	{
	"epoch": 0.08887977301473353,
	"grad_norm": 2.9444832801818848,
	"learning_rate": 0.0001998919246828268,
	"loss": 1.4708,
	"step": 650
	},
	{
	"epoch": 0.0957166786312515,
	"grad_norm": 3.348857879638672,
	"learning_rate": 0.00019983434943178372,
	"loss": 1.439,
	"step": 700
	},
	{
	"epoch": 0.10255358424776946,
	"grad_norm": 5.90728759765625,
	"learning_rate": 0.00019976453896213152,
	"loss": 1.5048,
	"step": 750
	},
	{
	"epoch": 0.10939048986428743,
	"grad_norm": 2.6572535037994385,
	"learning_rate": 0.0001996825018295062,
	"loss": 1.5023,
	"step": 800
	},
	{
	"epoch": 0.11622739548080539,
	"grad_norm": 4.219803810119629,
	"learning_rate": 0.00019958824808798494,
	"loss": 1.5814,
	"step": 850
	},
	{
	"epoch": 0.12306430109732334,
	"grad_norm": 5.457417964935303,
	"learning_rate": 0.00019948178928885378,
	"loss": 1.4203,
	"step": 900
	},
	{
	"epoch": 0.1299012067138413,
	"grad_norm": 5.302417278289795,
	"learning_rate": 0.00019936313847919218,
	"loss": 1.3299,
	"step": 950
	},
	{
	"epoch": 0.1367381123303593,
	"grad_norm": 4.385361194610596,
	"learning_rate": 0.00019923231020027368,
	"loss": 1.3468,
	"step": 1000
	},
	{
	"epoch": 0.14357501794687724,
	"grad_norm": 4.836021423339844,
	"learning_rate": 0.00019908932048578416,
	"loss": 1.2813,
	"step": 1050
	},
	{
	"epoch": 0.1504119235633952,
	"grad_norm": 4.949122905731201,
	"learning_rate": 0.00019893418685985658,
	"loss": 1.311,
	"step": 1100
	},
	{
	"epoch": 0.15724882917991317,
	"grad_norm": 6.123111248016357,
	"learning_rate": 0.00019876692833492343,
	"loss": 1.342,
	"step": 1150
	},
	{
	"epoch": 0.16408573479643113,
	"grad_norm": 5.803433418273926,
	"learning_rate": 0.0001985875654093866,
	"loss": 1.2384,
	"step": 1200
	},
	{
	"epoch": 0.1709226404129491,
	"grad_norm": 3.196314811706543,
	"learning_rate": 0.00019839612006510517,
	"loss": 1.3117,
	"step": 1250
	},
	{
	"epoch": 0.17775954602946706,
	"grad_norm": 6.21234130859375,
	"learning_rate": 0.00019819261576470152,
	"loss": 1.2307,
	"step": 1300
	},
	{
	"epoch": 0.18459645164598504,
	"grad_norm": 3.274829149246216,
	"learning_rate": 0.00019797707744868582,
	"loss": 1.2408,
	"step": 1350
	},
	{
	"epoch": 0.191433357262503,
	"grad_norm": 5.5120320320129395,
	"learning_rate": 0.0001977495315323993,
	"loss": 1.324,
	"step": 1400
	},
	{
	"epoch": 0.19827026287902094,
	"grad_norm": 7.289828777313232,
	"learning_rate": 0.0001975100059027772,
	"loss": 1.2039,
	"step": 1450
	},
	{
	"epoch": 0.20510716849553892,
	"grad_norm": 4.040754795074463,
	"learning_rate": 0.00019725852991493083,
	"loss": 1.3405,
	"step": 1500
	},
	{
	"epoch": 0.21194407411205687,
	"grad_norm": 52.13080596923828,
	"learning_rate": 0.00019699513438854995,
	"loss": 1.2005,
	"step": 1550
	},
	{
	"epoch": 0.21878097972857485,
	"grad_norm": 5.0520429611206055,
	"learning_rate": 0.00019671985160412593,
	"loss": 1.0046,
	"step": 1600
	},
	{
	"epoch": 0.2256178853450928,
	"grad_norm": 1.7626160383224487,
	"learning_rate": 0.00019643271529899532,
	"loss": 1.1398,
	"step": 1650
	},
	{
	"epoch": 0.23245479096161079,
	"grad_norm": 2.1751222610473633,
	"learning_rate": 0.00019613376066320525,
	"loss": 1.1519,
	"step": 1700
	},
	{
	"epoch": 0.23929169657812874,
	"grad_norm": 4.483262062072754,
	"learning_rate": 0.00019582302433520074,
	"loss": 1.144,
	"step": 1750
	},
	{
	"epoch": 0.2461286021946467,
	"grad_norm": 2.494478702545166,
	"learning_rate": 0.00019550054439733449,
	"loss": 1.1908,
	"step": 1800
	},
	{
	"epoch": 0.25296550781116467,
	"grad_norm": 14.6198091506958,
	"learning_rate": 0.00019516636037119952,
	"loss": 1.0791,
	"step": 1850
	},
	{
	"epoch": 0.2598024134276826,
	"grad_norm": 1.5368318557739258,
	"learning_rate": 0.00019482051321278592,
	"loss": 1.1994,
	"step": 1900
	},
	{
	"epoch": 0.2666393190442006,
	"grad_norm": 6.854203701019287,
	"learning_rate": 0.00019446304530746112,
	"loss": 1.1871,
	"step": 1950
	},
	{
	"epoch": 0.2734762246607186,
	"grad_norm": 3.686593770980835,
	"learning_rate": 0.00019409400046477559,
	"loss": 1.0619,
	"step": 2000
	},
	{
	"epoch": 0.2734762246607186,
	"eval_loss": 0.3232106864452362,
	"eval_runtime": 301.3298,
	"eval_samples_per_second": 26.801,
	"eval_steps_per_second": 3.352,
	"step": 2000
	},
	{
	"epoch": 0.28031313027723653,
	"grad_norm": 2.84173321723938,
	"learning_rate": 0.00019371342391309363,
	"loss": 1.1769,
	"step": 2050
	},
	{
	"epoch": 0.2871500358937545,
	"grad_norm": 6.158025741577148,
	"learning_rate": 0.00019332136229405043,
	"loss": 1.1985,
	"step": 2100
	},
	{
	"epoch": 0.29398694151027244,
	"grad_norm": 1.3917083740234375,
	"learning_rate": 0.00019291786365683599,
	"loss": 1.2915,
	"step": 2150
	},
	{
	"epoch": 0.3008238471267904,
	"grad_norm": 6.717157363891602,
	"learning_rate": 0.00019250297745230615,
	"loss": 0.9168,
	"step": 2200
	},
	{
	"epoch": 0.3076607527433084,
	"grad_norm": 7.835381507873535,
	"learning_rate": 0.00019207675452692259,
	"loss": 1.0267,
	"step": 2250
	},
	{
	"epoch": 0.31449765835982635,
	"grad_norm": 4.236868858337402,
	"learning_rate": 0.00019163924711652092,
	"loss": 1.1836,
	"step": 2300
	},
	{
	"epoch": 0.3213345639763443,
	"grad_norm": 4.367033004760742,
	"learning_rate": 0.00019119050883990903,
	"loss": 1.1023,
	"step": 2350
	},
	{
	"epoch": 0.32817146959286225,
	"grad_norm": 8.43916130065918,
	"learning_rate": 0.00019073059469229602,
	"loss": 1.1884,
	"step": 2400
	},
	{
	"epoch": 0.33500837520938026,
	"grad_norm": 7.896825790405273,
	"learning_rate": 0.0001902595610385519,
	"loss": 1.1764,
	"step": 2450
	},
	{
	"epoch": 0.3418452808258982,
	"grad_norm": 3.5363454818725586,
	"learning_rate": 0.00018977746560630012,
	"loss": 1.1172,
	"step": 2500
	},
	{
	"epoch": 0.34868218644241616,
	"grad_norm": 12.307855606079102,
	"learning_rate": 0.00018928436747884253,
	"loss": 1.078,
	"step": 2550
	},
	{
	"epoch": 0.3555190920589341,
	"grad_norm": 8.765337944030762,
	"learning_rate": 0.00018878032708791854,
	"loss": 1.1449,
	"step": 2600
	},
	{
	"epoch": 0.36235599767545207,
	"grad_norm": 11.366116523742676,
	"learning_rate": 0.00018826540620629873,
	"loss": 1.1117,
	"step": 2650
	},
	{
	"epoch": 0.3691929032919701,
	"grad_norm": 3.603243112564087,
	"learning_rate": 0.0001877396679402145,
	"loss": 1.1138,
	"step": 2700
	},
	{
	"epoch": 0.37602980890848803,
	"grad_norm": 8.020549774169922,
	"learning_rate": 0.00018720317672162392,
	"loss": 1.0474,
	"step": 2750
	},
	{
	"epoch": 0.382866714525006,
	"grad_norm": 4.786285877227783,
	"learning_rate": 0.00018665599830031533,
	"loss": 1.1041,
	"step": 2800
	},
	{
	"epoch": 0.38970362014152393,
	"grad_norm": 7.1555633544921875,
	"learning_rate": 0.00018609819973584924,
	"loss": 1.0623,
	"step": 2850
	},
	{
	"epoch": 0.3965405257580419,
	"grad_norm": 6.989715576171875,
	"learning_rate": 0.00018552984938934006,
	"loss": 0.9318,
	"step": 2900
	},
	{
	"epoch": 0.4033774313745599,
	"grad_norm": 7.150449752807617,
	"learning_rate": 0.00018495101691507783,
	"loss": 1.132,
	"step": 2950
	},
	{
	"epoch": 0.41021433699107784,
	"grad_norm": 4.584231853485107,
	"learning_rate": 0.00018436177325199192,
	"loss": 1.1382,
	"step": 3000
	},
	{
	"epoch": 0.4170512426075958,
	"grad_norm": 5.139730930328369,
	"learning_rate": 0.00018376219061495694,
	"loss": 1.0452,
	"step": 3050
	},
	{
	"epoch": 0.42388814822411375,
	"grad_norm": 15.497014999389648,
	"learning_rate": 0.00018315234248594264,
	"loss": 1.0451,
	"step": 3100
	},
	{
	"epoch": 0.43072505384063176,
	"grad_norm": 3.4872303009033203,
	"learning_rate": 0.0001825323036050081,
	"loss": 1.131,
	"step": 3150
	},
	{
	"epoch": 0.4375619594571497,
	"grad_norm": 11.307365417480469,
	"learning_rate": 0.00018190214996114206,
	"loss": 1.1382,
	"step": 3200
	},
	{
	"epoch": 0.44439886507366766,
	"grad_norm": 5.577065467834473,
	"learning_rate": 0.00018126195878295006,
	"loss": 1.1045,
	"step": 3250
	},
	{
	"epoch": 0.4512357706901856,
	"grad_norm": 14.33316421508789,
	"learning_rate": 0.0001806118085291896,
	"loss": 1.0887,
	"step": 3300
	},
	{
	"epoch": 0.45807267630670356,
	"grad_norm": 15.240452766418457,
	"learning_rate": 0.00017995177887915475,
	"loss": 1.0171,
	"step": 3350
	},
	{
	"epoch": 0.46490958192322157,
	"grad_norm": 10.07467269897461,
	"learning_rate": 0.00017928195072291093,
	"loss": 1.0966,
	"step": 3400
	},
	{
	"epoch": 0.4717464875397395,
	"grad_norm": 2.930840253829956,
	"learning_rate": 0.00017860240615138142,
	"loss": 1.0418,
	"step": 3450
	},
	{
	"epoch": 0.4785833931562575,
	"grad_norm": 30.01850700378418,
	"learning_rate": 0.00017791322844628677,
	"loss": 0.9635,
	"step": 3500
	},
	{
	"epoch": 0.4854202987727754,
	"grad_norm": 5.433286666870117,
	"learning_rate": 0.0001772145020699381,
	"loss": 1.0108,
	"step": 3550
	},
	{
	"epoch": 0.4922572043892934,
	"grad_norm": 3.0814309120178223,
	"learning_rate": 0.0001765063126548858,
	"loss": 1.1257,
	"step": 3600
	},
	{
	"epoch": 0.4990941100058114,
	"grad_norm": 79.82017517089844,
	"learning_rate": 0.00017578874699342493,
	"loss": 1.1214,
	"step": 3650
	},
	{
	"epoch": 0.5059310156223293,
	"grad_norm": 8.51614761352539,
	"learning_rate": 0.00017506189302695827,
	"loss": 0.8635,
	"step": 3700
	},
	{
	"epoch": 0.5127679212388473,
	"grad_norm": 8.251550674438477,
	"learning_rate": 0.0001743258398352187,
	"loss": 0.9361,
	"step": 3750
	},
	{
	"epoch": 0.5196048268553652,
	"grad_norm": 3.81523060798645,
	"learning_rate": 0.00017358067762535186,
	"loss": 1.066,
	"step": 3800
	},
	{
	"epoch": 0.5264417324718832,
	"grad_norm": 15.210460662841797,
	"learning_rate": 0.00017282649772086114,
	"loss": 0.9778,
	"step": 3850
	},
	{
	"epoch": 0.5332786380884011,
	"grad_norm": 5.145527362823486,
	"learning_rate": 0.0001720633925504151,
	"loss": 1.0966,
	"step": 3900
	},
	{
	"epoch": 0.5401155437049191,
	"grad_norm": 3.485656261444092,
	"learning_rate": 0.00017129145563652014,
	"loss": 0.6889,
	"step": 3950
	},
	{
	"epoch": 0.5469524493214372,
	"grad_norm": 7.915320873260498,
	"learning_rate": 0.00017051078158405872,
	"loss": 0.9154,
	"step": 4000
	},
	{
	"epoch": 0.5469524493214372,
	"eval_loss": 0.24666446447372437,
	"eval_runtime": 301.8017,
	"eval_samples_per_second": 26.759,
	"eval_steps_per_second": 3.347,
	"step": 4000
	},
	{
	"epoch": 0.5537893549379551,
	"grad_norm": 12.610590934753418,
	"learning_rate": 0.00016972146606869507,
	"loss": 0.8612,
	"step": 4050
	},
	{
	"epoch": 0.5606262605544731,
	"grad_norm": 34.93125915527344,
	"learning_rate": 0.00016892360582514967,
	"loss": 1.0867,
	"step": 4100
	},
	{
	"epoch": 0.567463166170991,
	"grad_norm": 7.39677095413208,
	"learning_rate": 0.00016811729863534377,
	"loss": 1.1106,
	"step": 4150
	},
	{
	"epoch": 0.574300071787509,
	"grad_norm": 2.4880149364471436,
	"learning_rate": 0.00016730264331641585,
	"loss": 0.9142,
	"step": 4200
	},
	{
	"epoch": 0.5811369774040269,
	"grad_norm": 19.268964767456055,
	"learning_rate": 0.00016647973970861104,
	"loss": 0.9408,
	"step": 4250
	},
	{
	"epoch": 0.5879738830205449,
	"grad_norm": 62.558837890625,
	"learning_rate": 0.00016564868866304517,
	"loss": 1.1798,
	"step": 4300
	},
	{
	"epoch": 0.5948107886370628,
	"grad_norm": 12.449636459350586,
	"learning_rate": 0.00016480959202934487,
	"loss": 0.9386,
	"step": 4350
	},
	{
	"epoch": 0.6016476942535808,
	"grad_norm": 9.708828926086426,
	"learning_rate": 0.00016396255264316547,
	"loss": 1.0766,
	"step": 4400
	},
	{
	"epoch": 0.6084845998700988,
	"grad_norm": 4.00963020324707,
	"learning_rate": 0.0001631076743135879,
	"loss": 0.9953,
	"step": 4450
	},
	{
	"epoch": 0.6153215054866168,
	"grad_norm": 14.70906925201416,
	"learning_rate": 0.0001622450618103964,
	"loss": 1.1006,
	"step": 4500
	},
	{
	"epoch": 0.6221584111031347,
	"grad_norm": 2.471301317214966,
	"learning_rate": 0.00016137482085123832,
	"loss": 0.7397,
	"step": 4550
	},
	{
	"epoch": 0.6289953167196527,
	"grad_norm": 0.671847939491272,
	"learning_rate": 0.00016049705808866805,
	"loss": 1.1298,
	"step": 4600
	},
	{
	"epoch": 0.6358322223361706,
	"grad_norm": 11.712217330932617,
	"learning_rate": 0.000159611881097076,
	"loss": 0.8828,
	"step": 4650
	},
	{
	"epoch": 0.6426691279526886,
	"grad_norm": 90.13214111328125,
	"learning_rate": 0.00015871939835950503,
	"loss": 1.085,
	"step": 4700
	},
	{
	"epoch": 0.6495060335692066,
	"grad_norm": 2.1299564838409424,
	"learning_rate": 0.00015781971925435498,
	"loss": 1.0104,
	"step": 4750
	},
	{
	"epoch": 0.6563429391857245,
	"grad_norm": 44.118778228759766,
	"learning_rate": 0.0001569129540419781,
	"loss": 0.8905,
	"step": 4800
	},
	{
	"epoch": 0.6631798448022425,
	"grad_norm": 20.966922760009766,
	"learning_rate": 0.00015599921385116582,
	"loss": 0.9239,
	"step": 4850
	},
	{
	"epoch": 0.6700167504187605,
	"grad_norm": 13.358034133911133,
	"learning_rate": 0.00015507861066552955,
	"loss": 0.8589,
	"step": 4900
	},
	{
	"epoch": 0.6768536560352785,
	"grad_norm": 5.739938259124756,
	"learning_rate": 0.00015415125730977626,
	"loss": 1.0661,
	"step": 4950
	},
	{
	"epoch": 0.6836905616517964,
	"grad_norm": 25.265790939331055,
	"learning_rate": 0.00015321726743588155,
	"loss": 0.9046,
	"step": 5000
	},
	{
	"epoch": 0.6905274672683144,
	"grad_norm": 22.772367477416992,
	"learning_rate": 0.00015227675550916073,
	"loss": 1.0174,
	"step": 5050
	},
	{
	"epoch": 0.6973643728848323,
	"grad_norm": 4.18620491027832,
	"learning_rate": 0.0001513298367942405,
	"loss": 0.9916,
	"step": 5100
	},
	{
	"epoch": 0.7042012785013503,
	"grad_norm": 10.113117218017578,
	"learning_rate": 0.00015037662734093286,
	"loss": 0.9635,
	"step": 5150
	},
	{
	"epoch": 0.7110381841178682,
	"grad_norm": 1.7103244066238403,
	"learning_rate": 0.0001494172439700126,
	"loss": 0.8927,
	"step": 5200
	},
	{
	"epoch": 0.7178750897343862,
	"grad_norm": 24.236433029174805,
	"learning_rate": 0.0001484518042589,
	"loss": 0.9438,
	"step": 5250
	},
	{
	"epoch": 0.7247119953509041,
	"grad_norm": 2.4070262908935547,
	"learning_rate": 0.00014748042652725152,
	"loss": 1.095,
	"step": 5300
	},
	{
	"epoch": 0.7315489009674222,
	"grad_norm": 4.471241474151611,
	"learning_rate": 0.0001465032298224588,
	"loss": 0.8205,
	"step": 5350
	},
	{
	"epoch": 0.7383858065839402,
	"grad_norm": 1.757636547088623,
	"learning_rate": 0.0001455203339050589,
	"loss": 0.9177,
	"step": 5400
	},
	{
	"epoch": 0.7452227122004581,
	"grad_norm": 1.5365773439407349,
	"learning_rate": 0.0001445318592340571,
	"loss": 0.7696,
	"step": 5450
	},
	{
	"epoch": 0.7520596178169761,
	"grad_norm": 1.7077670097351074,
	"learning_rate": 0.00014353792695216382,
	"loss": 0.9342,
	"step": 5500
	},
	{
	"epoch": 0.758896523433494,
	"grad_norm": 28.525236129760742,
	"learning_rate": 0.00014253865887094817,
	"loss": 0.9897,
	"step": 5550
	},
	{
	"epoch": 0.765733429050012,
	"grad_norm": 15.281404495239258,
	"learning_rate": 0.00014153417745590914,
	"loss": 0.8873,
	"step": 5600
	},
	{
	"epoch": 0.7725703346665299,
	"grad_norm": 1.1002103090286255,
	"learning_rate": 0.00014052460581146696,
	"loss": 0.7727,
	"step": 5650
	},
	{
	"epoch": 0.7794072402830479,
	"grad_norm": 4.395946025848389,
	"learning_rate": 0.00013951006766587586,
	"loss": 0.8922,
	"step": 5700
	},
	{
	"epoch": 0.7862441458995658,
	"grad_norm": 5.225406169891357,
	"learning_rate": 0.0001384906873560607,
	"loss": 0.9766,
	"step": 5750
	},
	{
	"epoch": 0.7930810515160838,
	"grad_norm": 6.0966315269470215,
	"learning_rate": 0.00013746658981237867,
	"loss": 1.1373,
	"step": 5800
	},
	{
	"epoch": 0.7999179571326018,
	"grad_norm": 14.155887603759766,
	"learning_rate": 0.00013643790054330846,
	"loss": 0.8954,
	"step": 5850
	},
	{
	"epoch": 0.8067548627491198,
	"grad_norm": 2.6549534797668457,
	"learning_rate": 0.0001354047456200687,
	"loss": 1.0428,
	"step": 5900
	},
	{
	"epoch": 0.8135917683656377,
	"grad_norm": 7.79277229309082,
	"learning_rate": 0.0001343672516611671,
	"loss": 0.8715,
	"step": 5950
	},
	{
	"epoch": 0.8204286739821557,
	"grad_norm": 17.183149337768555,
	"learning_rate": 0.00013332554581688271,
	"loss": 1.0601,
	"step": 6000
	},
	{
	"epoch": 0.8204286739821557,
	"eval_loss": 0.20116083323955536,
	"eval_runtime": 301.512,
	"eval_samples_per_second": 26.785,
	"eval_steps_per_second": 3.35,
	"step": 6000
	},
	{
	"epoch": 0.8272655795986736,
	"grad_norm": 10.275203704833984,
	"learning_rate": 0.00013227975575368312,
	"loss": 0.8782,
	"step": 6050
	},
	{
	"epoch": 0.8341024852151916,
	"grad_norm": 3.2849924564361572,
	"learning_rate": 0.0001312300096385781,
	"loss": 0.7405,
	"step": 6100
	},
	{
	"epoch": 0.8409393908317095,
	"grad_norm": 5.1770853996276855,
	"learning_rate": 0.0001301764361234122,
	"loss": 1.0901,
	"step": 6150
	},
	{
	"epoch": 0.8477762964482275,
	"grad_norm": 13.282193183898926,
	"learning_rate": 0.0001291191643290977,
	"loss": 0.9054,
	"step": 6200
	},
	{
	"epoch": 0.8546132020647454,
	"grad_norm": 9.424989700317383,
	"learning_rate": 0.0001280583238297903,
	"loss": 0.9861,
	"step": 6250
	},
	{
	"epoch": 0.8614501076812635,
	"grad_norm": 2.5506229400634766,
	"learning_rate": 0.000126994044637009,
	"loss": 1.0244,
	"step": 6300
	},
	{
	"epoch": 0.8682870132977815,
	"grad_norm": 21.7524471282959,
	"learning_rate": 0.00012592645718370252,
	"loss": 0.9079,
	"step": 6350
	},
	{
	"epoch": 0.8751239189142994,
	"grad_norm": 2.2379355430603027,
	"learning_rate": 0.00012485569230826423,
	"loss": 1.0235,
	"step": 6400
	},
	{
	"epoch": 0.8819608245308174,
	"grad_norm": 18.936904907226562,
	"learning_rate": 0.000123781881238497,
	"loss": 0.8275,
	"step": 6450
	},
	{
	"epoch": 0.8887977301473353,
	"grad_norm": 1.508329153060913,
	"learning_rate": 0.00012270515557553065,
	"loss": 0.9872,
	"step": 6500
	},
	{
	"epoch": 0.8956346357638533,
	"grad_norm": 30.93293571472168,
	"learning_rate": 0.00012162564727769359,
	"loss": 1.0287,
	"step": 6550
	},
	{
	"epoch": 0.9024715413803712,
	"grad_norm": 29.230403900146484,
	"learning_rate": 0.00012054348864434066,
	"loss": 0.627,
	"step": 6600
	},
	{
	"epoch": 0.9093084469968892,
	"grad_norm": 14.68487548828125,
	"learning_rate": 0.00011945881229963898,
	"loss": 0.9562,
	"step": 6650
	},
	{
	"epoch": 0.9161453526134071,
	"grad_norm": 2.035444736480713,
	"learning_rate": 0.00011837175117631436,
	"loss": 0.8726,
	"step": 6700
	},
	{
	"epoch": 0.9229822582299252,
	"grad_norm": 12.931522369384766,
	"learning_rate": 0.0001172824384993596,
	"loss": 0.8823,
	"step": 6750
	},
	{
	"epoch": 0.9298191638464431,
	"grad_norm": 8.330245971679688,
	"learning_rate": 0.00011619100776970713,
	"loss": 0.7179,
	"step": 6800
	},
	{
	"epoch": 0.9366560694629611,
	"grad_norm": 51.09445571899414,
	"learning_rate": 0.00011509759274786776,
	"loss": 0.8627,
	"step": 6850
	},
	{
	"epoch": 0.943492975079479,
	"grad_norm": 26.371118545532227,
	"learning_rate": 0.00011400232743753752,
	"loss": 0.7334,
	"step": 6900
	},
	{
	"epoch": 0.950329880695997,
	"grad_norm": 1.3464198112487793,
	"learning_rate": 0.00011290534606917508,
	"loss": 1.0389,
	"step": 6950
	},
	{
	"epoch": 0.957166786312515,
	"grad_norm": 0.732755184173584,
	"learning_rate": 0.00011180678308355081,
	"loss": 0.8343,
	"step": 7000
	},
	{
	"epoch": 0.9640036919290329,
	"grad_norm": 0.9582768082618713,
	"learning_rate": 0.00011070677311527058,
	"loss": 1.0705,
	"step": 7050
	},
	{
	"epoch": 0.9708405975455509,
	"grad_norm": 0.7923704385757446,
	"learning_rate": 0.00010960545097627548,
	"loss": 0.9725,
	"step": 7100
	},
	{
	"epoch": 0.9776775031620688,
	"grad_norm": 39.650177001953125,
	"learning_rate": 0.00010850295163931992,
	"loss": 0.8721,
	"step": 7150
	},
	{
	"epoch": 0.9845144087785868,
	"grad_norm": 9.212077140808105,
	"learning_rate": 0.00010739941022143007,
	"loss": 0.8079,
	"step": 7200
	},
	{
	"epoch": 0.9913513143951048,
	"grad_norm": 2.591902494430542,
	"learning_rate": 0.00010629496196734452,
	"loss": 1.1336,
	"step": 7250
	},
	{
	"epoch": 0.9981882200116228,
	"grad_norm": 18.618799209594727,
	"learning_rate": 0.00010518974223293936,
	"loss": 1.0463,
	"step": 7300
	},
	{
	"epoch": 1.004922572043893,
	"grad_norm": 8.480158805847168,
	"learning_rate": 0.00010408388646863965,
	"loss": 0.7236,
	"step": 7350
	},
	{
	"epoch": 1.0117594776604109,
	"grad_norm": 3.5370821952819824,
	"learning_rate": 0.00010297753020281911,
	"loss": 0.813,
	"step": 7400
	},
	{
	"epoch": 1.018596383276929,
	"grad_norm": 0.5842294096946716,
	"learning_rate": 0.00010187080902519064,
	"loss": 0.589,
	"step": 7450
	},
	{
	"epoch": 1.0254332888934468,
	"grad_norm": 11.063470840454102,
	"learning_rate": 0.00010076385857018889,
	"loss": 0.9893,
	"step": 7500
	},
	{
	"epoch": 1.0322701945099648,
	"grad_norm": 8.910834312438965,
	"learning_rate": 9.965681450034771e-05,
	"loss": 0.6532,
	"step": 7550
	},
	{
	"epoch": 1.0391071001264827,
	"grad_norm": 0.8395630121231079,
	"learning_rate": 9.854981248967388e-05,
	"loss": 0.6934,
	"step": 7600
	},
	{
	"epoch": 1.0459440057430007,
	"grad_norm": 3.7071163654327393,
	"learning_rate": 9.744298820701968e-05,
	"loss": 0.7911,
	"step": 7650
	},
	{
	"epoch": 1.0527809113595188,
	"grad_norm": 14.003477096557617,
	"learning_rate": 9.633647729945581e-05,
	"loss": 0.7611,
	"step": 7700
	},
	{
	"epoch": 1.0596178169760366,
	"grad_norm": 19.04654884338379,
	"learning_rate": 9.523041537564726e-05,
	"loss": 0.6596,
	"step": 7750
	},
	{
	"epoch": 1.0664547225925547,
	"grad_norm": 52.79182815551758,
	"learning_rate": 9.412493798923383e-05,
	"loss": 0.763,
	"step": 7800
	},
	{
	"epoch": 1.0732916282090725,
	"grad_norm": 1.4399851560592651,
	"learning_rate": 9.3020180622217e-05,
	"loss": 0.667,
	"step": 7850
	},
	{
	"epoch": 1.0801285338255906,
	"grad_norm": 1.6162464618682861,
	"learning_rate": 9.19162786683564e-05,
	"loss": 0.813,
	"step": 7900
	},
	{
	"epoch": 1.0869654394421084,
	"grad_norm": 6.91720724105835,
	"learning_rate": 9.081336741657603e-05,
	"loss": 0.6394,
	"step": 7950
	},
	{
	"epoch": 1.0938023450586265,
	"grad_norm": 7.005824089050293,
	"learning_rate": 8.971158203438443e-05,
	"loss": 0.6949,
	"step": 8000
	},
	{
	"epoch": 1.0938023450586265,
	"eval_loss": 0.22489766776561737,
	"eval_runtime": 301.6603,
	"eval_samples_per_second": 26.772,
	"eval_steps_per_second": 3.348,
	"step": 8000
	},
	{
	"epoch": 1.1006392506751443,
	"grad_norm": 12.64887523651123,
	"learning_rate": 8.861105755130896e-05,
	"loss": 0.6777,
	"step": 8050
	},
	{
	"epoch": 1.1074761562916624,
	"grad_norm": 99.47157287597656,
	"learning_rate": 8.751192884234704e-05,
	"loss": 0.5242,
	"step": 8100
	},
	{
	"epoch": 1.1143130619081805,
	"grad_norm": 2.9147791862487793,
	"learning_rate": 8.641433061143698e-05,
	"loss": 0.6589,
	"step": 8150
	},
	{
	"epoch": 1.1211499675246983,
	"grad_norm": 0.4020586311817169,
	"learning_rate": 8.531839737494878e-05,
	"loss": 0.9058,
	"step": 8200
	},
	{
	"epoch": 1.1279868731412164,
	"grad_norm": 41.31173324584961,
	"learning_rate": 8.422426344519898e-05,
	"loss": 0.5999,
	"step": 8250
	},
	{
	"epoch": 1.1348237787577342,
	"grad_norm": 0.19233907759189606,
	"learning_rate": 8.313206291398948e-05,
	"loss": 0.8461,
	"step": 8300
	},
	{
	"epoch": 1.1416606843742523,
	"grad_norm": 0.5941385626792908,
	"learning_rate": 8.20419296361743e-05,
	"loss": 0.5353,
	"step": 8350
	},
	{
	"epoch": 1.1484975899907701,
	"grad_norm": 6.670557022094727,
	"learning_rate": 8.095399721325481e-05,
	"loss": 0.6484,
	"step": 8400
	},
	{
	"epoch": 1.1553344956072882,
	"grad_norm": 3.8168182373046875,
	"learning_rate": 7.9868398977006e-05,
	"loss": 0.8318,
	"step": 8450
	},
	{
	"epoch": 1.162171401223806,
	"grad_norm": 17.14653778076172,
	"learning_rate": 7.87852679731364e-05,
	"loss": 0.5694,
	"step": 8500
	},
	{
	"epoch": 1.169008306840324,
	"grad_norm": 58.7053108215332,
	"learning_rate": 7.77047369449821e-05,
	"loss": 0.7256,
	"step": 8550
	},
	{
	"epoch": 1.1758452124568421,
	"grad_norm": 0.4155759811401367,
	"learning_rate": 7.66269383172389e-05,
	"loss": 0.604,
	"step": 8600
	},
	{
	"epoch": 1.18268211807336,
	"grad_norm": 1.1354832649230957,
	"learning_rate": 7.555200417973261e-05,
	"loss": 0.7761,
	"step": 8650
	},
	{
	"epoch": 1.189519023689878,
	"grad_norm": 1.1315326690673828,
	"learning_rate": 7.448006627123083e-05,
	"loss": 0.6569,
	"step": 8700
	},
	{
	"epoch": 1.196355929306396,
	"grad_norm": 0.9931478500366211,
	"learning_rate": 7.341125596329783e-05,
	"loss": 0.8456,
	"step": 8750
	},
	{
	"epoch": 1.203192834922914,
	"grad_norm": 2.132953643798828,
	"learning_rate": 7.2345704244194e-05,
	"loss": 0.7142,
	"step": 8800
	},
	{
	"epoch": 1.2100297405394318,
	"grad_norm": 10.148101806640625,
	"learning_rate": 7.12835417028229e-05,
	"loss": 0.7284,
	"step": 8850
	},
	{
	"epoch": 1.2168666461559499,
	"grad_norm": 41.58332824707031,
	"learning_rate": 7.022489851272668e-05,
	"loss": 0.5779,
	"step": 8900
	},
	{
	"epoch": 1.2237035517724677,
	"grad_norm": 4.843736171722412,
	"learning_rate": 6.91699044161326e-05,
	"loss": 0.6783,
	"step": 8950
	},
	{
	"epoch": 1.2305404573889858,
	"grad_norm": 0.4043326675891876,
	"learning_rate": 6.811868870805269e-05,
	"loss": 0.7656,
	"step": 9000
	},
	{
	"epoch": 1.2373773630055038,
	"grad_norm": 3.8934195041656494,
	"learning_rate": 6.70713802204377e-05,
	"loss": 0.5857,
	"step": 9050
	},
	{
	"epoch": 1.2442142686220217,
	"grad_norm": 0.23483966290950775,
	"learning_rate": 6.602810730638829e-05,
	"loss": 0.6388,
	"step": 9100
	},
	{
	"epoch": 1.2510511742385395,
	"grad_norm": 2.1649527549743652,
	"learning_rate": 6.498899782442444e-05,
	"loss": 0.6986,
	"step": 9150
	},
	{
	"epoch": 1.2578880798550576,
	"grad_norm": 82.96743774414062,
	"learning_rate": 6.39541791228161e-05,
	"loss": 0.5563,
	"step": 9200
	},
	{
	"epoch": 1.2647249854715756,
	"grad_norm": 1.8622783422470093,
	"learning_rate": 6.292377802397564e-05,
	"loss": 0.6941,
	"step": 9250
	},
	{
	"epoch": 1.2715618910880935,
	"grad_norm": 1.1985386610031128,
	"learning_rate": 6.189792080891525e-05,
	"loss": 0.6195,
	"step": 9300
	},
	{
	"epoch": 1.2783987967046115,
	"grad_norm": 1.1333106756210327,
	"learning_rate": 6.087673320177058e-05,
	"loss": 0.5675,
	"step": 9350
	},
	{
	"epoch": 1.2852357023211294,
	"grad_norm": 13.326946258544922,
	"learning_rate": 5.9860340354392496e-05,
	"loss": 0.8214,
	"step": 9400
	},
	{
	"epoch": 1.2920726079376474,
	"grad_norm": 10.754223823547363,
	"learning_rate": 5.8848866831009156e-05,
	"loss": 0.663,
	"step": 9450
	},
	{
	"epoch": 1.2989095135541655,
	"grad_norm": 0.07592844218015671,
	"learning_rate": 5.784243659296001e-05,
	"loss": 0.6661,
	"step": 9500
	},
	{
	"epoch": 1.3057464191706833,
	"grad_norm": 4.361905097961426,
	"learning_rate": 5.6841172983503634e-05,
	"loss": 0.6757,
	"step": 9550
	},
	{
	"epoch": 1.3125833247872012,
	"grad_norm": 6.464013576507568,
	"learning_rate": 5.5845198712701396e-05,
	"loss": 0.8568,
	"step": 9600
	},
	{
	"epoch": 1.3194202304037193,
	"grad_norm": 13.971973419189453,
	"learning_rate": 5.485463584237871e-05,
	"loss": 0.5852,
	"step": 9650
	},
	{
	"epoch": 1.3262571360202373,
	"grad_norm": 25.48811149597168,
	"learning_rate": 5.3869605771165755e-05,
	"loss": 0.652,
	"step": 9700
	},
	{
	"epoch": 1.3330940416367552,
	"grad_norm": 5.14886474609375,
	"learning_rate": 5.289022921961948e-05,
	"loss": 0.8247,
	"step": 9750
	},
	{
	"epoch": 1.3399309472532732,
	"grad_norm": 0.6628409028053284,
	"learning_rate": 5.1916626215428385e-05,
	"loss": 0.5708,
	"step": 9800
	},
	{
	"epoch": 1.346767852869791,
	"grad_norm": 81.61123657226562,
	"learning_rate": 5.094891607870296e-05,
	"loss": 0.7523,
	"step": 9850
	},
	{
	"epoch": 1.3536047584863091,
	"grad_norm": 0.597465455532074,
	"learning_rate": 4.998721740735197e-05,
	"loss": 0.7701,
	"step": 9900
	},
	{
	"epoch": 1.3604416641028272,
	"grad_norm": 1.8627650737762451,
	"learning_rate": 4.903164806254804e-05,
	"loss": 0.6589,
	"step": 9950
	},
	{
	"epoch": 1.367278569719345,
	"grad_norm": 0.427298903465271,
	"learning_rate": 4.808232515428268e-05,
	"loss": 0.6476,
	"step": 10000
	},
	{
	"epoch": 1.367278569719345,
	"eval_loss": 0.25095975399017334,
	"eval_runtime": 301.6273,
	"eval_samples_per_second": 26.775,
	"eval_steps_per_second": 3.349,
	"step": 10000
	},
	{
	"epoch": 1.3741154753358629,
	"grad_norm": 0.5417049527168274,
	"learning_rate": 4.713936502701435e-05,
	"loss": 0.7344,
	"step": 10050
	},
	{
	"epoch": 1.380952380952381,
	"grad_norm": 0.30379384756088257,
	"learning_rate": 4.620288324540962e-05,
	"loss": 0.5764,
	"step": 10100
	},
	{
	"epoch": 1.387789286568899,
	"grad_norm": 0.258468359708786,
	"learning_rate": 4.5272994580179895e-05,
	"loss": 0.6794,
	"step": 10150
	},
	{
	"epoch": 1.3946261921854168,
	"grad_norm": 1.2032103538513184,
	"learning_rate": 4.434981299401615e-05,
	"loss": 0.5931,
	"step": 10200
	},
	{
	"epoch": 1.401463097801935,
	"grad_norm": 4.064381122589111,
	"learning_rate": 4.3433451627621743e-05,
	"loss": 0.4061,
	"step": 10250
	},
	{
	"epoch": 1.4083000034184527,
	"grad_norm": 1.0236620903015137,
	"learning_rate": 4.2524022785846806e-05,
	"loss": 0.5935,
	"step": 10300
	},
	{
	"epoch": 1.4151369090349708,
	"grad_norm": 0.42589133977890015,
	"learning_rate": 4.1621637923924405e-05,
	"loss": 0.8298,
	"step": 10350
	},
	{
	"epoch": 1.4219738146514889,
	"grad_norm": 9.088717460632324,
	"learning_rate": 4.072640763381127e-05,
	"loss": 0.5821,
	"step": 10400
	},
	{
	"epoch": 1.4288107202680067,
	"grad_norm": 2.854710102081299,
	"learning_rate": 3.983844163063429e-05,
	"loss": 0.6541,
	"step": 10450
	},
	{
	"epoch": 1.4356476258845245,
	"grad_norm": 6.076037406921387,
	"learning_rate": 3.895784873924397e-05,
	"loss": 0.6669,
	"step": 10500
	},
	{
	"epoch": 1.4424845315010426,
	"grad_norm": 0.36614227294921875,
	"learning_rate": 3.8084736880877846e-05,
	"loss": 0.5883,
	"step": 10550
	},
	{
	"epoch": 1.4493214371175607,
	"grad_norm": 82.49917602539062,
	"learning_rate": 3.721921305993391e-05,
	"loss": 0.8045,
	"step": 10600
	},
	{
	"epoch": 1.4561583427340785,
	"grad_norm": 45.616859436035156,
	"learning_rate": 3.636138335085666e-05,
	"loss": 0.4991,
	"step": 10650
	},
	{
	"epoch": 1.4629952483505966,
	"grad_norm": 0.26663124561309814,
	"learning_rate": 3.5511352885137194e-05,
	"loss": 0.4815,
	"step": 10700
	},
	{
	"epoch": 1.4698321539671144,
	"grad_norm": 1.6303415298461914,
	"learning_rate": 3.4669225838428785e-05,
	"loss": 0.4746,
	"step": 10750
	},
	{
	"epoch": 1.4766690595836325,
	"grad_norm": 14.5377779006958,
	"learning_rate": 3.3835105417779687e-05,
	"loss": 0.7877,
	"step": 10800
	},
	{
	"epoch": 1.4835059652001505,
	"grad_norm": 0.08112337440252304,
	"learning_rate": 3.30090938489844e-05,
	"loss": 0.6687,
	"step": 10850
	},
	{
	"epoch": 1.4903428708166684,
	"grad_norm": 7.454471588134766,
	"learning_rate": 3.219129236405548e-05,
	"loss": 0.8063,
	"step": 10900
	},
	{
	"epoch": 1.4971797764331862,
	"grad_norm": 5.5912275314331055,
	"learning_rate": 3.13818011888171e-05,
	"loss": 0.6337,
	"step": 10950
	},
	{
	"epoch": 1.5040166820497043,
	"grad_norm": 7.555117130279541,
	"learning_rate": 3.0580719530621705e-05,
	"loss": 0.6513,
	"step": 11000
	},
	{
	"epoch": 1.5108535876662224,
	"grad_norm": 0.4277037978172302,
	"learning_rate": 2.9788145566191693e-05,
	"loss": 0.603,
	"step": 11050
	},
	{
	"epoch": 1.5176904932827402,
	"grad_norm": 0.3563739061355591,
	"learning_rate": 2.900417642958734e-05,
	"loss": 0.5695,
	"step": 11100
	},
	{
	"epoch": 1.524527398899258,
	"grad_norm": 0.8669344782829285,
	"learning_rate": 2.822890820030264e-05,
	"loss": 0.7372,
	"step": 11150
	},
	{
	"epoch": 1.531364304515776,
	"grad_norm": 10.977109909057617,
	"learning_rate": 2.7462435891490036e-05,
	"loss": 0.6573,
	"step": 11200
	},
	{
	"epoch": 1.5382012101322942,
	"grad_norm": 0.33039143681526184,
	"learning_rate": 2.6704853438316213e-05,
	"loss": 0.4278,
	"step": 11250
	},
	{
	"epoch": 1.5450381157488122,
	"grad_norm": 3.340820550918579,
	"learning_rate": 2.5956253686449882e-05,
	"loss": 0.6281,
	"step": 11300
	},
	{
	"epoch": 1.55187502136533,
	"grad_norm": 6.152026176452637,
	"learning_rate": 2.521672838068295e-05,
	"loss": 0.6859,
	"step": 11350
	},
	{
	"epoch": 1.558711926981848,
	"grad_norm": 0.9645776152610779,
	"learning_rate": 2.4486368153686734e-05,
	"loss": 0.578,
	"step": 11400
	},
	{
	"epoch": 1.565548832598366,
	"grad_norm": 3.5073535442352295,
	"learning_rate": 2.3765262514904617e-05,
	"loss": 0.6756,
	"step": 11450
	},
	{
	"epoch": 1.572385738214884,
	"grad_norm": 1.3473198413848877,
	"learning_rate": 2.305349983958196e-05,
	"loss": 0.6288,
	"step": 11500
	},
	{
	"epoch": 1.5792226438314019,
	"grad_norm": 6.039999961853027,
	"learning_rate": 2.2351167357935422e-05,
	"loss": 0.6274,
	"step": 11550
	},
	{
	"epoch": 1.5860595494479197,
	"grad_norm": 0.9115678668022156,
	"learning_rate": 2.1658351144462362e-05,
	"loss": 0.6303,
	"step": 11600
	},
	{
	"epoch": 1.5928964550644378,
	"grad_norm": 37.31045150756836,
	"learning_rate": 2.097513610739209e-05,
	"loss": 0.7243,
	"step": 11650
	},
	{
	"epoch": 1.5997333606809558,
	"grad_norm": 0.5089764595031738,
	"learning_rate": 2.0301605978279702e-05,
	"loss": 0.507,
	"step": 11700
	},
	{
	"epoch": 1.606570266297474,
	"grad_norm": 16.424047470092773,
	"learning_rate": 1.9637843301744528e-05,
	"loss": 0.6387,
	"step": 11750
	},
	{
	"epoch": 1.6134071719139917,
	"grad_norm": 0.6381849646568298,
	"learning_rate": 1.898392942535383e-05,
	"loss": 0.7143,
	"step": 11800
	},
	{
	"epoch": 1.6202440775305096,
	"grad_norm": 7.240786075592041,
	"learning_rate": 1.833994448965315e-05,
	"loss": 0.7644,
	"step": 11850
	},
	{
	"epoch": 1.6270809831470276,
	"grad_norm": 0.6397457122802734,
	"learning_rate": 1.7705967418344737e-05,
	"loss": 0.5355,
	"step": 11900
	},
	{
	"epoch": 1.6339178887635457,
	"grad_norm": 0.49821093678474426,
	"learning_rate": 1.7082075908615013e-05,
	"loss": 0.7372,
	"step": 11950
	},
	{
	"epoch": 1.6407547943800636,
	"grad_norm": 0.550399124622345,
	"learning_rate": 1.6468346421612447e-05,
	"loss": 0.7474,
	"step": 12000
	},
	{
	"epoch": 1.6407547943800636,
	"eval_loss": 0.26388460397720337,
	"eval_runtime": 300.1264,
	"eval_samples_per_second": 26.909,
	"eval_steps_per_second": 3.365,
	"step": 12000
	},
	{
	"epoch": 1.6475916999965814,
	"grad_norm": 0.1512337028980255,
	"learning_rate": 1.5864854173076714e-05,
	"loss": 0.6831,
	"step": 12050
	},
	{
	"epoch": 1.6544286056130995,
	"grad_norm": 40.49404525756836,
	"learning_rate": 1.52716731241207e-05,
	"loss": 0.7483,
	"step": 12100
	},
	{
	"epoch": 1.6612655112296175,
	"grad_norm": 0.5297091007232666,
	"learning_rate": 1.4688875972166227e-05,
	"loss": 0.5595,
	"step": 12150
	},
	{
	"epoch": 1.6681024168461356,
	"grad_norm": 12.922277450561523,
	"learning_rate": 1.4116534142034488e-05,
	"loss": 0.5817,
	"step": 12200
	},
	{
	"epoch": 1.6749393224626534,
	"grad_norm": 0.4216732382774353,
	"learning_rate": 1.3554717777192605e-05,
	"loss": 0.8905,
	"step": 12250
	},
	{
	"epoch": 1.6817762280791713,
	"grad_norm": 1.1882590055465698,
	"learning_rate": 1.3003495731157312e-05,
	"loss": 0.5435,
	"step": 12300
	},
	{
	"epoch": 1.6886131336956893,
	"grad_norm": 15.241290092468262,
	"learning_rate": 1.2462935559056366e-05,
	"loss": 0.5636,
	"step": 12350
	},
	{
	"epoch": 1.6954500393122074,
	"grad_norm": 1.281235933303833,
	"learning_rate": 1.1933103509349508e-05,
	"loss": 0.4771,
	"step": 12400
	},
	{
	"epoch": 1.7022869449287252,
	"grad_norm": 30.664819717407227,
	"learning_rate": 1.1414064515709255e-05,
	"loss": 0.5598,
	"step": 12450
	},
	{
	"epoch": 1.709123850545243,
	"grad_norm": 3.1145246028900146,
	"learning_rate": 1.0905882189063032e-05,
	"loss": 0.5779,
	"step": 12500
	},
	{
	"epoch": 1.7159607561617611,
	"grad_norm": 4.802779674530029,
	"learning_rate": 1.0408618809797255e-05,
	"loss": 0.5402,
	"step": 12550
	},
	{
	"epoch": 1.7227976617782792,
	"grad_norm": 3.566648006439209,
	"learning_rate": 9.92233532012452e-06,
	"loss": 0.816,
	"step": 12600
	},
	{
	"epoch": 1.7296345673947973,
	"grad_norm": 0.9611634016036987,
	"learning_rate": 9.447091316614965e-06,
	"loss": 0.5813,
	"step": 12650
	},
	{
	"epoch": 1.736471473011315,
	"grad_norm": 2.433220148086548,
	"learning_rate": 8.9829450428922e-06,
	"loss": 0.5628,
	"step": 12700
	},
	{
	"epoch": 1.743308378627833,
	"grad_norm": 0.1846768856048584,
	"learning_rate": 8.529953382495404e-06,
	"loss": 0.7646,
	"step": 12750
	},
	{
	"epoch": 1.750145284244351,
	"grad_norm": 1.4401239156723022,
	"learning_rate": 8.088171851907855e-06,
	"loss": 0.5705,
	"step": 12800
	},
	{
	"epoch": 1.756982189860869,
	"grad_norm": 25.80792236328125,
	"learning_rate": 7.657654593753195e-06,
	"loss": 0.6362,
	"step": 12850
	},
	{
	"epoch": 1.763819095477387,
	"grad_norm": 0.8399425148963928,
	"learning_rate": 7.2384543701598416e-06,
	"loss": 0.7085,
	"step": 12900
	},
	{
	"epoch": 1.7706560010939048,
	"grad_norm": 0.8096999526023865,
	"learning_rate": 6.83062255629483e-06,
	"loss": 0.5368,
	"step": 12950
	},
	{
	"epoch": 1.7774929067104228,
	"grad_norm": 8.902669906616211,
	"learning_rate": 6.43420913406747e-06,
	"loss": 0.5753,
	"step": 13000
	},
	{
	"epoch": 1.7843298123269409,
	"grad_norm": 0.15432903170585632,
	"learning_rate": 6.049262686003787e-06,
	"loss": 0.6055,
	"step": 13050
	},
	{
	"epoch": 1.791166717943459,
	"grad_norm": 14.938940048217773,
	"learning_rate": 5.6758303892925025e-06,
	"loss": 0.7965,
	"step": 13100
	},
	{
	"epoch": 1.7980036235599768,
	"grad_norm": 0.20640145242214203,
	"learning_rate": 5.313958010003261e-06,
	"loss": 0.5362,
	"step": 13150
	},
	{
	"epoch": 1.8048405291764946,
	"grad_norm": 0.42624762654304504,
	"learning_rate": 4.963689897477664e-06,
	"loss": 0.6298,
	"step": 13200
	},
	{
	"epoch": 1.8116774347930127,
	"grad_norm": 14.088078498840332,
	"learning_rate": 4.625068978894131e-06,
	"loss": 0.5166,
	"step": 13250
	},
	{
	"epoch": 1.8185143404095307,
	"grad_norm": 8.906865119934082,
	"learning_rate": 4.298136754006854e-06,
	"loss": 0.6144,
	"step": 13300
	},
	{
	"epoch": 1.8253512460260486,
	"grad_norm": 0.16211865842342377,
	"learning_rate": 3.982933290059887e-06,
	"loss": 0.446,
	"step": 13350
	},
	{
	"epoch": 1.8321881516425664,
	"grad_norm": 25.307283401489258,
	"learning_rate": 3.6794972168766594e-06,
	"loss": 0.525,
	"step": 13400
	},
	{
	"epoch": 1.8390250572590845,
	"grad_norm": 41.81796646118164,
	"learning_rate": 3.387865722125594e-06,
	"loss": 0.7377,
	"step": 13450
	},
	{
	"epoch": 1.8458619628756026,
	"grad_norm": 0.09296048432588577,
	"learning_rate": 3.10807454676274e-06,
	"loss": 0.5175,
	"step": 13500
	},
	{
	"epoch": 1.8526988684921206,
	"grad_norm": 113.21685791015625,
	"learning_rate": 2.8401579806514035e-06,
	"loss": 0.7324,
	"step": 13550
	},
	{
	"epoch": 1.8595357741086385,
	"grad_norm": 13.23887825012207,
	"learning_rate": 2.5841488583597696e-06,
	"loss": 0.4255,
	"step": 13600
	},
	{
	"epoch": 1.8663726797251563,
	"grad_norm": 0.3335596024990082,
	"learning_rate": 2.3400785551369043e-06,
	"loss": 0.4865,
	"step": 13650
	},
	{
	"epoch": 1.8732095853416744,
	"grad_norm": 1.1101493835449219,
	"learning_rate": 2.1079769830674836e-06,
	"loss": 0.5834,
	"step": 13700
	},
	{
	"epoch": 1.8800464909581924,
	"grad_norm": 0.44824355840682983,
	"learning_rate": 1.8878725874060144e-06,
	"loss": 0.6434,
	"step": 13750
	},
	{
	"epoch": 1.8868833965747103,
	"grad_norm": 0.7179256081581116,
	"learning_rate": 1.6797923430905583e-06,
	"loss": 0.5649,
	"step": 13800
	},
	{
	"epoch": 1.893720302191228,
	"grad_norm": 0.6279736757278442,
	"learning_rate": 1.4837617514370073e-06,
	"loss": 0.6663,
	"step": 13850
	},
	{
	"epoch": 1.9005572078077462,
	"grad_norm": 2.146757125854492,
	"learning_rate": 1.2998048370135963e-06,
	"loss": 0.5003,
	"step": 13900
	},
	{
	"epoch": 1.9073941134242642,
	"grad_norm": 0.2452065795660019,
	"learning_rate": 1.127944144696691e-06,
	"loss": 0.7167,
	"step": 13950
	},
	{
	"epoch": 1.9142310190407823,
	"grad_norm": 0.2389650195837021,
	"learning_rate": 9.682007369077095e-07,
	"loss": 0.5836,
	"step": 14000
	},
	{
	"epoch": 1.9142310190407823,
	"eval_loss": 0.2555805742740631,
	"eval_runtime": 299.5823,
	"eval_samples_per_second": 26.958,
	"eval_steps_per_second": 3.371,
	"step": 14000
	},
	{
	"epoch": 1.9210679246573001,
	"grad_norm": 20.409788131713867,
	"learning_rate": 8.205941910318426e-07,
	"loss": 0.5573,
	"step": 14050
	},
	{
	"epoch": 1.927904830273818,
	"grad_norm": 0.6842173933982849,
	"learning_rate": 6.851425970187952e-07,
	"loss": 0.5594,
	"step": 14100
	},
	{
	"epoch": 1.934741735890336,
	"grad_norm": 11.089654922485352,
	"learning_rate": 5.618625551656708e-07,
	"loss": 0.6967,
	"step": 14150
	},
	{
	"epoch": 1.941578641506854,
	"grad_norm": 12.126336097717285,
	"learning_rate": 4.507691740825881e-07,
	"loss": 0.677,
	"step": 14200
	},
	{
	"epoch": 1.948415547123372,
	"grad_norm": 0.44369152188301086,
	"learning_rate": 3.518760688410283e-07,
	"loss": 0.6566,
	"step": 14250
	},
	{
	"epoch": 1.9552524527398898,
	"grad_norm": 11.187239646911621,
	"learning_rate": 2.651953593052481e-07,
	"loss": 0.5174,
	"step": 14300
	},
	{
	"epoch": 1.9620893583564079,
	"grad_norm": 15.362393379211426,
	"learning_rate": 1.907376686468787e-07,
	"loss": 0.5426,
	"step": 14350
	},
	{
	"epoch": 1.968926263972926,
	"grad_norm": 0.2329702377319336,
	"learning_rate": 1.2851212204304518e-07,
	"loss": 0.6944,
	"step": 14400
	},
	{
	"epoch": 1.975763169589444,
	"grad_norm": 0.7811570763587952,
	"learning_rate": 7.852634555803873e-08,
	"loss": 0.5647,
	"step": 14450
	},
	{
	"epoch": 1.9826000752059618,
	"grad_norm": 1.2399488687515259,
	"learning_rate": 4.078646520866425e-08,
	"loss": 0.6162,
	"step": 14500
	},
	{
	"epoch": 1.9894369808224797,
	"grad_norm": 0.4023188352584839,
	"learning_rate": 1.5297106213485458e-08,
	"loss": 0.4718,
	"step": 14550
	},
	{
	"epoch": 1.9962738864389977,
	"grad_norm": 0.1795218139886856,
	"learning_rate": 2.061392425978248e-09,
	"loss": 0.5667,
	"step": 14600
	}
	],
	"logging_steps": 50,
	"max_steps": 14628,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2,
	"save_steps": 2000,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 3.8538290358499676e+18,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}