{ "best_global_step": 6000, "best_metric": 0.20116083323955536, "best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000", "epoch": 2.0, "eval_steps": 2000, "global_step": 14628, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006836905616517964, "grad_norm": 1.572303056716919, "learning_rate": 2.232346241457859e-05, "loss": 2.3604, "step": 50 }, { "epoch": 0.013673811233035928, "grad_norm": 5.201236248016357, "learning_rate": 4.510250569476082e-05, "loss": 2.1118, "step": 100 }, { "epoch": 0.02051071684955389, "grad_norm": 9.312570571899414, "learning_rate": 6.788154897494306e-05, "loss": 1.8332, "step": 150 }, { "epoch": 0.027347622466071857, "grad_norm": 8.565587043762207, "learning_rate": 9.066059225512529e-05, "loss": 1.9173, "step": 200 }, { "epoch": 0.03418452808258982, "grad_norm": 3.824556350708008, "learning_rate": 0.00011343963553530752, "loss": 1.6633, "step": 250 }, { "epoch": 0.04102143369910778, "grad_norm": 5.49424934387207, "learning_rate": 0.00013621867881548976, "loss": 1.6122, "step": 300 }, { "epoch": 0.04785833931562575, "grad_norm": 6.3185038566589355, "learning_rate": 0.000158997722095672, "loss": 1.5782, "step": 350 }, { "epoch": 0.05469524493214371, "grad_norm": 3.980173349380493, "learning_rate": 0.00018177676537585422, "loss": 1.444, "step": 400 }, { "epoch": 0.06153215054866167, "grad_norm": 5.797272682189941, "learning_rate": 0.00019999975488719786, "loss": 1.5752, "step": 450 }, { "epoch": 0.06836905616517965, "grad_norm": 11.263846397399902, "learning_rate": 0.0001999911760652904, "loss": 1.3607, "step": 500 }, { "epoch": 0.0752059617816976, "grad_norm": 4.273462772369385, "learning_rate": 0.0001999703428048544, "loss": 1.5023, "step": 550 }, { "epoch": 0.08204286739821556, "grad_norm": 2.9854705333709717, "learning_rate": 0.00019993725765911436, "loss": 1.3747, "step": 600 }, { "epoch": 0.08887977301473353, "grad_norm": 2.9444832801818848, "learning_rate": 0.0001998919246828268, "loss": 1.4708, "step": 650 }, { "epoch": 0.0957166786312515, "grad_norm": 3.348857879638672, "learning_rate": 0.00019983434943178372, "loss": 1.439, "step": 700 }, { "epoch": 0.10255358424776946, "grad_norm": 5.90728759765625, "learning_rate": 0.00019976453896213152, "loss": 1.5048, "step": 750 }, { "epoch": 0.10939048986428743, "grad_norm": 2.6572535037994385, "learning_rate": 0.0001996825018295062, "loss": 1.5023, "step": 800 }, { "epoch": 0.11622739548080539, "grad_norm": 4.219803810119629, "learning_rate": 0.00019958824808798494, "loss": 1.5814, "step": 850 }, { "epoch": 0.12306430109732334, "grad_norm": 5.457417964935303, "learning_rate": 0.00019948178928885378, "loss": 1.4203, "step": 900 }, { "epoch": 0.1299012067138413, "grad_norm": 5.302417278289795, "learning_rate": 0.00019936313847919218, "loss": 1.3299, "step": 950 }, { "epoch": 0.1367381123303593, "grad_norm": 4.385361194610596, "learning_rate": 0.00019923231020027368, "loss": 1.3468, "step": 1000 }, { "epoch": 0.14357501794687724, "grad_norm": 4.836021423339844, "learning_rate": 0.00019908932048578416, "loss": 1.2813, "step": 1050 }, { "epoch": 0.1504119235633952, "grad_norm": 4.949122905731201, "learning_rate": 0.00019893418685985658, "loss": 1.311, "step": 1100 }, { "epoch": 0.15724882917991317, "grad_norm": 6.123111248016357, "learning_rate": 0.00019876692833492343, "loss": 1.342, "step": 1150 }, { "epoch": 0.16408573479643113, "grad_norm": 5.803433418273926, "learning_rate": 0.0001985875654093866, "loss": 1.2384, "step": 1200 }, { "epoch": 0.1709226404129491, "grad_norm": 3.196314811706543, "learning_rate": 0.00019839612006510517, "loss": 1.3117, "step": 1250 }, { "epoch": 0.17775954602946706, "grad_norm": 6.21234130859375, "learning_rate": 0.00019819261576470152, "loss": 1.2307, "step": 1300 }, { "epoch": 0.18459645164598504, "grad_norm": 3.274829149246216, "learning_rate": 0.00019797707744868582, "loss": 1.2408, "step": 1350 }, { "epoch": 0.191433357262503, "grad_norm": 5.5120320320129395, "learning_rate": 0.0001977495315323993, "loss": 1.324, "step": 1400 }, { "epoch": 0.19827026287902094, "grad_norm": 7.289828777313232, "learning_rate": 0.0001975100059027772, "loss": 1.2039, "step": 1450 }, { "epoch": 0.20510716849553892, "grad_norm": 4.040754795074463, "learning_rate": 0.00019725852991493083, "loss": 1.3405, "step": 1500 }, { "epoch": 0.21194407411205687, "grad_norm": 52.13080596923828, "learning_rate": 0.00019699513438854995, "loss": 1.2005, "step": 1550 }, { "epoch": 0.21878097972857485, "grad_norm": 5.0520429611206055, "learning_rate": 0.00019671985160412593, "loss": 1.0046, "step": 1600 }, { "epoch": 0.2256178853450928, "grad_norm": 1.7626160383224487, "learning_rate": 0.00019643271529899532, "loss": 1.1398, "step": 1650 }, { "epoch": 0.23245479096161079, "grad_norm": 2.1751222610473633, "learning_rate": 0.00019613376066320525, "loss": 1.1519, "step": 1700 }, { "epoch": 0.23929169657812874, "grad_norm": 4.483262062072754, "learning_rate": 0.00019582302433520074, "loss": 1.144, "step": 1750 }, { "epoch": 0.2461286021946467, "grad_norm": 2.494478702545166, "learning_rate": 0.00019550054439733449, "loss": 1.1908, "step": 1800 }, { "epoch": 0.25296550781116467, "grad_norm": 14.6198091506958, "learning_rate": 0.00019516636037119952, "loss": 1.0791, "step": 1850 }, { "epoch": 0.2598024134276826, "grad_norm": 1.5368318557739258, "learning_rate": 0.00019482051321278592, "loss": 1.1994, "step": 1900 }, { "epoch": 0.2666393190442006, "grad_norm": 6.854203701019287, "learning_rate": 0.00019446304530746112, "loss": 1.1871, "step": 1950 }, { "epoch": 0.2734762246607186, "grad_norm": 3.686593770980835, "learning_rate": 0.00019409400046477559, "loss": 1.0619, "step": 2000 }, { "epoch": 0.2734762246607186, "eval_loss": 0.3232106864452362, "eval_runtime": 301.3298, "eval_samples_per_second": 26.801, "eval_steps_per_second": 3.352, "step": 2000 }, { "epoch": 0.28031313027723653, "grad_norm": 2.84173321723938, "learning_rate": 0.00019371342391309363, "loss": 1.1769, "step": 2050 }, { "epoch": 0.2871500358937545, "grad_norm": 6.158025741577148, "learning_rate": 0.00019332136229405043, "loss": 1.1985, "step": 2100 }, { "epoch": 0.29398694151027244, "grad_norm": 1.3917083740234375, "learning_rate": 0.00019291786365683599, "loss": 1.2915, "step": 2150 }, { "epoch": 0.3008238471267904, "grad_norm": 6.717157363891602, "learning_rate": 0.00019250297745230615, "loss": 0.9168, "step": 2200 }, { "epoch": 0.3076607527433084, "grad_norm": 7.835381507873535, "learning_rate": 0.00019207675452692259, "loss": 1.0267, "step": 2250 }, { "epoch": 0.31449765835982635, "grad_norm": 4.236868858337402, "learning_rate": 0.00019163924711652092, "loss": 1.1836, "step": 2300 }, { "epoch": 0.3213345639763443, "grad_norm": 4.367033004760742, "learning_rate": 0.00019119050883990903, "loss": 1.1023, "step": 2350 }, { "epoch": 0.32817146959286225, "grad_norm": 8.43916130065918, "learning_rate": 0.00019073059469229602, "loss": 1.1884, "step": 2400 }, { "epoch": 0.33500837520938026, "grad_norm": 7.896825790405273, "learning_rate": 0.0001902595610385519, "loss": 1.1764, "step": 2450 }, { "epoch": 0.3418452808258982, "grad_norm": 3.5363454818725586, "learning_rate": 0.00018977746560630012, "loss": 1.1172, "step": 2500 }, { "epoch": 0.34868218644241616, "grad_norm": 12.307855606079102, "learning_rate": 0.00018928436747884253, "loss": 1.078, "step": 2550 }, { "epoch": 0.3555190920589341, "grad_norm": 8.765337944030762, "learning_rate": 0.00018878032708791854, "loss": 1.1449, "step": 2600 }, { "epoch": 0.36235599767545207, "grad_norm": 11.366116523742676, "learning_rate": 0.00018826540620629873, "loss": 1.1117, "step": 2650 }, { "epoch": 0.3691929032919701, "grad_norm": 3.603243112564087, "learning_rate": 0.0001877396679402145, "loss": 1.1138, "step": 2700 }, { "epoch": 0.37602980890848803, "grad_norm": 8.020549774169922, "learning_rate": 0.00018720317672162392, "loss": 1.0474, "step": 2750 }, { "epoch": 0.382866714525006, "grad_norm": 4.786285877227783, "learning_rate": 0.00018665599830031533, "loss": 1.1041, "step": 2800 }, { "epoch": 0.38970362014152393, "grad_norm": 7.1555633544921875, "learning_rate": 0.00018609819973584924, "loss": 1.0623, "step": 2850 }, { "epoch": 0.3965405257580419, "grad_norm": 6.989715576171875, "learning_rate": 0.00018552984938934006, "loss": 0.9318, "step": 2900 }, { "epoch": 0.4033774313745599, "grad_norm": 7.150449752807617, "learning_rate": 0.00018495101691507783, "loss": 1.132, "step": 2950 }, { "epoch": 0.41021433699107784, "grad_norm": 4.584231853485107, "learning_rate": 0.00018436177325199192, "loss": 1.1382, "step": 3000 }, { "epoch": 0.4170512426075958, "grad_norm": 5.139730930328369, "learning_rate": 0.00018376219061495694, "loss": 1.0452, "step": 3050 }, { "epoch": 0.42388814822411375, "grad_norm": 15.497014999389648, "learning_rate": 0.00018315234248594264, "loss": 1.0451, "step": 3100 }, { "epoch": 0.43072505384063176, "grad_norm": 3.4872303009033203, "learning_rate": 0.0001825323036050081, "loss": 1.131, "step": 3150 }, { "epoch": 0.4375619594571497, "grad_norm": 11.307365417480469, "learning_rate": 0.00018190214996114206, "loss": 1.1382, "step": 3200 }, { "epoch": 0.44439886507366766, "grad_norm": 5.577065467834473, "learning_rate": 0.00018126195878295006, "loss": 1.1045, "step": 3250 }, { "epoch": 0.4512357706901856, "grad_norm": 14.33316421508789, "learning_rate": 0.0001806118085291896, "loss": 1.0887, "step": 3300 }, { "epoch": 0.45807267630670356, "grad_norm": 15.240452766418457, "learning_rate": 0.00017995177887915475, "loss": 1.0171, "step": 3350 }, { "epoch": 0.46490958192322157, "grad_norm": 10.07467269897461, "learning_rate": 0.00017928195072291093, "loss": 1.0966, "step": 3400 }, { "epoch": 0.4717464875397395, "grad_norm": 2.930840253829956, "learning_rate": 0.00017860240615138142, "loss": 1.0418, "step": 3450 }, { "epoch": 0.4785833931562575, "grad_norm": 30.01850700378418, "learning_rate": 0.00017791322844628677, "loss": 0.9635, "step": 3500 }, { "epoch": 0.4854202987727754, "grad_norm": 5.433286666870117, "learning_rate": 0.0001772145020699381, "loss": 1.0108, "step": 3550 }, { "epoch": 0.4922572043892934, "grad_norm": 3.0814309120178223, "learning_rate": 0.0001765063126548858, "loss": 1.1257, "step": 3600 }, { "epoch": 0.4990941100058114, "grad_norm": 79.82017517089844, "learning_rate": 0.00017578874699342493, "loss": 1.1214, "step": 3650 }, { "epoch": 0.5059310156223293, "grad_norm": 8.51614761352539, "learning_rate": 0.00017506189302695827, "loss": 0.8635, "step": 3700 }, { "epoch": 0.5127679212388473, "grad_norm": 8.251550674438477, "learning_rate": 0.0001743258398352187, "loss": 0.9361, "step": 3750 }, { "epoch": 0.5196048268553652, "grad_norm": 3.81523060798645, "learning_rate": 0.00017358067762535186, "loss": 1.066, "step": 3800 }, { "epoch": 0.5264417324718832, "grad_norm": 15.210460662841797, "learning_rate": 0.00017282649772086114, "loss": 0.9778, "step": 3850 }, { "epoch": 0.5332786380884011, "grad_norm": 5.145527362823486, "learning_rate": 0.0001720633925504151, "loss": 1.0966, "step": 3900 }, { "epoch": 0.5401155437049191, "grad_norm": 3.485656261444092, "learning_rate": 0.00017129145563652014, "loss": 0.6889, "step": 3950 }, { "epoch": 0.5469524493214372, "grad_norm": 7.915320873260498, "learning_rate": 0.00017051078158405872, "loss": 0.9154, "step": 4000 }, { "epoch": 0.5469524493214372, "eval_loss": 0.24666446447372437, "eval_runtime": 301.8017, "eval_samples_per_second": 26.759, "eval_steps_per_second": 3.347, "step": 4000 }, { "epoch": 0.5537893549379551, "grad_norm": 12.610590934753418, "learning_rate": 0.00016972146606869507, "loss": 0.8612, "step": 4050 }, { "epoch": 0.5606262605544731, "grad_norm": 34.93125915527344, "learning_rate": 0.00016892360582514967, "loss": 1.0867, "step": 4100 }, { "epoch": 0.567463166170991, "grad_norm": 7.39677095413208, "learning_rate": 0.00016811729863534377, "loss": 1.1106, "step": 4150 }, { "epoch": 0.574300071787509, "grad_norm": 2.4880149364471436, "learning_rate": 0.00016730264331641585, "loss": 0.9142, "step": 4200 }, { "epoch": 0.5811369774040269, "grad_norm": 19.268964767456055, "learning_rate": 0.00016647973970861104, "loss": 0.9408, "step": 4250 }, { "epoch": 0.5879738830205449, "grad_norm": 62.558837890625, "learning_rate": 0.00016564868866304517, "loss": 1.1798, "step": 4300 }, { "epoch": 0.5948107886370628, "grad_norm": 12.449636459350586, "learning_rate": 0.00016480959202934487, "loss": 0.9386, "step": 4350 }, { "epoch": 0.6016476942535808, "grad_norm": 9.708828926086426, "learning_rate": 0.00016396255264316547, "loss": 1.0766, "step": 4400 }, { "epoch": 0.6084845998700988, "grad_norm": 4.00963020324707, "learning_rate": 0.0001631076743135879, "loss": 0.9953, "step": 4450 }, { "epoch": 0.6153215054866168, "grad_norm": 14.70906925201416, "learning_rate": 0.0001622450618103964, "loss": 1.1006, "step": 4500 }, { "epoch": 0.6221584111031347, "grad_norm": 2.471301317214966, "learning_rate": 0.00016137482085123832, "loss": 0.7397, "step": 4550 }, { "epoch": 0.6289953167196527, "grad_norm": 0.671847939491272, "learning_rate": 0.00016049705808866805, "loss": 1.1298, "step": 4600 }, { "epoch": 0.6358322223361706, "grad_norm": 11.712217330932617, "learning_rate": 0.000159611881097076, "loss": 0.8828, "step": 4650 }, { "epoch": 0.6426691279526886, "grad_norm": 90.13214111328125, "learning_rate": 0.00015871939835950503, "loss": 1.085, "step": 4700 }, { "epoch": 0.6495060335692066, "grad_norm": 2.1299564838409424, "learning_rate": 0.00015781971925435498, "loss": 1.0104, "step": 4750 }, { "epoch": 0.6563429391857245, "grad_norm": 44.118778228759766, "learning_rate": 0.0001569129540419781, "loss": 0.8905, "step": 4800 }, { "epoch": 0.6631798448022425, "grad_norm": 20.966922760009766, "learning_rate": 0.00015599921385116582, "loss": 0.9239, "step": 4850 }, { "epoch": 0.6700167504187605, "grad_norm": 13.358034133911133, "learning_rate": 0.00015507861066552955, "loss": 0.8589, "step": 4900 }, { "epoch": 0.6768536560352785, "grad_norm": 5.739938259124756, "learning_rate": 0.00015415125730977626, "loss": 1.0661, "step": 4950 }, { "epoch": 0.6836905616517964, "grad_norm": 25.265790939331055, "learning_rate": 0.00015321726743588155, "loss": 0.9046, "step": 5000 }, { "epoch": 0.6905274672683144, "grad_norm": 22.772367477416992, "learning_rate": 0.00015227675550916073, "loss": 1.0174, "step": 5050 }, { "epoch": 0.6973643728848323, "grad_norm": 4.18620491027832, "learning_rate": 0.0001513298367942405, "loss": 0.9916, "step": 5100 }, { "epoch": 0.7042012785013503, "grad_norm": 10.113117218017578, "learning_rate": 0.00015037662734093286, "loss": 0.9635, "step": 5150 }, { "epoch": 0.7110381841178682, "grad_norm": 1.7103244066238403, "learning_rate": 0.0001494172439700126, "loss": 0.8927, "step": 5200 }, { "epoch": 0.7178750897343862, "grad_norm": 24.236433029174805, "learning_rate": 0.0001484518042589, "loss": 0.9438, "step": 5250 }, { "epoch": 0.7247119953509041, "grad_norm": 2.4070262908935547, "learning_rate": 0.00014748042652725152, "loss": 1.095, "step": 5300 }, { "epoch": 0.7315489009674222, "grad_norm": 4.471241474151611, "learning_rate": 0.0001465032298224588, "loss": 0.8205, "step": 5350 }, { "epoch": 0.7383858065839402, "grad_norm": 1.757636547088623, "learning_rate": 0.0001455203339050589, "loss": 0.9177, "step": 5400 }, { "epoch": 0.7452227122004581, "grad_norm": 1.5365773439407349, "learning_rate": 0.0001445318592340571, "loss": 0.7696, "step": 5450 }, { "epoch": 0.7520596178169761, "grad_norm": 1.7077670097351074, "learning_rate": 0.00014353792695216382, "loss": 0.9342, "step": 5500 }, { "epoch": 0.758896523433494, "grad_norm": 28.525236129760742, "learning_rate": 0.00014253865887094817, "loss": 0.9897, "step": 5550 }, { "epoch": 0.765733429050012, "grad_norm": 15.281404495239258, "learning_rate": 0.00014153417745590914, "loss": 0.8873, "step": 5600 }, { "epoch": 0.7725703346665299, "grad_norm": 1.1002103090286255, "learning_rate": 0.00014052460581146696, "loss": 0.7727, "step": 5650 }, { "epoch": 0.7794072402830479, "grad_norm": 4.395946025848389, "learning_rate": 0.00013951006766587586, "loss": 0.8922, "step": 5700 }, { "epoch": 0.7862441458995658, "grad_norm": 5.225406169891357, "learning_rate": 0.0001384906873560607, "loss": 0.9766, "step": 5750 }, { "epoch": 0.7930810515160838, "grad_norm": 6.0966315269470215, "learning_rate": 0.00013746658981237867, "loss": 1.1373, "step": 5800 }, { "epoch": 0.7999179571326018, "grad_norm": 14.155887603759766, "learning_rate": 0.00013643790054330846, "loss": 0.8954, "step": 5850 }, { "epoch": 0.8067548627491198, "grad_norm": 2.6549534797668457, "learning_rate": 0.0001354047456200687, "loss": 1.0428, "step": 5900 }, { "epoch": 0.8135917683656377, "grad_norm": 7.79277229309082, "learning_rate": 0.0001343672516611671, "loss": 0.8715, "step": 5950 }, { "epoch": 0.8204286739821557, "grad_norm": 17.183149337768555, "learning_rate": 0.00013332554581688271, "loss": 1.0601, "step": 6000 }, { "epoch": 0.8204286739821557, "eval_loss": 0.20116083323955536, "eval_runtime": 301.512, "eval_samples_per_second": 26.785, "eval_steps_per_second": 3.35, "step": 6000 }, { "epoch": 0.8272655795986736, "grad_norm": 10.275203704833984, "learning_rate": 0.00013227975575368312, "loss": 0.8782, "step": 6050 }, { "epoch": 0.8341024852151916, "grad_norm": 3.2849924564361572, "learning_rate": 0.0001312300096385781, "loss": 0.7405, "step": 6100 }, { "epoch": 0.8409393908317095, "grad_norm": 5.1770853996276855, "learning_rate": 0.0001301764361234122, "loss": 1.0901, "step": 6150 }, { "epoch": 0.8477762964482275, "grad_norm": 13.282193183898926, "learning_rate": 0.0001291191643290977, "loss": 0.9054, "step": 6200 }, { "epoch": 0.8546132020647454, "grad_norm": 9.424989700317383, "learning_rate": 0.0001280583238297903, "loss": 0.9861, "step": 6250 }, { "epoch": 0.8614501076812635, "grad_norm": 2.5506229400634766, "learning_rate": 0.000126994044637009, "loss": 1.0244, "step": 6300 }, { "epoch": 0.8682870132977815, "grad_norm": 21.7524471282959, "learning_rate": 0.00012592645718370252, "loss": 0.9079, "step": 6350 }, { "epoch": 0.8751239189142994, "grad_norm": 2.2379355430603027, "learning_rate": 0.00012485569230826423, "loss": 1.0235, "step": 6400 }, { "epoch": 0.8819608245308174, "grad_norm": 18.936904907226562, "learning_rate": 0.000123781881238497, "loss": 0.8275, "step": 6450 }, { "epoch": 0.8887977301473353, "grad_norm": 1.508329153060913, "learning_rate": 0.00012270515557553065, "loss": 0.9872, "step": 6500 }, { "epoch": 0.8956346357638533, "grad_norm": 30.93293571472168, "learning_rate": 0.00012162564727769359, "loss": 1.0287, "step": 6550 }, { "epoch": 0.9024715413803712, "grad_norm": 29.230403900146484, "learning_rate": 0.00012054348864434066, "loss": 0.627, "step": 6600 }, { "epoch": 0.9093084469968892, "grad_norm": 14.68487548828125, "learning_rate": 0.00011945881229963898, "loss": 0.9562, "step": 6650 }, { "epoch": 0.9161453526134071, "grad_norm": 2.035444736480713, "learning_rate": 0.00011837175117631436, "loss": 0.8726, "step": 6700 }, { "epoch": 0.9229822582299252, "grad_norm": 12.931522369384766, "learning_rate": 0.0001172824384993596, "loss": 0.8823, "step": 6750 }, { "epoch": 0.9298191638464431, "grad_norm": 8.330245971679688, "learning_rate": 0.00011619100776970713, "loss": 0.7179, "step": 6800 }, { "epoch": 0.9366560694629611, "grad_norm": 51.09445571899414, "learning_rate": 0.00011509759274786776, "loss": 0.8627, "step": 6850 }, { "epoch": 0.943492975079479, "grad_norm": 26.371118545532227, "learning_rate": 0.00011400232743753752, "loss": 0.7334, "step": 6900 }, { "epoch": 0.950329880695997, "grad_norm": 1.3464198112487793, "learning_rate": 0.00011290534606917508, "loss": 1.0389, "step": 6950 }, { "epoch": 0.957166786312515, "grad_norm": 0.732755184173584, "learning_rate": 0.00011180678308355081, "loss": 0.8343, "step": 7000 }, { "epoch": 0.9640036919290329, "grad_norm": 0.9582768082618713, "learning_rate": 0.00011070677311527058, "loss": 1.0705, "step": 7050 }, { "epoch": 0.9708405975455509, "grad_norm": 0.7923704385757446, "learning_rate": 0.00010960545097627548, "loss": 0.9725, "step": 7100 }, { "epoch": 0.9776775031620688, "grad_norm": 39.650177001953125, "learning_rate": 0.00010850295163931992, "loss": 0.8721, "step": 7150 }, { "epoch": 0.9845144087785868, "grad_norm": 9.212077140808105, "learning_rate": 0.00010739941022143007, "loss": 0.8079, "step": 7200 }, { "epoch": 0.9913513143951048, "grad_norm": 2.591902494430542, "learning_rate": 0.00010629496196734452, "loss": 1.1336, "step": 7250 }, { "epoch": 0.9981882200116228, "grad_norm": 18.618799209594727, "learning_rate": 0.00010518974223293936, "loss": 1.0463, "step": 7300 }, { "epoch": 1.004922572043893, "grad_norm": 8.480158805847168, "learning_rate": 0.00010408388646863965, "loss": 0.7236, "step": 7350 }, { "epoch": 1.0117594776604109, "grad_norm": 3.5370821952819824, "learning_rate": 0.00010297753020281911, "loss": 0.813, "step": 7400 }, { "epoch": 1.018596383276929, "grad_norm": 0.5842294096946716, "learning_rate": 0.00010187080902519064, "loss": 0.589, "step": 7450 }, { "epoch": 1.0254332888934468, "grad_norm": 11.063470840454102, "learning_rate": 0.00010076385857018889, "loss": 0.9893, "step": 7500 }, { "epoch": 1.0322701945099648, "grad_norm": 8.910834312438965, "learning_rate": 9.965681450034771e-05, "loss": 0.6532, "step": 7550 }, { "epoch": 1.0391071001264827, "grad_norm": 0.8395630121231079, "learning_rate": 9.854981248967388e-05, "loss": 0.6934, "step": 7600 }, { "epoch": 1.0459440057430007, "grad_norm": 3.7071163654327393, "learning_rate": 9.744298820701968e-05, "loss": 0.7911, "step": 7650 }, { "epoch": 1.0527809113595188, "grad_norm": 14.003477096557617, "learning_rate": 9.633647729945581e-05, "loss": 0.7611, "step": 7700 }, { "epoch": 1.0596178169760366, "grad_norm": 19.04654884338379, "learning_rate": 9.523041537564726e-05, "loss": 0.6596, "step": 7750 }, { "epoch": 1.0664547225925547, "grad_norm": 52.79182815551758, "learning_rate": 9.412493798923383e-05, "loss": 0.763, "step": 7800 }, { "epoch": 1.0732916282090725, "grad_norm": 1.4399851560592651, "learning_rate": 9.3020180622217e-05, "loss": 0.667, "step": 7850 }, { "epoch": 1.0801285338255906, "grad_norm": 1.6162464618682861, "learning_rate": 9.19162786683564e-05, "loss": 0.813, "step": 7900 }, { "epoch": 1.0869654394421084, "grad_norm": 6.91720724105835, "learning_rate": 9.081336741657603e-05, "loss": 0.6394, "step": 7950 }, { "epoch": 1.0938023450586265, "grad_norm": 7.005824089050293, "learning_rate": 8.971158203438443e-05, "loss": 0.6949, "step": 8000 }, { "epoch": 1.0938023450586265, "eval_loss": 0.22489766776561737, "eval_runtime": 301.6603, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.348, "step": 8000 }, { "epoch": 1.1006392506751443, "grad_norm": 12.64887523651123, "learning_rate": 8.861105755130896e-05, "loss": 0.6777, "step": 8050 }, { "epoch": 1.1074761562916624, "grad_norm": 99.47157287597656, "learning_rate": 8.751192884234704e-05, "loss": 0.5242, "step": 8100 }, { "epoch": 1.1143130619081805, "grad_norm": 2.9147791862487793, "learning_rate": 8.641433061143698e-05, "loss": 0.6589, "step": 8150 }, { "epoch": 1.1211499675246983, "grad_norm": 0.4020586311817169, "learning_rate": 8.531839737494878e-05, "loss": 0.9058, "step": 8200 }, { "epoch": 1.1279868731412164, "grad_norm": 41.31173324584961, "learning_rate": 8.422426344519898e-05, "loss": 0.5999, "step": 8250 }, { "epoch": 1.1348237787577342, "grad_norm": 0.19233907759189606, "learning_rate": 8.313206291398948e-05, "loss": 0.8461, "step": 8300 }, { "epoch": 1.1416606843742523, "grad_norm": 0.5941385626792908, "learning_rate": 8.20419296361743e-05, "loss": 0.5353, "step": 8350 }, { "epoch": 1.1484975899907701, "grad_norm": 6.670557022094727, "learning_rate": 8.095399721325481e-05, "loss": 0.6484, "step": 8400 }, { "epoch": 1.1553344956072882, "grad_norm": 3.8168182373046875, "learning_rate": 7.9868398977006e-05, "loss": 0.8318, "step": 8450 }, { "epoch": 1.162171401223806, "grad_norm": 17.14653778076172, "learning_rate": 7.87852679731364e-05, "loss": 0.5694, "step": 8500 }, { "epoch": 1.169008306840324, "grad_norm": 58.7053108215332, "learning_rate": 7.77047369449821e-05, "loss": 0.7256, "step": 8550 }, { "epoch": 1.1758452124568421, "grad_norm": 0.4155759811401367, "learning_rate": 7.66269383172389e-05, "loss": 0.604, "step": 8600 }, { "epoch": 1.18268211807336, "grad_norm": 1.1354832649230957, "learning_rate": 7.555200417973261e-05, "loss": 0.7761, "step": 8650 }, { "epoch": 1.189519023689878, "grad_norm": 1.1315326690673828, "learning_rate": 7.448006627123083e-05, "loss": 0.6569, "step": 8700 }, { "epoch": 1.196355929306396, "grad_norm": 0.9931478500366211, "learning_rate": 7.341125596329783e-05, "loss": 0.8456, "step": 8750 }, { "epoch": 1.203192834922914, "grad_norm": 2.132953643798828, "learning_rate": 7.2345704244194e-05, "loss": 0.7142, "step": 8800 }, { "epoch": 1.2100297405394318, "grad_norm": 10.148101806640625, "learning_rate": 7.12835417028229e-05, "loss": 0.7284, "step": 8850 }, { "epoch": 1.2168666461559499, "grad_norm": 41.58332824707031, "learning_rate": 7.022489851272668e-05, "loss": 0.5779, "step": 8900 }, { "epoch": 1.2237035517724677, "grad_norm": 4.843736171722412, "learning_rate": 6.91699044161326e-05, "loss": 0.6783, "step": 8950 }, { "epoch": 1.2305404573889858, "grad_norm": 0.4043326675891876, "learning_rate": 6.811868870805269e-05, "loss": 0.7656, "step": 9000 }, { "epoch": 1.2373773630055038, "grad_norm": 3.8934195041656494, "learning_rate": 6.70713802204377e-05, "loss": 0.5857, "step": 9050 }, { "epoch": 1.2442142686220217, "grad_norm": 0.23483966290950775, "learning_rate": 6.602810730638829e-05, "loss": 0.6388, "step": 9100 }, { "epoch": 1.2510511742385395, "grad_norm": 2.1649527549743652, "learning_rate": 6.498899782442444e-05, "loss": 0.6986, "step": 9150 }, { "epoch": 1.2578880798550576, "grad_norm": 82.96743774414062, "learning_rate": 6.39541791228161e-05, "loss": 0.5563, "step": 9200 }, { "epoch": 1.2647249854715756, "grad_norm": 1.8622783422470093, "learning_rate": 6.292377802397564e-05, "loss": 0.6941, "step": 9250 }, { "epoch": 1.2715618910880935, "grad_norm": 1.1985386610031128, "learning_rate": 6.189792080891525e-05, "loss": 0.6195, "step": 9300 }, { "epoch": 1.2783987967046115, "grad_norm": 1.1333106756210327, "learning_rate": 6.087673320177058e-05, "loss": 0.5675, "step": 9350 }, { "epoch": 1.2852357023211294, "grad_norm": 13.326946258544922, "learning_rate": 5.9860340354392496e-05, "loss": 0.8214, "step": 9400 }, { "epoch": 1.2920726079376474, "grad_norm": 10.754223823547363, "learning_rate": 5.8848866831009156e-05, "loss": 0.663, "step": 9450 }, { "epoch": 1.2989095135541655, "grad_norm": 0.07592844218015671, "learning_rate": 5.784243659296001e-05, "loss": 0.6661, "step": 9500 }, { "epoch": 1.3057464191706833, "grad_norm": 4.361905097961426, "learning_rate": 5.6841172983503634e-05, "loss": 0.6757, "step": 9550 }, { "epoch": 1.3125833247872012, "grad_norm": 6.464013576507568, "learning_rate": 5.5845198712701396e-05, "loss": 0.8568, "step": 9600 }, { "epoch": 1.3194202304037193, "grad_norm": 13.971973419189453, "learning_rate": 5.485463584237871e-05, "loss": 0.5852, "step": 9650 }, { "epoch": 1.3262571360202373, "grad_norm": 25.48811149597168, "learning_rate": 5.3869605771165755e-05, "loss": 0.652, "step": 9700 }, { "epoch": 1.3330940416367552, "grad_norm": 5.14886474609375, "learning_rate": 5.289022921961948e-05, "loss": 0.8247, "step": 9750 }, { "epoch": 1.3399309472532732, "grad_norm": 0.6628409028053284, "learning_rate": 5.1916626215428385e-05, "loss": 0.5708, "step": 9800 }, { "epoch": 1.346767852869791, "grad_norm": 81.61123657226562, "learning_rate": 5.094891607870296e-05, "loss": 0.7523, "step": 9850 }, { "epoch": 1.3536047584863091, "grad_norm": 0.597465455532074, "learning_rate": 4.998721740735197e-05, "loss": 0.7701, "step": 9900 }, { "epoch": 1.3604416641028272, "grad_norm": 1.8627650737762451, "learning_rate": 4.903164806254804e-05, "loss": 0.6589, "step": 9950 }, { "epoch": 1.367278569719345, "grad_norm": 0.427298903465271, "learning_rate": 4.808232515428268e-05, "loss": 0.6476, "step": 10000 }, { "epoch": 1.367278569719345, "eval_loss": 0.25095975399017334, "eval_runtime": 301.6273, "eval_samples_per_second": 26.775, "eval_steps_per_second": 3.349, "step": 10000 }, { "epoch": 1.3741154753358629, "grad_norm": 0.5417049527168274, "learning_rate": 4.713936502701435e-05, "loss": 0.7344, "step": 10050 }, { "epoch": 1.380952380952381, "grad_norm": 0.30379384756088257, "learning_rate": 4.620288324540962e-05, "loss": 0.5764, "step": 10100 }, { "epoch": 1.387789286568899, "grad_norm": 0.258468359708786, "learning_rate": 4.5272994580179895e-05, "loss": 0.6794, "step": 10150 }, { "epoch": 1.3946261921854168, "grad_norm": 1.2032103538513184, "learning_rate": 4.434981299401615e-05, "loss": 0.5931, "step": 10200 }, { "epoch": 1.401463097801935, "grad_norm": 4.064381122589111, "learning_rate": 4.3433451627621743e-05, "loss": 0.4061, "step": 10250 }, { "epoch": 1.4083000034184527, "grad_norm": 1.0236620903015137, "learning_rate": 4.2524022785846806e-05, "loss": 0.5935, "step": 10300 }, { "epoch": 1.4151369090349708, "grad_norm": 0.42589133977890015, "learning_rate": 4.1621637923924405e-05, "loss": 0.8298, "step": 10350 }, { "epoch": 1.4219738146514889, "grad_norm": 9.088717460632324, "learning_rate": 4.072640763381127e-05, "loss": 0.5821, "step": 10400 }, { "epoch": 1.4288107202680067, "grad_norm": 2.854710102081299, "learning_rate": 3.983844163063429e-05, "loss": 0.6541, "step": 10450 }, { "epoch": 1.4356476258845245, "grad_norm": 6.076037406921387, "learning_rate": 3.895784873924397e-05, "loss": 0.6669, "step": 10500 }, { "epoch": 1.4424845315010426, "grad_norm": 0.36614227294921875, "learning_rate": 3.8084736880877846e-05, "loss": 0.5883, "step": 10550 }, { "epoch": 1.4493214371175607, "grad_norm": 82.49917602539062, "learning_rate": 3.721921305993391e-05, "loss": 0.8045, "step": 10600 }, { "epoch": 1.4561583427340785, "grad_norm": 45.616859436035156, "learning_rate": 3.636138335085666e-05, "loss": 0.4991, "step": 10650 }, { "epoch": 1.4629952483505966, "grad_norm": 0.26663124561309814, "learning_rate": 3.5511352885137194e-05, "loss": 0.4815, "step": 10700 }, { "epoch": 1.4698321539671144, "grad_norm": 1.6303415298461914, "learning_rate": 3.4669225838428785e-05, "loss": 0.4746, "step": 10750 }, { "epoch": 1.4766690595836325, "grad_norm": 14.5377779006958, "learning_rate": 3.3835105417779687e-05, "loss": 0.7877, "step": 10800 }, { "epoch": 1.4835059652001505, "grad_norm": 0.08112337440252304, "learning_rate": 3.30090938489844e-05, "loss": 0.6687, "step": 10850 }, { "epoch": 1.4903428708166684, "grad_norm": 7.454471588134766, "learning_rate": 3.219129236405548e-05, "loss": 0.8063, "step": 10900 }, { "epoch": 1.4971797764331862, "grad_norm": 5.5912275314331055, "learning_rate": 3.13818011888171e-05, "loss": 0.6337, "step": 10950 }, { "epoch": 1.5040166820497043, "grad_norm": 7.555117130279541, "learning_rate": 3.0580719530621705e-05, "loss": 0.6513, "step": 11000 }, { "epoch": 1.5108535876662224, "grad_norm": 0.4277037978172302, "learning_rate": 2.9788145566191693e-05, "loss": 0.603, "step": 11050 }, { "epoch": 1.5176904932827402, "grad_norm": 0.3563739061355591, "learning_rate": 2.900417642958734e-05, "loss": 0.5695, "step": 11100 }, { "epoch": 1.524527398899258, "grad_norm": 0.8669344782829285, "learning_rate": 2.822890820030264e-05, "loss": 0.7372, "step": 11150 }, { "epoch": 1.531364304515776, "grad_norm": 10.977109909057617, "learning_rate": 2.7462435891490036e-05, "loss": 0.6573, "step": 11200 }, { "epoch": 1.5382012101322942, "grad_norm": 0.33039143681526184, "learning_rate": 2.6704853438316213e-05, "loss": 0.4278, "step": 11250 }, { "epoch": 1.5450381157488122, "grad_norm": 3.340820550918579, "learning_rate": 2.5956253686449882e-05, "loss": 0.6281, "step": 11300 }, { "epoch": 1.55187502136533, "grad_norm": 6.152026176452637, "learning_rate": 2.521672838068295e-05, "loss": 0.6859, "step": 11350 }, { "epoch": 1.558711926981848, "grad_norm": 0.9645776152610779, "learning_rate": 2.4486368153686734e-05, "loss": 0.578, "step": 11400 }, { "epoch": 1.565548832598366, "grad_norm": 3.5073535442352295, "learning_rate": 2.3765262514904617e-05, "loss": 0.6756, "step": 11450 }, { "epoch": 1.572385738214884, "grad_norm": 1.3473198413848877, "learning_rate": 2.305349983958196e-05, "loss": 0.6288, "step": 11500 }, { "epoch": 1.5792226438314019, "grad_norm": 6.039999961853027, "learning_rate": 2.2351167357935422e-05, "loss": 0.6274, "step": 11550 }, { "epoch": 1.5860595494479197, "grad_norm": 0.9115678668022156, "learning_rate": 2.1658351144462362e-05, "loss": 0.6303, "step": 11600 }, { "epoch": 1.5928964550644378, "grad_norm": 37.31045150756836, "learning_rate": 2.097513610739209e-05, "loss": 0.7243, "step": 11650 }, { "epoch": 1.5997333606809558, "grad_norm": 0.5089764595031738, "learning_rate": 2.0301605978279702e-05, "loss": 0.507, "step": 11700 }, { "epoch": 1.606570266297474, "grad_norm": 16.424047470092773, "learning_rate": 1.9637843301744528e-05, "loss": 0.6387, "step": 11750 }, { "epoch": 1.6134071719139917, "grad_norm": 0.6381849646568298, "learning_rate": 1.898392942535383e-05, "loss": 0.7143, "step": 11800 }, { "epoch": 1.6202440775305096, "grad_norm": 7.240786075592041, "learning_rate": 1.833994448965315e-05, "loss": 0.7644, "step": 11850 }, { "epoch": 1.6270809831470276, "grad_norm": 0.6397457122802734, "learning_rate": 1.7705967418344737e-05, "loss": 0.5355, "step": 11900 }, { "epoch": 1.6339178887635457, "grad_norm": 0.49821093678474426, "learning_rate": 1.7082075908615013e-05, "loss": 0.7372, "step": 11950 }, { "epoch": 1.6407547943800636, "grad_norm": 0.550399124622345, "learning_rate": 1.6468346421612447e-05, "loss": 0.7474, "step": 12000 }, { "epoch": 1.6407547943800636, "eval_loss": 0.26388460397720337, "eval_runtime": 300.1264, "eval_samples_per_second": 26.909, "eval_steps_per_second": 3.365, "step": 12000 }, { "epoch": 1.6475916999965814, "grad_norm": 0.1512337028980255, "learning_rate": 1.5864854173076714e-05, "loss": 0.6831, "step": 12050 }, { "epoch": 1.6544286056130995, "grad_norm": 40.49404525756836, "learning_rate": 1.52716731241207e-05, "loss": 0.7483, "step": 12100 }, { "epoch": 1.6612655112296175, "grad_norm": 0.5297091007232666, "learning_rate": 1.4688875972166227e-05, "loss": 0.5595, "step": 12150 }, { "epoch": 1.6681024168461356, "grad_norm": 12.922277450561523, "learning_rate": 1.4116534142034488e-05, "loss": 0.5817, "step": 12200 }, { "epoch": 1.6749393224626534, "grad_norm": 0.4216732382774353, "learning_rate": 1.3554717777192605e-05, "loss": 0.8905, "step": 12250 }, { "epoch": 1.6817762280791713, "grad_norm": 1.1882590055465698, "learning_rate": 1.3003495731157312e-05, "loss": 0.5435, "step": 12300 }, { "epoch": 1.6886131336956893, "grad_norm": 15.241290092468262, "learning_rate": 1.2462935559056366e-05, "loss": 0.5636, "step": 12350 }, { "epoch": 1.6954500393122074, "grad_norm": 1.281235933303833, "learning_rate": 1.1933103509349508e-05, "loss": 0.4771, "step": 12400 }, { "epoch": 1.7022869449287252, "grad_norm": 30.664819717407227, "learning_rate": 1.1414064515709255e-05, "loss": 0.5598, "step": 12450 }, { "epoch": 1.709123850545243, "grad_norm": 3.1145246028900146, "learning_rate": 1.0905882189063032e-05, "loss": 0.5779, "step": 12500 }, { "epoch": 1.7159607561617611, "grad_norm": 4.802779674530029, "learning_rate": 1.0408618809797255e-05, "loss": 0.5402, "step": 12550 }, { "epoch": 1.7227976617782792, "grad_norm": 3.566648006439209, "learning_rate": 9.92233532012452e-06, "loss": 0.816, "step": 12600 }, { "epoch": 1.7296345673947973, "grad_norm": 0.9611634016036987, "learning_rate": 9.447091316614965e-06, "loss": 0.5813, "step": 12650 }, { "epoch": 1.736471473011315, "grad_norm": 2.433220148086548, "learning_rate": 8.9829450428922e-06, "loss": 0.5628, "step": 12700 }, { "epoch": 1.743308378627833, "grad_norm": 0.1846768856048584, "learning_rate": 8.529953382495404e-06, "loss": 0.7646, "step": 12750 }, { "epoch": 1.750145284244351, "grad_norm": 1.4401239156723022, "learning_rate": 8.088171851907855e-06, "loss": 0.5705, "step": 12800 }, { "epoch": 1.756982189860869, "grad_norm": 25.80792236328125, "learning_rate": 7.657654593753195e-06, "loss": 0.6362, "step": 12850 }, { "epoch": 1.763819095477387, "grad_norm": 0.8399425148963928, "learning_rate": 7.2384543701598416e-06, "loss": 0.7085, "step": 12900 }, { "epoch": 1.7706560010939048, "grad_norm": 0.8096999526023865, "learning_rate": 6.83062255629483e-06, "loss": 0.5368, "step": 12950 }, { "epoch": 1.7774929067104228, "grad_norm": 8.902669906616211, "learning_rate": 6.43420913406747e-06, "loss": 0.5753, "step": 13000 }, { "epoch": 1.7843298123269409, "grad_norm": 0.15432903170585632, "learning_rate": 6.049262686003787e-06, "loss": 0.6055, "step": 13050 }, { "epoch": 1.791166717943459, "grad_norm": 14.938940048217773, "learning_rate": 5.6758303892925025e-06, "loss": 0.7965, "step": 13100 }, { "epoch": 1.7980036235599768, "grad_norm": 0.20640145242214203, "learning_rate": 5.313958010003261e-06, "loss": 0.5362, "step": 13150 }, { "epoch": 1.8048405291764946, "grad_norm": 0.42624762654304504, "learning_rate": 4.963689897477664e-06, "loss": 0.6298, "step": 13200 }, { "epoch": 1.8116774347930127, "grad_norm": 14.088078498840332, "learning_rate": 4.625068978894131e-06, "loss": 0.5166, "step": 13250 }, { "epoch": 1.8185143404095307, "grad_norm": 8.906865119934082, "learning_rate": 4.298136754006854e-06, "loss": 0.6144, "step": 13300 }, { "epoch": 1.8253512460260486, "grad_norm": 0.16211865842342377, "learning_rate": 3.982933290059887e-06, "loss": 0.446, "step": 13350 }, { "epoch": 1.8321881516425664, "grad_norm": 25.307283401489258, "learning_rate": 3.6794972168766594e-06, "loss": 0.525, "step": 13400 }, { "epoch": 1.8390250572590845, "grad_norm": 41.81796646118164, "learning_rate": 3.387865722125594e-06, "loss": 0.7377, "step": 13450 }, { "epoch": 1.8458619628756026, "grad_norm": 0.09296048432588577, "learning_rate": 3.10807454676274e-06, "loss": 0.5175, "step": 13500 }, { "epoch": 1.8526988684921206, "grad_norm": 113.21685791015625, "learning_rate": 2.8401579806514035e-06, "loss": 0.7324, "step": 13550 }, { "epoch": 1.8595357741086385, "grad_norm": 13.23887825012207, "learning_rate": 2.5841488583597696e-06, "loss": 0.4255, "step": 13600 }, { "epoch": 1.8663726797251563, "grad_norm": 0.3335596024990082, "learning_rate": 2.3400785551369043e-06, "loss": 0.4865, "step": 13650 }, { "epoch": 1.8732095853416744, "grad_norm": 1.1101493835449219, "learning_rate": 2.1079769830674836e-06, "loss": 0.5834, "step": 13700 }, { "epoch": 1.8800464909581924, "grad_norm": 0.44824355840682983, "learning_rate": 1.8878725874060144e-06, "loss": 0.6434, "step": 13750 }, { "epoch": 1.8868833965747103, "grad_norm": 0.7179256081581116, "learning_rate": 1.6797923430905583e-06, "loss": 0.5649, "step": 13800 }, { "epoch": 1.893720302191228, "grad_norm": 0.6279736757278442, "learning_rate": 1.4837617514370073e-06, "loss": 0.6663, "step": 13850 }, { "epoch": 1.9005572078077462, "grad_norm": 2.146757125854492, "learning_rate": 1.2998048370135963e-06, "loss": 0.5003, "step": 13900 }, { "epoch": 1.9073941134242642, "grad_norm": 0.2452065795660019, "learning_rate": 1.127944144696691e-06, "loss": 0.7167, "step": 13950 }, { "epoch": 1.9142310190407823, "grad_norm": 0.2389650195837021, "learning_rate": 9.682007369077095e-07, "loss": 0.5836, "step": 14000 }, { "epoch": 1.9142310190407823, "eval_loss": 0.2555805742740631, "eval_runtime": 299.5823, "eval_samples_per_second": 26.958, "eval_steps_per_second": 3.371, "step": 14000 }, { "epoch": 1.9210679246573001, "grad_norm": 20.409788131713867, "learning_rate": 8.205941910318426e-07, "loss": 0.5573, "step": 14050 }, { "epoch": 1.927904830273818, "grad_norm": 0.6842173933982849, "learning_rate": 6.851425970187952e-07, "loss": 0.5594, "step": 14100 }, { "epoch": 1.934741735890336, "grad_norm": 11.089654922485352, "learning_rate": 5.618625551656708e-07, "loss": 0.6967, "step": 14150 }, { "epoch": 1.941578641506854, "grad_norm": 12.126336097717285, "learning_rate": 4.507691740825881e-07, "loss": 0.677, "step": 14200 }, { "epoch": 1.948415547123372, "grad_norm": 0.44369152188301086, "learning_rate": 3.518760688410283e-07, "loss": 0.6566, "step": 14250 }, { "epoch": 1.9552524527398898, "grad_norm": 11.187239646911621, "learning_rate": 2.651953593052481e-07, "loss": 0.5174, "step": 14300 }, { "epoch": 1.9620893583564079, "grad_norm": 15.362393379211426, "learning_rate": 1.907376686468787e-07, "loss": 0.5426, "step": 14350 }, { "epoch": 1.968926263972926, "grad_norm": 0.2329702377319336, "learning_rate": 1.2851212204304518e-07, "loss": 0.6944, "step": 14400 }, { "epoch": 1.975763169589444, "grad_norm": 0.7811570763587952, "learning_rate": 7.852634555803873e-08, "loss": 0.5647, "step": 14450 }, { "epoch": 1.9826000752059618, "grad_norm": 1.2399488687515259, "learning_rate": 4.078646520866425e-08, "loss": 0.6162, "step": 14500 }, { "epoch": 1.9894369808224797, "grad_norm": 0.4023188352584839, "learning_rate": 1.5297106213485458e-08, "loss": 0.4718, "step": 14550 }, { "epoch": 1.9962738864389977, "grad_norm": 0.1795218139886856, "learning_rate": 2.061392425978248e-09, "loss": 0.5667, "step": 14600 } ], "logging_steps": 50, "max_steps": 14628, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8538290358499676e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }