{
  "best_global_step": 6000,
  "best_metric": 0.20116083323955536,
  "best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000",
  "epoch": 2.0,
  "eval_steps": 2000,
  "global_step": 14628,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006836905616517964,
      "grad_norm": 1.572303056716919,
      "learning_rate": 2.232346241457859e-05,
      "loss": 2.3604,
      "step": 50
    },
    {
      "epoch": 0.013673811233035928,
      "grad_norm": 5.201236248016357,
      "learning_rate": 4.510250569476082e-05,
      "loss": 2.1118,
      "step": 100
    },
    {
      "epoch": 0.02051071684955389,
      "grad_norm": 9.312570571899414,
      "learning_rate": 6.788154897494306e-05,
      "loss": 1.8332,
      "step": 150
    },
    {
      "epoch": 0.027347622466071857,
      "grad_norm": 8.565587043762207,
      "learning_rate": 9.066059225512529e-05,
      "loss": 1.9173,
      "step": 200
    },
    {
      "epoch": 0.03418452808258982,
      "grad_norm": 3.824556350708008,
      "learning_rate": 0.00011343963553530752,
      "loss": 1.6633,
      "step": 250
    },
    {
      "epoch": 0.04102143369910778,
      "grad_norm": 5.49424934387207,
      "learning_rate": 0.00013621867881548976,
      "loss": 1.6122,
      "step": 300
    },
    {
      "epoch": 0.04785833931562575,
      "grad_norm": 6.3185038566589355,
      "learning_rate": 0.000158997722095672,
      "loss": 1.5782,
      "step": 350
    },
    {
      "epoch": 0.05469524493214371,
      "grad_norm": 3.980173349380493,
      "learning_rate": 0.00018177676537585422,
      "loss": 1.444,
      "step": 400
    },
    {
      "epoch": 0.06153215054866167,
      "grad_norm": 5.797272682189941,
      "learning_rate": 0.00019999975488719786,
      "loss": 1.5752,
      "step": 450
    },
    {
      "epoch": 0.06836905616517965,
      "grad_norm": 11.263846397399902,
      "learning_rate": 0.0001999911760652904,
      "loss": 1.3607,
      "step": 500
    },
    {
      "epoch": 0.0752059617816976,
      "grad_norm": 4.273462772369385,
      "learning_rate": 0.0001999703428048544,
      "loss": 1.5023,
      "step": 550
    },
    {
      "epoch": 0.08204286739821556,
      "grad_norm": 2.9854705333709717,
      "learning_rate": 0.00019993725765911436,
      "loss": 1.3747,
      "step": 600
    },
    {
      "epoch": 0.08887977301473353,
      "grad_norm": 2.9444832801818848,
      "learning_rate": 0.0001998919246828268,
      "loss": 1.4708,
      "step": 650
    },
    {
      "epoch": 0.0957166786312515,
      "grad_norm": 3.348857879638672,
      "learning_rate": 0.00019983434943178372,
      "loss": 1.439,
      "step": 700
    },
    {
      "epoch": 0.10255358424776946,
      "grad_norm": 5.90728759765625,
      "learning_rate": 0.00019976453896213152,
      "loss": 1.5048,
      "step": 750
    },
    {
      "epoch": 0.10939048986428743,
      "grad_norm": 2.6572535037994385,
      "learning_rate": 0.0001996825018295062,
      "loss": 1.5023,
      "step": 800
    },
    {
      "epoch": 0.11622739548080539,
      "grad_norm": 4.219803810119629,
      "learning_rate": 0.00019958824808798494,
      "loss": 1.5814,
      "step": 850
    },
    {
      "epoch": 0.12306430109732334,
      "grad_norm": 5.457417964935303,
      "learning_rate": 0.00019948178928885378,
      "loss": 1.4203,
      "step": 900
    },
    {
      "epoch": 0.1299012067138413,
      "grad_norm": 5.302417278289795,
      "learning_rate": 0.00019936313847919218,
      "loss": 1.3299,
      "step": 950
    },
    {
      "epoch": 0.1367381123303593,
      "grad_norm": 4.385361194610596,
      "learning_rate": 0.00019923231020027368,
      "loss": 1.3468,
      "step": 1000
    },
    {
      "epoch": 0.14357501794687724,
      "grad_norm": 4.836021423339844,
      "learning_rate": 0.00019908932048578416,
      "loss": 1.2813,
      "step": 1050
    },
    {
      "epoch": 0.1504119235633952,
      "grad_norm": 4.949122905731201,
      "learning_rate": 0.00019893418685985658,
      "loss": 1.311,
      "step": 1100
    },
    {
      "epoch": 0.15724882917991317,
      "grad_norm": 6.123111248016357,
      "learning_rate": 0.00019876692833492343,
      "loss": 1.342,
      "step": 1150
    },
    {
      "epoch": 0.16408573479643113,
      "grad_norm": 5.803433418273926,
      "learning_rate": 0.0001985875654093866,
      "loss": 1.2384,
      "step": 1200
    },
    {
      "epoch": 0.1709226404129491,
      "grad_norm": 3.196314811706543,
      "learning_rate": 0.00019839612006510517,
      "loss": 1.3117,
      "step": 1250
    },
    {
      "epoch": 0.17775954602946706,
      "grad_norm": 6.21234130859375,
      "learning_rate": 0.00019819261576470152,
      "loss": 1.2307,
      "step": 1300
    },
    {
      "epoch": 0.18459645164598504,
      "grad_norm": 3.274829149246216,
      "learning_rate": 0.00019797707744868582,
      "loss": 1.2408,
      "step": 1350
    },
    {
      "epoch": 0.191433357262503,
      "grad_norm": 5.5120320320129395,
      "learning_rate": 0.0001977495315323993,
      "loss": 1.324,
      "step": 1400
    },
    {
      "epoch": 0.19827026287902094,
      "grad_norm": 7.289828777313232,
      "learning_rate": 0.0001975100059027772,
      "loss": 1.2039,
      "step": 1450
    },
    {
      "epoch": 0.20510716849553892,
      "grad_norm": 4.040754795074463,
      "learning_rate": 0.00019725852991493083,
      "loss": 1.3405,
      "step": 1500
    },
    {
      "epoch": 0.21194407411205687,
      "grad_norm": 52.13080596923828,
      "learning_rate": 0.00019699513438854995,
      "loss": 1.2005,
      "step": 1550
    },
    {
      "epoch": 0.21878097972857485,
      "grad_norm": 5.0520429611206055,
      "learning_rate": 0.00019671985160412593,
      "loss": 1.0046,
      "step": 1600
    },
    {
      "epoch": 0.2256178853450928,
      "grad_norm": 1.7626160383224487,
      "learning_rate": 0.00019643271529899532,
      "loss": 1.1398,
      "step": 1650
    },
    {
      "epoch": 0.23245479096161079,
      "grad_norm": 2.1751222610473633,
      "learning_rate": 0.00019613376066320525,
      "loss": 1.1519,
      "step": 1700
    },
    {
      "epoch": 0.23929169657812874,
      "grad_norm": 4.483262062072754,
      "learning_rate": 0.00019582302433520074,
      "loss": 1.144,
      "step": 1750
    },
    {
      "epoch": 0.2461286021946467,
      "grad_norm": 2.494478702545166,
      "learning_rate": 0.00019550054439733449,
      "loss": 1.1908,
      "step": 1800
    },
    {
      "epoch": 0.25296550781116467,
      "grad_norm": 14.6198091506958,
      "learning_rate": 0.00019516636037119952,
      "loss": 1.0791,
      "step": 1850
    },
    {
      "epoch": 0.2598024134276826,
      "grad_norm": 1.5368318557739258,
      "learning_rate": 0.00019482051321278592,
      "loss": 1.1994,
      "step": 1900
    },
    {
      "epoch": 0.2666393190442006,
      "grad_norm": 6.854203701019287,
      "learning_rate": 0.00019446304530746112,
      "loss": 1.1871,
      "step": 1950
    },
    {
      "epoch": 0.2734762246607186,
      "grad_norm": 3.686593770980835,
      "learning_rate": 0.00019409400046477559,
      "loss": 1.0619,
      "step": 2000
    },
    {
      "epoch": 0.2734762246607186,
      "eval_loss": 0.3232106864452362,
      "eval_runtime": 301.3298,
      "eval_samples_per_second": 26.801,
      "eval_steps_per_second": 3.352,
      "step": 2000
    },
    {
      "epoch": 0.28031313027723653,
      "grad_norm": 2.84173321723938,
      "learning_rate": 0.00019371342391309363,
      "loss": 1.1769,
      "step": 2050
    },
    {
      "epoch": 0.2871500358937545,
      "grad_norm": 6.158025741577148,
      "learning_rate": 0.00019332136229405043,
      "loss": 1.1985,
      "step": 2100
    },
    {
      "epoch": 0.29398694151027244,
      "grad_norm": 1.3917083740234375,
      "learning_rate": 0.00019291786365683599,
      "loss": 1.2915,
      "step": 2150
    },
    {
      "epoch": 0.3008238471267904,
      "grad_norm": 6.717157363891602,
      "learning_rate": 0.00019250297745230615,
      "loss": 0.9168,
      "step": 2200
    },
    {
      "epoch": 0.3076607527433084,
      "grad_norm": 7.835381507873535,
      "learning_rate": 0.00019207675452692259,
      "loss": 1.0267,
      "step": 2250
    },
    {
      "epoch": 0.31449765835982635,
      "grad_norm": 4.236868858337402,
      "learning_rate": 0.00019163924711652092,
      "loss": 1.1836,
      "step": 2300
    },
    {
      "epoch": 0.3213345639763443,
      "grad_norm": 4.367033004760742,
      "learning_rate": 0.00019119050883990903,
      "loss": 1.1023,
      "step": 2350
    },
    {
      "epoch": 0.32817146959286225,
      "grad_norm": 8.43916130065918,
      "learning_rate": 0.00019073059469229602,
      "loss": 1.1884,
      "step": 2400
    },
    {
      "epoch": 0.33500837520938026,
      "grad_norm": 7.896825790405273,
      "learning_rate": 0.0001902595610385519,
      "loss": 1.1764,
      "step": 2450
    },
    {
      "epoch": 0.3418452808258982,
      "grad_norm": 3.5363454818725586,
      "learning_rate": 0.00018977746560630012,
      "loss": 1.1172,
      "step": 2500
    },
    {
      "epoch": 0.34868218644241616,
      "grad_norm": 12.307855606079102,
      "learning_rate": 0.00018928436747884253,
      "loss": 1.078,
      "step": 2550
    },
    {
      "epoch": 0.3555190920589341,
      "grad_norm": 8.765337944030762,
      "learning_rate": 0.00018878032708791854,
      "loss": 1.1449,
      "step": 2600
    },
    {
      "epoch": 0.36235599767545207,
      "grad_norm": 11.366116523742676,
      "learning_rate": 0.00018826540620629873,
      "loss": 1.1117,
      "step": 2650
    },
    {
      "epoch": 0.3691929032919701,
      "grad_norm": 3.603243112564087,
      "learning_rate": 0.0001877396679402145,
      "loss": 1.1138,
      "step": 2700
    },
    {
      "epoch": 0.37602980890848803,
      "grad_norm": 8.020549774169922,
      "learning_rate": 0.00018720317672162392,
      "loss": 1.0474,
      "step": 2750
    },
    {
      "epoch": 0.382866714525006,
      "grad_norm": 4.786285877227783,
      "learning_rate": 0.00018665599830031533,
      "loss": 1.1041,
      "step": 2800
    },
    {
      "epoch": 0.38970362014152393,
      "grad_norm": 7.1555633544921875,
      "learning_rate": 0.00018609819973584924,
      "loss": 1.0623,
      "step": 2850
    },
    {
      "epoch": 0.3965405257580419,
      "grad_norm": 6.989715576171875,
      "learning_rate": 0.00018552984938934006,
      "loss": 0.9318,
      "step": 2900
    },
    {
      "epoch": 0.4033774313745599,
      "grad_norm": 7.150449752807617,
      "learning_rate": 0.00018495101691507783,
      "loss": 1.132,
      "step": 2950
    },
    {
      "epoch": 0.41021433699107784,
      "grad_norm": 4.584231853485107,
      "learning_rate": 0.00018436177325199192,
      "loss": 1.1382,
      "step": 3000
    },
    {
      "epoch": 0.4170512426075958,
      "grad_norm": 5.139730930328369,
      "learning_rate": 0.00018376219061495694,
      "loss": 1.0452,
      "step": 3050
    },
    {
      "epoch": 0.42388814822411375,
      "grad_norm": 15.497014999389648,
      "learning_rate": 0.00018315234248594264,
      "loss": 1.0451,
      "step": 3100
    },
    {
      "epoch": 0.43072505384063176,
      "grad_norm": 3.4872303009033203,
      "learning_rate": 0.0001825323036050081,
      "loss": 1.131,
      "step": 3150
    },
    {
      "epoch": 0.4375619594571497,
      "grad_norm": 11.307365417480469,
      "learning_rate": 0.00018190214996114206,
      "loss": 1.1382,
      "step": 3200
    },
    {
      "epoch": 0.44439886507366766,
      "grad_norm": 5.577065467834473,
      "learning_rate": 0.00018126195878295006,
      "loss": 1.1045,
      "step": 3250
    },
    {
      "epoch": 0.4512357706901856,
      "grad_norm": 14.33316421508789,
      "learning_rate": 0.0001806118085291896,
      "loss": 1.0887,
      "step": 3300
    },
    {
      "epoch": 0.45807267630670356,
      "grad_norm": 15.240452766418457,
      "learning_rate": 0.00017995177887915475,
      "loss": 1.0171,
      "step": 3350
    },
    {
      "epoch": 0.46490958192322157,
      "grad_norm": 10.07467269897461,
      "learning_rate": 0.00017928195072291093,
      "loss": 1.0966,
      "step": 3400
    },
    {
      "epoch": 0.4717464875397395,
      "grad_norm": 2.930840253829956,
      "learning_rate": 0.00017860240615138142,
      "loss": 1.0418,
      "step": 3450
    },
    {
      "epoch": 0.4785833931562575,
      "grad_norm": 30.01850700378418,
      "learning_rate": 0.00017791322844628677,
      "loss": 0.9635,
      "step": 3500
    },
    {
      "epoch": 0.4854202987727754,
      "grad_norm": 5.433286666870117,
      "learning_rate": 0.0001772145020699381,
      "loss": 1.0108,
      "step": 3550
    },
    {
      "epoch": 0.4922572043892934,
      "grad_norm": 3.0814309120178223,
      "learning_rate": 0.0001765063126548858,
      "loss": 1.1257,
      "step": 3600
    },
    {
      "epoch": 0.4990941100058114,
      "grad_norm": 79.82017517089844,
      "learning_rate": 0.00017578874699342493,
      "loss": 1.1214,
      "step": 3650
    },
    {
      "epoch": 0.5059310156223293,
      "grad_norm": 8.51614761352539,
      "learning_rate": 0.00017506189302695827,
      "loss": 0.8635,
      "step": 3700
    },
    {
      "epoch": 0.5127679212388473,
      "grad_norm": 8.251550674438477,
      "learning_rate": 0.0001743258398352187,
      "loss": 0.9361,
      "step": 3750
    },
    {
      "epoch": 0.5196048268553652,
      "grad_norm": 3.81523060798645,
      "learning_rate": 0.00017358067762535186,
      "loss": 1.066,
      "step": 3800
    },
    {
      "epoch": 0.5264417324718832,
      "grad_norm": 15.210460662841797,
      "learning_rate": 0.00017282649772086114,
      "loss": 0.9778,
      "step": 3850
    },
    {
      "epoch": 0.5332786380884011,
      "grad_norm": 5.145527362823486,
      "learning_rate": 0.0001720633925504151,
      "loss": 1.0966,
      "step": 3900
    },
    {
      "epoch": 0.5401155437049191,
      "grad_norm": 3.485656261444092,
      "learning_rate": 0.00017129145563652014,
      "loss": 0.6889,
      "step": 3950
    },
    {
      "epoch": 0.5469524493214372,
      "grad_norm": 7.915320873260498,
      "learning_rate": 0.00017051078158405872,
      "loss": 0.9154,
      "step": 4000
    },
    {
      "epoch": 0.5469524493214372,
      "eval_loss": 0.24666446447372437,
      "eval_runtime": 301.8017,
      "eval_samples_per_second": 26.759,
      "eval_steps_per_second": 3.347,
      "step": 4000
    },
    {
      "epoch": 0.5537893549379551,
      "grad_norm": 12.610590934753418,
      "learning_rate": 0.00016972146606869507,
      "loss": 0.8612,
      "step": 4050
    },
    {
      "epoch": 0.5606262605544731,
      "grad_norm": 34.93125915527344,
      "learning_rate": 0.00016892360582514967,
      "loss": 1.0867,
      "step": 4100
    },
    {
      "epoch": 0.567463166170991,
      "grad_norm": 7.39677095413208,
      "learning_rate": 0.00016811729863534377,
      "loss": 1.1106,
      "step": 4150
    },
    {
      "epoch": 0.574300071787509,
      "grad_norm": 2.4880149364471436,
      "learning_rate": 0.00016730264331641585,
      "loss": 0.9142,
      "step": 4200
    },
    {
      "epoch": 0.5811369774040269,
      "grad_norm": 19.268964767456055,
      "learning_rate": 0.00016647973970861104,
      "loss": 0.9408,
      "step": 4250
    },
    {
      "epoch": 0.5879738830205449,
      "grad_norm": 62.558837890625,
      "learning_rate": 0.00016564868866304517,
      "loss": 1.1798,
      "step": 4300
    },
    {
      "epoch": 0.5948107886370628,
      "grad_norm": 12.449636459350586,
      "learning_rate": 0.00016480959202934487,
      "loss": 0.9386,
      "step": 4350
    },
    {
      "epoch": 0.6016476942535808,
      "grad_norm": 9.708828926086426,
      "learning_rate": 0.00016396255264316547,
      "loss": 1.0766,
      "step": 4400
    },
    {
      "epoch": 0.6084845998700988,
      "grad_norm": 4.00963020324707,
      "learning_rate": 0.0001631076743135879,
      "loss": 0.9953,
      "step": 4450
    },
    {
      "epoch": 0.6153215054866168,
      "grad_norm": 14.70906925201416,
      "learning_rate": 0.0001622450618103964,
      "loss": 1.1006,
      "step": 4500
    },
    {
      "epoch": 0.6221584111031347,
      "grad_norm": 2.471301317214966,
      "learning_rate": 0.00016137482085123832,
      "loss": 0.7397,
      "step": 4550
    },
    {
      "epoch": 0.6289953167196527,
      "grad_norm": 0.671847939491272,
      "learning_rate": 0.00016049705808866805,
      "loss": 1.1298,
      "step": 4600
    },
    {
      "epoch": 0.6358322223361706,
      "grad_norm": 11.712217330932617,
      "learning_rate": 0.000159611881097076,
      "loss": 0.8828,
      "step": 4650
    },
    {
      "epoch": 0.6426691279526886,
      "grad_norm": 90.13214111328125,
      "learning_rate": 0.00015871939835950503,
      "loss": 1.085,
      "step": 4700
    },
    {
      "epoch": 0.6495060335692066,
      "grad_norm": 2.1299564838409424,
      "learning_rate": 0.00015781971925435498,
      "loss": 1.0104,
      "step": 4750
    },
    {
      "epoch": 0.6563429391857245,
      "grad_norm": 44.118778228759766,
      "learning_rate": 0.0001569129540419781,
      "loss": 0.8905,
      "step": 4800
    },
    {
      "epoch": 0.6631798448022425,
      "grad_norm": 20.966922760009766,
      "learning_rate": 0.00015599921385116582,
      "loss": 0.9239,
      "step": 4850
    },
    {
      "epoch": 0.6700167504187605,
      "grad_norm": 13.358034133911133,
      "learning_rate": 0.00015507861066552955,
      "loss": 0.8589,
      "step": 4900
    },
    {
      "epoch": 0.6768536560352785,
      "grad_norm": 5.739938259124756,
      "learning_rate": 0.00015415125730977626,
      "loss": 1.0661,
      "step": 4950
    },
    {
      "epoch": 0.6836905616517964,
      "grad_norm": 25.265790939331055,
      "learning_rate": 0.00015321726743588155,
      "loss": 0.9046,
      "step": 5000
    },
    {
      "epoch": 0.6905274672683144,
      "grad_norm": 22.772367477416992,
      "learning_rate": 0.00015227675550916073,
      "loss": 1.0174,
      "step": 5050
    },
    {
      "epoch": 0.6973643728848323,
      "grad_norm": 4.18620491027832,
      "learning_rate": 0.0001513298367942405,
      "loss": 0.9916,
      "step": 5100
    },
    {
      "epoch": 0.7042012785013503,
      "grad_norm": 10.113117218017578,
      "learning_rate": 0.00015037662734093286,
      "loss": 0.9635,
      "step": 5150
    },
    {
      "epoch": 0.7110381841178682,
      "grad_norm": 1.7103244066238403,
      "learning_rate": 0.0001494172439700126,
      "loss": 0.8927,
      "step": 5200
    },
    {
      "epoch": 0.7178750897343862,
      "grad_norm": 24.236433029174805,
      "learning_rate": 0.0001484518042589,
      "loss": 0.9438,
      "step": 5250
    },
    {
      "epoch": 0.7247119953509041,
      "grad_norm": 2.4070262908935547,
      "learning_rate": 0.00014748042652725152,
      "loss": 1.095,
      "step": 5300
    },
    {
      "epoch": 0.7315489009674222,
      "grad_norm": 4.471241474151611,
      "learning_rate": 0.0001465032298224588,
      "loss": 0.8205,
      "step": 5350
    },
    {
      "epoch": 0.7383858065839402,
      "grad_norm": 1.757636547088623,
      "learning_rate": 0.0001455203339050589,
      "loss": 0.9177,
      "step": 5400
    },
    {
      "epoch": 0.7452227122004581,
      "grad_norm": 1.5365773439407349,
      "learning_rate": 0.0001445318592340571,
      "loss": 0.7696,
      "step": 5450
    },
    {
      "epoch": 0.7520596178169761,
      "grad_norm": 1.7077670097351074,
      "learning_rate": 0.00014353792695216382,
      "loss": 0.9342,
      "step": 5500
    },
    {
      "epoch": 0.758896523433494,
      "grad_norm": 28.525236129760742,
      "learning_rate": 0.00014253865887094817,
      "loss": 0.9897,
      "step": 5550
    },
    {
      "epoch": 0.765733429050012,
      "grad_norm": 15.281404495239258,
      "learning_rate": 0.00014153417745590914,
      "loss": 0.8873,
      "step": 5600
    },
    {
      "epoch": 0.7725703346665299,
      "grad_norm": 1.1002103090286255,
      "learning_rate": 0.00014052460581146696,
      "loss": 0.7727,
      "step": 5650
    },
    {
      "epoch": 0.7794072402830479,
      "grad_norm": 4.395946025848389,
      "learning_rate": 0.00013951006766587586,
      "loss": 0.8922,
      "step": 5700
    },
    {
      "epoch": 0.7862441458995658,
      "grad_norm": 5.225406169891357,
      "learning_rate": 0.0001384906873560607,
      "loss": 0.9766,
      "step": 5750
    },
    {
      "epoch": 0.7930810515160838,
      "grad_norm": 6.0966315269470215,
      "learning_rate": 0.00013746658981237867,
      "loss": 1.1373,
      "step": 5800
    },
    {
      "epoch": 0.7999179571326018,
      "grad_norm": 14.155887603759766,
      "learning_rate": 0.00013643790054330846,
      "loss": 0.8954,
      "step": 5850
    },
    {
      "epoch": 0.8067548627491198,
      "grad_norm": 2.6549534797668457,
      "learning_rate": 0.0001354047456200687,
      "loss": 1.0428,
      "step": 5900
    },
    {
      "epoch": 0.8135917683656377,
      "grad_norm": 7.79277229309082,
      "learning_rate": 0.0001343672516611671,
      "loss": 0.8715,
      "step": 5950
    },
    {
      "epoch": 0.8204286739821557,
      "grad_norm": 17.183149337768555,
      "learning_rate": 0.00013332554581688271,
      "loss": 1.0601,
      "step": 6000
    },
    {
      "epoch": 0.8204286739821557,
      "eval_loss": 0.20116083323955536,
      "eval_runtime": 301.512,
      "eval_samples_per_second": 26.785,
      "eval_steps_per_second": 3.35,
      "step": 6000
    },
    {
      "epoch": 0.8272655795986736,
      "grad_norm": 10.275203704833984,
      "learning_rate": 0.00013227975575368312,
      "loss": 0.8782,
      "step": 6050
    },
    {
      "epoch": 0.8341024852151916,
      "grad_norm": 3.2849924564361572,
      "learning_rate": 0.0001312300096385781,
      "loss": 0.7405,
      "step": 6100
    },
    {
      "epoch": 0.8409393908317095,
      "grad_norm": 5.1770853996276855,
      "learning_rate": 0.0001301764361234122,
      "loss": 1.0901,
      "step": 6150
    },
    {
      "epoch": 0.8477762964482275,
      "grad_norm": 13.282193183898926,
      "learning_rate": 0.0001291191643290977,
      "loss": 0.9054,
      "step": 6200
    },
    {
      "epoch": 0.8546132020647454,
      "grad_norm": 9.424989700317383,
      "learning_rate": 0.0001280583238297903,
      "loss": 0.9861,
      "step": 6250
    },
    {
      "epoch": 0.8614501076812635,
      "grad_norm": 2.5506229400634766,
      "learning_rate": 0.000126994044637009,
      "loss": 1.0244,
      "step": 6300
    },
    {
      "epoch": 0.8682870132977815,
      "grad_norm": 21.7524471282959,
      "learning_rate": 0.00012592645718370252,
      "loss": 0.9079,
      "step": 6350
    },
    {
      "epoch": 0.8751239189142994,
      "grad_norm": 2.2379355430603027,
      "learning_rate": 0.00012485569230826423,
      "loss": 1.0235,
      "step": 6400
    },
    {
      "epoch": 0.8819608245308174,
      "grad_norm": 18.936904907226562,
      "learning_rate": 0.000123781881238497,
      "loss": 0.8275,
      "step": 6450
    },
    {
      "epoch": 0.8887977301473353,
      "grad_norm": 1.508329153060913,
      "learning_rate": 0.00012270515557553065,
      "loss": 0.9872,
      "step": 6500
    },
    {
      "epoch": 0.8956346357638533,
      "grad_norm": 30.93293571472168,
      "learning_rate": 0.00012162564727769359,
      "loss": 1.0287,
      "step": 6550
    },
    {
      "epoch": 0.9024715413803712,
      "grad_norm": 29.230403900146484,
      "learning_rate": 0.00012054348864434066,
      "loss": 0.627,
      "step": 6600
    },
    {
      "epoch": 0.9093084469968892,
      "grad_norm": 14.68487548828125,
      "learning_rate": 0.00011945881229963898,
      "loss": 0.9562,
      "step": 6650
    },
    {
      "epoch": 0.9161453526134071,
      "grad_norm": 2.035444736480713,
      "learning_rate": 0.00011837175117631436,
      "loss": 0.8726,
      "step": 6700
    },
    {
      "epoch": 0.9229822582299252,
      "grad_norm": 12.931522369384766,
      "learning_rate": 0.0001172824384993596,
      "loss": 0.8823,
      "step": 6750
    },
    {
      "epoch": 0.9298191638464431,
      "grad_norm": 8.330245971679688,
      "learning_rate": 0.00011619100776970713,
      "loss": 0.7179,
      "step": 6800
    },
    {
      "epoch": 0.9366560694629611,
      "grad_norm": 51.09445571899414,
      "learning_rate": 0.00011509759274786776,
      "loss": 0.8627,
      "step": 6850
    },
    {
      "epoch": 0.943492975079479,
      "grad_norm": 26.371118545532227,
      "learning_rate": 0.00011400232743753752,
      "loss": 0.7334,
      "step": 6900
    },
    {
      "epoch": 0.950329880695997,
      "grad_norm": 1.3464198112487793,
      "learning_rate": 0.00011290534606917508,
      "loss": 1.0389,
      "step": 6950
    },
    {
      "epoch": 0.957166786312515,
      "grad_norm": 0.732755184173584,
      "learning_rate": 0.00011180678308355081,
      "loss": 0.8343,
      "step": 7000
    },
    {
      "epoch": 0.9640036919290329,
      "grad_norm": 0.9582768082618713,
      "learning_rate": 0.00011070677311527058,
      "loss": 1.0705,
      "step": 7050
    },
    {
      "epoch": 0.9708405975455509,
      "grad_norm": 0.7923704385757446,
      "learning_rate": 0.00010960545097627548,
      "loss": 0.9725,
      "step": 7100
    },
    {
      "epoch": 0.9776775031620688,
      "grad_norm": 39.650177001953125,
      "learning_rate": 0.00010850295163931992,
      "loss": 0.8721,
      "step": 7150
    },
    {
      "epoch": 0.9845144087785868,
      "grad_norm": 9.212077140808105,
      "learning_rate": 0.00010739941022143007,
      "loss": 0.8079,
      "step": 7200
    },
    {
      "epoch": 0.9913513143951048,
      "grad_norm": 2.591902494430542,
      "learning_rate": 0.00010629496196734452,
      "loss": 1.1336,
      "step": 7250
    },
    {
      "epoch": 0.9981882200116228,
      "grad_norm": 18.618799209594727,
      "learning_rate": 0.00010518974223293936,
      "loss": 1.0463,
      "step": 7300
    },
    {
      "epoch": 1.004922572043893,
      "grad_norm": 8.480158805847168,
      "learning_rate": 0.00010408388646863965,
      "loss": 0.7236,
      "step": 7350
    },
    {
      "epoch": 1.0117594776604109,
      "grad_norm": 3.5370821952819824,
      "learning_rate": 0.00010297753020281911,
      "loss": 0.813,
      "step": 7400
    },
    {
      "epoch": 1.018596383276929,
      "grad_norm": 0.5842294096946716,
      "learning_rate": 0.00010187080902519064,
      "loss": 0.589,
      "step": 7450
    },
    {
      "epoch": 1.0254332888934468,
      "grad_norm": 11.063470840454102,
      "learning_rate": 0.00010076385857018889,
      "loss": 0.9893,
      "step": 7500
    },
    {
      "epoch": 1.0322701945099648,
      "grad_norm": 8.910834312438965,
      "learning_rate": 9.965681450034771e-05,
      "loss": 0.6532,
      "step": 7550
    },
    {
      "epoch": 1.0391071001264827,
      "grad_norm": 0.8395630121231079,
      "learning_rate": 9.854981248967388e-05,
      "loss": 0.6934,
      "step": 7600
    },
    {
      "epoch": 1.0459440057430007,
      "grad_norm": 3.7071163654327393,
      "learning_rate": 9.744298820701968e-05,
      "loss": 0.7911,
      "step": 7650
    },
    {
      "epoch": 1.0527809113595188,
      "grad_norm": 14.003477096557617,
      "learning_rate": 9.633647729945581e-05,
      "loss": 0.7611,
      "step": 7700
    },
    {
      "epoch": 1.0596178169760366,
      "grad_norm": 19.04654884338379,
      "learning_rate": 9.523041537564726e-05,
      "loss": 0.6596,
      "step": 7750
    },
    {
      "epoch": 1.0664547225925547,
      "grad_norm": 52.79182815551758,
      "learning_rate": 9.412493798923383e-05,
      "loss": 0.763,
      "step": 7800
    },
    {
      "epoch": 1.0732916282090725,
      "grad_norm": 1.4399851560592651,
      "learning_rate": 9.3020180622217e-05,
      "loss": 0.667,
      "step": 7850
    },
    {
      "epoch": 1.0801285338255906,
      "grad_norm": 1.6162464618682861,
      "learning_rate": 9.19162786683564e-05,
      "loss": 0.813,
      "step": 7900
    },
    {
      "epoch": 1.0869654394421084,
      "grad_norm": 6.91720724105835,
      "learning_rate": 9.081336741657603e-05,
      "loss": 0.6394,
      "step": 7950
    },
    {
      "epoch": 1.0938023450586265,
      "grad_norm": 7.005824089050293,
      "learning_rate": 8.971158203438443e-05,
      "loss": 0.6949,
      "step": 8000
    },
    {
      "epoch": 1.0938023450586265,
      "eval_loss": 0.22489766776561737,
      "eval_runtime": 301.6603,
      "eval_samples_per_second": 26.772,
      "eval_steps_per_second": 3.348,
      "step": 8000
    },
    {
      "epoch": 1.1006392506751443,
      "grad_norm": 12.64887523651123,
      "learning_rate": 8.861105755130896e-05,
      "loss": 0.6777,
      "step": 8050
    },
    {
      "epoch": 1.1074761562916624,
      "grad_norm": 99.47157287597656,
      "learning_rate": 8.751192884234704e-05,
      "loss": 0.5242,
      "step": 8100
    },
    {
      "epoch": 1.1143130619081805,
      "grad_norm": 2.9147791862487793,
      "learning_rate": 8.641433061143698e-05,
      "loss": 0.6589,
      "step": 8150
    },
    {
      "epoch": 1.1211499675246983,
      "grad_norm": 0.4020586311817169,
      "learning_rate": 8.531839737494878e-05,
      "loss": 0.9058,
      "step": 8200
    },
    {
      "epoch": 1.1279868731412164,
      "grad_norm": 41.31173324584961,
      "learning_rate": 8.422426344519898e-05,
      "loss": 0.5999,
      "step": 8250
    },
    {
      "epoch": 1.1348237787577342,
      "grad_norm": 0.19233907759189606,
      "learning_rate": 8.313206291398948e-05,
      "loss": 0.8461,
      "step": 8300
    },
    {
      "epoch": 1.1416606843742523,
      "grad_norm": 0.5941385626792908,
      "learning_rate": 8.20419296361743e-05,
      "loss": 0.5353,
      "step": 8350
    },
    {
      "epoch": 1.1484975899907701,
      "grad_norm": 6.670557022094727,
      "learning_rate": 8.095399721325481e-05,
      "loss": 0.6484,
      "step": 8400
    },
    {
      "epoch": 1.1553344956072882,
      "grad_norm": 3.8168182373046875,
      "learning_rate": 7.9868398977006e-05,
      "loss": 0.8318,
      "step": 8450
    },
    {
      "epoch": 1.162171401223806,
      "grad_norm": 17.14653778076172,
      "learning_rate": 7.87852679731364e-05,
      "loss": 0.5694,
      "step": 8500
    },
    {
      "epoch": 1.169008306840324,
      "grad_norm": 58.7053108215332,
      "learning_rate": 7.77047369449821e-05,
      "loss": 0.7256,
      "step": 8550
    },
    {
      "epoch": 1.1758452124568421,
      "grad_norm": 0.4155759811401367,
      "learning_rate": 7.66269383172389e-05,
      "loss": 0.604,
      "step": 8600
    },
    {
      "epoch": 1.18268211807336,
      "grad_norm": 1.1354832649230957,
      "learning_rate": 7.555200417973261e-05,
      "loss": 0.7761,
      "step": 8650
    },
    {
      "epoch": 1.189519023689878,
      "grad_norm": 1.1315326690673828,
      "learning_rate": 7.448006627123083e-05,
      "loss": 0.6569,
      "step": 8700
    },
    {
      "epoch": 1.196355929306396,
      "grad_norm": 0.9931478500366211,
      "learning_rate": 7.341125596329783e-05,
      "loss": 0.8456,
      "step": 8750
    },
    {
      "epoch": 1.203192834922914,
      "grad_norm": 2.132953643798828,
      "learning_rate": 7.2345704244194e-05,
      "loss": 0.7142,
      "step": 8800
    },
    {
      "epoch": 1.2100297405394318,
      "grad_norm": 10.148101806640625,
      "learning_rate": 7.12835417028229e-05,
      "loss": 0.7284,
      "step": 8850
    },
    {
      "epoch": 1.2168666461559499,
      "grad_norm": 41.58332824707031,
      "learning_rate": 7.022489851272668e-05,
      "loss": 0.5779,
      "step": 8900
    },
    {
      "epoch": 1.2237035517724677,
      "grad_norm": 4.843736171722412,
      "learning_rate": 6.91699044161326e-05,
      "loss": 0.6783,
      "step": 8950
    },
    {
      "epoch": 1.2305404573889858,
      "grad_norm": 0.4043326675891876,
      "learning_rate": 6.811868870805269e-05,
      "loss": 0.7656,
      "step": 9000
    },
    {
      "epoch": 1.2373773630055038,
      "grad_norm": 3.8934195041656494,
      "learning_rate": 6.70713802204377e-05,
      "loss": 0.5857,
      "step": 9050
    },
    {
      "epoch": 1.2442142686220217,
      "grad_norm": 0.23483966290950775,
      "learning_rate": 6.602810730638829e-05,
      "loss": 0.6388,
      "step": 9100
    },
    {
      "epoch": 1.2510511742385395,
      "grad_norm": 2.1649527549743652,
      "learning_rate": 6.498899782442444e-05,
      "loss": 0.6986,
      "step": 9150
    },
    {
      "epoch": 1.2578880798550576,
      "grad_norm": 82.96743774414062,
      "learning_rate": 6.39541791228161e-05,
      "loss": 0.5563,
      "step": 9200
    },
    {
      "epoch": 1.2647249854715756,
      "grad_norm": 1.8622783422470093,
      "learning_rate": 6.292377802397564e-05,
      "loss": 0.6941,
      "step": 9250
    },
    {
      "epoch": 1.2715618910880935,
      "grad_norm": 1.1985386610031128,
      "learning_rate": 6.189792080891525e-05,
      "loss": 0.6195,
      "step": 9300
    },
    {
      "epoch": 1.2783987967046115,
      "grad_norm": 1.1333106756210327,
      "learning_rate": 6.087673320177058e-05,
      "loss": 0.5675,
      "step": 9350
    },
    {
      "epoch": 1.2852357023211294,
      "grad_norm": 13.326946258544922,
      "learning_rate": 5.9860340354392496e-05,
      "loss": 0.8214,
      "step": 9400
    },
    {
      "epoch": 1.2920726079376474,
      "grad_norm": 10.754223823547363,
      "learning_rate": 5.8848866831009156e-05,
      "loss": 0.663,
      "step": 9450
    },
    {
      "epoch": 1.2989095135541655,
      "grad_norm": 0.07592844218015671,
      "learning_rate": 5.784243659296001e-05,
      "loss": 0.6661,
      "step": 9500
    },
    {
      "epoch": 1.3057464191706833,
      "grad_norm": 4.361905097961426,
      "learning_rate": 5.6841172983503634e-05,
      "loss": 0.6757,
      "step": 9550
    },
    {
      "epoch": 1.3125833247872012,
      "grad_norm": 6.464013576507568,
      "learning_rate": 5.5845198712701396e-05,
      "loss": 0.8568,
      "step": 9600
    },
    {
      "epoch": 1.3194202304037193,
      "grad_norm": 13.971973419189453,
      "learning_rate": 5.485463584237871e-05,
      "loss": 0.5852,
      "step": 9650
    },
    {
      "epoch": 1.3262571360202373,
      "grad_norm": 25.48811149597168,
      "learning_rate": 5.3869605771165755e-05,
      "loss": 0.652,
      "step": 9700
    },
    {
      "epoch": 1.3330940416367552,
      "grad_norm": 5.14886474609375,
      "learning_rate": 5.289022921961948e-05,
      "loss": 0.8247,
      "step": 9750
    },
    {
      "epoch": 1.3399309472532732,
      "grad_norm": 0.6628409028053284,
      "learning_rate": 5.1916626215428385e-05,
      "loss": 0.5708,
      "step": 9800
    },
    {
      "epoch": 1.346767852869791,
      "grad_norm": 81.61123657226562,
      "learning_rate": 5.094891607870296e-05,
      "loss": 0.7523,
      "step": 9850
    },
    {
      "epoch": 1.3536047584863091,
      "grad_norm": 0.597465455532074,
      "learning_rate": 4.998721740735197e-05,
      "loss": 0.7701,
      "step": 9900
    },
    {
      "epoch": 1.3604416641028272,
      "grad_norm": 1.8627650737762451,
      "learning_rate": 4.903164806254804e-05,
      "loss": 0.6589,
      "step": 9950
    },
    {
      "epoch": 1.367278569719345,
      "grad_norm": 0.427298903465271,
      "learning_rate": 4.808232515428268e-05,
      "loss": 0.6476,
      "step": 10000
    },
    {
      "epoch": 1.367278569719345,
      "eval_loss": 0.25095975399017334,
      "eval_runtime": 301.6273,
      "eval_samples_per_second": 26.775,
      "eval_steps_per_second": 3.349,
      "step": 10000
    },
    {
      "epoch": 1.3741154753358629,
      "grad_norm": 0.5417049527168274,
      "learning_rate": 4.713936502701435e-05,
      "loss": 0.7344,
      "step": 10050
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.30379384756088257,
      "learning_rate": 4.620288324540962e-05,
      "loss": 0.5764,
      "step": 10100
    },
    {
      "epoch": 1.387789286568899,
      "grad_norm": 0.258468359708786,
      "learning_rate": 4.5272994580179895e-05,
      "loss": 0.6794,
      "step": 10150
    },
    {
      "epoch": 1.3946261921854168,
      "grad_norm": 1.2032103538513184,
      "learning_rate": 4.434981299401615e-05,
      "loss": 0.5931,
      "step": 10200
    },
    {
      "epoch": 1.401463097801935,
      "grad_norm": 4.064381122589111,
      "learning_rate": 4.3433451627621743e-05,
      "loss": 0.4061,
      "step": 10250
    },
    {
      "epoch": 1.4083000034184527,
      "grad_norm": 1.0236620903015137,
      "learning_rate": 4.2524022785846806e-05,
      "loss": 0.5935,
      "step": 10300
    },
    {
      "epoch": 1.4151369090349708,
      "grad_norm": 0.42589133977890015,
      "learning_rate": 4.1621637923924405e-05,
      "loss": 0.8298,
      "step": 10350
    },
    {
      "epoch": 1.4219738146514889,
      "grad_norm": 9.088717460632324,
      "learning_rate": 4.072640763381127e-05,
      "loss": 0.5821,
      "step": 10400
    },
    {
      "epoch": 1.4288107202680067,
      "grad_norm": 2.854710102081299,
      "learning_rate": 3.983844163063429e-05,
      "loss": 0.6541,
      "step": 10450
    },
    {
      "epoch": 1.4356476258845245,
      "grad_norm": 6.076037406921387,
      "learning_rate": 3.895784873924397e-05,
      "loss": 0.6669,
      "step": 10500
    },
    {
      "epoch": 1.4424845315010426,
      "grad_norm": 0.36614227294921875,
      "learning_rate": 3.8084736880877846e-05,
      "loss": 0.5883,
      "step": 10550
    },
    {
      "epoch": 1.4493214371175607,
      "grad_norm": 82.49917602539062,
      "learning_rate": 3.721921305993391e-05,
      "loss": 0.8045,
      "step": 10600
    },
    {
      "epoch": 1.4561583427340785,
      "grad_norm": 45.616859436035156,
      "learning_rate": 3.636138335085666e-05,
      "loss": 0.4991,
      "step": 10650
    },
    {
      "epoch": 1.4629952483505966,
      "grad_norm": 0.26663124561309814,
      "learning_rate": 3.5511352885137194e-05,
      "loss": 0.4815,
      "step": 10700
    },
    {
      "epoch": 1.4698321539671144,
      "grad_norm": 1.6303415298461914,
      "learning_rate": 3.4669225838428785e-05,
      "loss": 0.4746,
      "step": 10750
    },
    {
      "epoch": 1.4766690595836325,
      "grad_norm": 14.5377779006958,
      "learning_rate": 3.3835105417779687e-05,
      "loss": 0.7877,
      "step": 10800
    },
    {
      "epoch": 1.4835059652001505,
      "grad_norm": 0.08112337440252304,
      "learning_rate": 3.30090938489844e-05,
      "loss": 0.6687,
      "step": 10850
    },
    {
      "epoch": 1.4903428708166684,
      "grad_norm": 7.454471588134766,
      "learning_rate": 3.219129236405548e-05,
      "loss": 0.8063,
      "step": 10900
    },
    {
      "epoch": 1.4971797764331862,
      "grad_norm": 5.5912275314331055,
      "learning_rate": 3.13818011888171e-05,
      "loss": 0.6337,
      "step": 10950
    },
    {
      "epoch": 1.5040166820497043,
      "grad_norm": 7.555117130279541,
      "learning_rate": 3.0580719530621705e-05,
      "loss": 0.6513,
      "step": 11000
    },
    {
      "epoch": 1.5108535876662224,
      "grad_norm": 0.4277037978172302,
      "learning_rate": 2.9788145566191693e-05,
      "loss": 0.603,
      "step": 11050
    },
    {
      "epoch": 1.5176904932827402,
      "grad_norm": 0.3563739061355591,
      "learning_rate": 2.900417642958734e-05,
      "loss": 0.5695,
      "step": 11100
    },
    {
      "epoch": 1.524527398899258,
      "grad_norm": 0.8669344782829285,
      "learning_rate": 2.822890820030264e-05,
      "loss": 0.7372,
      "step": 11150
    },
    {
      "epoch": 1.531364304515776,
      "grad_norm": 10.977109909057617,
      "learning_rate": 2.7462435891490036e-05,
      "loss": 0.6573,
      "step": 11200
    },
    {
      "epoch": 1.5382012101322942,
      "grad_norm": 0.33039143681526184,
      "learning_rate": 2.6704853438316213e-05,
      "loss": 0.4278,
      "step": 11250
    },
    {
      "epoch": 1.5450381157488122,
      "grad_norm": 3.340820550918579,
      "learning_rate": 2.5956253686449882e-05,
      "loss": 0.6281,
      "step": 11300
    },
    {
      "epoch": 1.55187502136533,
      "grad_norm": 6.152026176452637,
      "learning_rate": 2.521672838068295e-05,
      "loss": 0.6859,
      "step": 11350
    },
    {
      "epoch": 1.558711926981848,
      "grad_norm": 0.9645776152610779,
      "learning_rate": 2.4486368153686734e-05,
      "loss": 0.578,
      "step": 11400
    },
    {
      "epoch": 1.565548832598366,
      "grad_norm": 3.5073535442352295,
      "learning_rate": 2.3765262514904617e-05,
      "loss": 0.6756,
      "step": 11450
    },
    {
      "epoch": 1.572385738214884,
      "grad_norm": 1.3473198413848877,
      "learning_rate": 2.305349983958196e-05,
      "loss": 0.6288,
      "step": 11500
    },
    {
      "epoch": 1.5792226438314019,
      "grad_norm": 6.039999961853027,
      "learning_rate": 2.2351167357935422e-05,
      "loss": 0.6274,
      "step": 11550
    },
    {
      "epoch": 1.5860595494479197,
      "grad_norm": 0.9115678668022156,
      "learning_rate": 2.1658351144462362e-05,
      "loss": 0.6303,
      "step": 11600
    },
    {
      "epoch": 1.5928964550644378,
      "grad_norm": 37.31045150756836,
      "learning_rate": 2.097513610739209e-05,
      "loss": 0.7243,
      "step": 11650
    },
    {
      "epoch": 1.5997333606809558,
      "grad_norm": 0.5089764595031738,
      "learning_rate": 2.0301605978279702e-05,
      "loss": 0.507,
      "step": 11700
    },
    {
      "epoch": 1.606570266297474,
      "grad_norm": 16.424047470092773,
      "learning_rate": 1.9637843301744528e-05,
      "loss": 0.6387,
      "step": 11750
    },
    {
      "epoch": 1.6134071719139917,
      "grad_norm": 0.6381849646568298,
      "learning_rate": 1.898392942535383e-05,
      "loss": 0.7143,
      "step": 11800
    },
    {
      "epoch": 1.6202440775305096,
      "grad_norm": 7.240786075592041,
      "learning_rate": 1.833994448965315e-05,
      "loss": 0.7644,
      "step": 11850
    },
    {
      "epoch": 1.6270809831470276,
      "grad_norm": 0.6397457122802734,
      "learning_rate": 1.7705967418344737e-05,
      "loss": 0.5355,
      "step": 11900
    },
    {
      "epoch": 1.6339178887635457,
      "grad_norm": 0.49821093678474426,
      "learning_rate": 1.7082075908615013e-05,
      "loss": 0.7372,
      "step": 11950
    },
    {
      "epoch": 1.6407547943800636,
      "grad_norm": 0.550399124622345,
      "learning_rate": 1.6468346421612447e-05,
      "loss": 0.7474,
      "step": 12000
    },
    {
      "epoch": 1.6407547943800636,
      "eval_loss": 0.26388460397720337,
      "eval_runtime": 300.1264,
      "eval_samples_per_second": 26.909,
      "eval_steps_per_second": 3.365,
      "step": 12000
    },
    {
      "epoch": 1.6475916999965814,
      "grad_norm": 0.1512337028980255,
      "learning_rate": 1.5864854173076714e-05,
      "loss": 0.6831,
      "step": 12050
    },
    {
      "epoch": 1.6544286056130995,
      "grad_norm": 40.49404525756836,
      "learning_rate": 1.52716731241207e-05,
      "loss": 0.7483,
      "step": 12100
    },
    {
      "epoch": 1.6612655112296175,
      "grad_norm": 0.5297091007232666,
      "learning_rate": 1.4688875972166227e-05,
      "loss": 0.5595,
      "step": 12150
    },
    {
      "epoch": 1.6681024168461356,
      "grad_norm": 12.922277450561523,
      "learning_rate": 1.4116534142034488e-05,
      "loss": 0.5817,
      "step": 12200
    },
    {
      "epoch": 1.6749393224626534,
      "grad_norm": 0.4216732382774353,
      "learning_rate": 1.3554717777192605e-05,
      "loss": 0.8905,
      "step": 12250
    },
    {
      "epoch": 1.6817762280791713,
      "grad_norm": 1.1882590055465698,
      "learning_rate": 1.3003495731157312e-05,
      "loss": 0.5435,
      "step": 12300
    },
    {
      "epoch": 1.6886131336956893,
      "grad_norm": 15.241290092468262,
      "learning_rate": 1.2462935559056366e-05,
      "loss": 0.5636,
      "step": 12350
    },
    {
      "epoch": 1.6954500393122074,
      "grad_norm": 1.281235933303833,
      "learning_rate": 1.1933103509349508e-05,
      "loss": 0.4771,
      "step": 12400
    },
    {
      "epoch": 1.7022869449287252,
      "grad_norm": 30.664819717407227,
      "learning_rate": 1.1414064515709255e-05,
      "loss": 0.5598,
      "step": 12450
    },
    {
      "epoch": 1.709123850545243,
      "grad_norm": 3.1145246028900146,
      "learning_rate": 1.0905882189063032e-05,
      "loss": 0.5779,
      "step": 12500
    },
    {
      "epoch": 1.7159607561617611,
      "grad_norm": 4.802779674530029,
      "learning_rate": 1.0408618809797255e-05,
      "loss": 0.5402,
      "step": 12550
    },
    {
      "epoch": 1.7227976617782792,
      "grad_norm": 3.566648006439209,
      "learning_rate": 9.92233532012452e-06,
      "loss": 0.816,
      "step": 12600
    },
    {
      "epoch": 1.7296345673947973,
      "grad_norm": 0.9611634016036987,
      "learning_rate": 9.447091316614965e-06,
      "loss": 0.5813,
      "step": 12650
    },
    {
      "epoch": 1.736471473011315,
      "grad_norm": 2.433220148086548,
      "learning_rate": 8.9829450428922e-06,
      "loss": 0.5628,
      "step": 12700
    },
    {
      "epoch": 1.743308378627833,
      "grad_norm": 0.1846768856048584,
      "learning_rate": 8.529953382495404e-06,
      "loss": 0.7646,
      "step": 12750
    },
    {
      "epoch": 1.750145284244351,
      "grad_norm": 1.4401239156723022,
      "learning_rate": 8.088171851907855e-06,
      "loss": 0.5705,
      "step": 12800
    },
    {
      "epoch": 1.756982189860869,
      "grad_norm": 25.80792236328125,
      "learning_rate": 7.657654593753195e-06,
      "loss": 0.6362,
      "step": 12850
    },
    {
      "epoch": 1.763819095477387,
      "grad_norm": 0.8399425148963928,
      "learning_rate": 7.2384543701598416e-06,
      "loss": 0.7085,
      "step": 12900
    },
    {
      "epoch": 1.7706560010939048,
      "grad_norm": 0.8096999526023865,
      "learning_rate": 6.83062255629483e-06,
      "loss": 0.5368,
      "step": 12950
    },
    {
      "epoch": 1.7774929067104228,
      "grad_norm": 8.902669906616211,
      "learning_rate": 6.43420913406747e-06,
      "loss": 0.5753,
      "step": 13000
    },
    {
      "epoch": 1.7843298123269409,
      "grad_norm": 0.15432903170585632,
      "learning_rate": 6.049262686003787e-06,
      "loss": 0.6055,
      "step": 13050
    },
    {
      "epoch": 1.791166717943459,
      "grad_norm": 14.938940048217773,
      "learning_rate": 5.6758303892925025e-06,
      "loss": 0.7965,
      "step": 13100
    },
    {
      "epoch": 1.7980036235599768,
      "grad_norm": 0.20640145242214203,
      "learning_rate": 5.313958010003261e-06,
      "loss": 0.5362,
      "step": 13150
    },
    {
      "epoch": 1.8048405291764946,
      "grad_norm": 0.42624762654304504,
      "learning_rate": 4.963689897477664e-06,
      "loss": 0.6298,
      "step": 13200
    },
    {
      "epoch": 1.8116774347930127,
      "grad_norm": 14.088078498840332,
      "learning_rate": 4.625068978894131e-06,
      "loss": 0.5166,
      "step": 13250
    },
    {
      "epoch": 1.8185143404095307,
      "grad_norm": 8.906865119934082,
      "learning_rate": 4.298136754006854e-06,
      "loss": 0.6144,
      "step": 13300
    },
    {
      "epoch": 1.8253512460260486,
      "grad_norm": 0.16211865842342377,
      "learning_rate": 3.982933290059887e-06,
      "loss": 0.446,
      "step": 13350
    },
    {
      "epoch": 1.8321881516425664,
      "grad_norm": 25.307283401489258,
      "learning_rate": 3.6794972168766594e-06,
      "loss": 0.525,
      "step": 13400
    },
    {
      "epoch": 1.8390250572590845,
      "grad_norm": 41.81796646118164,
      "learning_rate": 3.387865722125594e-06,
      "loss": 0.7377,
      "step": 13450
    },
    {
      "epoch": 1.8458619628756026,
      "grad_norm": 0.09296048432588577,
      "learning_rate": 3.10807454676274e-06,
      "loss": 0.5175,
      "step": 13500
    },
    {
      "epoch": 1.8526988684921206,
      "grad_norm": 113.21685791015625,
      "learning_rate": 2.8401579806514035e-06,
      "loss": 0.7324,
      "step": 13550
    },
    {
      "epoch": 1.8595357741086385,
      "grad_norm": 13.23887825012207,
      "learning_rate": 2.5841488583597696e-06,
      "loss": 0.4255,
      "step": 13600
    },
    {
      "epoch": 1.8663726797251563,
      "grad_norm": 0.3335596024990082,
      "learning_rate": 2.3400785551369043e-06,
      "loss": 0.4865,
      "step": 13650
    },
    {
      "epoch": 1.8732095853416744,
      "grad_norm": 1.1101493835449219,
      "learning_rate": 2.1079769830674836e-06,
      "loss": 0.5834,
      "step": 13700
    },
    {
      "epoch": 1.8800464909581924,
      "grad_norm": 0.44824355840682983,
      "learning_rate": 1.8878725874060144e-06,
      "loss": 0.6434,
      "step": 13750
    },
    {
      "epoch": 1.8868833965747103,
      "grad_norm": 0.7179256081581116,
      "learning_rate": 1.6797923430905583e-06,
      "loss": 0.5649,
      "step": 13800
    },
    {
      "epoch": 1.893720302191228,
      "grad_norm": 0.6279736757278442,
      "learning_rate": 1.4837617514370073e-06,
      "loss": 0.6663,
      "step": 13850
    },
    {
      "epoch": 1.9005572078077462,
      "grad_norm": 2.146757125854492,
      "learning_rate": 1.2998048370135963e-06,
      "loss": 0.5003,
      "step": 13900
    },
    {
      "epoch": 1.9073941134242642,
      "grad_norm": 0.2452065795660019,
      "learning_rate": 1.127944144696691e-06,
      "loss": 0.7167,
      "step": 13950
    },
    {
      "epoch": 1.9142310190407823,
      "grad_norm": 0.2389650195837021,
      "learning_rate": 9.682007369077095e-07,
      "loss": 0.5836,
      "step": 14000
    },
    {
      "epoch": 1.9142310190407823,
      "eval_loss": 0.2555805742740631,
      "eval_runtime": 299.5823,
      "eval_samples_per_second": 26.958,
      "eval_steps_per_second": 3.371,
      "step": 14000
    },
    {
      "epoch": 1.9210679246573001,
      "grad_norm": 20.409788131713867,
      "learning_rate": 8.205941910318426e-07,
      "loss": 0.5573,
      "step": 14050
    },
    {
      "epoch": 1.927904830273818,
      "grad_norm": 0.6842173933982849,
      "learning_rate": 6.851425970187952e-07,
      "loss": 0.5594,
      "step": 14100
    },
    {
      "epoch": 1.934741735890336,
      "grad_norm": 11.089654922485352,
      "learning_rate": 5.618625551656708e-07,
      "loss": 0.6967,
      "step": 14150
    },
    {
      "epoch": 1.941578641506854,
      "grad_norm": 12.126336097717285,
      "learning_rate": 4.507691740825881e-07,
      "loss": 0.677,
      "step": 14200
    },
    {
      "epoch": 1.948415547123372,
      "grad_norm": 0.44369152188301086,
      "learning_rate": 3.518760688410283e-07,
      "loss": 0.6566,
      "step": 14250
    },
    {
      "epoch": 1.9552524527398898,
      "grad_norm": 11.187239646911621,
      "learning_rate": 2.651953593052481e-07,
      "loss": 0.5174,
      "step": 14300
    },
    {
      "epoch": 1.9620893583564079,
      "grad_norm": 15.362393379211426,
      "learning_rate": 1.907376686468787e-07,
      "loss": 0.5426,
      "step": 14350
    },
    {
      "epoch": 1.968926263972926,
      "grad_norm": 0.2329702377319336,
      "learning_rate": 1.2851212204304518e-07,
      "loss": 0.6944,
      "step": 14400
    },
    {
      "epoch": 1.975763169589444,
      "grad_norm": 0.7811570763587952,
      "learning_rate": 7.852634555803873e-08,
      "loss": 0.5647,
      "step": 14450
    },
    {
      "epoch": 1.9826000752059618,
      "grad_norm": 1.2399488687515259,
      "learning_rate": 4.078646520866425e-08,
      "loss": 0.6162,
      "step": 14500
    },
    {
      "epoch": 1.9894369808224797,
      "grad_norm": 0.4023188352584839,
      "learning_rate": 1.5297106213485458e-08,
      "loss": 0.4718,
      "step": 14550
    },
    {
      "epoch": 1.9962738864389977,
      "grad_norm": 0.1795218139886856,
      "learning_rate": 2.061392425978248e-09,
      "loss": 0.5667,
      "step": 14600
    }
  ],
  "logging_steps": 50,
  "max_steps": 14628,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8538290358499676e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}