diff --git "a/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json" "b/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json" @@ -0,0 +1,7493 @@ +{ + "best_global_step": 48000, + "best_metric": 3.543606996536255, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_drop_frequency_3591/checkpoint-40000", + "epoch": 14.560312190576038, + "eval_steps": 1000, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014561127613722406, + "grad_norm": 1.0968835353851318, + "learning_rate": 0.000294, + "loss": 8.4166, + "step": 50 + }, + { + "epoch": 0.029122255227444813, + "grad_norm": 0.6513845920562744, + "learning_rate": 0.0005939999999999999, + "loss": 6.7305, + "step": 100 + }, + { + "epoch": 0.04368338284116722, + "grad_norm": 0.5097090601921082, + "learning_rate": 0.0005998286713286713, + "loss": 6.3288, + "step": 150 + }, + { + "epoch": 0.058244510454889625, + "grad_norm": 0.42854636907577515, + "learning_rate": 0.0005996538461538461, + "loss": 6.1467, + "step": 200 + }, + { + "epoch": 0.07280563806861204, + "grad_norm": 0.5062187314033508, + "learning_rate": 0.0005994790209790209, + "loss": 5.9942, + "step": 250 + }, + { + "epoch": 0.08736676568233444, + "grad_norm": 0.39580509066581726, + "learning_rate": 0.0005993041958041958, + "loss": 5.8474, + "step": 300 + }, + { + "epoch": 0.10192789329605685, + "grad_norm": 0.4567234516143799, + "learning_rate": 0.0005991293706293705, + "loss": 5.7198, + "step": 350 + }, + { + "epoch": 0.11648902090977925, + "grad_norm": 0.5529834628105164, + "learning_rate": 0.0005989545454545454, + "loss": 5.6138, + "step": 400 + }, + { + "epoch": 0.13105014852350166, + "grad_norm": 0.4305090010166168, + "learning_rate": 0.0005987797202797202, + "loss": 5.4936, + "step": 450 + }, + { + "epoch": 0.14561127613722408, + "grad_norm": 0.45783543586730957, + "learning_rate": 0.000598604895104895, + "loss": 5.4016, + "step": 500 + }, + { + "epoch": 0.16017240375094646, + "grad_norm": 0.4605506658554077, + "learning_rate": 0.0005984300699300698, + "loss": 5.3398, + "step": 550 + }, + { + "epoch": 0.17473353136466888, + "grad_norm": 0.582597553730011, + "learning_rate": 0.0005982552447552447, + "loss": 5.2616, + "step": 600 + }, + { + "epoch": 0.1892946589783913, + "grad_norm": 0.47175198793411255, + "learning_rate": 0.0005980804195804195, + "loss": 5.1831, + "step": 650 + }, + { + "epoch": 0.2038557865921137, + "grad_norm": 0.5068243145942688, + "learning_rate": 0.0005979055944055943, + "loss": 5.1231, + "step": 700 + }, + { + "epoch": 0.2184169142058361, + "grad_norm": 0.4487193822860718, + "learning_rate": 0.0005977307692307691, + "loss": 5.0754, + "step": 750 + }, + { + "epoch": 0.2329780418195585, + "grad_norm": 0.51069575548172, + "learning_rate": 0.000597555944055944, + "loss": 5.0179, + "step": 800 + }, + { + "epoch": 0.24753916943328091, + "grad_norm": 0.5181578397750854, + "learning_rate": 0.0005973811188811188, + "loss": 4.9711, + "step": 850 + }, + { + "epoch": 0.2621002970470033, + "grad_norm": 0.5237641334533691, + "learning_rate": 0.0005972062937062936, + "loss": 4.9212, + "step": 900 + }, + { + "epoch": 0.27666142466072574, + "grad_norm": 0.5211111903190613, + "learning_rate": 0.0005970314685314685, + "loss": 4.869, + "step": 950 + }, + { + "epoch": 0.29122255227444815, + "grad_norm": 0.44638922810554504, + "learning_rate": 0.0005968566433566433, + "loss": 4.8185, + "step": 1000 + }, + { + "epoch": 0.29122255227444815, + "eval_accuracy": 0.2571676232670328, + "eval_loss": 4.735123634338379, + "eval_runtime": 181.933, + "eval_samples_per_second": 91.468, + "eval_steps_per_second": 5.722, + "step": 1000 + }, + { + "epoch": 0.30578367988817057, + "grad_norm": 0.4172416627407074, + "learning_rate": 0.0005966818181818181, + "loss": 4.7639, + "step": 1050 + }, + { + "epoch": 0.3203448075018929, + "grad_norm": 0.4456934928894043, + "learning_rate": 0.0005965069930069929, + "loss": 4.7177, + "step": 1100 + }, + { + "epoch": 0.33490593511561534, + "grad_norm": 0.47358137369155884, + "learning_rate": 0.0005963321678321677, + "loss": 4.693, + "step": 1150 + }, + { + "epoch": 0.34946706272933775, + "grad_norm": 0.4430226683616638, + "learning_rate": 0.0005961573426573425, + "loss": 4.659, + "step": 1200 + }, + { + "epoch": 0.36402819034306017, + "grad_norm": 0.4078950583934784, + "learning_rate": 0.0005959825174825174, + "loss": 4.6245, + "step": 1250 + }, + { + "epoch": 0.3785893179567826, + "grad_norm": 0.44066765904426575, + "learning_rate": 0.0005958076923076922, + "loss": 4.5934, + "step": 1300 + }, + { + "epoch": 0.393150445570505, + "grad_norm": 0.43180859088897705, + "learning_rate": 0.000595632867132867, + "loss": 4.5672, + "step": 1350 + }, + { + "epoch": 0.4077115731842274, + "grad_norm": 0.41051167249679565, + "learning_rate": 0.0005954580419580418, + "loss": 4.552, + "step": 1400 + }, + { + "epoch": 0.4222727007979498, + "grad_norm": 0.4435499906539917, + "learning_rate": 0.0005952832167832168, + "loss": 4.5272, + "step": 1450 + }, + { + "epoch": 0.4368338284116722, + "grad_norm": 0.4016467332839966, + "learning_rate": 0.0005951083916083916, + "loss": 4.4965, + "step": 1500 + }, + { + "epoch": 0.4513949560253946, + "grad_norm": 0.42740699648857117, + "learning_rate": 0.0005949335664335664, + "loss": 4.4734, + "step": 1550 + }, + { + "epoch": 0.465956083639117, + "grad_norm": 0.41071462631225586, + "learning_rate": 0.0005947587412587413, + "loss": 4.4678, + "step": 1600 + }, + { + "epoch": 0.4805172112528394, + "grad_norm": 0.41863691806793213, + "learning_rate": 0.0005945839160839161, + "loss": 4.4373, + "step": 1650 + }, + { + "epoch": 0.49507833886656183, + "grad_norm": 0.4083499610424042, + "learning_rate": 0.0005944090909090909, + "loss": 4.4181, + "step": 1700 + }, + { + "epoch": 0.5096394664802842, + "grad_norm": 0.45506301522254944, + "learning_rate": 0.0005942342657342657, + "loss": 4.4017, + "step": 1750 + }, + { + "epoch": 0.5242005940940067, + "grad_norm": 0.4013260006904602, + "learning_rate": 0.0005940594405594406, + "loss": 4.3976, + "step": 1800 + }, + { + "epoch": 0.5387617217077291, + "grad_norm": 0.37785711884498596, + "learning_rate": 0.0005938846153846153, + "loss": 4.3778, + "step": 1850 + }, + { + "epoch": 0.5533228493214515, + "grad_norm": 0.436885267496109, + "learning_rate": 0.0005937097902097902, + "loss": 4.3769, + "step": 1900 + }, + { + "epoch": 0.5678839769351739, + "grad_norm": 0.41713714599609375, + "learning_rate": 0.000593534965034965, + "loss": 4.3597, + "step": 1950 + }, + { + "epoch": 0.5824451045488963, + "grad_norm": 0.40815940499305725, + "learning_rate": 0.0005933601398601398, + "loss": 4.3311, + "step": 2000 + }, + { + "epoch": 0.5824451045488963, + "eval_accuracy": 0.2994343186746337, + "eval_loss": 4.2821149826049805, + "eval_runtime": 182.0522, + "eval_samples_per_second": 91.408, + "eval_steps_per_second": 5.718, + "step": 2000 + }, + { + "epoch": 0.5970062321626187, + "grad_norm": 0.3702036440372467, + "learning_rate": 0.0005931853146853146, + "loss": 4.3181, + "step": 2050 + }, + { + "epoch": 0.6115673597763411, + "grad_norm": 0.3854442238807678, + "learning_rate": 0.0005930104895104895, + "loss": 4.308, + "step": 2100 + }, + { + "epoch": 0.6261284873900634, + "grad_norm": 0.3813020884990692, + "learning_rate": 0.0005928356643356643, + "loss": 4.31, + "step": 2150 + }, + { + "epoch": 0.6406896150037859, + "grad_norm": 0.3868289291858673, + "learning_rate": 0.0005926608391608391, + "loss": 4.3031, + "step": 2200 + }, + { + "epoch": 0.6552507426175083, + "grad_norm": 0.3773530423641205, + "learning_rate": 0.000592486013986014, + "loss": 4.2951, + "step": 2250 + }, + { + "epoch": 0.6698118702312307, + "grad_norm": 0.43276000022888184, + "learning_rate": 0.0005923111888111888, + "loss": 4.2716, + "step": 2300 + }, + { + "epoch": 0.6843729978449531, + "grad_norm": 0.4105272591114044, + "learning_rate": 0.0005921363636363636, + "loss": 4.269, + "step": 2350 + }, + { + "epoch": 0.6989341254586755, + "grad_norm": 0.381144255399704, + "learning_rate": 0.0005919615384615384, + "loss": 4.2447, + "step": 2400 + }, + { + "epoch": 0.7134952530723979, + "grad_norm": 0.5546244978904724, + "learning_rate": 0.0005917867132867133, + "loss": 4.2573, + "step": 2450 + }, + { + "epoch": 0.7280563806861203, + "grad_norm": 0.37709176540374756, + "learning_rate": 0.0005916118881118881, + "loss": 4.2385, + "step": 2500 + }, + { + "epoch": 0.7426175082998427, + "grad_norm": 0.39420562982559204, + "learning_rate": 0.0005914370629370629, + "loss": 4.235, + "step": 2550 + }, + { + "epoch": 0.7571786359135652, + "grad_norm": 0.3711323142051697, + "learning_rate": 0.0005912622377622377, + "loss": 4.2138, + "step": 2600 + }, + { + "epoch": 0.7717397635272876, + "grad_norm": 0.37468209862709045, + "learning_rate": 0.0005910874125874125, + "loss": 4.2063, + "step": 2650 + }, + { + "epoch": 0.78630089114101, + "grad_norm": 0.36328959465026855, + "learning_rate": 0.0005909125874125873, + "loss": 4.1915, + "step": 2700 + }, + { + "epoch": 0.8008620187547324, + "grad_norm": 0.3596959412097931, + "learning_rate": 0.0005907377622377622, + "loss": 4.1859, + "step": 2750 + }, + { + "epoch": 0.8154231463684548, + "grad_norm": 0.3694506287574768, + "learning_rate": 0.000590562937062937, + "loss": 4.17, + "step": 2800 + }, + { + "epoch": 0.8299842739821772, + "grad_norm": 0.37155982851982117, + "learning_rate": 0.0005903881118881118, + "loss": 4.1644, + "step": 2850 + }, + { + "epoch": 0.8445454015958996, + "grad_norm": 0.3630830943584442, + "learning_rate": 0.0005902132867132867, + "loss": 4.1669, + "step": 2900 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 0.37551137804985046, + "learning_rate": 0.0005900384615384615, + "loss": 4.1477, + "step": 2950 + }, + { + "epoch": 0.8736676568233444, + "grad_norm": 0.38218334317207336, + "learning_rate": 0.0005898636363636363, + "loss": 4.1448, + "step": 3000 + }, + { + "epoch": 0.8736676568233444, + "eval_accuracy": 0.315657070793131, + "eval_loss": 4.097928524017334, + "eval_runtime": 180.4968, + "eval_samples_per_second": 92.196, + "eval_steps_per_second": 5.767, + "step": 3000 + }, + { + "epoch": 0.8882287844370668, + "grad_norm": 0.3394276201725006, + "learning_rate": 0.0005896888111888111, + "loss": 4.1382, + "step": 3050 + }, + { + "epoch": 0.9027899120507892, + "grad_norm": 0.35621699690818787, + "learning_rate": 0.000589513986013986, + "loss": 4.1264, + "step": 3100 + }, + { + "epoch": 0.9173510396645116, + "grad_norm": 0.40414878726005554, + "learning_rate": 0.0005893391608391608, + "loss": 4.1358, + "step": 3150 + }, + { + "epoch": 0.931912167278234, + "grad_norm": 0.34163355827331543, + "learning_rate": 0.0005891643356643356, + "loss": 4.1084, + "step": 3200 + }, + { + "epoch": 0.9464732948919564, + "grad_norm": 0.34562134742736816, + "learning_rate": 0.0005889895104895104, + "loss": 4.1068, + "step": 3250 + }, + { + "epoch": 0.9610344225056788, + "grad_norm": 0.34311237931251526, + "learning_rate": 0.0005888146853146853, + "loss": 4.1128, + "step": 3300 + }, + { + "epoch": 0.9755955501194012, + "grad_norm": 0.3517252206802368, + "learning_rate": 0.00058863986013986, + "loss": 4.0946, + "step": 3350 + }, + { + "epoch": 0.9901566777331237, + "grad_norm": 0.36632663011550903, + "learning_rate": 0.0005884650349650349, + "loss": 4.1048, + "step": 3400 + }, + { + "epoch": 1.004659560836391, + "grad_norm": 0.3214505910873413, + "learning_rate": 0.0005882902097902097, + "loss": 4.068, + "step": 3450 + }, + { + "epoch": 1.0192206884501136, + "grad_norm": 0.3312755525112152, + "learning_rate": 0.0005881153846153845, + "loss": 4.0171, + "step": 3500 + }, + { + "epoch": 1.033781816063836, + "grad_norm": 0.35106778144836426, + "learning_rate": 0.0005879405594405594, + "loss": 4.0175, + "step": 3550 + }, + { + "epoch": 1.0483429436775584, + "grad_norm": 0.3634989857673645, + "learning_rate": 0.0005877657342657342, + "loss": 4.007, + "step": 3600 + }, + { + "epoch": 1.0629040712912807, + "grad_norm": 0.3441219627857208, + "learning_rate": 0.000587590909090909, + "loss": 4.0123, + "step": 3650 + }, + { + "epoch": 1.0774651989050033, + "grad_norm": 0.33224573731422424, + "learning_rate": 0.0005874160839160838, + "loss": 4.0016, + "step": 3700 + }, + { + "epoch": 1.0920263265187256, + "grad_norm": 0.34425127506256104, + "learning_rate": 0.0005872412587412587, + "loss": 4.0066, + "step": 3750 + }, + { + "epoch": 1.106587454132448, + "grad_norm": 0.34838539361953735, + "learning_rate": 0.0005870664335664335, + "loss": 4.0148, + "step": 3800 + }, + { + "epoch": 1.1211485817461704, + "grad_norm": 0.3374408781528473, + "learning_rate": 0.0005868916083916083, + "loss": 3.9959, + "step": 3850 + }, + { + "epoch": 1.135709709359893, + "grad_norm": 0.35629168152809143, + "learning_rate": 0.0005867167832167831, + "loss": 3.9961, + "step": 3900 + }, + { + "epoch": 1.1502708369736152, + "grad_norm": 0.34352248907089233, + "learning_rate": 0.000586541958041958, + "loss": 3.9805, + "step": 3950 + }, + { + "epoch": 1.1648319645873377, + "grad_norm": 0.3234029710292816, + "learning_rate": 0.0005863671328671328, + "loss": 3.9959, + "step": 4000 + }, + { + "epoch": 1.1648319645873377, + "eval_accuracy": 0.3252184881351332, + "eval_loss": 3.9894917011260986, + "eval_runtime": 180.4599, + "eval_samples_per_second": 92.214, + "eval_steps_per_second": 5.769, + "step": 4000 + }, + { + "epoch": 1.17939309220106, + "grad_norm": 0.3508160710334778, + "learning_rate": 0.0005861923076923076, + "loss": 3.9981, + "step": 4050 + }, + { + "epoch": 1.1939542198147826, + "grad_norm": 0.35192403197288513, + "learning_rate": 0.0005860174825174824, + "loss": 3.9802, + "step": 4100 + }, + { + "epoch": 1.2085153474285049, + "grad_norm": 0.33519187569618225, + "learning_rate": 0.0005858426573426573, + "loss": 3.9831, + "step": 4150 + }, + { + "epoch": 1.2230764750422272, + "grad_norm": 0.3435879349708557, + "learning_rate": 0.000585667832167832, + "loss": 3.9778, + "step": 4200 + }, + { + "epoch": 1.2376376026559497, + "grad_norm": 0.3758980333805084, + "learning_rate": 0.000585493006993007, + "loss": 3.9786, + "step": 4250 + }, + { + "epoch": 1.2521987302696722, + "grad_norm": 0.3644406199455261, + "learning_rate": 0.0005853181818181817, + "loss": 3.9794, + "step": 4300 + }, + { + "epoch": 1.2667598578833945, + "grad_norm": 0.34182295203208923, + "learning_rate": 0.0005851433566433565, + "loss": 3.9657, + "step": 4350 + }, + { + "epoch": 1.2813209854971168, + "grad_norm": 0.35043004155158997, + "learning_rate": 0.0005849685314685315, + "loss": 3.9696, + "step": 4400 + }, + { + "epoch": 1.2958821131108393, + "grad_norm": 0.3384864926338196, + "learning_rate": 0.0005847937062937063, + "loss": 3.9709, + "step": 4450 + }, + { + "epoch": 1.3104432407245616, + "grad_norm": 0.3442433178424835, + "learning_rate": 0.0005846188811188811, + "loss": 3.9592, + "step": 4500 + }, + { + "epoch": 1.3250043683382842, + "grad_norm": 0.34257060289382935, + "learning_rate": 0.0005844440559440559, + "loss": 3.9582, + "step": 4550 + }, + { + "epoch": 1.3395654959520065, + "grad_norm": 0.35144466161727905, + "learning_rate": 0.0005842692307692308, + "loss": 3.9495, + "step": 4600 + }, + { + "epoch": 1.354126623565729, + "grad_norm": 0.3375908136367798, + "learning_rate": 0.0005840944055944056, + "loss": 3.9506, + "step": 4650 + }, + { + "epoch": 1.3686877511794513, + "grad_norm": 0.3664558231830597, + "learning_rate": 0.0005839195804195804, + "loss": 3.9559, + "step": 4700 + }, + { + "epoch": 1.3832488787931738, + "grad_norm": 0.3706004321575165, + "learning_rate": 0.0005837447552447552, + "loss": 3.9544, + "step": 4750 + }, + { + "epoch": 1.3978100064068961, + "grad_norm": 0.36268237233161926, + "learning_rate": 0.0005835699300699301, + "loss": 3.9448, + "step": 4800 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 0.3499133884906769, + "learning_rate": 0.0005833951048951048, + "loss": 3.941, + "step": 4850 + }, + { + "epoch": 1.426932261634341, + "grad_norm": 0.33515897393226624, + "learning_rate": 0.0005832202797202797, + "loss": 3.9399, + "step": 4900 + }, + { + "epoch": 1.4414933892480635, + "grad_norm": 0.3366197645664215, + "learning_rate": 0.0005830454545454546, + "loss": 3.9426, + "step": 4950 + }, + { + "epoch": 1.4560545168617858, + "grad_norm": 0.32890358567237854, + "learning_rate": 0.0005828706293706293, + "loss": 3.9195, + "step": 5000 + }, + { + "epoch": 1.4560545168617858, + "eval_accuracy": 0.33210278858796755, + "eval_loss": 3.913426637649536, + "eval_runtime": 180.5893, + "eval_samples_per_second": 92.148, + "eval_steps_per_second": 5.764, + "step": 5000 + }, + { + "epoch": 1.470615644475508, + "grad_norm": 0.32962432503700256, + "learning_rate": 0.0005826958041958042, + "loss": 3.9236, + "step": 5050 + }, + { + "epoch": 1.4851767720892306, + "grad_norm": 0.32921549677848816, + "learning_rate": 0.000582520979020979, + "loss": 3.9185, + "step": 5100 + }, + { + "epoch": 1.4997378997029531, + "grad_norm": 0.3333084285259247, + "learning_rate": 0.0005823461538461538, + "loss": 3.9211, + "step": 5150 + }, + { + "epoch": 1.5142990273166754, + "grad_norm": 0.33431023359298706, + "learning_rate": 0.0005821713286713286, + "loss": 3.9171, + "step": 5200 + }, + { + "epoch": 1.5288601549303977, + "grad_norm": 0.3071958124637604, + "learning_rate": 0.0005819965034965035, + "loss": 3.9336, + "step": 5250 + }, + { + "epoch": 1.5434212825441203, + "grad_norm": 0.33107051253318787, + "learning_rate": 0.0005818216783216783, + "loss": 3.9241, + "step": 5300 + }, + { + "epoch": 1.5579824101578428, + "grad_norm": 0.3333437442779541, + "learning_rate": 0.0005816468531468531, + "loss": 3.9097, + "step": 5350 + }, + { + "epoch": 1.572543537771565, + "grad_norm": 0.30821335315704346, + "learning_rate": 0.0005814720279720279, + "loss": 3.9082, + "step": 5400 + }, + { + "epoch": 1.5871046653852874, + "grad_norm": 0.33230161666870117, + "learning_rate": 0.0005812972027972028, + "loss": 3.9124, + "step": 5450 + }, + { + "epoch": 1.6016657929990097, + "grad_norm": 0.32524222135543823, + "learning_rate": 0.0005811223776223776, + "loss": 3.8887, + "step": 5500 + }, + { + "epoch": 1.6162269206127322, + "grad_norm": 0.3314615786075592, + "learning_rate": 0.0005809475524475524, + "loss": 3.8882, + "step": 5550 + }, + { + "epoch": 1.6307880482264547, + "grad_norm": 0.32449811697006226, + "learning_rate": 0.0005807727272727272, + "loss": 3.9018, + "step": 5600 + }, + { + "epoch": 1.645349175840177, + "grad_norm": 0.3151806890964508, + "learning_rate": 0.0005805979020979021, + "loss": 3.9122, + "step": 5650 + }, + { + "epoch": 1.6599103034538993, + "grad_norm": 0.31618261337280273, + "learning_rate": 0.0005804230769230769, + "loss": 3.8943, + "step": 5700 + }, + { + "epoch": 1.6744714310676219, + "grad_norm": 0.31507301330566406, + "learning_rate": 0.0005802482517482517, + "loss": 3.8947, + "step": 5750 + }, + { + "epoch": 1.6890325586813444, + "grad_norm": 0.3101233243942261, + "learning_rate": 0.0005800734265734265, + "loss": 3.8899, + "step": 5800 + }, + { + "epoch": 1.7035936862950667, + "grad_norm": 0.3337361514568329, + "learning_rate": 0.0005798986013986013, + "loss": 3.8903, + "step": 5850 + }, + { + "epoch": 1.718154813908789, + "grad_norm": 0.3244958519935608, + "learning_rate": 0.0005797237762237762, + "loss": 3.8904, + "step": 5900 + }, + { + "epoch": 1.7327159415225115, + "grad_norm": 0.3190813958644867, + "learning_rate": 0.000579548951048951, + "loss": 3.8707, + "step": 5950 + }, + { + "epoch": 1.747277069136234, + "grad_norm": 0.30000555515289307, + "learning_rate": 0.0005793741258741258, + "loss": 3.8847, + "step": 6000 + }, + { + "epoch": 1.747277069136234, + "eval_accuracy": 0.3372700416567149, + "eval_loss": 3.8552145957946777, + "eval_runtime": 180.7902, + "eval_samples_per_second": 92.046, + "eval_steps_per_second": 5.758, + "step": 6000 + }, + { + "epoch": 1.7618381967499563, + "grad_norm": 0.3228197395801544, + "learning_rate": 0.0005791993006993006, + "loss": 3.8787, + "step": 6050 + }, + { + "epoch": 1.7763993243636786, + "grad_norm": 0.3813265264034271, + "learning_rate": 0.0005790244755244755, + "loss": 3.8758, + "step": 6100 + }, + { + "epoch": 1.7909604519774012, + "grad_norm": 0.31867480278015137, + "learning_rate": 0.0005788496503496503, + "loss": 3.8588, + "step": 6150 + }, + { + "epoch": 1.8055215795911237, + "grad_norm": 0.3052184581756592, + "learning_rate": 0.0005786748251748251, + "loss": 3.8693, + "step": 6200 + }, + { + "epoch": 1.820082707204846, + "grad_norm": 0.33260828256607056, + "learning_rate": 0.0005784999999999999, + "loss": 3.8806, + "step": 6250 + }, + { + "epoch": 1.8346438348185683, + "grad_norm": 0.3406659960746765, + "learning_rate": 0.0005783251748251748, + "loss": 3.8749, + "step": 6300 + }, + { + "epoch": 1.8492049624322906, + "grad_norm": 0.3197321891784668, + "learning_rate": 0.0005781503496503496, + "loss": 3.8612, + "step": 6350 + }, + { + "epoch": 1.8637660900460131, + "grad_norm": 0.3296874761581421, + "learning_rate": 0.0005779755244755244, + "loss": 3.8534, + "step": 6400 + }, + { + "epoch": 1.8783272176597356, + "grad_norm": 0.33814582228660583, + "learning_rate": 0.0005778006993006993, + "loss": 3.867, + "step": 6450 + }, + { + "epoch": 1.892888345273458, + "grad_norm": 0.3142564594745636, + "learning_rate": 0.000577625874125874, + "loss": 3.8651, + "step": 6500 + }, + { + "epoch": 1.9074494728871803, + "grad_norm": 0.32239586114883423, + "learning_rate": 0.0005774510489510489, + "loss": 3.8486, + "step": 6550 + }, + { + "epoch": 1.9220106005009028, + "grad_norm": 0.3052440583705902, + "learning_rate": 0.0005772762237762237, + "loss": 3.8551, + "step": 6600 + }, + { + "epoch": 1.9365717281146253, + "grad_norm": 0.34700754284858704, + "learning_rate": 0.0005771013986013985, + "loss": 3.8433, + "step": 6650 + }, + { + "epoch": 1.9511328557283476, + "grad_norm": 0.32743698358535767, + "learning_rate": 0.0005769265734265733, + "loss": 3.8523, + "step": 6700 + }, + { + "epoch": 1.96569398334207, + "grad_norm": 0.3276400864124298, + "learning_rate": 0.0005767517482517482, + "loss": 3.8501, + "step": 6750 + }, + { + "epoch": 1.9802551109557924, + "grad_norm": 0.31817659735679626, + "learning_rate": 0.000576576923076923, + "loss": 3.8395, + "step": 6800 + }, + { + "epoch": 1.994816238569515, + "grad_norm": 0.30320706963539124, + "learning_rate": 0.0005764020979020978, + "loss": 3.8473, + "step": 6850 + }, + { + "epoch": 2.009319121672782, + "grad_norm": 0.33266910910606384, + "learning_rate": 0.0005762272727272726, + "loss": 3.7834, + "step": 6900 + }, + { + "epoch": 2.023880249286505, + "grad_norm": 0.3259655237197876, + "learning_rate": 0.0005760524475524475, + "loss": 3.7468, + "step": 6950 + }, + { + "epoch": 2.038441376900227, + "grad_norm": 0.32959362864494324, + "learning_rate": 0.0005758776223776223, + "loss": 3.7518, + "step": 7000 + }, + { + "epoch": 2.038441376900227, + "eval_accuracy": 0.3410902104309129, + "eval_loss": 3.8155248165130615, + "eval_runtime": 180.6704, + "eval_samples_per_second": 92.107, + "eval_steps_per_second": 5.762, + "step": 7000 + }, + { + "epoch": 2.0530025045139495, + "grad_norm": 0.3127276301383972, + "learning_rate": 0.0005757027972027971, + "loss": 3.7567, + "step": 7050 + }, + { + "epoch": 2.067563632127672, + "grad_norm": 0.3236212432384491, + "learning_rate": 0.000575527972027972, + "loss": 3.7529, + "step": 7100 + }, + { + "epoch": 2.0821247597413945, + "grad_norm": 0.32352373003959656, + "learning_rate": 0.0005753531468531468, + "loss": 3.7582, + "step": 7150 + }, + { + "epoch": 2.096685887355117, + "grad_norm": 0.3106796145439148, + "learning_rate": 0.0005751783216783216, + "loss": 3.769, + "step": 7200 + }, + { + "epoch": 2.111247014968839, + "grad_norm": 0.3407052755355835, + "learning_rate": 0.0005750034965034964, + "loss": 3.7593, + "step": 7250 + }, + { + "epoch": 2.1258081425825615, + "grad_norm": 0.32309308648109436, + "learning_rate": 0.0005748286713286712, + "loss": 3.7436, + "step": 7300 + }, + { + "epoch": 2.140369270196284, + "grad_norm": 0.3216754198074341, + "learning_rate": 0.000574653846153846, + "loss": 3.7569, + "step": 7350 + }, + { + "epoch": 2.1549303978100065, + "grad_norm": 0.32464084029197693, + "learning_rate": 0.000574479020979021, + "loss": 3.7569, + "step": 7400 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.31773078441619873, + "learning_rate": 0.0005743041958041958, + "loss": 3.7452, + "step": 7450 + }, + { + "epoch": 2.184052653037451, + "grad_norm": 0.33485111594200134, + "learning_rate": 0.0005741293706293706, + "loss": 3.7562, + "step": 7500 + }, + { + "epoch": 2.198613780651174, + "grad_norm": 0.31402260065078735, + "learning_rate": 0.0005739545454545454, + "loss": 3.7598, + "step": 7550 + }, + { + "epoch": 2.213174908264896, + "grad_norm": 0.3134409785270691, + "learning_rate": 0.0005737797202797203, + "loss": 3.7545, + "step": 7600 + }, + { + "epoch": 2.2277360358786185, + "grad_norm": 0.29751038551330566, + "learning_rate": 0.0005736048951048951, + "loss": 3.7576, + "step": 7650 + }, + { + "epoch": 2.2422971634923408, + "grad_norm": 0.3134411573410034, + "learning_rate": 0.0005734300699300699, + "loss": 3.7487, + "step": 7700 + }, + { + "epoch": 2.256858291106063, + "grad_norm": 0.30340442061424255, + "learning_rate": 0.0005732552447552448, + "loss": 3.7514, + "step": 7750 + }, + { + "epoch": 2.271419418719786, + "grad_norm": 0.3226783871650696, + "learning_rate": 0.0005730804195804196, + "loss": 3.7542, + "step": 7800 + }, + { + "epoch": 2.285980546333508, + "grad_norm": 0.331408828496933, + "learning_rate": 0.0005729055944055944, + "loss": 3.7647, + "step": 7850 + }, + { + "epoch": 2.3005416739472304, + "grad_norm": 0.3096162974834442, + "learning_rate": 0.0005727307692307692, + "loss": 3.7471, + "step": 7900 + }, + { + "epoch": 2.3151028015609527, + "grad_norm": 0.3096749186515808, + "learning_rate": 0.0005725559440559441, + "loss": 3.7573, + "step": 7950 + }, + { + "epoch": 2.3296639291746755, + "grad_norm": 0.3250366151332855, + "learning_rate": 0.0005723811188811188, + "loss": 3.7545, + "step": 8000 + }, + { + "epoch": 2.3296639291746755, + "eval_accuracy": 0.3441775089018693, + "eval_loss": 3.785076856613159, + "eval_runtime": 182.534, + "eval_samples_per_second": 91.167, + "eval_steps_per_second": 5.703, + "step": 8000 + }, + { + "epoch": 2.3442250567883978, + "grad_norm": 0.334833025932312, + "learning_rate": 0.0005722062937062937, + "loss": 3.7573, + "step": 8050 + }, + { + "epoch": 2.35878618440212, + "grad_norm": 0.3055669963359833, + "learning_rate": 0.0005720314685314685, + "loss": 3.7556, + "step": 8100 + }, + { + "epoch": 2.3733473120158424, + "grad_norm": 0.31652265787124634, + "learning_rate": 0.0005718566433566433, + "loss": 3.7427, + "step": 8150 + }, + { + "epoch": 2.387908439629565, + "grad_norm": 0.31866180896759033, + "learning_rate": 0.0005716818181818181, + "loss": 3.7445, + "step": 8200 + }, + { + "epoch": 2.4024695672432874, + "grad_norm": 0.3108992278575897, + "learning_rate": 0.000571506993006993, + "loss": 3.7576, + "step": 8250 + }, + { + "epoch": 2.4170306948570097, + "grad_norm": 0.32130104303359985, + "learning_rate": 0.0005713321678321678, + "loss": 3.749, + "step": 8300 + }, + { + "epoch": 2.431591822470732, + "grad_norm": 0.3208393156528473, + "learning_rate": 0.0005711573426573426, + "loss": 3.7552, + "step": 8350 + }, + { + "epoch": 2.4461529500844543, + "grad_norm": 0.33237773180007935, + "learning_rate": 0.0005709825174825175, + "loss": 3.7437, + "step": 8400 + }, + { + "epoch": 2.460714077698177, + "grad_norm": 0.329561322927475, + "learning_rate": 0.0005708076923076923, + "loss": 3.7517, + "step": 8450 + }, + { + "epoch": 2.4752752053118994, + "grad_norm": 0.3131866753101349, + "learning_rate": 0.0005706328671328671, + "loss": 3.7548, + "step": 8500 + }, + { + "epoch": 2.4898363329256217, + "grad_norm": 0.3359507620334625, + "learning_rate": 0.0005704580419580419, + "loss": 3.7339, + "step": 8550 + }, + { + "epoch": 2.5043974605393444, + "grad_norm": 0.32358288764953613, + "learning_rate": 0.0005702832167832168, + "loss": 3.7537, + "step": 8600 + }, + { + "epoch": 2.5189585881530667, + "grad_norm": 0.3077840805053711, + "learning_rate": 0.0005701083916083916, + "loss": 3.7491, + "step": 8650 + }, + { + "epoch": 2.533519715766789, + "grad_norm": 0.306542307138443, + "learning_rate": 0.0005699335664335664, + "loss": 3.7501, + "step": 8700 + }, + { + "epoch": 2.5480808433805113, + "grad_norm": 0.31777438521385193, + "learning_rate": 0.0005697587412587412, + "loss": 3.7588, + "step": 8750 + }, + { + "epoch": 2.5626419709942336, + "grad_norm": 0.2995713949203491, + "learning_rate": 0.000569583916083916, + "loss": 3.7534, + "step": 8800 + }, + { + "epoch": 2.5772030986079564, + "grad_norm": 0.28951001167297363, + "learning_rate": 0.0005694090909090908, + "loss": 3.7181, + "step": 8850 + }, + { + "epoch": 2.5917642262216787, + "grad_norm": 0.3006434142589569, + "learning_rate": 0.0005692342657342657, + "loss": 3.7345, + "step": 8900 + }, + { + "epoch": 2.606325353835401, + "grad_norm": 0.341970831155777, + "learning_rate": 0.0005690594405594405, + "loss": 3.7367, + "step": 8950 + }, + { + "epoch": 2.6208864814491233, + "grad_norm": 0.33957773447036743, + "learning_rate": 0.0005688846153846153, + "loss": 3.7406, + "step": 9000 + }, + { + "epoch": 2.6208864814491233, + "eval_accuracy": 0.3470423120882088, + "eval_loss": 3.755228042602539, + "eval_runtime": 182.4889, + "eval_samples_per_second": 91.189, + "eval_steps_per_second": 5.704, + "step": 9000 + }, + { + "epoch": 2.6354476090628456, + "grad_norm": 0.3154286742210388, + "learning_rate": 0.0005687097902097901, + "loss": 3.7347, + "step": 9050 + }, + { + "epoch": 2.6500087366765683, + "grad_norm": 0.3290162682533264, + "learning_rate": 0.000568534965034965, + "loss": 3.7492, + "step": 9100 + }, + { + "epoch": 2.6645698642902906, + "grad_norm": 0.3163735568523407, + "learning_rate": 0.0005683601398601398, + "loss": 3.7292, + "step": 9150 + }, + { + "epoch": 2.679130991904013, + "grad_norm": 0.31252047419548035, + "learning_rate": 0.0005681853146853146, + "loss": 3.7412, + "step": 9200 + }, + { + "epoch": 2.6936921195177357, + "grad_norm": 0.31522566080093384, + "learning_rate": 0.0005680104895104895, + "loss": 3.7257, + "step": 9250 + }, + { + "epoch": 2.708253247131458, + "grad_norm": 0.320932000875473, + "learning_rate": 0.0005678356643356643, + "loss": 3.7274, + "step": 9300 + }, + { + "epoch": 2.7228143747451803, + "grad_norm": 0.30979084968566895, + "learning_rate": 0.0005676608391608391, + "loss": 3.7346, + "step": 9350 + }, + { + "epoch": 2.7373755023589026, + "grad_norm": 0.31152546405792236, + "learning_rate": 0.0005674860139860139, + "loss": 3.725, + "step": 9400 + }, + { + "epoch": 2.751936629972625, + "grad_norm": 0.30036696791648865, + "learning_rate": 0.0005673111888111888, + "loss": 3.7413, + "step": 9450 + }, + { + "epoch": 2.7664977575863476, + "grad_norm": 0.2985321879386902, + "learning_rate": 0.0005671363636363635, + "loss": 3.7299, + "step": 9500 + }, + { + "epoch": 2.78105888520007, + "grad_norm": 0.29972919821739197, + "learning_rate": 0.0005669615384615384, + "loss": 3.7392, + "step": 9550 + }, + { + "epoch": 2.7956200128137922, + "grad_norm": 0.3080502152442932, + "learning_rate": 0.0005667867132867132, + "loss": 3.7244, + "step": 9600 + }, + { + "epoch": 2.8101811404275145, + "grad_norm": 0.3224172294139862, + "learning_rate": 0.000566611888111888, + "loss": 3.7353, + "step": 9650 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.29166194796562195, + "learning_rate": 0.0005664370629370628, + "loss": 3.7261, + "step": 9700 + }, + { + "epoch": 2.8393033956549596, + "grad_norm": 0.3214750587940216, + "learning_rate": 0.0005662622377622377, + "loss": 3.7281, + "step": 9750 + }, + { + "epoch": 2.853864523268682, + "grad_norm": 0.3150324821472168, + "learning_rate": 0.0005660874125874125, + "loss": 3.7247, + "step": 9800 + }, + { + "epoch": 2.868425650882404, + "grad_norm": 0.31625422835350037, + "learning_rate": 0.0005659125874125873, + "loss": 3.7244, + "step": 9850 + }, + { + "epoch": 2.882986778496127, + "grad_norm": 0.3253997564315796, + "learning_rate": 0.0005657377622377622, + "loss": 3.7113, + "step": 9900 + }, + { + "epoch": 2.8975479061098492, + "grad_norm": 0.2979266941547394, + "learning_rate": 0.000565562937062937, + "loss": 3.7287, + "step": 9950 + }, + { + "epoch": 2.9121090337235715, + "grad_norm": 0.28829896450042725, + "learning_rate": 0.0005653881118881118, + "loss": 3.7104, + "step": 10000 + }, + { + "epoch": 2.9121090337235715, + "eval_accuracy": 0.3495516167304694, + "eval_loss": 3.725857734680176, + "eval_runtime": 182.3878, + "eval_samples_per_second": 91.24, + "eval_steps_per_second": 5.708, + "step": 10000 + }, + { + "epoch": 2.926670161337294, + "grad_norm": 0.3077658414840698, + "learning_rate": 0.0005652132867132866, + "loss": 3.7278, + "step": 10050 + }, + { + "epoch": 2.941231288951016, + "grad_norm": 0.2965807020664215, + "learning_rate": 0.0005650384615384615, + "loss": 3.7136, + "step": 10100 + }, + { + "epoch": 2.955792416564739, + "grad_norm": 0.3523092269897461, + "learning_rate": 0.0005648636363636363, + "loss": 3.7209, + "step": 10150 + }, + { + "epoch": 2.970353544178461, + "grad_norm": 0.32154616713523865, + "learning_rate": 0.0005646888111888111, + "loss": 3.71, + "step": 10200 + }, + { + "epoch": 2.9849146717921835, + "grad_norm": 0.32815372943878174, + "learning_rate": 0.000564513986013986, + "loss": 3.7295, + "step": 10250 + }, + { + "epoch": 2.9994757994059063, + "grad_norm": 0.3026379346847534, + "learning_rate": 0.0005643391608391607, + "loss": 3.7079, + "step": 10300 + }, + { + "epoch": 3.0139786825091734, + "grad_norm": 0.32696130871772766, + "learning_rate": 0.0005641643356643355, + "loss": 3.6198, + "step": 10350 + }, + { + "epoch": 3.0285398101228957, + "grad_norm": 0.3219397962093353, + "learning_rate": 0.0005639895104895105, + "loss": 3.6018, + "step": 10400 + }, + { + "epoch": 3.0431009377366185, + "grad_norm": 0.32359954714775085, + "learning_rate": 0.0005638146853146853, + "loss": 3.6216, + "step": 10450 + }, + { + "epoch": 3.057662065350341, + "grad_norm": 0.33311325311660767, + "learning_rate": 0.0005636398601398601, + "loss": 3.6286, + "step": 10500 + }, + { + "epoch": 3.072223192964063, + "grad_norm": 0.3124459385871887, + "learning_rate": 0.000563465034965035, + "loss": 3.6296, + "step": 10550 + }, + { + "epoch": 3.0867843205777854, + "grad_norm": 0.3220648169517517, + "learning_rate": 0.0005632902097902098, + "loss": 3.6115, + "step": 10600 + }, + { + "epoch": 3.101345448191508, + "grad_norm": 0.3088878095149994, + "learning_rate": 0.0005631153846153846, + "loss": 3.6037, + "step": 10650 + }, + { + "epoch": 3.1159065758052304, + "grad_norm": 0.30444589257240295, + "learning_rate": 0.0005629405594405594, + "loss": 3.6469, + "step": 10700 + }, + { + "epoch": 3.1304677034189528, + "grad_norm": 0.32744741439819336, + "learning_rate": 0.0005627657342657343, + "loss": 3.6301, + "step": 10750 + }, + { + "epoch": 3.145028831032675, + "grad_norm": 0.3285087049007416, + "learning_rate": 0.0005625909090909091, + "loss": 3.6354, + "step": 10800 + }, + { + "epoch": 3.1595899586463974, + "grad_norm": 0.3226669430732727, + "learning_rate": 0.0005624160839160839, + "loss": 3.6348, + "step": 10850 + }, + { + "epoch": 3.17415108626012, + "grad_norm": 0.32627004384994507, + "learning_rate": 0.0005622412587412587, + "loss": 3.6261, + "step": 10900 + }, + { + "epoch": 3.1887122138738424, + "grad_norm": 0.3208872377872467, + "learning_rate": 0.0005620664335664336, + "loss": 3.6344, + "step": 10950 + }, + { + "epoch": 3.2032733414875647, + "grad_norm": 0.3367385268211365, + "learning_rate": 0.0005618916083916083, + "loss": 3.6416, + "step": 11000 + }, + { + "epoch": 3.2032733414875647, + "eval_accuracy": 0.35108838648700996, + "eval_loss": 3.7168452739715576, + "eval_runtime": 182.5337, + "eval_samples_per_second": 91.167, + "eval_steps_per_second": 5.703, + "step": 11000 + }, + { + "epoch": 3.217834469101287, + "grad_norm": 0.32166630029678345, + "learning_rate": 0.0005617167832167832, + "loss": 3.6309, + "step": 11050 + }, + { + "epoch": 3.2323955967150098, + "grad_norm": 0.32037273049354553, + "learning_rate": 0.000561541958041958, + "loss": 3.6488, + "step": 11100 + }, + { + "epoch": 3.246956724328732, + "grad_norm": 0.32968729734420776, + "learning_rate": 0.0005613671328671328, + "loss": 3.6462, + "step": 11150 + }, + { + "epoch": 3.2615178519424544, + "grad_norm": 0.3250342905521393, + "learning_rate": 0.0005611923076923077, + "loss": 3.6474, + "step": 11200 + }, + { + "epoch": 3.2760789795561767, + "grad_norm": 0.3201673924922943, + "learning_rate": 0.0005610174825174825, + "loss": 3.6506, + "step": 11250 + }, + { + "epoch": 3.2906401071698994, + "grad_norm": 0.3127792477607727, + "learning_rate": 0.0005608426573426573, + "loss": 3.6342, + "step": 11300 + }, + { + "epoch": 3.3052012347836217, + "grad_norm": 0.3302961587905884, + "learning_rate": 0.0005606678321678321, + "loss": 3.6471, + "step": 11350 + }, + { + "epoch": 3.319762362397344, + "grad_norm": 0.32816413044929504, + "learning_rate": 0.000560493006993007, + "loss": 3.6385, + "step": 11400 + }, + { + "epoch": 3.3343234900110663, + "grad_norm": 0.31450170278549194, + "learning_rate": 0.0005603181818181818, + "loss": 3.6462, + "step": 11450 + }, + { + "epoch": 3.3488846176247886, + "grad_norm": 0.32757627964019775, + "learning_rate": 0.0005601433566433566, + "loss": 3.646, + "step": 11500 + }, + { + "epoch": 3.3634457452385114, + "grad_norm": 0.32431358098983765, + "learning_rate": 0.0005599685314685314, + "loss": 3.6413, + "step": 11550 + }, + { + "epoch": 3.3780068728522337, + "grad_norm": 0.3174758851528168, + "learning_rate": 0.0005597937062937063, + "loss": 3.643, + "step": 11600 + }, + { + "epoch": 3.392568000465956, + "grad_norm": 0.3019546866416931, + "learning_rate": 0.0005596188811188811, + "loss": 3.6347, + "step": 11650 + }, + { + "epoch": 3.4071291280796787, + "grad_norm": 0.3143378794193268, + "learning_rate": 0.0005594440559440559, + "loss": 3.6397, + "step": 11700 + }, + { + "epoch": 3.421690255693401, + "grad_norm": 0.3293830454349518, + "learning_rate": 0.0005592692307692307, + "loss": 3.6487, + "step": 11750 + }, + { + "epoch": 3.4362513833071233, + "grad_norm": 0.3159070909023285, + "learning_rate": 0.0005590944055944055, + "loss": 3.6391, + "step": 11800 + }, + { + "epoch": 3.4508125109208456, + "grad_norm": 0.31916487216949463, + "learning_rate": 0.0005589195804195803, + "loss": 3.6553, + "step": 11850 + }, + { + "epoch": 3.465373638534568, + "grad_norm": 0.30976319313049316, + "learning_rate": 0.0005587447552447552, + "loss": 3.6392, + "step": 11900 + }, + { + "epoch": 3.4799347661482907, + "grad_norm": 0.3121052086353302, + "learning_rate": 0.00055856993006993, + "loss": 3.6477, + "step": 11950 + }, + { + "epoch": 3.494495893762013, + "grad_norm": 0.31760597229003906, + "learning_rate": 0.0005583951048951048, + "loss": 3.6462, + "step": 12000 + }, + { + "epoch": 3.494495893762013, + "eval_accuracy": 0.3529633678918372, + "eval_loss": 3.695136785507202, + "eval_runtime": 181.8852, + "eval_samples_per_second": 91.492, + "eval_steps_per_second": 5.723, + "step": 12000 + }, + { + "epoch": 3.5090570213757353, + "grad_norm": 0.324222594499588, + "learning_rate": 0.0005582202797202797, + "loss": 3.6489, + "step": 12050 + }, + { + "epoch": 3.523618148989458, + "grad_norm": 0.3073160946369171, + "learning_rate": 0.0005580454545454545, + "loss": 3.6491, + "step": 12100 + }, + { + "epoch": 3.53817927660318, + "grad_norm": 0.333292692899704, + "learning_rate": 0.0005578706293706293, + "loss": 3.6485, + "step": 12150 + }, + { + "epoch": 3.5527404042169026, + "grad_norm": 0.32193732261657715, + "learning_rate": 0.0005576958041958041, + "loss": 3.6356, + "step": 12200 + }, + { + "epoch": 3.567301531830625, + "grad_norm": 0.305073618888855, + "learning_rate": 0.000557520979020979, + "loss": 3.6421, + "step": 12250 + }, + { + "epoch": 3.5818626594443472, + "grad_norm": 0.30666157603263855, + "learning_rate": 0.0005573461538461538, + "loss": 3.6479, + "step": 12300 + }, + { + "epoch": 3.59642378705807, + "grad_norm": 0.30662357807159424, + "learning_rate": 0.0005571713286713286, + "loss": 3.6384, + "step": 12350 + }, + { + "epoch": 3.6109849146717923, + "grad_norm": 0.3077661991119385, + "learning_rate": 0.0005569965034965034, + "loss": 3.6267, + "step": 12400 + }, + { + "epoch": 3.6255460422855146, + "grad_norm": 0.3115411698818207, + "learning_rate": 0.0005568216783216783, + "loss": 3.6399, + "step": 12450 + }, + { + "epoch": 3.640107169899237, + "grad_norm": 0.347708523273468, + "learning_rate": 0.000556646853146853, + "loss": 3.6376, + "step": 12500 + }, + { + "epoch": 3.654668297512959, + "grad_norm": 0.33384427428245544, + "learning_rate": 0.0005564720279720279, + "loss": 3.6411, + "step": 12550 + }, + { + "epoch": 3.669229425126682, + "grad_norm": 0.31211602687835693, + "learning_rate": 0.0005562972027972027, + "loss": 3.6455, + "step": 12600 + }, + { + "epoch": 3.6837905527404042, + "grad_norm": 0.3280918002128601, + "learning_rate": 0.0005561223776223775, + "loss": 3.6432, + "step": 12650 + }, + { + "epoch": 3.6983516803541265, + "grad_norm": 0.30643153190612793, + "learning_rate": 0.0005559475524475524, + "loss": 3.6345, + "step": 12700 + }, + { + "epoch": 3.7129128079678493, + "grad_norm": 0.32905712723731995, + "learning_rate": 0.0005557727272727272, + "loss": 3.6532, + "step": 12750 + }, + { + "epoch": 3.7274739355815716, + "grad_norm": 0.2936493158340454, + "learning_rate": 0.000555597902097902, + "loss": 3.6437, + "step": 12800 + }, + { + "epoch": 3.742035063195294, + "grad_norm": 0.3418334126472473, + "learning_rate": 0.0005554230769230768, + "loss": 3.6344, + "step": 12850 + }, + { + "epoch": 3.756596190809016, + "grad_norm": 0.32170310616493225, + "learning_rate": 0.0005552482517482517, + "loss": 3.6389, + "step": 12900 + }, + { + "epoch": 3.7711573184227385, + "grad_norm": 0.3068753480911255, + "learning_rate": 0.0005550734265734265, + "loss": 3.6438, + "step": 12950 + }, + { + "epoch": 3.7857184460364612, + "grad_norm": 0.31144753098487854, + "learning_rate": 0.0005548986013986013, + "loss": 3.6498, + "step": 13000 + }, + { + "epoch": 3.7857184460364612, + "eval_accuracy": 0.35463855041264525, + "eval_loss": 3.6793668270111084, + "eval_runtime": 181.7036, + "eval_samples_per_second": 91.583, + "eval_steps_per_second": 5.729, + "step": 13000 + }, + { + "epoch": 3.8002795736501835, + "grad_norm": 0.3221241235733032, + "learning_rate": 0.0005547237762237761, + "loss": 3.6364, + "step": 13050 + }, + { + "epoch": 3.814840701263906, + "grad_norm": 0.3083803653717041, + "learning_rate": 0.000554548951048951, + "loss": 3.6327, + "step": 13100 + }, + { + "epoch": 3.829401828877628, + "grad_norm": 0.3062104880809784, + "learning_rate": 0.0005543741258741258, + "loss": 3.6334, + "step": 13150 + }, + { + "epoch": 3.8439629564913504, + "grad_norm": 0.3108167350292206, + "learning_rate": 0.0005541993006993006, + "loss": 3.6318, + "step": 13200 + }, + { + "epoch": 3.858524084105073, + "grad_norm": 0.3138327896595001, + "learning_rate": 0.0005540244755244756, + "loss": 3.6419, + "step": 13250 + }, + { + "epoch": 3.8730852117187955, + "grad_norm": 0.3089863061904907, + "learning_rate": 0.0005538496503496502, + "loss": 3.6278, + "step": 13300 + }, + { + "epoch": 3.887646339332518, + "grad_norm": 0.301481693983078, + "learning_rate": 0.0005536748251748252, + "loss": 3.6352, + "step": 13350 + }, + { + "epoch": 3.9022074669462405, + "grad_norm": 0.315318763256073, + "learning_rate": 0.0005535, + "loss": 3.6306, + "step": 13400 + }, + { + "epoch": 3.916768594559963, + "grad_norm": 0.3293091058731079, + "learning_rate": 0.0005533251748251748, + "loss": 3.6413, + "step": 13450 + }, + { + "epoch": 3.931329722173685, + "grad_norm": 0.30823683738708496, + "learning_rate": 0.0005531503496503496, + "loss": 3.6494, + "step": 13500 + }, + { + "epoch": 3.9458908497874075, + "grad_norm": 0.30845707654953003, + "learning_rate": 0.0005529755244755245, + "loss": 3.6276, + "step": 13550 + }, + { + "epoch": 3.9604519774011298, + "grad_norm": 0.30484089255332947, + "learning_rate": 0.0005528006993006993, + "loss": 3.6249, + "step": 13600 + }, + { + "epoch": 3.9750131050148525, + "grad_norm": 0.31132036447525024, + "learning_rate": 0.0005526258741258741, + "loss": 3.6481, + "step": 13650 + }, + { + "epoch": 3.989574232628575, + "grad_norm": 0.2920059859752655, + "learning_rate": 0.0005524510489510489, + "loss": 3.6361, + "step": 13700 + }, + { + "epoch": 4.004077115731842, + "grad_norm": 0.31949788331985474, + "learning_rate": 0.0005522762237762238, + "loss": 3.6005, + "step": 13750 + }, + { + "epoch": 4.018638243345564, + "grad_norm": 0.3100167214870453, + "learning_rate": 0.0005521013986013986, + "loss": 3.5291, + "step": 13800 + }, + { + "epoch": 4.033199370959287, + "grad_norm": 0.3215591609477997, + "learning_rate": 0.0005519265734265734, + "loss": 3.5248, + "step": 13850 + }, + { + "epoch": 4.04776049857301, + "grad_norm": 0.31551459431648254, + "learning_rate": 0.0005517517482517482, + "loss": 3.5267, + "step": 13900 + }, + { + "epoch": 4.062321626186732, + "grad_norm": 0.3320501744747162, + "learning_rate": 0.0005515769230769231, + "loss": 3.5426, + "step": 13950 + }, + { + "epoch": 4.076882753800454, + "grad_norm": 0.32519182562828064, + "learning_rate": 0.0005514020979020979, + "loss": 3.533, + "step": 14000 + }, + { + "epoch": 4.076882753800454, + "eval_accuracy": 0.3559558824307633, + "eval_loss": 3.6741702556610107, + "eval_runtime": 181.4446, + "eval_samples_per_second": 91.714, + "eval_steps_per_second": 5.737, + "step": 14000 + }, + { + "epoch": 4.091443881414177, + "grad_norm": 0.3130313754081726, + "learning_rate": 0.0005512272727272727, + "loss": 3.5326, + "step": 14050 + }, + { + "epoch": 4.106005009027899, + "grad_norm": 0.315218061208725, + "learning_rate": 0.0005510524475524475, + "loss": 3.5588, + "step": 14100 + }, + { + "epoch": 4.120566136641622, + "grad_norm": 0.3174503743648529, + "learning_rate": 0.0005508776223776223, + "loss": 3.5427, + "step": 14150 + }, + { + "epoch": 4.135127264255344, + "grad_norm": 0.2998291254043579, + "learning_rate": 0.0005507027972027972, + "loss": 3.56, + "step": 14200 + }, + { + "epoch": 4.149688391869066, + "grad_norm": 0.31294891238212585, + "learning_rate": 0.000550527972027972, + "loss": 3.5353, + "step": 14250 + }, + { + "epoch": 4.164249519482789, + "grad_norm": 0.30971232056617737, + "learning_rate": 0.0005503531468531468, + "loss": 3.5362, + "step": 14300 + }, + { + "epoch": 4.178810647096511, + "grad_norm": 0.3312567174434662, + "learning_rate": 0.0005501783216783216, + "loss": 3.5566, + "step": 14350 + }, + { + "epoch": 4.193371774710234, + "grad_norm": 0.3150828778743744, + "learning_rate": 0.0005500034965034965, + "loss": 3.558, + "step": 14400 + }, + { + "epoch": 4.207932902323956, + "grad_norm": 0.3072719871997833, + "learning_rate": 0.0005498286713286713, + "loss": 3.5581, + "step": 14450 + }, + { + "epoch": 4.222494029937678, + "grad_norm": 0.34312841296195984, + "learning_rate": 0.0005496538461538461, + "loss": 3.5609, + "step": 14500 + }, + { + "epoch": 4.237055157551401, + "grad_norm": 0.33278995752334595, + "learning_rate": 0.0005494790209790209, + "loss": 3.5608, + "step": 14550 + }, + { + "epoch": 4.251616285165123, + "grad_norm": 0.31530410051345825, + "learning_rate": 0.0005493041958041958, + "loss": 3.559, + "step": 14600 + }, + { + "epoch": 4.266177412778846, + "grad_norm": 0.33409106731414795, + "learning_rate": 0.0005491293706293706, + "loss": 3.5706, + "step": 14650 + }, + { + "epoch": 4.280738540392568, + "grad_norm": 0.31736454367637634, + "learning_rate": 0.0005489545454545454, + "loss": 3.5602, + "step": 14700 + }, + { + "epoch": 4.29529966800629, + "grad_norm": 0.30737069249153137, + "learning_rate": 0.0005487797202797203, + "loss": 3.5574, + "step": 14750 + }, + { + "epoch": 4.309860795620013, + "grad_norm": 0.32270538806915283, + "learning_rate": 0.000548604895104895, + "loss": 3.5568, + "step": 14800 + }, + { + "epoch": 4.324421923233735, + "grad_norm": 0.3326343297958374, + "learning_rate": 0.0005484300699300699, + "loss": 3.5657, + "step": 14850 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 0.32561352849006653, + "learning_rate": 0.0005482552447552447, + "loss": 3.5686, + "step": 14900 + }, + { + "epoch": 4.35354417846118, + "grad_norm": 0.3027591407299042, + "learning_rate": 0.0005480804195804195, + "loss": 3.566, + "step": 14950 + }, + { + "epoch": 4.368105306074902, + "grad_norm": 0.31824353337287903, + "learning_rate": 0.0005479055944055943, + "loss": 3.5749, + "step": 15000 + }, + { + "epoch": 4.368105306074902, + "eval_accuracy": 0.35690054660694104, + "eval_loss": 3.66287899017334, + "eval_runtime": 181.6353, + "eval_samples_per_second": 91.618, + "eval_steps_per_second": 5.731, + "step": 15000 + }, + { + "epoch": 4.382666433688625, + "grad_norm": 0.3168264329433441, + "learning_rate": 0.0005477307692307692, + "loss": 3.5774, + "step": 15050 + }, + { + "epoch": 4.397227561302348, + "grad_norm": 0.3169305920600891, + "learning_rate": 0.000547555944055944, + "loss": 3.5736, + "step": 15100 + }, + { + "epoch": 4.41178868891607, + "grad_norm": 0.3356797397136688, + "learning_rate": 0.0005473811188811188, + "loss": 3.5811, + "step": 15150 + }, + { + "epoch": 4.426349816529792, + "grad_norm": 0.31550532579421997, + "learning_rate": 0.0005472062937062936, + "loss": 3.5761, + "step": 15200 + }, + { + "epoch": 4.440910944143514, + "grad_norm": 0.3461132049560547, + "learning_rate": 0.0005470314685314685, + "loss": 3.5665, + "step": 15250 + }, + { + "epoch": 4.455472071757237, + "grad_norm": 0.31411316990852356, + "learning_rate": 0.0005468566433566433, + "loss": 3.5696, + "step": 15300 + }, + { + "epoch": 4.47003319937096, + "grad_norm": 0.3277323544025421, + "learning_rate": 0.0005466818181818181, + "loss": 3.5694, + "step": 15350 + }, + { + "epoch": 4.4845943269846815, + "grad_norm": 0.31850603222846985, + "learning_rate": 0.000546506993006993, + "loss": 3.5765, + "step": 15400 + }, + { + "epoch": 4.499155454598404, + "grad_norm": 0.33567726612091064, + "learning_rate": 0.0005463321678321678, + "loss": 3.5728, + "step": 15450 + }, + { + "epoch": 4.513716582212126, + "grad_norm": 0.33373895287513733, + "learning_rate": 0.0005461573426573426, + "loss": 3.5827, + "step": 15500 + }, + { + "epoch": 4.528277709825849, + "grad_norm": 0.33843833208084106, + "learning_rate": 0.0005459825174825174, + "loss": 3.5677, + "step": 15550 + }, + { + "epoch": 4.542838837439572, + "grad_norm": 0.30723223090171814, + "learning_rate": 0.0005458076923076922, + "loss": 3.5769, + "step": 15600 + }, + { + "epoch": 4.5573999650532935, + "grad_norm": 0.3198756277561188, + "learning_rate": 0.000545632867132867, + "loss": 3.5735, + "step": 15650 + }, + { + "epoch": 4.571961092667016, + "grad_norm": 0.32363468408584595, + "learning_rate": 0.0005454580419580419, + "loss": 3.5751, + "step": 15700 + }, + { + "epoch": 4.586522220280738, + "grad_norm": 0.34676221013069153, + "learning_rate": 0.0005452832167832167, + "loss": 3.5806, + "step": 15750 + }, + { + "epoch": 4.601083347894461, + "grad_norm": 0.32409968972206116, + "learning_rate": 0.0005451083916083915, + "loss": 3.5709, + "step": 15800 + }, + { + "epoch": 4.615644475508184, + "grad_norm": 0.31708860397338867, + "learning_rate": 0.0005449335664335663, + "loss": 3.5786, + "step": 15850 + }, + { + "epoch": 4.630205603121905, + "grad_norm": 0.3069595694541931, + "learning_rate": 0.0005447587412587412, + "loss": 3.5719, + "step": 15900 + }, + { + "epoch": 4.644766730735628, + "grad_norm": 0.30829885601997375, + "learning_rate": 0.000544583916083916, + "loss": 3.5832, + "step": 15950 + }, + { + "epoch": 4.659327858349351, + "grad_norm": 0.3094317317008972, + "learning_rate": 0.0005444090909090908, + "loss": 3.5851, + "step": 16000 + }, + { + "epoch": 4.659327858349351, + "eval_accuracy": 0.3582172906354063, + "eval_loss": 3.6480820178985596, + "eval_runtime": 181.5728, + "eval_samples_per_second": 91.649, + "eval_steps_per_second": 5.733, + "step": 16000 + }, + { + "epoch": 4.673888985963073, + "grad_norm": 0.3228926360607147, + "learning_rate": 0.0005442342657342657, + "loss": 3.5847, + "step": 16050 + }, + { + "epoch": 4.6884501135767955, + "grad_norm": 0.31402644515037537, + "learning_rate": 0.0005440594405594405, + "loss": 3.5727, + "step": 16100 + }, + { + "epoch": 4.703011241190518, + "grad_norm": 0.32076942920684814, + "learning_rate": 0.0005438846153846153, + "loss": 3.5834, + "step": 16150 + }, + { + "epoch": 4.71757236880424, + "grad_norm": 0.32715633511543274, + "learning_rate": 0.0005437097902097901, + "loss": 3.5792, + "step": 16200 + }, + { + "epoch": 4.732133496417963, + "grad_norm": 0.33358898758888245, + "learning_rate": 0.0005435349650349651, + "loss": 3.5713, + "step": 16250 + }, + { + "epoch": 4.746694624031685, + "grad_norm": 0.31136220693588257, + "learning_rate": 0.0005433601398601397, + "loss": 3.5786, + "step": 16300 + }, + { + "epoch": 4.7612557516454075, + "grad_norm": 0.3148178160190582, + "learning_rate": 0.0005431853146853147, + "loss": 3.5716, + "step": 16350 + }, + { + "epoch": 4.77581687925913, + "grad_norm": 0.321322500705719, + "learning_rate": 0.0005430104895104895, + "loss": 3.572, + "step": 16400 + }, + { + "epoch": 4.790378006872852, + "grad_norm": 0.3095892071723938, + "learning_rate": 0.0005428356643356643, + "loss": 3.5684, + "step": 16450 + }, + { + "epoch": 4.804939134486575, + "grad_norm": 0.30092310905456543, + "learning_rate": 0.0005426608391608391, + "loss": 3.5772, + "step": 16500 + }, + { + "epoch": 4.819500262100297, + "grad_norm": 0.3169448673725128, + "learning_rate": 0.000542486013986014, + "loss": 3.5718, + "step": 16550 + }, + { + "epoch": 4.834061389714019, + "grad_norm": 0.31526780128479004, + "learning_rate": 0.0005423111888111888, + "loss": 3.5614, + "step": 16600 + }, + { + "epoch": 4.848622517327742, + "grad_norm": 0.3070712983608246, + "learning_rate": 0.0005421363636363636, + "loss": 3.5931, + "step": 16650 + }, + { + "epoch": 4.863183644941464, + "grad_norm": 0.3159180283546448, + "learning_rate": 0.0005419615384615385, + "loss": 3.58, + "step": 16700 + }, + { + "epoch": 4.877744772555187, + "grad_norm": 0.3154292702674866, + "learning_rate": 0.0005417867132867133, + "loss": 3.587, + "step": 16750 + }, + { + "epoch": 4.892305900168909, + "grad_norm": 0.3371017277240753, + "learning_rate": 0.0005416118881118881, + "loss": 3.5729, + "step": 16800 + }, + { + "epoch": 4.906867027782631, + "grad_norm": 0.3221917748451233, + "learning_rate": 0.0005414370629370629, + "loss": 3.5721, + "step": 16850 + }, + { + "epoch": 4.921428155396354, + "grad_norm": 0.30847296118736267, + "learning_rate": 0.0005412622377622378, + "loss": 3.59, + "step": 16900 + }, + { + "epoch": 4.935989283010076, + "grad_norm": 0.3217724859714508, + "learning_rate": 0.0005410874125874126, + "loss": 3.5694, + "step": 16950 + }, + { + "epoch": 4.950550410623799, + "grad_norm": 0.319769948720932, + "learning_rate": 0.0005409125874125874, + "loss": 3.5783, + "step": 17000 + }, + { + "epoch": 4.950550410623799, + "eval_accuracy": 0.3593890364154928, + "eval_loss": 3.6356794834136963, + "eval_runtime": 181.6809, + "eval_samples_per_second": 91.595, + "eval_steps_per_second": 5.73, + "step": 17000 + }, + { + "epoch": 4.9651115382375215, + "grad_norm": 0.33256304264068604, + "learning_rate": 0.0005407377622377622, + "loss": 3.5772, + "step": 17050 + }, + { + "epoch": 4.979672665851243, + "grad_norm": 0.31527116894721985, + "learning_rate": 0.000540562937062937, + "loss": 3.5829, + "step": 17100 + }, + { + "epoch": 4.994233793464966, + "grad_norm": 0.3263448476791382, + "learning_rate": 0.0005403881118881118, + "loss": 3.5755, + "step": 17150 + }, + { + "epoch": 5.008736676568233, + "grad_norm": 0.30820319056510925, + "learning_rate": 0.0005402132867132867, + "loss": 3.5017, + "step": 17200 + }, + { + "epoch": 5.023297804181956, + "grad_norm": 0.324663370847702, + "learning_rate": 0.0005400384615384615, + "loss": 3.4585, + "step": 17250 + }, + { + "epoch": 5.037858931795678, + "grad_norm": 0.33028483390808105, + "learning_rate": 0.0005398636363636363, + "loss": 3.4698, + "step": 17300 + }, + { + "epoch": 5.052420059409401, + "grad_norm": 0.30313462018966675, + "learning_rate": 0.0005396888111888111, + "loss": 3.471, + "step": 17350 + }, + { + "epoch": 5.066981187023123, + "grad_norm": 0.3187935948371887, + "learning_rate": 0.000539513986013986, + "loss": 3.4774, + "step": 17400 + }, + { + "epoch": 5.081542314636845, + "grad_norm": 0.3407951295375824, + "learning_rate": 0.0005393391608391608, + "loss": 3.4883, + "step": 17450 + }, + { + "epoch": 5.096103442250568, + "grad_norm": 0.3357192575931549, + "learning_rate": 0.0005391643356643356, + "loss": 3.4888, + "step": 17500 + }, + { + "epoch": 5.110664569864291, + "grad_norm": 0.3398742079734802, + "learning_rate": 0.0005389895104895105, + "loss": 3.4936, + "step": 17550 + }, + { + "epoch": 5.125225697478013, + "grad_norm": 0.3227720558643341, + "learning_rate": 0.0005388146853146853, + "loss": 3.4852, + "step": 17600 + }, + { + "epoch": 5.139786825091735, + "grad_norm": 0.3157457113265991, + "learning_rate": 0.0005386398601398601, + "loss": 3.4914, + "step": 17650 + }, + { + "epoch": 5.154347952705457, + "grad_norm": 0.3167363405227661, + "learning_rate": 0.0005384650349650349, + "loss": 3.4853, + "step": 17700 + }, + { + "epoch": 5.16890908031918, + "grad_norm": 0.31787964701652527, + "learning_rate": 0.0005382902097902098, + "loss": 3.4914, + "step": 17750 + }, + { + "epoch": 5.183470207932903, + "grad_norm": 0.3270482122898102, + "learning_rate": 0.0005381153846153845, + "loss": 3.5016, + "step": 17800 + }, + { + "epoch": 5.1980313355466246, + "grad_norm": 0.3301296830177307, + "learning_rate": 0.0005379405594405594, + "loss": 3.5044, + "step": 17850 + }, + { + "epoch": 5.212592463160347, + "grad_norm": 0.3604094386100769, + "learning_rate": 0.0005377657342657342, + "loss": 3.5047, + "step": 17900 + }, + { + "epoch": 5.227153590774069, + "grad_norm": 0.3370974361896515, + "learning_rate": 0.000537590909090909, + "loss": 3.5027, + "step": 17950 + }, + { + "epoch": 5.241714718387792, + "grad_norm": 0.35089635848999023, + "learning_rate": 0.0005374160839160838, + "loss": 3.4935, + "step": 18000 + }, + { + "epoch": 5.241714718387792, + "eval_accuracy": 0.36011414525531743, + "eval_loss": 3.6362621784210205, + "eval_runtime": 181.5644, + "eval_samples_per_second": 91.653, + "eval_steps_per_second": 5.734, + "step": 18000 + }, + { + "epoch": 5.256275846001515, + "grad_norm": 0.3056846857070923, + "learning_rate": 0.0005372412587412587, + "loss": 3.5168, + "step": 18050 + }, + { + "epoch": 5.2708369736152365, + "grad_norm": 0.32178497314453125, + "learning_rate": 0.0005370664335664335, + "loss": 3.5152, + "step": 18100 + }, + { + "epoch": 5.285398101228959, + "grad_norm": 0.33543261885643005, + "learning_rate": 0.0005368916083916083, + "loss": 3.5213, + "step": 18150 + }, + { + "epoch": 5.299959228842681, + "grad_norm": 0.31193044781684875, + "learning_rate": 0.0005367167832167832, + "loss": 3.5102, + "step": 18200 + }, + { + "epoch": 5.314520356456404, + "grad_norm": 0.31134116649627686, + "learning_rate": 0.000536541958041958, + "loss": 3.5019, + "step": 18250 + }, + { + "epoch": 5.329081484070127, + "grad_norm": 0.3064328730106354, + "learning_rate": 0.0005363671328671328, + "loss": 3.5082, + "step": 18300 + }, + { + "epoch": 5.3436426116838485, + "grad_norm": 0.3232489228248596, + "learning_rate": 0.0005361923076923076, + "loss": 3.5088, + "step": 18350 + }, + { + "epoch": 5.358203739297571, + "grad_norm": 0.3162420690059662, + "learning_rate": 0.0005360174825174825, + "loss": 3.5201, + "step": 18400 + }, + { + "epoch": 5.372764866911294, + "grad_norm": 0.29694801568984985, + "learning_rate": 0.0005358426573426573, + "loss": 3.5179, + "step": 18450 + }, + { + "epoch": 5.387325994525016, + "grad_norm": 0.35571807622909546, + "learning_rate": 0.0005356678321678321, + "loss": 3.5171, + "step": 18500 + }, + { + "epoch": 5.401887122138739, + "grad_norm": 0.32287997007369995, + "learning_rate": 0.0005354930069930069, + "loss": 3.507, + "step": 18550 + }, + { + "epoch": 5.41644824975246, + "grad_norm": 0.3234080970287323, + "learning_rate": 0.0005353181818181817, + "loss": 3.5237, + "step": 18600 + }, + { + "epoch": 5.431009377366183, + "grad_norm": 0.3096522092819214, + "learning_rate": 0.0005351433566433565, + "loss": 3.5226, + "step": 18650 + }, + { + "epoch": 5.445570504979906, + "grad_norm": 0.3458673357963562, + "learning_rate": 0.0005349685314685314, + "loss": 3.5318, + "step": 18700 + }, + { + "epoch": 5.460131632593628, + "grad_norm": 0.341791033744812, + "learning_rate": 0.0005347937062937062, + "loss": 3.5091, + "step": 18750 + }, + { + "epoch": 5.4746927602073505, + "grad_norm": 0.3296269476413727, + "learning_rate": 0.000534618881118881, + "loss": 3.5258, + "step": 18800 + }, + { + "epoch": 5.489253887821073, + "grad_norm": 0.3013812005519867, + "learning_rate": 0.0005344440559440559, + "loss": 3.5136, + "step": 18850 + }, + { + "epoch": 5.503815015434795, + "grad_norm": 0.30301058292388916, + "learning_rate": 0.0005342692307692307, + "loss": 3.522, + "step": 18900 + }, + { + "epoch": 5.518376143048518, + "grad_norm": 0.33094391226768494, + "learning_rate": 0.0005340944055944055, + "loss": 3.5293, + "step": 18950 + }, + { + "epoch": 5.53293727066224, + "grad_norm": 0.3317716717720032, + "learning_rate": 0.0005339195804195803, + "loss": 3.5133, + "step": 19000 + }, + { + "epoch": 5.53293727066224, + "eval_accuracy": 0.36096437829325656, + "eval_loss": 3.627488613128662, + "eval_runtime": 181.4247, + "eval_samples_per_second": 91.724, + "eval_steps_per_second": 5.738, + "step": 19000 + }, + { + "epoch": 5.5474983982759625, + "grad_norm": 0.32238489389419556, + "learning_rate": 0.0005337447552447552, + "loss": 3.5201, + "step": 19050 + }, + { + "epoch": 5.562059525889685, + "grad_norm": 0.30963394045829773, + "learning_rate": 0.00053356993006993, + "loss": 3.5149, + "step": 19100 + }, + { + "epoch": 5.576620653503407, + "grad_norm": 0.3175974190235138, + "learning_rate": 0.0005333951048951048, + "loss": 3.5136, + "step": 19150 + }, + { + "epoch": 5.59118178111713, + "grad_norm": 0.32928696274757385, + "learning_rate": 0.0005332202797202796, + "loss": 3.5253, + "step": 19200 + }, + { + "epoch": 5.605742908730852, + "grad_norm": 0.3137303292751312, + "learning_rate": 0.0005330454545454546, + "loss": 3.5249, + "step": 19250 + }, + { + "epoch": 5.620304036344574, + "grad_norm": 0.332732617855072, + "learning_rate": 0.0005328706293706292, + "loss": 3.5172, + "step": 19300 + }, + { + "epoch": 5.634865163958297, + "grad_norm": 0.34113192558288574, + "learning_rate": 0.0005326958041958042, + "loss": 3.5251, + "step": 19350 + }, + { + "epoch": 5.649426291572019, + "grad_norm": 0.3195572793483734, + "learning_rate": 0.000532520979020979, + "loss": 3.5343, + "step": 19400 + }, + { + "epoch": 5.663987419185742, + "grad_norm": 0.3289816975593567, + "learning_rate": 0.0005323461538461538, + "loss": 3.5341, + "step": 19450 + }, + { + "epoch": 5.6785485467994645, + "grad_norm": 0.31563058495521545, + "learning_rate": 0.0005321713286713287, + "loss": 3.5192, + "step": 19500 + }, + { + "epoch": 5.693109674413186, + "grad_norm": 0.3364487588405609, + "learning_rate": 0.0005319965034965035, + "loss": 3.5278, + "step": 19550 + }, + { + "epoch": 5.707670802026909, + "grad_norm": 0.31011462211608887, + "learning_rate": 0.0005318216783216783, + "loss": 3.5285, + "step": 19600 + }, + { + "epoch": 5.722231929640631, + "grad_norm": 0.31973445415496826, + "learning_rate": 0.0005316468531468531, + "loss": 3.5257, + "step": 19650 + }, + { + "epoch": 5.736793057254354, + "grad_norm": 0.33663949370384216, + "learning_rate": 0.000531472027972028, + "loss": 3.5427, + "step": 19700 + }, + { + "epoch": 5.7513541848680765, + "grad_norm": 0.320126473903656, + "learning_rate": 0.0005312972027972028, + "loss": 3.5276, + "step": 19750 + }, + { + "epoch": 5.765915312481798, + "grad_norm": 0.3229815363883972, + "learning_rate": 0.0005311223776223776, + "loss": 3.5301, + "step": 19800 + }, + { + "epoch": 5.780476440095521, + "grad_norm": 0.3471143841743469, + "learning_rate": 0.0005309475524475524, + "loss": 3.518, + "step": 19850 + }, + { + "epoch": 5.795037567709244, + "grad_norm": 0.33333975076675415, + "learning_rate": 0.0005307727272727273, + "loss": 3.5302, + "step": 19900 + }, + { + "epoch": 5.809598695322966, + "grad_norm": 0.31919199228286743, + "learning_rate": 0.0005305979020979021, + "loss": 3.5216, + "step": 19950 + }, + { + "epoch": 5.824159822936688, + "grad_norm": 0.2982664704322815, + "learning_rate": 0.0005304230769230769, + "loss": 3.5425, + "step": 20000 + }, + { + "epoch": 5.824159822936688, + "eval_accuracy": 0.3616781977317476, + "eval_loss": 3.6140241622924805, + "eval_runtime": 181.7884, + "eval_samples_per_second": 91.54, + "eval_steps_per_second": 5.726, + "step": 20000 + }, + { + "epoch": 5.83872095055041, + "grad_norm": 0.3217616081237793, + "learning_rate": 0.0005302482517482517, + "loss": 3.5286, + "step": 20050 + }, + { + "epoch": 5.853282078164133, + "grad_norm": 0.3139144480228424, + "learning_rate": 0.0005300734265734265, + "loss": 3.5229, + "step": 20100 + }, + { + "epoch": 5.867843205777856, + "grad_norm": 0.32186374068260193, + "learning_rate": 0.0005298986013986013, + "loss": 3.526, + "step": 20150 + }, + { + "epoch": 5.882404333391578, + "grad_norm": 0.3202000856399536, + "learning_rate": 0.0005297237762237762, + "loss": 3.5309, + "step": 20200 + }, + { + "epoch": 5.8969654610053, + "grad_norm": 0.3569493591785431, + "learning_rate": 0.000529548951048951, + "loss": 3.5113, + "step": 20250 + }, + { + "epoch": 5.911526588619022, + "grad_norm": 0.302033394575119, + "learning_rate": 0.0005293741258741258, + "loss": 3.5269, + "step": 20300 + }, + { + "epoch": 5.926087716232745, + "grad_norm": 0.31015071272850037, + "learning_rate": 0.0005291993006993007, + "loss": 3.5308, + "step": 20350 + }, + { + "epoch": 5.940648843846468, + "grad_norm": 0.3325614929199219, + "learning_rate": 0.0005290244755244755, + "loss": 3.5259, + "step": 20400 + }, + { + "epoch": 5.95520997146019, + "grad_norm": 0.3196668326854706, + "learning_rate": 0.0005288496503496503, + "loss": 3.5339, + "step": 20450 + }, + { + "epoch": 5.969771099073912, + "grad_norm": 0.313598096370697, + "learning_rate": 0.0005286748251748251, + "loss": 3.527, + "step": 20500 + }, + { + "epoch": 5.984332226687634, + "grad_norm": 0.32459649443626404, + "learning_rate": 0.0005285, + "loss": 3.5347, + "step": 20550 + }, + { + "epoch": 5.998893354301357, + "grad_norm": 0.31021371483802795, + "learning_rate": 0.0005283251748251748, + "loss": 3.5267, + "step": 20600 + }, + { + "epoch": 6.013396237404625, + "grad_norm": 0.3184387683868408, + "learning_rate": 0.0005281503496503496, + "loss": 3.4286, + "step": 20650 + }, + { + "epoch": 6.027957365018347, + "grad_norm": 0.3275751769542694, + "learning_rate": 0.0005279755244755244, + "loss": 3.4175, + "step": 20700 + }, + { + "epoch": 6.04251849263207, + "grad_norm": 0.320161372423172, + "learning_rate": 0.0005278006993006993, + "loss": 3.4261, + "step": 20750 + }, + { + "epoch": 6.0570796202457915, + "grad_norm": 0.33726227283477783, + "learning_rate": 0.000527625874125874, + "loss": 3.427, + "step": 20800 + }, + { + "epoch": 6.071640747859514, + "grad_norm": 0.3396624028682709, + "learning_rate": 0.0005274510489510489, + "loss": 3.4261, + "step": 20850 + }, + { + "epoch": 6.086201875473237, + "grad_norm": 0.3232564628124237, + "learning_rate": 0.0005272762237762238, + "loss": 3.4347, + "step": 20900 + }, + { + "epoch": 6.100763003086959, + "grad_norm": 0.31716442108154297, + "learning_rate": 0.0005271013986013985, + "loss": 3.4299, + "step": 20950 + }, + { + "epoch": 6.115324130700682, + "grad_norm": 0.32577764987945557, + "learning_rate": 0.0005269265734265734, + "loss": 3.4302, + "step": 21000 + }, + { + "epoch": 6.115324130700682, + "eval_accuracy": 0.3622031548937614, + "eval_loss": 3.619544267654419, + "eval_runtime": 181.3777, + "eval_samples_per_second": 91.748, + "eval_steps_per_second": 5.739, + "step": 21000 + }, + { + "epoch": 6.1298852583144035, + "grad_norm": 0.3372443914413452, + "learning_rate": 0.0005267517482517482, + "loss": 3.451, + "step": 21050 + }, + { + "epoch": 6.144446385928126, + "grad_norm": 0.32983914017677307, + "learning_rate": 0.000526576923076923, + "loss": 3.4585, + "step": 21100 + }, + { + "epoch": 6.159007513541849, + "grad_norm": 0.3527901768684387, + "learning_rate": 0.0005264020979020978, + "loss": 3.4515, + "step": 21150 + }, + { + "epoch": 6.173568641155571, + "grad_norm": 0.32249078154563904, + "learning_rate": 0.0005262272727272727, + "loss": 3.4478, + "step": 21200 + }, + { + "epoch": 6.1881297687692935, + "grad_norm": 0.3154282867908478, + "learning_rate": 0.0005260524475524475, + "loss": 3.4408, + "step": 21250 + }, + { + "epoch": 6.202690896383016, + "grad_norm": 0.3283529281616211, + "learning_rate": 0.0005258776223776223, + "loss": 3.4585, + "step": 21300 + }, + { + "epoch": 6.217252023996738, + "grad_norm": 0.3528301417827606, + "learning_rate": 0.0005257027972027971, + "loss": 3.4687, + "step": 21350 + }, + { + "epoch": 6.231813151610461, + "grad_norm": 0.3371451497077942, + "learning_rate": 0.000525527972027972, + "loss": 3.4452, + "step": 21400 + }, + { + "epoch": 6.246374279224183, + "grad_norm": 0.3369438350200653, + "learning_rate": 0.0005253531468531468, + "loss": 3.4643, + "step": 21450 + }, + { + "epoch": 6.2609354068379055, + "grad_norm": 0.3266349732875824, + "learning_rate": 0.0005251783216783216, + "loss": 3.4547, + "step": 21500 + }, + { + "epoch": 6.275496534451628, + "grad_norm": 0.32149258255958557, + "learning_rate": 0.0005250034965034965, + "loss": 3.4577, + "step": 21550 + }, + { + "epoch": 6.29005766206535, + "grad_norm": 0.31332188844680786, + "learning_rate": 0.0005248286713286712, + "loss": 3.4526, + "step": 21600 + }, + { + "epoch": 6.304618789679073, + "grad_norm": 0.34418395161628723, + "learning_rate": 0.0005246538461538461, + "loss": 3.4707, + "step": 21650 + }, + { + "epoch": 6.319179917292795, + "grad_norm": 0.3074207305908203, + "learning_rate": 0.0005244790209790209, + "loss": 3.4713, + "step": 21700 + }, + { + "epoch": 6.3337410449065175, + "grad_norm": 0.3123386800289154, + "learning_rate": 0.0005243041958041957, + "loss": 3.466, + "step": 21750 + }, + { + "epoch": 6.34830217252024, + "grad_norm": 0.32567399740219116, + "learning_rate": 0.0005241293706293705, + "loss": 3.4607, + "step": 21800 + }, + { + "epoch": 6.362863300133962, + "grad_norm": 0.3354513645172119, + "learning_rate": 0.0005239545454545454, + "loss": 3.4662, + "step": 21850 + }, + { + "epoch": 6.377424427747685, + "grad_norm": 0.32310134172439575, + "learning_rate": 0.0005237797202797202, + "loss": 3.473, + "step": 21900 + }, + { + "epoch": 6.391985555361408, + "grad_norm": 0.34556859731674194, + "learning_rate": 0.000523604895104895, + "loss": 3.4771, + "step": 21950 + }, + { + "epoch": 6.406546682975129, + "grad_norm": 0.3282891809940338, + "learning_rate": 0.0005234300699300698, + "loss": 3.4781, + "step": 22000 + }, + { + "epoch": 6.406546682975129, + "eval_accuracy": 0.36261039652728605, + "eval_loss": 3.612433910369873, + "eval_runtime": 179.6978, + "eval_samples_per_second": 92.605, + "eval_steps_per_second": 5.793, + "step": 22000 + }, + { + "epoch": 6.421107810588852, + "grad_norm": 0.3479907810688019, + "learning_rate": 0.0005232552447552447, + "loss": 3.4734, + "step": 22050 + }, + { + "epoch": 6.435668938202574, + "grad_norm": 0.36589229106903076, + "learning_rate": 0.0005230804195804195, + "loss": 3.4748, + "step": 22100 + }, + { + "epoch": 6.450230065816297, + "grad_norm": 0.33497488498687744, + "learning_rate": 0.0005229055944055943, + "loss": 3.4702, + "step": 22150 + }, + { + "epoch": 6.4647911934300195, + "grad_norm": 0.3153390884399414, + "learning_rate": 0.0005227307692307691, + "loss": 3.4821, + "step": 22200 + }, + { + "epoch": 6.479352321043741, + "grad_norm": 0.34096068143844604, + "learning_rate": 0.0005225559440559441, + "loss": 3.4717, + "step": 22250 + }, + { + "epoch": 6.493913448657464, + "grad_norm": 0.3129245340824127, + "learning_rate": 0.0005223811188811189, + "loss": 3.4865, + "step": 22300 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 0.32267722487449646, + "learning_rate": 0.0005222062937062937, + "loss": 3.4701, + "step": 22350 + }, + { + "epoch": 6.523035703884909, + "grad_norm": 0.31626445055007935, + "learning_rate": 0.0005220314685314686, + "loss": 3.4743, + "step": 22400 + }, + { + "epoch": 6.5375968314986315, + "grad_norm": 0.3249252438545227, + "learning_rate": 0.0005218566433566433, + "loss": 3.4776, + "step": 22450 + }, + { + "epoch": 6.552157959112353, + "grad_norm": 0.3492892384529114, + "learning_rate": 0.0005216818181818182, + "loss": 3.4883, + "step": 22500 + }, + { + "epoch": 6.566719086726076, + "grad_norm": 0.3136710524559021, + "learning_rate": 0.000521506993006993, + "loss": 3.4739, + "step": 22550 + }, + { + "epoch": 6.581280214339799, + "grad_norm": 0.3537392020225525, + "learning_rate": 0.0005213321678321678, + "loss": 3.4942, + "step": 22600 + }, + { + "epoch": 6.595841341953521, + "grad_norm": 0.31021302938461304, + "learning_rate": 0.0005211573426573426, + "loss": 3.4792, + "step": 22650 + }, + { + "epoch": 6.610402469567243, + "grad_norm": 0.29873308539390564, + "learning_rate": 0.0005209825174825175, + "loss": 3.4877, + "step": 22700 + }, + { + "epoch": 6.624963597180965, + "grad_norm": 0.32095155119895935, + "learning_rate": 0.0005208076923076923, + "loss": 3.4752, + "step": 22750 + }, + { + "epoch": 6.639524724794688, + "grad_norm": 0.31786495447158813, + "learning_rate": 0.0005206328671328671, + "loss": 3.4854, + "step": 22800 + }, + { + "epoch": 6.654085852408411, + "grad_norm": 0.34261924028396606, + "learning_rate": 0.0005204580419580419, + "loss": 3.4853, + "step": 22850 + }, + { + "epoch": 6.668646980022133, + "grad_norm": 0.3314582407474518, + "learning_rate": 0.0005202832167832168, + "loss": 3.4919, + "step": 22900 + }, + { + "epoch": 6.683208107635855, + "grad_norm": 0.3397805094718933, + "learning_rate": 0.0005201083916083916, + "loss": 3.4908, + "step": 22950 + }, + { + "epoch": 6.697769235249577, + "grad_norm": 0.33193162083625793, + "learning_rate": 0.0005199335664335664, + "loss": 3.4873, + "step": 23000 + }, + { + "epoch": 6.697769235249577, + "eval_accuracy": 0.363417235928849, + "eval_loss": 3.6028225421905518, + "eval_runtime": 179.443, + "eval_samples_per_second": 92.737, + "eval_steps_per_second": 5.801, + "step": 23000 + }, + { + "epoch": 6.7123303628633, + "grad_norm": 0.30862730741500854, + "learning_rate": 0.0005197587412587413, + "loss": 3.4914, + "step": 23050 + }, + { + "epoch": 6.726891490477023, + "grad_norm": 0.3028578460216522, + "learning_rate": 0.0005195839160839161, + "loss": 3.4778, + "step": 23100 + }, + { + "epoch": 6.741452618090745, + "grad_norm": 0.3111686706542969, + "learning_rate": 0.0005194090909090909, + "loss": 3.4941, + "step": 23150 + }, + { + "epoch": 6.756013745704467, + "grad_norm": 0.3356785476207733, + "learning_rate": 0.0005192342657342657, + "loss": 3.4939, + "step": 23200 + }, + { + "epoch": 6.77057487331819, + "grad_norm": 0.3174416720867157, + "learning_rate": 0.0005190594405594405, + "loss": 3.4902, + "step": 23250 + }, + { + "epoch": 6.785136000931912, + "grad_norm": 0.33599984645843506, + "learning_rate": 0.0005188846153846153, + "loss": 3.4959, + "step": 23300 + }, + { + "epoch": 6.799697128545635, + "grad_norm": 0.3432798385620117, + "learning_rate": 0.0005187097902097902, + "loss": 3.4871, + "step": 23350 + }, + { + "epoch": 6.814258256159357, + "grad_norm": 0.3143886625766754, + "learning_rate": 0.000518534965034965, + "loss": 3.5018, + "step": 23400 + }, + { + "epoch": 6.828819383773079, + "grad_norm": 0.3041757047176361, + "learning_rate": 0.0005183601398601398, + "loss": 3.4996, + "step": 23450 + }, + { + "epoch": 6.843380511386802, + "grad_norm": 0.3227998912334442, + "learning_rate": 0.0005181853146853146, + "loss": 3.488, + "step": 23500 + }, + { + "epoch": 6.857941639000524, + "grad_norm": 0.3264809846878052, + "learning_rate": 0.0005180104895104895, + "loss": 3.5016, + "step": 23550 + }, + { + "epoch": 6.872502766614247, + "grad_norm": 0.30348044633865356, + "learning_rate": 0.0005178356643356643, + "loss": 3.4928, + "step": 23600 + }, + { + "epoch": 6.887063894227969, + "grad_norm": 0.30934104323387146, + "learning_rate": 0.0005176608391608391, + "loss": 3.481, + "step": 23650 + }, + { + "epoch": 6.901625021841691, + "grad_norm": 0.3080596327781677, + "learning_rate": 0.000517486013986014, + "loss": 3.4931, + "step": 23700 + }, + { + "epoch": 6.916186149455414, + "grad_norm": 0.3390050530433655, + "learning_rate": 0.0005173111888111888, + "loss": 3.4935, + "step": 23750 + }, + { + "epoch": 6.930747277069136, + "grad_norm": 0.33725789189338684, + "learning_rate": 0.0005171363636363636, + "loss": 3.4894, + "step": 23800 + }, + { + "epoch": 6.945308404682859, + "grad_norm": 0.3304024934768677, + "learning_rate": 0.0005169615384615384, + "loss": 3.4811, + "step": 23850 + }, + { + "epoch": 6.959869532296581, + "grad_norm": 0.3428155481815338, + "learning_rate": 0.0005167867132867133, + "loss": 3.4964, + "step": 23900 + }, + { + "epoch": 6.974430659910303, + "grad_norm": 0.34181949496269226, + "learning_rate": 0.000516611888111888, + "loss": 3.4981, + "step": 23950 + }, + { + "epoch": 6.988991787524026, + "grad_norm": 0.3134143352508545, + "learning_rate": 0.0005164370629370629, + "loss": 3.4938, + "step": 24000 + }, + { + "epoch": 6.988991787524026, + "eval_accuracy": 0.3644916106224329, + "eval_loss": 3.5912303924560547, + "eval_runtime": 179.7006, + "eval_samples_per_second": 92.604, + "eval_steps_per_second": 5.793, + "step": 24000 + }, + { + "epoch": 7.003494670627293, + "grad_norm": 0.3197336792945862, + "learning_rate": 0.0005162622377622377, + "loss": 3.4804, + "step": 24050 + }, + { + "epoch": 7.018055798241016, + "grad_norm": 0.34031471610069275, + "learning_rate": 0.0005160874125874125, + "loss": 3.389, + "step": 24100 + }, + { + "epoch": 7.032616925854738, + "grad_norm": 0.31004777550697327, + "learning_rate": 0.0005159125874125873, + "loss": 3.3799, + "step": 24150 + }, + { + "epoch": 7.0471780534684605, + "grad_norm": 0.343368798494339, + "learning_rate": 0.0005157377622377622, + "loss": 3.3855, + "step": 24200 + }, + { + "epoch": 7.061739181082183, + "grad_norm": 0.3274615406990051, + "learning_rate": 0.000515562937062937, + "loss": 3.3952, + "step": 24250 + }, + { + "epoch": 7.076300308695905, + "grad_norm": 0.32230237126350403, + "learning_rate": 0.0005153881118881118, + "loss": 3.3911, + "step": 24300 + }, + { + "epoch": 7.090861436309628, + "grad_norm": 0.32454419136047363, + "learning_rate": 0.0005152132867132867, + "loss": 3.4076, + "step": 24350 + }, + { + "epoch": 7.105422563923351, + "grad_norm": 0.33376428484916687, + "learning_rate": 0.0005150384615384615, + "loss": 3.4129, + "step": 24400 + }, + { + "epoch": 7.1199836915370724, + "grad_norm": 0.3614583909511566, + "learning_rate": 0.0005148636363636363, + "loss": 3.4057, + "step": 24450 + }, + { + "epoch": 7.134544819150795, + "grad_norm": 0.34300723671913147, + "learning_rate": 0.0005146888111888111, + "loss": 3.405, + "step": 24500 + }, + { + "epoch": 7.149105946764517, + "grad_norm": 0.343777596950531, + "learning_rate": 0.000514513986013986, + "loss": 3.4118, + "step": 24550 + }, + { + "epoch": 7.16366707437824, + "grad_norm": 0.3267875611782074, + "learning_rate": 0.0005143391608391608, + "loss": 3.4238, + "step": 24600 + }, + { + "epoch": 7.1782282019919625, + "grad_norm": 0.37025755643844604, + "learning_rate": 0.0005141643356643356, + "loss": 3.4162, + "step": 24650 + }, + { + "epoch": 7.192789329605684, + "grad_norm": 0.3431757390499115, + "learning_rate": 0.0005139895104895104, + "loss": 3.4131, + "step": 24700 + }, + { + "epoch": 7.207350457219407, + "grad_norm": 0.36377280950546265, + "learning_rate": 0.0005138146853146852, + "loss": 3.4085, + "step": 24750 + }, + { + "epoch": 7.22191158483313, + "grad_norm": 0.3307691216468811, + "learning_rate": 0.00051363986013986, + "loss": 3.4203, + "step": 24800 + }, + { + "epoch": 7.236472712446852, + "grad_norm": 0.35206225514411926, + "learning_rate": 0.0005134650349650349, + "loss": 3.4279, + "step": 24850 + }, + { + "epoch": 7.2510338400605745, + "grad_norm": 0.336747944355011, + "learning_rate": 0.0005132902097902097, + "loss": 3.4326, + "step": 24900 + }, + { + "epoch": 7.265594967674296, + "grad_norm": 0.3248525857925415, + "learning_rate": 0.0005131153846153845, + "loss": 3.4251, + "step": 24950 + }, + { + "epoch": 7.280156095288019, + "grad_norm": 0.3397109806537628, + "learning_rate": 0.0005129405594405594, + "loss": 3.4288, + "step": 25000 + }, + { + "epoch": 7.280156095288019, + "eval_accuracy": 0.3642418326179263, + "eval_loss": 3.5990543365478516, + "eval_runtime": 179.5973, + "eval_samples_per_second": 92.657, + "eval_steps_per_second": 5.796, + "step": 25000 + }, + { + "epoch": 7.294717222901742, + "grad_norm": 0.3284904956817627, + "learning_rate": 0.0005127657342657342, + "loss": 3.4113, + "step": 25050 + }, + { + "epoch": 7.309278350515464, + "grad_norm": 0.33549222350120544, + "learning_rate": 0.000512590909090909, + "loss": 3.4227, + "step": 25100 + }, + { + "epoch": 7.3238394781291865, + "grad_norm": 0.3276277780532837, + "learning_rate": 0.0005124160839160838, + "loss": 3.4321, + "step": 25150 + }, + { + "epoch": 7.338400605742908, + "grad_norm": 0.3157896101474762, + "learning_rate": 0.0005122412587412588, + "loss": 3.4382, + "step": 25200 + }, + { + "epoch": 7.352961733356631, + "grad_norm": 0.3331218957901001, + "learning_rate": 0.0005120664335664336, + "loss": 3.4405, + "step": 25250 + }, + { + "epoch": 7.367522860970354, + "grad_norm": 0.321426659822464, + "learning_rate": 0.0005118916083916084, + "loss": 3.4389, + "step": 25300 + }, + { + "epoch": 7.382083988584076, + "grad_norm": 0.3326101005077362, + "learning_rate": 0.0005117167832167832, + "loss": 3.4309, + "step": 25350 + }, + { + "epoch": 7.396645116197798, + "grad_norm": 0.3259352445602417, + "learning_rate": 0.0005115419580419581, + "loss": 3.4294, + "step": 25400 + }, + { + "epoch": 7.411206243811521, + "grad_norm": 0.32202550768852234, + "learning_rate": 0.0005113671328671328, + "loss": 3.439, + "step": 25450 + }, + { + "epoch": 7.425767371425243, + "grad_norm": 0.32014110684394836, + "learning_rate": 0.0005111923076923077, + "loss": 3.4449, + "step": 25500 + }, + { + "epoch": 7.440328499038966, + "grad_norm": 0.34263157844543457, + "learning_rate": 0.0005110174825174825, + "loss": 3.4429, + "step": 25550 + }, + { + "epoch": 7.454889626652688, + "grad_norm": 0.31470826268196106, + "learning_rate": 0.0005108426573426573, + "loss": 3.4333, + "step": 25600 + }, + { + "epoch": 7.46945075426641, + "grad_norm": 0.3453708589076996, + "learning_rate": 0.0005106678321678321, + "loss": 3.4381, + "step": 25650 + }, + { + "epoch": 7.484011881880133, + "grad_norm": 0.3306244909763336, + "learning_rate": 0.000510493006993007, + "loss": 3.4464, + "step": 25700 + }, + { + "epoch": 7.498573009493855, + "grad_norm": 0.3169175088405609, + "learning_rate": 0.0005103181818181818, + "loss": 3.4453, + "step": 25750 + }, + { + "epoch": 7.513134137107578, + "grad_norm": 0.317241370677948, + "learning_rate": 0.0005101433566433566, + "loss": 3.4517, + "step": 25800 + }, + { + "epoch": 7.5276952647213005, + "grad_norm": 0.3119760751724243, + "learning_rate": 0.0005099685314685315, + "loss": 3.4369, + "step": 25850 + }, + { + "epoch": 7.542256392335022, + "grad_norm": 0.3233243227005005, + "learning_rate": 0.0005097937062937063, + "loss": 3.4422, + "step": 25900 + }, + { + "epoch": 7.556817519948745, + "grad_norm": 0.3211483955383301, + "learning_rate": 0.0005096188811188811, + "loss": 3.4554, + "step": 25950 + }, + { + "epoch": 7.571378647562467, + "grad_norm": 0.3340722918510437, + "learning_rate": 0.0005094440559440559, + "loss": 3.4497, + "step": 26000 + }, + { + "epoch": 7.571378647562467, + "eval_accuracy": 0.3650712508221565, + "eval_loss": 3.5916457176208496, + "eval_runtime": 179.7062, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.793, + "step": 26000 + }, + { + "epoch": 7.58593977517619, + "grad_norm": 0.322214812040329, + "learning_rate": 0.0005092692307692308, + "loss": 3.4458, + "step": 26050 + }, + { + "epoch": 7.600500902789912, + "grad_norm": 0.35813722014427185, + "learning_rate": 0.0005090944055944056, + "loss": 3.4463, + "step": 26100 + }, + { + "epoch": 7.615062030403634, + "grad_norm": 0.3351564407348633, + "learning_rate": 0.0005089195804195804, + "loss": 3.453, + "step": 26150 + }, + { + "epoch": 7.629623158017357, + "grad_norm": 0.335275262594223, + "learning_rate": 0.0005087447552447552, + "loss": 3.441, + "step": 26200 + }, + { + "epoch": 7.644184285631079, + "grad_norm": 0.3310592770576477, + "learning_rate": 0.00050856993006993, + "loss": 3.4408, + "step": 26250 + }, + { + "epoch": 7.658745413244802, + "grad_norm": 0.33791449666023254, + "learning_rate": 0.0005083951048951048, + "loss": 3.4483, + "step": 26300 + }, + { + "epoch": 7.673306540858524, + "grad_norm": 0.3487813174724579, + "learning_rate": 0.0005082202797202797, + "loss": 3.4584, + "step": 26350 + }, + { + "epoch": 7.687867668472246, + "grad_norm": 0.3040063679218292, + "learning_rate": 0.0005080454545454545, + "loss": 3.4539, + "step": 26400 + }, + { + "epoch": 7.702428796085969, + "grad_norm": 0.31800422072410583, + "learning_rate": 0.0005078706293706293, + "loss": 3.4514, + "step": 26450 + }, + { + "epoch": 7.716989923699691, + "grad_norm": 0.3236735165119171, + "learning_rate": 0.0005076958041958042, + "loss": 3.4515, + "step": 26500 + }, + { + "epoch": 7.731551051313414, + "grad_norm": 0.31341543793678284, + "learning_rate": 0.000507520979020979, + "loss": 3.4679, + "step": 26550 + }, + { + "epoch": 7.746112178927136, + "grad_norm": 0.32171645760536194, + "learning_rate": 0.0005073461538461538, + "loss": 3.4554, + "step": 26600 + }, + { + "epoch": 7.760673306540858, + "grad_norm": 0.31651511788368225, + "learning_rate": 0.0005071713286713286, + "loss": 3.4561, + "step": 26650 + }, + { + "epoch": 7.775234434154581, + "grad_norm": 0.3263193964958191, + "learning_rate": 0.0005069965034965035, + "loss": 3.4446, + "step": 26700 + }, + { + "epoch": 7.789795561768304, + "grad_norm": 0.3318195939064026, + "learning_rate": 0.0005068216783216783, + "loss": 3.4513, + "step": 26750 + }, + { + "epoch": 7.8043566893820255, + "grad_norm": 0.3138175308704376, + "learning_rate": 0.0005066468531468531, + "loss": 3.4507, + "step": 26800 + }, + { + "epoch": 7.818917816995748, + "grad_norm": 0.344470351934433, + "learning_rate": 0.0005064720279720279, + "loss": 3.4695, + "step": 26850 + }, + { + "epoch": 7.833478944609471, + "grad_norm": 0.32541370391845703, + "learning_rate": 0.0005062972027972028, + "loss": 3.4625, + "step": 26900 + }, + { + "epoch": 7.848040072223193, + "grad_norm": 0.33207646012306213, + "learning_rate": 0.0005061223776223775, + "loss": 3.4471, + "step": 26950 + }, + { + "epoch": 7.862601199836916, + "grad_norm": 0.3322581350803375, + "learning_rate": 0.0005059475524475524, + "loss": 3.4642, + "step": 27000 + }, + { + "epoch": 7.862601199836916, + "eval_accuracy": 0.3658501019162465, + "eval_loss": 3.5833911895751953, + "eval_runtime": 179.4681, + "eval_samples_per_second": 92.724, + "eval_steps_per_second": 5.8, + "step": 27000 + }, + { + "epoch": 7.8771623274506375, + "grad_norm": 0.3180125653743744, + "learning_rate": 0.0005057727272727272, + "loss": 3.4548, + "step": 27050 + }, + { + "epoch": 7.89172345506436, + "grad_norm": 0.32920128107070923, + "learning_rate": 0.000505597902097902, + "loss": 3.4555, + "step": 27100 + }, + { + "epoch": 7.906284582678083, + "grad_norm": 0.345284640789032, + "learning_rate": 0.0005054230769230769, + "loss": 3.4548, + "step": 27150 + }, + { + "epoch": 7.920845710291805, + "grad_norm": 0.29907023906707764, + "learning_rate": 0.0005052482517482517, + "loss": 3.4601, + "step": 27200 + }, + { + "epoch": 7.935406837905528, + "grad_norm": 0.32699817419052124, + "learning_rate": 0.0005050734265734265, + "loss": 3.4745, + "step": 27250 + }, + { + "epoch": 7.9499679655192494, + "grad_norm": 0.3353591859340668, + "learning_rate": 0.0005048986013986013, + "loss": 3.4594, + "step": 27300 + }, + { + "epoch": 7.964529093132972, + "grad_norm": 0.31299489736557007, + "learning_rate": 0.0005047237762237762, + "loss": 3.4708, + "step": 27350 + }, + { + "epoch": 7.979090220746695, + "grad_norm": 0.318194180727005, + "learning_rate": 0.000504548951048951, + "loss": 3.4673, + "step": 27400 + }, + { + "epoch": 7.993651348360417, + "grad_norm": 0.3500754237174988, + "learning_rate": 0.0005043741258741258, + "loss": 3.4694, + "step": 27450 + }, + { + "epoch": 8.008154231463685, + "grad_norm": 0.32221224904060364, + "learning_rate": 0.0005041993006993006, + "loss": 3.4022, + "step": 27500 + }, + { + "epoch": 8.022715359077408, + "grad_norm": 0.3506814241409302, + "learning_rate": 0.0005040244755244755, + "loss": 3.3639, + "step": 27550 + }, + { + "epoch": 8.037276486691129, + "grad_norm": 0.3434743285179138, + "learning_rate": 0.0005038496503496503, + "loss": 3.3689, + "step": 27600 + }, + { + "epoch": 8.051837614304851, + "grad_norm": 0.36473479866981506, + "learning_rate": 0.0005036748251748251, + "loss": 3.3577, + "step": 27650 + }, + { + "epoch": 8.066398741918574, + "grad_norm": 0.3590794801712036, + "learning_rate": 0.0005034999999999999, + "loss": 3.3581, + "step": 27700 + }, + { + "epoch": 8.080959869532297, + "grad_norm": 0.34557044506073, + "learning_rate": 0.0005033251748251747, + "loss": 3.3431, + "step": 27750 + }, + { + "epoch": 8.09552099714602, + "grad_norm": 0.3259578049182892, + "learning_rate": 0.0005031503496503496, + "loss": 3.3671, + "step": 27800 + }, + { + "epoch": 8.11008212475974, + "grad_norm": 0.33602702617645264, + "learning_rate": 0.0005029755244755244, + "loss": 3.3677, + "step": 27850 + }, + { + "epoch": 8.124643252373463, + "grad_norm": 0.3089875280857086, + "learning_rate": 0.0005028006993006992, + "loss": 3.3709, + "step": 27900 + }, + { + "epoch": 8.139204379987186, + "grad_norm": 0.32774367928504944, + "learning_rate": 0.000502625874125874, + "loss": 3.3677, + "step": 27950 + }, + { + "epoch": 8.153765507600909, + "grad_norm": 0.34521356225013733, + "learning_rate": 0.000502451048951049, + "loss": 3.3812, + "step": 28000 + }, + { + "epoch": 8.153765507600909, + "eval_accuracy": 0.365528353978238, + "eval_loss": 3.5927321910858154, + "eval_runtime": 179.4954, + "eval_samples_per_second": 92.71, + "eval_steps_per_second": 5.8, + "step": 28000 + }, + { + "epoch": 8.168326635214632, + "grad_norm": 0.3378417491912842, + "learning_rate": 0.0005022762237762237, + "loss": 3.3911, + "step": 28050 + }, + { + "epoch": 8.182887762828354, + "grad_norm": 0.3383776843547821, + "learning_rate": 0.0005021013986013985, + "loss": 3.3869, + "step": 28100 + }, + { + "epoch": 8.197448890442075, + "grad_norm": 0.36928725242614746, + "learning_rate": 0.0005019265734265733, + "loss": 3.3919, + "step": 28150 + }, + { + "epoch": 8.212010018055798, + "grad_norm": 0.3346560597419739, + "learning_rate": 0.0005017517482517483, + "loss": 3.3872, + "step": 28200 + }, + { + "epoch": 8.22657114566952, + "grad_norm": 0.33889731764793396, + "learning_rate": 0.0005015769230769231, + "loss": 3.3853, + "step": 28250 + }, + { + "epoch": 8.241132273283243, + "grad_norm": 0.3344823122024536, + "learning_rate": 0.0005014020979020979, + "loss": 3.3937, + "step": 28300 + }, + { + "epoch": 8.255693400896966, + "grad_norm": 0.3183842599391937, + "learning_rate": 0.0005012272727272727, + "loss": 3.3917, + "step": 28350 + }, + { + "epoch": 8.270254528510687, + "grad_norm": 0.3541378974914551, + "learning_rate": 0.0005010524475524476, + "loss": 3.4027, + "step": 28400 + }, + { + "epoch": 8.28481565612441, + "grad_norm": 0.34245550632476807, + "learning_rate": 0.0005008776223776223, + "loss": 3.4131, + "step": 28450 + }, + { + "epoch": 8.299376783738133, + "grad_norm": 0.3223508298397064, + "learning_rate": 0.0005007027972027972, + "loss": 3.399, + "step": 28500 + }, + { + "epoch": 8.313937911351855, + "grad_norm": 0.3582371771335602, + "learning_rate": 0.000500527972027972, + "loss": 3.4116, + "step": 28550 + }, + { + "epoch": 8.328499038965578, + "grad_norm": 0.3282024562358856, + "learning_rate": 0.0005003531468531468, + "loss": 3.3914, + "step": 28600 + }, + { + "epoch": 8.3430601665793, + "grad_norm": 0.32470643520355225, + "learning_rate": 0.0005001783216783217, + "loss": 3.4074, + "step": 28650 + }, + { + "epoch": 8.357621294193022, + "grad_norm": 0.32355979084968567, + "learning_rate": 0.0005000034965034965, + "loss": 3.4055, + "step": 28700 + }, + { + "epoch": 8.372182421806745, + "grad_norm": 0.3411882519721985, + "learning_rate": 0.0004998286713286713, + "loss": 3.4086, + "step": 28750 + }, + { + "epoch": 8.386743549420467, + "grad_norm": 0.3322669267654419, + "learning_rate": 0.0004996538461538461, + "loss": 3.4094, + "step": 28800 + }, + { + "epoch": 8.40130467703419, + "grad_norm": 0.32565972208976746, + "learning_rate": 0.000499479020979021, + "loss": 3.4171, + "step": 28850 + }, + { + "epoch": 8.415865804647911, + "grad_norm": 0.325556218624115, + "learning_rate": 0.0004993041958041958, + "loss": 3.4111, + "step": 28900 + }, + { + "epoch": 8.430426932261634, + "grad_norm": 0.34173521399497986, + "learning_rate": 0.0004991293706293706, + "loss": 3.4118, + "step": 28950 + }, + { + "epoch": 8.444988059875357, + "grad_norm": 0.32827889919281006, + "learning_rate": 0.0004989545454545454, + "loss": 3.4174, + "step": 29000 + }, + { + "epoch": 8.444988059875357, + "eval_accuracy": 0.3661121101055312, + "eval_loss": 3.5833680629730225, + "eval_runtime": 179.5213, + "eval_samples_per_second": 92.697, + "eval_steps_per_second": 5.799, + "step": 29000 + }, + { + "epoch": 8.45954918748908, + "grad_norm": 0.3157867193222046, + "learning_rate": 0.0004987797202797203, + "loss": 3.4037, + "step": 29050 + }, + { + "epoch": 8.474110315102802, + "grad_norm": 0.3219262659549713, + "learning_rate": 0.0004986048951048951, + "loss": 3.4059, + "step": 29100 + }, + { + "epoch": 8.488671442716523, + "grad_norm": 0.3398604094982147, + "learning_rate": 0.0004984300699300699, + "loss": 3.4149, + "step": 29150 + }, + { + "epoch": 8.503232570330246, + "grad_norm": 0.31037428975105286, + "learning_rate": 0.0004982552447552448, + "loss": 3.4154, + "step": 29200 + }, + { + "epoch": 8.517793697943969, + "grad_norm": 0.3249618113040924, + "learning_rate": 0.0004980804195804195, + "loss": 3.4058, + "step": 29250 + }, + { + "epoch": 8.532354825557691, + "grad_norm": 0.3502727746963501, + "learning_rate": 0.0004979055944055944, + "loss": 3.423, + "step": 29300 + }, + { + "epoch": 8.546915953171414, + "grad_norm": 0.3321726322174072, + "learning_rate": 0.0004977307692307692, + "loss": 3.4139, + "step": 29350 + }, + { + "epoch": 8.561477080785137, + "grad_norm": 0.32756468653678894, + "learning_rate": 0.000497555944055944, + "loss": 3.4144, + "step": 29400 + }, + { + "epoch": 8.576038208398858, + "grad_norm": 0.3343152105808258, + "learning_rate": 0.0004973811188811188, + "loss": 3.4128, + "step": 29450 + }, + { + "epoch": 8.59059933601258, + "grad_norm": 0.3429624140262604, + "learning_rate": 0.0004972062937062937, + "loss": 3.4153, + "step": 29500 + }, + { + "epoch": 8.605160463626303, + "grad_norm": 0.3241840898990631, + "learning_rate": 0.0004970314685314685, + "loss": 3.4261, + "step": 29550 + }, + { + "epoch": 8.619721591240026, + "grad_norm": 0.36211371421813965, + "learning_rate": 0.0004968566433566433, + "loss": 3.4077, + "step": 29600 + }, + { + "epoch": 8.634282718853749, + "grad_norm": 0.35850051045417786, + "learning_rate": 0.0004966818181818181, + "loss": 3.4132, + "step": 29650 + }, + { + "epoch": 8.64884384646747, + "grad_norm": 0.32470637559890747, + "learning_rate": 0.000496506993006993, + "loss": 3.4301, + "step": 29700 + }, + { + "epoch": 8.663404974081192, + "grad_norm": 0.3128025531768799, + "learning_rate": 0.0004963321678321678, + "loss": 3.4233, + "step": 29750 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 0.3735058605670929, + "learning_rate": 0.0004961573426573426, + "loss": 3.4253, + "step": 29800 + }, + { + "epoch": 8.692527229308638, + "grad_norm": 0.3122337758541107, + "learning_rate": 0.0004959825174825175, + "loss": 3.4234, + "step": 29850 + }, + { + "epoch": 8.70708835692236, + "grad_norm": 0.3589786887168884, + "learning_rate": 0.0004958076923076923, + "loss": 3.4175, + "step": 29900 + }, + { + "epoch": 8.721649484536082, + "grad_norm": 0.33352115750312805, + "learning_rate": 0.0004956328671328671, + "loss": 3.4166, + "step": 29950 + }, + { + "epoch": 8.736210612149804, + "grad_norm": 0.3245551288127899, + "learning_rate": 0.0004954580419580419, + "loss": 3.4296, + "step": 30000 + }, + { + "epoch": 8.736210612149804, + "eval_accuracy": 0.367026904407347, + "eval_loss": 3.5770156383514404, + "eval_runtime": 179.9405, + "eval_samples_per_second": 92.481, + "eval_steps_per_second": 5.785, + "step": 30000 + }, + { + "epoch": 8.750771739763527, + "grad_norm": 0.3421856164932251, + "learning_rate": 0.0004952832167832167, + "loss": 3.4276, + "step": 30050 + }, + { + "epoch": 8.76533286737725, + "grad_norm": 0.3369308114051819, + "learning_rate": 0.0004951083916083915, + "loss": 3.4324, + "step": 30100 + }, + { + "epoch": 8.779893994990973, + "grad_norm": 0.3228960633277893, + "learning_rate": 0.0004949335664335664, + "loss": 3.4244, + "step": 30150 + }, + { + "epoch": 8.794455122604695, + "grad_norm": 0.3328655958175659, + "learning_rate": 0.0004947587412587412, + "loss": 3.4292, + "step": 30200 + }, + { + "epoch": 8.809016250218416, + "grad_norm": 0.33146989345550537, + "learning_rate": 0.000494583916083916, + "loss": 3.4347, + "step": 30250 + }, + { + "epoch": 8.82357737783214, + "grad_norm": 0.3366376459598541, + "learning_rate": 0.0004944090909090908, + "loss": 3.4403, + "step": 30300 + }, + { + "epoch": 8.838138505445862, + "grad_norm": 0.3395143151283264, + "learning_rate": 0.0004942342657342657, + "loss": 3.4291, + "step": 30350 + }, + { + "epoch": 8.852699633059585, + "grad_norm": 0.3265385627746582, + "learning_rate": 0.0004940594405594405, + "loss": 3.4229, + "step": 30400 + }, + { + "epoch": 8.867260760673307, + "grad_norm": 0.3156973123550415, + "learning_rate": 0.0004938846153846153, + "loss": 3.4274, + "step": 30450 + }, + { + "epoch": 8.881821888287028, + "grad_norm": 0.3365817964076996, + "learning_rate": 0.0004937097902097901, + "loss": 3.4404, + "step": 30500 + }, + { + "epoch": 8.896383015900751, + "grad_norm": 0.33978214859962463, + "learning_rate": 0.000493534965034965, + "loss": 3.4348, + "step": 30550 + }, + { + "epoch": 8.910944143514474, + "grad_norm": 0.3471260666847229, + "learning_rate": 0.0004933601398601398, + "loss": 3.4458, + "step": 30600 + }, + { + "epoch": 8.925505271128197, + "grad_norm": 0.33476272225379944, + "learning_rate": 0.0004931853146853146, + "loss": 3.4331, + "step": 30650 + }, + { + "epoch": 8.94006639874192, + "grad_norm": 0.31054040789604187, + "learning_rate": 0.0004930104895104895, + "loss": 3.4318, + "step": 30700 + }, + { + "epoch": 8.95462752635564, + "grad_norm": 0.3388405740261078, + "learning_rate": 0.0004928356643356642, + "loss": 3.4365, + "step": 30750 + }, + { + "epoch": 8.969188653969363, + "grad_norm": 0.3688811659812927, + "learning_rate": 0.0004926608391608391, + "loss": 3.4345, + "step": 30800 + }, + { + "epoch": 8.983749781583086, + "grad_norm": 0.32696250081062317, + "learning_rate": 0.0004924860139860139, + "loss": 3.4382, + "step": 30850 + }, + { + "epoch": 8.998310909196809, + "grad_norm": 0.36053240299224854, + "learning_rate": 0.0004923111888111887, + "loss": 3.435, + "step": 30900 + }, + { + "epoch": 9.012813792300076, + "grad_norm": 0.3175714612007141, + "learning_rate": 0.0004921363636363635, + "loss": 3.3325, + "step": 30950 + }, + { + "epoch": 9.027374919913798, + "grad_norm": 0.3416489362716675, + "learning_rate": 0.0004919615384615384, + "loss": 3.3249, + "step": 31000 + }, + { + "epoch": 9.027374919913798, + "eval_accuracy": 0.36684850834668953, + "eval_loss": 3.580686330795288, + "eval_runtime": 179.5407, + "eval_samples_per_second": 92.686, + "eval_steps_per_second": 5.798, + "step": 31000 + }, + { + "epoch": 9.041936047527521, + "grad_norm": 0.3503015339374542, + "learning_rate": 0.0004917867132867132, + "loss": 3.3338, + "step": 31050 + }, + { + "epoch": 9.056497175141242, + "grad_norm": 0.3463656008243561, + "learning_rate": 0.000491611888111888, + "loss": 3.3427, + "step": 31100 + }, + { + "epoch": 9.071058302754965, + "grad_norm": 0.3392924964427948, + "learning_rate": 0.0004914370629370628, + "loss": 3.3455, + "step": 31150 + }, + { + "epoch": 9.085619430368688, + "grad_norm": 0.3339923620223999, + "learning_rate": 0.0004912622377622378, + "loss": 3.3401, + "step": 31200 + }, + { + "epoch": 9.10018055798241, + "grad_norm": 0.33536580204963684, + "learning_rate": 0.0004910874125874126, + "loss": 3.3443, + "step": 31250 + }, + { + "epoch": 9.114741685596133, + "grad_norm": 0.325057715177536, + "learning_rate": 0.0004909125874125874, + "loss": 3.3497, + "step": 31300 + }, + { + "epoch": 9.129302813209854, + "grad_norm": 0.3440364897251129, + "learning_rate": 0.0004907377622377623, + "loss": 3.3465, + "step": 31350 + }, + { + "epoch": 9.143863940823577, + "grad_norm": 0.34977987408638, + "learning_rate": 0.0004905629370629371, + "loss": 3.3461, + "step": 31400 + }, + { + "epoch": 9.1584250684373, + "grad_norm": 0.34154027700424194, + "learning_rate": 0.0004903881118881119, + "loss": 3.3571, + "step": 31450 + }, + { + "epoch": 9.172986196051022, + "grad_norm": 0.35453176498413086, + "learning_rate": 0.0004902132867132867, + "loss": 3.3583, + "step": 31500 + }, + { + "epoch": 9.187547323664745, + "grad_norm": 0.34433984756469727, + "learning_rate": 0.0004900384615384615, + "loss": 3.3614, + "step": 31550 + }, + { + "epoch": 9.202108451278466, + "grad_norm": 0.34944620728492737, + "learning_rate": 0.0004898636363636363, + "loss": 3.3564, + "step": 31600 + }, + { + "epoch": 9.216669578892189, + "grad_norm": 0.35863709449768066, + "learning_rate": 0.0004896888111888112, + "loss": 3.3646, + "step": 31650 + }, + { + "epoch": 9.231230706505912, + "grad_norm": 0.34784045815467834, + "learning_rate": 0.000489513986013986, + "loss": 3.3667, + "step": 31700 + }, + { + "epoch": 9.245791834119634, + "grad_norm": 0.3355439603328705, + "learning_rate": 0.0004893391608391608, + "loss": 3.3814, + "step": 31750 + }, + { + "epoch": 9.260352961733357, + "grad_norm": 0.3343488872051239, + "learning_rate": 0.0004891643356643356, + "loss": 3.366, + "step": 31800 + }, + { + "epoch": 9.27491408934708, + "grad_norm": 0.3547097444534302, + "learning_rate": 0.0004889895104895105, + "loss": 3.378, + "step": 31850 + }, + { + "epoch": 9.2894752169608, + "grad_norm": 0.33162373304367065, + "learning_rate": 0.0004888146853146853, + "loss": 3.3649, + "step": 31900 + }, + { + "epoch": 9.304036344574524, + "grad_norm": 0.3784696161746979, + "learning_rate": 0.0004886398601398601, + "loss": 3.3739, + "step": 31950 + }, + { + "epoch": 9.318597472188246, + "grad_norm": 0.3406825363636017, + "learning_rate": 0.000488465034965035, + "loss": 3.377, + "step": 32000 + }, + { + "epoch": 9.318597472188246, + "eval_accuracy": 0.3671976566025182, + "eval_loss": 3.579307794570923, + "eval_runtime": 179.5993, + "eval_samples_per_second": 92.656, + "eval_steps_per_second": 5.796, + "step": 32000 + }, + { + "epoch": 9.333158599801969, + "grad_norm": 0.34786367416381836, + "learning_rate": 0.0004882902097902098, + "loss": 3.3731, + "step": 32050 + }, + { + "epoch": 9.347719727415692, + "grad_norm": 0.33358603715896606, + "learning_rate": 0.0004881153846153846, + "loss": 3.3715, + "step": 32100 + }, + { + "epoch": 9.362280855029413, + "grad_norm": 0.34001341462135315, + "learning_rate": 0.0004879405594405594, + "loss": 3.3815, + "step": 32150 + }, + { + "epoch": 9.376841982643136, + "grad_norm": 0.343924880027771, + "learning_rate": 0.00048776573426573424, + "loss": 3.379, + "step": 32200 + }, + { + "epoch": 9.391403110256858, + "grad_norm": 0.3322538137435913, + "learning_rate": 0.00048759090909090904, + "loss": 3.3914, + "step": 32250 + }, + { + "epoch": 9.405964237870581, + "grad_norm": 0.3515528440475464, + "learning_rate": 0.0004874160839160839, + "loss": 3.3813, + "step": 32300 + }, + { + "epoch": 9.420525365484304, + "grad_norm": 0.34504127502441406, + "learning_rate": 0.0004872412587412587, + "loss": 3.3871, + "step": 32350 + }, + { + "epoch": 9.435086493098025, + "grad_norm": 0.3422424793243408, + "learning_rate": 0.00048706643356643354, + "loss": 3.3952, + "step": 32400 + }, + { + "epoch": 9.449647620711747, + "grad_norm": 0.3778160512447357, + "learning_rate": 0.00048689160839160834, + "loss": 3.3871, + "step": 32450 + }, + { + "epoch": 9.46420874832547, + "grad_norm": 0.34064722061157227, + "learning_rate": 0.0004867167832167832, + "loss": 3.3851, + "step": 32500 + }, + { + "epoch": 9.478769875939193, + "grad_norm": 0.31927308440208435, + "learning_rate": 0.00048654195804195794, + "loss": 3.3986, + "step": 32550 + }, + { + "epoch": 9.493331003552916, + "grad_norm": 0.3233279883861542, + "learning_rate": 0.00048636713286713285, + "loss": 3.3917, + "step": 32600 + }, + { + "epoch": 9.507892131166638, + "grad_norm": 0.34362176060676575, + "learning_rate": 0.0004861923076923077, + "loss": 3.3818, + "step": 32650 + }, + { + "epoch": 9.52245325878036, + "grad_norm": 0.32272911071777344, + "learning_rate": 0.00048601748251748245, + "loss": 3.3855, + "step": 32700 + }, + { + "epoch": 9.537014386394082, + "grad_norm": 0.3509780466556549, + "learning_rate": 0.0004858426573426573, + "loss": 3.3869, + "step": 32750 + }, + { + "epoch": 9.551575514007805, + "grad_norm": 0.3370649814605713, + "learning_rate": 0.0004856678321678321, + "loss": 3.3947, + "step": 32800 + }, + { + "epoch": 9.566136641621528, + "grad_norm": 0.33514314889907837, + "learning_rate": 0.00048549300699300696, + "loss": 3.3867, + "step": 32850 + }, + { + "epoch": 9.58069776923525, + "grad_norm": 0.3138607144355774, + "learning_rate": 0.00048531818181818176, + "loss": 3.3909, + "step": 32900 + }, + { + "epoch": 9.595258896848971, + "grad_norm": 0.34165963530540466, + "learning_rate": 0.0004851433566433566, + "loss": 3.3951, + "step": 32950 + }, + { + "epoch": 9.609820024462694, + "grad_norm": 0.351672500371933, + "learning_rate": 0.0004849685314685314, + "loss": 3.3885, + "step": 33000 + }, + { + "epoch": 9.609820024462694, + "eval_accuracy": 0.3680022616434005, + "eval_loss": 3.5697364807128906, + "eval_runtime": 179.5621, + "eval_samples_per_second": 92.675, + "eval_steps_per_second": 5.797, + "step": 33000 + }, + { + "epoch": 9.624381152076417, + "grad_norm": 0.3302818238735199, + "learning_rate": 0.00048479370629370627, + "loss": 3.3928, + "step": 33050 + }, + { + "epoch": 9.63894227969014, + "grad_norm": 0.3303544521331787, + "learning_rate": 0.00048461888111888106, + "loss": 3.3966, + "step": 33100 + }, + { + "epoch": 9.653503407303862, + "grad_norm": 0.37402021884918213, + "learning_rate": 0.0004844440559440559, + "loss": 3.4061, + "step": 33150 + }, + { + "epoch": 9.668064534917583, + "grad_norm": 0.3550156354904175, + "learning_rate": 0.0004842692307692307, + "loss": 3.3945, + "step": 33200 + }, + { + "epoch": 9.682625662531306, + "grad_norm": 0.3375867009162903, + "learning_rate": 0.00048409440559440557, + "loss": 3.4039, + "step": 33250 + }, + { + "epoch": 9.697186790145029, + "grad_norm": 0.3243738114833832, + "learning_rate": 0.0004839195804195803, + "loss": 3.3875, + "step": 33300 + }, + { + "epoch": 9.711747917758752, + "grad_norm": 0.35806065797805786, + "learning_rate": 0.0004837447552447552, + "loss": 3.3992, + "step": 33350 + }, + { + "epoch": 9.726309045372474, + "grad_norm": 0.32793280482292175, + "learning_rate": 0.0004835699300699301, + "loss": 3.3877, + "step": 33400 + }, + { + "epoch": 9.740870172986195, + "grad_norm": 0.33390912413597107, + "learning_rate": 0.0004833951048951048, + "loss": 3.4176, + "step": 33450 + }, + { + "epoch": 9.755431300599918, + "grad_norm": 0.3261817991733551, + "learning_rate": 0.0004832202797202797, + "loss": 3.397, + "step": 33500 + }, + { + "epoch": 9.76999242821364, + "grad_norm": 0.3434211015701294, + "learning_rate": 0.0004830454545454545, + "loss": 3.4082, + "step": 33550 + }, + { + "epoch": 9.784553555827364, + "grad_norm": 0.34745633602142334, + "learning_rate": 0.00048287062937062933, + "loss": 3.4018, + "step": 33600 + }, + { + "epoch": 9.799114683441086, + "grad_norm": 0.3618619441986084, + "learning_rate": 0.00048269580419580413, + "loss": 3.3959, + "step": 33650 + }, + { + "epoch": 9.813675811054807, + "grad_norm": 0.35037025809288025, + "learning_rate": 0.000482520979020979, + "loss": 3.3963, + "step": 33700 + }, + { + "epoch": 9.82823693866853, + "grad_norm": 0.35323193669319153, + "learning_rate": 0.0004823461538461538, + "loss": 3.4175, + "step": 33750 + }, + { + "epoch": 9.842798066282253, + "grad_norm": 0.32245779037475586, + "learning_rate": 0.00048217132867132864, + "loss": 3.3962, + "step": 33800 + }, + { + "epoch": 9.857359193895975, + "grad_norm": 0.3340567648410797, + "learning_rate": 0.00048199650349650344, + "loss": 3.4052, + "step": 33850 + }, + { + "epoch": 9.871920321509698, + "grad_norm": 0.35121092200279236, + "learning_rate": 0.0004818216783216783, + "loss": 3.4022, + "step": 33900 + }, + { + "epoch": 9.88648144912342, + "grad_norm": 0.3410423994064331, + "learning_rate": 0.0004816468531468531, + "loss": 3.4115, + "step": 33950 + }, + { + "epoch": 9.901042576737142, + "grad_norm": 0.32788190245628357, + "learning_rate": 0.00048147202797202795, + "loss": 3.4055, + "step": 34000 + }, + { + "epoch": 9.901042576737142, + "eval_accuracy": 0.3681349121090707, + "eval_loss": 3.5651626586914062, + "eval_runtime": 179.6407, + "eval_samples_per_second": 92.635, + "eval_steps_per_second": 5.795, + "step": 34000 + }, + { + "epoch": 9.915603704350865, + "grad_norm": 0.3360147774219513, + "learning_rate": 0.0004812972027972028, + "loss": 3.421, + "step": 34050 + }, + { + "epoch": 9.930164831964587, + "grad_norm": 0.3194892108440399, + "learning_rate": 0.0004811223776223776, + "loss": 3.4199, + "step": 34100 + }, + { + "epoch": 9.94472595957831, + "grad_norm": 0.3178540766239166, + "learning_rate": 0.00048094755244755245, + "loss": 3.414, + "step": 34150 + }, + { + "epoch": 9.959287087192033, + "grad_norm": 0.3059634268283844, + "learning_rate": 0.0004807727272727272, + "loss": 3.4017, + "step": 34200 + }, + { + "epoch": 9.973848214805754, + "grad_norm": 0.33229926228523254, + "learning_rate": 0.00048059790209790205, + "loss": 3.4211, + "step": 34250 + }, + { + "epoch": 9.988409342419477, + "grad_norm": 0.3151392638683319, + "learning_rate": 0.00048042307692307685, + "loss": 3.4128, + "step": 34300 + }, + { + "epoch": 10.002912225522744, + "grad_norm": 0.35194942355155945, + "learning_rate": 0.0004802482517482517, + "loss": 3.3823, + "step": 34350 + }, + { + "epoch": 10.017473353136467, + "grad_norm": 0.34302377700805664, + "learning_rate": 0.0004800734265734265, + "loss": 3.3022, + "step": 34400 + }, + { + "epoch": 10.03203448075019, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.00047989860139860136, + "loss": 3.3014, + "step": 34450 + }, + { + "epoch": 10.046595608363912, + "grad_norm": 0.3585539758205414, + "learning_rate": 0.00047972377622377616, + "loss": 3.2966, + "step": 34500 + }, + { + "epoch": 10.061156735977635, + "grad_norm": 0.3557938039302826, + "learning_rate": 0.000479548951048951, + "loss": 3.31, + "step": 34550 + }, + { + "epoch": 10.075717863591356, + "grad_norm": 0.38117077946662903, + "learning_rate": 0.0004793741258741258, + "loss": 3.3192, + "step": 34600 + }, + { + "epoch": 10.090278991205079, + "grad_norm": 0.3344501256942749, + "learning_rate": 0.00047919930069930067, + "loss": 3.3062, + "step": 34650 + }, + { + "epoch": 10.104840118818801, + "grad_norm": 0.3537605106830597, + "learning_rate": 0.0004790244755244755, + "loss": 3.3134, + "step": 34700 + }, + { + "epoch": 10.119401246432524, + "grad_norm": 0.35433295369148254, + "learning_rate": 0.0004788496503496503, + "loss": 3.3182, + "step": 34750 + }, + { + "epoch": 10.133962374046247, + "grad_norm": 0.3392355144023895, + "learning_rate": 0.0004786748251748252, + "loss": 3.3281, + "step": 34800 + }, + { + "epoch": 10.148523501659968, + "grad_norm": 0.34290164709091187, + "learning_rate": 0.0004785, + "loss": 3.3316, + "step": 34850 + }, + { + "epoch": 10.16308462927369, + "grad_norm": 0.35168832540512085, + "learning_rate": 0.00047832517482517483, + "loss": 3.3223, + "step": 34900 + }, + { + "epoch": 10.177645756887413, + "grad_norm": 0.32866406440734863, + "learning_rate": 0.0004781503496503496, + "loss": 3.3278, + "step": 34950 + }, + { + "epoch": 10.192206884501136, + "grad_norm": 0.3799251914024353, + "learning_rate": 0.00047797552447552443, + "loss": 3.3423, + "step": 35000 + }, + { + "epoch": 10.192206884501136, + "eval_accuracy": 0.36767463380886406, + "eval_loss": 3.5737509727478027, + "eval_runtime": 179.5577, + "eval_samples_per_second": 92.678, + "eval_steps_per_second": 5.798, + "step": 35000 + }, + { + "epoch": 10.206768012114859, + "grad_norm": 0.35849910974502563, + "learning_rate": 0.00047780069930069923, + "loss": 3.3372, + "step": 35050 + }, + { + "epoch": 10.221329139728581, + "grad_norm": 0.33695927262306213, + "learning_rate": 0.0004776258741258741, + "loss": 3.3448, + "step": 35100 + }, + { + "epoch": 10.235890267342302, + "grad_norm": 0.32416245341300964, + "learning_rate": 0.0004774510489510489, + "loss": 3.3477, + "step": 35150 + }, + { + "epoch": 10.250451394956025, + "grad_norm": 0.3458312749862671, + "learning_rate": 0.00047727622377622374, + "loss": 3.3455, + "step": 35200 + }, + { + "epoch": 10.265012522569748, + "grad_norm": 0.36680135130882263, + "learning_rate": 0.00047710139860139854, + "loss": 3.3607, + "step": 35250 + }, + { + "epoch": 10.27957365018347, + "grad_norm": 0.34393811225891113, + "learning_rate": 0.0004769265734265734, + "loss": 3.3559, + "step": 35300 + }, + { + "epoch": 10.294134777797193, + "grad_norm": 0.3232147991657257, + "learning_rate": 0.0004767517482517482, + "loss": 3.3609, + "step": 35350 + }, + { + "epoch": 10.308695905410914, + "grad_norm": 0.36371856927871704, + "learning_rate": 0.00047657692307692304, + "loss": 3.3565, + "step": 35400 + }, + { + "epoch": 10.323257033024637, + "grad_norm": 0.32138916850090027, + "learning_rate": 0.0004764020979020979, + "loss": 3.3509, + "step": 35450 + }, + { + "epoch": 10.33781816063836, + "grad_norm": 0.36574694514274597, + "learning_rate": 0.0004762272727272727, + "loss": 3.3486, + "step": 35500 + }, + { + "epoch": 10.352379288252083, + "grad_norm": 0.36567094922065735, + "learning_rate": 0.00047605244755244755, + "loss": 3.3576, + "step": 35550 + }, + { + "epoch": 10.366940415865805, + "grad_norm": 0.3548871576786041, + "learning_rate": 0.00047587762237762235, + "loss": 3.3614, + "step": 35600 + }, + { + "epoch": 10.381501543479526, + "grad_norm": 0.3599976599216461, + "learning_rate": 0.0004757027972027972, + "loss": 3.3603, + "step": 35650 + }, + { + "epoch": 10.396062671093249, + "grad_norm": 0.34999898076057434, + "learning_rate": 0.00047552797202797195, + "loss": 3.3498, + "step": 35700 + }, + { + "epoch": 10.410623798706972, + "grad_norm": 0.3362889289855957, + "learning_rate": 0.0004753531468531468, + "loss": 3.3549, + "step": 35750 + }, + { + "epoch": 10.425184926320695, + "grad_norm": 0.33971652388572693, + "learning_rate": 0.0004751783216783216, + "loss": 3.3558, + "step": 35800 + }, + { + "epoch": 10.439746053934417, + "grad_norm": 0.32522860169410706, + "learning_rate": 0.00047500349650349646, + "loss": 3.3734, + "step": 35850 + }, + { + "epoch": 10.454307181548138, + "grad_norm": 0.33393293619155884, + "learning_rate": 0.00047482867132867126, + "loss": 3.3728, + "step": 35900 + }, + { + "epoch": 10.468868309161861, + "grad_norm": 0.36121344566345215, + "learning_rate": 0.0004746538461538461, + "loss": 3.3704, + "step": 35950 + }, + { + "epoch": 10.483429436775584, + "grad_norm": 0.33934155106544495, + "learning_rate": 0.0004744790209790209, + "loss": 3.36, + "step": 36000 + }, + { + "epoch": 10.483429436775584, + "eval_accuracy": 0.36847312375735736, + "eval_loss": 3.567448616027832, + "eval_runtime": 189.5881, + "eval_samples_per_second": 87.774, + "eval_steps_per_second": 5.491, + "step": 36000 + }, + { + "epoch": 10.497990564389307, + "grad_norm": 0.33829599618911743, + "learning_rate": 0.00047430419580419576, + "loss": 3.3594, + "step": 36050 + }, + { + "epoch": 10.51255169200303, + "grad_norm": 0.36573532223701477, + "learning_rate": 0.0004741293706293706, + "loss": 3.3697, + "step": 36100 + }, + { + "epoch": 10.52711281961675, + "grad_norm": 0.3282630145549774, + "learning_rate": 0.0004739545454545454, + "loss": 3.3601, + "step": 36150 + }, + { + "epoch": 10.541673947230473, + "grad_norm": 0.36783677339553833, + "learning_rate": 0.00047377972027972027, + "loss": 3.3587, + "step": 36200 + }, + { + "epoch": 10.556235074844196, + "grad_norm": 0.3381112217903137, + "learning_rate": 0.00047360489510489507, + "loss": 3.3777, + "step": 36250 + }, + { + "epoch": 10.570796202457919, + "grad_norm": 0.3379128575325012, + "learning_rate": 0.0004734300699300699, + "loss": 3.3733, + "step": 36300 + }, + { + "epoch": 10.585357330071641, + "grad_norm": 0.32401373982429504, + "learning_rate": 0.0004732552447552447, + "loss": 3.3722, + "step": 36350 + }, + { + "epoch": 10.599918457685362, + "grad_norm": 0.3448126018047333, + "learning_rate": 0.0004730804195804196, + "loss": 3.3762, + "step": 36400 + }, + { + "epoch": 10.614479585299085, + "grad_norm": 0.36033710837364197, + "learning_rate": 0.0004729055944055943, + "loss": 3.3721, + "step": 36450 + }, + { + "epoch": 10.629040712912808, + "grad_norm": 0.32210254669189453, + "learning_rate": 0.0004727307692307692, + "loss": 3.3704, + "step": 36500 + }, + { + "epoch": 10.64360184052653, + "grad_norm": 0.3496635854244232, + "learning_rate": 0.000472555944055944, + "loss": 3.3641, + "step": 36550 + }, + { + "epoch": 10.658162968140253, + "grad_norm": 0.3081586956977844, + "learning_rate": 0.00047238111888111883, + "loss": 3.3772, + "step": 36600 + }, + { + "epoch": 10.672724095753976, + "grad_norm": 0.3534667491912842, + "learning_rate": 0.00047220629370629363, + "loss": 3.3856, + "step": 36650 + }, + { + "epoch": 10.687285223367697, + "grad_norm": 0.34923726320266724, + "learning_rate": 0.0004720314685314685, + "loss": 3.378, + "step": 36700 + }, + { + "epoch": 10.70184635098142, + "grad_norm": 0.35523879528045654, + "learning_rate": 0.0004718566433566433, + "loss": 3.3725, + "step": 36750 + }, + { + "epoch": 10.716407478595142, + "grad_norm": 0.3368263840675354, + "learning_rate": 0.00047168181818181814, + "loss": 3.3739, + "step": 36800 + }, + { + "epoch": 10.730968606208865, + "grad_norm": 0.32694390416145325, + "learning_rate": 0.000471506993006993, + "loss": 3.3751, + "step": 36850 + }, + { + "epoch": 10.745529733822588, + "grad_norm": 0.32708966732025146, + "learning_rate": 0.0004713321678321678, + "loss": 3.3844, + "step": 36900 + }, + { + "epoch": 10.760090861436309, + "grad_norm": 0.30745530128479004, + "learning_rate": 0.00047115734265734265, + "loss": 3.3898, + "step": 36950 + }, + { + "epoch": 10.774651989050032, + "grad_norm": 0.3373164236545563, + "learning_rate": 0.00047098251748251745, + "loss": 3.3876, + "step": 37000 + }, + { + "epoch": 10.774651989050032, + "eval_accuracy": 0.36911803080854105, + "eval_loss": 3.5577805042266846, + "eval_runtime": 179.2277, + "eval_samples_per_second": 92.848, + "eval_steps_per_second": 5.808, + "step": 37000 + }, + { + "epoch": 10.789213116663754, + "grad_norm": 0.3574223816394806, + "learning_rate": 0.0004708076923076923, + "loss": 3.3808, + "step": 37050 + }, + { + "epoch": 10.803774244277477, + "grad_norm": 0.3373048007488251, + "learning_rate": 0.0004706328671328671, + "loss": 3.3903, + "step": 37100 + }, + { + "epoch": 10.8183353718912, + "grad_norm": 0.34644612669944763, + "learning_rate": 0.00047045804195804195, + "loss": 3.3885, + "step": 37150 + }, + { + "epoch": 10.83289649950492, + "grad_norm": 0.34422236680984497, + "learning_rate": 0.0004702832167832167, + "loss": 3.3818, + "step": 37200 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 0.3302682638168335, + "learning_rate": 0.00047010839160839155, + "loss": 3.3797, + "step": 37250 + }, + { + "epoch": 10.862018754732366, + "grad_norm": 0.3459398150444031, + "learning_rate": 0.00046993356643356635, + "loss": 3.3998, + "step": 37300 + }, + { + "epoch": 10.876579882346089, + "grad_norm": 0.3879469037055969, + "learning_rate": 0.0004697587412587412, + "loss": 3.3815, + "step": 37350 + }, + { + "epoch": 10.891141009959812, + "grad_norm": 0.33385929465293884, + "learning_rate": 0.000469583916083916, + "loss": 3.379, + "step": 37400 + }, + { + "epoch": 10.905702137573535, + "grad_norm": 0.3342723250389099, + "learning_rate": 0.00046940909090909086, + "loss": 3.3929, + "step": 37450 + }, + { + "epoch": 10.920263265187256, + "grad_norm": 0.35342901945114136, + "learning_rate": 0.0004692342657342657, + "loss": 3.3927, + "step": 37500 + }, + { + "epoch": 10.934824392800978, + "grad_norm": 0.33476606011390686, + "learning_rate": 0.0004690594405594405, + "loss": 3.3997, + "step": 37550 + }, + { + "epoch": 10.949385520414701, + "grad_norm": 0.34175390005111694, + "learning_rate": 0.00046888461538461537, + "loss": 3.3931, + "step": 37600 + }, + { + "epoch": 10.963946648028424, + "grad_norm": 0.3353242576122284, + "learning_rate": 0.00046870979020979017, + "loss": 3.393, + "step": 37650 + }, + { + "epoch": 10.978507775642147, + "grad_norm": 0.34950190782546997, + "learning_rate": 0.000468534965034965, + "loss": 3.3864, + "step": 37700 + }, + { + "epoch": 10.993068903255867, + "grad_norm": 0.3388817310333252, + "learning_rate": 0.0004683601398601398, + "loss": 3.3917, + "step": 37750 + }, + { + "epoch": 11.007571786359136, + "grad_norm": 0.3355076313018799, + "learning_rate": 0.0004681853146853147, + "loss": 3.3182, + "step": 37800 + }, + { + "epoch": 11.022132913972857, + "grad_norm": 0.34891900420188904, + "learning_rate": 0.0004680104895104895, + "loss": 3.2813, + "step": 37850 + }, + { + "epoch": 11.03669404158658, + "grad_norm": 0.3277644217014313, + "learning_rate": 0.00046783566433566433, + "loss": 3.2816, + "step": 37900 + }, + { + "epoch": 11.051255169200303, + "grad_norm": 0.3547028601169586, + "learning_rate": 0.0004676608391608391, + "loss": 3.2908, + "step": 37950 + }, + { + "epoch": 11.065816296814026, + "grad_norm": 0.3213656544685364, + "learning_rate": 0.00046748601398601393, + "loss": 3.2956, + "step": 38000 + }, + { + "epoch": 11.065816296814026, + "eval_accuracy": 0.3687849934691989, + "eval_loss": 3.5677566528320312, + "eval_runtime": 179.2703, + "eval_samples_per_second": 92.826, + "eval_steps_per_second": 5.807, + "step": 38000 + }, + { + "epoch": 11.080377424427748, + "grad_norm": 0.35302484035491943, + "learning_rate": 0.00046731118881118873, + "loss": 3.3003, + "step": 38050 + }, + { + "epoch": 11.09493855204147, + "grad_norm": 0.33304354548454285, + "learning_rate": 0.0004671363636363636, + "loss": 3.2988, + "step": 38100 + }, + { + "epoch": 11.109499679655192, + "grad_norm": 0.35924726724624634, + "learning_rate": 0.00046696153846153844, + "loss": 3.3095, + "step": 38150 + }, + { + "epoch": 11.124060807268915, + "grad_norm": 0.340108722448349, + "learning_rate": 0.00046678671328671324, + "loss": 3.304, + "step": 38200 + }, + { + "epoch": 11.138621934882638, + "grad_norm": 0.3212912678718567, + "learning_rate": 0.0004666118881118881, + "loss": 3.3098, + "step": 38250 + }, + { + "epoch": 11.15318306249636, + "grad_norm": 0.34773996472358704, + "learning_rate": 0.0004664370629370629, + "loss": 3.3144, + "step": 38300 + }, + { + "epoch": 11.167744190110081, + "grad_norm": 0.3532446026802063, + "learning_rate": 0.00046626223776223774, + "loss": 3.3145, + "step": 38350 + }, + { + "epoch": 11.182305317723804, + "grad_norm": 0.3510436415672302, + "learning_rate": 0.00046608741258741254, + "loss": 3.3075, + "step": 38400 + }, + { + "epoch": 11.196866445337527, + "grad_norm": 0.3559911549091339, + "learning_rate": 0.0004659125874125874, + "loss": 3.3147, + "step": 38450 + }, + { + "epoch": 11.21142757295125, + "grad_norm": 0.35517627000808716, + "learning_rate": 0.0004657377622377622, + "loss": 3.3198, + "step": 38500 + }, + { + "epoch": 11.225988700564972, + "grad_norm": 0.34963780641555786, + "learning_rate": 0.00046556293706293705, + "loss": 3.3041, + "step": 38550 + }, + { + "epoch": 11.240549828178693, + "grad_norm": 0.37004563212394714, + "learning_rate": 0.00046538811188811185, + "loss": 3.3175, + "step": 38600 + }, + { + "epoch": 11.255110955792416, + "grad_norm": 0.3559558689594269, + "learning_rate": 0.0004652132867132867, + "loss": 3.331, + "step": 38650 + }, + { + "epoch": 11.269672083406139, + "grad_norm": 0.3516577184200287, + "learning_rate": 0.00046503846153846145, + "loss": 3.3216, + "step": 38700 + }, + { + "epoch": 11.284233211019862, + "grad_norm": 0.3305668532848358, + "learning_rate": 0.0004648636363636363, + "loss": 3.3288, + "step": 38750 + }, + { + "epoch": 11.298794338633584, + "grad_norm": 0.3463844358921051, + "learning_rate": 0.0004646888111888111, + "loss": 3.3294, + "step": 38800 + }, + { + "epoch": 11.313355466247307, + "grad_norm": 0.3470117151737213, + "learning_rate": 0.00046451398601398596, + "loss": 3.334, + "step": 38850 + }, + { + "epoch": 11.327916593861028, + "grad_norm": 0.3878748118877411, + "learning_rate": 0.0004643391608391608, + "loss": 3.3207, + "step": 38900 + }, + { + "epoch": 11.34247772147475, + "grad_norm": 0.3300091028213501, + "learning_rate": 0.0004641643356643356, + "loss": 3.3571, + "step": 38950 + }, + { + "epoch": 11.357038849088473, + "grad_norm": 0.3411715626716614, + "learning_rate": 0.00046398951048951046, + "loss": 3.3316, + "step": 39000 + }, + { + "epoch": 11.357038849088473, + "eval_accuracy": 0.36932288640357425, + "eval_loss": 3.56606125831604, + "eval_runtime": 179.4464, + "eval_samples_per_second": 92.735, + "eval_steps_per_second": 5.801, + "step": 39000 + }, + { + "epoch": 11.371599976702196, + "grad_norm": 0.34825968742370605, + "learning_rate": 0.00046381468531468526, + "loss": 3.3289, + "step": 39050 + }, + { + "epoch": 11.386161104315919, + "grad_norm": 0.3493784964084625, + "learning_rate": 0.0004636398601398601, + "loss": 3.3427, + "step": 39100 + }, + { + "epoch": 11.40072223192964, + "grad_norm": 0.38270437717437744, + "learning_rate": 0.0004634650349650349, + "loss": 3.3439, + "step": 39150 + }, + { + "epoch": 11.415283359543363, + "grad_norm": 0.373262882232666, + "learning_rate": 0.00046329020979020977, + "loss": 3.3412, + "step": 39200 + }, + { + "epoch": 11.429844487157085, + "grad_norm": 0.3348482847213745, + "learning_rate": 0.00046311538461538457, + "loss": 3.3427, + "step": 39250 + }, + { + "epoch": 11.444405614770808, + "grad_norm": 0.3629266917705536, + "learning_rate": 0.0004629405594405594, + "loss": 3.3431, + "step": 39300 + }, + { + "epoch": 11.458966742384531, + "grad_norm": 0.3323516845703125, + "learning_rate": 0.0004627657342657342, + "loss": 3.3372, + "step": 39350 + }, + { + "epoch": 11.473527869998252, + "grad_norm": 0.3465496301651001, + "learning_rate": 0.0004625909090909091, + "loss": 3.3334, + "step": 39400 + }, + { + "epoch": 11.488088997611975, + "grad_norm": 0.38713112473487854, + "learning_rate": 0.0004624160839160838, + "loss": 3.3428, + "step": 39450 + }, + { + "epoch": 11.502650125225697, + "grad_norm": 0.33216097950935364, + "learning_rate": 0.0004622412587412587, + "loss": 3.3379, + "step": 39500 + }, + { + "epoch": 11.51721125283942, + "grad_norm": 0.35208389163017273, + "learning_rate": 0.00046206643356643353, + "loss": 3.3628, + "step": 39550 + }, + { + "epoch": 11.531772380453143, + "grad_norm": 0.36084526777267456, + "learning_rate": 0.00046189160839160833, + "loss": 3.3487, + "step": 39600 + }, + { + "epoch": 11.546333508066864, + "grad_norm": 0.3580910265445709, + "learning_rate": 0.0004617167832167832, + "loss": 3.3446, + "step": 39650 + }, + { + "epoch": 11.560894635680587, + "grad_norm": 0.3954737186431885, + "learning_rate": 0.000461541958041958, + "loss": 3.3511, + "step": 39700 + }, + { + "epoch": 11.57545576329431, + "grad_norm": 0.33455291390419006, + "learning_rate": 0.00046136713286713284, + "loss": 3.3654, + "step": 39750 + }, + { + "epoch": 11.590016890908032, + "grad_norm": 0.364233136177063, + "learning_rate": 0.00046119230769230764, + "loss": 3.3632, + "step": 39800 + }, + { + "epoch": 11.604578018521755, + "grad_norm": 0.36201661825180054, + "learning_rate": 0.0004610174825174825, + "loss": 3.3553, + "step": 39850 + }, + { + "epoch": 11.619139146135478, + "grad_norm": 0.3511942923069, + "learning_rate": 0.0004608426573426573, + "loss": 3.3432, + "step": 39900 + }, + { + "epoch": 11.633700273749199, + "grad_norm": 0.35589101910591125, + "learning_rate": 0.00046066783216783215, + "loss": 3.3694, + "step": 39950 + }, + { + "epoch": 11.648261401362921, + "grad_norm": 0.33582714200019836, + "learning_rate": 0.00046049300699300695, + "loss": 3.3497, + "step": 40000 + }, + { + "epoch": 11.648261401362921, + "eval_accuracy": 0.3697849286727392, + "eval_loss": 3.5575385093688965, + "eval_runtime": 179.3781, + "eval_samples_per_second": 92.771, + "eval_steps_per_second": 5.803, + "step": 40000 + }, + { + "epoch": 11.662822528976644, + "grad_norm": 0.3475906550884247, + "learning_rate": 0.0004603181818181818, + "loss": 3.3493, + "step": 40050 + }, + { + "epoch": 11.677383656590367, + "grad_norm": 0.3488917052745819, + "learning_rate": 0.0004601433566433566, + "loss": 3.3608, + "step": 40100 + }, + { + "epoch": 11.69194478420409, + "grad_norm": 0.379167765378952, + "learning_rate": 0.00045996853146853145, + "loss": 3.3625, + "step": 40150 + }, + { + "epoch": 11.70650591181781, + "grad_norm": 0.34200170636177063, + "learning_rate": 0.0004597937062937062, + "loss": 3.3571, + "step": 40200 + }, + { + "epoch": 11.721067039431533, + "grad_norm": 0.36554938554763794, + "learning_rate": 0.00045961888111888105, + "loss": 3.3614, + "step": 40250 + }, + { + "epoch": 11.735628167045256, + "grad_norm": 0.3781202435493469, + "learning_rate": 0.0004594440559440559, + "loss": 3.3741, + "step": 40300 + }, + { + "epoch": 11.750189294658979, + "grad_norm": 0.3247515559196472, + "learning_rate": 0.0004592692307692307, + "loss": 3.3653, + "step": 40350 + }, + { + "epoch": 11.764750422272702, + "grad_norm": 0.3390739858150482, + "learning_rate": 0.00045909440559440556, + "loss": 3.3551, + "step": 40400 + }, + { + "epoch": 11.779311549886422, + "grad_norm": 0.34482771158218384, + "learning_rate": 0.00045891958041958036, + "loss": 3.3688, + "step": 40450 + }, + { + "epoch": 11.793872677500145, + "grad_norm": 0.35239699482917786, + "learning_rate": 0.0004587447552447552, + "loss": 3.3604, + "step": 40500 + }, + { + "epoch": 11.808433805113868, + "grad_norm": 0.3418464958667755, + "learning_rate": 0.00045856993006993, + "loss": 3.3696, + "step": 40550 + }, + { + "epoch": 11.82299493272759, + "grad_norm": 0.3679628372192383, + "learning_rate": 0.00045839510489510487, + "loss": 3.3641, + "step": 40600 + }, + { + "epoch": 11.837556060341313, + "grad_norm": 0.33282944560050964, + "learning_rate": 0.00045822027972027967, + "loss": 3.3546, + "step": 40650 + }, + { + "epoch": 11.852117187955034, + "grad_norm": 0.36056581139564514, + "learning_rate": 0.0004580454545454545, + "loss": 3.3632, + "step": 40700 + }, + { + "epoch": 11.866678315568757, + "grad_norm": 0.3631812036037445, + "learning_rate": 0.0004578706293706293, + "loss": 3.3794, + "step": 40750 + }, + { + "epoch": 11.88123944318248, + "grad_norm": 0.33666253089904785, + "learning_rate": 0.0004576958041958042, + "loss": 3.3578, + "step": 40800 + }, + { + "epoch": 11.895800570796203, + "grad_norm": 0.3388082981109619, + "learning_rate": 0.000457520979020979, + "loss": 3.3795, + "step": 40850 + }, + { + "epoch": 11.910361698409925, + "grad_norm": 0.3440251648426056, + "learning_rate": 0.00045734615384615383, + "loss": 3.3674, + "step": 40900 + }, + { + "epoch": 11.924922826023646, + "grad_norm": 0.3502148687839508, + "learning_rate": 0.0004571713286713287, + "loss": 3.3791, + "step": 40950 + }, + { + "epoch": 11.93948395363737, + "grad_norm": 0.33843347430229187, + "learning_rate": 0.00045699650349650343, + "loss": 3.3652, + "step": 41000 + }, + { + "epoch": 11.93948395363737, + "eval_accuracy": 0.37016665155533257, + "eval_loss": 3.5524773597717285, + "eval_runtime": 179.4931, + "eval_samples_per_second": 92.711, + "eval_steps_per_second": 5.8, + "step": 41000 + }, + { + "epoch": 11.954045081251092, + "grad_norm": 0.3571944534778595, + "learning_rate": 0.0004568216783216783, + "loss": 3.378, + "step": 41050 + }, + { + "epoch": 11.968606208864815, + "grad_norm": 0.35896360874176025, + "learning_rate": 0.0004566468531468531, + "loss": 3.3745, + "step": 41100 + }, + { + "epoch": 11.983167336478537, + "grad_norm": 0.36639687418937683, + "learning_rate": 0.00045647202797202794, + "loss": 3.3723, + "step": 41150 + }, + { + "epoch": 11.99772846409226, + "grad_norm": 0.34861519932746887, + "learning_rate": 0.00045629720279720274, + "loss": 3.3665, + "step": 41200 + }, + { + "epoch": 12.012231347195527, + "grad_norm": 0.3419104218482971, + "learning_rate": 0.0004561223776223776, + "loss": 3.2789, + "step": 41250 + }, + { + "epoch": 12.02679247480925, + "grad_norm": 0.3540184795856476, + "learning_rate": 0.0004559475524475524, + "loss": 3.2586, + "step": 41300 + }, + { + "epoch": 12.041353602422971, + "grad_norm": 0.3587668836116791, + "learning_rate": 0.00045577272727272724, + "loss": 3.261, + "step": 41350 + }, + { + "epoch": 12.055914730036694, + "grad_norm": 0.33976301550865173, + "learning_rate": 0.00045559790209790204, + "loss": 3.2696, + "step": 41400 + }, + { + "epoch": 12.070475857650417, + "grad_norm": 0.3481811583042145, + "learning_rate": 0.0004554230769230769, + "loss": 3.2758, + "step": 41450 + }, + { + "epoch": 12.08503698526414, + "grad_norm": 0.36990079283714294, + "learning_rate": 0.0004552482517482517, + "loss": 3.2813, + "step": 41500 + }, + { + "epoch": 12.099598112877862, + "grad_norm": 0.33966222405433655, + "learning_rate": 0.00045507342657342655, + "loss": 3.2727, + "step": 41550 + }, + { + "epoch": 12.114159240491583, + "grad_norm": 0.3467234969139099, + "learning_rate": 0.00045489860139860135, + "loss": 3.2746, + "step": 41600 + }, + { + "epoch": 12.128720368105306, + "grad_norm": 0.35601502656936646, + "learning_rate": 0.0004547237762237762, + "loss": 3.2741, + "step": 41650 + }, + { + "epoch": 12.143281495719028, + "grad_norm": 0.3513096868991852, + "learning_rate": 0.00045454895104895106, + "loss": 3.2947, + "step": 41700 + }, + { + "epoch": 12.157842623332751, + "grad_norm": 0.3616795241832733, + "learning_rate": 0.0004543741258741258, + "loss": 3.2756, + "step": 41750 + }, + { + "epoch": 12.172403750946474, + "grad_norm": 0.36047980189323425, + "learning_rate": 0.00045419930069930066, + "loss": 3.2892, + "step": 41800 + }, + { + "epoch": 12.186964878560195, + "grad_norm": 0.3357391357421875, + "learning_rate": 0.00045402447552447546, + "loss": 3.2952, + "step": 41850 + }, + { + "epoch": 12.201526006173918, + "grad_norm": 0.3331086337566376, + "learning_rate": 0.0004538496503496503, + "loss": 3.298, + "step": 41900 + }, + { + "epoch": 12.21608713378764, + "grad_norm": 0.3594485819339752, + "learning_rate": 0.0004536748251748251, + "loss": 3.2939, + "step": 41950 + }, + { + "epoch": 12.230648261401363, + "grad_norm": 0.333735853433609, + "learning_rate": 0.00045349999999999996, + "loss": 3.2998, + "step": 42000 + }, + { + "epoch": 12.230648261401363, + "eval_accuracy": 0.36975270683976613, + "eval_loss": 3.5625216960906982, + "eval_runtime": 179.4692, + "eval_samples_per_second": 92.723, + "eval_steps_per_second": 5.8, + "step": 42000 + }, + { + "epoch": 12.245209389015086, + "grad_norm": 0.38054659962654114, + "learning_rate": 0.00045332517482517476, + "loss": 3.3154, + "step": 42050 + }, + { + "epoch": 12.259770516628807, + "grad_norm": 0.35925668478012085, + "learning_rate": 0.0004531503496503496, + "loss": 3.301, + "step": 42100 + }, + { + "epoch": 12.27433164424253, + "grad_norm": 0.3805968165397644, + "learning_rate": 0.0004529755244755244, + "loss": 3.3095, + "step": 42150 + }, + { + "epoch": 12.288892771856252, + "grad_norm": 0.3440055251121521, + "learning_rate": 0.00045280069930069927, + "loss": 3.3106, + "step": 42200 + }, + { + "epoch": 12.303453899469975, + "grad_norm": 0.3645689785480499, + "learning_rate": 0.00045262587412587407, + "loss": 3.3168, + "step": 42250 + }, + { + "epoch": 12.318015027083698, + "grad_norm": 0.3602798581123352, + "learning_rate": 0.0004524510489510489, + "loss": 3.3174, + "step": 42300 + }, + { + "epoch": 12.33257615469742, + "grad_norm": 0.36583825945854187, + "learning_rate": 0.0004522762237762238, + "loss": 3.3139, + "step": 42350 + }, + { + "epoch": 12.347137282311142, + "grad_norm": 0.34254148602485657, + "learning_rate": 0.0004521013986013986, + "loss": 3.3131, + "step": 42400 + }, + { + "epoch": 12.361698409924864, + "grad_norm": 0.3659822344779968, + "learning_rate": 0.00045192657342657343, + "loss": 3.3231, + "step": 42450 + }, + { + "epoch": 12.376259537538587, + "grad_norm": 0.3749573528766632, + "learning_rate": 0.0004517517482517482, + "loss": 3.322, + "step": 42500 + }, + { + "epoch": 12.39082066515231, + "grad_norm": 0.34579816460609436, + "learning_rate": 0.00045157692307692303, + "loss": 3.3321, + "step": 42550 + }, + { + "epoch": 12.405381792766033, + "grad_norm": 0.34676283597946167, + "learning_rate": 0.00045140209790209783, + "loss": 3.3237, + "step": 42600 + }, + { + "epoch": 12.419942920379754, + "grad_norm": 0.37196454405784607, + "learning_rate": 0.0004512272727272727, + "loss": 3.3271, + "step": 42650 + }, + { + "epoch": 12.434504047993476, + "grad_norm": 0.33962497115135193, + "learning_rate": 0.0004510524475524475, + "loss": 3.3228, + "step": 42700 + }, + { + "epoch": 12.449065175607199, + "grad_norm": 0.3634340167045593, + "learning_rate": 0.00045087762237762234, + "loss": 3.3355, + "step": 42750 + }, + { + "epoch": 12.463626303220922, + "grad_norm": 0.3375799357891083, + "learning_rate": 0.00045070279720279714, + "loss": 3.3264, + "step": 42800 + }, + { + "epoch": 12.478187430834645, + "grad_norm": 0.34822535514831543, + "learning_rate": 0.000450527972027972, + "loss": 3.3143, + "step": 42850 + }, + { + "epoch": 12.492748558448366, + "grad_norm": 0.3571722209453583, + "learning_rate": 0.0004503531468531468, + "loss": 3.3301, + "step": 42900 + }, + { + "epoch": 12.507309686062088, + "grad_norm": 0.34919336438179016, + "learning_rate": 0.00045017832167832165, + "loss": 3.329, + "step": 42950 + }, + { + "epoch": 12.521870813675811, + "grad_norm": 0.35883447527885437, + "learning_rate": 0.0004500034965034965, + "loss": 3.3222, + "step": 43000 + }, + { + "epoch": 12.521870813675811, + "eval_accuracy": 0.37045594246450686, + "eval_loss": 3.555795192718506, + "eval_runtime": 179.3484, + "eval_samples_per_second": 92.786, + "eval_steps_per_second": 5.804, + "step": 43000 + }, + { + "epoch": 12.536431941289534, + "grad_norm": 0.3635919988155365, + "learning_rate": 0.0004498286713286713, + "loss": 3.3463, + "step": 43050 + }, + { + "epoch": 12.550993068903256, + "grad_norm": 0.3739362359046936, + "learning_rate": 0.00044965384615384615, + "loss": 3.3449, + "step": 43100 + }, + { + "epoch": 12.565554196516977, + "grad_norm": 0.34652945399284363, + "learning_rate": 0.00044947902097902095, + "loss": 3.3283, + "step": 43150 + }, + { + "epoch": 12.5801153241307, + "grad_norm": 0.37062638998031616, + "learning_rate": 0.0004493041958041958, + "loss": 3.3339, + "step": 43200 + }, + { + "epoch": 12.594676451744423, + "grad_norm": 0.3567717373371124, + "learning_rate": 0.00044912937062937055, + "loss": 3.3448, + "step": 43250 + }, + { + "epoch": 12.609237579358146, + "grad_norm": 0.3564370274543762, + "learning_rate": 0.0004489545454545454, + "loss": 3.3355, + "step": 43300 + }, + { + "epoch": 12.623798706971868, + "grad_norm": 0.3377971351146698, + "learning_rate": 0.0004487797202797202, + "loss": 3.3314, + "step": 43350 + }, + { + "epoch": 12.63835983458559, + "grad_norm": 0.35002943873405457, + "learning_rate": 0.00044860489510489506, + "loss": 3.3442, + "step": 43400 + }, + { + "epoch": 12.652920962199312, + "grad_norm": 0.35957759618759155, + "learning_rate": 0.00044843006993006986, + "loss": 3.3305, + "step": 43450 + }, + { + "epoch": 12.667482089813035, + "grad_norm": 0.3890550136566162, + "learning_rate": 0.0004482552447552447, + "loss": 3.3383, + "step": 43500 + }, + { + "epoch": 12.682043217426758, + "grad_norm": 0.3480173349380493, + "learning_rate": 0.0004480804195804195, + "loss": 3.3468, + "step": 43550 + }, + { + "epoch": 12.69660434504048, + "grad_norm": 0.35312148928642273, + "learning_rate": 0.00044790559440559437, + "loss": 3.3424, + "step": 43600 + }, + { + "epoch": 12.711165472654203, + "grad_norm": 0.3422091007232666, + "learning_rate": 0.00044773076923076917, + "loss": 3.3358, + "step": 43650 + }, + { + "epoch": 12.725726600267924, + "grad_norm": 0.3429749011993408, + "learning_rate": 0.000447555944055944, + "loss": 3.3494, + "step": 43700 + }, + { + "epoch": 12.740287727881647, + "grad_norm": 0.3457909822463989, + "learning_rate": 0.0004473811188811189, + "loss": 3.3504, + "step": 43750 + }, + { + "epoch": 12.75484885549537, + "grad_norm": 0.3556594252586365, + "learning_rate": 0.0004472062937062937, + "loss": 3.327, + "step": 43800 + }, + { + "epoch": 12.769409983109092, + "grad_norm": 0.3568826913833618, + "learning_rate": 0.00044703146853146853, + "loss": 3.3423, + "step": 43850 + }, + { + "epoch": 12.783971110722815, + "grad_norm": 0.3365405797958374, + "learning_rate": 0.00044685664335664333, + "loss": 3.3448, + "step": 43900 + }, + { + "epoch": 12.798532238336536, + "grad_norm": 0.32082322239875793, + "learning_rate": 0.0004466818181818182, + "loss": 3.3455, + "step": 43950 + }, + { + "epoch": 12.813093365950259, + "grad_norm": 0.3770608603954315, + "learning_rate": 0.00044650699300699293, + "loss": 3.3512, + "step": 44000 + }, + { + "epoch": 12.813093365950259, + "eval_accuracy": 0.370579537889524, + "eval_loss": 3.5477168560028076, + "eval_runtime": 179.3881, + "eval_samples_per_second": 92.765, + "eval_steps_per_second": 5.803, + "step": 44000 + }, + { + "epoch": 12.827654493563982, + "grad_norm": 0.35191744565963745, + "learning_rate": 0.0004463321678321678, + "loss": 3.3434, + "step": 44050 + }, + { + "epoch": 12.842215621177704, + "grad_norm": 0.35282081365585327, + "learning_rate": 0.0004461573426573426, + "loss": 3.363, + "step": 44100 + }, + { + "epoch": 12.856776748791427, + "grad_norm": 0.3475187420845032, + "learning_rate": 0.00044598251748251744, + "loss": 3.3419, + "step": 44150 + }, + { + "epoch": 12.871337876405148, + "grad_norm": 0.37921270728111267, + "learning_rate": 0.00044580769230769224, + "loss": 3.3643, + "step": 44200 + }, + { + "epoch": 12.88589900401887, + "grad_norm": 0.35705578327178955, + "learning_rate": 0.0004456328671328671, + "loss": 3.3566, + "step": 44250 + }, + { + "epoch": 12.900460131632594, + "grad_norm": 0.3738381266593933, + "learning_rate": 0.0004454580419580419, + "loss": 3.3578, + "step": 44300 + }, + { + "epoch": 12.915021259246316, + "grad_norm": 0.37262797355651855, + "learning_rate": 0.00044528321678321674, + "loss": 3.3576, + "step": 44350 + }, + { + "epoch": 12.929582386860039, + "grad_norm": 0.36386188864707947, + "learning_rate": 0.0004451083916083916, + "loss": 3.3621, + "step": 44400 + }, + { + "epoch": 12.944143514473762, + "grad_norm": 0.35035935044288635, + "learning_rate": 0.0004449335664335664, + "loss": 3.3512, + "step": 44450 + }, + { + "epoch": 12.958704642087483, + "grad_norm": 0.34800392389297485, + "learning_rate": 0.00044475874125874125, + "loss": 3.3548, + "step": 44500 + }, + { + "epoch": 12.973265769701205, + "grad_norm": 0.3844282031059265, + "learning_rate": 0.00044458391608391605, + "loss": 3.3601, + "step": 44550 + }, + { + "epoch": 12.987826897314928, + "grad_norm": 0.3487918972969055, + "learning_rate": 0.0004444090909090909, + "loss": 3.3575, + "step": 44600 + }, + { + "epoch": 13.002329780418195, + "grad_norm": 0.3930342197418213, + "learning_rate": 0.0004442342657342657, + "loss": 3.3273, + "step": 44650 + }, + { + "epoch": 13.016890908031918, + "grad_norm": 0.3483104109764099, + "learning_rate": 0.00044405944055944056, + "loss": 3.2386, + "step": 44700 + }, + { + "epoch": 13.031452035645641, + "grad_norm": 0.40511345863342285, + "learning_rate": 0.0004438846153846153, + "loss": 3.2456, + "step": 44750 + }, + { + "epoch": 13.046013163259364, + "grad_norm": 0.34650883078575134, + "learning_rate": 0.00044370979020979016, + "loss": 3.2567, + "step": 44800 + }, + { + "epoch": 13.060574290873085, + "grad_norm": 0.34063106775283813, + "learning_rate": 0.00044353496503496496, + "loss": 3.264, + "step": 44850 + }, + { + "epoch": 13.075135418486807, + "grad_norm": 0.3684180974960327, + "learning_rate": 0.0004433601398601398, + "loss": 3.2597, + "step": 44900 + }, + { + "epoch": 13.08969654610053, + "grad_norm": 0.3664800822734833, + "learning_rate": 0.0004431853146853146, + "loss": 3.2519, + "step": 44950 + }, + { + "epoch": 13.104257673714253, + "grad_norm": 0.326003760099411, + "learning_rate": 0.00044301048951048946, + "loss": 3.2655, + "step": 45000 + }, + { + "epoch": 13.104257673714253, + "eval_accuracy": 0.37012114115620637, + "eval_loss": 3.5636181831359863, + "eval_runtime": 179.2598, + "eval_samples_per_second": 92.832, + "eval_steps_per_second": 5.807, + "step": 45000 + }, + { + "epoch": 13.118818801327976, + "grad_norm": 0.34620916843414307, + "learning_rate": 0.00044283566433566426, + "loss": 3.2733, + "step": 45050 + }, + { + "epoch": 13.133379928941697, + "grad_norm": 0.3420342803001404, + "learning_rate": 0.0004426608391608391, + "loss": 3.2653, + "step": 45100 + }, + { + "epoch": 13.14794105655542, + "grad_norm": 0.37536609172821045, + "learning_rate": 0.00044248601398601397, + "loss": 3.2691, + "step": 45150 + }, + { + "epoch": 13.162502184169142, + "grad_norm": 0.3489169478416443, + "learning_rate": 0.00044231118881118877, + "loss": 3.2838, + "step": 45200 + }, + { + "epoch": 13.177063311782865, + "grad_norm": 0.3672625720500946, + "learning_rate": 0.0004421363636363636, + "loss": 3.271, + "step": 45250 + }, + { + "epoch": 13.191624439396588, + "grad_norm": 0.33298152685165405, + "learning_rate": 0.0004419615384615384, + "loss": 3.2791, + "step": 45300 + }, + { + "epoch": 13.206185567010309, + "grad_norm": 0.35840123891830444, + "learning_rate": 0.0004417867132867133, + "loss": 3.2835, + "step": 45350 + }, + { + "epoch": 13.220746694624031, + "grad_norm": 0.34123364090919495, + "learning_rate": 0.0004416118881118881, + "loss": 3.2848, + "step": 45400 + }, + { + "epoch": 13.235307822237754, + "grad_norm": 0.35084405541419983, + "learning_rate": 0.00044143706293706293, + "loss": 3.2859, + "step": 45450 + }, + { + "epoch": 13.249868949851477, + "grad_norm": 0.38360145688056946, + "learning_rate": 0.0004412622377622377, + "loss": 3.2838, + "step": 45500 + }, + { + "epoch": 13.2644300774652, + "grad_norm": 0.3318229019641876, + "learning_rate": 0.00044108741258741253, + "loss": 3.2892, + "step": 45550 + }, + { + "epoch": 13.27899120507892, + "grad_norm": 0.3659403622150421, + "learning_rate": 0.00044091258741258733, + "loss": 3.3001, + "step": 45600 + }, + { + "epoch": 13.293552332692643, + "grad_norm": 0.35524776577949524, + "learning_rate": 0.0004407377622377622, + "loss": 3.2927, + "step": 45650 + }, + { + "epoch": 13.308113460306366, + "grad_norm": 0.3889663517475128, + "learning_rate": 0.000440562937062937, + "loss": 3.2983, + "step": 45700 + }, + { + "epoch": 13.322674587920089, + "grad_norm": 0.34109199047088623, + "learning_rate": 0.00044038811188811184, + "loss": 3.2808, + "step": 45750 + }, + { + "epoch": 13.337235715533811, + "grad_norm": 0.3871344029903412, + "learning_rate": 0.0004402132867132867, + "loss": 3.3107, + "step": 45800 + }, + { + "epoch": 13.351796843147532, + "grad_norm": 0.3498539328575134, + "learning_rate": 0.0004400384615384615, + "loss": 3.2938, + "step": 45850 + }, + { + "epoch": 13.366357970761255, + "grad_norm": 0.3516894280910492, + "learning_rate": 0.00043986363636363635, + "loss": 3.305, + "step": 45900 + }, + { + "epoch": 13.380919098374978, + "grad_norm": 0.35592004656791687, + "learning_rate": 0.00043968881118881115, + "loss": 3.2998, + "step": 45950 + }, + { + "epoch": 13.3954802259887, + "grad_norm": 0.32546910643577576, + "learning_rate": 0.000439513986013986, + "loss": 3.3094, + "step": 46000 + }, + { + "epoch": 13.3954802259887, + "eval_accuracy": 0.37056460295234306, + "eval_loss": 3.5570590496063232, + "eval_runtime": 179.3634, + "eval_samples_per_second": 92.778, + "eval_steps_per_second": 5.804, + "step": 46000 + }, + { + "epoch": 13.410041353602423, + "grad_norm": 0.37634313106536865, + "learning_rate": 0.0004393391608391608, + "loss": 3.3115, + "step": 46050 + }, + { + "epoch": 13.424602481216146, + "grad_norm": 0.3550618886947632, + "learning_rate": 0.00043916433566433565, + "loss": 3.3024, + "step": 46100 + }, + { + "epoch": 13.439163608829867, + "grad_norm": 0.3748590052127838, + "learning_rate": 0.00043898951048951045, + "loss": 3.289, + "step": 46150 + }, + { + "epoch": 13.45372473644359, + "grad_norm": 0.35100436210632324, + "learning_rate": 0.0004388146853146853, + "loss": 3.3117, + "step": 46200 + }, + { + "epoch": 13.468285864057313, + "grad_norm": 0.36626309156417847, + "learning_rate": 0.00043863986013986005, + "loss": 3.3111, + "step": 46250 + }, + { + "epoch": 13.482846991671035, + "grad_norm": 0.3601759672164917, + "learning_rate": 0.0004384650349650349, + "loss": 3.3107, + "step": 46300 + }, + { + "epoch": 13.497408119284758, + "grad_norm": 0.33450236916542053, + "learning_rate": 0.0004382902097902097, + "loss": 3.3077, + "step": 46350 + }, + { + "epoch": 13.51196924689848, + "grad_norm": 0.34406334161758423, + "learning_rate": 0.00043811538461538456, + "loss": 3.3194, + "step": 46400 + }, + { + "epoch": 13.526530374512202, + "grad_norm": 0.34490951895713806, + "learning_rate": 0.0004379405594405594, + "loss": 3.3069, + "step": 46450 + }, + { + "epoch": 13.541091502125925, + "grad_norm": 0.3848843574523926, + "learning_rate": 0.0004377657342657342, + "loss": 3.3134, + "step": 46500 + }, + { + "epoch": 13.555652629739647, + "grad_norm": 0.3632429838180542, + "learning_rate": 0.00043759090909090907, + "loss": 3.3208, + "step": 46550 + }, + { + "epoch": 13.57021375735337, + "grad_norm": 0.3939577341079712, + "learning_rate": 0.00043741608391608387, + "loss": 3.324, + "step": 46600 + }, + { + "epoch": 13.584774884967091, + "grad_norm": 0.3596148192882538, + "learning_rate": 0.0004372412587412587, + "loss": 3.3322, + "step": 46650 + }, + { + "epoch": 13.599336012580814, + "grad_norm": 0.35220861434936523, + "learning_rate": 0.0004370664335664335, + "loss": 3.3069, + "step": 46700 + }, + { + "epoch": 13.613897140194537, + "grad_norm": 0.3723880350589752, + "learning_rate": 0.0004368916083916084, + "loss": 3.327, + "step": 46750 + }, + { + "epoch": 13.62845826780826, + "grad_norm": 0.3383399546146393, + "learning_rate": 0.0004367167832167832, + "loss": 3.3256, + "step": 46800 + }, + { + "epoch": 13.643019395421982, + "grad_norm": 0.3382317125797272, + "learning_rate": 0.00043654195804195803, + "loss": 3.3374, + "step": 46850 + }, + { + "epoch": 13.657580523035705, + "grad_norm": 0.34736913442611694, + "learning_rate": 0.00043636713286713283, + "loss": 3.329, + "step": 46900 + }, + { + "epoch": 13.672141650649426, + "grad_norm": 0.33893242478370667, + "learning_rate": 0.0004361923076923077, + "loss": 3.3123, + "step": 46950 + }, + { + "epoch": 13.686702778263149, + "grad_norm": 0.35469168424606323, + "learning_rate": 0.00043601748251748243, + "loss": 3.3239, + "step": 47000 + }, + { + "epoch": 13.686702778263149, + "eval_accuracy": 0.37072630010686125, + "eval_loss": 3.550806999206543, + "eval_runtime": 179.4918, + "eval_samples_per_second": 92.712, + "eval_steps_per_second": 5.8, + "step": 47000 + }, + { + "epoch": 13.701263905876871, + "grad_norm": 0.3704715073108673, + "learning_rate": 0.00043584265734265734, + "loss": 3.3243, + "step": 47050 + }, + { + "epoch": 13.715825033490594, + "grad_norm": 0.3501978814601898, + "learning_rate": 0.0004356678321678321, + "loss": 3.3377, + "step": 47100 + }, + { + "epoch": 13.730386161104317, + "grad_norm": 0.3527981638908386, + "learning_rate": 0.00043549300699300694, + "loss": 3.3309, + "step": 47150 + }, + { + "epoch": 13.744947288718038, + "grad_norm": 0.36017870903015137, + "learning_rate": 0.0004353181818181818, + "loss": 3.3214, + "step": 47200 + }, + { + "epoch": 13.75950841633176, + "grad_norm": 0.3922679126262665, + "learning_rate": 0.0004351433566433566, + "loss": 3.3286, + "step": 47250 + }, + { + "epoch": 13.774069543945483, + "grad_norm": 0.3513588011264801, + "learning_rate": 0.00043496853146853144, + "loss": 3.3451, + "step": 47300 + }, + { + "epoch": 13.788630671559206, + "grad_norm": 0.3663727045059204, + "learning_rate": 0.00043479370629370624, + "loss": 3.3265, + "step": 47350 + }, + { + "epoch": 13.803191799172929, + "grad_norm": 0.35833120346069336, + "learning_rate": 0.0004346188811188811, + "loss": 3.3246, + "step": 47400 + }, + { + "epoch": 13.81775292678665, + "grad_norm": 0.39127853512763977, + "learning_rate": 0.0004344440559440559, + "loss": 3.3396, + "step": 47450 + }, + { + "epoch": 13.832314054400372, + "grad_norm": 0.34158089756965637, + "learning_rate": 0.00043426923076923075, + "loss": 3.3327, + "step": 47500 + }, + { + "epoch": 13.846875182014095, + "grad_norm": 0.34215644001960754, + "learning_rate": 0.00043409440559440555, + "loss": 3.3317, + "step": 47550 + }, + { + "epoch": 13.861436309627818, + "grad_norm": 0.3922632038593292, + "learning_rate": 0.0004339195804195804, + "loss": 3.3378, + "step": 47600 + }, + { + "epoch": 13.87599743724154, + "grad_norm": 0.35244500637054443, + "learning_rate": 0.0004337447552447552, + "loss": 3.3386, + "step": 47650 + }, + { + "epoch": 13.890558564855262, + "grad_norm": 0.3905071020126343, + "learning_rate": 0.00043356993006993006, + "loss": 3.3239, + "step": 47700 + }, + { + "epoch": 13.905119692468984, + "grad_norm": NaN, + "learning_rate": 0.0004333951048951048, + "loss": 3.3461, + "step": 47750 + }, + { + "epoch": 13.919680820082707, + "grad_norm": 0.3413317799568176, + "learning_rate": 0.0004332202797202797, + "loss": 3.3401, + "step": 47800 + }, + { + "epoch": 13.93424194769643, + "grad_norm": 0.3655047118663788, + "learning_rate": 0.00043304545454545456, + "loss": 3.3375, + "step": 47850 + }, + { + "epoch": 13.948803075310153, + "grad_norm": 0.35636481642723083, + "learning_rate": 0.0004328706293706293, + "loss": 3.3318, + "step": 47900 + }, + { + "epoch": 13.963364202923874, + "grad_norm": 0.35816138982772827, + "learning_rate": 0.00043269580419580416, + "loss": 3.3291, + "step": 47950 + }, + { + "epoch": 13.977925330537596, + "grad_norm": 0.3533070683479309, + "learning_rate": 0.00043252097902097896, + "loss": 3.3384, + "step": 48000 + }, + { + "epoch": 13.977925330537596, + "eval_accuracy": 0.3715532487545497, + "eval_loss": 3.543606996536255, + "eval_runtime": 179.3377, + "eval_samples_per_second": 92.791, + "eval_steps_per_second": 5.805, + "step": 48000 + }, + { + "epoch": 13.992486458151319, + "grad_norm": 0.3505331873893738, + "learning_rate": 0.0004323461538461538, + "loss": 3.3509, + "step": 48050 + }, + { + "epoch": 14.006989341254586, + "grad_norm": 0.33162781596183777, + "learning_rate": 0.0004321713286713286, + "loss": 3.2666, + "step": 48100 + }, + { + "epoch": 14.021550468868309, + "grad_norm": 0.3567025661468506, + "learning_rate": 0.00043199650349650347, + "loss": 3.2307, + "step": 48150 + }, + { + "epoch": 14.036111596482032, + "grad_norm": 0.39268216490745544, + "learning_rate": 0.00043182167832167827, + "loss": 3.236, + "step": 48200 + }, + { + "epoch": 14.050672724095755, + "grad_norm": 0.3755398392677307, + "learning_rate": 0.0004316468531468531, + "loss": 3.2447, + "step": 48250 + }, + { + "epoch": 14.065233851709475, + "grad_norm": 0.37311825156211853, + "learning_rate": 0.0004314720279720279, + "loss": 3.2517, + "step": 48300 + }, + { + "epoch": 14.079794979323198, + "grad_norm": 0.3706851601600647, + "learning_rate": 0.0004312972027972028, + "loss": 3.2327, + "step": 48350 + }, + { + "epoch": 14.094356106936921, + "grad_norm": 0.3393601179122925, + "learning_rate": 0.0004311223776223776, + "loss": 3.2466, + "step": 48400 + }, + { + "epoch": 14.108917234550644, + "grad_norm": 0.33590853214263916, + "learning_rate": 0.00043094755244755243, + "loss": 3.2537, + "step": 48450 + }, + { + "epoch": 14.123478362164366, + "grad_norm": 0.370746910572052, + "learning_rate": 0.0004307727272727272, + "loss": 3.2524, + "step": 48500 + }, + { + "epoch": 14.13803948977809, + "grad_norm": 0.37777411937713623, + "learning_rate": 0.0004305979020979021, + "loss": 3.2438, + "step": 48550 + }, + { + "epoch": 14.15260061739181, + "grad_norm": 0.3572547435760498, + "learning_rate": 0.00043042307692307694, + "loss": 3.2555, + "step": 48600 + }, + { + "epoch": 14.167161745005533, + "grad_norm": 0.39759549498558044, + "learning_rate": 0.0004302482517482517, + "loss": 3.2567, + "step": 48650 + }, + { + "epoch": 14.181722872619256, + "grad_norm": 0.37063178420066833, + "learning_rate": 0.00043007342657342654, + "loss": 3.2589, + "step": 48700 + }, + { + "epoch": 14.196284000232978, + "grad_norm": 0.365633487701416, + "learning_rate": 0.00042989860139860134, + "loss": 3.2801, + "step": 48750 + }, + { + "epoch": 14.210845127846701, + "grad_norm": 0.35019052028656006, + "learning_rate": 0.0004297237762237762, + "loss": 3.2774, + "step": 48800 + }, + { + "epoch": 14.225406255460422, + "grad_norm": 0.35043588280677795, + "learning_rate": 0.000429548951048951, + "loss": 3.2659, + "step": 48850 + }, + { + "epoch": 14.239967383074145, + "grad_norm": 0.3760192394256592, + "learning_rate": 0.00042937412587412585, + "loss": 3.2845, + "step": 48900 + }, + { + "epoch": 14.254528510687868, + "grad_norm": 0.35163772106170654, + "learning_rate": 0.00042919930069930065, + "loss": 3.2683, + "step": 48950 + }, + { + "epoch": 14.26908963830159, + "grad_norm": 0.3676861524581909, + "learning_rate": 0.0004290244755244755, + "loss": 3.2693, + "step": 49000 + }, + { + "epoch": 14.26908963830159, + "eval_accuracy": 0.37074393979644504, + "eval_loss": 3.561136245727539, + "eval_runtime": 179.3898, + "eval_samples_per_second": 92.764, + "eval_steps_per_second": 5.803, + "step": 49000 + }, + { + "epoch": 14.283650765915313, + "grad_norm": 0.3473881185054779, + "learning_rate": 0.0004288496503496503, + "loss": 3.2762, + "step": 49050 + }, + { + "epoch": 14.298211893529034, + "grad_norm": 0.3722080886363983, + "learning_rate": 0.00042867482517482515, + "loss": 3.2836, + "step": 49100 + }, + { + "epoch": 14.312773021142757, + "grad_norm": 0.35850104689598083, + "learning_rate": 0.00042849999999999995, + "loss": 3.285, + "step": 49150 + }, + { + "epoch": 14.32733414875648, + "grad_norm": 0.3875146210193634, + "learning_rate": 0.0004283251748251748, + "loss": 3.2743, + "step": 49200 + }, + { + "epoch": 14.341895276370202, + "grad_norm": 0.3412521779537201, + "learning_rate": 0.00042815034965034966, + "loss": 3.2891, + "step": 49250 + }, + { + "epoch": 14.356456403983925, + "grad_norm": 0.36994293332099915, + "learning_rate": 0.00042797552447552446, + "loss": 3.2841, + "step": 49300 + }, + { + "epoch": 14.371017531597648, + "grad_norm": 0.36702197790145874, + "learning_rate": 0.0004278006993006993, + "loss": 3.2903, + "step": 49350 + }, + { + "epoch": 14.385578659211369, + "grad_norm": 0.4124605357646942, + "learning_rate": 0.00042762587412587406, + "loss": 3.2874, + "step": 49400 + }, + { + "epoch": 14.400139786825092, + "grad_norm": 0.39137279987335205, + "learning_rate": 0.0004274510489510489, + "loss": 3.2784, + "step": 49450 + }, + { + "epoch": 14.414700914438814, + "grad_norm": 0.3900337219238281, + "learning_rate": 0.0004272762237762237, + "loss": 3.2899, + "step": 49500 + }, + { + "epoch": 14.429262042052537, + "grad_norm": 0.3700048625469208, + "learning_rate": 0.00042710139860139857, + "loss": 3.2876, + "step": 49550 + }, + { + "epoch": 14.44382316966626, + "grad_norm": 0.3632020056247711, + "learning_rate": 0.00042692657342657337, + "loss": 3.2918, + "step": 49600 + }, + { + "epoch": 14.45838429727998, + "grad_norm": 0.37253108620643616, + "learning_rate": 0.0004267517482517482, + "loss": 3.3046, + "step": 49650 + }, + { + "epoch": 14.472945424893704, + "grad_norm": 0.37529903650283813, + "learning_rate": 0.000426576923076923, + "loss": 3.2981, + "step": 49700 + }, + { + "epoch": 14.487506552507426, + "grad_norm": 0.3445450961589813, + "learning_rate": 0.0004264020979020979, + "loss": 3.2962, + "step": 49750 + }, + { + "epoch": 14.502067680121149, + "grad_norm": 0.37300825119018555, + "learning_rate": 0.0004262272727272727, + "loss": 3.2913, + "step": 49800 + }, + { + "epoch": 14.516628807734872, + "grad_norm": 0.3445644676685333, + "learning_rate": 0.00042605244755244753, + "loss": 3.3022, + "step": 49850 + }, + { + "epoch": 14.531189935348593, + "grad_norm": 0.4042005240917206, + "learning_rate": 0.00042587762237762233, + "loss": 3.2982, + "step": 49900 + }, + { + "epoch": 14.545751062962315, + "grad_norm": 0.3575955331325531, + "learning_rate": 0.0004257027972027972, + "loss": 3.305, + "step": 49950 + }, + { + "epoch": 14.560312190576038, + "grad_norm": 0.3821354806423187, + "learning_rate": 0.00042552797202797204, + "loss": 3.3071, + "step": 50000 + }, + { + "epoch": 14.560312190576038, + "eval_accuracy": 0.3708429572539754, + "eval_loss": 3.5507326126098633, + "eval_runtime": 179.5347, + "eval_samples_per_second": 92.69, + "eval_steps_per_second": 5.798, + "step": 50000 + } + ], + "logging_steps": 50, + "max_steps": 171700, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 2 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.045069359611904e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}