diff --git "a/cost_to_push_frequency_5039/checkpoint-70000/trainer_state.json" "b/cost_to_push_frequency_5039/checkpoint-70000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_push_frequency_5039/checkpoint-70000/trainer_state.json" @@ -0,0 +1,10473 @@ +{ + "best_global_step": 65000, + "best_metric": 3.5314598083496094, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_5039/checkpoint-30000", + "epoch": 20.384413769002272, + "eval_steps": 1000, + "global_step": 70000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014561127613722406, + "grad_norm": 0.7864501476287842, + "learning_rate": 0.000294, + "loss": 8.4581, + "step": 50 + }, + { + "epoch": 0.029122255227444813, + "grad_norm": 0.5157894492149353, + "learning_rate": 0.0005939999999999999, + "loss": 6.7389, + "step": 100 + }, + { + "epoch": 0.04368338284116722, + "grad_norm": 0.7220619320869446, + "learning_rate": 0.0005998286713286713, + "loss": 6.358, + "step": 150 + }, + { + "epoch": 0.058244510454889625, + "grad_norm": 0.45132747292518616, + "learning_rate": 0.0005996538461538461, + "loss": 6.1209, + "step": 200 + }, + { + "epoch": 0.07280563806861204, + "grad_norm": 0.5135724544525146, + "learning_rate": 0.0005994790209790209, + "loss": 5.9889, + "step": 250 + }, + { + "epoch": 0.08736676568233444, + "grad_norm": 0.492136687040329, + "learning_rate": 0.0005993041958041958, + "loss": 5.8792, + "step": 300 + }, + { + "epoch": 0.10192789329605685, + "grad_norm": 0.4208127558231354, + "learning_rate": 0.0005991293706293705, + "loss": 5.7207, + "step": 350 + }, + { + "epoch": 0.11648902090977925, + "grad_norm": 0.41867050528526306, + "learning_rate": 0.0005989545454545454, + "loss": 5.6164, + "step": 400 + }, + { + "epoch": 0.13105014852350166, + "grad_norm": 0.5124858021736145, + "learning_rate": 0.0005987797202797202, + "loss": 5.501, + "step": 450 + }, + { + "epoch": 
0.14561127613722408, + "grad_norm": 0.4207671880722046, + "learning_rate": 0.000598604895104895, + "loss": 5.406, + "step": 500 + }, + { + "epoch": 0.16017240375094646, + "grad_norm": 0.4488603472709656, + "learning_rate": 0.0005984300699300698, + "loss": 5.3207, + "step": 550 + }, + { + "epoch": 0.17473353136466888, + "grad_norm": 0.50998854637146, + "learning_rate": 0.0005982552447552447, + "loss": 5.243, + "step": 600 + }, + { + "epoch": 0.1892946589783913, + "grad_norm": 0.4845823049545288, + "learning_rate": 0.0005980804195804195, + "loss": 5.175, + "step": 650 + }, + { + "epoch": 0.2038557865921137, + "grad_norm": 0.4981078803539276, + "learning_rate": 0.0005979055944055943, + "loss": 5.1142, + "step": 700 + }, + { + "epoch": 0.2184169142058361, + "grad_norm": 0.6035723686218262, + "learning_rate": 0.0005977307692307691, + "loss": 5.0627, + "step": 750 + }, + { + "epoch": 0.2329780418195585, + "grad_norm": 0.49036705493927, + "learning_rate": 0.000597555944055944, + "loss": 5.0033, + "step": 800 + }, + { + "epoch": 0.24753916943328091, + "grad_norm": 0.4498800039291382, + "learning_rate": 0.0005973811188811188, + "loss": 4.9588, + "step": 850 + }, + { + "epoch": 0.2621002970470033, + "grad_norm": 0.44455981254577637, + "learning_rate": 0.0005972062937062936, + "loss": 4.9198, + "step": 900 + }, + { + "epoch": 0.27666142466072574, + "grad_norm": 0.40544578433036804, + "learning_rate": 0.0005970314685314685, + "loss": 4.8619, + "step": 950 + }, + { + "epoch": 0.29122255227444815, + "grad_norm": 0.4749813377857208, + "learning_rate": 0.0005968566433566433, + "loss": 4.8278, + "step": 1000 + }, + { + "epoch": 0.29122255227444815, + "eval_accuracy": 0.2567868305426714, + "eval_loss": 4.739474773406982, + "eval_runtime": 180.3909, + "eval_samples_per_second": 92.277, + "eval_steps_per_second": 5.771, + "step": 1000 + }, + { + "epoch": 0.30578367988817057, + "grad_norm": 0.43021872639656067, + "learning_rate": 0.0005966818181818181, + "loss": 4.7606, + "step": 1050 
+ }, + { + "epoch": 0.3203448075018929, + "grad_norm": 0.4772787094116211, + "learning_rate": 0.0005965069930069929, + "loss": 4.7182, + "step": 1100 + }, + { + "epoch": 0.33490593511561534, + "grad_norm": 0.4763582646846771, + "learning_rate": 0.0005963321678321677, + "loss": 4.6945, + "step": 1150 + }, + { + "epoch": 0.34946706272933775, + "grad_norm": 0.4743852913379669, + "learning_rate": 0.0005961573426573425, + "loss": 4.6563, + "step": 1200 + }, + { + "epoch": 0.36402819034306017, + "grad_norm": 0.43077409267425537, + "learning_rate": 0.0005959825174825174, + "loss": 4.6227, + "step": 1250 + }, + { + "epoch": 0.3785893179567826, + "grad_norm": 0.4979049563407898, + "learning_rate": 0.0005958076923076922, + "loss": 4.5896, + "step": 1300 + }, + { + "epoch": 0.393150445570505, + "grad_norm": 0.4326588809490204, + "learning_rate": 0.000595632867132867, + "loss": 4.5628, + "step": 1350 + }, + { + "epoch": 0.4077115731842274, + "grad_norm": 0.4023474156856537, + "learning_rate": 0.0005954580419580418, + "loss": 4.5543, + "step": 1400 + }, + { + "epoch": 0.4222727007979498, + "grad_norm": 0.4054296612739563, + "learning_rate": 0.0005952832167832168, + "loss": 4.5307, + "step": 1450 + }, + { + "epoch": 0.4368338284116722, + "grad_norm": 0.4788004159927368, + "learning_rate": 0.0005951083916083916, + "loss": 4.4881, + "step": 1500 + }, + { + "epoch": 0.4513949560253946, + "grad_norm": 0.46861621737480164, + "learning_rate": 0.0005949335664335664, + "loss": 4.4843, + "step": 1550 + }, + { + "epoch": 0.465956083639117, + "grad_norm": 0.4255252480506897, + "learning_rate": 0.0005947587412587413, + "loss": 4.4711, + "step": 1600 + }, + { + "epoch": 0.4805172112528394, + "grad_norm": 0.4327350854873657, + "learning_rate": 0.0005945839160839161, + "loss": 4.4385, + "step": 1650 + }, + { + "epoch": 0.49507833886656183, + "grad_norm": 0.42307257652282715, + "learning_rate": 0.0005944090909090909, + "loss": 4.436, + "step": 1700 + }, + { + "epoch": 0.5096394664802842, + 
"grad_norm": 0.4274670481681824, + "learning_rate": 0.0005942342657342657, + "loss": 4.4006, + "step": 1750 + }, + { + "epoch": 0.5242005940940067, + "grad_norm": 0.42061132192611694, + "learning_rate": 0.0005940594405594406, + "loss": 4.3802, + "step": 1800 + }, + { + "epoch": 0.5387617217077291, + "grad_norm": 0.4404565393924713, + "learning_rate": 0.0005938846153846153, + "loss": 4.3798, + "step": 1850 + }, + { + "epoch": 0.5533228493214515, + "grad_norm": 0.41425594687461853, + "learning_rate": 0.0005937097902097902, + "loss": 4.3565, + "step": 1900 + }, + { + "epoch": 0.5678839769351739, + "grad_norm": 0.39919501543045044, + "learning_rate": 0.000593534965034965, + "loss": 4.344, + "step": 1950 + }, + { + "epoch": 0.5824451045488963, + "grad_norm": 0.44774821400642395, + "learning_rate": 0.0005933601398601398, + "loss": 4.3264, + "step": 2000 + }, + { + "epoch": 0.5824451045488963, + "eval_accuracy": 0.30012405206330606, + "eval_loss": 4.277124404907227, + "eval_runtime": 180.5704, + "eval_samples_per_second": 92.186, + "eval_steps_per_second": 5.765, + "step": 2000 + }, + { + "epoch": 0.5970062321626187, + "grad_norm": 0.41692787408828735, + "learning_rate": 0.0005931853146853146, + "loss": 4.3319, + "step": 2050 + }, + { + "epoch": 0.6115673597763411, + "grad_norm": 0.37570783495903015, + "learning_rate": 0.0005930104895104895, + "loss": 4.3138, + "step": 2100 + }, + { + "epoch": 0.6261284873900634, + "grad_norm": 0.415437787771225, + "learning_rate": 0.0005928356643356643, + "loss": 4.2858, + "step": 2150 + }, + { + "epoch": 0.6406896150037859, + "grad_norm": 0.3619239330291748, + "learning_rate": 0.0005926608391608391, + "loss": 4.2923, + "step": 2200 + }, + { + "epoch": 0.6552507426175083, + "grad_norm": 0.4092442989349365, + "learning_rate": 0.000592486013986014, + "loss": 4.2764, + "step": 2250 + }, + { + "epoch": 0.6698118702312307, + "grad_norm": 0.37751713395118713, + "learning_rate": 0.0005923111888111888, + "loss": 4.2665, + "step": 2300 + }, + { + 
"epoch": 0.6843729978449531, + "grad_norm": 0.41071465611457825, + "learning_rate": 0.0005921363636363636, + "loss": 4.2555, + "step": 2350 + }, + { + "epoch": 0.6989341254586755, + "grad_norm": 0.39890703558921814, + "learning_rate": 0.0005919615384615384, + "loss": 4.2449, + "step": 2400 + }, + { + "epoch": 0.7134952530723979, + "grad_norm": 0.36429277062416077, + "learning_rate": 0.0005917867132867133, + "loss": 4.2621, + "step": 2450 + }, + { + "epoch": 0.7280563806861203, + "grad_norm": 0.375135213136673, + "learning_rate": 0.0005916118881118881, + "loss": 4.238, + "step": 2500 + }, + { + "epoch": 0.7426175082998427, + "grad_norm": 0.3689658045768738, + "learning_rate": 0.0005914370629370629, + "loss": 4.2122, + "step": 2550 + }, + { + "epoch": 0.7571786359135652, + "grad_norm": 0.3870299756526947, + "learning_rate": 0.0005912622377622377, + "loss": 4.2154, + "step": 2600 + }, + { + "epoch": 0.7717397635272876, + "grad_norm": 0.379463255405426, + "learning_rate": 0.0005910874125874125, + "loss": 4.2048, + "step": 2650 + }, + { + "epoch": 0.78630089114101, + "grad_norm": 0.37131428718566895, + "learning_rate": 0.0005909125874125873, + "loss": 4.1908, + "step": 2700 + }, + { + "epoch": 0.8008620187547324, + "grad_norm": 0.37358808517456055, + "learning_rate": 0.0005907377622377622, + "loss": 4.1776, + "step": 2750 + }, + { + "epoch": 0.8154231463684548, + "grad_norm": 0.37713292241096497, + "learning_rate": 0.000590562937062937, + "loss": 4.1811, + "step": 2800 + }, + { + "epoch": 0.8299842739821772, + "grad_norm": 0.3663122355937958, + "learning_rate": 0.0005903881118881118, + "loss": 4.1753, + "step": 2850 + }, + { + "epoch": 0.8445454015958996, + "grad_norm": 0.40076518058776855, + "learning_rate": 0.0005902132867132867, + "loss": 4.1634, + "step": 2900 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 0.347970187664032, + "learning_rate": 0.0005900384615384615, + "loss": 4.1627, + "step": 2950 + }, + { + "epoch": 0.8736676568233444, + "grad_norm": 
0.37718185782432556, + "learning_rate": 0.0005898636363636363, + "loss": 4.1401, + "step": 3000 + }, + { + "epoch": 0.8736676568233444, + "eval_accuracy": 0.3151520801645312, + "eval_loss": 4.096925735473633, + "eval_runtime": 180.6379, + "eval_samples_per_second": 92.151, + "eval_steps_per_second": 5.763, + "step": 3000 + }, + { + "epoch": 0.8882287844370668, + "grad_norm": 0.3469873070716858, + "learning_rate": 0.0005896888111888111, + "loss": 4.1311, + "step": 3050 + }, + { + "epoch": 0.9027899120507892, + "grad_norm": 0.3316064476966858, + "learning_rate": 0.000589513986013986, + "loss": 4.1353, + "step": 3100 + }, + { + "epoch": 0.9173510396645116, + "grad_norm": 0.362809956073761, + "learning_rate": 0.0005893391608391608, + "loss": 4.1216, + "step": 3150 + }, + { + "epoch": 0.931912167278234, + "grad_norm": 0.384818971157074, + "learning_rate": 0.0005891643356643356, + "loss": 4.129, + "step": 3200 + }, + { + "epoch": 0.9464732948919564, + "grad_norm": 0.3387335538864136, + "learning_rate": 0.0005889895104895104, + "loss": 4.1201, + "step": 3250 + }, + { + "epoch": 0.9610344225056788, + "grad_norm": 0.33730268478393555, + "learning_rate": 0.0005888146853146853, + "loss": 4.1139, + "step": 3300 + }, + { + "epoch": 0.9755955501194012, + "grad_norm": 0.3745166063308716, + "learning_rate": 0.00058863986013986, + "loss": 4.0941, + "step": 3350 + }, + { + "epoch": 0.9901566777331237, + "grad_norm": 0.35486307740211487, + "learning_rate": 0.0005884650349650349, + "loss": 4.088, + "step": 3400 + }, + { + "epoch": 1.004659560836391, + "grad_norm": 0.36032548546791077, + "learning_rate": 0.0005882902097902097, + "loss": 4.0745, + "step": 3450 + }, + { + "epoch": 1.0192206884501136, + "grad_norm": 0.38221433758735657, + "learning_rate": 0.0005881153846153845, + "loss": 4.0227, + "step": 3500 + }, + { + "epoch": 1.033781816063836, + "grad_norm": 0.35653796792030334, + "learning_rate": 0.0005879405594405594, + "loss": 4.018, + "step": 3550 + }, + { + "epoch": 
1.0483429436775584, + "grad_norm": 0.3454371392726898, + "learning_rate": 0.0005877657342657342, + "loss": 4.0198, + "step": 3600 + }, + { + "epoch": 1.0629040712912807, + "grad_norm": 0.3450075387954712, + "learning_rate": 0.000587590909090909, + "loss": 4.0057, + "step": 3650 + }, + { + "epoch": 1.0774651989050033, + "grad_norm": 0.3462330996990204, + "learning_rate": 0.0005874160839160838, + "loss": 3.9938, + "step": 3700 + }, + { + "epoch": 1.0920263265187256, + "grad_norm": 0.35249096155166626, + "learning_rate": 0.0005872412587412587, + "loss": 4.0156, + "step": 3750 + }, + { + "epoch": 1.106587454132448, + "grad_norm": 0.32855677604675293, + "learning_rate": 0.0005870664335664335, + "loss": 4.013, + "step": 3800 + }, + { + "epoch": 1.1211485817461704, + "grad_norm": 0.33263683319091797, + "learning_rate": 0.0005868916083916083, + "loss": 4.0015, + "step": 3850 + }, + { + "epoch": 1.135709709359893, + "grad_norm": 0.3342099189758301, + "learning_rate": 0.0005867167832167831, + "loss": 4.0121, + "step": 3900 + }, + { + "epoch": 1.1502708369736152, + "grad_norm": 0.3328862190246582, + "learning_rate": 0.000586541958041958, + "loss": 4.0018, + "step": 3950 + }, + { + "epoch": 1.1648319645873377, + "grad_norm": 0.32106339931488037, + "learning_rate": 0.0005863671328671328, + "loss": 3.9957, + "step": 4000 + }, + { + "epoch": 1.1648319645873377, + "eval_accuracy": 0.32516712112451923, + "eval_loss": 3.989675521850586, + "eval_runtime": 180.2628, + "eval_samples_per_second": 92.343, + "eval_steps_per_second": 5.775, + "step": 4000 + }, + { + "epoch": 1.17939309220106, + "grad_norm": 0.328213632106781, + "learning_rate": 0.0005861923076923076, + "loss": 3.9967, + "step": 4050 + }, + { + "epoch": 1.1939542198147826, + "grad_norm": 0.32576414942741394, + "learning_rate": 0.0005860174825174824, + "loss": 3.9813, + "step": 4100 + }, + { + "epoch": 1.2085153474285049, + "grad_norm": 0.34450486302375793, + "learning_rate": 0.0005858426573426573, + "loss": 3.9912, + 
"step": 4150 + }, + { + "epoch": 1.2230764750422272, + "grad_norm": 0.3487413823604584, + "learning_rate": 0.000585667832167832, + "loss": 3.9733, + "step": 4200 + }, + { + "epoch": 1.2376376026559497, + "grad_norm": 0.34708768129348755, + "learning_rate": 0.000585493006993007, + "loss": 3.9726, + "step": 4250 + }, + { + "epoch": 1.2521987302696722, + "grad_norm": 0.3304347097873688, + "learning_rate": 0.0005853181818181817, + "loss": 3.9653, + "step": 4300 + }, + { + "epoch": 1.2667598578833945, + "grad_norm": 0.3297647535800934, + "learning_rate": 0.0005851433566433565, + "loss": 3.9575, + "step": 4350 + }, + { + "epoch": 1.2813209854971168, + "grad_norm": 0.33607807755470276, + "learning_rate": 0.0005849685314685315, + "loss": 3.97, + "step": 4400 + }, + { + "epoch": 1.2958821131108393, + "grad_norm": 0.3388855457305908, + "learning_rate": 0.0005847937062937063, + "loss": 3.9794, + "step": 4450 + }, + { + "epoch": 1.3104432407245616, + "grad_norm": 0.32652390003204346, + "learning_rate": 0.0005846188811188811, + "loss": 3.9708, + "step": 4500 + }, + { + "epoch": 1.3250043683382842, + "grad_norm": 0.3443945050239563, + "learning_rate": 0.0005844440559440559, + "loss": 3.9656, + "step": 4550 + }, + { + "epoch": 1.3395654959520065, + "grad_norm": 0.341253399848938, + "learning_rate": 0.0005842692307692308, + "loss": 3.9506, + "step": 4600 + }, + { + "epoch": 1.354126623565729, + "grad_norm": 0.3474685549736023, + "learning_rate": 0.0005840944055944056, + "loss": 3.9478, + "step": 4650 + }, + { + "epoch": 1.3686877511794513, + "grad_norm": 0.34360435605049133, + "learning_rate": 0.0005839195804195804, + "loss": 3.9412, + "step": 4700 + }, + { + "epoch": 1.3832488787931738, + "grad_norm": 0.35134294629096985, + "learning_rate": 0.0005837447552447552, + "loss": 3.9533, + "step": 4750 + }, + { + "epoch": 1.3978100064068961, + "grad_norm": 0.3539522588253021, + "learning_rate": 0.0005835699300699301, + "loss": 3.9555, + "step": 4800 + }, + { + "epoch": 
1.4123711340206184, + "grad_norm": 0.34184563159942627, + "learning_rate": 0.0005833951048951048, + "loss": 3.9523, + "step": 4850 + }, + { + "epoch": 1.426932261634341, + "grad_norm": 0.3456088602542877, + "learning_rate": 0.0005832202797202797, + "loss": 3.9523, + "step": 4900 + }, + { + "epoch": 1.4414933892480635, + "grad_norm": 0.3169538080692291, + "learning_rate": 0.0005830454545454546, + "loss": 3.933, + "step": 4950 + }, + { + "epoch": 1.4560545168617858, + "grad_norm": 0.3226233720779419, + "learning_rate": 0.0005828706293706293, + "loss": 3.9142, + "step": 5000 + }, + { + "epoch": 1.4560545168617858, + "eval_accuracy": 0.331768614216658, + "eval_loss": 3.915773391723633, + "eval_runtime": 180.4519, + "eval_samples_per_second": 92.246, + "eval_steps_per_second": 5.769, + "step": 5000 + }, + { + "epoch": 1.470615644475508, + "grad_norm": 0.3312673568725586, + "learning_rate": 0.0005826958041958042, + "loss": 3.9325, + "step": 5050 + }, + { + "epoch": 1.4851767720892306, + "grad_norm": 0.38458746671676636, + "learning_rate": 0.000582520979020979, + "loss": 3.9231, + "step": 5100 + }, + { + "epoch": 1.4997378997029531, + "grad_norm": 0.3237328827381134, + "learning_rate": 0.0005823461538461538, + "loss": 3.9247, + "step": 5150 + }, + { + "epoch": 1.5142990273166754, + "grad_norm": 0.31413206458091736, + "learning_rate": 0.0005821713286713286, + "loss": 3.9265, + "step": 5200 + }, + { + "epoch": 1.5288601549303977, + "grad_norm": 0.35820019245147705, + "learning_rate": 0.0005819965034965035, + "loss": 3.9107, + "step": 5250 + }, + { + "epoch": 1.5434212825441203, + "grad_norm": 0.3079889416694641, + "learning_rate": 0.0005818216783216783, + "loss": 3.9191, + "step": 5300 + }, + { + "epoch": 1.5579824101578428, + "grad_norm": 0.3229127526283264, + "learning_rate": 0.0005816468531468531, + "loss": 3.9095, + "step": 5350 + }, + { + "epoch": 1.572543537771565, + "grad_norm": 0.3325076997280121, + "learning_rate": 0.0005814720279720279, + "loss": 3.9086, + "step": 
5400 + }, + { + "epoch": 1.5871046653852874, + "grad_norm": 0.3287741541862488, + "learning_rate": 0.0005812972027972028, + "loss": 3.908, + "step": 5450 + }, + { + "epoch": 1.6016657929990097, + "grad_norm": 0.31764456629753113, + "learning_rate": 0.0005811223776223776, + "loss": 3.9184, + "step": 5500 + }, + { + "epoch": 1.6162269206127322, + "grad_norm": 0.3241208791732788, + "learning_rate": 0.0005809475524475524, + "loss": 3.8827, + "step": 5550 + }, + { + "epoch": 1.6307880482264547, + "grad_norm": 0.3249201774597168, + "learning_rate": 0.0005807727272727272, + "loss": 3.9047, + "step": 5600 + }, + { + "epoch": 1.645349175840177, + "grad_norm": 0.3344324827194214, + "learning_rate": 0.0005805979020979021, + "loss": 3.8918, + "step": 5650 + }, + { + "epoch": 1.6599103034538993, + "grad_norm": 0.3216317296028137, + "learning_rate": 0.0005804230769230769, + "loss": 3.8934, + "step": 5700 + }, + { + "epoch": 1.6744714310676219, + "grad_norm": 0.3269466161727905, + "learning_rate": 0.0005802482517482517, + "loss": 3.8934, + "step": 5750 + }, + { + "epoch": 1.6890325586813444, + "grad_norm": 0.32984817028045654, + "learning_rate": 0.0005800734265734265, + "loss": 3.8732, + "step": 5800 + }, + { + "epoch": 1.7035936862950667, + "grad_norm": 0.3205852508544922, + "learning_rate": 0.0005798986013986013, + "loss": 3.8806, + "step": 5850 + }, + { + "epoch": 1.718154813908789, + "grad_norm": 0.3284587860107422, + "learning_rate": 0.0005797237762237762, + "loss": 3.8772, + "step": 5900 + }, + { + "epoch": 1.7327159415225115, + "grad_norm": 0.3408912420272827, + "learning_rate": 0.000579548951048951, + "loss": 3.8816, + "step": 5950 + }, + { + "epoch": 1.747277069136234, + "grad_norm": 0.32032084465026855, + "learning_rate": 0.0005793741258741258, + "loss": 3.8871, + "step": 6000 + }, + { + "epoch": 1.747277069136234, + "eval_accuracy": 0.33673681000448386, + "eval_loss": 3.857321262359619, + "eval_runtime": 180.7738, + "eval_samples_per_second": 92.082, + 
"eval_steps_per_second": 5.759, + "step": 6000 + }, + { + "epoch": 1.7618381967499563, + "grad_norm": 0.3361666798591614, + "learning_rate": 0.0005791993006993006, + "loss": 3.8866, + "step": 6050 + }, + { + "epoch": 1.7763993243636786, + "grad_norm": 0.3247644305229187, + "learning_rate": 0.0005790244755244755, + "loss": 3.8842, + "step": 6100 + }, + { + "epoch": 1.7909604519774012, + "grad_norm": 0.31813907623291016, + "learning_rate": 0.0005788496503496503, + "loss": 3.8721, + "step": 6150 + }, + { + "epoch": 1.8055215795911237, + "grad_norm": 0.3214803636074066, + "learning_rate": 0.0005786748251748251, + "loss": 3.8562, + "step": 6200 + }, + { + "epoch": 1.820082707204846, + "grad_norm": 0.3097255229949951, + "learning_rate": 0.0005784999999999999, + "loss": 3.8573, + "step": 6250 + }, + { + "epoch": 1.8346438348185683, + "grad_norm": 0.32802045345306396, + "learning_rate": 0.0005783251748251748, + "loss": 3.8657, + "step": 6300 + }, + { + "epoch": 1.8492049624322906, + "grad_norm": 0.3414191007614136, + "learning_rate": 0.0005781503496503496, + "loss": 3.8632, + "step": 6350 + }, + { + "epoch": 1.8637660900460131, + "grad_norm": 0.3283698856830597, + "learning_rate": 0.0005779755244755244, + "loss": 3.8643, + "step": 6400 + }, + { + "epoch": 1.8783272176597356, + "grad_norm": 0.31000998616218567, + "learning_rate": 0.0005778006993006993, + "loss": 3.861, + "step": 6450 + }, + { + "epoch": 1.892888345273458, + "grad_norm": 0.319161593914032, + "learning_rate": 0.000577625874125874, + "loss": 3.8588, + "step": 6500 + }, + { + "epoch": 1.9074494728871803, + "grad_norm": 0.3341123163700104, + "learning_rate": 0.0005774510489510489, + "loss": 3.8575, + "step": 6550 + }, + { + "epoch": 1.9220106005009028, + "grad_norm": 0.3260324001312256, + "learning_rate": 0.0005772762237762237, + "loss": 3.8503, + "step": 6600 + }, + { + "epoch": 1.9365717281146253, + "grad_norm": 0.32724907994270325, + "learning_rate": 0.0005771013986013985, + "loss": 3.8533, + "step": 6650 + 
}, + { + "epoch": 1.9511328557283476, + "grad_norm": 0.3281218111515045, + "learning_rate": 0.0005769265734265733, + "loss": 3.8423, + "step": 6700 + }, + { + "epoch": 1.96569398334207, + "grad_norm": 0.3258511424064636, + "learning_rate": 0.0005767517482517482, + "loss": 3.8595, + "step": 6750 + }, + { + "epoch": 1.9802551109557924, + "grad_norm": 0.3264133930206299, + "learning_rate": 0.000576576923076923, + "loss": 3.8518, + "step": 6800 + }, + { + "epoch": 1.994816238569515, + "grad_norm": 0.3205277621746063, + "learning_rate": 0.0005764020979020978, + "loss": 3.8442, + "step": 6850 + }, + { + "epoch": 2.009319121672782, + "grad_norm": 0.3255445659160614, + "learning_rate": 0.0005762272727272726, + "loss": 3.7817, + "step": 6900 + }, + { + "epoch": 2.023880249286505, + "grad_norm": 0.3084678649902344, + "learning_rate": 0.0005760524475524475, + "loss": 3.7401, + "step": 6950 + }, + { + "epoch": 2.038441376900227, + "grad_norm": 0.3164021968841553, + "learning_rate": 0.0005758776223776223, + "loss": 3.7577, + "step": 7000 + }, + { + "epoch": 2.038441376900227, + "eval_accuracy": 0.34121018477785253, + "eval_loss": 3.815640687942505, + "eval_runtime": 180.7874, + "eval_samples_per_second": 92.075, + "eval_steps_per_second": 5.758, + "step": 7000 + }, + { + "epoch": 2.0530025045139495, + "grad_norm": 0.32217466831207275, + "learning_rate": 0.0005757027972027971, + "loss": 3.7505, + "step": 7050 + }, + { + "epoch": 2.067563632127672, + "grad_norm": 0.3208473324775696, + "learning_rate": 0.000575527972027972, + "loss": 3.7286, + "step": 7100 + }, + { + "epoch": 2.0821247597413945, + "grad_norm": 0.3347488045692444, + "learning_rate": 0.0005753531468531468, + "loss": 3.7528, + "step": 7150 + }, + { + "epoch": 2.096685887355117, + "grad_norm": 0.31259018182754517, + "learning_rate": 0.0005751783216783216, + "loss": 3.7381, + "step": 7200 + }, + { + "epoch": 2.111247014968839, + "grad_norm": 0.31018945574760437, + "learning_rate": 0.0005750034965034964, + "loss": 
3.754, + "step": 7250 + }, + { + "epoch": 2.1258081425825615, + "grad_norm": 0.3304862380027771, + "learning_rate": 0.0005748286713286712, + "loss": 3.7545, + "step": 7300 + }, + { + "epoch": 2.140369270196284, + "grad_norm": 0.30643802881240845, + "learning_rate": 0.000574653846153846, + "loss": 3.7614, + "step": 7350 + }, + { + "epoch": 2.1549303978100065, + "grad_norm": 0.31393590569496155, + "learning_rate": 0.000574479020979021, + "loss": 3.7581, + "step": 7400 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.3010294735431671, + "learning_rate": 0.0005743041958041958, + "loss": 3.7627, + "step": 7450 + }, + { + "epoch": 2.184052653037451, + "grad_norm": 0.31778109073638916, + "learning_rate": 0.0005741293706293706, + "loss": 3.763, + "step": 7500 + }, + { + "epoch": 2.198613780651174, + "grad_norm": 0.314079612493515, + "learning_rate": 0.0005739545454545454, + "loss": 3.7594, + "step": 7550 + }, + { + "epoch": 2.213174908264896, + "grad_norm": 0.3207477629184723, + "learning_rate": 0.0005737797202797203, + "loss": 3.7588, + "step": 7600 + }, + { + "epoch": 2.2277360358786185, + "grad_norm": 0.323979914188385, + "learning_rate": 0.0005736048951048951, + "loss": 3.7551, + "step": 7650 + }, + { + "epoch": 2.2422971634923408, + "grad_norm": 0.32413870096206665, + "learning_rate": 0.0005734300699300699, + "loss": 3.7636, + "step": 7700 + }, + { + "epoch": 2.256858291106063, + "grad_norm": 0.3244387209415436, + "learning_rate": 0.0005732552447552448, + "loss": 3.7487, + "step": 7750 + }, + { + "epoch": 2.271419418719786, + "grad_norm": 0.3057880699634552, + "learning_rate": 0.0005730804195804196, + "loss": 3.754, + "step": 7800 + }, + { + "epoch": 2.285980546333508, + "grad_norm": 0.30727651715278625, + "learning_rate": 0.0005729055944055944, + "loss": 3.7595, + "step": 7850 + }, + { + "epoch": 2.3005416739472304, + "grad_norm": 0.3170273005962372, + "learning_rate": 0.0005727307692307692, + "loss": 3.7625, + "step": 7900 + }, + { + "epoch": 
2.3151028015609527, + "grad_norm": 0.32343780994415283, + "learning_rate": 0.0005725559440559441, + "loss": 3.7416, + "step": 7950 + }, + { + "epoch": 2.3296639291746755, + "grad_norm": 0.31494140625, + "learning_rate": 0.0005723811188811188, + "loss": 3.754, + "step": 8000 + }, + { + "epoch": 2.3296639291746755, + "eval_accuracy": 0.3443036096658095, + "eval_loss": 3.784417152404785, + "eval_runtime": 181.0164, + "eval_samples_per_second": 91.959, + "eval_steps_per_second": 5.751, + "step": 8000 + }, + { + "epoch": 2.3442250567883978, + "grad_norm": 0.3380199372768402, + "learning_rate": 0.0005722062937062937, + "loss": 3.7634, + "step": 8050 + }, + { + "epoch": 2.35878618440212, + "grad_norm": 0.31097733974456787, + "learning_rate": 0.0005720314685314685, + "loss": 3.7577, + "step": 8100 + }, + { + "epoch": 2.3733473120158424, + "grad_norm": 0.3224102556705475, + "learning_rate": 0.0005718566433566433, + "loss": 3.7658, + "step": 8150 + }, + { + "epoch": 2.387908439629565, + "grad_norm": 0.30005016922950745, + "learning_rate": 0.0005716818181818181, + "loss": 3.7533, + "step": 8200 + }, + { + "epoch": 2.4024695672432874, + "grad_norm": 0.32545462250709534, + "learning_rate": 0.000571506993006993, + "loss": 3.7441, + "step": 8250 + }, + { + "epoch": 2.4170306948570097, + "grad_norm": 0.3352498710155487, + "learning_rate": 0.0005713321678321678, + "loss": 3.7494, + "step": 8300 + }, + { + "epoch": 2.431591822470732, + "grad_norm": 0.326423317193985, + "learning_rate": 0.0005711573426573426, + "loss": 3.7618, + "step": 8350 + }, + { + "epoch": 2.4461529500844543, + "grad_norm": 0.3356836438179016, + "learning_rate": 0.0005709825174825175, + "loss": 3.7467, + "step": 8400 + }, + { + "epoch": 2.460714077698177, + "grad_norm": 0.3288600742816925, + "learning_rate": 0.0005708076923076923, + "loss": 3.7519, + "step": 8450 + }, + { + "epoch": 2.4752752053118994, + "grad_norm": 0.307220458984375, + "learning_rate": 0.0005706328671328671, + "loss": 3.7423, + "step": 8500 + 
}, + { + "epoch": 2.4898363329256217, + "grad_norm": 0.33141323924064636, + "learning_rate": 0.0005704580419580419, + "loss": 3.7437, + "step": 8550 + }, + { + "epoch": 2.5043974605393444, + "grad_norm": 0.34977665543556213, + "learning_rate": 0.0005702832167832168, + "loss": 3.7535, + "step": 8600 + }, + { + "epoch": 2.5189585881530667, + "grad_norm": 0.32064738869667053, + "learning_rate": 0.0005701083916083916, + "loss": 3.7556, + "step": 8650 + }, + { + "epoch": 2.533519715766789, + "grad_norm": 0.30487334728240967, + "learning_rate": 0.0005699335664335664, + "loss": 3.7369, + "step": 8700 + }, + { + "epoch": 2.5480808433805113, + "grad_norm": 0.3201334476470947, + "learning_rate": 0.0005697587412587412, + "loss": 3.7461, + "step": 8750 + }, + { + "epoch": 2.5626419709942336, + "grad_norm": 0.2892219126224518, + "learning_rate": 0.000569583916083916, + "loss": 3.7463, + "step": 8800 + }, + { + "epoch": 2.5772030986079564, + "grad_norm": 0.3132726848125458, + "learning_rate": 0.0005694090909090908, + "loss": 3.7544, + "step": 8850 + }, + { + "epoch": 2.5917642262216787, + "grad_norm": 0.30421772599220276, + "learning_rate": 0.0005692342657342657, + "loss": 3.7395, + "step": 8900 + }, + { + "epoch": 2.606325353835401, + "grad_norm": 0.3089979290962219, + "learning_rate": 0.0005690594405594405, + "loss": 3.7349, + "step": 8950 + }, + { + "epoch": 2.6208864814491233, + "grad_norm": 0.3203098177909851, + "learning_rate": 0.0005688846153846153, + "loss": 3.7424, + "step": 9000 + }, + { + "epoch": 2.6208864814491233, + "eval_accuracy": 0.3470149560797855, + "eval_loss": 3.753990650177002, + "eval_runtime": 180.7846, + "eval_samples_per_second": 92.076, + "eval_steps_per_second": 5.758, + "step": 9000 + }, + { + "epoch": 2.6354476090628456, + "grad_norm": 0.31155484914779663, + "learning_rate": 0.0005687097902097901, + "loss": 3.7402, + "step": 9050 + }, + { + "epoch": 2.6500087366765683, + "grad_norm": 0.3341865539550781, + "learning_rate": 0.000568534965034965, + 
"loss": 3.7273, + "step": 9100 + }, + { + "epoch": 2.6645698642902906, + "grad_norm": 0.31667912006378174, + "learning_rate": 0.0005683601398601398, + "loss": 3.7479, + "step": 9150 + }, + { + "epoch": 2.679130991904013, + "grad_norm": 0.3061210513114929, + "learning_rate": 0.0005681853146853146, + "loss": 3.7272, + "step": 9200 + }, + { + "epoch": 2.6936921195177357, + "grad_norm": 0.317827969789505, + "learning_rate": 0.0005680104895104895, + "loss": 3.7336, + "step": 9250 + }, + { + "epoch": 2.708253247131458, + "grad_norm": 0.3071555197238922, + "learning_rate": 0.0005678356643356643, + "loss": 3.7312, + "step": 9300 + }, + { + "epoch": 2.7228143747451803, + "grad_norm": 0.31722643971443176, + "learning_rate": 0.0005676608391608391, + "loss": 3.7306, + "step": 9350 + }, + { + "epoch": 2.7373755023589026, + "grad_norm": 0.321736216545105, + "learning_rate": 0.0005674860139860139, + "loss": 3.7327, + "step": 9400 + }, + { + "epoch": 2.751936629972625, + "grad_norm": 0.3194282352924347, + "learning_rate": 0.0005673111888111888, + "loss": 3.7152, + "step": 9450 + }, + { + "epoch": 2.7664977575863476, + "grad_norm": 0.3411991000175476, + "learning_rate": 0.0005671363636363635, + "loss": 3.7404, + "step": 9500 + }, + { + "epoch": 2.78105888520007, + "grad_norm": 0.3048510253429413, + "learning_rate": 0.0005669615384615384, + "loss": 3.7335, + "step": 9550 + }, + { + "epoch": 2.7956200128137922, + "grad_norm": 0.30494141578674316, + "learning_rate": 0.0005667867132867132, + "loss": 3.7217, + "step": 9600 + }, + { + "epoch": 2.8101811404275145, + "grad_norm": 0.324116051197052, + "learning_rate": 0.000566611888111888, + "loss": 3.7275, + "step": 9650 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.3104328215122223, + "learning_rate": 0.0005664370629370628, + "loss": 3.7239, + "step": 9700 + }, + { + "epoch": 2.8393033956549596, + "grad_norm": 0.3002651035785675, + "learning_rate": 0.0005662622377622377, + "loss": 3.7217, + "step": 9750 + }, + { + "epoch": 
2.853864523268682, + "grad_norm": 0.2895483374595642, + "learning_rate": 0.0005660874125874125, + "loss": 3.7101, + "step": 9800 + }, + { + "epoch": 2.868425650882404, + "grad_norm": 0.3242766261100769, + "learning_rate": 0.0005659125874125873, + "loss": 3.7279, + "step": 9850 + }, + { + "epoch": 2.882986778496127, + "grad_norm": 0.32792049646377563, + "learning_rate": 0.0005657377622377622, + "loss": 3.7205, + "step": 9900 + }, + { + "epoch": 2.8975479061098492, + "grad_norm": 0.3201243281364441, + "learning_rate": 0.000565562937062937, + "loss": 3.7099, + "step": 9950 + }, + { + "epoch": 2.9121090337235715, + "grad_norm": 0.3127686083316803, + "learning_rate": 0.0005653881118881118, + "loss": 3.7116, + "step": 10000 + }, + { + "epoch": 2.9121090337235715, + "eval_accuracy": 0.3493557451553037, + "eval_loss": 3.727367639541626, + "eval_runtime": 181.2018, + "eval_samples_per_second": 91.864, + "eval_steps_per_second": 5.745, + "step": 10000 + }, + { + "epoch": 2.926670161337294, + "grad_norm": 0.29951873421669006, + "learning_rate": 0.0005652132867132866, + "loss": 3.7143, + "step": 10050 + }, + { + "epoch": 2.941231288951016, + "grad_norm": 0.3104478418827057, + "learning_rate": 0.0005650384615384615, + "loss": 3.722, + "step": 10100 + }, + { + "epoch": 2.955792416564739, + "grad_norm": 0.3141760230064392, + "learning_rate": 0.0005648636363636363, + "loss": 3.7163, + "step": 10150 + }, + { + "epoch": 2.970353544178461, + "grad_norm": 0.3073762357234955, + "learning_rate": 0.0005646888111888111, + "loss": 3.7277, + "step": 10200 + }, + { + "epoch": 2.9849146717921835, + "grad_norm": 0.3173540234565735, + "learning_rate": 0.000564513986013986, + "loss": 3.7239, + "step": 10250 + }, + { + "epoch": 2.9994757994059063, + "grad_norm": 0.3042048513889313, + "learning_rate": 0.0005643391608391607, + "loss": 3.7166, + "step": 10300 + }, + { + "epoch": 3.0139786825091734, + "grad_norm": 0.32184624671936035, + "learning_rate": 0.0005641643356643355, + "loss": 3.6176, + 
"step": 10350 + }, + { + "epoch": 3.0285398101228957, + "grad_norm": 0.30402591824531555, + "learning_rate": 0.0005639895104895105, + "loss": 3.6089, + "step": 10400 + }, + { + "epoch": 3.0431009377366185, + "grad_norm": 0.3132675886154175, + "learning_rate": 0.0005638146853146853, + "loss": 3.6175, + "step": 10450 + }, + { + "epoch": 3.057662065350341, + "grad_norm": 0.308847039937973, + "learning_rate": 0.0005636398601398601, + "loss": 3.6257, + "step": 10500 + }, + { + "epoch": 3.072223192964063, + "grad_norm": 0.3209833800792694, + "learning_rate": 0.000563465034965035, + "loss": 3.6303, + "step": 10550 + }, + { + "epoch": 3.0867843205777854, + "grad_norm": 0.3201095759868622, + "learning_rate": 0.0005632902097902098, + "loss": 3.6251, + "step": 10600 + }, + { + "epoch": 3.101345448191508, + "grad_norm": 0.3169490694999695, + "learning_rate": 0.0005631153846153846, + "loss": 3.6231, + "step": 10650 + }, + { + "epoch": 3.1159065758052304, + "grad_norm": 0.31459590792655945, + "learning_rate": 0.0005629405594405594, + "loss": 3.6346, + "step": 10700 + }, + { + "epoch": 3.1304677034189528, + "grad_norm": 0.3260461390018463, + "learning_rate": 0.0005627657342657343, + "loss": 3.6224, + "step": 10750 + }, + { + "epoch": 3.145028831032675, + "grad_norm": 0.3369269073009491, + "learning_rate": 0.0005625909090909091, + "loss": 3.6224, + "step": 10800 + }, + { + "epoch": 3.1595899586463974, + "grad_norm": 0.3142642676830292, + "learning_rate": 0.0005624160839160839, + "loss": 3.6332, + "step": 10850 + }, + { + "epoch": 3.17415108626012, + "grad_norm": 0.33844444155693054, + "learning_rate": 0.0005622412587412587, + "loss": 3.6388, + "step": 10900 + }, + { + "epoch": 3.1887122138738424, + "grad_norm": 0.3050360083580017, + "learning_rate": 0.0005620664335664336, + "loss": 3.6383, + "step": 10950 + }, + { + "epoch": 3.2032733414875647, + "grad_norm": 0.32840847969055176, + "learning_rate": 0.0005618916083916083, + "loss": 3.6438, + "step": 11000 + }, + { + "epoch": 
3.2032733414875647, + "eval_accuracy": 0.35157309349307425, + "eval_loss": 3.71468186378479, + "eval_runtime": 181.9058, + "eval_samples_per_second": 91.509, + "eval_steps_per_second": 5.723, + "step": 11000 + }, + { + "epoch": 3.217834469101287, + "grad_norm": 0.31019338965415955, + "learning_rate": 0.0005617167832167832, + "loss": 3.6341, + "step": 11050 + }, + { + "epoch": 3.2323955967150098, + "grad_norm": 0.3126959204673767, + "learning_rate": 0.000561541958041958, + "loss": 3.6271, + "step": 11100 + }, + { + "epoch": 3.246956724328732, + "grad_norm": 0.311865895986557, + "learning_rate": 0.0005613671328671328, + "loss": 3.6429, + "step": 11150 + }, + { + "epoch": 3.2615178519424544, + "grad_norm": 0.35393866896629333, + "learning_rate": 0.0005611923076923077, + "loss": 3.6427, + "step": 11200 + }, + { + "epoch": 3.2760789795561767, + "grad_norm": 0.314378559589386, + "learning_rate": 0.0005610174825174825, + "loss": 3.6465, + "step": 11250 + }, + { + "epoch": 3.2906401071698994, + "grad_norm": 0.3106997013092041, + "learning_rate": 0.0005608426573426573, + "loss": 3.6439, + "step": 11300 + }, + { + "epoch": 3.3052012347836217, + "grad_norm": 0.31017565727233887, + "learning_rate": 0.0005606678321678321, + "loss": 3.6422, + "step": 11350 + }, + { + "epoch": 3.319762362397344, + "grad_norm": 0.297059565782547, + "learning_rate": 0.000560493006993007, + "loss": 3.6435, + "step": 11400 + }, + { + "epoch": 3.3343234900110663, + "grad_norm": 0.3107824921607971, + "learning_rate": 0.0005603181818181818, + "loss": 3.6471, + "step": 11450 + }, + { + "epoch": 3.3488846176247886, + "grad_norm": 0.3141351342201233, + "learning_rate": 0.0005601433566433566, + "loss": 3.6388, + "step": 11500 + }, + { + "epoch": 3.3634457452385114, + "grad_norm": 0.32414788007736206, + "learning_rate": 0.0005599685314685314, + "loss": 3.6296, + "step": 11550 + }, + { + "epoch": 3.3780068728522337, + "grad_norm": 0.32148855924606323, + "learning_rate": 0.0005597937062937063, + "loss": 
3.6379, + "step": 11600 + }, + { + "epoch": 3.392568000465956, + "grad_norm": 0.3060270845890045, + "learning_rate": 0.0005596188811188811, + "loss": 3.653, + "step": 11650 + }, + { + "epoch": 3.4071291280796787, + "grad_norm": 0.3094375431537628, + "learning_rate": 0.0005594440559440559, + "loss": 3.6363, + "step": 11700 + }, + { + "epoch": 3.421690255693401, + "grad_norm": 0.3291188180446625, + "learning_rate": 0.0005592692307692307, + "loss": 3.6504, + "step": 11750 + }, + { + "epoch": 3.4362513833071233, + "grad_norm": 0.31658145785331726, + "learning_rate": 0.0005590944055944055, + "loss": 3.6477, + "step": 11800 + }, + { + "epoch": 3.4508125109208456, + "grad_norm": 0.31207045912742615, + "learning_rate": 0.0005589195804195803, + "loss": 3.6462, + "step": 11850 + }, + { + "epoch": 3.465373638534568, + "grad_norm": 0.3267686367034912, + "learning_rate": 0.0005587447552447552, + "loss": 3.6405, + "step": 11900 + }, + { + "epoch": 3.4799347661482907, + "grad_norm": 0.3283142149448395, + "learning_rate": 0.00055856993006993, + "loss": 3.6475, + "step": 11950 + }, + { + "epoch": 3.494495893762013, + "grad_norm": 0.32450154423713684, + "learning_rate": 0.0005583951048951048, + "loss": 3.6323, + "step": 12000 + }, + { + "epoch": 3.494495893762013, + "eval_accuracy": 0.35301782037515167, + "eval_loss": 3.6957285404205322, + "eval_runtime": 180.5967, + "eval_samples_per_second": 92.172, + "eval_steps_per_second": 5.764, + "step": 12000 + }, + { + "epoch": 3.5090570213757353, + "grad_norm": 0.3060498833656311, + "learning_rate": 0.0005582202797202797, + "loss": 3.6332, + "step": 12050 + }, + { + "epoch": 3.523618148989458, + "grad_norm": 0.3046874403953552, + "learning_rate": 0.0005580454545454545, + "loss": 3.6427, + "step": 12100 + }, + { + "epoch": 3.53817927660318, + "grad_norm": 0.3221018314361572, + "learning_rate": 0.0005578706293706293, + "loss": 3.6475, + "step": 12150 + }, + { + "epoch": 3.5527404042169026, + "grad_norm": 0.33598005771636963, + 
"learning_rate": 0.0005576958041958041, + "loss": 3.6484, + "step": 12200 + }, + { + "epoch": 3.567301531830625, + "grad_norm": 0.3117700219154358, + "learning_rate": 0.000557520979020979, + "loss": 3.6374, + "step": 12250 + }, + { + "epoch": 3.5818626594443472, + "grad_norm": 0.3346507251262665, + "learning_rate": 0.0005573461538461538, + "loss": 3.6269, + "step": 12300 + }, + { + "epoch": 3.59642378705807, + "grad_norm": 0.3148074448108673, + "learning_rate": 0.0005571713286713286, + "loss": 3.6348, + "step": 12350 + }, + { + "epoch": 3.6109849146717923, + "grad_norm": 0.30282357335090637, + "learning_rate": 0.0005569965034965034, + "loss": 3.6468, + "step": 12400 + }, + { + "epoch": 3.6255460422855146, + "grad_norm": 0.306007981300354, + "learning_rate": 0.0005568216783216783, + "loss": 3.6379, + "step": 12450 + }, + { + "epoch": 3.640107169899237, + "grad_norm": 0.3162916600704193, + "learning_rate": 0.000556646853146853, + "loss": 3.6354, + "step": 12500 + }, + { + "epoch": 3.654668297512959, + "grad_norm": 0.31878116726875305, + "learning_rate": 0.0005564720279720279, + "loss": 3.6363, + "step": 12550 + }, + { + "epoch": 3.669229425126682, + "grad_norm": 0.3057233691215515, + "learning_rate": 0.0005562972027972027, + "loss": 3.6419, + "step": 12600 + }, + { + "epoch": 3.6837905527404042, + "grad_norm": 0.33499786257743835, + "learning_rate": 0.0005561223776223775, + "loss": 3.6456, + "step": 12650 + }, + { + "epoch": 3.6983516803541265, + "grad_norm": 0.3105752170085907, + "learning_rate": 0.0005559475524475524, + "loss": 3.6356, + "step": 12700 + }, + { + "epoch": 3.7129128079678493, + "grad_norm": 0.3092792332172394, + "learning_rate": 0.0005557727272727272, + "loss": 3.6494, + "step": 12750 + }, + { + "epoch": 3.7274739355815716, + "grad_norm": 0.3000400960445404, + "learning_rate": 0.000555597902097902, + "loss": 3.6326, + "step": 12800 + }, + { + "epoch": 3.742035063195294, + "grad_norm": 0.3095425069332123, + "learning_rate": 0.0005554230769230768, + 
"loss": 3.6408, + "step": 12850 + }, + { + "epoch": 3.756596190809016, + "grad_norm": 0.3014494478702545, + "learning_rate": 0.0005552482517482517, + "loss": 3.6444, + "step": 12900 + }, + { + "epoch": 3.7711573184227385, + "grad_norm": 0.3192651569843292, + "learning_rate": 0.0005550734265734265, + "loss": 3.6436, + "step": 12950 + }, + { + "epoch": 3.7857184460364612, + "grad_norm": 0.32174089550971985, + "learning_rate": 0.0005548986013986013, + "loss": 3.6429, + "step": 13000 + }, + { + "epoch": 3.7857184460364612, + "eval_accuracy": 0.35489482496456076, + "eval_loss": 3.6800663471221924, + "eval_runtime": 180.4037, + "eval_samples_per_second": 92.271, + "eval_steps_per_second": 5.77, + "step": 13000 + }, + { + "epoch": 3.8002795736501835, + "grad_norm": 0.31617510318756104, + "learning_rate": 0.0005547237762237761, + "loss": 3.6465, + "step": 13050 + }, + { + "epoch": 3.814840701263906, + "grad_norm": 0.34271636605262756, + "learning_rate": 0.000554548951048951, + "loss": 3.6347, + "step": 13100 + }, + { + "epoch": 3.829401828877628, + "grad_norm": 0.3209848999977112, + "learning_rate": 0.0005543741258741258, + "loss": 3.6414, + "step": 13150 + }, + { + "epoch": 3.8439629564913504, + "grad_norm": 0.33069539070129395, + "learning_rate": 0.0005541993006993006, + "loss": 3.6252, + "step": 13200 + }, + { + "epoch": 3.858524084105073, + "grad_norm": 0.3266202211380005, + "learning_rate": 0.0005540244755244756, + "loss": 3.6354, + "step": 13250 + }, + { + "epoch": 3.8730852117187955, + "grad_norm": 0.3188350796699524, + "learning_rate": 0.0005538496503496502, + "loss": 3.6536, + "step": 13300 + }, + { + "epoch": 3.887646339332518, + "grad_norm": 0.3059855103492737, + "learning_rate": 0.0005536748251748252, + "loss": 3.6327, + "step": 13350 + }, + { + "epoch": 3.9022074669462405, + "grad_norm": 0.31143465638160706, + "learning_rate": 0.0005535, + "loss": 3.6204, + "step": 13400 + }, + { + "epoch": 3.916768594559963, + "grad_norm": 0.3238065838813782, + 
"learning_rate": 0.0005533251748251748, + "loss": 3.6292, + "step": 13450 + }, + { + "epoch": 3.931329722173685, + "grad_norm": 0.2959144115447998, + "learning_rate": 0.0005531503496503496, + "loss": 3.6453, + "step": 13500 + }, + { + "epoch": 3.9458908497874075, + "grad_norm": 0.32706987857818604, + "learning_rate": 0.0005529755244755245, + "loss": 3.6269, + "step": 13550 + }, + { + "epoch": 3.9604519774011298, + "grad_norm": 0.3075656294822693, + "learning_rate": 0.0005528006993006993, + "loss": 3.6376, + "step": 13600 + }, + { + "epoch": 3.9750131050148525, + "grad_norm": 0.3198637366294861, + "learning_rate": 0.0005526258741258741, + "loss": 3.6372, + "step": 13650 + }, + { + "epoch": 3.989574232628575, + "grad_norm": 0.3145425319671631, + "learning_rate": 0.0005524510489510489, + "loss": 3.6378, + "step": 13700 + }, + { + "epoch": 4.004077115731842, + "grad_norm": 0.31634321808815, + "learning_rate": 0.0005522762237762238, + "loss": 3.6062, + "step": 13750 + }, + { + "epoch": 4.018638243345564, + "grad_norm": 0.3056029677391052, + "learning_rate": 0.0005521013986013986, + "loss": 3.5304, + "step": 13800 + }, + { + "epoch": 4.033199370959287, + "grad_norm": 0.29673531651496887, + "learning_rate": 0.0005519265734265734, + "loss": 3.5444, + "step": 13850 + }, + { + "epoch": 4.04776049857301, + "grad_norm": 0.32092323899269104, + "learning_rate": 0.0005517517482517482, + "loss": 3.5251, + "step": 13900 + }, + { + "epoch": 4.062321626186732, + "grad_norm": 0.32128646969795227, + "learning_rate": 0.0005515769230769231, + "loss": 3.5352, + "step": 13950 + }, + { + "epoch": 4.076882753800454, + "grad_norm": 0.3217810094356537, + "learning_rate": 0.0005514020979020979, + "loss": 3.5448, + "step": 14000 + }, + { + "epoch": 4.076882753800454, + "eval_accuracy": 0.3563979804625054, + "eval_loss": 3.6696760654449463, + "eval_runtime": 180.6933, + "eval_samples_per_second": 92.123, + "eval_steps_per_second": 5.761, + "step": 14000 + }, + { + "epoch": 4.091443881414177, + 
"grad_norm": 0.320950984954834, + "learning_rate": 0.0005512272727272727, + "loss": 3.5395, + "step": 14050 + }, + { + "epoch": 4.106005009027899, + "grad_norm": 0.3130992650985718, + "learning_rate": 0.0005510524475524475, + "loss": 3.5323, + "step": 14100 + }, + { + "epoch": 4.120566136641622, + "grad_norm": 0.32641854882240295, + "learning_rate": 0.0005508776223776223, + "loss": 3.5339, + "step": 14150 + }, + { + "epoch": 4.135127264255344, + "grad_norm": 0.3011496961116791, + "learning_rate": 0.0005507027972027972, + "loss": 3.5519, + "step": 14200 + }, + { + "epoch": 4.149688391869066, + "grad_norm": 0.31833386421203613, + "learning_rate": 0.000550527972027972, + "loss": 3.5554, + "step": 14250 + }, + { + "epoch": 4.164249519482789, + "grad_norm": 0.3174136281013489, + "learning_rate": 0.0005503531468531468, + "loss": 3.5548, + "step": 14300 + }, + { + "epoch": 4.178810647096511, + "grad_norm": 0.30939123034477234, + "learning_rate": 0.0005501783216783216, + "loss": 3.5442, + "step": 14350 + }, + { + "epoch": 4.193371774710234, + "grad_norm": 0.31899625062942505, + "learning_rate": 0.0005500034965034965, + "loss": 3.5547, + "step": 14400 + }, + { + "epoch": 4.207932902323956, + "grad_norm": 0.3191762864589691, + "learning_rate": 0.0005498286713286713, + "loss": 3.5635, + "step": 14450 + }, + { + "epoch": 4.222494029937678, + "grad_norm": 0.32480305433273315, + "learning_rate": 0.0005496538461538461, + "loss": 3.5434, + "step": 14500 + }, + { + "epoch": 4.237055157551401, + "grad_norm": 0.32042625546455383, + "learning_rate": 0.0005494790209790209, + "loss": 3.5771, + "step": 14550 + }, + { + "epoch": 4.251616285165123, + "grad_norm": 0.34208381175994873, + "learning_rate": 0.0005493041958041958, + "loss": 3.5499, + "step": 14600 + }, + { + "epoch": 4.266177412778846, + "grad_norm": 0.29826879501342773, + "learning_rate": 0.0005491293706293706, + "loss": 3.554, + "step": 14650 + }, + { + "epoch": 4.280738540392568, + "grad_norm": 0.2985898554325104, + 
"learning_rate": 0.0005489545454545454, + "loss": 3.5696, + "step": 14700 + }, + { + "epoch": 4.29529966800629, + "grad_norm": 0.3151024878025055, + "learning_rate": 0.0005487797202797203, + "loss": 3.5591, + "step": 14750 + }, + { + "epoch": 4.309860795620013, + "grad_norm": 0.31103193759918213, + "learning_rate": 0.000548604895104895, + "loss": 3.5667, + "step": 14800 + }, + { + "epoch": 4.324421923233735, + "grad_norm": 0.30722710490226746, + "learning_rate": 0.0005484300699300699, + "loss": 3.5591, + "step": 14850 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 0.32191506028175354, + "learning_rate": 0.0005482552447552447, + "loss": 3.5589, + "step": 14900 + }, + { + "epoch": 4.35354417846118, + "grad_norm": 0.30484798550605774, + "learning_rate": 0.0005480804195804195, + "loss": 3.5703, + "step": 14950 + }, + { + "epoch": 4.368105306074902, + "grad_norm": 0.3188960552215576, + "learning_rate": 0.0005479055944055943, + "loss": 3.5583, + "step": 15000 + }, + { + "epoch": 4.368105306074902, + "eval_accuracy": 0.35716825066605096, + "eval_loss": 3.6608121395111084, + "eval_runtime": 180.9462, + "eval_samples_per_second": 91.994, + "eval_steps_per_second": 5.753, + "step": 15000 + }, + { + "epoch": 4.382666433688625, + "grad_norm": 0.30769604444503784, + "learning_rate": 0.0005477307692307692, + "loss": 3.5692, + "step": 15050 + }, + { + "epoch": 4.397227561302348, + "grad_norm": 0.3089370131492615, + "learning_rate": 0.000547555944055944, + "loss": 3.5697, + "step": 15100 + }, + { + "epoch": 4.41178868891607, + "grad_norm": 0.2947642207145691, + "learning_rate": 0.0005473811188811188, + "loss": 3.5753, + "step": 15150 + }, + { + "epoch": 4.426349816529792, + "grad_norm": 0.30472230911254883, + "learning_rate": 0.0005472062937062936, + "loss": 3.5844, + "step": 15200 + }, + { + "epoch": 4.440910944143514, + "grad_norm": 0.31481656432151794, + "learning_rate": 0.0005470314685314685, + "loss": 3.5755, + "step": 15250 + }, + { + "epoch": 4.455472071757237, + 
"grad_norm": 0.29861533641815186, + "learning_rate": 0.0005468566433566433, + "loss": 3.5762, + "step": 15300 + }, + { + "epoch": 4.47003319937096, + "grad_norm": 0.3105551600456238, + "learning_rate": 0.0005466818181818181, + "loss": 3.5571, + "step": 15350 + }, + { + "epoch": 4.4845943269846815, + "grad_norm": 0.318051278591156, + "learning_rate": 0.000546506993006993, + "loss": 3.579, + "step": 15400 + }, + { + "epoch": 4.499155454598404, + "grad_norm": 0.3215668797492981, + "learning_rate": 0.0005463321678321678, + "loss": 3.5776, + "step": 15450 + }, + { + "epoch": 4.513716582212126, + "grad_norm": 0.3257148861885071, + "learning_rate": 0.0005461573426573426, + "loss": 3.5733, + "step": 15500 + }, + { + "epoch": 4.528277709825849, + "grad_norm": 0.3296310603618622, + "learning_rate": 0.0005459825174825174, + "loss": 3.57, + "step": 15550 + }, + { + "epoch": 4.542838837439572, + "grad_norm": 0.349429726600647, + "learning_rate": 0.0005458076923076922, + "loss": 3.5644, + "step": 15600 + }, + { + "epoch": 4.5573999650532935, + "grad_norm": 0.3020808696746826, + "learning_rate": 0.000545632867132867, + "loss": 3.5642, + "step": 15650 + }, + { + "epoch": 4.571961092667016, + "grad_norm": 0.3027339577674866, + "learning_rate": 0.0005454580419580419, + "loss": 3.5719, + "step": 15700 + }, + { + "epoch": 4.586522220280738, + "grad_norm": 0.32000523805618286, + "learning_rate": 0.0005452832167832167, + "loss": 3.5646, + "step": 15750 + }, + { + "epoch": 4.601083347894461, + "grad_norm": 0.32438185811042786, + "learning_rate": 0.0005451083916083915, + "loss": 3.5665, + "step": 15800 + }, + { + "epoch": 4.615644475508184, + "grad_norm": 0.3186527192592621, + "learning_rate": 0.0005449335664335663, + "loss": 3.5831, + "step": 15850 + }, + { + "epoch": 4.630205603121905, + "grad_norm": 0.31081655621528625, + "learning_rate": 0.0005447587412587412, + "loss": 3.5725, + "step": 15900 + }, + { + "epoch": 4.644766730735628, + "grad_norm": 0.3225807547569275, + "learning_rate": 
0.000544583916083916, + "loss": 3.5615, + "step": 15950 + }, + { + "epoch": 4.659327858349351, + "grad_norm": 0.30369243025779724, + "learning_rate": 0.0005444090909090908, + "loss": 3.5833, + "step": 16000 + }, + { + "epoch": 4.659327858349351, + "eval_accuracy": 0.358389373468894, + "eval_loss": 3.6469626426696777, + "eval_runtime": 180.7892, + "eval_samples_per_second": 92.074, + "eval_steps_per_second": 5.758, + "step": 16000 + }, + { + "epoch": 4.673888985963073, + "grad_norm": 0.31460005044937134, + "learning_rate": 0.0005442342657342657, + "loss": 3.5811, + "step": 16050 + }, + { + "epoch": 4.6884501135767955, + "grad_norm": 0.3051797151565552, + "learning_rate": 0.0005440594405594405, + "loss": 3.5772, + "step": 16100 + }, + { + "epoch": 4.703011241190518, + "grad_norm": 0.3245795667171478, + "learning_rate": 0.0005438846153846153, + "loss": 3.5612, + "step": 16150 + }, + { + "epoch": 4.71757236880424, + "grad_norm": 0.30969762802124023, + "learning_rate": 0.0005437097902097901, + "loss": 3.5658, + "step": 16200 + }, + { + "epoch": 4.732133496417963, + "grad_norm": 0.31821149587631226, + "learning_rate": 0.0005435349650349651, + "loss": 3.5825, + "step": 16250 + }, + { + "epoch": 4.746694624031685, + "grad_norm": 0.3063112199306488, + "learning_rate": 0.0005433601398601397, + "loss": 3.5774, + "step": 16300 + }, + { + "epoch": 4.7612557516454075, + "grad_norm": 0.31970885396003723, + "learning_rate": 0.0005431853146853147, + "loss": 3.5815, + "step": 16350 + }, + { + "epoch": 4.77581687925913, + "grad_norm": 0.3243388235569, + "learning_rate": 0.0005430104895104895, + "loss": 3.588, + "step": 16400 + }, + { + "epoch": 4.790378006872852, + "grad_norm": 0.30511829257011414, + "learning_rate": 0.0005428356643356643, + "loss": 3.5775, + "step": 16450 + }, + { + "epoch": 4.804939134486575, + "grad_norm": 0.3213348984718323, + "learning_rate": 0.0005426608391608391, + "loss": 3.5811, + "step": 16500 + }, + { + "epoch": 4.819500262100297, + "grad_norm": 
0.3377111554145813, + "learning_rate": 0.000542486013986014, + "loss": 3.5669, + "step": 16550 + }, + { + "epoch": 4.834061389714019, + "grad_norm": 0.3088950216770172, + "learning_rate": 0.0005423111888111888, + "loss": 3.5772, + "step": 16600 + }, + { + "epoch": 4.848622517327742, + "grad_norm": 0.30557340383529663, + "learning_rate": 0.0005421363636363636, + "loss": 3.5807, + "step": 16650 + }, + { + "epoch": 4.863183644941464, + "grad_norm": 0.3154135048389435, + "learning_rate": 0.0005419615384615385, + "loss": 3.5719, + "step": 16700 + }, + { + "epoch": 4.877744772555187, + "grad_norm": 0.31455299258232117, + "learning_rate": 0.0005417867132867133, + "loss": 3.5717, + "step": 16750 + }, + { + "epoch": 4.892305900168909, + "grad_norm": 0.31194445490837097, + "learning_rate": 0.0005416118881118881, + "loss": 3.5834, + "step": 16800 + }, + { + "epoch": 4.906867027782631, + "grad_norm": 0.31651851534843445, + "learning_rate": 0.0005414370629370629, + "loss": 3.5798, + "step": 16850 + }, + { + "epoch": 4.921428155396354, + "grad_norm": 0.3114611506462097, + "learning_rate": 0.0005412622377622378, + "loss": 3.5846, + "step": 16900 + }, + { + "epoch": 4.935989283010076, + "grad_norm": 0.3295888602733612, + "learning_rate": 0.0005410874125874126, + "loss": 3.5719, + "step": 16950 + }, + { + "epoch": 4.950550410623799, + "grad_norm": 0.3076690137386322, + "learning_rate": 0.0005409125874125874, + "loss": 3.5773, + "step": 17000 + }, + { + "epoch": 4.950550410623799, + "eval_accuracy": 0.3596430611139809, + "eval_loss": 3.63307523727417, + "eval_runtime": 180.6842, + "eval_samples_per_second": 92.128, + "eval_steps_per_second": 5.761, + "step": 17000 + }, + { + "epoch": 4.9651115382375215, + "grad_norm": 0.3147895038127899, + "learning_rate": 0.0005407377622377622, + "loss": 3.5766, + "step": 17050 + }, + { + "epoch": 4.979672665851243, + "grad_norm": 0.3255283832550049, + "learning_rate": 0.000540562937062937, + "loss": 3.5795, + "step": 17100 + }, + { + "epoch": 
4.994233793464966, + "grad_norm": 0.31261691451072693, + "learning_rate": 0.0005403881118881118, + "loss": 3.5825, + "step": 17150 + }, + { + "epoch": 5.008736676568233, + "grad_norm": 0.31291091442108154, + "learning_rate": 0.0005402132867132867, + "loss": 3.4917, + "step": 17200 + }, + { + "epoch": 5.023297804181956, + "grad_norm": 0.32165074348449707, + "learning_rate": 0.0005400384615384615, + "loss": 3.4626, + "step": 17250 + }, + { + "epoch": 5.037858931795678, + "grad_norm": 0.33376818895339966, + "learning_rate": 0.0005398636363636363, + "loss": 3.4814, + "step": 17300 + }, + { + "epoch": 5.052420059409401, + "grad_norm": 0.3262355625629425, + "learning_rate": 0.0005396888111888111, + "loss": 3.4666, + "step": 17350 + }, + { + "epoch": 5.066981187023123, + "grad_norm": 0.33096253871917725, + "learning_rate": 0.000539513986013986, + "loss": 3.4784, + "step": 17400 + }, + { + "epoch": 5.081542314636845, + "grad_norm": 0.3339770436286926, + "learning_rate": 0.0005393391608391608, + "loss": 3.4858, + "step": 17450 + }, + { + "epoch": 5.096103442250568, + "grad_norm": 0.31180301308631897, + "learning_rate": 0.0005391643356643356, + "loss": 3.4845, + "step": 17500 + }, + { + "epoch": 5.110664569864291, + "grad_norm": 0.3266642987728119, + "learning_rate": 0.0005389895104895105, + "loss": 3.4799, + "step": 17550 + }, + { + "epoch": 5.125225697478013, + "grad_norm": 0.3353632092475891, + "learning_rate": 0.0005388146853146853, + "loss": 3.4778, + "step": 17600 + }, + { + "epoch": 5.139786825091735, + "grad_norm": 0.3286963999271393, + "learning_rate": 0.0005386398601398601, + "loss": 3.4885, + "step": 17650 + }, + { + "epoch": 5.154347952705457, + "grad_norm": 0.3285607695579529, + "learning_rate": 0.0005384650349650349, + "loss": 3.4878, + "step": 17700 + }, + { + "epoch": 5.16890908031918, + "grad_norm": 0.32417571544647217, + "learning_rate": 0.0005382902097902098, + "loss": 3.486, + "step": 17750 + }, + { + "epoch": 5.183470207932903, + "grad_norm": 
0.3437209129333496, + "learning_rate": 0.0005381153846153845, + "loss": 3.5001, + "step": 17800 + }, + { + "epoch": 5.1980313355466246, + "grad_norm": 0.3151189088821411, + "learning_rate": 0.0005379405594405594, + "loss": 3.4985, + "step": 17850 + }, + { + "epoch": 5.212592463160347, + "grad_norm": 0.30223751068115234, + "learning_rate": 0.0005377657342657342, + "loss": 3.5054, + "step": 17900 + }, + { + "epoch": 5.227153590774069, + "grad_norm": 0.334331214427948, + "learning_rate": 0.000537590909090909, + "loss": 3.4948, + "step": 17950 + }, + { + "epoch": 5.241714718387792, + "grad_norm": 0.33861103653907776, + "learning_rate": 0.0005374160839160838, + "loss": 3.4997, + "step": 18000 + }, + { + "epoch": 5.241714718387792, + "eval_accuracy": 0.35962095934379373, + "eval_loss": 3.6378488540649414, + "eval_runtime": 180.5829, + "eval_samples_per_second": 92.179, + "eval_steps_per_second": 5.765, + "step": 18000 + }, + { + "epoch": 5.256275846001515, + "grad_norm": 0.3312193751335144, + "learning_rate": 0.0005372412587412587, + "loss": 3.51, + "step": 18050 + }, + { + "epoch": 5.2708369736152365, + "grad_norm": 0.3368973731994629, + "learning_rate": 0.0005370664335664335, + "loss": 3.5077, + "step": 18100 + }, + { + "epoch": 5.285398101228959, + "grad_norm": 0.3362269997596741, + "learning_rate": 0.0005368916083916083, + "loss": 3.505, + "step": 18150 + }, + { + "epoch": 5.299959228842681, + "grad_norm": 0.3283269703388214, + "learning_rate": 0.0005367167832167832, + "loss": 3.5147, + "step": 18200 + }, + { + "epoch": 5.314520356456404, + "grad_norm": 0.33599990606307983, + "learning_rate": 0.000536541958041958, + "loss": 3.5038, + "step": 18250 + }, + { + "epoch": 5.329081484070127, + "grad_norm": 0.325891375541687, + "learning_rate": 0.0005363671328671328, + "loss": 3.5007, + "step": 18300 + }, + { + "epoch": 5.3436426116838485, + "grad_norm": 0.31639760732650757, + "learning_rate": 0.0005361923076923076, + "loss": 3.5059, + "step": 18350 + }, + { + "epoch": 
5.358203739297571, + "grad_norm": 0.3361245095729828, + "learning_rate": 0.0005360174825174825, + "loss": 3.5089, + "step": 18400 + }, + { + "epoch": 5.372764866911294, + "grad_norm": 0.308998167514801, + "learning_rate": 0.0005358426573426573, + "loss": 3.517, + "step": 18450 + }, + { + "epoch": 5.387325994525016, + "grad_norm": 0.30970314145088196, + "learning_rate": 0.0005356678321678321, + "loss": 3.5048, + "step": 18500 + }, + { + "epoch": 5.401887122138739, + "grad_norm": 0.3119153380393982, + "learning_rate": 0.0005354930069930069, + "loss": 3.5108, + "step": 18550 + }, + { + "epoch": 5.41644824975246, + "grad_norm": 0.32366570830345154, + "learning_rate": 0.0005353181818181817, + "loss": 3.5209, + "step": 18600 + }, + { + "epoch": 5.431009377366183, + "grad_norm": 0.34180885553359985, + "learning_rate": 0.0005351433566433565, + "loss": 3.5116, + "step": 18650 + }, + { + "epoch": 5.445570504979906, + "grad_norm": 0.30780646204948425, + "learning_rate": 0.0005349685314685314, + "loss": 3.5122, + "step": 18700 + }, + { + "epoch": 5.460131632593628, + "grad_norm": 0.32243868708610535, + "learning_rate": 0.0005347937062937062, + "loss": 3.5189, + "step": 18750 + }, + { + "epoch": 5.4746927602073505, + "grad_norm": 0.30452656745910645, + "learning_rate": 0.000534618881118881, + "loss": 3.5301, + "step": 18800 + }, + { + "epoch": 5.489253887821073, + "grad_norm": 0.31950345635414124, + "learning_rate": 0.0005344440559440559, + "loss": 3.5165, + "step": 18850 + }, + { + "epoch": 5.503815015434795, + "grad_norm": 0.32063764333724976, + "learning_rate": 0.0005342692307692307, + "loss": 3.5179, + "step": 18900 + }, + { + "epoch": 5.518376143048518, + "grad_norm": 0.31415075063705444, + "learning_rate": 0.0005340944055944055, + "loss": 3.5253, + "step": 18950 + }, + { + "epoch": 5.53293727066224, + "grad_norm": 0.2983076572418213, + "learning_rate": 0.0005339195804195803, + "loss": 3.5261, + "step": 19000 + }, + { + "epoch": 5.53293727066224, + "eval_accuracy": 
0.3608592462873141, + "eval_loss": 3.6253974437713623, + "eval_runtime": 180.632, + "eval_samples_per_second": 92.154, + "eval_steps_per_second": 5.763, + "step": 19000 + }, + { + "epoch": 5.5474983982759625, + "grad_norm": 0.3204125463962555, + "learning_rate": 0.0005337447552447552, + "loss": 3.5249, + "step": 19050 + }, + { + "epoch": 5.562059525889685, + "grad_norm": 0.3400876224040985, + "learning_rate": 0.00053356993006993, + "loss": 3.5125, + "step": 19100 + }, + { + "epoch": 5.576620653503407, + "grad_norm": 0.31529760360717773, + "learning_rate": 0.0005333951048951048, + "loss": 3.522, + "step": 19150 + }, + { + "epoch": 5.59118178111713, + "grad_norm": 0.30710023641586304, + "learning_rate": 0.0005332202797202796, + "loss": 3.5183, + "step": 19200 + }, + { + "epoch": 5.605742908730852, + "grad_norm": 0.30690956115722656, + "learning_rate": 0.0005330454545454546, + "loss": 3.5081, + "step": 19250 + }, + { + "epoch": 5.620304036344574, + "grad_norm": 0.3141026794910431, + "learning_rate": 0.0005328706293706292, + "loss": 3.5201, + "step": 19300 + }, + { + "epoch": 5.634865163958297, + "grad_norm": 0.3454573452472687, + "learning_rate": 0.0005326958041958042, + "loss": 3.5377, + "step": 19350 + }, + { + "epoch": 5.649426291572019, + "grad_norm": 0.32653287053108215, + "learning_rate": 0.000532520979020979, + "loss": 3.5207, + "step": 19400 + }, + { + "epoch": 5.663987419185742, + "grad_norm": 0.30469658970832825, + "learning_rate": 0.0005323461538461538, + "loss": 3.5373, + "step": 19450 + }, + { + "epoch": 5.6785485467994645, + "grad_norm": 0.29301902651786804, + "learning_rate": 0.0005321713286713287, + "loss": 3.5197, + "step": 19500 + }, + { + "epoch": 5.693109674413186, + "grad_norm": 0.34159839153289795, + "learning_rate": 0.0005319965034965035, + "loss": 3.5231, + "step": 19550 + }, + { + "epoch": 5.707670802026909, + "grad_norm": 0.30907705426216125, + "learning_rate": 0.0005318216783216783, + "loss": 3.527, + "step": 19600 + }, + { + "epoch": 
5.722231929640631, + "grad_norm": 0.3162166178226471, + "learning_rate": 0.0005316468531468531, + "loss": 3.5296, + "step": 19650 + }, + { + "epoch": 5.736793057254354, + "grad_norm": 0.3204457461833954, + "learning_rate": 0.000531472027972028, + "loss": 3.5307, + "step": 19700 + }, + { + "epoch": 5.7513541848680765, + "grad_norm": 0.3057008683681488, + "learning_rate": 0.0005312972027972028, + "loss": 3.53, + "step": 19750 + }, + { + "epoch": 5.765915312481798, + "grad_norm": 0.32276445627212524, + "learning_rate": 0.0005311223776223776, + "loss": 3.5437, + "step": 19800 + }, + { + "epoch": 5.780476440095521, + "grad_norm": 0.30552002787590027, + "learning_rate": 0.0005309475524475524, + "loss": 3.5316, + "step": 19850 + }, + { + "epoch": 5.795037567709244, + "grad_norm": 0.2986491918563843, + "learning_rate": 0.0005307727272727273, + "loss": 3.5291, + "step": 19900 + }, + { + "epoch": 5.809598695322966, + "grad_norm": 0.3022374212741852, + "learning_rate": 0.0005305979020979021, + "loss": 3.5171, + "step": 19950 + }, + { + "epoch": 5.824159822936688, + "grad_norm": 0.3142791986465454, + "learning_rate": 0.0005304230769230769, + "loss": 3.5262, + "step": 20000 + }, + { + "epoch": 5.824159822936688, + "eval_accuracy": 0.36232043193442454, + "eval_loss": 3.614790439605713, + "eval_runtime": 180.8701, + "eval_samples_per_second": 92.033, + "eval_steps_per_second": 5.756, + "step": 20000 + }, + { + "epoch": 5.83872095055041, + "grad_norm": 0.300855427980423, + "learning_rate": 0.0005302482517482517, + "loss": 3.538, + "step": 20050 + }, + { + "epoch": 5.853282078164133, + "grad_norm": 0.286253958940506, + "learning_rate": 0.0005300734265734265, + "loss": 3.5309, + "step": 20100 + }, + { + "epoch": 5.867843205777856, + "grad_norm": 0.2925945222377777, + "learning_rate": 0.0005298986013986013, + "loss": 3.5247, + "step": 20150 + }, + { + "epoch": 5.882404333391578, + "grad_norm": 0.31259238719940186, + "learning_rate": 0.0005297237762237762, + "loss": 3.5267, + "step": 
20200 + }, + { + "epoch": 5.8969654610053, + "grad_norm": 0.30897215008735657, + "learning_rate": 0.000529548951048951, + "loss": 3.5384, + "step": 20250 + }, + { + "epoch": 5.911526588619022, + "grad_norm": 0.3039945363998413, + "learning_rate": 0.0005293741258741258, + "loss": 3.5347, + "step": 20300 + }, + { + "epoch": 5.926087716232745, + "grad_norm": 0.30515971779823303, + "learning_rate": 0.0005291993006993007, + "loss": 3.5327, + "step": 20350 + }, + { + "epoch": 5.940648843846468, + "grad_norm": 0.3250819742679596, + "learning_rate": 0.0005290244755244755, + "loss": 3.5112, + "step": 20400 + }, + { + "epoch": 5.95520997146019, + "grad_norm": 0.31303057074546814, + "learning_rate": 0.0005288496503496503, + "loss": 3.5379, + "step": 20450 + }, + { + "epoch": 5.969771099073912, + "grad_norm": 0.30073851346969604, + "learning_rate": 0.0005286748251748251, + "loss": 3.5411, + "step": 20500 + }, + { + "epoch": 5.984332226687634, + "grad_norm": 0.3091214597225189, + "learning_rate": 0.0005285, + "loss": 3.5171, + "step": 20550 + }, + { + "epoch": 5.998893354301357, + "grad_norm": 0.32308459281921387, + "learning_rate": 0.0005283251748251748, + "loss": 3.5282, + "step": 20600 + }, + { + "epoch": 6.013396237404625, + "grad_norm": 0.33346566557884216, + "learning_rate": 0.0005281503496503496, + "loss": 3.4215, + "step": 20650 + }, + { + "epoch": 6.027957365018347, + "grad_norm": 0.2992725372314453, + "learning_rate": 0.0005279755244755244, + "loss": 3.42, + "step": 20700 + }, + { + "epoch": 6.04251849263207, + "grad_norm": 0.3181827664375305, + "learning_rate": 0.0005278006993006993, + "loss": 3.4206, + "step": 20750 + }, + { + "epoch": 6.0570796202457915, + "grad_norm": 0.3308713734149933, + "learning_rate": 0.000527625874125874, + "loss": 3.436, + "step": 20800 + }, + { + "epoch": 6.071640747859514, + "grad_norm": 0.33250924944877625, + "learning_rate": 0.0005274510489510489, + "loss": 3.43, + "step": 20850 + }, + { + "epoch": 6.086201875473237, + "grad_norm": 
0.31477755308151245, + "learning_rate": 0.0005272762237762238, + "loss": 3.4421, + "step": 20900 + }, + { + "epoch": 6.100763003086959, + "grad_norm": 0.30721914768218994, + "learning_rate": 0.0005271013986013985, + "loss": 3.4443, + "step": 20950 + }, + { + "epoch": 6.115324130700682, + "grad_norm": 0.3389357924461365, + "learning_rate": 0.0005269265734265734, + "loss": 3.4388, + "step": 21000 + }, + { + "epoch": 6.115324130700682, + "eval_accuracy": 0.3626586595558532, + "eval_loss": 3.616656541824341, + "eval_runtime": 180.8276, + "eval_samples_per_second": 92.055, + "eval_steps_per_second": 5.757, + "step": 21000 + }, + { + "epoch": 6.1298852583144035, + "grad_norm": 0.3342684507369995, + "learning_rate": 0.0005267517482517482, + "loss": 3.433, + "step": 21050 + }, + { + "epoch": 6.144446385928126, + "grad_norm": 0.3244004547595978, + "learning_rate": 0.000526576923076923, + "loss": 3.4507, + "step": 21100 + }, + { + "epoch": 6.159007513541849, + "grad_norm": 0.30746179819107056, + "learning_rate": 0.0005264020979020978, + "loss": 3.4594, + "step": 21150 + }, + { + "epoch": 6.173568641155571, + "grad_norm": 0.30511513352394104, + "learning_rate": 0.0005262272727272727, + "loss": 3.4499, + "step": 21200 + }, + { + "epoch": 6.1881297687692935, + "grad_norm": 0.30683988332748413, + "learning_rate": 0.0005260524475524475, + "loss": 3.4515, + "step": 21250 + }, + { + "epoch": 6.202690896383016, + "grad_norm": 0.3043258488178253, + "learning_rate": 0.0005258776223776223, + "loss": 3.4542, + "step": 21300 + }, + { + "epoch": 6.217252023996738, + "grad_norm": 0.32519418001174927, + "learning_rate": 0.0005257027972027971, + "loss": 3.4434, + "step": 21350 + }, + { + "epoch": 6.231813151610461, + "grad_norm": 0.3197823762893677, + "learning_rate": 0.000525527972027972, + "loss": 3.4388, + "step": 21400 + }, + { + "epoch": 6.246374279224183, + "grad_norm": 0.3240765333175659, + "learning_rate": 0.0005253531468531468, + "loss": 3.4588, + "step": 21450 + }, + { + "epoch": 
6.2609354068379055, + "grad_norm": 0.34707140922546387, + "learning_rate": 0.0005251783216783216, + "loss": 3.4539, + "step": 21500 + }, + { + "epoch": 6.275496534451628, + "grad_norm": 0.337259441614151, + "learning_rate": 0.0005250034965034965, + "loss": 3.4567, + "step": 21550 + }, + { + "epoch": 6.29005766206535, + "grad_norm": 0.31958332657814026, + "learning_rate": 0.0005248286713286712, + "loss": 3.4585, + "step": 21600 + }, + { + "epoch": 6.304618789679073, + "grad_norm": 0.32298600673675537, + "learning_rate": 0.0005246538461538461, + "loss": 3.463, + "step": 21650 + }, + { + "epoch": 6.319179917292795, + "grad_norm": 0.3255809247493744, + "learning_rate": 0.0005244790209790209, + "loss": 3.4608, + "step": 21700 + }, + { + "epoch": 6.3337410449065175, + "grad_norm": 0.32428938150405884, + "learning_rate": 0.0005243041958041957, + "loss": 3.4699, + "step": 21750 + }, + { + "epoch": 6.34830217252024, + "grad_norm": 0.3299717307090759, + "learning_rate": 0.0005241293706293705, + "loss": 3.4656, + "step": 21800 + }, + { + "epoch": 6.362863300133962, + "grad_norm": 0.32020100951194763, + "learning_rate": 0.0005239545454545454, + "loss": 3.4592, + "step": 21850 + }, + { + "epoch": 6.377424427747685, + "grad_norm": 0.3127721846103668, + "learning_rate": 0.0005237797202797202, + "loss": 3.4843, + "step": 21900 + }, + { + "epoch": 6.391985555361408, + "grad_norm": 0.3199755847454071, + "learning_rate": 0.000523604895104895, + "loss": 3.4847, + "step": 21950 + }, + { + "epoch": 6.406546682975129, + "grad_norm": 0.3283131718635559, + "learning_rate": 0.0005234300699300698, + "loss": 3.4634, + "step": 22000 + }, + { + "epoch": 6.406546682975129, + "eval_accuracy": 0.36310868921689904, + "eval_loss": 3.6098170280456543, + "eval_runtime": 181.4687, + "eval_samples_per_second": 91.729, + "eval_steps_per_second": 5.737, + "step": 22000 + }, + { + "epoch": 6.421107810588852, + "grad_norm": 0.3273344933986664, + "learning_rate": 0.0005232552447552447, + "loss": 3.4702, + 
"step": 22050 + }, + { + "epoch": 6.435668938202574, + "grad_norm": 0.3400958478450775, + "learning_rate": 0.0005230804195804195, + "loss": 3.4588, + "step": 22100 + }, + { + "epoch": 6.450230065816297, + "grad_norm": 0.3155398666858673, + "learning_rate": 0.0005229055944055943, + "loss": 3.486, + "step": 22150 + }, + { + "epoch": 6.4647911934300195, + "grad_norm": 0.32073867321014404, + "learning_rate": 0.0005227307692307691, + "loss": 3.482, + "step": 22200 + }, + { + "epoch": 6.479352321043741, + "grad_norm": 0.30934587121009827, + "learning_rate": 0.0005225559440559441, + "loss": 3.4709, + "step": 22250 + }, + { + "epoch": 6.493913448657464, + "grad_norm": 0.3552294671535492, + "learning_rate": 0.0005223811188811189, + "loss": 3.4915, + "step": 22300 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 0.32506152987480164, + "learning_rate": 0.0005222062937062937, + "loss": 3.4893, + "step": 22350 + }, + { + "epoch": 6.523035703884909, + "grad_norm": 0.31292617321014404, + "learning_rate": 0.0005220314685314686, + "loss": 3.4771, + "step": 22400 + }, + { + "epoch": 6.5375968314986315, + "grad_norm": 0.3158280849456787, + "learning_rate": 0.0005218566433566433, + "loss": 3.4818, + "step": 22450 + }, + { + "epoch": 6.552157959112353, + "grad_norm": 0.3074539303779602, + "learning_rate": 0.0005216818181818182, + "loss": 3.4723, + "step": 22500 + }, + { + "epoch": 6.566719086726076, + "grad_norm": 0.30536600947380066, + "learning_rate": 0.000521506993006993, + "loss": 3.4755, + "step": 22550 + }, + { + "epoch": 6.581280214339799, + "grad_norm": 0.3104475438594818, + "learning_rate": 0.0005213321678321678, + "loss": 3.4779, + "step": 22600 + }, + { + "epoch": 6.595841341953521, + "grad_norm": 0.3355538547039032, + "learning_rate": 0.0005211573426573426, + "loss": 3.4812, + "step": 22650 + }, + { + "epoch": 6.610402469567243, + "grad_norm": 0.3326936364173889, + "learning_rate": 0.0005209825174825175, + "loss": 3.4838, + "step": 22700 + }, + { + "epoch": 
6.624963597180965, + "grad_norm": 0.311422735452652, + "learning_rate": 0.0005208076923076923, + "loss": 3.4823, + "step": 22750 + }, + { + "epoch": 6.639524724794688, + "grad_norm": 0.3325813412666321, + "learning_rate": 0.0005206328671328671, + "loss": 3.4808, + "step": 22800 + }, + { + "epoch": 6.654085852408411, + "grad_norm": 0.32317134737968445, + "learning_rate": 0.0005204580419580419, + "loss": 3.4865, + "step": 22850 + }, + { + "epoch": 6.668646980022133, + "grad_norm": 0.307810515165329, + "learning_rate": 0.0005202832167832168, + "loss": 3.4835, + "step": 22900 + }, + { + "epoch": 6.683208107635855, + "grad_norm": 0.3085881471633911, + "learning_rate": 0.0005201083916083916, + "loss": 3.4681, + "step": 22950 + }, + { + "epoch": 6.697769235249577, + "grad_norm": 0.33852365612983704, + "learning_rate": 0.0005199335664335664, + "loss": 3.5019, + "step": 23000 + }, + { + "epoch": 6.697769235249577, + "eval_accuracy": 0.3637609265626363, + "eval_loss": 3.6008996963500977, + "eval_runtime": 180.919, + "eval_samples_per_second": 92.008, + "eval_steps_per_second": 5.754, + "step": 23000 + }, + { + "epoch": 6.7123303628633, + "grad_norm": 0.3305760622024536, + "learning_rate": 0.0005197587412587413, + "loss": 3.4833, + "step": 23050 + }, + { + "epoch": 6.726891490477023, + "grad_norm": 0.30366402864456177, + "learning_rate": 0.0005195839160839161, + "loss": 3.4811, + "step": 23100 + }, + { + "epoch": 6.741452618090745, + "grad_norm": 0.2957024872303009, + "learning_rate": 0.0005194090909090909, + "loss": 3.4887, + "step": 23150 + }, + { + "epoch": 6.756013745704467, + "grad_norm": 0.33022427558898926, + "learning_rate": 0.0005192342657342657, + "loss": 3.4863, + "step": 23200 + }, + { + "epoch": 6.77057487331819, + "grad_norm": 0.3377295732498169, + "learning_rate": 0.0005190594405594405, + "loss": 3.4843, + "step": 23250 + }, + { + "epoch": 6.785136000931912, + "grad_norm": 0.288236141204834, + "learning_rate": 0.0005188846153846153, + "loss": 3.5076, + "step": 
23300 + }, + { + "epoch": 6.799697128545635, + "grad_norm": 0.3370327353477478, + "learning_rate": 0.0005187097902097902, + "loss": 3.4846, + "step": 23350 + }, + { + "epoch": 6.814258256159357, + "grad_norm": 0.32287853956222534, + "learning_rate": 0.000518534965034965, + "loss": 3.4971, + "step": 23400 + }, + { + "epoch": 6.828819383773079, + "grad_norm": 0.3225182294845581, + "learning_rate": 0.0005183601398601398, + "loss": 3.4699, + "step": 23450 + }, + { + "epoch": 6.843380511386802, + "grad_norm": 0.32776007056236267, + "learning_rate": 0.0005181853146853146, + "loss": 3.4867, + "step": 23500 + }, + { + "epoch": 6.857941639000524, + "grad_norm": 0.3112199306488037, + "learning_rate": 0.0005180104895104895, + "loss": 3.4929, + "step": 23550 + }, + { + "epoch": 6.872502766614247, + "grad_norm": 0.3101305663585663, + "learning_rate": 0.0005178356643356643, + "loss": 3.4852, + "step": 23600 + }, + { + "epoch": 6.887063894227969, + "grad_norm": 0.34980154037475586, + "learning_rate": 0.0005176608391608391, + "loss": 3.4916, + "step": 23650 + }, + { + "epoch": 6.901625021841691, + "grad_norm": 0.3215571939945221, + "learning_rate": 0.000517486013986014, + "loss": 3.4847, + "step": 23700 + }, + { + "epoch": 6.916186149455414, + "grad_norm": 0.33672600984573364, + "learning_rate": 0.0005173111888111888, + "loss": 3.4901, + "step": 23750 + }, + { + "epoch": 6.930747277069136, + "grad_norm": 0.3331395387649536, + "learning_rate": 0.0005171363636363636, + "loss": 3.4927, + "step": 23800 + }, + { + "epoch": 6.945308404682859, + "grad_norm": 0.3124479055404663, + "learning_rate": 0.0005169615384615384, + "loss": 3.506, + "step": 23850 + }, + { + "epoch": 6.959869532296581, + "grad_norm": 0.33297938108444214, + "learning_rate": 0.0005167867132867133, + "loss": 3.4942, + "step": 23900 + }, + { + "epoch": 6.974430659910303, + "grad_norm": 0.31658828258514404, + "learning_rate": 0.000516611888111888, + "loss": 3.5013, + "step": 23950 + }, + { + "epoch": 6.988991787524026, + 
"grad_norm": 0.32839858531951904, + "learning_rate": 0.0005164370629370629, + "loss": 3.4874, + "step": 24000 + }, + { + "epoch": 6.988991787524026, + "eval_accuracy": 0.36462360097558155, + "eval_loss": 3.5910537242889404, + "eval_runtime": 180.6847, + "eval_samples_per_second": 92.127, + "eval_steps_per_second": 5.761, + "step": 24000 + }, + { + "epoch": 7.003494670627293, + "grad_norm": 0.32422536611557007, + "learning_rate": 0.0005162622377622377, + "loss": 3.4613, + "step": 24050 + }, + { + "epoch": 7.018055798241016, + "grad_norm": 0.31693896651268005, + "learning_rate": 0.0005160874125874125, + "loss": 3.3777, + "step": 24100 + }, + { + "epoch": 7.032616925854738, + "grad_norm": 0.3402048349380493, + "learning_rate": 0.0005159125874125873, + "loss": 3.3793, + "step": 24150 + }, + { + "epoch": 7.0471780534684605, + "grad_norm": 0.3285346031188965, + "learning_rate": 0.0005157377622377622, + "loss": 3.387, + "step": 24200 + }, + { + "epoch": 7.061739181082183, + "grad_norm": 0.330010324716568, + "learning_rate": 0.000515562937062937, + "loss": 3.392, + "step": 24250 + }, + { + "epoch": 7.076300308695905, + "grad_norm": 0.32283395528793335, + "learning_rate": 0.0005153881118881118, + "loss": 3.4001, + "step": 24300 + }, + { + "epoch": 7.090861436309628, + "grad_norm": 0.3089471459388733, + "learning_rate": 0.0005152132867132867, + "loss": 3.4023, + "step": 24350 + }, + { + "epoch": 7.105422563923351, + "grad_norm": 0.32891589403152466, + "learning_rate": 0.0005150384615384615, + "loss": 3.3962, + "step": 24400 + }, + { + "epoch": 7.1199836915370724, + "grad_norm": 0.34227320551872253, + "learning_rate": 0.0005148636363636363, + "loss": 3.4008, + "step": 24450 + }, + { + "epoch": 7.134544819150795, + "grad_norm": 0.31279921531677246, + "learning_rate": 0.0005146888111888111, + "loss": 3.4203, + "step": 24500 + }, + { + "epoch": 7.149105946764517, + "grad_norm": 0.3075491189956665, + "learning_rate": 0.000514513986013986, + "loss": 3.4162, + "step": 24550 + }, + 
{ + "epoch": 7.16366707437824, + "grad_norm": 0.32692641019821167, + "learning_rate": 0.0005143391608391608, + "loss": 3.4139, + "step": 24600 + }, + { + "epoch": 7.1782282019919625, + "grad_norm": 0.34598174691200256, + "learning_rate": 0.0005141643356643356, + "loss": 3.4051, + "step": 24650 + }, + { + "epoch": 7.192789329605684, + "grad_norm": 0.33088117837905884, + "learning_rate": 0.0005139895104895104, + "loss": 3.4026, + "step": 24700 + }, + { + "epoch": 7.207350457219407, + "grad_norm": 0.33946093916893005, + "learning_rate": 0.0005138146853146852, + "loss": 3.412, + "step": 24750 + }, + { + "epoch": 7.22191158483313, + "grad_norm": 0.32189738750457764, + "learning_rate": 0.00051363986013986, + "loss": 3.4248, + "step": 24800 + }, + { + "epoch": 7.236472712446852, + "grad_norm": 0.3421887457370758, + "learning_rate": 0.0005134650349650349, + "loss": 3.4235, + "step": 24850 + }, + { + "epoch": 7.2510338400605745, + "grad_norm": 0.32535749673843384, + "learning_rate": 0.0005132902097902097, + "loss": 3.4198, + "step": 24900 + }, + { + "epoch": 7.265594967674296, + "grad_norm": 0.30061206221580505, + "learning_rate": 0.0005131153846153845, + "loss": 3.418, + "step": 24950 + }, + { + "epoch": 7.280156095288019, + "grad_norm": 0.31006714701652527, + "learning_rate": 0.0005129405594405594, + "loss": 3.4222, + "step": 25000 + }, + { + "epoch": 7.280156095288019, + "eval_accuracy": 0.3645690519257578, + "eval_loss": 3.5986101627349854, + "eval_runtime": 180.7195, + "eval_samples_per_second": 92.11, + "eval_steps_per_second": 5.76, + "step": 25000 + }, + { + "epoch": 7.294717222901742, + "grad_norm": 0.32596707344055176, + "learning_rate": 0.0005127657342657342, + "loss": 3.4334, + "step": 25050 + }, + { + "epoch": 7.309278350515464, + "grad_norm": 0.3368932604789734, + "learning_rate": 0.000512590909090909, + "loss": 3.4284, + "step": 25100 + }, + { + "epoch": 7.3238394781291865, + "grad_norm": 0.3360914885997772, + "learning_rate": 0.0005124160839160838, + "loss": 
3.4356, + "step": 25150 + }, + { + "epoch": 7.338400605742908, + "grad_norm": 0.3240918517112732, + "learning_rate": 0.0005122412587412588, + "loss": 3.4355, + "step": 25200 + }, + { + "epoch": 7.352961733356631, + "grad_norm": 0.3215958774089813, + "learning_rate": 0.0005120664335664336, + "loss": 3.4247, + "step": 25250 + }, + { + "epoch": 7.367522860970354, + "grad_norm": 0.3438575267791748, + "learning_rate": 0.0005118916083916084, + "loss": 3.4302, + "step": 25300 + }, + { + "epoch": 7.382083988584076, + "grad_norm": 0.3164159655570984, + "learning_rate": 0.0005117167832167832, + "loss": 3.4298, + "step": 25350 + }, + { + "epoch": 7.396645116197798, + "grad_norm": 0.32072940468788147, + "learning_rate": 0.0005115419580419581, + "loss": 3.4205, + "step": 25400 + }, + { + "epoch": 7.411206243811521, + "grad_norm": 0.32689517736434937, + "learning_rate": 0.0005113671328671328, + "loss": 3.4321, + "step": 25450 + }, + { + "epoch": 7.425767371425243, + "grad_norm": 0.31307026743888855, + "learning_rate": 0.0005111923076923077, + "loss": 3.4414, + "step": 25500 + }, + { + "epoch": 7.440328499038966, + "grad_norm": 0.3382061719894409, + "learning_rate": 0.0005110174825174825, + "loss": 3.4383, + "step": 25550 + }, + { + "epoch": 7.454889626652688, + "grad_norm": 0.32721343636512756, + "learning_rate": 0.0005108426573426573, + "loss": 3.4344, + "step": 25600 + }, + { + "epoch": 7.46945075426641, + "grad_norm": 0.3294874131679535, + "learning_rate": 0.0005106678321678321, + "loss": 3.4324, + "step": 25650 + }, + { + "epoch": 7.484011881880133, + "grad_norm": 0.3590655028820038, + "learning_rate": 0.000510493006993007, + "loss": 3.4372, + "step": 25700 + }, + { + "epoch": 7.498573009493855, + "grad_norm": 0.3113061189651489, + "learning_rate": 0.0005103181818181818, + "loss": 3.4376, + "step": 25750 + }, + { + "epoch": 7.513134137107578, + "grad_norm": 0.3551453948020935, + "learning_rate": 0.0005101433566433566, + "loss": 3.4468, + "step": 25800 + }, + { + "epoch": 
7.5276952647213005, + "grad_norm": 0.3093542456626892, + "learning_rate": 0.0005099685314685315, + "loss": 3.453, + "step": 25850 + }, + { + "epoch": 7.542256392335022, + "grad_norm": 0.32737603783607483, + "learning_rate": 0.0005097937062937063, + "loss": 3.4581, + "step": 25900 + }, + { + "epoch": 7.556817519948745, + "grad_norm": 0.3264789581298828, + "learning_rate": 0.0005096188811188811, + "loss": 3.4601, + "step": 25950 + }, + { + "epoch": 7.571378647562467, + "grad_norm": 0.312288761138916, + "learning_rate": 0.0005094440559440559, + "loss": 3.4584, + "step": 26000 + }, + { + "epoch": 7.571378647562467, + "eval_accuracy": 0.36503354178751124, + "eval_loss": 3.5893683433532715, + "eval_runtime": 180.8574, + "eval_samples_per_second": 92.039, + "eval_steps_per_second": 5.756, + "step": 26000 + }, + { + "epoch": 7.58593977517619, + "grad_norm": 0.30291152000427246, + "learning_rate": 0.0005092692307692308, + "loss": 3.4588, + "step": 26050 + }, + { + "epoch": 7.600500902789912, + "grad_norm": 0.3086663484573364, + "learning_rate": 0.0005090944055944056, + "loss": 3.4499, + "step": 26100 + }, + { + "epoch": 7.615062030403634, + "grad_norm": 0.31779009103775024, + "learning_rate": 0.0005089195804195804, + "loss": 3.4592, + "step": 26150 + }, + { + "epoch": 7.629623158017357, + "grad_norm": 0.3134089708328247, + "learning_rate": 0.0005087447552447552, + "loss": 3.4564, + "step": 26200 + }, + { + "epoch": 7.644184285631079, + "grad_norm": 0.316002756357193, + "learning_rate": 0.00050856993006993, + "loss": 3.4563, + "step": 26250 + }, + { + "epoch": 7.658745413244802, + "grad_norm": 0.3355805277824402, + "learning_rate": 0.0005083951048951048, + "loss": 3.447, + "step": 26300 + }, + { + "epoch": 7.673306540858524, + "grad_norm": 0.3496020436286926, + "learning_rate": 0.0005082202797202797, + "loss": 3.4441, + "step": 26350 + }, + { + "epoch": 7.687867668472246, + "grad_norm": 0.31140685081481934, + "learning_rate": 0.0005080454545454545, + "loss": 3.4427, + 
"step": 26400 + }, + { + "epoch": 7.702428796085969, + "grad_norm": 0.35752904415130615, + "learning_rate": 0.0005078706293706293, + "loss": 3.4618, + "step": 26450 + }, + { + "epoch": 7.716989923699691, + "grad_norm": 0.34923386573791504, + "learning_rate": 0.0005076958041958042, + "loss": 3.4498, + "step": 26500 + }, + { + "epoch": 7.731551051313414, + "grad_norm": 0.3136448562145233, + "learning_rate": 0.000507520979020979, + "loss": 3.4591, + "step": 26550 + }, + { + "epoch": 7.746112178927136, + "grad_norm": 0.31951072812080383, + "learning_rate": 0.0005073461538461538, + "loss": 3.4538, + "step": 26600 + }, + { + "epoch": 7.760673306540858, + "grad_norm": 0.3343602120876312, + "learning_rate": 0.0005071713286713286, + "loss": 3.4467, + "step": 26650 + }, + { + "epoch": 7.775234434154581, + "grad_norm": 0.33834147453308105, + "learning_rate": 0.0005069965034965035, + "loss": 3.462, + "step": 26700 + }, + { + "epoch": 7.789795561768304, + "grad_norm": 0.3536285161972046, + "learning_rate": 0.0005068216783216783, + "loss": 3.4647, + "step": 26750 + }, + { + "epoch": 7.8043566893820255, + "grad_norm": 0.31148087978363037, + "learning_rate": 0.0005066468531468531, + "loss": 3.4641, + "step": 26800 + }, + { + "epoch": 7.818917816995748, + "grad_norm": 0.32618945837020874, + "learning_rate": 0.0005064720279720279, + "loss": 3.4615, + "step": 26850 + }, + { + "epoch": 7.833478944609471, + "grad_norm": 0.31293508410453796, + "learning_rate": 0.0005062972027972028, + "loss": 3.4553, + "step": 26900 + }, + { + "epoch": 7.848040072223193, + "grad_norm": 0.3196380138397217, + "learning_rate": 0.0005061223776223775, + "loss": 3.4592, + "step": 26950 + }, + { + "epoch": 7.862601199836916, + "grad_norm": 0.3242320418357849, + "learning_rate": 0.0005059475524475524, + "loss": 3.4485, + "step": 27000 + }, + { + "epoch": 7.862601199836916, + "eval_accuracy": 0.3659237258505831, + "eval_loss": 3.580866575241089, + "eval_runtime": 180.9494, + "eval_samples_per_second": 91.993, + 
"eval_steps_per_second": 5.753, + "step": 27000 + }, + { + "epoch": 7.8771623274506375, + "grad_norm": 0.3238949477672577, + "learning_rate": 0.0005057727272727272, + "loss": 3.4538, + "step": 27050 + }, + { + "epoch": 7.89172345506436, + "grad_norm": 0.31904834508895874, + "learning_rate": 0.000505597902097902, + "loss": 3.4514, + "step": 27100 + }, + { + "epoch": 7.906284582678083, + "grad_norm": 0.32475146651268005, + "learning_rate": 0.0005054230769230769, + "loss": 3.4581, + "step": 27150 + }, + { + "epoch": 7.920845710291805, + "grad_norm": 0.32322096824645996, + "learning_rate": 0.0005052482517482517, + "loss": 3.449, + "step": 27200 + }, + { + "epoch": 7.935406837905528, + "grad_norm": 0.3155551552772522, + "learning_rate": 0.0005050734265734265, + "loss": 3.454, + "step": 27250 + }, + { + "epoch": 7.9499679655192494, + "grad_norm": 0.36903294920921326, + "learning_rate": 0.0005048986013986013, + "loss": 3.4722, + "step": 27300 + }, + { + "epoch": 7.964529093132972, + "grad_norm": 0.32118940353393555, + "learning_rate": 0.0005047237762237762, + "loss": 3.4534, + "step": 27350 + }, + { + "epoch": 7.979090220746695, + "grad_norm": 0.3173547387123108, + "learning_rate": 0.000504548951048951, + "loss": 3.4603, + "step": 27400 + }, + { + "epoch": 7.993651348360417, + "grad_norm": 0.30384644865989685, + "learning_rate": 0.0005043741258741258, + "loss": 3.4583, + "step": 27450 + }, + { + "epoch": 8.008154231463685, + "grad_norm": 0.3187147378921509, + "learning_rate": 0.0005041993006993006, + "loss": 3.4017, + "step": 27500 + }, + { + "epoch": 8.022715359077408, + "grad_norm": 0.32749438285827637, + "learning_rate": 0.0005040244755244755, + "loss": 3.3413, + "step": 27550 + }, + { + "epoch": 8.037276486691129, + "grad_norm": 0.3408881425857544, + "learning_rate": 0.0005038496503496503, + "loss": 3.3453, + "step": 27600 + }, + { + "epoch": 8.051837614304851, + "grad_norm": 0.3273647129535675, + "learning_rate": 0.0005036748251748251, + "loss": 3.3509, + "step": 
27650 + }, + { + "epoch": 8.066398741918574, + "grad_norm": 0.3485855162143707, + "learning_rate": 0.0005034999999999999, + "loss": 3.3585, + "step": 27700 + }, + { + "epoch": 8.080959869532297, + "grad_norm": 0.31149420142173767, + "learning_rate": 0.0005033251748251747, + "loss": 3.3763, + "step": 27750 + }, + { + "epoch": 8.09552099714602, + "grad_norm": 0.35035839676856995, + "learning_rate": 0.0005031503496503496, + "loss": 3.3758, + "step": 27800 + }, + { + "epoch": 8.11008212475974, + "grad_norm": 0.3395816683769226, + "learning_rate": 0.0005029755244755244, + "loss": 3.3757, + "step": 27850 + }, + { + "epoch": 8.124643252373463, + "grad_norm": 0.3167729079723358, + "learning_rate": 0.0005028006993006992, + "loss": 3.3735, + "step": 27900 + }, + { + "epoch": 8.139204379987186, + "grad_norm": 0.31024643778800964, + "learning_rate": 0.000502625874125874, + "loss": 3.3797, + "step": 27950 + }, + { + "epoch": 8.153765507600909, + "grad_norm": 0.3287062346935272, + "learning_rate": 0.000502451048951049, + "loss": 3.3673, + "step": 28000 + }, + { + "epoch": 8.153765507600909, + "eval_accuracy": 0.36575196688120276, + "eval_loss": 3.587451457977295, + "eval_runtime": 180.7644, + "eval_samples_per_second": 92.087, + "eval_steps_per_second": 5.759, + "step": 28000 + }, + { + "epoch": 8.168326635214632, + "grad_norm": 0.3316042125225067, + "learning_rate": 0.0005022762237762237, + "loss": 3.3777, + "step": 28050 + }, + { + "epoch": 8.182887762828354, + "grad_norm": 0.3350354731082916, + "learning_rate": 0.0005021013986013985, + "loss": 3.3789, + "step": 28100 + }, + { + "epoch": 8.197448890442075, + "grad_norm": 0.29669225215911865, + "learning_rate": 0.0005019265734265733, + "loss": 3.3892, + "step": 28150 + }, + { + "epoch": 8.212010018055798, + "grad_norm": 0.33734533190727234, + "learning_rate": 0.0005017517482517483, + "loss": 3.389, + "step": 28200 + }, + { + "epoch": 8.22657114566952, + "grad_norm": 0.3267245292663574, + "learning_rate": 0.0005015769230769231, 
+ "loss": 3.3855, + "step": 28250 + }, + { + "epoch": 8.241132273283243, + "grad_norm": 0.32976293563842773, + "learning_rate": 0.0005014020979020979, + "loss": 3.3962, + "step": 28300 + }, + { + "epoch": 8.255693400896966, + "grad_norm": 0.36868104338645935, + "learning_rate": 0.0005012272727272727, + "loss": 3.3907, + "step": 28350 + }, + { + "epoch": 8.270254528510687, + "grad_norm": 0.3128938376903534, + "learning_rate": 0.0005010524475524476, + "loss": 3.392, + "step": 28400 + }, + { + "epoch": 8.28481565612441, + "grad_norm": 0.32096031308174133, + "learning_rate": 0.0005008776223776223, + "loss": 3.3917, + "step": 28450 + }, + { + "epoch": 8.299376783738133, + "grad_norm": 0.3255106806755066, + "learning_rate": 0.0005007027972027972, + "loss": 3.3873, + "step": 28500 + }, + { + "epoch": 8.313937911351855, + "grad_norm": 0.33877506852149963, + "learning_rate": 0.000500527972027972, + "loss": 3.4003, + "step": 28550 + }, + { + "epoch": 8.328499038965578, + "grad_norm": 0.323604553937912, + "learning_rate": 0.0005003531468531468, + "loss": 3.4088, + "step": 28600 + }, + { + "epoch": 8.3430601665793, + "grad_norm": 0.34601426124572754, + "learning_rate": 0.0005001783216783217, + "loss": 3.4023, + "step": 28650 + }, + { + "epoch": 8.357621294193022, + "grad_norm": 0.3355138599872589, + "learning_rate": 0.0005000034965034965, + "loss": 3.4155, + "step": 28700 + }, + { + "epoch": 8.372182421806745, + "grad_norm": 0.33060452342033386, + "learning_rate": 0.0004998286713286713, + "loss": 3.4031, + "step": 28750 + }, + { + "epoch": 8.386743549420467, + "grad_norm": 0.33665457367897034, + "learning_rate": 0.0004996538461538461, + "loss": 3.3994, + "step": 28800 + }, + { + "epoch": 8.40130467703419, + "grad_norm": 0.3132437765598297, + "learning_rate": 0.000499479020979021, + "loss": 3.408, + "step": 28850 + }, + { + "epoch": 8.415865804647911, + "grad_norm": 0.35093778371810913, + "learning_rate": 0.0004993041958041958, + "loss": 3.4177, + "step": 28900 + }, + { + 
"epoch": 8.430426932261634, + "grad_norm": 0.3508143126964569, + "learning_rate": 0.0004991293706293706, + "loss": 3.4109, + "step": 28950 + }, + { + "epoch": 8.444988059875357, + "grad_norm": 0.3553667962551117, + "learning_rate": 0.0004989545454545454, + "loss": 3.4158, + "step": 29000 + }, + { + "epoch": 8.444988059875357, + "eval_accuracy": 0.366446409202989, + "eval_loss": 3.5817067623138428, + "eval_runtime": 180.8846, + "eval_samples_per_second": 92.026, + "eval_steps_per_second": 5.755, + "step": 29000 + }, + { + "epoch": 8.45954918748908, + "grad_norm": 0.34186291694641113, + "learning_rate": 0.0004987797202797203, + "loss": 3.4113, + "step": 29050 + }, + { + "epoch": 8.474110315102802, + "grad_norm": 0.33059263229370117, + "learning_rate": 0.0004986048951048951, + "loss": 3.4114, + "step": 29100 + }, + { + "epoch": 8.488671442716523, + "grad_norm": 0.3274041712284088, + "learning_rate": 0.0004984300699300699, + "loss": 3.3992, + "step": 29150 + }, + { + "epoch": 8.503232570330246, + "grad_norm": 0.3322159945964813, + "learning_rate": 0.0004982552447552448, + "loss": 3.4132, + "step": 29200 + }, + { + "epoch": 8.517793697943969, + "grad_norm": 0.3492433428764343, + "learning_rate": 0.0004980804195804195, + "loss": 3.4041, + "step": 29250 + }, + { + "epoch": 8.532354825557691, + "grad_norm": 0.3179911971092224, + "learning_rate": 0.0004979055944055944, + "loss": 3.421, + "step": 29300 + }, + { + "epoch": 8.546915953171414, + "grad_norm": 0.3256663978099823, + "learning_rate": 0.0004977307692307692, + "loss": 3.4057, + "step": 29350 + }, + { + "epoch": 8.561477080785137, + "grad_norm": 0.3634396195411682, + "learning_rate": 0.000497555944055944, + "loss": 3.4194, + "step": 29400 + }, + { + "epoch": 8.576038208398858, + "grad_norm": 0.3208930492401123, + "learning_rate": 0.0004973811188811188, + "loss": 3.4217, + "step": 29450 + }, + { + "epoch": 8.59059933601258, + "grad_norm": 0.3529151678085327, + "learning_rate": 0.0004972062937062937, + "loss": 3.4254, + 
"step": 29500 + }, + { + "epoch": 8.605160463626303, + "grad_norm": 0.33356186747550964, + "learning_rate": 0.0004970314685314685, + "loss": 3.4137, + "step": 29550 + }, + { + "epoch": 8.619721591240026, + "grad_norm": 0.3451177179813385, + "learning_rate": 0.0004968566433566433, + "loss": 3.4248, + "step": 29600 + }, + { + "epoch": 8.634282718853749, + "grad_norm": 0.32635167241096497, + "learning_rate": 0.0004966818181818181, + "loss": 3.4141, + "step": 29650 + }, + { + "epoch": 8.64884384646747, + "grad_norm": 0.3231564462184906, + "learning_rate": 0.000496506993006993, + "loss": 3.4239, + "step": 29700 + }, + { + "epoch": 8.663404974081192, + "grad_norm": 0.3018019497394562, + "learning_rate": 0.0004963321678321678, + "loss": 3.4006, + "step": 29750 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 0.3364706337451935, + "learning_rate": 0.0004961573426573426, + "loss": 3.4213, + "step": 29800 + }, + { + "epoch": 8.692527229308638, + "grad_norm": 0.32945576310157776, + "learning_rate": 0.0004959825174825175, + "loss": 3.4371, + "step": 29850 + }, + { + "epoch": 8.70708835692236, + "grad_norm": 0.3232404589653015, + "learning_rate": 0.0004958076923076923, + "loss": 3.4317, + "step": 29900 + }, + { + "epoch": 8.721649484536082, + "grad_norm": 0.33957934379577637, + "learning_rate": 0.0004956328671328671, + "loss": 3.4244, + "step": 29950 + }, + { + "epoch": 8.736210612149804, + "grad_norm": 0.3381136655807495, + "learning_rate": 0.0004954580419580419, + "loss": 3.4307, + "step": 30000 + }, + { + "epoch": 8.736210612149804, + "eval_accuracy": 0.3671908156329112, + "eval_loss": 3.5734803676605225, + "eval_runtime": 180.8975, + "eval_samples_per_second": 92.019, + "eval_steps_per_second": 5.755, + "step": 30000 + }, + { + "epoch": 8.750771739763527, + "grad_norm": 0.33459433913230896, + "learning_rate": 0.0004952832167832167, + "loss": 3.4191, + "step": 30050 + }, + { + "epoch": 8.76533286737725, + "grad_norm": 0.32491710782051086, + "learning_rate": 
0.0004951083916083915, + "loss": 3.436, + "step": 30100 + }, + { + "epoch": 8.779893994990973, + "grad_norm": 0.3690783977508545, + "learning_rate": 0.0004949335664335664, + "loss": 3.4343, + "step": 30150 + }, + { + "epoch": 8.794455122604695, + "grad_norm": 0.3273313343524933, + "learning_rate": 0.0004947587412587412, + "loss": 3.411, + "step": 30200 + }, + { + "epoch": 8.809016250218416, + "grad_norm": 0.3403497040271759, + "learning_rate": 0.000494583916083916, + "loss": 3.4318, + "step": 30250 + }, + { + "epoch": 8.82357737783214, + "grad_norm": 0.29834118485450745, + "learning_rate": 0.0004944090909090908, + "loss": 3.4454, + "step": 30300 + }, + { + "epoch": 8.838138505445862, + "grad_norm": 0.34332728385925293, + "learning_rate": 0.0004942342657342657, + "loss": 3.4399, + "step": 30350 + }, + { + "epoch": 8.852699633059585, + "grad_norm": 0.32075831294059753, + "learning_rate": 0.0004940594405594405, + "loss": 3.4269, + "step": 30400 + }, + { + "epoch": 8.867260760673307, + "grad_norm": 0.3218145966529846, + "learning_rate": 0.0004938846153846153, + "loss": 3.4325, + "step": 30450 + }, + { + "epoch": 8.881821888287028, + "grad_norm": 0.322322815656662, + "learning_rate": 0.0004937097902097901, + "loss": 3.4382, + "step": 30500 + }, + { + "epoch": 8.896383015900751, + "grad_norm": 0.3216952681541443, + "learning_rate": 0.000493534965034965, + "loss": 3.4371, + "step": 30550 + }, + { + "epoch": 8.910944143514474, + "grad_norm": 0.31134024262428284, + "learning_rate": 0.0004933601398601398, + "loss": 3.4368, + "step": 30600 + }, + { + "epoch": 8.925505271128197, + "grad_norm": 0.33784356713294983, + "learning_rate": 0.0004931853146853146, + "loss": 3.4262, + "step": 30650 + }, + { + "epoch": 8.94006639874192, + "grad_norm": 0.3245190680027008, + "learning_rate": 0.0004930104895104895, + "loss": 3.4206, + "step": 30700 + }, + { + "epoch": 8.95462752635564, + "grad_norm": 0.32766658067703247, + "learning_rate": 0.0004928356643356642, + "loss": 3.4303, + "step": 
30750 + }, + { + "epoch": 8.969188653969363, + "grad_norm": 0.327048122882843, + "learning_rate": 0.0004926608391608391, + "loss": 3.4377, + "step": 30800 + }, + { + "epoch": 8.983749781583086, + "grad_norm": 0.3345113694667816, + "learning_rate": 0.0004924860139860139, + "loss": 3.4446, + "step": 30850 + }, + { + "epoch": 8.998310909196809, + "grad_norm": 0.31201016902923584, + "learning_rate": 0.0004923111888111887, + "loss": 3.4229, + "step": 30900 + }, + { + "epoch": 9.012813792300076, + "grad_norm": 0.31976842880249023, + "learning_rate": 0.0004921363636363635, + "loss": 3.3365, + "step": 30950 + }, + { + "epoch": 9.027374919913798, + "grad_norm": 0.31214338541030884, + "learning_rate": 0.0004919615384615384, + "loss": 3.3191, + "step": 31000 + }, + { + "epoch": 9.027374919913798, + "eval_accuracy": 0.36729027359875366, + "eval_loss": 3.579190492630005, + "eval_runtime": 180.6255, + "eval_samples_per_second": 92.158, + "eval_steps_per_second": 5.763, + "step": 31000 + }, + { + "epoch": 9.041936047527521, + "grad_norm": 0.3289821445941925, + "learning_rate": 0.0004917867132867132, + "loss": 3.3282, + "step": 31050 + }, + { + "epoch": 9.056497175141242, + "grad_norm": 0.3382094204425812, + "learning_rate": 0.000491611888111888, + "loss": 3.3278, + "step": 31100 + }, + { + "epoch": 9.071058302754965, + "grad_norm": 0.3234216570854187, + "learning_rate": 0.0004914370629370628, + "loss": 3.3318, + "step": 31150 + }, + { + "epoch": 9.085619430368688, + "grad_norm": 0.3528585135936737, + "learning_rate": 0.0004912622377622378, + "loss": 3.3381, + "step": 31200 + }, + { + "epoch": 9.10018055798241, + "grad_norm": 0.32475900650024414, + "learning_rate": 0.0004910874125874126, + "loss": 3.3417, + "step": 31250 + }, + { + "epoch": 9.114741685596133, + "grad_norm": 0.38442665338516235, + "learning_rate": 0.0004909125874125874, + "loss": 3.3351, + "step": 31300 + }, + { + "epoch": 9.129302813209854, + "grad_norm": 0.3471496105194092, + "learning_rate": 
0.0004907377622377623, + "loss": 3.3419, + "step": 31350 + }, + { + "epoch": 9.143863940823577, + "grad_norm": 0.34800395369529724, + "learning_rate": 0.0004905629370629371, + "loss": 3.3577, + "step": 31400 + }, + { + "epoch": 9.1584250684373, + "grad_norm": 0.3251023292541504, + "learning_rate": 0.0004903881118881119, + "loss": 3.3453, + "step": 31450 + }, + { + "epoch": 9.172986196051022, + "grad_norm": 0.3403095602989197, + "learning_rate": 0.0004902132867132867, + "loss": 3.3494, + "step": 31500 + }, + { + "epoch": 9.187547323664745, + "grad_norm": 0.34055787324905396, + "learning_rate": 0.0004900384615384615, + "loss": 3.3536, + "step": 31550 + }, + { + "epoch": 9.202108451278466, + "grad_norm": 0.3373563587665558, + "learning_rate": 0.0004898636363636363, + "loss": 3.3497, + "step": 31600 + }, + { + "epoch": 9.216669578892189, + "grad_norm": 0.3197869062423706, + "learning_rate": 0.0004896888111888112, + "loss": 3.3548, + "step": 31650 + }, + { + "epoch": 9.231230706505912, + "grad_norm": 0.3400711715221405, + "learning_rate": 0.000489513986013986, + "loss": 3.365, + "step": 31700 + }, + { + "epoch": 9.245791834119634, + "grad_norm": 0.338986337184906, + "learning_rate": 0.0004893391608391608, + "loss": 3.3731, + "step": 31750 + }, + { + "epoch": 9.260352961733357, + "grad_norm": 0.3736262321472168, + "learning_rate": 0.0004891643356643356, + "loss": 3.3684, + "step": 31800 + }, + { + "epoch": 9.27491408934708, + "grad_norm": 0.32736754417419434, + "learning_rate": 0.0004889895104895105, + "loss": 3.3747, + "step": 31850 + }, + { + "epoch": 9.2894752169608, + "grad_norm": 0.3265363872051239, + "learning_rate": 0.0004888146853146853, + "loss": 3.3615, + "step": 31900 + }, + { + "epoch": 9.304036344574524, + "grad_norm": 0.35288384556770325, + "learning_rate": 0.0004886398601398601, + "loss": 3.3709, + "step": 31950 + }, + { + "epoch": 9.318597472188246, + "grad_norm": 0.3371695280075073, + "learning_rate": 0.000488465034965035, + "loss": 3.3673, + "step": 
32000 + }, + { + "epoch": 9.318597472188246, + "eval_accuracy": 0.3674448684274567, + "eval_loss": 3.577155113220215, + "eval_runtime": 180.6862, + "eval_samples_per_second": 92.127, + "eval_steps_per_second": 5.761, + "step": 32000 + }, + { + "epoch": 9.333158599801969, + "grad_norm": 0.32006871700286865, + "learning_rate": 0.0004882902097902098, + "loss": 3.3764, + "step": 32050 + }, + { + "epoch": 9.347719727415692, + "grad_norm": 0.3183046877384186, + "learning_rate": 0.0004881153846153846, + "loss": 3.3797, + "step": 32100 + }, + { + "epoch": 9.362280855029413, + "grad_norm": 0.34943392872810364, + "learning_rate": 0.0004879405594405594, + "loss": 3.3896, + "step": 32150 + }, + { + "epoch": 9.376841982643136, + "grad_norm": 0.32475075125694275, + "learning_rate": 0.00048776573426573424, + "loss": 3.3837, + "step": 32200 + }, + { + "epoch": 9.391403110256858, + "grad_norm": 0.3197793662548065, + "learning_rate": 0.00048759090909090904, + "loss": 3.3888, + "step": 32250 + }, + { + "epoch": 9.405964237870581, + "grad_norm": 0.34314748644828796, + "learning_rate": 0.0004874160839160839, + "loss": 3.3873, + "step": 32300 + }, + { + "epoch": 9.420525365484304, + "grad_norm": 0.35972434282302856, + "learning_rate": 0.0004872412587412587, + "loss": 3.3898, + "step": 32350 + }, + { + "epoch": 9.435086493098025, + "grad_norm": 0.34308797121047974, + "learning_rate": 0.00048706643356643354, + "loss": 3.3825, + "step": 32400 + }, + { + "epoch": 9.449647620711747, + "grad_norm": 0.3321487307548523, + "learning_rate": 0.00048689160839160834, + "loss": 3.3777, + "step": 32450 + }, + { + "epoch": 9.46420874832547, + "grad_norm": 0.3517487049102783, + "learning_rate": 0.0004867167832167832, + "loss": 3.3981, + "step": 32500 + }, + { + "epoch": 9.478769875939193, + "grad_norm": 0.3309260606765747, + "learning_rate": 0.00048654195804195794, + "loss": 3.3946, + "step": 32550 + }, + { + "epoch": 9.493331003552916, + "grad_norm": 0.32394567131996155, + "learning_rate": 
0.00048636713286713285, + "loss": 3.3976, + "step": 32600 + }, + { + "epoch": 9.507892131166638, + "grad_norm": 0.3531663119792938, + "learning_rate": 0.0004861923076923077, + "loss": 3.3897, + "step": 32650 + }, + { + "epoch": 9.52245325878036, + "grad_norm": 0.327006995677948, + "learning_rate": 0.00048601748251748245, + "loss": 3.3908, + "step": 32700 + }, + { + "epoch": 9.537014386394082, + "grad_norm": 0.3265383839607239, + "learning_rate": 0.0004858426573426573, + "loss": 3.3903, + "step": 32750 + }, + { + "epoch": 9.551575514007805, + "grad_norm": 0.34400323033332825, + "learning_rate": 0.0004856678321678321, + "loss": 3.3941, + "step": 32800 + }, + { + "epoch": 9.566136641621528, + "grad_norm": 0.3338717520236969, + "learning_rate": 0.00048549300699300696, + "loss": 3.3895, + "step": 32850 + }, + { + "epoch": 9.58069776923525, + "grad_norm": 0.3492511510848999, + "learning_rate": 0.00048531818181818176, + "loss": 3.4021, + "step": 32900 + }, + { + "epoch": 9.595258896848971, + "grad_norm": 0.346670538187027, + "learning_rate": 0.0004851433566433566, + "loss": 3.4054, + "step": 32950 + }, + { + "epoch": 9.609820024462694, + "grad_norm": 0.32477083802223206, + "learning_rate": 0.0004849685314685314, + "loss": 3.4042, + "step": 33000 + }, + { + "epoch": 9.609820024462694, + "eval_accuracy": 0.36810098533923746, + "eval_loss": 3.5701539516448975, + "eval_runtime": 180.6914, + "eval_samples_per_second": 92.124, + "eval_steps_per_second": 5.761, + "step": 33000 + }, + { + "epoch": 9.624381152076417, + "grad_norm": 0.335056871175766, + "learning_rate": 0.00048479370629370627, + "loss": 3.3859, + "step": 33050 + }, + { + "epoch": 9.63894227969014, + "grad_norm": 0.3331140875816345, + "learning_rate": 0.00048461888111888106, + "loss": 3.3992, + "step": 33100 + }, + { + "epoch": 9.653503407303862, + "grad_norm": 0.3186609148979187, + "learning_rate": 0.0004844440559440559, + "loss": 3.3985, + "step": 33150 + }, + { + "epoch": 9.668064534917583, + "grad_norm": 
0.3806893527507782, + "learning_rate": 0.0004842692307692307, + "loss": 3.3957, + "step": 33200 + }, + { + "epoch": 9.682625662531306, + "grad_norm": 0.3387463688850403, + "learning_rate": 0.00048409440559440557, + "loss": 3.407, + "step": 33250 + }, + { + "epoch": 9.697186790145029, + "grad_norm": 0.3401879370212555, + "learning_rate": 0.0004839195804195803, + "loss": 3.4044, + "step": 33300 + }, + { + "epoch": 9.711747917758752, + "grad_norm": 0.34021636843681335, + "learning_rate": 0.0004837447552447552, + "loss": 3.3997, + "step": 33350 + }, + { + "epoch": 9.726309045372474, + "grad_norm": 0.33192554116249084, + "learning_rate": 0.0004835699300699301, + "loss": 3.4043, + "step": 33400 + }, + { + "epoch": 9.740870172986195, + "grad_norm": 0.34780585765838623, + "learning_rate": 0.0004833951048951048, + "loss": 3.3918, + "step": 33450 + }, + { + "epoch": 9.755431300599918, + "grad_norm": 0.3295852243900299, + "learning_rate": 0.0004832202797202797, + "loss": 3.399, + "step": 33500 + }, + { + "epoch": 9.76999242821364, + "grad_norm": 0.3357504904270172, + "learning_rate": 0.0004830454545454545, + "loss": 3.403, + "step": 33550 + }, + { + "epoch": 9.784553555827364, + "grad_norm": 0.34999728202819824, + "learning_rate": 0.00048287062937062933, + "loss": 3.4003, + "step": 33600 + }, + { + "epoch": 9.799114683441086, + "grad_norm": 0.33418694138526917, + "learning_rate": 0.00048269580419580413, + "loss": 3.408, + "step": 33650 + }, + { + "epoch": 9.813675811054807, + "grad_norm": 0.34634068608283997, + "learning_rate": 0.000482520979020979, + "loss": 3.4006, + "step": 33700 + }, + { + "epoch": 9.82823693866853, + "grad_norm": 0.3158990144729614, + "learning_rate": 0.0004823461538461538, + "loss": 3.3964, + "step": 33750 + }, + { + "epoch": 9.842798066282253, + "grad_norm": 0.3415282666683197, + "learning_rate": 0.00048217132867132864, + "loss": 3.4045, + "step": 33800 + }, + { + "epoch": 9.857359193895975, + "grad_norm": 0.32026195526123047, + "learning_rate": 
0.00048199650349650344, + "loss": 3.4094, + "step": 33850 + }, + { + "epoch": 9.871920321509698, + "grad_norm": 0.3347398638725281, + "learning_rate": 0.0004818216783216783, + "loss": 3.4243, + "step": 33900 + }, + { + "epoch": 9.88648144912342, + "grad_norm": 0.33100831508636475, + "learning_rate": 0.0004816468531468531, + "loss": 3.4099, + "step": 33950 + }, + { + "epoch": 9.901042576737142, + "grad_norm": 0.34802931547164917, + "learning_rate": 0.00048147202797202795, + "loss": 3.4059, + "step": 34000 + }, + { + "epoch": 9.901042576737142, + "eval_accuracy": 0.3684669577360075, + "eval_loss": 3.563434362411499, + "eval_runtime": 180.8937, + "eval_samples_per_second": 92.021, + "eval_steps_per_second": 5.755, + "step": 34000 + }, + { + "epoch": 9.915603704350865, + "grad_norm": 0.31292399764060974, + "learning_rate": 0.0004812972027972028, + "loss": 3.3956, + "step": 34050 + }, + { + "epoch": 9.930164831964587, + "grad_norm": 0.3332271873950958, + "learning_rate": 0.0004811223776223776, + "loss": 3.4103, + "step": 34100 + }, + { + "epoch": 9.94472595957831, + "grad_norm": 0.3372020721435547, + "learning_rate": 0.00048094755244755245, + "loss": 3.4071, + "step": 34150 + }, + { + "epoch": 9.959287087192033, + "grad_norm": 0.35507315397262573, + "learning_rate": 0.0004807727272727272, + "loss": 3.3987, + "step": 34200 + }, + { + "epoch": 9.973848214805754, + "grad_norm": 0.31724709272384644, + "learning_rate": 0.00048059790209790205, + "loss": 3.4112, + "step": 34250 + }, + { + "epoch": 9.988409342419477, + "grad_norm": 0.33160942792892456, + "learning_rate": 0.00048042307692307685, + "loss": 3.4104, + "step": 34300 + }, + { + "epoch": 10.002912225522744, + "grad_norm": 0.3155555725097656, + "learning_rate": 0.0004802482517482517, + "loss": 3.39, + "step": 34350 + }, + { + "epoch": 10.017473353136467, + "grad_norm": 0.364051878452301, + "learning_rate": 0.0004800734265734265, + "loss": 3.3051, + "step": 34400 + }, + { + "epoch": 10.03203448075019, + "grad_norm": 
0.3165052533149719, + "learning_rate": 0.00047989860139860136, + "loss": 3.3003, + "step": 34450 + }, + { + "epoch": 10.046595608363912, + "grad_norm": 0.3623712956905365, + "learning_rate": 0.00047972377622377616, + "loss": 3.312, + "step": 34500 + }, + { + "epoch": 10.061156735977635, + "grad_norm": 0.33263257145881653, + "learning_rate": 0.000479548951048951, + "loss": 3.3194, + "step": 34550 + }, + { + "epoch": 10.075717863591356, + "grad_norm": 0.3498550355434418, + "learning_rate": 0.0004793741258741258, + "loss": 3.3114, + "step": 34600 + }, + { + "epoch": 10.090278991205079, + "grad_norm": 0.32290056347846985, + "learning_rate": 0.00047919930069930067, + "loss": 3.3143, + "step": 34650 + }, + { + "epoch": 10.104840118818801, + "grad_norm": 0.3660222589969635, + "learning_rate": 0.0004790244755244755, + "loss": 3.3239, + "step": 34700 + }, + { + "epoch": 10.119401246432524, + "grad_norm": 0.3473450839519501, + "learning_rate": 0.0004788496503496503, + "loss": 3.3249, + "step": 34750 + }, + { + "epoch": 10.133962374046247, + "grad_norm": 0.3577515482902527, + "learning_rate": 0.0004786748251748252, + "loss": 3.3254, + "step": 34800 + }, + { + "epoch": 10.148523501659968, + "grad_norm": 0.33307579159736633, + "learning_rate": 0.0004785, + "loss": 3.3295, + "step": 34850 + }, + { + "epoch": 10.16308462927369, + "grad_norm": 0.3364368677139282, + "learning_rate": 0.00047832517482517483, + "loss": 3.3365, + "step": 34900 + }, + { + "epoch": 10.177645756887413, + "grad_norm": 0.3440118432044983, + "learning_rate": 0.0004781503496503496, + "loss": 3.3399, + "step": 34950 + }, + { + "epoch": 10.192206884501136, + "grad_norm": 0.34188905358314514, + "learning_rate": 0.00047797552447552443, + "loss": 3.3375, + "step": 35000 + }, + { + "epoch": 10.192206884501136, + "eval_accuracy": 0.36812226417117305, + "eval_loss": 3.572165012359619, + "eval_runtime": 180.6318, + "eval_samples_per_second": 92.154, + "eval_steps_per_second": 5.763, + "step": 35000 + }, + { + "epoch": 
10.206768012114859, + "grad_norm": 0.3428874909877777, + "learning_rate": 0.00047780069930069923, + "loss": 3.3332, + "step": 35050 + }, + { + "epoch": 10.221329139728581, + "grad_norm": 0.3249208927154541, + "learning_rate": 0.0004776258741258741, + "loss": 3.3323, + "step": 35100 + }, + { + "epoch": 10.235890267342302, + "grad_norm": 0.3354872167110443, + "learning_rate": 0.0004774510489510489, + "loss": 3.3385, + "step": 35150 + }, + { + "epoch": 10.250451394956025, + "grad_norm": 0.33717766404151917, + "learning_rate": 0.00047727622377622374, + "loss": 3.3466, + "step": 35200 + }, + { + "epoch": 10.265012522569748, + "grad_norm": 0.37344837188720703, + "learning_rate": 0.00047710139860139854, + "loss": 3.3522, + "step": 35250 + }, + { + "epoch": 10.27957365018347, + "grad_norm": 0.3368532061576843, + "learning_rate": 0.0004769265734265734, + "loss": 3.3487, + "step": 35300 + }, + { + "epoch": 10.294134777797193, + "grad_norm": 0.3481936752796173, + "learning_rate": 0.0004767517482517482, + "loss": 3.3462, + "step": 35350 + }, + { + "epoch": 10.308695905410914, + "grad_norm": 0.3315824866294861, + "learning_rate": 0.00047657692307692304, + "loss": 3.3595, + "step": 35400 + }, + { + "epoch": 10.323257033024637, + "grad_norm": 0.329924076795578, + "learning_rate": 0.0004764020979020979, + "loss": 3.3562, + "step": 35450 + }, + { + "epoch": 10.33781816063836, + "grad_norm": 0.356904000043869, + "learning_rate": 0.0004762272727272727, + "loss": 3.3541, + "step": 35500 + }, + { + "epoch": 10.352379288252083, + "grad_norm": 0.33224600553512573, + "learning_rate": 0.00047605244755244755, + "loss": 3.3563, + "step": 35550 + }, + { + "epoch": 10.366940415865805, + "grad_norm": 0.33060455322265625, + "learning_rate": 0.00047587762237762235, + "loss": 3.3526, + "step": 35600 + }, + { + "epoch": 10.381501543479526, + "grad_norm": 0.3506428599357605, + "learning_rate": 0.0004757027972027972, + "loss": 3.3612, + "step": 35650 + }, + { + "epoch": 10.396062671093249, + 
"grad_norm": 0.3286277651786804, + "learning_rate": 0.00047552797202797195, + "loss": 3.3555, + "step": 35700 + }, + { + "epoch": 10.410623798706972, + "grad_norm": 0.3427405059337616, + "learning_rate": 0.0004753531468531468, + "loss": 3.3529, + "step": 35750 + }, + { + "epoch": 10.425184926320695, + "grad_norm": 0.36271488666534424, + "learning_rate": 0.0004751783216783216, + "loss": 3.3544, + "step": 35800 + }, + { + "epoch": 10.439746053934417, + "grad_norm": 0.3364131450653076, + "learning_rate": 0.00047500349650349646, + "loss": 3.3693, + "step": 35850 + }, + { + "epoch": 10.454307181548138, + "grad_norm": 0.34665197134017944, + "learning_rate": 0.00047482867132867126, + "loss": 3.3763, + "step": 35900 + }, + { + "epoch": 10.468868309161861, + "grad_norm": 0.4717924892902374, + "learning_rate": 0.0004746538461538461, + "loss": 3.3666, + "step": 35950 + }, + { + "epoch": 10.483429436775584, + "grad_norm": 0.32863062620162964, + "learning_rate": 0.0004744790209790209, + "loss": 3.3636, + "step": 36000 + }, + { + "epoch": 10.483429436775584, + "eval_accuracy": 0.36863377907587797, + "eval_loss": 3.5674543380737305, + "eval_runtime": 180.888, + "eval_samples_per_second": 92.024, + "eval_steps_per_second": 5.755, + "step": 36000 + }, + { + "epoch": 10.497990564389307, + "grad_norm": 0.3689253628253937, + "learning_rate": 0.00047430419580419576, + "loss": 3.3647, + "step": 36050 + }, + { + "epoch": 10.51255169200303, + "grad_norm": 0.33551663160324097, + "learning_rate": 0.0004741293706293706, + "loss": 3.3471, + "step": 36100 + }, + { + "epoch": 10.52711281961675, + "grad_norm": 0.32269003987312317, + "learning_rate": 0.0004739545454545454, + "loss": 3.3725, + "step": 36150 + }, + { + "epoch": 10.541673947230473, + "grad_norm": 0.3469913899898529, + "learning_rate": 0.00047377972027972027, + "loss": 3.3801, + "step": 36200 + }, + { + "epoch": 10.556235074844196, + "grad_norm": 0.32835647463798523, + "learning_rate": 0.00047360489510489507, + "loss": 3.3726, + 
"step": 36250 + }, + { + "epoch": 10.570796202457919, + "grad_norm": 0.34614914655685425, + "learning_rate": 0.0004734300699300699, + "loss": 3.3698, + "step": 36300 + }, + { + "epoch": 10.585357330071641, + "grad_norm": 0.3445773720741272, + "learning_rate": 0.0004732552447552447, + "loss": 3.3589, + "step": 36350 + }, + { + "epoch": 10.599918457685362, + "grad_norm": 0.33179157972335815, + "learning_rate": 0.0004730804195804196, + "loss": 3.3738, + "step": 36400 + }, + { + "epoch": 10.614479585299085, + "grad_norm": 0.34509310126304626, + "learning_rate": 0.0004729055944055943, + "loss": 3.37, + "step": 36450 + }, + { + "epoch": 10.629040712912808, + "grad_norm": 0.34073859453201294, + "learning_rate": 0.0004727307692307692, + "loss": 3.3782, + "step": 36500 + }, + { + "epoch": 10.64360184052653, + "grad_norm": 0.33959853649139404, + "learning_rate": 0.000472555944055944, + "loss": 3.3636, + "step": 36550 + }, + { + "epoch": 10.658162968140253, + "grad_norm": 0.3635091483592987, + "learning_rate": 0.00047238111888111883, + "loss": 3.3765, + "step": 36600 + }, + { + "epoch": 10.672724095753976, + "grad_norm": 0.3816806375980377, + "learning_rate": 0.00047220629370629363, + "loss": 3.3727, + "step": 36650 + }, + { + "epoch": 10.687285223367697, + "grad_norm": 0.44146403670310974, + "learning_rate": 0.0004720314685314685, + "loss": 3.3843, + "step": 36700 + }, + { + "epoch": 10.70184635098142, + "grad_norm": 0.3365342319011688, + "learning_rate": 0.0004718566433566433, + "loss": 3.3746, + "step": 36750 + }, + { + "epoch": 10.716407478595142, + "grad_norm": 0.3512522578239441, + "learning_rate": 0.00047168181818181814, + "loss": 3.3824, + "step": 36800 + }, + { + "epoch": 10.730968606208865, + "grad_norm": 0.34449753165245056, + "learning_rate": 0.000471506993006993, + "loss": 3.3708, + "step": 36850 + }, + { + "epoch": 10.745529733822588, + "grad_norm": 0.3496144115924835, + "learning_rate": 0.0004713321678321678, + "loss": 3.3733, + "step": 36900 + }, + { + 
"epoch": 10.760090861436309, + "grad_norm": 0.34519726037979126, + "learning_rate": 0.00047115734265734265, + "loss": 3.3864, + "step": 36950 + }, + { + "epoch": 10.774651989050032, + "grad_norm": 0.362666517496109, + "learning_rate": 0.00047098251748251745, + "loss": 3.3783, + "step": 37000 + }, + { + "epoch": 10.774651989050032, + "eval_accuracy": 0.36897800239028294, + "eval_loss": 3.558612108230591, + "eval_runtime": 181.139, + "eval_samples_per_second": 91.896, + "eval_steps_per_second": 5.747, + "step": 37000 + }, + { + "epoch": 10.789213116663754, + "grad_norm": 0.31553319096565247, + "learning_rate": 0.0004708076923076923, + "loss": 3.3889, + "step": 37050 + }, + { + "epoch": 10.803774244277477, + "grad_norm": 0.33135709166526794, + "learning_rate": 0.0004706328671328671, + "loss": 3.3838, + "step": 37100 + }, + { + "epoch": 10.8183353718912, + "grad_norm": 0.3278547525405884, + "learning_rate": 0.00047045804195804195, + "loss": 3.3758, + "step": 37150 + }, + { + "epoch": 10.83289649950492, + "grad_norm": 0.3490781784057617, + "learning_rate": 0.0004702832167832167, + "loss": 3.396, + "step": 37200 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 0.35344281792640686, + "learning_rate": 0.00047010839160839155, + "loss": 3.3873, + "step": 37250 + }, + { + "epoch": 10.862018754732366, + "grad_norm": 0.34710001945495605, + "learning_rate": 0.00046993356643356635, + "loss": 3.3902, + "step": 37300 + }, + { + "epoch": 10.876579882346089, + "grad_norm": 0.36521968245506287, + "learning_rate": 0.0004697587412587412, + "loss": 3.3968, + "step": 37350 + }, + { + "epoch": 10.891141009959812, + "grad_norm": 0.33776748180389404, + "learning_rate": 0.000469583916083916, + "loss": 3.3824, + "step": 37400 + }, + { + "epoch": 10.905702137573535, + "grad_norm": 0.3304058015346527, + "learning_rate": 0.00046940909090909086, + "loss": 3.3784, + "step": 37450 + }, + { + "epoch": 10.920263265187256, + "grad_norm": 0.3333076536655426, + "learning_rate": 
0.0004692342657342657, + "loss": 3.3872, + "step": 37500 + }, + { + "epoch": 10.934824392800978, + "grad_norm": 0.3497754633426666, + "learning_rate": 0.0004690594405594405, + "loss": 3.3924, + "step": 37550 + }, + { + "epoch": 10.949385520414701, + "grad_norm": 0.3230236768722534, + "learning_rate": 0.00046888461538461537, + "loss": 3.3794, + "step": 37600 + }, + { + "epoch": 10.963946648028424, + "grad_norm": 0.34919098019599915, + "learning_rate": 0.00046870979020979017, + "loss": 3.3816, + "step": 37650 + }, + { + "epoch": 10.978507775642147, + "grad_norm": 0.32327717542648315, + "learning_rate": 0.000468534965034965, + "loss": 3.3837, + "step": 37700 + }, + { + "epoch": 10.993068903255867, + "grad_norm": 0.3353258967399597, + "learning_rate": 0.0004683601398601398, + "loss": 3.3962, + "step": 37750 + }, + { + "epoch": 11.007571786359136, + "grad_norm": 0.3746544122695923, + "learning_rate": 0.0004681853146853147, + "loss": 3.3268, + "step": 37800 + }, + { + "epoch": 11.022132913972857, + "grad_norm": 0.37094318866729736, + "learning_rate": 0.0004680104895104895, + "loss": 3.2658, + "step": 37850 + }, + { + "epoch": 11.03669404158658, + "grad_norm": 0.3441932797431946, + "learning_rate": 0.00046783566433566433, + "loss": 3.2796, + "step": 37900 + }, + { + "epoch": 11.051255169200303, + "grad_norm": 0.31547367572784424, + "learning_rate": 0.0004676608391608391, + "loss": 3.2876, + "step": 37950 + }, + { + "epoch": 11.065816296814026, + "grad_norm": 0.36404022574424744, + "learning_rate": 0.00046748601398601393, + "loss": 3.2792, + "step": 38000 + }, + { + "epoch": 11.065816296814026, + "eval_accuracy": 0.3686197891255999, + "eval_loss": 3.572824001312256, + "eval_runtime": 180.8426, + "eval_samples_per_second": 92.047, + "eval_steps_per_second": 5.756, + "step": 38000 + }, + { + "epoch": 11.080377424427748, + "grad_norm": 0.35687056183815, + "learning_rate": 0.00046731118881118873, + "loss": 3.2936, + "step": 38050 + }, + { + "epoch": 11.09493855204147, + 
"grad_norm": 0.33898624777793884, + "learning_rate": 0.0004671363636363636, + "loss": 3.2977, + "step": 38100 + }, + { + "epoch": 11.109499679655192, + "grad_norm": 0.33681026101112366, + "learning_rate": 0.00046696153846153844, + "loss": 3.3047, + "step": 38150 + }, + { + "epoch": 11.124060807268915, + "grad_norm": 0.3504103124141693, + "learning_rate": 0.00046678671328671324, + "loss": 3.3054, + "step": 38200 + }, + { + "epoch": 11.138621934882638, + "grad_norm": 0.35253238677978516, + "learning_rate": 0.0004666118881118881, + "loss": 3.3048, + "step": 38250 + }, + { + "epoch": 11.15318306249636, + "grad_norm": 0.37646666169166565, + "learning_rate": 0.0004664370629370629, + "loss": 3.2982, + "step": 38300 + }, + { + "epoch": 11.167744190110081, + "grad_norm": 0.32107439637184143, + "learning_rate": 0.00046626223776223774, + "loss": 3.2995, + "step": 38350 + }, + { + "epoch": 11.182305317723804, + "grad_norm": 0.37649980187416077, + "learning_rate": 0.00046608741258741254, + "loss": 3.304, + "step": 38400 + }, + { + "epoch": 11.196866445337527, + "grad_norm": 0.365675687789917, + "learning_rate": 0.0004659125874125874, + "loss": 3.3121, + "step": 38450 + }, + { + "epoch": 11.21142757295125, + "grad_norm": 0.3648703396320343, + "learning_rate": 0.0004657377622377622, + "loss": 3.3139, + "step": 38500 + }, + { + "epoch": 11.225988700564972, + "grad_norm": 0.34117260575294495, + "learning_rate": 0.00046556293706293705, + "loss": 3.3322, + "step": 38550 + }, + { + "epoch": 11.240549828178693, + "grad_norm": 0.3431043028831482, + "learning_rate": 0.00046538811188811185, + "loss": 3.312, + "step": 38600 + }, + { + "epoch": 11.255110955792416, + "grad_norm": 0.34056955575942993, + "learning_rate": 0.0004652132867132867, + "loss": 3.3255, + "step": 38650 + }, + { + "epoch": 11.269672083406139, + "grad_norm": 0.3726236820220947, + "learning_rate": 0.00046503846153846145, + "loss": 3.3273, + "step": 38700 + }, + { + "epoch": 11.284233211019862, + "grad_norm": 
0.3636229336261749, + "learning_rate": 0.0004648636363636363, + "loss": 3.3231, + "step": 38750 + }, + { + "epoch": 11.298794338633584, + "grad_norm": 0.3742581903934479, + "learning_rate": 0.0004646888111888111, + "loss": 3.3272, + "step": 38800 + }, + { + "epoch": 11.313355466247307, + "grad_norm": 0.3781585693359375, + "learning_rate": 0.00046451398601398596, + "loss": 3.3396, + "step": 38850 + }, + { + "epoch": 11.327916593861028, + "grad_norm": 0.349326491355896, + "learning_rate": 0.0004643391608391608, + "loss": 3.3466, + "step": 38900 + }, + { + "epoch": 11.34247772147475, + "grad_norm": 0.38100987672805786, + "learning_rate": 0.0004641643356643356, + "loss": 3.3351, + "step": 38950 + }, + { + "epoch": 11.357038849088473, + "grad_norm": 0.34632235765457153, + "learning_rate": 0.00046398951048951046, + "loss": 3.3318, + "step": 39000 + }, + { + "epoch": 11.357038849088473, + "eval_accuracy": 0.36974063102434884, + "eval_loss": 3.5641088485717773, + "eval_runtime": 180.754, + "eval_samples_per_second": 92.092, + "eval_steps_per_second": 5.759, + "step": 39000 + }, + { + "epoch": 11.371599976702196, + "grad_norm": 0.36289188265800476, + "learning_rate": 0.00046381468531468526, + "loss": 3.3394, + "step": 39050 + }, + { + "epoch": 11.386161104315919, + "grad_norm": 0.32132551074028015, + "learning_rate": 0.0004636398601398601, + "loss": 3.3275, + "step": 39100 + }, + { + "epoch": 11.40072223192964, + "grad_norm": 0.3607240617275238, + "learning_rate": 0.0004634650349650349, + "loss": 3.3365, + "step": 39150 + }, + { + "epoch": 11.415283359543363, + "grad_norm": 0.3579674959182739, + "learning_rate": 0.00046329020979020977, + "loss": 3.3283, + "step": 39200 + }, + { + "epoch": 11.429844487157085, + "grad_norm": 0.35449928045272827, + "learning_rate": 0.00046311538461538457, + "loss": 3.3373, + "step": 39250 + }, + { + "epoch": 11.444405614770808, + "grad_norm": 0.3389507234096527, + "learning_rate": 0.0004629405594405594, + "loss": 3.3359, + "step": 39300 + }, + 
{ + "epoch": 11.458966742384531, + "grad_norm": 0.35343387722969055, + "learning_rate": 0.0004627657342657342, + "loss": 3.3483, + "step": 39350 + }, + { + "epoch": 11.473527869998252, + "grad_norm": 0.3741236627101898, + "learning_rate": 0.0004625909090909091, + "loss": 3.3399, + "step": 39400 + }, + { + "epoch": 11.488088997611975, + "grad_norm": 0.3604460656642914, + "learning_rate": 0.0004624160839160838, + "loss": 3.3536, + "step": 39450 + }, + { + "epoch": 11.502650125225697, + "grad_norm": 0.374704509973526, + "learning_rate": 0.0004622412587412587, + "loss": 3.3552, + "step": 39500 + }, + { + "epoch": 11.51721125283942, + "grad_norm": 0.34912702441215515, + "learning_rate": 0.00046206643356643353, + "loss": 3.3618, + "step": 39550 + }, + { + "epoch": 11.531772380453143, + "grad_norm": 0.35853371024131775, + "learning_rate": 0.00046189160839160833, + "loss": 3.3609, + "step": 39600 + }, + { + "epoch": 11.546333508066864, + "grad_norm": 0.34523260593414307, + "learning_rate": 0.0004617167832167832, + "loss": 3.3525, + "step": 39650 + }, + { + "epoch": 11.560894635680587, + "grad_norm": 0.32813966274261475, + "learning_rate": 0.000461541958041958, + "loss": 3.3592, + "step": 39700 + }, + { + "epoch": 11.57545576329431, + "grad_norm": 0.3415012061595917, + "learning_rate": 0.00046136713286713284, + "loss": 3.342, + "step": 39750 + }, + { + "epoch": 11.590016890908032, + "grad_norm": 0.34844455122947693, + "learning_rate": 0.00046119230769230764, + "loss": 3.3556, + "step": 39800 + }, + { + "epoch": 11.604578018521755, + "grad_norm": 0.37255018949508667, + "learning_rate": 0.0004610174825174825, + "loss": 3.3545, + "step": 39850 + }, + { + "epoch": 11.619139146135478, + "grad_norm": 0.3670998215675354, + "learning_rate": 0.0004608426573426573, + "loss": 3.359, + "step": 39900 + }, + { + "epoch": 11.633700273749199, + "grad_norm": 0.32997700572013855, + "learning_rate": 0.00046066783216783215, + "loss": 3.3547, + "step": 39950 + }, + { + "epoch": 
11.648261401362921, + "grad_norm": 0.3327796757221222, + "learning_rate": 0.00046049300699300695, + "loss": 3.3586, + "step": 40000 + }, + { + "epoch": 11.648261401362921, + "eval_accuracy": 0.3697354582696242, + "eval_loss": 3.558627128601074, + "eval_runtime": 180.8619, + "eval_samples_per_second": 92.037, + "eval_steps_per_second": 5.756, + "step": 40000 + }, + { + "epoch": 11.662822528976644, + "grad_norm": 0.3510940670967102, + "learning_rate": 0.0004603181818181818, + "loss": 3.361, + "step": 40050 + }, + { + "epoch": 11.677383656590367, + "grad_norm": 0.3497365117073059, + "learning_rate": 0.0004601433566433566, + "loss": 3.3581, + "step": 40100 + }, + { + "epoch": 11.69194478420409, + "grad_norm": 0.35127291083335876, + "learning_rate": 0.00045996853146853145, + "loss": 3.3681, + "step": 40150 + }, + { + "epoch": 11.70650591181781, + "grad_norm": 0.3371525704860687, + "learning_rate": 0.0004597937062937062, + "loss": 3.3521, + "step": 40200 + }, + { + "epoch": 11.721067039431533, + "grad_norm": 0.3377763330936432, + "learning_rate": 0.00045961888111888105, + "loss": 3.3748, + "step": 40250 + }, + { + "epoch": 11.735628167045256, + "grad_norm": 0.3272308111190796, + "learning_rate": 0.0004594440559440559, + "loss": 3.3714, + "step": 40300 + }, + { + "epoch": 11.750189294658979, + "grad_norm": 0.3513683080673218, + "learning_rate": 0.0004592692307692307, + "loss": 3.3533, + "step": 40350 + }, + { + "epoch": 11.764750422272702, + "grad_norm": 0.34825000166893005, + "learning_rate": 0.00045909440559440556, + "loss": 3.372, + "step": 40400 + }, + { + "epoch": 11.779311549886422, + "grad_norm": 0.3494202494621277, + "learning_rate": 0.00045891958041958036, + "loss": 3.3673, + "step": 40450 + }, + { + "epoch": 11.793872677500145, + "grad_norm": 0.357833594083786, + "learning_rate": 0.0004587447552447552, + "loss": 3.3581, + "step": 40500 + }, + { + "epoch": 11.808433805113868, + "grad_norm": 0.32943588495254517, + "learning_rate": 0.00045856993006993, + "loss": 
3.3642, + "step": 40550 + }, + { + "epoch": 11.82299493272759, + "grad_norm": 0.3362137973308563, + "learning_rate": 0.00045839510489510487, + "loss": 3.3649, + "step": 40600 + }, + { + "epoch": 11.837556060341313, + "grad_norm": 0.35426944494247437, + "learning_rate": 0.00045822027972027967, + "loss": 3.3644, + "step": 40650 + }, + { + "epoch": 11.852117187955034, + "grad_norm": 0.3524153232574463, + "learning_rate": 0.0004580454545454545, + "loss": 3.3766, + "step": 40700 + }, + { + "epoch": 11.866678315568757, + "grad_norm": 0.34207257628440857, + "learning_rate": 0.0004578706293706293, + "loss": 3.356, + "step": 40750 + }, + { + "epoch": 11.88123944318248, + "grad_norm": 0.33475518226623535, + "learning_rate": 0.0004576958041958042, + "loss": 3.3619, + "step": 40800 + }, + { + "epoch": 11.895800570796203, + "grad_norm": 0.3188439607620239, + "learning_rate": 0.000457520979020979, + "loss": 3.3655, + "step": 40850 + }, + { + "epoch": 11.910361698409925, + "grad_norm": 0.3339869976043701, + "learning_rate": 0.00045734615384615383, + "loss": 3.3671, + "step": 40900 + }, + { + "epoch": 11.924922826023646, + "grad_norm": 0.3358663022518158, + "learning_rate": 0.0004571713286713287, + "loss": 3.3729, + "step": 40950 + }, + { + "epoch": 11.93948395363737, + "grad_norm": 0.3430688679218292, + "learning_rate": 0.00045699650349650343, + "loss": 3.355, + "step": 41000 + }, + { + "epoch": 11.93948395363737, + "eval_accuracy": 0.37045693999110757, + "eval_loss": 3.551067590713501, + "eval_runtime": 180.9029, + "eval_samples_per_second": 92.016, + "eval_steps_per_second": 5.754, + "step": 41000 + }, + { + "epoch": 11.954045081251092, + "grad_norm": 0.3405440151691437, + "learning_rate": 0.0004568216783216783, + "loss": 3.3736, + "step": 41050 + }, + { + "epoch": 11.968606208864815, + "grad_norm": 0.3274223208427429, + "learning_rate": 0.0004566468531468531, + "loss": 3.3775, + "step": 41100 + }, + { + "epoch": 11.983167336478537, + "grad_norm": 0.34064793586730957, + 
"learning_rate": 0.00045647202797202794, + "loss": 3.3726, + "step": 41150 + }, + { + "epoch": 11.99772846409226, + "grad_norm": 0.4026555120944977, + "learning_rate": 0.00045629720279720274, + "loss": 3.3781, + "step": 41200 + }, + { + "epoch": 12.012231347195527, + "grad_norm": 0.341229110956192, + "learning_rate": 0.0004561223776223776, + "loss": 3.2739, + "step": 41250 + }, + { + "epoch": 12.02679247480925, + "grad_norm": 0.3594837486743927, + "learning_rate": 0.0004559475524475524, + "loss": 3.262, + "step": 41300 + }, + { + "epoch": 12.041353602422971, + "grad_norm": 0.3388396203517914, + "learning_rate": 0.00045577272727272724, + "loss": 3.257, + "step": 41350 + }, + { + "epoch": 12.055914730036694, + "grad_norm": 0.36203816533088684, + "learning_rate": 0.00045559790209790204, + "loss": 3.2833, + "step": 41400 + }, + { + "epoch": 12.070475857650417, + "grad_norm": 0.37606170773506165, + "learning_rate": 0.0004554230769230769, + "loss": 3.2727, + "step": 41450 + }, + { + "epoch": 12.08503698526414, + "grad_norm": 0.33072522282600403, + "learning_rate": 0.0004552482517482517, + "loss": 3.2844, + "step": 41500 + }, + { + "epoch": 12.099598112877862, + "grad_norm": 0.3422619104385376, + "learning_rate": 0.00045507342657342655, + "loss": 3.2729, + "step": 41550 + }, + { + "epoch": 12.114159240491583, + "grad_norm": 0.33112087845802307, + "learning_rate": 0.00045489860139860135, + "loss": 3.2855, + "step": 41600 + }, + { + "epoch": 12.128720368105306, + "grad_norm": 0.3561849296092987, + "learning_rate": 0.0004547237762237762, + "loss": 3.2809, + "step": 41650 + }, + { + "epoch": 12.143281495719028, + "grad_norm": 0.356389582157135, + "learning_rate": 0.00045454895104895106, + "loss": 3.2899, + "step": 41700 + }, + { + "epoch": 12.157842623332751, + "grad_norm": 0.37144315242767334, + "learning_rate": 0.0004543741258741258, + "loss": 3.2904, + "step": 41750 + }, + { + "epoch": 12.172403750946474, + "grad_norm": 0.3500952124595642, + "learning_rate": 
0.00045419930069930066, + "loss": 3.2895, + "step": 41800 + }, + { + "epoch": 12.186964878560195, + "grad_norm": 0.3198910355567932, + "learning_rate": 0.00045402447552447546, + "loss": 3.2839, + "step": 41850 + }, + { + "epoch": 12.201526006173918, + "grad_norm": 0.37378618121147156, + "learning_rate": 0.0004538496503496503, + "loss": 3.3, + "step": 41900 + }, + { + "epoch": 12.21608713378764, + "grad_norm": 0.36039999127388, + "learning_rate": 0.0004536748251748251, + "loss": 3.2992, + "step": 41950 + }, + { + "epoch": 12.230648261401363, + "grad_norm": 0.3335278630256653, + "learning_rate": 0.00045349999999999996, + "loss": 3.3012, + "step": 42000 + }, + { + "epoch": 12.230648261401363, + "eval_accuracy": 0.3701739667951469, + "eval_loss": 3.5618510246276855, + "eval_runtime": 180.658, + "eval_samples_per_second": 92.141, + "eval_steps_per_second": 5.762, + "step": 42000 + }, + { + "epoch": 12.245209389015086, + "grad_norm": 0.38318052887916565, + "learning_rate": 0.00045332517482517476, + "loss": 3.3091, + "step": 42050 + }, + { + "epoch": 12.259770516628807, + "grad_norm": 0.3642372786998749, + "learning_rate": 0.0004531503496503496, + "loss": 3.3112, + "step": 42100 + }, + { + "epoch": 12.27433164424253, + "grad_norm": 0.3355906307697296, + "learning_rate": 0.0004529755244755244, + "loss": 3.3095, + "step": 42150 + }, + { + "epoch": 12.288892771856252, + "grad_norm": 0.3374500274658203, + "learning_rate": 0.00045280069930069927, + "loss": 3.3135, + "step": 42200 + }, + { + "epoch": 12.303453899469975, + "grad_norm": 0.3533301055431366, + "learning_rate": 0.00045262587412587407, + "loss": 3.3096, + "step": 42250 + }, + { + "epoch": 12.318015027083698, + "grad_norm": 0.34568679332733154, + "learning_rate": 0.0004524510489510489, + "loss": 3.3137, + "step": 42300 + }, + { + "epoch": 12.33257615469742, + "grad_norm": 0.3552064597606659, + "learning_rate": 0.0004522762237762238, + "loss": 3.3188, + "step": 42350 + }, + { + "epoch": 12.347137282311142, + 
"grad_norm": 0.365732878446579, + "learning_rate": 0.0004521013986013986, + "loss": 3.3069, + "step": 42400 + }, + { + "epoch": 12.361698409924864, + "grad_norm": 0.32813093066215515, + "learning_rate": 0.00045192657342657343, + "loss": 3.3176, + "step": 42450 + }, + { + "epoch": 12.376259537538587, + "grad_norm": 0.35801804065704346, + "learning_rate": 0.0004517517482517482, + "loss": 3.3266, + "step": 42500 + }, + { + "epoch": 12.39082066515231, + "grad_norm": 0.3690122067928314, + "learning_rate": 0.00045157692307692303, + "loss": 3.3077, + "step": 42550 + }, + { + "epoch": 12.405381792766033, + "grad_norm": 0.3640967905521393, + "learning_rate": 0.00045140209790209783, + "loss": 3.3327, + "step": 42600 + }, + { + "epoch": 12.419942920379754, + "grad_norm": 0.3548601567745209, + "learning_rate": 0.0004512272727272727, + "loss": 3.3232, + "step": 42650 + }, + { + "epoch": 12.434504047993476, + "grad_norm": 0.34999150037765503, + "learning_rate": 0.0004510524475524475, + "loss": 3.3247, + "step": 42700 + }, + { + "epoch": 12.449065175607199, + "grad_norm": 0.36435720324516296, + "learning_rate": 0.00045087762237762234, + "loss": 3.3183, + "step": 42750 + }, + { + "epoch": 12.463626303220922, + "grad_norm": 0.36453813314437866, + "learning_rate": 0.00045070279720279714, + "loss": 3.3269, + "step": 42800 + }, + { + "epoch": 12.478187430834645, + "grad_norm": 0.35878297686576843, + "learning_rate": 0.000450527972027972, + "loss": 3.3212, + "step": 42850 + }, + { + "epoch": 12.492748558448366, + "grad_norm": 0.36037617921829224, + "learning_rate": 0.0004503531468531468, + "loss": 3.3282, + "step": 42900 + }, + { + "epoch": 12.507309686062088, + "grad_norm": 0.35079270601272583, + "learning_rate": 0.00045017832167832165, + "loss": 3.3298, + "step": 42950 + }, + { + "epoch": 12.521870813675811, + "grad_norm": 0.3421091139316559, + "learning_rate": 0.0004500034965034965, + "loss": 3.3375, + "step": 43000 + }, + { + "epoch": 12.521870813675811, + "eval_accuracy": 
0.370077447894489, + "eval_loss": 3.5572597980499268, + "eval_runtime": 180.7328, + "eval_samples_per_second": 92.103, + "eval_steps_per_second": 5.76, + "step": 43000 + }, + { + "epoch": 12.536431941289534, + "grad_norm": 0.33686643838882446, + "learning_rate": 0.0004498286713286713, + "loss": 3.3295, + "step": 43050 + }, + { + "epoch": 12.550993068903256, + "grad_norm": 0.3493344783782959, + "learning_rate": 0.00044965384615384615, + "loss": 3.3336, + "step": 43100 + }, + { + "epoch": 12.565554196516977, + "grad_norm": 0.3651737868785858, + "learning_rate": 0.00044947902097902095, + "loss": 3.3356, + "step": 43150 + }, + { + "epoch": 12.5801153241307, + "grad_norm": 0.35403475165367126, + "learning_rate": 0.0004493041958041958, + "loss": 3.3392, + "step": 43200 + }, + { + "epoch": 12.594676451744423, + "grad_norm": 0.35346367955207825, + "learning_rate": 0.00044912937062937055, + "loss": 3.3331, + "step": 43250 + }, + { + "epoch": 12.609237579358146, + "grad_norm": 0.3847574293613434, + "learning_rate": 0.0004489545454545454, + "loss": 3.3227, + "step": 43300 + }, + { + "epoch": 12.623798706971868, + "grad_norm": 0.32844939827919006, + "learning_rate": 0.0004487797202797202, + "loss": 3.3305, + "step": 43350 + }, + { + "epoch": 12.63835983458559, + "grad_norm": 0.3514168858528137, + "learning_rate": 0.00044860489510489506, + "loss": 3.3346, + "step": 43400 + }, + { + "epoch": 12.652920962199312, + "grad_norm": 0.3668827712535858, + "learning_rate": 0.00044843006993006986, + "loss": 3.3453, + "step": 43450 + }, + { + "epoch": 12.667482089813035, + "grad_norm": 0.35664016008377075, + "learning_rate": 0.0004482552447552447, + "loss": 3.3523, + "step": 43500 + }, + { + "epoch": 12.682043217426758, + "grad_norm": 0.3246038854122162, + "learning_rate": 0.0004480804195804195, + "loss": 3.3459, + "step": 43550 + }, + { + "epoch": 12.69660434504048, + "grad_norm": 0.39765822887420654, + "learning_rate": 0.00044790559440559437, + "loss": 3.3396, + "step": 43600 + }, + { + 
"epoch": 12.711165472654203, + "grad_norm": 0.404825896024704, + "learning_rate": 0.00044773076923076917, + "loss": 3.3458, + "step": 43650 + }, + { + "epoch": 12.725726600267924, + "grad_norm": 0.3610759675502777, + "learning_rate": 0.000447555944055944, + "loss": 3.344, + "step": 43700 + }, + { + "epoch": 12.740287727881647, + "grad_norm": 0.3398517668247223, + "learning_rate": 0.0004473811188811189, + "loss": 3.3499, + "step": 43750 + }, + { + "epoch": 12.75484885549537, + "grad_norm": 0.35025089979171753, + "learning_rate": 0.0004472062937062937, + "loss": 3.3479, + "step": 43800 + }, + { + "epoch": 12.769409983109092, + "grad_norm": 0.35315173864364624, + "learning_rate": 0.00044703146853146853, + "loss": 3.3462, + "step": 43850 + }, + { + "epoch": 12.783971110722815, + "grad_norm": 0.35999706387519836, + "learning_rate": 0.00044685664335664333, + "loss": 3.348, + "step": 43900 + }, + { + "epoch": 12.798532238336536, + "grad_norm": 0.33961042761802673, + "learning_rate": 0.0004466818181818182, + "loss": 3.3528, + "step": 43950 + }, + { + "epoch": 12.813093365950259, + "grad_norm": 0.34737616777420044, + "learning_rate": 0.00044650699300699293, + "loss": 3.3423, + "step": 44000 + }, + { + "epoch": 12.813093365950259, + "eval_accuracy": 0.3708158586314349, + "eval_loss": 3.547250986099243, + "eval_runtime": 180.8057, + "eval_samples_per_second": 92.066, + "eval_steps_per_second": 5.758, + "step": 44000 + }, + { + "epoch": 12.827654493563982, + "grad_norm": 0.372464656829834, + "learning_rate": 0.0004463321678321678, + "loss": 3.3532, + "step": 44050 + }, + { + "epoch": 12.842215621177704, + "grad_norm": 0.346667617559433, + "learning_rate": 0.0004461573426573426, + "loss": 3.3469, + "step": 44100 + }, + { + "epoch": 12.856776748791427, + "grad_norm": 0.3418535888195038, + "learning_rate": 0.00044598251748251744, + "loss": 3.3397, + "step": 44150 + }, + { + "epoch": 12.871337876405148, + "grad_norm": 0.3593897223472595, + "learning_rate": 0.00044580769230769224, 
+ "loss": 3.3517, + "step": 44200 + }, + { + "epoch": 12.88589900401887, + "grad_norm": 0.3259468972682953, + "learning_rate": 0.0004456328671328671, + "loss": 3.3539, + "step": 44250 + }, + { + "epoch": 12.900460131632594, + "grad_norm": 0.3682238459587097, + "learning_rate": 0.0004454580419580419, + "loss": 3.3607, + "step": 44300 + }, + { + "epoch": 12.915021259246316, + "grad_norm": 0.33671849966049194, + "learning_rate": 0.00044528321678321674, + "loss": 3.3507, + "step": 44350 + }, + { + "epoch": 12.929582386860039, + "grad_norm": 0.3739355504512787, + "learning_rate": 0.0004451083916083916, + "loss": 3.3463, + "step": 44400 + }, + { + "epoch": 12.944143514473762, + "grad_norm": 0.3360922336578369, + "learning_rate": 0.0004449335664335664, + "loss": 3.3563, + "step": 44450 + }, + { + "epoch": 12.958704642087483, + "grad_norm": 0.38051101565361023, + "learning_rate": 0.00044475874125874125, + "loss": 3.3509, + "step": 44500 + }, + { + "epoch": 12.973265769701205, + "grad_norm": 0.35468021035194397, + "learning_rate": 0.00044458391608391605, + "loss": 3.3574, + "step": 44550 + }, + { + "epoch": 12.987826897314928, + "grad_norm": 0.33574724197387695, + "learning_rate": 0.0004444090909090909, + "loss": 3.3432, + "step": 44600 + }, + { + "epoch": 13.002329780418195, + "grad_norm": 0.3547254502773285, + "learning_rate": 0.0004442342657342657, + "loss": 3.3365, + "step": 44650 + }, + { + "epoch": 13.016890908031918, + "grad_norm": 0.3475710451602936, + "learning_rate": 0.00044405944055944056, + "loss": 3.232, + "step": 44700 + }, + { + "epoch": 13.031452035645641, + "grad_norm": 0.3456555902957916, + "learning_rate": 0.0004438846153846153, + "loss": 3.2463, + "step": 44750 + }, + { + "epoch": 13.046013163259364, + "grad_norm": 0.36879873275756836, + "learning_rate": 0.00044370979020979016, + "loss": 3.2534, + "step": 44800 + }, + { + "epoch": 13.060574290873085, + "grad_norm": 0.3598310649394989, + "learning_rate": 0.00044353496503496496, + "loss": 3.2438, + "step": 
44850 + }, + { + "epoch": 13.075135418486807, + "grad_norm": 0.35868072509765625, + "learning_rate": 0.0004433601398601398, + "loss": 3.2632, + "step": 44900 + }, + { + "epoch": 13.08969654610053, + "grad_norm": 0.33635494112968445, + "learning_rate": 0.0004431853146853146, + "loss": 3.2603, + "step": 44950 + }, + { + "epoch": 13.104257673714253, + "grad_norm": 0.36905378103256226, + "learning_rate": 0.00044301048951048946, + "loss": 3.258, + "step": 45000 + }, + { + "epoch": 13.104257673714253, + "eval_accuracy": 0.37040074506478055, + "eval_loss": 3.5601999759674072, + "eval_runtime": 180.8794, + "eval_samples_per_second": 92.028, + "eval_steps_per_second": 5.755, + "step": 45000 + }, + { + "epoch": 13.118818801327976, + "grad_norm": 0.3864152729511261, + "learning_rate": 0.00044283566433566426, + "loss": 3.2666, + "step": 45050 + }, + { + "epoch": 13.133379928941697, + "grad_norm": 0.3702114522457123, + "learning_rate": 0.0004426608391608391, + "loss": 3.2676, + "step": 45100 + }, + { + "epoch": 13.14794105655542, + "grad_norm": 0.35562071204185486, + "learning_rate": 0.00044248601398601397, + "loss": 3.2743, + "step": 45150 + }, + { + "epoch": 13.162502184169142, + "grad_norm": 0.3449855148792267, + "learning_rate": 0.00044231118881118877, + "loss": 3.2757, + "step": 45200 + }, + { + "epoch": 13.177063311782865, + "grad_norm": 0.34758079051971436, + "learning_rate": 0.0004421363636363636, + "loss": 3.2819, + "step": 45250 + }, + { + "epoch": 13.191624439396588, + "grad_norm": 0.3789484202861786, + "learning_rate": 0.0004419615384615384, + "loss": 3.2785, + "step": 45300 + }, + { + "epoch": 13.206185567010309, + "grad_norm": 0.3660043478012085, + "learning_rate": 0.0004417867132867133, + "loss": 3.2718, + "step": 45350 + }, + { + "epoch": 13.220746694624031, + "grad_norm": 0.392027348279953, + "learning_rate": 0.0004416118881118881, + "loss": 3.273, + "step": 45400 + }, + { + "epoch": 13.235307822237754, + "grad_norm": 0.4098648130893707, + "learning_rate": 
0.00044143706293706293, + "loss": 3.2767, + "step": 45450 + }, + { + "epoch": 13.249868949851477, + "grad_norm": 0.3648849129676819, + "learning_rate": 0.0004412622377622377, + "loss": 3.2936, + "step": 45500 + }, + { + "epoch": 13.2644300774652, + "grad_norm": 0.381013423204422, + "learning_rate": 0.00044108741258741253, + "loss": 3.2894, + "step": 45550 + }, + { + "epoch": 13.27899120507892, + "grad_norm": 0.344605416059494, + "learning_rate": 0.00044091258741258733, + "loss": 3.2903, + "step": 45600 + }, + { + "epoch": 13.293552332692643, + "grad_norm": 0.3393156826496124, + "learning_rate": 0.0004407377622377622, + "loss": 3.2873, + "step": 45650 + }, + { + "epoch": 13.308113460306366, + "grad_norm": 0.34138327836990356, + "learning_rate": 0.000440562937062937, + "loss": 3.2941, + "step": 45700 + }, + { + "epoch": 13.322674587920089, + "grad_norm": 0.35385775566101074, + "learning_rate": 0.00044038811188811184, + "loss": 3.2914, + "step": 45750 + }, + { + "epoch": 13.337235715533811, + "grad_norm": 0.39340725541114807, + "learning_rate": 0.0004402132867132867, + "loss": 3.2961, + "step": 45800 + }, + { + "epoch": 13.351796843147532, + "grad_norm": 0.38336238265037537, + "learning_rate": 0.0004400384615384615, + "loss": 3.3032, + "step": 45850 + }, + { + "epoch": 13.366357970761255, + "grad_norm": 0.376488596200943, + "learning_rate": 0.00043986363636363635, + "loss": 3.302, + "step": 45900 + }, + { + "epoch": 13.380919098374978, + "grad_norm": 0.38429391384124756, + "learning_rate": 0.00043968881118881115, + "loss": 3.3086, + "step": 45950 + }, + { + "epoch": 13.3954802259887, + "grad_norm": 0.3611532151699066, + "learning_rate": 0.000439513986013986, + "loss": 3.2948, + "step": 46000 + }, + { + "epoch": 13.3954802259887, + "eval_accuracy": 0.37078893679434516, + "eval_loss": 3.556919813156128, + "eval_runtime": 181.2381, + "eval_samples_per_second": 91.846, + "eval_steps_per_second": 5.744, + "step": 46000 + }, + { + "epoch": 13.410041353602423, + "grad_norm": 
0.35340753197669983, + "learning_rate": 0.0004393391608391608, + "loss": 3.3028, + "step": 46050 + }, + { + "epoch": 13.424602481216146, + "grad_norm": 0.33229947090148926, + "learning_rate": 0.00043916433566433565, + "loss": 3.3226, + "step": 46100 + }, + { + "epoch": 13.439163608829867, + "grad_norm": 0.3614226281642914, + "learning_rate": 0.00043898951048951045, + "loss": 3.3084, + "step": 46150 + }, + { + "epoch": 13.45372473644359, + "grad_norm": 0.35992908477783203, + "learning_rate": 0.0004388146853146853, + "loss": 3.3103, + "step": 46200 + }, + { + "epoch": 13.468285864057313, + "grad_norm": 0.37572646141052246, + "learning_rate": 0.00043863986013986005, + "loss": 3.3319, + "step": 46250 + }, + { + "epoch": 13.482846991671035, + "grad_norm": 0.35673362016677856, + "learning_rate": 0.0004384650349650349, + "loss": 3.3052, + "step": 46300 + }, + { + "epoch": 13.497408119284758, + "grad_norm": 0.3684885501861572, + "learning_rate": 0.0004382902097902097, + "loss": 3.3061, + "step": 46350 + }, + { + "epoch": 13.51196924689848, + "grad_norm": 0.35558366775512695, + "learning_rate": 0.00043811538461538456, + "loss": 3.3148, + "step": 46400 + }, + { + "epoch": 13.526530374512202, + "grad_norm": 0.3368312120437622, + "learning_rate": 0.0004379405594405594, + "loss": 3.311, + "step": 46450 + }, + { + "epoch": 13.541091502125925, + "grad_norm": 0.3515823483467102, + "learning_rate": 0.0004377657342657342, + "loss": 3.3151, + "step": 46500 + }, + { + "epoch": 13.555652629739647, + "grad_norm": 0.3697565495967865, + "learning_rate": 0.00043759090909090907, + "loss": 3.3236, + "step": 46550 + }, + { + "epoch": 13.57021375735337, + "grad_norm": 0.371623694896698, + "learning_rate": 0.00043741608391608387, + "loss": 3.3189, + "step": 46600 + }, + { + "epoch": 13.584774884967091, + "grad_norm": 0.36001795530319214, + "learning_rate": 0.0004372412587412587, + "loss": 3.3307, + "step": 46650 + }, + { + "epoch": 13.599336012580814, + "grad_norm": 0.3571346700191498, + 
"learning_rate": 0.0004370664335664335, + "loss": 3.3313, + "step": 46700 + }, + { + "epoch": 13.613897140194537, + "grad_norm": 0.34875962138175964, + "learning_rate": 0.0004368916083916084, + "loss": 3.328, + "step": 46750 + }, + { + "epoch": 13.62845826780826, + "grad_norm": 0.3460927903652191, + "learning_rate": 0.0004367167832167832, + "loss": 3.3221, + "step": 46800 + }, + { + "epoch": 13.643019395421982, + "grad_norm": 0.3462391495704651, + "learning_rate": 0.00043654195804195803, + "loss": 3.315, + "step": 46850 + }, + { + "epoch": 13.657580523035705, + "grad_norm": 0.33300238847732544, + "learning_rate": 0.00043636713286713283, + "loss": 3.3241, + "step": 46900 + }, + { + "epoch": 13.672141650649426, + "grad_norm": 0.35763484239578247, + "learning_rate": 0.0004361923076923077, + "loss": 3.3222, + "step": 46950 + }, + { + "epoch": 13.686702778263149, + "grad_norm": 0.3411622941493988, + "learning_rate": 0.00043601748251748243, + "loss": 3.3168, + "step": 47000 + }, + { + "epoch": 13.686702778263149, + "eval_accuracy": 0.3710402856489209, + "eval_loss": 3.5501530170440674, + "eval_runtime": 181.1346, + "eval_samples_per_second": 91.899, + "eval_steps_per_second": 5.747, + "step": 47000 + }, + { + "epoch": 13.701263905876871, + "grad_norm": 0.36353328824043274, + "learning_rate": 0.00043584265734265734, + "loss": 3.3267, + "step": 47050 + }, + { + "epoch": 13.715825033490594, + "grad_norm": 0.3539396822452545, + "learning_rate": 0.0004356678321678321, + "loss": 3.3319, + "step": 47100 + }, + { + "epoch": 13.730386161104317, + "grad_norm": 0.3500533103942871, + "learning_rate": 0.00043549300699300694, + "loss": 3.3324, + "step": 47150 + }, + { + "epoch": 13.744947288718038, + "grad_norm": 0.360314279794693, + "learning_rate": 0.0004353181818181818, + "loss": 3.3379, + "step": 47200 + }, + { + "epoch": 13.75950841633176, + "grad_norm": 0.365933358669281, + "learning_rate": 0.0004351433566433566, + "loss": 3.329, + "step": 47250 + }, + { + "epoch": 
13.774069543945483, + "grad_norm": 0.3569656014442444, + "learning_rate": 0.00043496853146853144, + "loss": 3.3337, + "step": 47300 + }, + { + "epoch": 13.788630671559206, + "grad_norm": 0.35865193605422974, + "learning_rate": 0.00043479370629370624, + "loss": 3.3275, + "step": 47350 + }, + { + "epoch": 13.803191799172929, + "grad_norm": 0.342277467250824, + "learning_rate": 0.0004346188811188811, + "loss": 3.3288, + "step": 47400 + }, + { + "epoch": 13.81775292678665, + "grad_norm": 0.364913672208786, + "learning_rate": 0.0004344440559440559, + "loss": 3.3432, + "step": 47450 + }, + { + "epoch": 13.832314054400372, + "grad_norm": 0.361052006483078, + "learning_rate": 0.00043426923076923075, + "loss": 3.332, + "step": 47500 + }, + { + "epoch": 13.846875182014095, + "grad_norm": 0.3582068681716919, + "learning_rate": 0.00043409440559440555, + "loss": 3.3425, + "step": 47550 + }, + { + "epoch": 13.861436309627818, + "grad_norm": 0.34829556941986084, + "learning_rate": 0.0004339195804195804, + "loss": 3.3265, + "step": 47600 + }, + { + "epoch": 13.87599743724154, + "grad_norm": 0.35710349678993225, + "learning_rate": 0.0004337447552447552, + "loss": 3.3391, + "step": 47650 + }, + { + "epoch": 13.890558564855262, + "grad_norm": 0.3624763786792755, + "learning_rate": 0.00043356993006993006, + "loss": 3.3336, + "step": 47700 + }, + { + "epoch": 13.905119692468984, + "grad_norm": 0.338870644569397, + "learning_rate": 0.0004333951048951048, + "loss": 3.3439, + "step": 47750 + }, + { + "epoch": 13.919680820082707, + "grad_norm": 0.32162490487098694, + "learning_rate": 0.0004332202797202797, + "loss": 3.3331, + "step": 47800 + }, + { + "epoch": 13.93424194769643, + "grad_norm": 0.3730635344982147, + "learning_rate": 0.00043304545454545456, + "loss": 3.3267, + "step": 47850 + }, + { + "epoch": 13.948803075310153, + "grad_norm": 0.36320826411247253, + "learning_rate": 0.0004328706293706293, + "loss": 3.3336, + "step": 47900 + }, + { + "epoch": 13.963364202923874, + 
"grad_norm": 0.3524576723575592, + "learning_rate": 0.00043269580419580416, + "loss": 3.3353, + "step": 47950 + }, + { + "epoch": 13.977925330537596, + "grad_norm": 0.3460340201854706, + "learning_rate": 0.00043252097902097896, + "loss": 3.3278, + "step": 48000 + }, + { + "epoch": 13.977925330537596, + "eval_accuracy": 0.37145504652775313, + "eval_loss": 3.540445566177368, + "eval_runtime": 181.2081, + "eval_samples_per_second": 91.861, + "eval_steps_per_second": 5.745, + "step": 48000 + }, + { + "epoch": 13.992486458151319, + "grad_norm": 0.35427072644233704, + "learning_rate": 0.0004323461538461538, + "loss": 3.3319, + "step": 48050 + }, + { + "epoch": 14.006989341254586, + "grad_norm": 0.3449511229991913, + "learning_rate": 0.0004321713286713286, + "loss": 3.2881, + "step": 48100 + }, + { + "epoch": 14.021550468868309, + "grad_norm": 0.39801499247550964, + "learning_rate": 0.00043199650349650347, + "loss": 3.2324, + "step": 48150 + }, + { + "epoch": 14.036111596482032, + "grad_norm": 0.34631258249282837, + "learning_rate": 0.00043182167832167827, + "loss": 3.2373, + "step": 48200 + }, + { + "epoch": 14.050672724095755, + "grad_norm": 0.34527838230133057, + "learning_rate": 0.0004316468531468531, + "loss": 3.2396, + "step": 48250 + }, + { + "epoch": 14.065233851709475, + "grad_norm": 0.3679901659488678, + "learning_rate": 0.0004314720279720279, + "loss": 3.2423, + "step": 48300 + }, + { + "epoch": 14.079794979323198, + "grad_norm": 0.36143872141838074, + "learning_rate": 0.0004312972027972028, + "loss": 3.2434, + "step": 48350 + }, + { + "epoch": 14.094356106936921, + "grad_norm": 0.36590418219566345, + "learning_rate": 0.0004311223776223776, + "loss": 3.2398, + "step": 48400 + }, + { + "epoch": 14.108917234550644, + "grad_norm": 0.35469797253608704, + "learning_rate": 0.00043094755244755243, + "loss": 3.2428, + "step": 48450 + }, + { + "epoch": 14.123478362164366, + "grad_norm": 0.34602782130241394, + "learning_rate": 0.0004307727272727272, + "loss": 3.2579, + 
"step": 48500 + }, + { + "epoch": 14.13803948977809, + "grad_norm": 0.32153913378715515, + "learning_rate": 0.0004305979020979021, + "loss": 3.2568, + "step": 48550 + }, + { + "epoch": 14.15260061739181, + "grad_norm": 0.34756767749786377, + "learning_rate": 0.00043042307692307694, + "loss": 3.2628, + "step": 48600 + }, + { + "epoch": 14.167161745005533, + "grad_norm": 0.3779682517051697, + "learning_rate": 0.0004302482517482517, + "loss": 3.2529, + "step": 48650 + }, + { + "epoch": 14.181722872619256, + "grad_norm": 0.3358452320098877, + "learning_rate": 0.00043007342657342654, + "loss": 3.2635, + "step": 48700 + }, + { + "epoch": 14.196284000232978, + "grad_norm": 0.3383064568042755, + "learning_rate": 0.00042989860139860134, + "loss": 3.2613, + "step": 48750 + }, + { + "epoch": 14.210845127846701, + "grad_norm": 0.33334216475486755, + "learning_rate": 0.0004297237762237762, + "loss": 3.2725, + "step": 48800 + }, + { + "epoch": 14.225406255460422, + "grad_norm": 0.4189155697822571, + "learning_rate": 0.000429548951048951, + "loss": 3.271, + "step": 48850 + }, + { + "epoch": 14.239967383074145, + "grad_norm": 0.3933100402355194, + "learning_rate": 0.00042937412587412585, + "loss": 3.2692, + "step": 48900 + }, + { + "epoch": 14.254528510687868, + "grad_norm": 0.3877250850200653, + "learning_rate": 0.00042919930069930065, + "loss": 3.2711, + "step": 48950 + }, + { + "epoch": 14.26908963830159, + "grad_norm": 0.37484902143478394, + "learning_rate": 0.0004290244755244755, + "loss": 3.2799, + "step": 49000 + }, + { + "epoch": 14.26908963830159, + "eval_accuracy": 0.3709477638769138, + "eval_loss": 3.558025360107422, + "eval_runtime": 180.9464, + "eval_samples_per_second": 91.994, + "eval_steps_per_second": 5.753, + "step": 49000 + }, + { + "epoch": 14.283650765915313, + "grad_norm": 0.3490859270095825, + "learning_rate": 0.0004288496503496503, + "loss": 3.2762, + "step": 49050 + }, + { + "epoch": 14.298211893529034, + "grad_norm": 0.38379967212677, + "learning_rate": 
0.00042867482517482515, + "loss": 3.2816, + "step": 49100 + }, + { + "epoch": 14.312773021142757, + "grad_norm": 0.366131454706192, + "learning_rate": 0.00042849999999999995, + "loss": 3.2791, + "step": 49150 + }, + { + "epoch": 14.32733414875648, + "grad_norm": 0.3447644114494324, + "learning_rate": 0.0004283251748251748, + "loss": 3.27, + "step": 49200 + }, + { + "epoch": 14.341895276370202, + "grad_norm": 0.3631740212440491, + "learning_rate": 0.00042815034965034966, + "loss": 3.2812, + "step": 49250 + }, + { + "epoch": 14.356456403983925, + "grad_norm": 0.37680432200431824, + "learning_rate": 0.00042797552447552446, + "loss": 3.2918, + "step": 49300 + }, + { + "epoch": 14.371017531597648, + "grad_norm": 0.348541796207428, + "learning_rate": 0.0004278006993006993, + "loss": 3.2858, + "step": 49350 + }, + { + "epoch": 14.385578659211369, + "grad_norm": 0.40237003564834595, + "learning_rate": 0.00042762587412587406, + "loss": 3.2925, + "step": 49400 + }, + { + "epoch": 14.400139786825092, + "grad_norm": 0.3444540798664093, + "learning_rate": 0.0004274510489510489, + "loss": 3.2888, + "step": 49450 + }, + { + "epoch": 14.414700914438814, + "grad_norm": 0.37795794010162354, + "learning_rate": 0.0004272762237762237, + "loss": 3.3037, + "step": 49500 + }, + { + "epoch": 14.429262042052537, + "grad_norm": 0.3671080768108368, + "learning_rate": 0.00042710139860139857, + "loss": 3.2885, + "step": 49550 + }, + { + "epoch": 14.44382316966626, + "grad_norm": 0.3939375877380371, + "learning_rate": 0.00042692657342657337, + "loss": 3.2869, + "step": 49600 + }, + { + "epoch": 14.45838429727998, + "grad_norm": 0.3599258065223694, + "learning_rate": 0.0004267517482517482, + "loss": 3.2999, + "step": 49650 + }, + { + "epoch": 14.472945424893704, + "grad_norm": 0.39820772409439087, + "learning_rate": 0.000426576923076923, + "loss": 3.302, + "step": 49700 + }, + { + "epoch": 14.487506552507426, + "grad_norm": 0.33834782242774963, + "learning_rate": 0.0004264020979020979, + "loss": 
3.2965, + "step": 49750 + }, + { + "epoch": 14.502067680121149, + "grad_norm": 0.34589359164237976, + "learning_rate": 0.0004262272727272727, + "loss": 3.2905, + "step": 49800 + }, + { + "epoch": 14.516628807734872, + "grad_norm": 0.36442044377326965, + "learning_rate": 0.00042605244755244753, + "loss": 3.3112, + "step": 49850 + }, + { + "epoch": 14.531189935348593, + "grad_norm": 0.3587281107902527, + "learning_rate": 0.00042587762237762233, + "loss": 3.2954, + "step": 49900 + }, + { + "epoch": 14.545751062962315, + "grad_norm": 0.40609636902809143, + "learning_rate": 0.0004257027972027972, + "loss": 3.3017, + "step": 49950 + }, + { + "epoch": 14.560312190576038, + "grad_norm": 0.36938217282295227, + "learning_rate": 0.00042552797202797204, + "loss": 3.305, + "step": 50000 + }, + { + "epoch": 14.560312190576038, + "eval_accuracy": 0.3712948086938959, + "eval_loss": 3.549705982208252, + "eval_runtime": 180.9754, + "eval_samples_per_second": 91.979, + "eval_steps_per_second": 5.752, + "step": 50000 + }, + { + "epoch": 14.574873318189761, + "grad_norm": 0.3450993299484253, + "learning_rate": 0.00042535314685314684, + "loss": 3.3049, + "step": 50050 + }, + { + "epoch": 14.589434445803484, + "grad_norm": 0.3647083342075348, + "learning_rate": 0.0004251783216783217, + "loss": 3.2905, + "step": 50100 + }, + { + "epoch": 14.603995573417205, + "grad_norm": 0.374889999628067, + "learning_rate": 0.00042500349650349643, + "loss": 3.2949, + "step": 50150 + }, + { + "epoch": 14.618556701030927, + "grad_norm": 0.35723409056663513, + "learning_rate": 0.0004248286713286713, + "loss": 3.3063, + "step": 50200 + }, + { + "epoch": 14.63311782864465, + "grad_norm": 0.3582786023616791, + "learning_rate": 0.0004246538461538461, + "loss": 3.3007, + "step": 50250 + }, + { + "epoch": 14.647678956258373, + "grad_norm": 0.3550582528114319, + "learning_rate": 0.00042447902097902094, + "loss": 3.3112, + "step": 50300 + }, + { + "epoch": 14.662240083872096, + "grad_norm": 0.38724565505981445, + 
"learning_rate": 0.00042430419580419574, + "loss": 3.3092, + "step": 50350 + }, + { + "epoch": 14.676801211485817, + "grad_norm": 0.3551936745643616, + "learning_rate": 0.0004241293706293706, + "loss": 3.3052, + "step": 50400 + }, + { + "epoch": 14.69136233909954, + "grad_norm": 0.37477564811706543, + "learning_rate": 0.0004239545454545454, + "loss": 3.3014, + "step": 50450 + }, + { + "epoch": 14.705923466713262, + "grad_norm": 0.3632015287876129, + "learning_rate": 0.00042377972027972025, + "loss": 3.3086, + "step": 50500 + }, + { + "epoch": 14.720484594326985, + "grad_norm": 0.3654315769672394, + "learning_rate": 0.00042360489510489505, + "loss": 3.3075, + "step": 50550 + }, + { + "epoch": 14.735045721940708, + "grad_norm": 0.3743443191051483, + "learning_rate": 0.0004234300699300699, + "loss": 3.3103, + "step": 50600 + }, + { + "epoch": 14.749606849554429, + "grad_norm": 0.3471044898033142, + "learning_rate": 0.00042325524475524476, + "loss": 3.3226, + "step": 50650 + }, + { + "epoch": 14.764167977168151, + "grad_norm": 0.35494619607925415, + "learning_rate": 0.00042308041958041956, + "loss": 3.3218, + "step": 50700 + }, + { + "epoch": 14.778729104781874, + "grad_norm": 0.36895397305488586, + "learning_rate": 0.0004229055944055944, + "loss": 3.3032, + "step": 50750 + }, + { + "epoch": 14.793290232395597, + "grad_norm": 0.3416297733783722, + "learning_rate": 0.0004227307692307692, + "loss": 3.3163, + "step": 50800 + }, + { + "epoch": 14.80785136000932, + "grad_norm": 0.3781328797340393, + "learning_rate": 0.00042255594405594406, + "loss": 3.3142, + "step": 50850 + }, + { + "epoch": 14.822412487623042, + "grad_norm": 0.3623887002468109, + "learning_rate": 0.0004223811188811188, + "loss": 3.3257, + "step": 50900 + }, + { + "epoch": 14.836973615236763, + "grad_norm": 0.360993355512619, + "learning_rate": 0.00042220629370629366, + "loss": 3.3151, + "step": 50950 + }, + { + "epoch": 14.851534742850486, + "grad_norm": 0.3625052571296692, + "learning_rate": 
0.00042203146853146846, + "loss": 3.3274, + "step": 51000 + }, + { + "epoch": 14.851534742850486, + "eval_accuracy": 0.37191624463649997, + "eval_loss": 3.5400757789611816, + "eval_runtime": 180.9103, + "eval_samples_per_second": 92.012, + "eval_steps_per_second": 5.754, + "step": 51000 + }, + { + "epoch": 14.866095870464209, + "grad_norm": 0.3725496232509613, + "learning_rate": 0.0004218566433566433, + "loss": 3.328, + "step": 51050 + }, + { + "epoch": 14.880656998077932, + "grad_norm": 0.3665943741798401, + "learning_rate": 0.0004216818181818181, + "loss": 3.3191, + "step": 51100 + }, + { + "epoch": 14.895218125691654, + "grad_norm": 0.3478486239910126, + "learning_rate": 0.00042150699300699297, + "loss": 3.3334, + "step": 51150 + }, + { + "epoch": 14.909779253305375, + "grad_norm": 0.3583490252494812, + "learning_rate": 0.00042133216783216777, + "loss": 3.3253, + "step": 51200 + }, + { + "epoch": 14.924340380919098, + "grad_norm": 0.3656362295150757, + "learning_rate": 0.0004211573426573426, + "loss": 3.3083, + "step": 51250 + }, + { + "epoch": 14.93890150853282, + "grad_norm": 0.37020862102508545, + "learning_rate": 0.0004209825174825175, + "loss": 3.3181, + "step": 51300 + }, + { + "epoch": 14.953462636146543, + "grad_norm": 0.3434571325778961, + "learning_rate": 0.0004208076923076923, + "loss": 3.3307, + "step": 51350 + }, + { + "epoch": 14.968023763760266, + "grad_norm": 0.3477000594139099, + "learning_rate": 0.00042063286713286713, + "loss": 3.3369, + "step": 51400 + }, + { + "epoch": 14.982584891373987, + "grad_norm": 0.3382193148136139, + "learning_rate": 0.00042045804195804193, + "loss": 3.3173, + "step": 51450 + }, + { + "epoch": 14.99714601898771, + "grad_norm": 0.3472067713737488, + "learning_rate": 0.0004202832167832168, + "loss": 3.3205, + "step": 51500 + }, + { + "epoch": 15.011648902090977, + "grad_norm": 0.3728955090045929, + "learning_rate": 0.0004201083916083916, + "loss": 3.2401, + "step": 51550 + }, + { + "epoch": 15.0262100297047, + 
"grad_norm": 0.3808425962924957, + "learning_rate": 0.00041993356643356644, + "loss": 3.2133, + "step": 51600 + }, + { + "epoch": 15.040771157318423, + "grad_norm": 0.3673969805240631, + "learning_rate": 0.0004197587412587412, + "loss": 3.2078, + "step": 51650 + }, + { + "epoch": 15.055332284932145, + "grad_norm": 0.37854793667793274, + "learning_rate": 0.00041958391608391604, + "loss": 3.2111, + "step": 51700 + }, + { + "epoch": 15.069893412545868, + "grad_norm": 0.36936038732528687, + "learning_rate": 0.00041940909090909084, + "loss": 3.2208, + "step": 51750 + }, + { + "epoch": 15.084454540159589, + "grad_norm": 0.36692115664482117, + "learning_rate": 0.0004192342657342657, + "loss": 3.229, + "step": 51800 + }, + { + "epoch": 15.099015667773312, + "grad_norm": 0.34014618396759033, + "learning_rate": 0.0004190594405594405, + "loss": 3.2281, + "step": 51850 + }, + { + "epoch": 15.113576795387035, + "grad_norm": 0.3525432348251343, + "learning_rate": 0.00041888461538461535, + "loss": 3.2413, + "step": 51900 + }, + { + "epoch": 15.128137923000757, + "grad_norm": 0.3579222857952118, + "learning_rate": 0.00041870979020979015, + "loss": 3.2443, + "step": 51950 + }, + { + "epoch": 15.14269905061448, + "grad_norm": 0.36283615231513977, + "learning_rate": 0.000418534965034965, + "loss": 3.2323, + "step": 52000 + }, + { + "epoch": 15.14269905061448, + "eval_accuracy": 0.3711156432802507, + "eval_loss": 3.5574755668640137, + "eval_runtime": 180.7629, + "eval_samples_per_second": 92.087, + "eval_steps_per_second": 5.759, + "step": 52000 + }, + { + "epoch": 15.157260178228203, + "grad_norm": 0.3686647415161133, + "learning_rate": 0.00041836013986013985, + "loss": 3.2459, + "step": 52050 + }, + { + "epoch": 15.171821305841924, + "grad_norm": 0.35007837414741516, + "learning_rate": 0.00041818531468531465, + "loss": 3.2433, + "step": 52100 + }, + { + "epoch": 15.186382433455647, + "grad_norm": 0.3874566853046417, + "learning_rate": 0.0004180104895104895, + "loss": 3.238, + 
"step": 52150 + }, + { + "epoch": 15.20094356106937, + "grad_norm": 0.35891878604888916, + "learning_rate": 0.0004178356643356643, + "loss": 3.2537, + "step": 52200 + }, + { + "epoch": 15.215504688683092, + "grad_norm": 0.35012251138687134, + "learning_rate": 0.00041766083916083916, + "loss": 3.2566, + "step": 52250 + }, + { + "epoch": 15.230065816296815, + "grad_norm": 0.37354549765586853, + "learning_rate": 0.00041748601398601396, + "loss": 3.2547, + "step": 52300 + }, + { + "epoch": 15.244626943910536, + "grad_norm": 0.3650526702404022, + "learning_rate": 0.0004173111888111888, + "loss": 3.2521, + "step": 52350 + }, + { + "epoch": 15.259188071524258, + "grad_norm": 0.43291184306144714, + "learning_rate": 0.00041713636363636356, + "loss": 3.253, + "step": 52400 + }, + { + "epoch": 15.273749199137981, + "grad_norm": 0.36537203192710876, + "learning_rate": 0.0004169615384615384, + "loss": 3.2628, + "step": 52450 + }, + { + "epoch": 15.288310326751704, + "grad_norm": 0.3516569137573242, + "learning_rate": 0.0004167867132867132, + "loss": 3.2648, + "step": 52500 + }, + { + "epoch": 15.302871454365427, + "grad_norm": 0.3715130090713501, + "learning_rate": 0.00041661188811188807, + "loss": 3.2721, + "step": 52550 + }, + { + "epoch": 15.317432581979148, + "grad_norm": 0.3508760631084442, + "learning_rate": 0.00041643706293706287, + "loss": 3.2753, + "step": 52600 + }, + { + "epoch": 15.33199370959287, + "grad_norm": 0.36405643820762634, + "learning_rate": 0.0004162622377622377, + "loss": 3.2591, + "step": 52650 + }, + { + "epoch": 15.346554837206593, + "grad_norm": 0.3771628737449646, + "learning_rate": 0.0004160874125874126, + "loss": 3.2705, + "step": 52700 + }, + { + "epoch": 15.361115964820316, + "grad_norm": 0.3789055347442627, + "learning_rate": 0.0004159125874125874, + "loss": 3.29, + "step": 52750 + }, + { + "epoch": 15.375677092434039, + "grad_norm": 0.35194921493530273, + "learning_rate": 0.00041573776223776223, + "loss": 3.28, + "step": 52800 + }, + { + 
"epoch": 15.39023822004776, + "grad_norm": 0.39530250430107117, + "learning_rate": 0.00041556293706293703, + "loss": 3.2666, + "step": 52850 + }, + { + "epoch": 15.404799347661482, + "grad_norm": 0.35562649369239807, + "learning_rate": 0.0004153881118881119, + "loss": 3.2888, + "step": 52900 + }, + { + "epoch": 15.419360475275205, + "grad_norm": 0.3992999792098999, + "learning_rate": 0.0004152132867132867, + "loss": 3.2757, + "step": 52950 + }, + { + "epoch": 15.433921602888928, + "grad_norm": 0.3915247321128845, + "learning_rate": 0.00041503846153846154, + "loss": 3.2795, + "step": 53000 + }, + { + "epoch": 15.433921602888928, + "eval_accuracy": 0.3716249244954154, + "eval_loss": 3.551257371902466, + "eval_runtime": 180.9509, + "eval_samples_per_second": 91.992, + "eval_steps_per_second": 5.753, + "step": 53000 + }, + { + "epoch": 15.44848273050265, + "grad_norm": 0.3508327305316925, + "learning_rate": 0.00041486363636363634, + "loss": 3.281, + "step": 53050 + }, + { + "epoch": 15.463043858116373, + "grad_norm": 0.37493276596069336, + "learning_rate": 0.0004146888111888112, + "loss": 3.2868, + "step": 53100 + }, + { + "epoch": 15.477604985730094, + "grad_norm": 0.38858386874198914, + "learning_rate": 0.00041451398601398593, + "loss": 3.2819, + "step": 53150 + }, + { + "epoch": 15.492166113343817, + "grad_norm": 0.36600261926651, + "learning_rate": 0.0004143391608391608, + "loss": 3.2751, + "step": 53200 + }, + { + "epoch": 15.50672724095754, + "grad_norm": 0.38814976811408997, + "learning_rate": 0.0004141643356643356, + "loss": 3.2928, + "step": 53250 + }, + { + "epoch": 15.521288368571263, + "grad_norm": 0.35792145133018494, + "learning_rate": 0.00041398951048951044, + "loss": 3.2845, + "step": 53300 + }, + { + "epoch": 15.535849496184985, + "grad_norm": 0.37970036268234253, + "learning_rate": 0.00041381468531468524, + "loss": 3.2981, + "step": 53350 + }, + { + "epoch": 15.550410623798706, + "grad_norm": 0.3698292672634125, + "learning_rate": 
0.0004136398601398601, + "loss": 3.2764, + "step": 53400 + }, + { + "epoch": 15.564971751412429, + "grad_norm": 0.3558374047279358, + "learning_rate": 0.00041346503496503495, + "loss": 3.2896, + "step": 53450 + }, + { + "epoch": 15.579532879026152, + "grad_norm": null, + "learning_rate": 0.00041329020979020975, + "loss": 3.2946, + "step": 53500 + }, + { + "epoch": 15.594094006639875, + "grad_norm": 0.37755706906318665, + "learning_rate": 0.0004131153846153846, + "loss": 3.2922, + "step": 53550 + }, + { + "epoch": 15.608655134253597, + "grad_norm": 0.3373359143733978, + "learning_rate": 0.0004129405594405594, + "loss": 3.2973, + "step": 53600 + }, + { + "epoch": 15.623216261867318, + "grad_norm": 0.35794350504875183, + "learning_rate": 0.00041276573426573426, + "loss": 3.3028, + "step": 53650 + }, + { + "epoch": 15.637777389481041, + "grad_norm": 0.3629034757614136, + "learning_rate": 0.00041259090909090906, + "loss": 3.292, + "step": 53700 + }, + { + "epoch": 15.652338517094764, + "grad_norm": 0.3656299412250519, + "learning_rate": 0.0004124160839160839, + "loss": 3.2817, + "step": 53750 + }, + { + "epoch": 15.666899644708487, + "grad_norm": 0.37915611267089844, + "learning_rate": 0.0004122412587412587, + "loss": 3.3044, + "step": 53800 + }, + { + "epoch": 15.68146077232221, + "grad_norm": 0.38034093379974365, + "learning_rate": 0.00041206643356643356, + "loss": 3.2931, + "step": 53850 + }, + { + "epoch": 15.69602189993593, + "grad_norm": 0.39361733198165894, + "learning_rate": 0.0004118916083916083, + "loss": 3.2864, + "step": 53900 + }, + { + "epoch": 15.710583027549653, + "grad_norm": 0.40040066838264465, + "learning_rate": 0.00041171678321678316, + "loss": 3.2923, + "step": 53950 + }, + { + "epoch": 15.725144155163376, + "grad_norm": 0.33866026997566223, + "learning_rate": 0.00041154195804195796, + "loss": 3.3018, + "step": 54000 + }, + { + "epoch": 15.725144155163376, + "eval_accuracy": 0.3721960436420614, + "eval_loss": 3.541879653930664, + 
"eval_runtime": 180.9075, + "eval_samples_per_second": 92.014, + "eval_steps_per_second": 5.754, + "step": 54000 + }, + { + "epoch": 15.739705282777098, + "grad_norm": 0.36742034554481506, + "learning_rate": 0.0004113671328671328, + "loss": 3.2937, + "step": 54050 + }, + { + "epoch": 15.754266410390821, + "grad_norm": 0.3519640862941742, + "learning_rate": 0.00041119230769230767, + "loss": 3.3098, + "step": 54100 + }, + { + "epoch": 15.768827538004544, + "grad_norm": 0.356086790561676, + "learning_rate": 0.00041101748251748247, + "loss": 3.3104, + "step": 54150 + }, + { + "epoch": 15.783388665618265, + "grad_norm": 0.3600098490715027, + "learning_rate": 0.0004108426573426573, + "loss": 3.3047, + "step": 54200 + }, + { + "epoch": 15.797949793231988, + "grad_norm": 0.3581182062625885, + "learning_rate": 0.0004106678321678321, + "loss": 3.3121, + "step": 54250 + }, + { + "epoch": 15.81251092084571, + "grad_norm": 0.3515772521495819, + "learning_rate": 0.000410493006993007, + "loss": 3.3019, + "step": 54300 + }, + { + "epoch": 15.827072048459433, + "grad_norm": 0.40560662746429443, + "learning_rate": 0.0004103181818181818, + "loss": 3.3104, + "step": 54350 + }, + { + "epoch": 15.841633176073156, + "grad_norm": 0.3703998029232025, + "learning_rate": 0.00041014335664335663, + "loss": 3.3006, + "step": 54400 + }, + { + "epoch": 15.856194303686877, + "grad_norm": 0.35367172956466675, + "learning_rate": 0.00040996853146853143, + "loss": 3.2965, + "step": 54450 + }, + { + "epoch": 15.8707554313006, + "grad_norm": 0.3762366473674774, + "learning_rate": 0.0004097937062937063, + "loss": 3.2994, + "step": 54500 + }, + { + "epoch": 15.885316558914322, + "grad_norm": 0.3624609708786011, + "learning_rate": 0.0004096188811188811, + "loss": 3.3033, + "step": 54550 + }, + { + "epoch": 15.899877686528045, + "grad_norm": 0.36504390835762024, + "learning_rate": 0.00040944405594405594, + "loss": 3.3073, + "step": 54600 + }, + { + "epoch": 15.914438814141768, + "grad_norm": 
0.36164841055870056, + "learning_rate": 0.0004092692307692307, + "loss": 3.311, + "step": 54650 + }, + { + "epoch": 15.928999941755489, + "grad_norm": 0.35146453976631165, + "learning_rate": 0.00040909440559440554, + "loss": 3.3255, + "step": 54700 + }, + { + "epoch": 15.943561069369212, + "grad_norm": 0.36682817339897156, + "learning_rate": 0.00040891958041958034, + "loss": 3.3056, + "step": 54750 + }, + { + "epoch": 15.958122196982934, + "grad_norm": 0.375960111618042, + "learning_rate": 0.0004087447552447552, + "loss": 3.2998, + "step": 54800 + }, + { + "epoch": 15.972683324596657, + "grad_norm": 0.37814897298812866, + "learning_rate": 0.00040856993006993005, + "loss": 3.3165, + "step": 54850 + }, + { + "epoch": 15.98724445221038, + "grad_norm": 0.3649648129940033, + "learning_rate": 0.00040839510489510485, + "loss": 3.305, + "step": 54900 + }, + { + "epoch": 16.001747335313645, + "grad_norm": 0.38327813148498535, + "learning_rate": 0.0004082202797202797, + "loss": 3.3029, + "step": 54950 + }, + { + "epoch": 16.01630846292737, + "grad_norm": 0.3484591841697693, + "learning_rate": 0.0004080454545454545, + "loss": 3.2195, + "step": 55000 + }, + { + "epoch": 16.01630846292737, + "eval_accuracy": 0.37155215324144797, + "eval_loss": 3.5542728900909424, + "eval_runtime": 180.9535, + "eval_samples_per_second": 91.99, + "eval_steps_per_second": 5.753, + "step": 55000 + }, + { + "epoch": 16.03086959054109, + "grad_norm": 0.37213021516799927, + "learning_rate": 0.00040787062937062935, + "loss": 3.1973, + "step": 55050 + }, + { + "epoch": 16.045430718154815, + "grad_norm": 0.3620660603046417, + "learning_rate": 0.00040769580419580415, + "loss": 3.2098, + "step": 55100 + }, + { + "epoch": 16.059991845768536, + "grad_norm": 0.39267122745513916, + "learning_rate": 0.000407520979020979, + "loss": 3.2128, + "step": 55150 + }, + { + "epoch": 16.074552973382257, + "grad_norm": 0.3540675938129425, + "learning_rate": 0.0004073461538461538, + "loss": 3.211, + "step": 55200 + }, + { 
+ "epoch": 16.08911410099598, + "grad_norm": 0.35271748900413513, + "learning_rate": 0.00040717132867132866, + "loss": 3.2222, + "step": 55250 + }, + { + "epoch": 16.103675228609703, + "grad_norm": 0.3637480139732361, + "learning_rate": 0.00040699650349650346, + "loss": 3.223, + "step": 55300 + }, + { + "epoch": 16.118236356223427, + "grad_norm": 0.39908871054649353, + "learning_rate": 0.0004068216783216783, + "loss": 3.2375, + "step": 55350 + }, + { + "epoch": 16.132797483837148, + "grad_norm": 0.37379196286201477, + "learning_rate": 0.00040664685314685306, + "loss": 3.2193, + "step": 55400 + }, + { + "epoch": 16.14735861145087, + "grad_norm": 0.36621153354644775, + "learning_rate": 0.0004064720279720279, + "loss": 3.2247, + "step": 55450 + }, + { + "epoch": 16.161919739064594, + "grad_norm": 0.3897995352745056, + "learning_rate": 0.00040629720279720277, + "loss": 3.2235, + "step": 55500 + }, + { + "epoch": 16.176480866678315, + "grad_norm": 0.384102463722229, + "learning_rate": 0.00040612237762237757, + "loss": 3.2327, + "step": 55550 + }, + { + "epoch": 16.19104199429204, + "grad_norm": 0.3749612271785736, + "learning_rate": 0.0004059475524475524, + "loss": 3.2375, + "step": 55600 + }, + { + "epoch": 16.20560312190576, + "grad_norm": 0.37099507451057434, + "learning_rate": 0.0004057727272727272, + "loss": 3.2415, + "step": 55650 + }, + { + "epoch": 16.22016424951948, + "grad_norm": 0.39269089698791504, + "learning_rate": 0.0004055979020979021, + "loss": 3.2305, + "step": 55700 + }, + { + "epoch": 16.234725377133206, + "grad_norm": 0.38968995213508606, + "learning_rate": 0.0004054230769230769, + "loss": 3.2416, + "step": 55750 + }, + { + "epoch": 16.249286504746927, + "grad_norm": 0.3777758777141571, + "learning_rate": 0.00040524825174825173, + "loss": 3.244, + "step": 55800 + }, + { + "epoch": 16.26384763236065, + "grad_norm": 0.3755890429019928, + "learning_rate": 0.00040507342657342653, + "loss": 3.2563, + "step": 55850 + }, + { + "epoch": 16.278408759974372, 
+ "grad_norm": 0.3695318102836609, + "learning_rate": 0.0004048986013986014, + "loss": 3.2446, + "step": 55900 + }, + { + "epoch": 16.292969887588093, + "grad_norm": 0.33629676699638367, + "learning_rate": 0.0004047237762237762, + "loss": 3.2445, + "step": 55950 + }, + { + "epoch": 16.307531015201818, + "grad_norm": 0.35609471797943115, + "learning_rate": 0.00040454895104895104, + "loss": 3.253, + "step": 56000 + }, + { + "epoch": 16.307531015201818, + "eval_accuracy": 0.37172932009076776, + "eval_loss": 3.5524020195007324, + "eval_runtime": 180.9583, + "eval_samples_per_second": 91.988, + "eval_steps_per_second": 5.753, + "step": 56000 + }, + { + "epoch": 16.32209214281554, + "grad_norm": 0.379630446434021, + "learning_rate": 0.00040437412587412583, + "loss": 3.2563, + "step": 56050 + }, + { + "epoch": 16.336653270429263, + "grad_norm": 0.42045262455940247, + "learning_rate": 0.0004041993006993007, + "loss": 3.267, + "step": 56100 + }, + { + "epoch": 16.351214398042984, + "grad_norm": 0.3641156554222107, + "learning_rate": 0.00040402447552447554, + "loss": 3.253, + "step": 56150 + }, + { + "epoch": 16.36577552565671, + "grad_norm": 0.3764376640319824, + "learning_rate": 0.0004038496503496503, + "loss": 3.2674, + "step": 56200 + }, + { + "epoch": 16.38033665327043, + "grad_norm": 0.38018858432769775, + "learning_rate": 0.00040367482517482514, + "loss": 3.2573, + "step": 56250 + }, + { + "epoch": 16.39489778088415, + "grad_norm": 0.3741246163845062, + "learning_rate": 0.00040349999999999994, + "loss": 3.2688, + "step": 56300 + }, + { + "epoch": 16.409458908497875, + "grad_norm": 0.36829307675361633, + "learning_rate": 0.0004033251748251748, + "loss": 3.2688, + "step": 56350 + }, + { + "epoch": 16.424020036111596, + "grad_norm": 0.37031373381614685, + "learning_rate": 0.0004031503496503496, + "loss": 3.2537, + "step": 56400 + }, + { + "epoch": 16.43858116372532, + "grad_norm": 0.4177717864513397, + "learning_rate": 0.00040297552447552445, + "loss": 3.2683, + "step": 
56450 + }, + { + "epoch": 16.45314229133904, + "grad_norm": 0.3726315498352051, + "learning_rate": 0.00040280069930069925, + "loss": 3.261, + "step": 56500 + }, + { + "epoch": 16.467703418952762, + "grad_norm": 0.36388418078422546, + "learning_rate": 0.0004026258741258741, + "loss": 3.2684, + "step": 56550 + }, + { + "epoch": 16.482264546566487, + "grad_norm": 0.35378575325012207, + "learning_rate": 0.0004024510489510489, + "loss": 3.2733, + "step": 56600 + }, + { + "epoch": 16.496825674180208, + "grad_norm": 0.3618945777416229, + "learning_rate": 0.00040227622377622376, + "loss": 3.2709, + "step": 56650 + }, + { + "epoch": 16.511386801793932, + "grad_norm": 0.4058282673358917, + "learning_rate": 0.00040210139860139856, + "loss": 3.2689, + "step": 56700 + }, + { + "epoch": 16.525947929407653, + "grad_norm": 0.3674890995025635, + "learning_rate": 0.0004019265734265734, + "loss": 3.2723, + "step": 56750 + }, + { + "epoch": 16.540509057021374, + "grad_norm": 0.3580159842967987, + "learning_rate": 0.0004017517482517482, + "loss": 3.2853, + "step": 56800 + }, + { + "epoch": 16.5550701846351, + "grad_norm": 0.39147987961769104, + "learning_rate": 0.00040157692307692306, + "loss": 3.2778, + "step": 56850 + }, + { + "epoch": 16.56963131224882, + "grad_norm": 0.3897418975830078, + "learning_rate": 0.0004014020979020979, + "loss": 3.276, + "step": 56900 + }, + { + "epoch": 16.584192439862544, + "grad_norm": 0.36115020513534546, + "learning_rate": 0.00040122727272727266, + "loss": 3.2842, + "step": 56950 + }, + { + "epoch": 16.598753567476265, + "grad_norm": 0.34980452060699463, + "learning_rate": 0.0004010524475524475, + "loss": 3.2781, + "step": 57000 + }, + { + "epoch": 16.598753567476265, + "eval_accuracy": 0.3722949137948669, + "eval_loss": 3.5432724952697754, + "eval_runtime": 180.9348, + "eval_samples_per_second": 92.0, + "eval_steps_per_second": 5.753, + "step": 57000 + }, + { + "epoch": 16.613314695089986, + "grad_norm": 0.3930431604385376, + "learning_rate": 
0.0004008776223776223, + "loss": 3.2808, + "step": 57050 + }, + { + "epoch": 16.62787582270371, + "grad_norm": 0.377023309469223, + "learning_rate": 0.00040070279720279717, + "loss": 3.2813, + "step": 57100 + }, + { + "epoch": 16.642436950317432, + "grad_norm": 0.3362906277179718, + "learning_rate": 0.00040052797202797197, + "loss": 3.2755, + "step": 57150 + }, + { + "epoch": 16.656998077931156, + "grad_norm": 0.3678101599216461, + "learning_rate": 0.0004003531468531468, + "loss": 3.275, + "step": 57200 + }, + { + "epoch": 16.671559205544877, + "grad_norm": 0.3514406979084015, + "learning_rate": 0.0004001783216783216, + "loss": 3.2848, + "step": 57250 + }, + { + "epoch": 16.6861203331586, + "grad_norm": 0.34927234053611755, + "learning_rate": 0.0004000034965034965, + "loss": 3.2896, + "step": 57300 + }, + { + "epoch": 16.700681460772323, + "grad_norm": 0.35985633730888367, + "learning_rate": 0.0003998286713286713, + "loss": 3.2758, + "step": 57350 + }, + { + "epoch": 16.715242588386044, + "grad_norm": 0.38298559188842773, + "learning_rate": 0.00039965384615384613, + "loss": 3.2831, + "step": 57400 + }, + { + "epoch": 16.72980371599977, + "grad_norm": 0.39080435037612915, + "learning_rate": 0.00039947902097902093, + "loss": 3.2831, + "step": 57450 + }, + { + "epoch": 16.74436484361349, + "grad_norm": 0.35113897919654846, + "learning_rate": 0.0003993041958041958, + "loss": 3.284, + "step": 57500 + }, + { + "epoch": 16.75892597122721, + "grad_norm": 0.34621018171310425, + "learning_rate": 0.00039912937062937064, + "loss": 3.2853, + "step": 57550 + }, + { + "epoch": 16.773487098840935, + "grad_norm": 0.40347158908843994, + "learning_rate": 0.00039895454545454544, + "loss": 3.2903, + "step": 57600 + }, + { + "epoch": 16.788048226454656, + "grad_norm": 0.3585277199745178, + "learning_rate": 0.0003987797202797203, + "loss": 3.292, + "step": 57650 + }, + { + "epoch": 16.80260935406838, + "grad_norm": 0.3407650589942932, + "learning_rate": 0.00039860489510489504, + "loss": 
3.2895, + "step": 57700 + }, + { + "epoch": 16.8171704816821, + "grad_norm": 0.36110949516296387, + "learning_rate": 0.0003984300699300699, + "loss": 3.2995, + "step": 57750 + }, + { + "epoch": 16.831731609295822, + "grad_norm": 0.3790327310562134, + "learning_rate": 0.0003982552447552447, + "loss": 3.2806, + "step": 57800 + }, + { + "epoch": 16.846292736909547, + "grad_norm": 0.3504062592983246, + "learning_rate": 0.00039808041958041955, + "loss": 3.2835, + "step": 57850 + }, + { + "epoch": 16.860853864523268, + "grad_norm": 0.34082862734794617, + "learning_rate": 0.00039790559440559435, + "loss": 3.2844, + "step": 57900 + }, + { + "epoch": 16.875414992136992, + "grad_norm": 0.4020354151725769, + "learning_rate": 0.0003977307692307692, + "loss": 3.2997, + "step": 57950 + }, + { + "epoch": 16.889976119750713, + "grad_norm": 0.3737473487854004, + "learning_rate": 0.000397555944055944, + "loss": 3.2999, + "step": 58000 + }, + { + "epoch": 16.889976119750713, + "eval_accuracy": 0.3727604617200867, + "eval_loss": 3.5354135036468506, + "eval_runtime": 180.934, + "eval_samples_per_second": 92.0, + "eval_steps_per_second": 5.753, + "step": 58000 + }, + { + "epoch": 16.904537247364434, + "grad_norm": 0.3592594563961029, + "learning_rate": 0.00039738111888111885, + "loss": 3.2993, + "step": 58050 + }, + { + "epoch": 16.91909837497816, + "grad_norm": 0.3586547076702118, + "learning_rate": 0.00039720629370629365, + "loss": 3.2973, + "step": 58100 + }, + { + "epoch": 16.93365950259188, + "grad_norm": 0.3590955138206482, + "learning_rate": 0.0003970314685314685, + "loss": 3.2927, + "step": 58150 + }, + { + "epoch": 16.948220630205604, + "grad_norm": 0.38664183020591736, + "learning_rate": 0.0003968566433566433, + "loss": 3.2935, + "step": 58200 + }, + { + "epoch": 16.962781757819325, + "grad_norm": 0.38259342312812805, + "learning_rate": 0.00039668181818181816, + "loss": 3.2941, + "step": 58250 + }, + { + "epoch": 16.977342885433046, + "grad_norm": 0.3696136474609375, + 
"learning_rate": 0.000396506993006993, + "loss": 3.2981, + "step": 58300 + }, + { + "epoch": 16.99190401304677, + "grad_norm": 0.3578000068664551, + "learning_rate": 0.0003963321678321678, + "loss": 3.292, + "step": 58350 + }, + { + "epoch": 17.006406896150036, + "grad_norm": 0.38829582929611206, + "learning_rate": 0.00039615734265734267, + "loss": 3.256, + "step": 58400 + }, + { + "epoch": 17.02096802376376, + "grad_norm": 0.3556952178478241, + "learning_rate": 0.0003959825174825174, + "loss": 3.1862, + "step": 58450 + }, + { + "epoch": 17.03552915137748, + "grad_norm": 0.36325693130493164, + "learning_rate": 0.00039580769230769227, + "loss": 3.1957, + "step": 58500 + }, + { + "epoch": 17.050090278991206, + "grad_norm": 0.3780340552330017, + "learning_rate": 0.00039563286713286707, + "loss": 3.1984, + "step": 58550 + }, + { + "epoch": 17.064651406604927, + "grad_norm": 0.3776744306087494, + "learning_rate": 0.0003954580419580419, + "loss": 3.2022, + "step": 58600 + }, + { + "epoch": 17.07921253421865, + "grad_norm": 0.37272578477859497, + "learning_rate": 0.0003952832167832167, + "loss": 3.2085, + "step": 58650 + }, + { + "epoch": 17.093773661832373, + "grad_norm": 0.3816814124584198, + "learning_rate": 0.0003951083916083916, + "loss": 3.2128, + "step": 58700 + }, + { + "epoch": 17.108334789446094, + "grad_norm": 0.3480782210826874, + "learning_rate": 0.0003949335664335664, + "loss": 3.2202, + "step": 58750 + }, + { + "epoch": 17.122895917059818, + "grad_norm": 0.4132950007915497, + "learning_rate": 0.00039475874125874123, + "loss": 3.2165, + "step": 58800 + }, + { + "epoch": 17.13745704467354, + "grad_norm": 0.3595176935195923, + "learning_rate": 0.00039458391608391603, + "loss": 3.22, + "step": 58850 + }, + { + "epoch": 17.152018172287264, + "grad_norm": 0.3662016689777374, + "learning_rate": 0.0003944090909090909, + "loss": 3.2138, + "step": 58900 + }, + { + "epoch": 17.166579299900985, + "grad_norm": 0.36622393131256104, + "learning_rate": 
0.00039423426573426573, + "loss": 3.2161, + "step": 58950 + }, + { + "epoch": 17.181140427514705, + "grad_norm": 0.37797629833221436, + "learning_rate": 0.00039405944055944053, + "loss": 3.2191, + "step": 59000 + }, + { + "epoch": 17.181140427514705, + "eval_accuracy": 0.3717471896070893, + "eval_loss": 3.555344820022583, + "eval_runtime": 180.7239, + "eval_samples_per_second": 92.107, + "eval_steps_per_second": 5.76, + "step": 59000 + }, + { + "epoch": 17.19570155512843, + "grad_norm": 0.39831972122192383, + "learning_rate": 0.0003938846153846154, + "loss": 3.2431, + "step": 59050 + }, + { + "epoch": 17.21026268274215, + "grad_norm": 0.3561229109764099, + "learning_rate": 0.0003937097902097902, + "loss": 3.2428, + "step": 59100 + }, + { + "epoch": 17.224823810355876, + "grad_norm": 0.35875722765922546, + "learning_rate": 0.00039353496503496504, + "loss": 3.2266, + "step": 59150 + }, + { + "epoch": 17.239384937969596, + "grad_norm": 0.4089879095554352, + "learning_rate": 0.0003933601398601398, + "loss": 3.2269, + "step": 59200 + }, + { + "epoch": 17.253946065583317, + "grad_norm": 0.37490785121917725, + "learning_rate": 0.00039318531468531464, + "loss": 3.2422, + "step": 59250 + }, + { + "epoch": 17.268507193197042, + "grad_norm": 0.36489033699035645, + "learning_rate": 0.00039301048951048944, + "loss": 3.2319, + "step": 59300 + }, + { + "epoch": 17.283068320810763, + "grad_norm": 0.4006343185901642, + "learning_rate": 0.0003928356643356643, + "loss": 3.2394, + "step": 59350 + }, + { + "epoch": 17.297629448424487, + "grad_norm": 0.4032607078552246, + "learning_rate": 0.0003926608391608391, + "loss": 3.2427, + "step": 59400 + }, + { + "epoch": 17.31219057603821, + "grad_norm": 0.3809773623943329, + "learning_rate": 0.00039248601398601395, + "loss": 3.2293, + "step": 59450 + }, + { + "epoch": 17.32675170365193, + "grad_norm": 0.3603288531303406, + "learning_rate": 0.00039231118881118875, + "loss": 3.2487, + "step": 59500 + }, + { + "epoch": 17.341312831265654, + 
"grad_norm": 0.3441549837589264, + "learning_rate": 0.0003921363636363636, + "loss": 3.2395, + "step": 59550 + }, + { + "epoch": 17.355873958879375, + "grad_norm": 0.37875792384147644, + "learning_rate": 0.00039196153846153846, + "loss": 3.2598, + "step": 59600 + }, + { + "epoch": 17.3704350864931, + "grad_norm": 0.38208144903182983, + "learning_rate": 0.00039178671328671326, + "loss": 3.2508, + "step": 59650 + }, + { + "epoch": 17.38499621410682, + "grad_norm": 0.3884451389312744, + "learning_rate": 0.0003916118881118881, + "loss": 3.2471, + "step": 59700 + }, + { + "epoch": 17.39955734172054, + "grad_norm": 0.3753611445426941, + "learning_rate": 0.0003914370629370629, + "loss": 3.2523, + "step": 59750 + }, + { + "epoch": 17.414118469334266, + "grad_norm": 0.37695708870887756, + "learning_rate": 0.00039126223776223776, + "loss": 3.2556, + "step": 59800 + }, + { + "epoch": 17.428679596947987, + "grad_norm": 0.3882795572280884, + "learning_rate": 0.00039108741258741256, + "loss": 3.2442, + "step": 59850 + }, + { + "epoch": 17.44324072456171, + "grad_norm": 0.35586661100387573, + "learning_rate": 0.0003909125874125874, + "loss": 3.2433, + "step": 59900 + }, + { + "epoch": 17.457801852175432, + "grad_norm": 0.38463085889816284, + "learning_rate": 0.00039073776223776216, + "loss": 3.2651, + "step": 59950 + }, + { + "epoch": 17.472362979789153, + "grad_norm": 0.38579466938972473, + "learning_rate": 0.000390562937062937, + "loss": 3.2596, + "step": 60000 + }, + { + "epoch": 17.472362979789153, + "eval_accuracy": 0.37229173960446765, + "eval_loss": 3.5483200550079346, + "eval_runtime": 180.7914, + "eval_samples_per_second": 92.073, + "eval_steps_per_second": 5.758, + "step": 60000 + }, + { + "epoch": 17.486924107402878, + "grad_norm": 0.39465898275375366, + "learning_rate": 0.0003903881118881118, + "loss": 3.2515, + "step": 60050 + }, + { + "epoch": 17.5014852350166, + "grad_norm": 0.37305083870887756, + "learning_rate": 0.00039021328671328667, + "loss": 3.2657, + "step": 
60100 + }, + { + "epoch": 17.516046362630323, + "grad_norm": 0.3921613097190857, + "learning_rate": 0.00039003846153846147, + "loss": 3.2809, + "step": 60150 + }, + { + "epoch": 17.530607490244044, + "grad_norm": 0.35129493474960327, + "learning_rate": 0.0003898636363636363, + "loss": 3.259, + "step": 60200 + }, + { + "epoch": 17.545168617857765, + "grad_norm": 0.3839664161205292, + "learning_rate": 0.0003896888111888111, + "loss": 3.2613, + "step": 60250 + }, + { + "epoch": 17.55972974547149, + "grad_norm": 0.3718804717063904, + "learning_rate": 0.000389513986013986, + "loss": 3.2626, + "step": 60300 + }, + { + "epoch": 17.57429087308521, + "grad_norm": 0.4159618318080902, + "learning_rate": 0.00038933916083916083, + "loss": 3.2682, + "step": 60350 + }, + { + "epoch": 17.588852000698935, + "grad_norm": 0.37494173645973206, + "learning_rate": 0.00038916433566433563, + "loss": 3.2774, + "step": 60400 + }, + { + "epoch": 17.603413128312656, + "grad_norm": 0.38625165820121765, + "learning_rate": 0.0003889895104895105, + "loss": 3.2691, + "step": 60450 + }, + { + "epoch": 17.617974255926377, + "grad_norm": 0.39727920293807983, + "learning_rate": 0.0003888146853146853, + "loss": 3.2739, + "step": 60500 + }, + { + "epoch": 17.6325353835401, + "grad_norm": 0.3708936274051666, + "learning_rate": 0.00038863986013986014, + "loss": 3.2578, + "step": 60550 + }, + { + "epoch": 17.647096511153823, + "grad_norm": 0.36158886551856995, + "learning_rate": 0.00038846503496503494, + "loss": 3.271, + "step": 60600 + }, + { + "epoch": 17.661657638767547, + "grad_norm": 0.3670588731765747, + "learning_rate": 0.0003882902097902098, + "loss": 3.2752, + "step": 60650 + }, + { + "epoch": 17.676218766381268, + "grad_norm": 0.3502790629863739, + "learning_rate": 0.00038811538461538454, + "loss": 3.262, + "step": 60700 + }, + { + "epoch": 17.690779893994993, + "grad_norm": 0.38418471813201904, + "learning_rate": 0.0003879405594405594, + "loss": 3.2687, + "step": 60750 + }, + { + "epoch": 
17.705341021608714, + "grad_norm": 0.3664897680282593, + "learning_rate": 0.0003877657342657342, + "loss": 3.2597, + "step": 60800 + }, + { + "epoch": 17.719902149222435, + "grad_norm": 0.40013816952705383, + "learning_rate": 0.00038759090909090905, + "loss": 3.2732, + "step": 60850 + }, + { + "epoch": 17.73446327683616, + "grad_norm": 0.3857571482658386, + "learning_rate": 0.00038741608391608384, + "loss": 3.2733, + "step": 60900 + }, + { + "epoch": 17.74902440444988, + "grad_norm": 0.38301730155944824, + "learning_rate": 0.0003872412587412587, + "loss": 3.2715, + "step": 60950 + }, + { + "epoch": 17.763585532063605, + "grad_norm": 0.37607985734939575, + "learning_rate": 0.00038706643356643355, + "loss": 3.2635, + "step": 61000 + }, + { + "epoch": 17.763585532063605, + "eval_accuracy": 0.3731001000928039, + "eval_loss": 3.5363667011260986, + "eval_runtime": 181.354, + "eval_samples_per_second": 91.787, + "eval_steps_per_second": 5.74, + "step": 61000 + }, + { + "epoch": 17.778146659677326, + "grad_norm": 0.3776940703392029, + "learning_rate": 0.00038689160839160835, + "loss": 3.2758, + "step": 61050 + }, + { + "epoch": 17.792707787291047, + "grad_norm": 0.39629730582237244, + "learning_rate": 0.0003867167832167832, + "loss": 3.2631, + "step": 61100 + }, + { + "epoch": 17.80726891490477, + "grad_norm": 0.3491132855415344, + "learning_rate": 0.000386541958041958, + "loss": 3.2839, + "step": 61150 + }, + { + "epoch": 17.821830042518492, + "grad_norm": 0.37949714064598083, + "learning_rate": 0.00038636713286713286, + "loss": 3.2783, + "step": 61200 + }, + { + "epoch": 17.836391170132217, + "grad_norm": 0.3643863797187805, + "learning_rate": 0.00038619230769230766, + "loss": 3.2805, + "step": 61250 + }, + { + "epoch": 17.850952297745938, + "grad_norm": 0.3382050096988678, + "learning_rate": 0.0003860174825174825, + "loss": 3.2723, + "step": 61300 + }, + { + "epoch": 17.86551342535966, + "grad_norm": 0.3952576816082001, + "learning_rate": 0.0003858426573426573, + 
"loss": 3.2847, + "step": 61350 + }, + { + "epoch": 17.880074552973383, + "grad_norm": 0.38078320026397705, + "learning_rate": 0.00038566783216783217, + "loss": 3.2854, + "step": 61400 + }, + { + "epoch": 17.894635680587104, + "grad_norm": 0.3551877439022064, + "learning_rate": 0.0003854930069930069, + "loss": 3.2797, + "step": 61450 + }, + { + "epoch": 17.90919680820083, + "grad_norm": 0.3975968360900879, + "learning_rate": 0.00038531818181818177, + "loss": 3.2771, + "step": 61500 + }, + { + "epoch": 17.92375793581455, + "grad_norm": 0.3528117835521698, + "learning_rate": 0.00038514335664335657, + "loss": 3.2777, + "step": 61550 + }, + { + "epoch": 17.93831906342827, + "grad_norm": 0.3516587018966675, + "learning_rate": 0.0003849685314685314, + "loss": 3.2816, + "step": 61600 + }, + { + "epoch": 17.952880191041995, + "grad_norm": 0.37779533863067627, + "learning_rate": 0.0003847937062937062, + "loss": 3.2731, + "step": 61650 + }, + { + "epoch": 17.967441318655716, + "grad_norm": 0.37921205163002014, + "learning_rate": 0.0003846188811188811, + "loss": 3.2815, + "step": 61700 + }, + { + "epoch": 17.98200244626944, + "grad_norm": 0.40624403953552246, + "learning_rate": 0.00038444405594405593, + "loss": 3.2829, + "step": 61750 + }, + { + "epoch": 17.99656357388316, + "grad_norm": 0.3792065680027008, + "learning_rate": 0.00038426923076923073, + "loss": 3.2762, + "step": 61800 + }, + { + "epoch": 18.01106645698643, + "grad_norm": 0.38736584782600403, + "learning_rate": 0.0003840944055944056, + "loss": 3.2056, + "step": 61850 + }, + { + "epoch": 18.02562758460015, + "grad_norm": 0.3663954436779022, + "learning_rate": 0.0003839195804195804, + "loss": 3.1867, + "step": 61900 + }, + { + "epoch": 18.040188712213872, + "grad_norm": 0.36250391602516174, + "learning_rate": 0.00038374475524475523, + "loss": 3.1847, + "step": 61950 + }, + { + "epoch": 18.054749839827597, + "grad_norm": 0.3424190580844879, + "learning_rate": 0.00038356993006993003, + "loss": 3.1872, + "step": 
62000 + }, + { + "epoch": 18.054749839827597, + "eval_accuracy": 0.3726784030201364, + "eval_loss": 3.5482542514801025, + "eval_runtime": 181.0633, + "eval_samples_per_second": 91.935, + "eval_steps_per_second": 5.749, + "step": 62000 + }, + { + "epoch": 18.069310967441318, + "grad_norm": 0.3808369040489197, + "learning_rate": 0.0003833951048951049, + "loss": 3.1775, + "step": 62050 + }, + { + "epoch": 18.083872095055042, + "grad_norm": 0.41652339696884155, + "learning_rate": 0.0003832202797202797, + "loss": 3.1956, + "step": 62100 + }, + { + "epoch": 18.098433222668763, + "grad_norm": 0.3494375944137573, + "learning_rate": 0.00038304545454545454, + "loss": 3.1895, + "step": 62150 + }, + { + "epoch": 18.112994350282484, + "grad_norm": 0.3790935277938843, + "learning_rate": 0.0003828706293706293, + "loss": 3.1971, + "step": 62200 + }, + { + "epoch": 18.12755547789621, + "grad_norm": 0.3747474253177643, + "learning_rate": 0.00038269580419580414, + "loss": 3.2032, + "step": 62250 + }, + { + "epoch": 18.14211660550993, + "grad_norm": 0.38967105746269226, + "learning_rate": 0.00038252097902097894, + "loss": 3.2032, + "step": 62300 + }, + { + "epoch": 18.156677733123654, + "grad_norm": 0.3795710504055023, + "learning_rate": 0.0003823461538461538, + "loss": 3.2167, + "step": 62350 + }, + { + "epoch": 18.171238860737375, + "grad_norm": 0.392669677734375, + "learning_rate": 0.00038217132867132865, + "loss": 3.209, + "step": 62400 + }, + { + "epoch": 18.185799988351096, + "grad_norm": 0.38014981150627136, + "learning_rate": 0.00038199650349650345, + "loss": 3.2065, + "step": 62450 + }, + { + "epoch": 18.20036111596482, + "grad_norm": 0.37931060791015625, + "learning_rate": 0.0003818216783216783, + "loss": 3.2128, + "step": 62500 + }, + { + "epoch": 18.214922243578542, + "grad_norm": 0.3978460729122162, + "learning_rate": 0.0003816468531468531, + "loss": 3.2153, + "step": 62550 + }, + { + "epoch": 18.229483371192266, + "grad_norm": 0.3716878592967987, + "learning_rate": 
0.00038147202797202796, + "loss": 3.2101, + "step": 62600 + }, + { + "epoch": 18.244044498805987, + "grad_norm": 0.3965378403663635, + "learning_rate": 0.00038129720279720276, + "loss": 3.2217, + "step": 62650 + }, + { + "epoch": 18.25860562641971, + "grad_norm": 0.4007912576198578, + "learning_rate": 0.0003811223776223776, + "loss": 3.2219, + "step": 62700 + }, + { + "epoch": 18.273166754033433, + "grad_norm": 0.3940446078777313, + "learning_rate": 0.0003809475524475524, + "loss": 3.2261, + "step": 62750 + }, + { + "epoch": 18.287727881647154, + "grad_norm": 0.4010846018791199, + "learning_rate": 0.00038077272727272726, + "loss": 3.2194, + "step": 62800 + }, + { + "epoch": 18.30228900926088, + "grad_norm": 0.39300212264060974, + "learning_rate": 0.00038059790209790206, + "loss": 3.2288, + "step": 62850 + }, + { + "epoch": 18.3168501368746, + "grad_norm": 0.3691205084323883, + "learning_rate": 0.0003804230769230769, + "loss": 3.2333, + "step": 62900 + }, + { + "epoch": 18.33141126448832, + "grad_norm": 0.35388556122779846, + "learning_rate": 0.00038024825174825166, + "loss": 3.2303, + "step": 62950 + }, + { + "epoch": 18.345972392102045, + "grad_norm": 0.3913156986236572, + "learning_rate": 0.0003800734265734265, + "loss": 3.2398, + "step": 63000 + }, + { + "epoch": 18.345972392102045, + "eval_accuracy": 0.3726364331693022, + "eval_loss": 3.547825336456299, + "eval_runtime": 181.4413, + "eval_samples_per_second": 91.743, + "eval_steps_per_second": 5.737, + "step": 63000 + }, + { + "epoch": 18.360533519715766, + "grad_norm": 0.4170674681663513, + "learning_rate": 0.0003798986013986013, + "loss": 3.2439, + "step": 63050 + }, + { + "epoch": 18.37509464732949, + "grad_norm": 0.3595086932182312, + "learning_rate": 0.00037972377622377617, + "loss": 3.2187, + "step": 63100 + }, + { + "epoch": 18.38965577494321, + "grad_norm": 0.34484726190567017, + "learning_rate": 0.000379548951048951, + "loss": 3.224, + "step": 63150 + }, + { + "epoch": 18.404216902556932, + 
"grad_norm": 0.3886369466781616, + "learning_rate": 0.0003793741258741258, + "loss": 3.235, + "step": 63200 + }, + { + "epoch": 18.418778030170657, + "grad_norm": 0.3723530173301697, + "learning_rate": 0.0003791993006993007, + "loss": 3.2362, + "step": 63250 + }, + { + "epoch": 18.433339157784378, + "grad_norm": 0.3875845968723297, + "learning_rate": 0.0003790244755244755, + "loss": 3.2425, + "step": 63300 + }, + { + "epoch": 18.447900285398102, + "grad_norm": 0.3876391053199768, + "learning_rate": 0.00037884965034965033, + "loss": 3.2526, + "step": 63350 + }, + { + "epoch": 18.462461413011823, + "grad_norm": 0.355400025844574, + "learning_rate": 0.00037867482517482513, + "loss": 3.2482, + "step": 63400 + }, + { + "epoch": 18.477022540625548, + "grad_norm": 0.3940849006175995, + "learning_rate": 0.0003785, + "loss": 3.239, + "step": 63450 + }, + { + "epoch": 18.49158366823927, + "grad_norm": 0.3690098226070404, + "learning_rate": 0.0003783251748251748, + "loss": 3.2422, + "step": 63500 + }, + { + "epoch": 18.50614479585299, + "grad_norm": 0.35771867632865906, + "learning_rate": 0.00037815034965034964, + "loss": 3.2373, + "step": 63550 + }, + { + "epoch": 18.520705923466714, + "grad_norm": 0.36510664224624634, + "learning_rate": 0.00037797552447552444, + "loss": 3.2422, + "step": 63600 + }, + { + "epoch": 18.535267051080435, + "grad_norm": 0.3902657628059387, + "learning_rate": 0.0003778006993006993, + "loss": 3.2532, + "step": 63650 + }, + { + "epoch": 18.54982817869416, + "grad_norm": 0.36882349848747253, + "learning_rate": 0.00037762587412587404, + "loss": 3.2641, + "step": 63700 + }, + { + "epoch": 18.56438930630788, + "grad_norm": 0.3985002934932709, + "learning_rate": 0.0003774510489510489, + "loss": 3.2609, + "step": 63750 + }, + { + "epoch": 18.5789504339216, + "grad_norm": 0.35080111026763916, + "learning_rate": 0.0003772762237762238, + "loss": 3.2476, + "step": 63800 + }, + { + "epoch": 18.593511561535326, + "grad_norm": 0.369186669588089, + 
"learning_rate": 0.00037710139860139854, + "loss": 3.2379, + "step": 63850 + }, + { + "epoch": 18.608072689149047, + "grad_norm": 0.36628544330596924, + "learning_rate": 0.0003769265734265734, + "loss": 3.2485, + "step": 63900 + }, + { + "epoch": 18.62263381676277, + "grad_norm": 0.38443052768707275, + "learning_rate": 0.0003767517482517482, + "loss": 3.2553, + "step": 63950 + }, + { + "epoch": 18.637194944376493, + "grad_norm": 0.37169235944747925, + "learning_rate": 0.00037657692307692305, + "loss": 3.2587, + "step": 64000 + }, + { + "epoch": 18.637194944376493, + "eval_accuracy": 0.3731584111460638, + "eval_loss": 3.541902780532837, + "eval_runtime": 180.7907, + "eval_samples_per_second": 92.073, + "eval_steps_per_second": 5.758, + "step": 64000 + }, + { + "epoch": 18.651756071990214, + "grad_norm": 0.3672039806842804, + "learning_rate": 0.00037640209790209785, + "loss": 3.2603, + "step": 64050 + }, + { + "epoch": 18.666317199603938, + "grad_norm": 0.45815590023994446, + "learning_rate": 0.0003762272727272727, + "loss": 3.2663, + "step": 64100 + }, + { + "epoch": 18.68087832721766, + "grad_norm": 0.3884202837944031, + "learning_rate": 0.0003760524475524475, + "loss": 3.2654, + "step": 64150 + }, + { + "epoch": 18.695439454831384, + "grad_norm": 0.3832554221153259, + "learning_rate": 0.00037587762237762236, + "loss": 3.2624, + "step": 64200 + }, + { + "epoch": 18.710000582445105, + "grad_norm": 0.36669617891311646, + "learning_rate": 0.00037570279720279716, + "loss": 3.2656, + "step": 64250 + }, + { + "epoch": 18.724561710058826, + "grad_norm": 0.38056427240371704, + "learning_rate": 0.000375527972027972, + "loss": 3.2587, + "step": 64300 + }, + { + "epoch": 18.73912283767255, + "grad_norm": 0.38066598773002625, + "learning_rate": 0.0003753531468531468, + "loss": 3.2681, + "step": 64350 + }, + { + "epoch": 18.75368396528627, + "grad_norm": 0.3981202244758606, + "learning_rate": 0.00037517832167832167, + "loss": 3.2619, + "step": 64400 + }, + { + "epoch": 
18.768245092899996, + "grad_norm": 0.4338530898094177, + "learning_rate": 0.0003750034965034965, + "loss": 3.2692, + "step": 64450 + }, + { + "epoch": 18.782806220513717, + "grad_norm": 0.3539597988128662, + "learning_rate": 0.00037482867132867127, + "loss": 3.2738, + "step": 64500 + }, + { + "epoch": 18.797367348127437, + "grad_norm": 0.37372642755508423, + "learning_rate": 0.0003746538461538462, + "loss": 3.2636, + "step": 64550 + }, + { + "epoch": 18.811928475741162, + "grad_norm": 0.4146757423877716, + "learning_rate": 0.0003744790209790209, + "loss": 3.2714, + "step": 64600 + }, + { + "epoch": 18.826489603354883, + "grad_norm": 0.38235408067703247, + "learning_rate": 0.0003743041958041958, + "loss": 3.2688, + "step": 64650 + }, + { + "epoch": 18.841050730968607, + "grad_norm": 0.368381530046463, + "learning_rate": 0.0003741293706293706, + "loss": 3.2786, + "step": 64700 + }, + { + "epoch": 18.85561185858233, + "grad_norm": 0.37366122007369995, + "learning_rate": 0.0003739545454545454, + "loss": 3.2658, + "step": 64750 + }, + { + "epoch": 18.87017298619605, + "grad_norm": 0.39176511764526367, + "learning_rate": 0.0003737797202797202, + "loss": 3.2802, + "step": 64800 + }, + { + "epoch": 18.884734113809774, + "grad_norm": 0.36762627959251404, + "learning_rate": 0.0003736048951048951, + "loss": 3.2696, + "step": 64850 + }, + { + "epoch": 18.899295241423495, + "grad_norm": 0.36578455567359924, + "learning_rate": 0.0003734300699300699, + "loss": 3.2606, + "step": 64900 + }, + { + "epoch": 18.91385636903722, + "grad_norm": 0.37496402859687805, + "learning_rate": 0.00037325524475524473, + "loss": 3.2774, + "step": 64950 + }, + { + "epoch": 18.92841749665094, + "grad_norm": 0.4118943512439728, + "learning_rate": 0.00037308041958041953, + "loss": 3.2748, + "step": 65000 + }, + { + "epoch": 18.92841749665094, + "eval_accuracy": 0.37342116357355526, + "eval_loss": 3.5314598083496094, + "eval_runtime": 181.076, + "eval_samples_per_second": 91.928, + 
"eval_steps_per_second": 5.749, + "step": 65000 + }, + { + "epoch": 18.94297862426466, + "grad_norm": 0.3894543945789337, + "learning_rate": 0.0003729055944055944, + "loss": 3.2743, + "step": 65050 + }, + { + "epoch": 18.957539751878386, + "grad_norm": 0.3797741234302521, + "learning_rate": 0.0003727307692307692, + "loss": 3.2648, + "step": 65100 + }, + { + "epoch": 18.972100879492107, + "grad_norm": 0.40677353739738464, + "learning_rate": 0.00037255594405594404, + "loss": 3.273, + "step": 65150 + }, + { + "epoch": 18.98666200710583, + "grad_norm": 0.34352266788482666, + "learning_rate": 0.0003723811188811189, + "loss": 3.2839, + "step": 65200 + }, + { + "epoch": 19.001164890209097, + "grad_norm": 0.3892625570297241, + "learning_rate": 0.00037220629370629364, + "loss": 3.2697, + "step": 65250 + }, + { + "epoch": 19.01572601782282, + "grad_norm": 0.3817722499370575, + "learning_rate": 0.00037203146853146855, + "loss": 3.1613, + "step": 65300 + }, + { + "epoch": 19.030287145436542, + "grad_norm": 0.3764374256134033, + "learning_rate": 0.0003718566433566433, + "loss": 3.1725, + "step": 65350 + }, + { + "epoch": 19.044848273050263, + "grad_norm": 0.3804764747619629, + "learning_rate": 0.00037168181818181815, + "loss": 3.1711, + "step": 65400 + }, + { + "epoch": 19.059409400663988, + "grad_norm": 0.3937837481498718, + "learning_rate": 0.00037150699300699295, + "loss": 3.175, + "step": 65450 + }, + { + "epoch": 19.07397052827771, + "grad_norm": 0.3940994441509247, + "learning_rate": 0.0003713321678321678, + "loss": 3.1962, + "step": 65500 + }, + { + "epoch": 19.088531655891433, + "grad_norm": 0.3652641177177429, + "learning_rate": 0.0003711573426573426, + "loss": 3.1868, + "step": 65550 + }, + { + "epoch": 19.103092783505154, + "grad_norm": 0.34368589520454407, + "learning_rate": 0.00037098251748251746, + "loss": 3.192, + "step": 65600 + }, + { + "epoch": 19.11765391111888, + "grad_norm": 0.388906329870224, + "learning_rate": 0.00037080769230769226, + "loss": 3.1941, + 
"step": 65650 + }, + { + "epoch": 19.1322150387326, + "grad_norm": 0.40207478404045105, + "learning_rate": 0.0003706328671328671, + "loss": 3.2025, + "step": 65700 + }, + { + "epoch": 19.14677616634632, + "grad_norm": 0.3887316882610321, + "learning_rate": 0.0003704580419580419, + "loss": 3.2037, + "step": 65750 + }, + { + "epoch": 19.161337293960045, + "grad_norm": 0.38904979825019836, + "learning_rate": 0.00037028321678321676, + "loss": 3.1945, + "step": 65800 + }, + { + "epoch": 19.175898421573766, + "grad_norm": 0.3794829845428467, + "learning_rate": 0.0003701083916083916, + "loss": 3.2061, + "step": 65850 + }, + { + "epoch": 19.19045954918749, + "grad_norm": 0.4314369261264801, + "learning_rate": 0.0003699335664335664, + "loss": 3.2102, + "step": 65900 + }, + { + "epoch": 19.20502067680121, + "grad_norm": 0.40267378091812134, + "learning_rate": 0.00036975874125874127, + "loss": 3.2061, + "step": 65950 + }, + { + "epoch": 19.219581804414933, + "grad_norm": 0.3804206848144531, + "learning_rate": 0.00036958391608391607, + "loss": 3.1828, + "step": 66000 + }, + { + "epoch": 19.219581804414933, + "eval_accuracy": 0.3726852216513643, + "eval_loss": 3.5499932765960693, + "eval_runtime": 180.7468, + "eval_samples_per_second": 92.096, + "eval_steps_per_second": 5.759, + "step": 66000 + }, + { + "epoch": 19.234142932028657, + "grad_norm": 0.4092799425125122, + "learning_rate": 0.0003694090909090909, + "loss": 3.2022, + "step": 66050 + }, + { + "epoch": 19.248704059642378, + "grad_norm": 0.3658771216869354, + "learning_rate": 0.00036923426573426567, + "loss": 3.2156, + "step": 66100 + }, + { + "epoch": 19.263265187256103, + "grad_norm": 0.3707059621810913, + "learning_rate": 0.0003690594405594405, + "loss": 3.2101, + "step": 66150 + }, + { + "epoch": 19.277826314869824, + "grad_norm": 0.3689354360103607, + "learning_rate": 0.0003688846153846153, + "loss": 3.2297, + "step": 66200 + }, + { + "epoch": 19.292387442483545, + "grad_norm": 0.4154335558414459, + "learning_rate": 
0.0003687097902097902, + "loss": 3.2106, + "step": 66250 + }, + { + "epoch": 19.30694857009727, + "grad_norm": 0.3923240303993225, + "learning_rate": 0.000368534965034965, + "loss": 3.2092, + "step": 66300 + }, + { + "epoch": 19.32150969771099, + "grad_norm": 0.3821892738342285, + "learning_rate": 0.00036836013986013983, + "loss": 3.2283, + "step": 66350 + }, + { + "epoch": 19.336070825324715, + "grad_norm": 0.4162690341472626, + "learning_rate": 0.00036818531468531463, + "loss": 3.2196, + "step": 66400 + }, + { + "epoch": 19.350631952938436, + "grad_norm": 0.3843232989311218, + "learning_rate": 0.0003680104895104895, + "loss": 3.2142, + "step": 66450 + }, + { + "epoch": 19.365193080552157, + "grad_norm": 0.37635594606399536, + "learning_rate": 0.0003678356643356643, + "loss": 3.2152, + "step": 66500 + }, + { + "epoch": 19.37975420816588, + "grad_norm": 0.3948022425174713, + "learning_rate": 0.00036766083916083914, + "loss": 3.2298, + "step": 66550 + }, + { + "epoch": 19.394315335779602, + "grad_norm": 0.39074137806892395, + "learning_rate": 0.000367486013986014, + "loss": 3.2166, + "step": 66600 + }, + { + "epoch": 19.408876463393327, + "grad_norm": 0.38576585054397583, + "learning_rate": 0.0003673111888111888, + "loss": 3.2184, + "step": 66650 + }, + { + "epoch": 19.423437591007048, + "grad_norm": 0.38614434003829956, + "learning_rate": 0.00036713636363636365, + "loss": 3.2252, + "step": 66700 + }, + { + "epoch": 19.43799871862077, + "grad_norm": 0.406095027923584, + "learning_rate": 0.00036696153846153844, + "loss": 3.2202, + "step": 66750 + }, + { + "epoch": 19.452559846234493, + "grad_norm": 0.37972956895828247, + "learning_rate": 0.0003667867132867133, + "loss": 3.2255, + "step": 66800 + }, + { + "epoch": 19.467120973848214, + "grad_norm": 0.38185039162635803, + "learning_rate": 0.00036661188811188804, + "loss": 3.2353, + "step": 66850 + }, + { + "epoch": 19.48168210146194, + "grad_norm": 0.39274418354034424, + "learning_rate": 0.0003664370629370629, + 
"loss": 3.2396, + "step": 66900 + }, + { + "epoch": 19.49624322907566, + "grad_norm": 0.39396706223487854, + "learning_rate": 0.0003662622377622377, + "loss": 3.2468, + "step": 66950 + }, + { + "epoch": 19.51080435668938, + "grad_norm": 0.37209567427635193, + "learning_rate": 0.00036608741258741255, + "loss": 3.2364, + "step": 67000 + }, + { + "epoch": 19.51080435668938, + "eval_accuracy": 0.3731644068390401, + "eval_loss": 3.5449118614196777, + "eval_runtime": 181.5019, + "eval_samples_per_second": 91.713, + "eval_steps_per_second": 5.735, + "step": 67000 + }, + { + "epoch": 19.525365484303105, + "grad_norm": 0.36750879883766174, + "learning_rate": 0.00036591258741258735, + "loss": 3.2415, + "step": 67050 + }, + { + "epoch": 19.539926611916826, + "grad_norm": 0.3621958792209625, + "learning_rate": 0.0003657377622377622, + "loss": 3.2354, + "step": 67100 + }, + { + "epoch": 19.55448773953055, + "grad_norm": 0.3803771734237671, + "learning_rate": 0.000365562937062937, + "loss": 3.241, + "step": 67150 + }, + { + "epoch": 19.56904886714427, + "grad_norm": 0.3699823319911957, + "learning_rate": 0.00036538811188811186, + "loss": 3.2311, + "step": 67200 + }, + { + "epoch": 19.583609994757992, + "grad_norm": 0.4162828028202057, + "learning_rate": 0.0003652132867132867, + "loss": 3.2361, + "step": 67250 + }, + { + "epoch": 19.598171122371717, + "grad_norm": 0.3639504909515381, + "learning_rate": 0.0003650384615384615, + "loss": 3.2508, + "step": 67300 + }, + { + "epoch": 19.612732249985438, + "grad_norm": 0.4054574966430664, + "learning_rate": 0.00036486363636363637, + "loss": 3.2472, + "step": 67350 + }, + { + "epoch": 19.627293377599162, + "grad_norm": 0.41714179515838623, + "learning_rate": 0.00036468881118881117, + "loss": 3.2421, + "step": 67400 + }, + { + "epoch": 19.641854505212883, + "grad_norm": 0.3846038281917572, + "learning_rate": 0.000364513986013986, + "loss": 3.2516, + "step": 67450 + }, + { + "epoch": 19.656415632826604, + "grad_norm": 0.35238131880760193, 
+ "learning_rate": 0.0003643391608391608, + "loss": 3.2442, + "step": 67500 + }, + { + "epoch": 19.67097676044033, + "grad_norm": 0.36373382806777954, + "learning_rate": 0.0003641643356643357, + "loss": 3.2489, + "step": 67550 + }, + { + "epoch": 19.68553788805405, + "grad_norm": 0.3687552511692047, + "learning_rate": 0.0003639895104895104, + "loss": 3.2356, + "step": 67600 + }, + { + "epoch": 19.700099015667774, + "grad_norm": 0.3758412003517151, + "learning_rate": 0.0003638146853146853, + "loss": 3.2561, + "step": 67650 + }, + { + "epoch": 19.714660143281495, + "grad_norm": 0.37209826707839966, + "learning_rate": 0.00036363986013986007, + "loss": 3.2522, + "step": 67700 + }, + { + "epoch": 19.729221270895216, + "grad_norm": 0.38578248023986816, + "learning_rate": 0.0003634650349650349, + "loss": 3.2484, + "step": 67750 + }, + { + "epoch": 19.74378239850894, + "grad_norm": 0.3813081383705139, + "learning_rate": 0.0003632902097902097, + "loss": 3.253, + "step": 67800 + }, + { + "epoch": 19.758343526122662, + "grad_norm": 0.3634510040283203, + "learning_rate": 0.0003631153846153846, + "loss": 3.2554, + "step": 67850 + }, + { + "epoch": 19.772904653736386, + "grad_norm": 0.37720900774002075, + "learning_rate": 0.00036294055944055943, + "loss": 3.2575, + "step": 67900 + }, + { + "epoch": 19.787465781350107, + "grad_norm": 0.39338457584381104, + "learning_rate": 0.00036276573426573423, + "loss": 3.254, + "step": 67950 + }, + { + "epoch": 19.802026908963832, + "grad_norm": 0.40323713421821594, + "learning_rate": 0.0003625909090909091, + "loss": 3.271, + "step": 68000 + }, + { + "epoch": 19.802026908963832, + "eval_accuracy": 0.3733834259765867, + "eval_loss": 3.5359480381011963, + "eval_runtime": 181.6023, + "eval_samples_per_second": 91.662, + "eval_steps_per_second": 5.732, + "step": 68000 + }, + { + "epoch": 19.816588036577553, + "grad_norm": 0.3981825113296509, + "learning_rate": 0.0003624160839160839, + "loss": 3.2556, + "step": 68050 + }, + { + "epoch": 
19.831149164191274, + "grad_norm": 0.3592379689216614, + "learning_rate": 0.00036224125874125874, + "loss": 3.2478, + "step": 68100 + }, + { + "epoch": 19.845710291805, + "grad_norm": 0.36781877279281616, + "learning_rate": 0.00036206643356643354, + "loss": 3.2517, + "step": 68150 + }, + { + "epoch": 19.86027141941872, + "grad_norm": 0.38581395149230957, + "learning_rate": 0.0003618916083916084, + "loss": 3.2555, + "step": 68200 + }, + { + "epoch": 19.874832547032444, + "grad_norm": 0.39512038230895996, + "learning_rate": 0.0003617167832167832, + "loss": 3.2689, + "step": 68250 + }, + { + "epoch": 19.889393674646165, + "grad_norm": 0.35403746366500854, + "learning_rate": 0.00036154195804195805, + "loss": 3.2671, + "step": 68300 + }, + { + "epoch": 19.903954802259886, + "grad_norm": 0.4128151535987854, + "learning_rate": 0.0003613671328671328, + "loss": 3.27, + "step": 68350 + }, + { + "epoch": 19.91851592987361, + "grad_norm": 0.4067288339138031, + "learning_rate": 0.00036119230769230765, + "loss": 3.2533, + "step": 68400 + }, + { + "epoch": 19.93307705748733, + "grad_norm": 0.36504778265953064, + "learning_rate": 0.00036101748251748245, + "loss": 3.2567, + "step": 68450 + }, + { + "epoch": 19.947638185101056, + "grad_norm": 0.3819653391838074, + "learning_rate": 0.0003608426573426573, + "loss": 3.265, + "step": 68500 + }, + { + "epoch": 19.962199312714777, + "grad_norm": 0.37546461820602417, + "learning_rate": 0.0003606678321678321, + "loss": 3.2596, + "step": 68550 + }, + { + "epoch": 19.976760440328498, + "grad_norm": 0.38692423701286316, + "learning_rate": 0.00036049300699300696, + "loss": 3.2577, + "step": 68600 + }, + { + "epoch": 19.991321567942222, + "grad_norm": 0.3578813672065735, + "learning_rate": 0.0003603181818181818, + "loss": 3.2568, + "step": 68650 + }, + { + "epoch": 20.005824451045488, + "grad_norm": 0.4185604751110077, + "learning_rate": 0.0003601433566433566, + "loss": 3.2322, + "step": 68700 + }, + { + "epoch": 20.020385578659212, + 
"grad_norm": 0.4104021191596985, + "learning_rate": 0.00035996853146853146, + "loss": 3.1446, + "step": 68750 + }, + { + "epoch": 20.034946706272933, + "grad_norm": 0.38897705078125, + "learning_rate": 0.00035979370629370626, + "loss": 3.1556, + "step": 68800 + }, + { + "epoch": 20.049507833886658, + "grad_norm": 0.40106043219566345, + "learning_rate": 0.0003596188811188811, + "loss": 3.1617, + "step": 68850 + }, + { + "epoch": 20.06406896150038, + "grad_norm": 0.4021047055721283, + "learning_rate": 0.0003594440559440559, + "loss": 3.1718, + "step": 68900 + }, + { + "epoch": 20.0786300891141, + "grad_norm": 0.3868863880634308, + "learning_rate": 0.00035926923076923077, + "loss": 3.1777, + "step": 68950 + }, + { + "epoch": 20.093191216727824, + "grad_norm": 0.368656188249588, + "learning_rate": 0.00035909440559440557, + "loss": 3.1782, + "step": 69000 + }, + { + "epoch": 20.093191216727824, + "eval_accuracy": 0.3730581302419697, + "eval_loss": 3.5493500232696533, + "eval_runtime": 181.0447, + "eval_samples_per_second": 91.944, + "eval_steps_per_second": 5.75, + "step": 69000 + }, + { + "epoch": 20.107752344341545, + "grad_norm": 0.37952375411987305, + "learning_rate": 0.0003589195804195804, + "loss": 3.1841, + "step": 69050 + }, + { + "epoch": 20.12231347195527, + "grad_norm": 0.3818630874156952, + "learning_rate": 0.00035874475524475517, + "loss": 3.1845, + "step": 69100 + }, + { + "epoch": 20.13687459956899, + "grad_norm": 0.4041067361831665, + "learning_rate": 0.00035856993006993, + "loss": 3.1926, + "step": 69150 + }, + { + "epoch": 20.15143572718271, + "grad_norm": 0.4008336365222931, + "learning_rate": 0.0003583951048951048, + "loss": 3.1876, + "step": 69200 + }, + { + "epoch": 20.165996854796436, + "grad_norm": 0.37791797518730164, + "learning_rate": 0.0003582202797202797, + "loss": 3.1929, + "step": 69250 + }, + { + "epoch": 20.180557982410157, + "grad_norm": 0.3881707191467285, + "learning_rate": 0.00035804545454545453, + "loss": 3.1894, + "step": 69300 + 
}, + { + "epoch": 20.19511911002388, + "grad_norm": 0.41837161779403687, + "learning_rate": 0.00035787062937062933, + "loss": 3.1876, + "step": 69350 + }, + { + "epoch": 20.209680237637603, + "grad_norm": 0.3870464563369751, + "learning_rate": 0.0003576958041958042, + "loss": 3.1971, + "step": 69400 + }, + { + "epoch": 20.224241365251324, + "grad_norm": 0.3730372190475464, + "learning_rate": 0.000357520979020979, + "loss": 3.2083, + "step": 69450 + }, + { + "epoch": 20.238802492865048, + "grad_norm": 0.3917742371559143, + "learning_rate": 0.00035734615384615384, + "loss": 3.1931, + "step": 69500 + }, + { + "epoch": 20.25336362047877, + "grad_norm": 0.39877134561538696, + "learning_rate": 0.00035717132867132864, + "loss": 3.201, + "step": 69550 + }, + { + "epoch": 20.267924748092494, + "grad_norm": 0.4186917543411255, + "learning_rate": 0.0003569965034965035, + "loss": 3.2039, + "step": 69600 + }, + { + "epoch": 20.282485875706215, + "grad_norm": 0.3791273832321167, + "learning_rate": 0.0003568216783216783, + "loss": 3.2102, + "step": 69650 + }, + { + "epoch": 20.297047003319935, + "grad_norm": 0.3736315667629242, + "learning_rate": 0.00035664685314685314, + "loss": 3.1941, + "step": 69700 + }, + { + "epoch": 20.31160813093366, + "grad_norm": 0.40645095705986023, + "learning_rate": 0.00035647202797202794, + "loss": 3.2125, + "step": 69750 + }, + { + "epoch": 20.32616925854738, + "grad_norm": 0.4175493121147156, + "learning_rate": 0.0003562972027972028, + "loss": 3.2107, + "step": 69800 + }, + { + "epoch": 20.340730386161106, + "grad_norm": 0.392880916595459, + "learning_rate": 0.00035612237762237754, + "loss": 3.2139, + "step": 69850 + }, + { + "epoch": 20.355291513774826, + "grad_norm": 0.3980697691440582, + "learning_rate": 0.0003559475524475524, + "loss": 3.2024, + "step": 69900 + }, + { + "epoch": 20.369852641388547, + "grad_norm": 0.37479138374328613, + "learning_rate": 0.0003557727272727272, + "loss": 3.2134, + "step": 69950 + }, + { + "epoch": 
20.384413769002272, + "grad_norm": 0.3928723931312561, + "learning_rate": 0.00035559790209790205, + "loss": 3.227, + "step": 70000 + }, + { + "epoch": 20.384413769002272, + "eval_accuracy": 0.37295220633272147, + "eval_loss": 3.5466055870056152, + "eval_runtime": 181.2725, + "eval_samples_per_second": 91.829, + "eval_steps_per_second": 5.743, + "step": 70000 + } + ], + "logging_steps": 50, + "max_steps": 171700, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 5 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4631308623872e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}