| { |
| "best_global_step": 75000, |
| "best_metric": 3.5301737785339355, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_1001/checkpoint-40000", |
| "epoch": 29.120857359193895, |
| "eval_steps": 1000, |
| "global_step": 100000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014561127613722406, |
| "grad_norm": 1.0156038999557495, |
| "learning_rate": 0.000294, |
| "loss": 8.436, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029122255227444813, |
| "grad_norm": 0.809311032295227, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7501, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04368338284116722, |
| "grad_norm": 0.633715033531189, |
| "learning_rate": 0.0005998286713286713, |
| "loss": 6.362, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058244510454889625, |
| "grad_norm": 0.4976341724395752, |
| "learning_rate": 0.0005996538461538461, |
| "loss": 6.1664, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07280563806861204, |
| "grad_norm": 0.464887797832489, |
| "learning_rate": 0.0005994790209790209, |
| "loss": 6.0235, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08736676568233444, |
| "grad_norm": 0.4251498579978943, |
| "learning_rate": 0.0005993041958041958, |
| "loss": 5.8761, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10192789329605685, |
| "grad_norm": 0.4090366065502167, |
| "learning_rate": 0.0005991293706293705, |
| "loss": 5.7573, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11648902090977925, |
| "grad_norm": 0.4682002365589142, |
| "learning_rate": 0.0005989545454545454, |
| "loss": 5.6386, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13105014852350166, |
| "grad_norm": 0.4403827488422394, |
| "learning_rate": 0.0005987797202797202, |
| "loss": 5.5162, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14561127613722408, |
| "grad_norm": 0.49002617597579956, |
| "learning_rate": 0.000598604895104895, |
| "loss": 5.4387, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16017240375094646, |
| "grad_norm": 0.4434022605419159, |
| "learning_rate": 0.0005984300699300698, |
| "loss": 5.3546, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17473353136466888, |
| "grad_norm": 0.565316915512085, |
| "learning_rate": 0.0005982552447552447, |
| "loss": 5.2723, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1892946589783913, |
| "grad_norm": 0.46579620242118835, |
| "learning_rate": 0.0005980804195804195, |
| "loss": 5.2164, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2038557865921137, |
| "grad_norm": 0.40206030011177063, |
| "learning_rate": 0.0005979055944055943, |
| "loss": 5.1371, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2184169142058361, |
| "grad_norm": 0.4663309156894684, |
| "learning_rate": 0.0005977307692307691, |
| "loss": 5.0956, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2329780418195585, |
| "grad_norm": 0.4851098358631134, |
| "learning_rate": 0.000597555944055944, |
| "loss": 5.038, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24753916943328091, |
| "grad_norm": 0.4971611499786377, |
| "learning_rate": 0.0005973811188811188, |
| "loss": 4.9986, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2621002970470033, |
| "grad_norm": 0.43835172057151794, |
| "learning_rate": 0.0005972062937062936, |
| "loss": 4.9594, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27666142466072574, |
| "grad_norm": 0.4245416522026062, |
| "learning_rate": 0.0005970314685314685, |
| "loss": 4.901, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29122255227444815, |
| "grad_norm": 0.4260517954826355, |
| "learning_rate": 0.0005968566433566433, |
| "loss": 4.8593, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.29122255227444815, |
| "eval_accuracy": 0.251100209661154, |
| "eval_loss": 4.784478187561035, |
| "eval_runtime": 179.3832, |
| "eval_samples_per_second": 92.796, |
| "eval_steps_per_second": 5.803, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30578367988817057, |
| "grad_norm": 0.4281257092952728, |
| "learning_rate": 0.0005966818181818181, |
| "loss": 4.8155, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3203448075018929, |
| "grad_norm": 0.44848015904426575, |
| "learning_rate": 0.0005965069930069929, |
| "loss": 4.7622, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.33490593511561534, |
| "grad_norm": 0.5026978850364685, |
| "learning_rate": 0.0005963321678321677, |
| "loss": 4.7187, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.34946706272933775, |
| "grad_norm": 0.4378901720046997, |
| "learning_rate": 0.0005961573426573425, |
| "loss": 4.689, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36402819034306017, |
| "grad_norm": 0.47806859016418457, |
| "learning_rate": 0.0005959825174825174, |
| "loss": 4.6541, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3785893179567826, |
| "grad_norm": 0.42951807379722595, |
| "learning_rate": 0.0005958076923076922, |
| "loss": 4.6137, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.393150445570505, |
| "grad_norm": 0.3968406021595001, |
| "learning_rate": 0.000595632867132867, |
| "loss": 4.5933, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.4077115731842274, |
| "grad_norm": 0.4745321571826935, |
| "learning_rate": 0.0005954580419580418, |
| "loss": 4.5585, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4222727007979498, |
| "grad_norm": 0.4153960943222046, |
| "learning_rate": 0.0005952832167832168, |
| "loss": 4.5407, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4368338284116722, |
| "grad_norm": 0.3878651261329651, |
| "learning_rate": 0.0005951083916083916, |
| "loss": 4.5259, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4513949560253946, |
| "grad_norm": 0.4079844057559967, |
| "learning_rate": 0.0005949335664335664, |
| "loss": 4.5019, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.465956083639117, |
| "grad_norm": 0.4346165060997009, |
| "learning_rate": 0.0005947587412587413, |
| "loss": 4.4782, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4805172112528394, |
| "grad_norm": 0.4144968092441559, |
| "learning_rate": 0.0005945839160839161, |
| "loss": 4.4536, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49507833886656183, |
| "grad_norm": 0.4191794693470001, |
| "learning_rate": 0.0005944090909090909, |
| "loss": 4.4498, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5096394664802842, |
| "grad_norm": 0.41704434156417847, |
| "learning_rate": 0.0005942342657342657, |
| "loss": 4.4228, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5242005940940067, |
| "grad_norm": 0.41254884004592896, |
| "learning_rate": 0.0005940594405594406, |
| "loss": 4.3967, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5387617217077291, |
| "grad_norm": 0.39104679226875305, |
| "learning_rate": 0.0005938846153846153, |
| "loss": 4.3879, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5533228493214515, |
| "grad_norm": 0.41211313009262085, |
| "learning_rate": 0.0005937097902097902, |
| "loss": 4.3635, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5678839769351739, |
| "grad_norm": 0.4436270594596863, |
| "learning_rate": 0.000593534965034965, |
| "loss": 4.3544, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5824451045488963, |
| "grad_norm": 0.41863691806793213, |
| "learning_rate": 0.0005933601398601398, |
| "loss": 4.3389, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5824451045488963, |
| "eval_accuracy": 0.2990180230530868, |
| "eval_loss": 4.286617755889893, |
| "eval_runtime": 179.9159, |
| "eval_samples_per_second": 92.521, |
| "eval_steps_per_second": 5.786, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5970062321626187, |
| "grad_norm": 0.36824026703834534, |
| "learning_rate": 0.0005931853146853146, |
| "loss": 4.3171, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6115673597763411, |
| "grad_norm": 0.37108343839645386, |
| "learning_rate": 0.0005930104895104895, |
| "loss": 4.3193, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6261284873900634, |
| "grad_norm": 0.4174569249153137, |
| "learning_rate": 0.0005928356643356643, |
| "loss": 4.305, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6406896150037859, |
| "grad_norm": 0.3838972747325897, |
| "learning_rate": 0.0005926608391608391, |
| "loss": 4.2918, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6552507426175083, |
| "grad_norm": 0.4136732518672943, |
| "learning_rate": 0.000592486013986014, |
| "loss": 4.2746, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6698118702312307, |
| "grad_norm": 0.3708474338054657, |
| "learning_rate": 0.0005923111888111888, |
| "loss": 4.2644, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6843729978449531, |
| "grad_norm": 0.3848128616809845, |
| "learning_rate": 0.0005921363636363636, |
| "loss": 4.2554, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6989341254586755, |
| "grad_norm": 0.37099912762641907, |
| "learning_rate": 0.0005919615384615384, |
| "loss": 4.2568, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7134952530723979, |
| "grad_norm": 0.42061349749565125, |
| "learning_rate": 0.0005917867132867133, |
| "loss": 4.2437, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7280563806861203, |
| "grad_norm": 0.3873692750930786, |
| "learning_rate": 0.0005916118881118881, |
| "loss": 4.2256, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7426175082998427, |
| "grad_norm": 0.39050984382629395, |
| "learning_rate": 0.0005914370629370629, |
| "loss": 4.2286, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7571786359135652, |
| "grad_norm": 0.4035472571849823, |
| "learning_rate": 0.0005912622377622377, |
| "loss": 4.2119, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7717397635272876, |
| "grad_norm": 0.38068002462387085, |
| "learning_rate": 0.0005910874125874125, |
| "loss": 4.1925, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.78630089114101, |
| "grad_norm": 0.3586716949939728, |
| "learning_rate": 0.0005909125874125873, |
| "loss": 4.1987, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8008620187547324, |
| "grad_norm": 0.3798849880695343, |
| "learning_rate": 0.0005907377622377622, |
| "loss": 4.1908, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8154231463684548, |
| "grad_norm": 0.34419625997543335, |
| "learning_rate": 0.000590562937062937, |
| "loss": 4.1849, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8299842739821772, |
| "grad_norm": 0.39560699462890625, |
| "learning_rate": 0.0005903881118881118, |
| "loss": 4.1819, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8445454015958996, |
| "grad_norm": 0.3793541193008423, |
| "learning_rate": 0.0005902132867132867, |
| "loss": 4.1643, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8591065292096219, |
| "grad_norm": 0.3944413363933563, |
| "learning_rate": 0.0005900384615384615, |
| "loss": 4.1472, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8736676568233444, |
| "grad_norm": 0.36391785740852356, |
| "learning_rate": 0.0005898636363636363, |
| "loss": 4.1582, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8736676568233444, |
| "eval_accuracy": 0.31558964818919494, |
| "eval_loss": 4.094042778015137, |
| "eval_runtime": 179.9487, |
| "eval_samples_per_second": 92.504, |
| "eval_steps_per_second": 5.785, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8882287844370668, |
| "grad_norm": 0.3777507543563843, |
| "learning_rate": 0.0005896888111888111, |
| "loss": 4.1449, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9027899120507892, |
| "grad_norm": 0.4013363718986511, |
| "learning_rate": 0.000589513986013986, |
| "loss": 4.1303, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9173510396645116, |
| "grad_norm": 0.37003064155578613, |
| "learning_rate": 0.0005893391608391608, |
| "loss": 4.1259, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.931912167278234, |
| "grad_norm": 0.3683750629425049, |
| "learning_rate": 0.0005891643356643356, |
| "loss": 4.1149, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9464732948919564, |
| "grad_norm": 0.37037304043769836, |
| "learning_rate": 0.0005889895104895104, |
| "loss": 4.1215, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9610344225056788, |
| "grad_norm": 0.3676895201206207, |
| "learning_rate": 0.0005888146853146853, |
| "loss": 4.1077, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9755955501194012, |
| "grad_norm": 0.32570791244506836, |
| "learning_rate": 0.00058863986013986, |
| "loss": 4.0894, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9901566777331237, |
| "grad_norm": 0.36267757415771484, |
| "learning_rate": 0.0005884650349650349, |
| "loss": 4.0982, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.004659560836391, |
| "grad_norm": 0.35793808102607727, |
| "learning_rate": 0.0005882902097902097, |
| "loss": 4.0777, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0192206884501136, |
| "grad_norm": 0.3839673101902008, |
| "learning_rate": 0.0005881153846153845, |
| "loss": 4.0113, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.033781816063836, |
| "grad_norm": 0.3534751534461975, |
| "learning_rate": 0.0005879405594405594, |
| "loss": 4.0028, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0483429436775584, |
| "grad_norm": 0.33864957094192505, |
| "learning_rate": 0.0005877657342657342, |
| "loss": 4.0242, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0629040712912807, |
| "grad_norm": 0.33282938599586487, |
| "learning_rate": 0.000587590909090909, |
| "loss": 4.0182, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0774651989050033, |
| "grad_norm": 0.3558276891708374, |
| "learning_rate": 0.0005874160839160838, |
| "loss": 4.0161, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0920263265187256, |
| "grad_norm": 0.3629501163959503, |
| "learning_rate": 0.0005872412587412587, |
| "loss": 4.0114, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.106587454132448, |
| "grad_norm": 0.3544020652770996, |
| "learning_rate": 0.0005870664335664335, |
| "loss": 4.0103, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.1211485817461704, |
| "grad_norm": 0.4010087251663208, |
| "learning_rate": 0.0005868916083916083, |
| "loss": 4.0004, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.135709709359893, |
| "grad_norm": 0.35543861985206604, |
| "learning_rate": 0.0005867167832167831, |
| "loss": 4.0132, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1502708369736152, |
| "grad_norm": 0.34876418113708496, |
| "learning_rate": 0.000586541958041958, |
| "loss": 4.0061, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.1648319645873377, |
| "grad_norm": 0.343862384557724, |
| "learning_rate": 0.0005863671328671328, |
| "loss": 3.998, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1648319645873377, |
| "eval_accuracy": 0.3254573832021374, |
| "eval_loss": 3.987274169921875, |
| "eval_runtime": 179.8874, |
| "eval_samples_per_second": 92.536, |
| "eval_steps_per_second": 5.787, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.17939309220106, |
| "grad_norm": 0.3759085536003113, |
| "learning_rate": 0.0005861923076923076, |
| "loss": 3.9917, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1939542198147826, |
| "grad_norm": 0.39087915420532227, |
| "learning_rate": 0.0005860174825174824, |
| "loss": 3.9679, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2085153474285049, |
| "grad_norm": 0.3662189543247223, |
| "learning_rate": 0.0005858426573426573, |
| "loss": 3.982, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2230764750422272, |
| "grad_norm": 0.3650372326374054, |
| "learning_rate": 0.000585667832167832, |
| "loss": 3.979, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2376376026559497, |
| "grad_norm": 0.3324548006057739, |
| "learning_rate": 0.000585493006993007, |
| "loss": 3.9827, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2521987302696722, |
| "grad_norm": 0.3553929924964905, |
| "learning_rate": 0.0005853181818181817, |
| "loss": 3.9791, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2667598578833945, |
| "grad_norm": 0.38200274109840393, |
| "learning_rate": 0.0005851433566433565, |
| "loss": 3.969, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2813209854971168, |
| "grad_norm": 0.35176458954811096, |
| "learning_rate": 0.0005849685314685315, |
| "loss": 3.9683, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2958821131108393, |
| "grad_norm": 0.34012216329574585, |
| "learning_rate": 0.0005847937062937063, |
| "loss": 3.9681, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3104432407245616, |
| "grad_norm": 0.3573969304561615, |
| "learning_rate": 0.0005846188811188811, |
| "loss": 3.9549, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3250043683382842, |
| "grad_norm": 0.34808942675590515, |
| "learning_rate": 0.0005844440559440559, |
| "loss": 3.9579, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3395654959520065, |
| "grad_norm": 0.33802273869514465, |
| "learning_rate": 0.0005842692307692308, |
| "loss": 3.9547, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.354126623565729, |
| "grad_norm": 0.35063666105270386, |
| "learning_rate": 0.0005840944055944056, |
| "loss": 3.9449, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.3686877511794513, |
| "grad_norm": 0.34823665022850037, |
| "learning_rate": 0.0005839195804195804, |
| "loss": 3.9457, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.3832488787931738, |
| "grad_norm": 0.3375008702278137, |
| "learning_rate": 0.0005837447552447552, |
| "loss": 3.9351, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3978100064068961, |
| "grad_norm": 0.3285793662071228, |
| "learning_rate": 0.0005835699300699301, |
| "loss": 3.942, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4123711340206184, |
| "grad_norm": 0.3183380663394928, |
| "learning_rate": 0.0005833951048951048, |
| "loss": 3.9235, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.426932261634341, |
| "grad_norm": 0.3355945348739624, |
| "learning_rate": 0.0005832202797202797, |
| "loss": 3.9332, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4414933892480635, |
| "grad_norm": 0.33554500341415405, |
| "learning_rate": 0.0005830454545454546, |
| "loss": 3.9153, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4560545168617858, |
| "grad_norm": 0.3406490385532379, |
| "learning_rate": 0.0005828706293706293, |
| "loss": 3.9194, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4560545168617858, |
| "eval_accuracy": 0.3320758053097387, |
| "eval_loss": 3.9140732288360596, |
| "eval_runtime": 179.8737, |
| "eval_samples_per_second": 92.543, |
| "eval_steps_per_second": 5.787, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.470615644475508, |
| "grad_norm": 0.31789177656173706, |
| "learning_rate": 0.0005826958041958042, |
| "loss": 3.9285, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4851767720892306, |
| "grad_norm": 0.34523099660873413, |
| "learning_rate": 0.000582520979020979, |
| "loss": 3.9205, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4997378997029531, |
| "grad_norm": 0.3127771317958832, |
| "learning_rate": 0.0005823461538461538, |
| "loss": 3.939, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5142990273166754, |
| "grad_norm": 0.3415018320083618, |
| "learning_rate": 0.0005821713286713286, |
| "loss": 3.9162, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5288601549303977, |
| "grad_norm": 0.3186185657978058, |
| "learning_rate": 0.0005819965034965035, |
| "loss": 3.9147, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5434212825441203, |
| "grad_norm": 0.33821001648902893, |
| "learning_rate": 0.0005818216783216783, |
| "loss": 3.9268, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5579824101578428, |
| "grad_norm": 0.3105230927467346, |
| "learning_rate": 0.0005816468531468531, |
| "loss": 3.9089, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.572543537771565, |
| "grad_norm": 0.33344826102256775, |
| "learning_rate": 0.0005814720279720279, |
| "loss": 3.9004, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.5871046653852874, |
| "grad_norm": 0.3113841414451599, |
| "learning_rate": 0.0005812972027972028, |
| "loss": 3.9041, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6016657929990097, |
| "grad_norm": 0.3271586298942566, |
| "learning_rate": 0.0005811223776223776, |
| "loss": 3.9142, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6162269206127322, |
| "grad_norm": 0.35432323813438416, |
| "learning_rate": 0.0005809475524475524, |
| "loss": 3.898, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6307880482264547, |
| "grad_norm": 0.3350410461425781, |
| "learning_rate": 0.0005807727272727272, |
| "loss": 3.8936, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.645349175840177, |
| "grad_norm": 0.34723180532455444, |
| "learning_rate": 0.0005805979020979021, |
| "loss": 3.9073, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6599103034538993, |
| "grad_norm": 0.34611478447914124, |
| "learning_rate": 0.0005804230769230769, |
| "loss": 3.8876, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6744714310676219, |
| "grad_norm": 0.32853081822395325, |
| "learning_rate": 0.0005802482517482517, |
| "loss": 3.8974, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6890325586813444, |
| "grad_norm": 0.3465241491794586, |
| "learning_rate": 0.0005800734265734265, |
| "loss": 3.9005, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7035936862950667, |
| "grad_norm": 0.3429538607597351, |
| "learning_rate": 0.0005798986013986013, |
| "loss": 3.8819, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.718154813908789, |
| "grad_norm": 0.3328207731246948, |
| "learning_rate": 0.0005797237762237762, |
| "loss": 3.8857, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7327159415225115, |
| "grad_norm": 0.3421238660812378, |
| "learning_rate": 0.000579548951048951, |
| "loss": 3.8775, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.747277069136234, |
| "grad_norm": 0.34112289547920227, |
| "learning_rate": 0.0005793741258741258, |
| "loss": 3.8898, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.747277069136234, |
| "eval_accuracy": 0.33747510317882234, |
| "eval_loss": 3.8535807132720947, |
| "eval_runtime": 179.9999, |
| "eval_samples_per_second": 92.478, |
| "eval_steps_per_second": 5.783, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7618381967499563, |
| "grad_norm": 0.31641003489494324, |
| "learning_rate": 0.0005791993006993006, |
| "loss": 3.8723, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7763993243636786, |
| "grad_norm": 0.3260517120361328, |
| "learning_rate": 0.0005790244755244755, |
| "loss": 3.869, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7909604519774012, |
| "grad_norm": 0.3309280276298523, |
| "learning_rate": 0.0005788496503496503, |
| "loss": 3.8801, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8055215795911237, |
| "grad_norm": 0.3414907455444336, |
| "learning_rate": 0.0005786748251748251, |
| "loss": 3.8829, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.820082707204846, |
| "grad_norm": 0.348162978887558, |
| "learning_rate": 0.0005784999999999999, |
| "loss": 3.8575, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8346438348185683, |
| "grad_norm": 0.33785954117774963, |
| "learning_rate": 0.0005783251748251748, |
| "loss": 3.8654, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8492049624322906, |
| "grad_norm": 0.34326276183128357, |
| "learning_rate": 0.0005781503496503496, |
| "loss": 3.8504, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8637660900460131, |
| "grad_norm": 0.354599267244339, |
| "learning_rate": 0.0005779755244755244, |
| "loss": 3.8621, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8783272176597356, |
| "grad_norm": 0.3137534558773041, |
| "learning_rate": 0.0005778006993006993, |
| "loss": 3.8652, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.892888345273458, |
| "grad_norm": 0.3379197120666504, |
| "learning_rate": 0.000577625874125874, |
| "loss": 3.8501, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9074494728871803, |
| "grad_norm": 0.3122377395629883, |
| "learning_rate": 0.0005774510489510489, |
| "loss": 3.8557, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9220106005009028, |
| "grad_norm": 0.31604939699172974, |
| "learning_rate": 0.0005772762237762237, |
| "loss": 3.855, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9365717281146253, |
| "grad_norm": 0.30811870098114014, |
| "learning_rate": 0.0005771013986013985, |
| "loss": 3.8529, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9511328557283476, |
| "grad_norm": 0.33653736114501953, |
| "learning_rate": 0.0005769265734265733, |
| "loss": 3.8492, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.96569398334207, |
| "grad_norm": 0.32716983556747437, |
| "learning_rate": 0.0005767517482517482, |
| "loss": 3.8549, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9802551109557924, |
| "grad_norm": 0.3329831659793854, |
| "learning_rate": 0.000576576923076923, |
| "loss": 3.8395, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.994816238569515, |
| "grad_norm": 0.3238976299762726, |
| "learning_rate": 0.0005764020979020978, |
| "loss": 3.8466, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.009319121672782, |
| "grad_norm": 0.32503843307495117, |
| "learning_rate": 0.0005762272727272726, |
| "loss": 3.7636, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.023880249286505, |
| "grad_norm": 0.320682555437088, |
| "learning_rate": 0.0005760524475524475, |
| "loss": 3.7494, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.038441376900227, |
| "grad_norm": 0.3408512771129608, |
| "learning_rate": 0.0005758776223776223, |
| "loss": 3.7319, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.038441376900227, |
| "eval_accuracy": 0.3416704423857403, |
| "eval_loss": 3.8125834465026855, |
| "eval_runtime": 179.8316, |
| "eval_samples_per_second": 92.564, |
| "eval_steps_per_second": 5.789, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0530025045139495, |
| "grad_norm": 0.3324206471443176, |
| "learning_rate": 0.0005757027972027971, |
| "loss": 3.7407, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.067563632127672, |
| "grad_norm": 0.3342064321041107, |
| "learning_rate": 0.000575527972027972, |
| "loss": 3.7655, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0821247597413945, |
| "grad_norm": 0.33558395504951477, |
| "learning_rate": 0.0005753531468531468, |
| "loss": 3.7473, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.096685887355117, |
| "grad_norm": 0.3283115029335022, |
| "learning_rate": 0.0005751783216783216, |
| "loss": 3.7578, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.111247014968839, |
| "grad_norm": 0.31441500782966614, |
| "learning_rate": 0.0005750034965034964, |
| "loss": 3.7402, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.1258081425825615, |
| "grad_norm": 0.3163776099681854, |
| "learning_rate": 0.0005748286713286712, |
| "loss": 3.7665, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.140369270196284, |
| "grad_norm": 0.3424457609653473, |
| "learning_rate": 0.000574653846153846, |
| "loss": 3.755, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1549303978100065, |
| "grad_norm": 0.3293427526950836, |
| "learning_rate": 0.000574479020979021, |
| "loss": 3.7511, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.169491525423729, |
| "grad_norm": 0.34711673855781555, |
| "learning_rate": 0.0005743041958041958, |
| "loss": 3.7509, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.184052653037451, |
| "grad_norm": 0.3269766867160797, |
| "learning_rate": 0.0005741293706293706, |
| "loss": 3.7524, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.198613780651174, |
| "grad_norm": 0.33547940850257874, |
| "learning_rate": 0.0005739545454545454, |
| "loss": 3.7581, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.213174908264896, |
| "grad_norm": 0.3282161355018616, |
| "learning_rate": 0.0005737797202797203, |
| "loss": 3.7511, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2277360358786185, |
| "grad_norm": 0.33413347601890564, |
| "learning_rate": 0.0005736048951048951, |
| "loss": 3.757, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.2422971634923408, |
| "grad_norm": 0.33424875140190125, |
| "learning_rate": 0.0005734300699300699, |
| "loss": 3.7578, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.256858291106063, |
| "grad_norm": 0.34025800228118896, |
| "learning_rate": 0.0005732552447552448, |
| "loss": 3.7542, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.271419418719786, |
| "grad_norm": 0.32362252473831177, |
| "learning_rate": 0.0005730804195804196, |
| "loss": 3.7431, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.285980546333508, |
| "grad_norm": 0.3169853687286377, |
| "learning_rate": 0.0005729055944055944, |
| "loss": 3.7436, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.3005416739472304, |
| "grad_norm": 0.3264821469783783, |
| "learning_rate": 0.0005727307692307692, |
| "loss": 3.753, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.3151028015609527, |
| "grad_norm": 0.3273603916168213, |
| "learning_rate": 0.0005725559440559441, |
| "loss": 3.7553, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.3296639291746755, |
| "grad_norm": 0.3181568384170532, |
| "learning_rate": 0.0005723811188811188, |
| "loss": 3.7529, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3296639291746755, |
| "eval_accuracy": 0.3447504651364561, |
| "eval_loss": 3.782167673110962, |
| "eval_runtime": 179.8459, |
| "eval_samples_per_second": 92.557, |
| "eval_steps_per_second": 5.788, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3442250567883978, |
| "grad_norm": 0.3199383020401001, |
| "learning_rate": 0.0005722062937062937, |
| "loss": 3.7563, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.35878618440212, |
| "grad_norm": 0.34408700466156006, |
| "learning_rate": 0.0005720314685314685, |
| "loss": 3.7563, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.3733473120158424, |
| "grad_norm": 0.3332165479660034, |
| "learning_rate": 0.0005718566433566433, |
| "loss": 3.7544, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.387908439629565, |
| "grad_norm": 0.3418613076210022, |
| "learning_rate": 0.0005716818181818181, |
| "loss": 3.7503, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4024695672432874, |
| "grad_norm": 0.30788588523864746, |
| "learning_rate": 0.000571506993006993, |
| "loss": 3.7567, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4170306948570097, |
| "grad_norm": 0.3155195415019989, |
| "learning_rate": 0.0005713321678321678, |
| "loss": 3.7583, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.431591822470732, |
| "grad_norm": 0.3371559679508209, |
| "learning_rate": 0.0005711573426573426, |
| "loss": 3.7476, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4461529500844543, |
| "grad_norm": 0.3333493173122406, |
| "learning_rate": 0.0005709825174825175, |
| "loss": 3.7481, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.460714077698177, |
| "grad_norm": 0.3173786997795105, |
| "learning_rate": 0.0005708076923076923, |
| "loss": 3.7538, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4752752053118994, |
| "grad_norm": 0.31761112809181213, |
| "learning_rate": 0.0005706328671328671, |
| "loss": 3.747, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4898363329256217, |
| "grad_norm": 0.3122101128101349, |
| "learning_rate": 0.0005704580419580419, |
| "loss": 3.7529, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.5043974605393444, |
| "grad_norm": 0.3232831060886383, |
| "learning_rate": 0.0005702832167832168, |
| "loss": 3.7452, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5189585881530667, |
| "grad_norm": 0.3366617262363434, |
| "learning_rate": 0.0005701083916083916, |
| "loss": 3.7498, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.533519715766789, |
| "grad_norm": 0.3086051046848297, |
| "learning_rate": 0.0005699335664335664, |
| "loss": 3.7435, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.5480808433805113, |
| "grad_norm": 0.3077169358730316, |
| "learning_rate": 0.0005697587412587412, |
| "loss": 3.7315, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.5626419709942336, |
| "grad_norm": 0.3069632947444916, |
| "learning_rate": 0.000569583916083916, |
| "loss": 3.7473, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5772030986079564, |
| "grad_norm": 0.31771939992904663, |
| "learning_rate": 0.0005694090909090908, |
| "loss": 3.7403, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.5917642262216787, |
| "grad_norm": 0.31766974925994873, |
| "learning_rate": 0.0005692342657342657, |
| "loss": 3.7379, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.606325353835401, |
| "grad_norm": 0.2977115511894226, |
| "learning_rate": 0.0005690594405594405, |
| "loss": 3.7372, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6208864814491233, |
| "grad_norm": 0.31320396065711975, |
| "learning_rate": 0.0005688846153846153, |
| "loss": 3.7456, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6208864814491233, |
| "eval_accuracy": 0.3476051203688268, |
| "eval_loss": 3.753355026245117, |
| "eval_runtime": 179.9413, |
| "eval_samples_per_second": 92.508, |
| "eval_steps_per_second": 5.785, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6354476090628456, |
| "grad_norm": 0.3366906940937042, |
| "learning_rate": 0.0005687097902097901, |
| "loss": 3.7334, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6500087366765683, |
| "grad_norm": 0.3280063569545746, |
| "learning_rate": 0.000568534965034965, |
| "loss": 3.7329, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6645698642902906, |
| "grad_norm": 0.3041098415851593, |
| "learning_rate": 0.0005683601398601398, |
| "loss": 3.7356, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.679130991904013, |
| "grad_norm": 0.32193732261657715, |
| "learning_rate": 0.0005681853146853146, |
| "loss": 3.7403, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.6936921195177357, |
| "grad_norm": 0.3207952082157135, |
| "learning_rate": 0.0005680104895104895, |
| "loss": 3.7314, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.708253247131458, |
| "grad_norm": 0.304932564496994, |
| "learning_rate": 0.0005678356643356643, |
| "loss": 3.7318, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.7228143747451803, |
| "grad_norm": 0.3156920373439789, |
| "learning_rate": 0.0005676608391608391, |
| "loss": 3.7389, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.7373755023589026, |
| "grad_norm": 0.3496898412704468, |
| "learning_rate": 0.0005674860139860139, |
| "loss": 3.7481, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.751936629972625, |
| "grad_norm": 0.32160428166389465, |
| "learning_rate": 0.0005673111888111888, |
| "loss": 3.7133, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.7664977575863476, |
| "grad_norm": 0.35244813561439514, |
| "learning_rate": 0.0005671363636363635, |
| "loss": 3.7272, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.78105888520007, |
| "grad_norm": 0.31820884346961975, |
| "learning_rate": 0.0005669615384615384, |
| "loss": 3.7372, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7956200128137922, |
| "grad_norm": 0.3161890208721161, |
| "learning_rate": 0.0005667867132867132, |
| "loss": 3.7296, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.8101811404275145, |
| "grad_norm": 0.3274824917316437, |
| "learning_rate": 0.000566611888111888, |
| "loss": 3.7331, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.824742268041237, |
| "grad_norm": 0.32894909381866455, |
| "learning_rate": 0.0005664370629370628, |
| "loss": 3.7276, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8393033956549596, |
| "grad_norm": 0.33568722009658813, |
| "learning_rate": 0.0005662622377622377, |
| "loss": 3.7206, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.853864523268682, |
| "grad_norm": 0.31120970845222473, |
| "learning_rate": 0.0005660874125874125, |
| "loss": 3.7306, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.868425650882404, |
| "grad_norm": 0.3126221001148224, |
| "learning_rate": 0.0005659125874125873, |
| "loss": 3.7359, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.882986778496127, |
| "grad_norm": 0.32586759328842163, |
| "learning_rate": 0.0005657377622377622, |
| "loss": 3.727, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.8975479061098492, |
| "grad_norm": 0.3100239336490631, |
| "learning_rate": 0.000565562937062937, |
| "loss": 3.7286, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.9121090337235715, |
| "grad_norm": 0.312603235244751, |
| "learning_rate": 0.0005653881118881118, |
| "loss": 3.7211, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9121090337235715, |
| "eval_accuracy": 0.34975263651781435, |
| "eval_loss": 3.728771209716797, |
| "eval_runtime": 180.2577, |
| "eval_samples_per_second": 92.346, |
| "eval_steps_per_second": 5.775, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.926670161337294, |
| "grad_norm": 0.3169952630996704, |
| "learning_rate": 0.0005652132867132866, |
| "loss": 3.73, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.941231288951016, |
| "grad_norm": 0.34371045231819153, |
| "learning_rate": 0.0005650384615384615, |
| "loss": 3.7237, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.955792416564739, |
| "grad_norm": 0.3246004283428192, |
| "learning_rate": 0.0005648636363636363, |
| "loss": 3.72, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.970353544178461, |
| "grad_norm": 0.3151702284812927, |
| "learning_rate": 0.0005646888111888111, |
| "loss": 3.71, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.9849146717921835, |
| "grad_norm": 0.3169575035572052, |
| "learning_rate": 0.000564513986013986, |
| "loss": 3.7235, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9994757994059063, |
| "grad_norm": 0.319948673248291, |
| "learning_rate": 0.0005643391608391607, |
| "loss": 3.7198, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0139786825091734, |
| "grad_norm": 0.3059089779853821, |
| "learning_rate": 0.0005641643356643355, |
| "loss": 3.6236, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0285398101228957, |
| "grad_norm": 0.3222568929195404, |
| "learning_rate": 0.0005639895104895105, |
| "loss": 3.6094, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.0431009377366185, |
| "grad_norm": 0.33031460642814636, |
| "learning_rate": 0.0005638146853146853, |
| "loss": 3.6259, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.057662065350341, |
| "grad_norm": 0.3451738655567169, |
| "learning_rate": 0.0005636398601398601, |
| "loss": 3.6174, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.072223192964063, |
| "grad_norm": 0.3100563585758209, |
| "learning_rate": 0.000563465034965035, |
| "loss": 3.619, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0867843205777854, |
| "grad_norm": 0.3191082775592804, |
| "learning_rate": 0.0005632902097902098, |
| "loss": 3.6134, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.101345448191508, |
| "grad_norm": 0.31113263964653015, |
| "learning_rate": 0.0005631153846153846, |
| "loss": 3.64, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.1159065758052304, |
| "grad_norm": 0.31913626194000244, |
| "learning_rate": 0.0005629405594405594, |
| "loss": 3.6148, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.1304677034189528, |
| "grad_norm": 0.34938791394233704, |
| "learning_rate": 0.0005627657342657343, |
| "loss": 3.6388, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.145028831032675, |
| "grad_norm": 0.3137979209423065, |
| "learning_rate": 0.0005625909090909091, |
| "loss": 3.637, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1595899586463974, |
| "grad_norm": 0.3257555067539215, |
| "learning_rate": 0.0005624160839160839, |
| "loss": 3.622, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.17415108626012, |
| "grad_norm": 0.3360476493835449, |
| "learning_rate": 0.0005622412587412587, |
| "loss": 3.6168, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1887122138738424, |
| "grad_norm": 0.32247471809387207, |
| "learning_rate": 0.0005620664335664336, |
| "loss": 3.6371, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2032733414875647, |
| "grad_norm": 0.3221111595630646, |
| "learning_rate": 0.0005618916083916083, |
| "loss": 3.6308, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2032733414875647, |
| "eval_accuracy": 0.3516110662152576, |
| "eval_loss": 3.71530818939209, |
| "eval_runtime": 180.0716, |
| "eval_samples_per_second": 92.441, |
| "eval_steps_per_second": 5.781, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.217834469101287, |
| "grad_norm": 0.328537255525589, |
| "learning_rate": 0.0005617167832167832, |
| "loss": 3.6442, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2323955967150098, |
| "grad_norm": 0.3117297887802124, |
| "learning_rate": 0.000561541958041958, |
| "loss": 3.6432, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.246956724328732, |
| "grad_norm": 0.3149890899658203, |
| "learning_rate": 0.0005613671328671328, |
| "loss": 3.6417, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.2615178519424544, |
| "grad_norm": 0.33611249923706055, |
| "learning_rate": 0.0005611923076923077, |
| "loss": 3.6445, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2760789795561767, |
| "grad_norm": 0.3154550790786743, |
| "learning_rate": 0.0005610174825174825, |
| "loss": 3.6354, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.2906401071698994, |
| "grad_norm": 0.32652929425239563, |
| "learning_rate": 0.0005608426573426573, |
| "loss": 3.6504, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.3052012347836217, |
| "grad_norm": 0.3078729808330536, |
| "learning_rate": 0.0005606678321678321, |
| "loss": 3.6414, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.319762362397344, |
| "grad_norm": 0.3264888525009155, |
| "learning_rate": 0.000560493006993007, |
| "loss": 3.641, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3343234900110663, |
| "grad_norm": 0.32761842012405396, |
| "learning_rate": 0.0005603181818181818, |
| "loss": 3.6376, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.3488846176247886, |
| "grad_norm": 0.32418113946914673, |
| "learning_rate": 0.0005601433566433566, |
| "loss": 3.6318, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3634457452385114, |
| "grad_norm": 0.31711092591285706, |
| "learning_rate": 0.0005599685314685314, |
| "loss": 3.6466, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.3780068728522337, |
| "grad_norm": 0.3332130014896393, |
| "learning_rate": 0.0005597937062937063, |
| "loss": 3.644, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.392568000465956, |
| "grad_norm": 0.3066932260990143, |
| "learning_rate": 0.0005596188811188811, |
| "loss": 3.6239, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4071291280796787, |
| "grad_norm": 0.3358692228794098, |
| "learning_rate": 0.0005594440559440559, |
| "loss": 3.6383, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.421690255693401, |
| "grad_norm": 0.3365779221057892, |
| "learning_rate": 0.0005592692307692307, |
| "loss": 3.6465, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.4362513833071233, |
| "grad_norm": 0.3120969235897064, |
| "learning_rate": 0.0005590944055944055, |
| "loss": 3.6376, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4508125109208456, |
| "grad_norm": 0.3177337944507599, |
| "learning_rate": 0.0005589195804195803, |
| "loss": 3.6378, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.465373638534568, |
| "grad_norm": 0.32334715127944946, |
| "learning_rate": 0.0005587447552447552, |
| "loss": 3.6375, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4799347661482907, |
| "grad_norm": 0.3454623818397522, |
| "learning_rate": 0.00055856993006993, |
| "loss": 3.6455, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.494495893762013, |
| "grad_norm": 0.30331259965896606, |
| "learning_rate": 0.0005583951048951048, |
| "loss": 3.6491, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.494495893762013, |
| "eval_accuracy": 0.35328621580779734, |
| "eval_loss": 3.694920539855957, |
| "eval_runtime": 180.3415, |
| "eval_samples_per_second": 92.303, |
| "eval_steps_per_second": 5.772, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.5090570213757353, |
| "grad_norm": 0.3378197252750397, |
| "learning_rate": 0.0005582202797202797, |
| "loss": 3.6428, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.523618148989458, |
| "grad_norm": 0.3206309378147125, |
| "learning_rate": 0.0005580454545454545, |
| "loss": 3.6469, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.53817927660318, |
| "grad_norm": 0.33509668707847595, |
| "learning_rate": 0.0005578706293706293, |
| "loss": 3.6422, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.5527404042169026, |
| "grad_norm": 0.3330724239349365, |
| "learning_rate": 0.0005576958041958041, |
| "loss": 3.6415, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.567301531830625, |
| "grad_norm": 0.30723437666893005, |
| "learning_rate": 0.000557520979020979, |
| "loss": 3.6441, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5818626594443472, |
| "grad_norm": 0.32412177324295044, |
| "learning_rate": 0.0005573461538461538, |
| "loss": 3.6435, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.59642378705807, |
| "grad_norm": 0.33709919452667236, |
| "learning_rate": 0.0005571713286713286, |
| "loss": 3.6397, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6109849146717923, |
| "grad_norm": 0.32330116629600525, |
| "learning_rate": 0.0005569965034965034, |
| "loss": 3.6426, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.6255460422855146, |
| "grad_norm": 0.32359299063682556, |
| "learning_rate": 0.0005568216783216783, |
| "loss": 3.6532, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.640107169899237, |
| "grad_norm": 0.3221568763256073, |
| "learning_rate": 0.000556646853146853, |
| "loss": 3.6437, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.654668297512959, |
| "grad_norm": 0.3112320005893707, |
| "learning_rate": 0.0005564720279720279, |
| "loss": 3.651, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.669229425126682, |
| "grad_norm": 0.3220657408237457, |
| "learning_rate": 0.0005562972027972027, |
| "loss": 3.6339, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.6837905527404042, |
| "grad_norm": 0.3259319067001343, |
| "learning_rate": 0.0005561223776223775, |
| "loss": 3.6549, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.6983516803541265, |
| "grad_norm": 0.3208153545856476, |
| "learning_rate": 0.0005559475524475524, |
| "loss": 3.6423, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7129128079678493, |
| "grad_norm": 0.31334730982780457, |
| "learning_rate": 0.0005557727272727272, |
| "loss": 3.646, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7274739355815716, |
| "grad_norm": 0.32748448848724365, |
| "learning_rate": 0.000555597902097902, |
| "loss": 3.6424, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.742035063195294, |
| "grad_norm": 0.3044220209121704, |
| "learning_rate": 0.0005554230769230768, |
| "loss": 3.6371, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.756596190809016, |
| "grad_norm": 0.3228132426738739, |
| "learning_rate": 0.0005552482517482517, |
| "loss": 3.6433, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.7711573184227385, |
| "grad_norm": 0.3331652283668518, |
| "learning_rate": 0.0005550734265734265, |
| "loss": 3.6409, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7857184460364612, |
| "grad_norm": 0.30985894799232483, |
| "learning_rate": 0.0005548986013986013, |
| "loss": 3.6444, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7857184460364612, |
| "eval_accuracy": 0.35506799468522965, |
| "eval_loss": 3.6784751415252686, |
| "eval_runtime": 179.8543, |
| "eval_samples_per_second": 92.553, |
| "eval_steps_per_second": 5.788, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.8002795736501835, |
| "grad_norm": 0.3334255814552307, |
| "learning_rate": 0.0005547237762237761, |
| "loss": 3.63, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.814840701263906, |
| "grad_norm": 0.3209901452064514, |
| "learning_rate": 0.000554548951048951, |
| "loss": 3.6409, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.829401828877628, |
| "grad_norm": 0.3354300856590271, |
| "learning_rate": 0.0005543741258741258, |
| "loss": 3.6484, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8439629564913504, |
| "grad_norm": 0.3089316189289093, |
| "learning_rate": 0.0005541993006993006, |
| "loss": 3.6433, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.858524084105073, |
| "grad_norm": 0.2953662872314453, |
| "learning_rate": 0.0005540244755244756, |
| "loss": 3.6282, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.8730852117187955, |
| "grad_norm": 0.3339363932609558, |
| "learning_rate": 0.0005538496503496502, |
| "loss": 3.6408, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.887646339332518, |
| "grad_norm": 0.32135921716690063, |
| "learning_rate": 0.0005536748251748252, |
| "loss": 3.6392, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9022074669462405, |
| "grad_norm": 0.3284589946269989, |
| "learning_rate": 0.0005535, |
| "loss": 3.6308, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.916768594559963, |
| "grad_norm": 0.3352121114730835, |
| "learning_rate": 0.0005533251748251748, |
| "loss": 3.6474, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.931329722173685, |
| "grad_norm": 0.2903008759021759, |
| "learning_rate": 0.0005531503496503496, |
| "loss": 3.6451, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.9458908497874075, |
| "grad_norm": 0.3002571165561676, |
| "learning_rate": 0.0005529755244755245, |
| "loss": 3.633, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.9604519774011298, |
| "grad_norm": 0.31052255630493164, |
| "learning_rate": 0.0005528006993006993, |
| "loss": 3.645, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.9750131050148525, |
| "grad_norm": 0.31419000029563904, |
| "learning_rate": 0.0005526258741258741, |
| "loss": 3.6543, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.989574232628575, |
| "grad_norm": 0.3141070306301117, |
| "learning_rate": 0.0005524510489510489, |
| "loss": 3.6398, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.004077115731842, |
| "grad_norm": 0.34409621357917786, |
| "learning_rate": 0.0005522762237762238, |
| "loss": 3.6174, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.018638243345564, |
| "grad_norm": 0.3311491310596466, |
| "learning_rate": 0.0005521013986013986, |
| "loss": 3.5164, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.033199370959287, |
| "grad_norm": 0.3100874722003937, |
| "learning_rate": 0.0005519265734265734, |
| "loss": 3.53, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.04776049857301, |
| "grad_norm": 0.33125829696655273, |
| "learning_rate": 0.0005517517482517482, |
| "loss": 3.5464, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.062321626186732, |
| "grad_norm": 0.3169455826282501, |
| "learning_rate": 0.0005515769230769231, |
| "loss": 3.5338, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.076882753800454, |
| "grad_norm": 0.32324185967445374, |
| "learning_rate": 0.0005514020979020979, |
| "loss": 3.546, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.076882753800454, |
| "eval_accuracy": 0.35623974119297364, |
| "eval_loss": 3.6714980602264404, |
| "eval_runtime": 179.8923, |
| "eval_samples_per_second": 92.533, |
| "eval_steps_per_second": 5.787, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.091443881414177, |
| "grad_norm": 0.31944194436073303, |
| "learning_rate": 0.0005512272727272727, |
| "loss": 3.5414, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.106005009027899, |
| "grad_norm": 0.32807984948158264, |
| "learning_rate": 0.0005510524475524475, |
| "loss": 3.5496, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.120566136641622, |
| "grad_norm": 0.32585006952285767, |
| "learning_rate": 0.0005508776223776223, |
| "loss": 3.5529, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.135127264255344, |
| "grad_norm": 0.31170549988746643, |
| "learning_rate": 0.0005507027972027972, |
| "loss": 3.549, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.149688391869066, |
| "grad_norm": 0.32387596368789673, |
| "learning_rate": 0.000550527972027972, |
| "loss": 3.5506, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.164249519482789, |
| "grad_norm": 0.30805736780166626, |
| "learning_rate": 0.0005503531468531468, |
| "loss": 3.5578, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.178810647096511, |
| "grad_norm": 0.3185058832168579, |
| "learning_rate": 0.0005501783216783216, |
| "loss": 3.5592, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.193371774710234, |
| "grad_norm": 0.3130621314048767, |
| "learning_rate": 0.0005500034965034965, |
| "loss": 3.5591, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.207932902323956, |
| "grad_norm": 0.3218880593776703, |
| "learning_rate": 0.0005498286713286713, |
| "loss": 3.5626, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.222494029937678, |
| "grad_norm": 0.3212413191795349, |
| "learning_rate": 0.0005496538461538461, |
| "loss": 3.5614, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.237055157551401, |
| "grad_norm": 0.3139151930809021, |
| "learning_rate": 0.0005494790209790209, |
| "loss": 3.5612, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.251616285165123, |
| "grad_norm": 0.3274683654308319, |
| "learning_rate": 0.0005493041958041958, |
| "loss": 3.5634, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.266177412778846, |
| "grad_norm": 0.3314908742904663, |
| "learning_rate": 0.0005491293706293706, |
| "loss": 3.558, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.280738540392568, |
| "grad_norm": 0.3365668058395386, |
| "learning_rate": 0.0005489545454545454, |
| "loss": 3.5707, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.29529966800629, |
| "grad_norm": 0.3486507534980774, |
| "learning_rate": 0.0005487797202797203, |
| "loss": 3.566, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.309860795620013, |
| "grad_norm": 0.3329235911369324, |
| "learning_rate": 0.000548604895104895, |
| "loss": 3.5603, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.324421923233735, |
| "grad_norm": 0.3063100576400757, |
| "learning_rate": 0.0005484300699300699, |
| "loss": 3.5618, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.338983050847458, |
| "grad_norm": 0.32201799750328064, |
| "learning_rate": 0.0005482552447552447, |
| "loss": 3.5755, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.35354417846118, |
| "grad_norm": 0.32739633321762085, |
| "learning_rate": 0.0005480804195804195, |
| "loss": 3.5735, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.368105306074902, |
| "grad_norm": 0.3276308476924896, |
| "learning_rate": 0.0005479055944055943, |
| "loss": 3.5738, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.368105306074902, |
| "eval_accuracy": 0.35739702750001, |
| "eval_loss": 3.6611199378967285, |
| "eval_runtime": 179.8719, |
| "eval_samples_per_second": 92.544, |
| "eval_steps_per_second": 5.787, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.382666433688625, |
| "grad_norm": 0.31093257665634155, |
| "learning_rate": 0.0005477307692307692, |
| "loss": 3.5744, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.397227561302348, |
| "grad_norm": 0.3235359489917755, |
| "learning_rate": 0.000547555944055944, |
| "loss": 3.5776, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.41178868891607, |
| "grad_norm": 0.3139866888523102, |
| "learning_rate": 0.0005473811188811188, |
| "loss": 3.5715, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.426349816529792, |
| "grad_norm": 0.31879371404647827, |
| "learning_rate": 0.0005472062937062936, |
| "loss": 3.5628, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.440910944143514, |
| "grad_norm": 0.3083356022834778, |
| "learning_rate": 0.0005470314685314685, |
| "loss": 3.5836, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.455472071757237, |
| "grad_norm": 0.3144471347332001, |
| "learning_rate": 0.0005468566433566433, |
| "loss": 3.5615, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.47003319937096, |
| "grad_norm": 0.33122745156288147, |
| "learning_rate": 0.0005466818181818181, |
| "loss": 3.5764, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.4845943269846815, |
| "grad_norm": 0.31733083724975586, |
| "learning_rate": 0.000546506993006993, |
| "loss": 3.5728, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.499155454598404, |
| "grad_norm": 0.33638572692871094, |
| "learning_rate": 0.0005463321678321678, |
| "loss": 3.5689, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.513716582212126, |
| "grad_norm": 0.32364147901535034, |
| "learning_rate": 0.0005461573426573426, |
| "loss": 3.5732, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.528277709825849, |
| "grad_norm": 0.33318740129470825, |
| "learning_rate": 0.0005459825174825174, |
| "loss": 3.5674, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.542838837439572, |
| "grad_norm": 0.31503283977508545, |
| "learning_rate": 0.0005458076923076922, |
| "loss": 3.5789, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.5573999650532935, |
| "grad_norm": 0.3540826737880707, |
| "learning_rate": 0.000545632867132867, |
| "loss": 3.5826, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.571961092667016, |
| "grad_norm": 0.32796967029571533, |
| "learning_rate": 0.0005454580419580419, |
| "loss": 3.5875, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.586522220280738, |
| "grad_norm": 0.30915871262550354, |
| "learning_rate": 0.0005452832167832167, |
| "loss": 3.5746, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.601083347894461, |
| "grad_norm": 0.3040575385093689, |
| "learning_rate": 0.0005451083916083915, |
| "loss": 3.5834, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.615644475508184, |
| "grad_norm": 0.3184298872947693, |
| "learning_rate": 0.0005449335664335663, |
| "loss": 3.574, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.630205603121905, |
| "grad_norm": 0.33443188667297363, |
| "learning_rate": 0.0005447587412587412, |
| "loss": 3.5729, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.644766730735628, |
| "grad_norm": 0.3360270857810974, |
| "learning_rate": 0.000544583916083916, |
| "loss": 3.5826, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.659327858349351, |
| "grad_norm": 0.3119940459728241, |
| "learning_rate": 0.0005444090909090908, |
| "loss": 3.5847, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.659327858349351, |
| "eval_accuracy": 0.35877356806980776, |
| "eval_loss": 3.6469807624816895, |
| "eval_runtime": 179.9979, |
| "eval_samples_per_second": 92.479, |
| "eval_steps_per_second": 5.783, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.673888985963073, |
| "grad_norm": 0.3362326920032501, |
| "learning_rate": 0.0005442342657342657, |
| "loss": 3.5745, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.6884501135767955, |
| "grad_norm": 0.2989421486854553, |
| "learning_rate": 0.0005440594405594405, |
| "loss": 3.5836, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.703011241190518, |
| "grad_norm": 0.3141367435455322, |
| "learning_rate": 0.0005438846153846153, |
| "loss": 3.575, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.71757236880424, |
| "grad_norm": 0.32469645142555237, |
| "learning_rate": 0.0005437097902097901, |
| "loss": 3.5671, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.732133496417963, |
| "grad_norm": 0.30097395181655884, |
| "learning_rate": 0.0005435349650349651, |
| "loss": 3.5793, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.746694624031685, |
| "grad_norm": 0.3281540274620056, |
| "learning_rate": 0.0005433601398601397, |
| "loss": 3.5755, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.7612557516454075, |
| "grad_norm": 0.31318652629852295, |
| "learning_rate": 0.0005431853146853147, |
| "loss": 3.5767, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.77581687925913, |
| "grad_norm": 0.32615000009536743, |
| "learning_rate": 0.0005430104895104895, |
| "loss": 3.5756, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.790378006872852, |
| "grad_norm": 0.30325064063072205, |
| "learning_rate": 0.0005428356643356643, |
| "loss": 3.5745, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.804939134486575, |
| "grad_norm": 0.3084414005279541, |
| "learning_rate": 0.0005426608391608391, |
| "loss": 3.5711, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.819500262100297, |
| "grad_norm": 0.31482434272766113, |
| "learning_rate": 0.000542486013986014, |
| "loss": 3.5746, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.834061389714019, |
| "grad_norm": 0.3251674175262451, |
| "learning_rate": 0.0005423111888111888, |
| "loss": 3.5809, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.848622517327742, |
| "grad_norm": 0.32683202624320984, |
| "learning_rate": 0.0005421363636363636, |
| "loss": 3.5704, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.863183644941464, |
| "grad_norm": 0.32508617639541626, |
| "learning_rate": 0.0005419615384615385, |
| "loss": 3.5743, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.877744772555187, |
| "grad_norm": 0.3222089409828186, |
| "learning_rate": 0.0005417867132867133, |
| "loss": 3.5759, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.892305900168909, |
| "grad_norm": 0.33162739872932434, |
| "learning_rate": 0.0005416118881118881, |
| "loss": 3.577, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.906867027782631, |
| "grad_norm": 0.3083624839782715, |
| "learning_rate": 0.0005414370629370629, |
| "loss": 3.5874, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.921428155396354, |
| "grad_norm": 0.314523309469223, |
| "learning_rate": 0.0005412622377622378, |
| "loss": 3.5742, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.935989283010076, |
| "grad_norm": 0.32406067848205566, |
| "learning_rate": 0.0005410874125874126, |
| "loss": 3.5783, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.950550410623799, |
| "grad_norm": 0.30304184556007385, |
| "learning_rate": 0.0005409125874125874, |
| "loss": 3.5793, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.950550410623799, |
| "eval_accuracy": 0.3598104702668883, |
| "eval_loss": 3.6329519748687744, |
| "eval_runtime": 180.2241, |
| "eval_samples_per_second": 92.363, |
| "eval_steps_per_second": 5.776, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.9651115382375215, |
| "grad_norm": 0.33693137764930725, |
| "learning_rate": 0.0005407377622377622, |
| "loss": 3.5716, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.979672665851243, |
| "grad_norm": 0.32594335079193115, |
| "learning_rate": 0.000540562937062937, |
| "loss": 3.5774, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.994233793464966, |
| "grad_norm": 0.3311070501804352, |
| "learning_rate": 0.0005403881118881118, |
| "loss": 3.5736, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.008736676568233, |
| "grad_norm": 0.33527490496635437, |
| "learning_rate": 0.0005402132867132867, |
| "loss": 3.5133, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.023297804181956, |
| "grad_norm": 0.3373531699180603, |
| "learning_rate": 0.0005400384615384615, |
| "loss": 3.4574, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.037858931795678, |
| "grad_norm": 0.3367919921875, |
| "learning_rate": 0.0005398636363636363, |
| "loss": 3.4778, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.052420059409401, |
| "grad_norm": 0.324205607175827, |
| "learning_rate": 0.0005396888111888111, |
| "loss": 3.4857, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.066981187023123, |
| "grad_norm": 0.3194437325000763, |
| "learning_rate": 0.000539513986013986, |
| "loss": 3.4846, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.081542314636845, |
| "grad_norm": 0.3172489106655121, |
| "learning_rate": 0.0005393391608391608, |
| "loss": 3.4697, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.096103442250568, |
| "grad_norm": 0.3533608913421631, |
| "learning_rate": 0.0005391643356643356, |
| "loss": 3.4824, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.110664569864291, |
| "grad_norm": 0.3347857594490051, |
| "learning_rate": 0.0005389895104895105, |
| "loss": 3.4885, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.125225697478013, |
| "grad_norm": 0.3107113540172577, |
| "learning_rate": 0.0005388146853146853, |
| "loss": 3.4907, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.139786825091735, |
| "grad_norm": 0.32970553636550903, |
| "learning_rate": 0.0005386398601398601, |
| "loss": 3.4955, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.154347952705457, |
| "grad_norm": 0.3255005180835724, |
| "learning_rate": 0.0005384650349650349, |
| "loss": 3.4872, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.16890908031918, |
| "grad_norm": 0.31805264949798584, |
| "learning_rate": 0.0005382902097902098, |
| "loss": 3.5085, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.183470207932903, |
| "grad_norm": 0.33333975076675415, |
| "learning_rate": 0.0005381153846153845, |
| "loss": 3.5013, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.1980313355466246, |
| "grad_norm": 0.33611050248146057, |
| "learning_rate": 0.0005379405594405594, |
| "loss": 3.5025, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.212592463160347, |
| "grad_norm": 0.32773202657699585, |
| "learning_rate": 0.0005377657342657342, |
| "loss": 3.5014, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.227153590774069, |
| "grad_norm": 0.31583476066589355, |
| "learning_rate": 0.000537590909090909, |
| "loss": 3.5047, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.241714718387792, |
| "grad_norm": 0.3295851945877075, |
| "learning_rate": 0.0005374160839160838, |
| "loss": 3.5062, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.241714718387792, |
| "eval_accuracy": 0.3603497299469346, |
| "eval_loss": 3.637709617614746, |
| "eval_runtime": 179.9952, |
| "eval_samples_per_second": 92.48, |
| "eval_steps_per_second": 5.783, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.256275846001515, |
| "grad_norm": 0.3549295961856842, |
| "learning_rate": 0.0005372412587412587, |
| "loss": 3.5129, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.2708369736152365, |
| "grad_norm": 0.3322116434574127, |
| "learning_rate": 0.0005370664335664335, |
| "loss": 3.5078, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.285398101228959, |
| "grad_norm": 0.32084742188453674, |
| "learning_rate": 0.0005368916083916083, |
| "loss": 3.5105, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.299959228842681, |
| "grad_norm": 0.3556179404258728, |
| "learning_rate": 0.0005367167832167832, |
| "loss": 3.5188, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.314520356456404, |
| "grad_norm": 0.3325696885585785, |
| "learning_rate": 0.000536541958041958, |
| "loss": 3.5168, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.329081484070127, |
| "grad_norm": 0.3196581304073334, |
| "learning_rate": 0.0005363671328671328, |
| "loss": 3.5097, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.3436426116838485, |
| "grad_norm": 0.34717246890068054, |
| "learning_rate": 0.0005361923076923076, |
| "loss": 3.5099, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.358203739297571, |
| "grad_norm": 0.31319311261177063, |
| "learning_rate": 0.0005360174825174825, |
| "loss": 3.5257, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.372764866911294, |
| "grad_norm": 0.33258256316185, |
| "learning_rate": 0.0005358426573426573, |
| "loss": 3.5074, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.387325994525016, |
| "grad_norm": 0.3166041970252991, |
| "learning_rate": 0.0005356678321678321, |
| "loss": 3.5196, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.401887122138739, |
| "grad_norm": 0.35271477699279785, |
| "learning_rate": 0.0005354930069930069, |
| "loss": 3.5226, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.41644824975246, |
| "grad_norm": 0.30517712235450745, |
| "learning_rate": 0.0005353181818181817, |
| "loss": 3.5138, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.431009377366183, |
| "grad_norm": 0.339862197637558, |
| "learning_rate": 0.0005351433566433565, |
| "loss": 3.5085, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.445570504979906, |
| "grad_norm": 0.31639939546585083, |
| "learning_rate": 0.0005349685314685314, |
| "loss": 3.5146, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.460131632593628, |
| "grad_norm": 0.3516615629196167, |
| "learning_rate": 0.0005347937062937062, |
| "loss": 3.5267, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.4746927602073505, |
| "grad_norm": 0.3237917423248291, |
| "learning_rate": 0.000534618881118881, |
| "loss": 3.531, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.489253887821073, |
| "grad_norm": 0.3265361189842224, |
| "learning_rate": 0.0005344440559440559, |
| "loss": 3.5167, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.503815015434795, |
| "grad_norm": 0.31917789578437805, |
| "learning_rate": 0.0005342692307692307, |
| "loss": 3.5211, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.518376143048518, |
| "grad_norm": 0.33544105291366577, |
| "learning_rate": 0.0005340944055944055, |
| "loss": 3.5309, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.53293727066224, |
| "grad_norm": 0.32001304626464844, |
| "learning_rate": 0.0005339195804195803, |
| "loss": 3.515, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.53293727066224, |
| "eval_accuracy": 0.3610780302996459, |
| "eval_loss": 3.6267178058624268, |
| "eval_runtime": 179.9232, |
| "eval_samples_per_second": 92.517, |
| "eval_steps_per_second": 5.786, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5474983982759625, |
| "grad_norm": 0.3181758522987366, |
| "learning_rate": 0.0005337447552447552, |
| "loss": 3.5152, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.562059525889685, |
| "grad_norm": 0.32306650280952454, |
| "learning_rate": 0.00053356993006993, |
| "loss": 3.5342, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.576620653503407, |
| "grad_norm": 0.3550286889076233, |
| "learning_rate": 0.0005333951048951048, |
| "loss": 3.5289, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.59118178111713, |
| "grad_norm": 0.31986021995544434, |
| "learning_rate": 0.0005332202797202796, |
| "loss": 3.5189, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.605742908730852, |
| "grad_norm": 0.30408525466918945, |
| "learning_rate": 0.0005330454545454546, |
| "loss": 3.5261, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.620304036344574, |
| "grad_norm": 0.33521342277526855, |
| "learning_rate": 0.0005328706293706292, |
| "loss": 3.5305, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.634865163958297, |
| "grad_norm": 0.3034505546092987, |
| "learning_rate": 0.0005326958041958042, |
| "loss": 3.5193, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.649426291572019, |
| "grad_norm": 0.3276582658290863, |
| "learning_rate": 0.000532520979020979, |
| "loss": 3.526, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.663987419185742, |
| "grad_norm": 0.3390166461467743, |
| "learning_rate": 0.0005323461538461538, |
| "loss": 3.5226, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.6785485467994645, |
| "grad_norm": 0.32178202271461487, |
| "learning_rate": 0.0005321713286713287, |
| "loss": 3.5262, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.693109674413186, |
| "grad_norm": 0.3271371126174927, |
| "learning_rate": 0.0005319965034965035, |
| "loss": 3.5252, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.707670802026909, |
| "grad_norm": 0.33951279520988464, |
| "learning_rate": 0.0005318216783216783, |
| "loss": 3.5334, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.722231929640631, |
| "grad_norm": 0.33040541410446167, |
| "learning_rate": 0.0005316468531468531, |
| "loss": 3.533, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.736793057254354, |
| "grad_norm": 0.32898440957069397, |
| "learning_rate": 0.000531472027972028, |
| "loss": 3.5343, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.7513541848680765, |
| "grad_norm": 0.31869906187057495, |
| "learning_rate": 0.0005312972027972028, |
| "loss": 3.5372, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.765915312481798, |
| "grad_norm": 0.3398876488208771, |
| "learning_rate": 0.0005311223776223776, |
| "loss": 3.5317, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.780476440095521, |
| "grad_norm": 0.32224592566490173, |
| "learning_rate": 0.0005309475524475524, |
| "loss": 3.5313, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.795037567709244, |
| "grad_norm": 0.3511403501033783, |
| "learning_rate": 0.0005307727272727273, |
| "loss": 3.5224, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.809598695322966, |
| "grad_norm": 0.32117030024528503, |
| "learning_rate": 0.0005305979020979021, |
| "loss": 3.5255, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.824159822936688, |
| "grad_norm": 0.32694876194000244, |
| "learning_rate": 0.0005304230769230769, |
| "loss": 3.5335, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.824159822936688, |
| "eval_accuracy": 0.36223637467014874, |
| "eval_loss": 3.612752676010132, |
| "eval_runtime": 179.9148, |
| "eval_samples_per_second": 92.522, |
| "eval_steps_per_second": 5.786, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.83872095055041, |
| "grad_norm": 0.3316369652748108, |
| "learning_rate": 0.0005302482517482517, |
| "loss": 3.539, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.853282078164133, |
| "grad_norm": 0.3608207106590271, |
| "learning_rate": 0.0005300734265734265, |
| "loss": 3.5315, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.867843205777856, |
| "grad_norm": 0.31140851974487305, |
| "learning_rate": 0.0005298986013986013, |
| "loss": 3.5275, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.882404333391578, |
| "grad_norm": 0.3333629071712494, |
| "learning_rate": 0.0005297237762237762, |
| "loss": 3.5288, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.8969654610053, |
| "grad_norm": 0.3193221986293793, |
| "learning_rate": 0.000529548951048951, |
| "loss": 3.5278, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.911526588619022, |
| "grad_norm": 0.32918399572372437, |
| "learning_rate": 0.0005293741258741258, |
| "loss": 3.5359, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.926087716232745, |
| "grad_norm": 0.33671337366104126, |
| "learning_rate": 0.0005291993006993007, |
| "loss": 3.5273, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.940648843846468, |
| "grad_norm": 0.32543742656707764, |
| "learning_rate": 0.0005290244755244755, |
| "loss": 3.529, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.95520997146019, |
| "grad_norm": 0.324459969997406, |
| "learning_rate": 0.0005288496503496503, |
| "loss": 3.5239, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.969771099073912, |
| "grad_norm": 0.32625219225883484, |
| "learning_rate": 0.0005286748251748251, |
| "loss": 3.5273, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.984332226687634, |
| "grad_norm": 0.3036121129989624, |
| "learning_rate": 0.0005285, |
| "loss": 3.5248, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.998893354301357, |
| "grad_norm": 0.31967318058013916, |
| "learning_rate": 0.0005283251748251748, |
| "loss": 3.5295, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.013396237404625, |
| "grad_norm": 0.3283931016921997, |
| "learning_rate": 0.0005281503496503496, |
| "loss": 3.4456, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.027957365018347, |
| "grad_norm": 0.33710625767707825, |
| "learning_rate": 0.0005279755244755244, |
| "loss": 3.4272, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.04251849263207, |
| "grad_norm": 0.32933324575424194, |
| "learning_rate": 0.0005278006993006993, |
| "loss": 3.417, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.0570796202457915, |
| "grad_norm": 0.37448322772979736, |
| "learning_rate": 0.000527625874125874, |
| "loss": 3.4348, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.071640747859514, |
| "grad_norm": 0.3174060881137848, |
| "learning_rate": 0.0005274510489510489, |
| "loss": 3.4384, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.086201875473237, |
| "grad_norm": 0.31601423025131226, |
| "learning_rate": 0.0005272762237762238, |
| "loss": 3.4409, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.100763003086959, |
| "grad_norm": 0.34834301471710205, |
| "learning_rate": 0.0005271013986013985, |
| "loss": 3.4406, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.115324130700682, |
| "grad_norm": 0.32480382919311523, |
| "learning_rate": 0.0005269265734265734, |
| "loss": 3.4428, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.115324130700682, |
| "eval_accuracy": 0.36245997874938307, |
| "eval_loss": 3.619140386581421, |
| "eval_runtime": 180.104, |
| "eval_samples_per_second": 92.424, |
| "eval_steps_per_second": 5.78, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.1298852583144035, |
| "grad_norm": 0.3219059407711029, |
| "learning_rate": 0.0005267517482517482, |
| "loss": 3.4521, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.144446385928126, |
| "grad_norm": 0.31983089447021484, |
| "learning_rate": 0.000526576923076923, |
| "loss": 3.436, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.159007513541849, |
| "grad_norm": 0.34388336539268494, |
| "learning_rate": 0.0005264020979020978, |
| "loss": 3.4463, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.173568641155571, |
| "grad_norm": 0.3420620858669281, |
| "learning_rate": 0.0005262272727272727, |
| "loss": 3.4594, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.1881297687692935, |
| "grad_norm": 0.3290681540966034, |
| "learning_rate": 0.0005260524475524475, |
| "loss": 3.4603, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.202690896383016, |
| "grad_norm": 0.32561996579170227, |
| "learning_rate": 0.0005258776223776223, |
| "loss": 3.4485, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.217252023996738, |
| "grad_norm": 0.3419070243835449, |
| "learning_rate": 0.0005257027972027971, |
| "loss": 3.4689, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.231813151610461, |
| "grad_norm": 0.369342178106308, |
| "learning_rate": 0.000525527972027972, |
| "loss": 3.451, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.246374279224183, |
| "grad_norm": 0.3240169584751129, |
| "learning_rate": 0.0005253531468531468, |
| "loss": 3.4607, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.2609354068379055, |
| "grad_norm": 0.3300141394138336, |
| "learning_rate": 0.0005251783216783216, |
| "loss": 3.4644, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.275496534451628, |
| "grad_norm": 0.3245398998260498, |
| "learning_rate": 0.0005250034965034965, |
| "loss": 3.4701, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.29005766206535, |
| "grad_norm": 0.355610728263855, |
| "learning_rate": 0.0005248286713286712, |
| "loss": 3.4734, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.304618789679073, |
| "grad_norm": 0.32918643951416016, |
| "learning_rate": 0.0005246538461538461, |
| "loss": 3.4535, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.319179917292795, |
| "grad_norm": 0.33903130888938904, |
| "learning_rate": 0.0005244790209790209, |
| "loss": 3.4653, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.3337410449065175, |
| "grad_norm": 0.32103845477104187, |
| "learning_rate": 0.0005243041958041957, |
| "loss": 3.4648, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.34830217252024, |
| "grad_norm": 0.3122301697731018, |
| "learning_rate": 0.0005241293706293705, |
| "loss": 3.4732, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.362863300133962, |
| "grad_norm": 0.3314891457557678, |
| "learning_rate": 0.0005239545454545454, |
| "loss": 3.4765, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.377424427747685, |
| "grad_norm": 0.31977224349975586, |
| "learning_rate": 0.0005237797202797202, |
| "loss": 3.4755, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.391985555361408, |
| "grad_norm": 0.3509627878665924, |
| "learning_rate": 0.000523604895104895, |
| "loss": 3.4806, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.406546682975129, |
| "grad_norm": 0.33360016345977783, |
| "learning_rate": 0.0005234300699300698, |
| "loss": 3.4722, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.406546682975129, |
| "eval_accuracy": 0.3629602076437797, |
| "eval_loss": 3.6122748851776123, |
| "eval_runtime": 180.0332, |
| "eval_samples_per_second": 92.461, |
| "eval_steps_per_second": 5.782, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.421107810588852, |
| "grad_norm": 0.3087630867958069, |
| "learning_rate": 0.0005232552447552447, |
| "loss": 3.4741, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.435668938202574, |
| "grad_norm": 0.3060018718242645, |
| "learning_rate": 0.0005230804195804195, |
| "loss": 3.4768, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.450230065816297, |
| "grad_norm": 0.32050588726997375, |
| "learning_rate": 0.0005229055944055943, |
| "loss": 3.485, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.4647911934300195, |
| "grad_norm": 0.3175103962421417, |
| "learning_rate": 0.0005227307692307691, |
| "loss": 3.4741, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.479352321043741, |
| "grad_norm": 0.34851333498954773, |
| "learning_rate": 0.0005225559440559441, |
| "loss": 3.4897, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.493913448657464, |
| "grad_norm": 0.3007611632347107, |
| "learning_rate": 0.0005223811188811189, |
| "loss": 3.4798, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.508474576271187, |
| "grad_norm": 0.3257494270801544, |
| "learning_rate": 0.0005222062937062937, |
| "loss": 3.4833, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.523035703884909, |
| "grad_norm": 0.35384172201156616, |
| "learning_rate": 0.0005220314685314686, |
| "loss": 3.4822, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.5375968314986315, |
| "grad_norm": 0.3386410176753998, |
| "learning_rate": 0.0005218566433566433, |
| "loss": 3.4715, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.552157959112353, |
| "grad_norm": 0.32020702958106995, |
| "learning_rate": 0.0005216818181818182, |
| "loss": 3.4915, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.566719086726076, |
| "grad_norm": 0.319762647151947, |
| "learning_rate": 0.000521506993006993, |
| "loss": 3.4796, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.581280214339799, |
| "grad_norm": 0.3387615382671356, |
| "learning_rate": 0.0005213321678321678, |
| "loss": 3.4837, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.595841341953521, |
| "grad_norm": 0.3180614113807678, |
| "learning_rate": 0.0005211573426573426, |
| "loss": 3.4879, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.610402469567243, |
| "grad_norm": 0.3316706418991089, |
| "learning_rate": 0.0005209825174825175, |
| "loss": 3.4904, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.624963597180965, |
| "grad_norm": 0.35205355286598206, |
| "learning_rate": 0.0005208076923076923, |
| "loss": 3.4846, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.639524724794688, |
| "grad_norm": 0.32036563754081726, |
| "learning_rate": 0.0005206328671328671, |
| "loss": 3.4926, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.654085852408411, |
| "grad_norm": 0.3236433267593384, |
| "learning_rate": 0.0005204580419580419, |
| "loss": 3.4891, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.668646980022133, |
| "grad_norm": 0.3206466734409332, |
| "learning_rate": 0.0005202832167832168, |
| "loss": 3.4853, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.683208107635855, |
| "grad_norm": 0.3160809576511383, |
| "learning_rate": 0.0005201083916083916, |
| "loss": 3.4888, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.697769235249577, |
| "grad_norm": 0.3378507196903229, |
| "learning_rate": 0.0005199335664335664, |
| "loss": 3.4907, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.697769235249577, |
| "eval_accuracy": 0.363873198852683, |
| "eval_loss": 3.5971081256866455, |
| "eval_runtime": 179.9566, |
| "eval_samples_per_second": 92.5, |
| "eval_steps_per_second": 5.785, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.7123303628633, |
| "grad_norm": 0.31848153471946716, |
| "learning_rate": 0.0005197587412587413, |
| "loss": 3.483, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.726891490477023, |
| "grad_norm": 0.35052964091300964, |
| "learning_rate": 0.0005195839160839161, |
| "loss": 3.4949, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.741452618090745, |
| "grad_norm": 0.32243213057518005, |
| "learning_rate": 0.0005194090909090909, |
| "loss": 3.4824, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.756013745704467, |
| "grad_norm": 0.3273182809352875, |
| "learning_rate": 0.0005192342657342657, |
| "loss": 3.481, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.77057487331819, |
| "grad_norm": 0.3279160261154175, |
| "learning_rate": 0.0005190594405594405, |
| "loss": 3.4835, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.785136000931912, |
| "grad_norm": 0.3148849904537201, |
| "learning_rate": 0.0005188846153846153, |
| "loss": 3.4935, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.799697128545635, |
| "grad_norm": 0.3306692838668823, |
| "learning_rate": 0.0005187097902097902, |
| "loss": 3.4932, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.814258256159357, |
| "grad_norm": 0.3366992473602295, |
| "learning_rate": 0.000518534965034965, |
| "loss": 3.4872, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.828819383773079, |
| "grad_norm": 0.3220052123069763, |
| "learning_rate": 0.0005183601398601398, |
| "loss": 3.4877, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.843380511386802, |
| "grad_norm": 0.3485269546508789, |
| "learning_rate": 0.0005181853146853146, |
| "loss": 3.4879, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.857941639000524, |
| "grad_norm": 0.3294737637042999, |
| "learning_rate": 0.0005180104895104895, |
| "loss": 3.4929, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.872502766614247, |
| "grad_norm": 0.35737600922584534, |
| "learning_rate": 0.0005178356643356643, |
| "loss": 3.4889, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.887063894227969, |
| "grad_norm": 0.3128018379211426, |
| "learning_rate": 0.0005176608391608391, |
| "loss": 3.4849, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.901625021841691, |
| "grad_norm": 0.3166596293449402, |
| "learning_rate": 0.000517486013986014, |
| "loss": 3.4948, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.916186149455414, |
| "grad_norm": 0.32783517241477966, |
| "learning_rate": 0.0005173111888111888, |
| "loss": 3.5017, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.930747277069136, |
| "grad_norm": 0.3422223925590515, |
| "learning_rate": 0.0005171363636363636, |
| "loss": 3.4959, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.945308404682859, |
| "grad_norm": 0.3121911585330963, |
| "learning_rate": 0.0005169615384615384, |
| "loss": 3.494, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.959869532296581, |
| "grad_norm": 0.3127148151397705, |
| "learning_rate": 0.0005167867132867133, |
| "loss": 3.4947, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.974430659910303, |
| "grad_norm": 0.304234117269516, |
| "learning_rate": 0.000516611888111888, |
| "loss": 3.499, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.988991787524026, |
| "grad_norm": 0.3382669985294342, |
| "learning_rate": 0.0005164370629370629, |
| "loss": 3.4887, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.988991787524026, |
| "eval_accuracy": 0.36493831607553445, |
| "eval_loss": 3.592543840408325, |
| "eval_runtime": 179.9167, |
| "eval_samples_per_second": 92.521, |
| "eval_steps_per_second": 5.786, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.003494670627293, |
| "grad_norm": 0.3160039484500885, |
| "learning_rate": 0.0005162622377622377, |
| "loss": 3.4561, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.018055798241016, |
| "grad_norm": 0.3464524745941162, |
| "learning_rate": 0.0005160874125874125, |
| "loss": 3.3886, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.032616925854738, |
| "grad_norm": 0.3348691761493683, |
| "learning_rate": 0.0005159125874125873, |
| "loss": 3.3939, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.0471780534684605, |
| "grad_norm": 0.3343693017959595, |
| "learning_rate": 0.0005157377622377622, |
| "loss": 3.3815, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.061739181082183, |
| "grad_norm": 0.3213493227958679, |
| "learning_rate": 0.000515562937062937, |
| "loss": 3.3906, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.076300308695905, |
| "grad_norm": 0.3274659812450409, |
| "learning_rate": 0.0005153881118881118, |
| "loss": 3.4057, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.090861436309628, |
| "grad_norm": 0.33152633905410767, |
| "learning_rate": 0.0005152132867132867, |
| "loss": 3.4067, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.105422563923351, |
| "grad_norm": 0.32923412322998047, |
| "learning_rate": 0.0005150384615384615, |
| "loss": 3.4087, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.1199836915370724, |
| "grad_norm": 0.3264068067073822, |
| "learning_rate": 0.0005148636363636363, |
| "loss": 3.4132, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.134544819150795, |
| "grad_norm": 0.34122031927108765, |
| "learning_rate": 0.0005146888111888111, |
| "loss": 3.4002, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.149105946764517, |
| "grad_norm": 0.35621559619903564, |
| "learning_rate": 0.000514513986013986, |
| "loss": 3.411, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.16366707437824, |
| "grad_norm": 0.33289653062820435, |
| "learning_rate": 0.0005143391608391608, |
| "loss": 3.4182, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.1782282019919625, |
| "grad_norm": 0.33276861906051636, |
| "learning_rate": 0.0005141643356643356, |
| "loss": 3.4182, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.192789329605684, |
| "grad_norm": 0.32877597212791443, |
| "learning_rate": 0.0005139895104895104, |
| "loss": 3.4264, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.207350457219407, |
| "grad_norm": 0.3156750202178955, |
| "learning_rate": 0.0005138146853146852, |
| "loss": 3.4171, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.22191158483313, |
| "grad_norm": 0.3044648766517639, |
| "learning_rate": 0.00051363986013986, |
| "loss": 3.4162, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.236472712446852, |
| "grad_norm": 0.3253194987773895, |
| "learning_rate": 0.0005134650349650349, |
| "loss": 3.415, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.2510338400605745, |
| "grad_norm": 0.32780349254608154, |
| "learning_rate": 0.0005132902097902097, |
| "loss": 3.4254, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.265594967674296, |
| "grad_norm": 0.35302308201789856, |
| "learning_rate": 0.0005131153846153845, |
| "loss": 3.4331, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.280156095288019, |
| "grad_norm": 0.3327321410179138, |
| "learning_rate": 0.0005129405594405594, |
| "loss": 3.4294, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.280156095288019, |
| "eval_accuracy": 0.3646837930305595, |
| "eval_loss": 3.599747657775879, |
| "eval_runtime": 179.9396, |
| "eval_samples_per_second": 92.509, |
| "eval_steps_per_second": 5.785, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.294717222901742, |
| "grad_norm": 0.331084668636322, |
| "learning_rate": 0.0005127657342657342, |
| "loss": 3.4319, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.309278350515464, |
| "grad_norm": 0.32319188117980957, |
| "learning_rate": 0.000512590909090909, |
| "loss": 3.4364, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.3238394781291865, |
| "grad_norm": 0.3713702857494354, |
| "learning_rate": 0.0005124160839160838, |
| "loss": 3.4428, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.338400605742908, |
| "grad_norm": 0.33959725499153137, |
| "learning_rate": 0.0005122412587412588, |
| "loss": 3.4306, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.352961733356631, |
| "grad_norm": 0.3171273469924927, |
| "learning_rate": 0.0005120664335664336, |
| "loss": 3.441, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.367522860970354, |
| "grad_norm": 0.34396296739578247, |
| "learning_rate": 0.0005118916083916084, |
| "loss": 3.4379, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.382083988584076, |
| "grad_norm": 0.30035629868507385, |
| "learning_rate": 0.0005117167832167832, |
| "loss": 3.4363, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.396645116197798, |
| "grad_norm": 0.3178955018520355, |
| "learning_rate": 0.0005115419580419581, |
| "loss": 3.4364, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.411206243811521, |
| "grad_norm": 0.32567980885505676, |
| "learning_rate": 0.0005113671328671328, |
| "loss": 3.4237, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.425767371425243, |
| "grad_norm": 0.33921509981155396, |
| "learning_rate": 0.0005111923076923077, |
| "loss": 3.4388, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.440328499038966, |
| "grad_norm": 0.3294987380504608, |
| "learning_rate": 0.0005110174825174825, |
| "loss": 3.435, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.454889626652688, |
| "grad_norm": 0.32871222496032715, |
| "learning_rate": 0.0005108426573426573, |
| "loss": 3.4482, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.46945075426641, |
| "grad_norm": 0.3521834909915924, |
| "learning_rate": 0.0005106678321678321, |
| "loss": 3.454, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.484011881880133, |
| "grad_norm": 0.32148364186286926, |
| "learning_rate": 0.000510493006993007, |
| "loss": 3.4348, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.498573009493855, |
| "grad_norm": 0.31990259885787964, |
| "learning_rate": 0.0005103181818181818, |
| "loss": 3.4522, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.513134137107578, |
| "grad_norm": 0.3114703297615051, |
| "learning_rate": 0.0005101433566433566, |
| "loss": 3.4493, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.5276952647213005, |
| "grad_norm": 0.33588969707489014, |
| "learning_rate": 0.0005099685314685315, |
| "loss": 3.457, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.542256392335022, |
| "grad_norm": 0.3361005485057831, |
| "learning_rate": 0.0005097937062937063, |
| "loss": 3.4541, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.556817519948745, |
| "grad_norm": 0.3297813832759857, |
| "learning_rate": 0.0005096188811188811, |
| "loss": 3.443, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.571378647562467, |
| "grad_norm": 0.32429084181785583, |
| "learning_rate": 0.0005094440559440559, |
| "loss": 3.4576, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.571378647562467, |
| "eval_accuracy": 0.36531392860610956, |
| "eval_loss": 3.58967924118042, |
| "eval_runtime": 179.9516, |
| "eval_samples_per_second": 92.503, |
| "eval_steps_per_second": 5.785, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.58593977517619, |
| "grad_norm": 0.34405285120010376, |
| "learning_rate": 0.0005092692307692308, |
| "loss": 3.456, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.600500902789912, |
| "grad_norm": 0.3523123264312744, |
| "learning_rate": 0.0005090944055944056, |
| "loss": 3.4455, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.615062030403634, |
| "grad_norm": 0.3292773365974426, |
| "learning_rate": 0.0005089195804195804, |
| "loss": 3.4511, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.629623158017357, |
| "grad_norm": 0.3290829062461853, |
| "learning_rate": 0.0005087447552447552, |
| "loss": 3.455, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.644184285631079, |
| "grad_norm": 0.3564629852771759, |
| "learning_rate": 0.00050856993006993, |
| "loss": 3.4563, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.658745413244802, |
| "grad_norm": 0.3048079311847687, |
| "learning_rate": 0.0005083951048951048, |
| "loss": 3.4511, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.673306540858524, |
| "grad_norm": 0.326558917760849, |
| "learning_rate": 0.0005082202797202797, |
| "loss": 3.4579, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.687867668472246, |
| "grad_norm": 0.3268931210041046, |
| "learning_rate": 0.0005080454545454545, |
| "loss": 3.4556, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.702428796085969, |
| "grad_norm": 0.3164925277233124, |
| "learning_rate": 0.0005078706293706293, |
| "loss": 3.4534, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.716989923699691, |
| "grad_norm": 0.3382319509983063, |
| "learning_rate": 0.0005076958041958042, |
| "loss": 3.4612, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.731551051313414, |
| "grad_norm": 0.3426370620727539, |
| "learning_rate": 0.000507520979020979, |
| "loss": 3.4465, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.746112178927136, |
| "grad_norm": 0.32341861724853516, |
| "learning_rate": 0.0005073461538461538, |
| "loss": 3.4557, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.760673306540858, |
| "grad_norm": 0.33356454968452454, |
| "learning_rate": 0.0005071713286713286, |
| "loss": 3.4535, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.775234434154581, |
| "grad_norm": 0.32658296823501587, |
| "learning_rate": 0.0005069965034965035, |
| "loss": 3.4629, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.789795561768304, |
| "grad_norm": 0.33484941720962524, |
| "learning_rate": 0.0005068216783216783, |
| "loss": 3.4448, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.8043566893820255, |
| "grad_norm": 0.3235480487346649, |
| "learning_rate": 0.0005066468531468531, |
| "loss": 3.4605, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.818917816995748, |
| "grad_norm": 0.31953296065330505, |
| "learning_rate": 0.0005064720279720279, |
| "loss": 3.4558, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.833478944609471, |
| "grad_norm": 0.310001939535141, |
| "learning_rate": 0.0005062972027972028, |
| "loss": 3.4607, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.848040072223193, |
| "grad_norm": 0.3460739254951477, |
| "learning_rate": 0.0005061223776223775, |
| "loss": 3.4531, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.862601199836916, |
| "grad_norm": 0.3318574130535126, |
| "learning_rate": 0.0005059475524475524, |
| "loss": 3.4537, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.862601199836916, |
| "eval_accuracy": 0.36604363971010945, |
| "eval_loss": 3.5796239376068115, |
| "eval_runtime": 180.0847, |
| "eval_samples_per_second": 92.434, |
| "eval_steps_per_second": 5.781, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.8771623274506375, |
| "grad_norm": 0.30648431181907654, |
| "learning_rate": 0.0005057727272727272, |
| "loss": 3.4657, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.89172345506436, |
| "grad_norm": 0.31725013256073, |
| "learning_rate": 0.000505597902097902, |
| "loss": 3.463, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.906284582678083, |
| "grad_norm": 0.30761897563934326, |
| "learning_rate": 0.0005054230769230769, |
| "loss": 3.4607, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.920845710291805, |
| "grad_norm": 0.32406291365623474, |
| "learning_rate": 0.0005052482517482517, |
| "loss": 3.4635, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.935406837905528, |
| "grad_norm": 0.30351096391677856, |
| "learning_rate": 0.0005050734265734265, |
| "loss": 3.4623, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.9499679655192494, |
| "grad_norm": 0.36170071363449097, |
| "learning_rate": 0.0005048986013986013, |
| "loss": 3.4596, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.964529093132972, |
| "grad_norm": 0.34269431233406067, |
| "learning_rate": 0.0005047237762237762, |
| "loss": 3.4578, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.979090220746695, |
| "grad_norm": 0.3124333322048187, |
| "learning_rate": 0.000504548951048951, |
| "loss": 3.4646, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.993651348360417, |
| "grad_norm": 0.3473075330257416, |
| "learning_rate": 0.0005043741258741258, |
| "loss": 3.4747, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.008154231463685, |
| "grad_norm": 0.33856120705604553, |
| "learning_rate": 0.0005041993006993006, |
| "loss": 3.3971, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.022715359077408, |
| "grad_norm": 0.3425942659378052, |
| "learning_rate": 0.0005040244755244755, |
| "loss": 3.3405, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.037276486691129, |
| "grad_norm": 0.3561685085296631, |
| "learning_rate": 0.0005038496503496503, |
| "loss": 3.3546, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.051837614304851, |
| "grad_norm": 0.311886727809906, |
| "learning_rate": 0.0005036748251748251, |
| "loss": 3.3646, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.066398741918574, |
| "grad_norm": 0.33370155096054077, |
| "learning_rate": 0.0005034999999999999, |
| "loss": 3.3717, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.080959869532297, |
| "grad_norm": 0.33389151096343994, |
| "learning_rate": 0.0005033251748251747, |
| "loss": 3.3739, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.09552099714602, |
| "grad_norm": 0.35343077778816223, |
| "learning_rate": 0.0005031503496503496, |
| "loss": 3.3642, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.11008212475974, |
| "grad_norm": 0.345520943403244, |
| "learning_rate": 0.0005029755244755244, |
| "loss": 3.3761, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.124643252373463, |
| "grad_norm": 0.3449995517730713, |
| "learning_rate": 0.0005028006993006992, |
| "loss": 3.3864, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.139204379987186, |
| "grad_norm": 0.33876168727874756, |
| "learning_rate": 0.000502625874125874, |
| "loss": 3.3809, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.153765507600909, |
| "grad_norm": 0.31593626737594604, |
| "learning_rate": 0.000502451048951049, |
| "loss": 3.3948, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.153765507600909, |
| "eval_accuracy": 0.3657942894198591, |
| "eval_loss": 3.5906715393066406, |
| "eval_runtime": 179.9302, |
| "eval_samples_per_second": 92.514, |
| "eval_steps_per_second": 5.786, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.168326635214632, |
| "grad_norm": 0.36130982637405396, |
| "learning_rate": 0.0005022762237762237, |
| "loss": 3.3784, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.182887762828354, |
| "grad_norm": 0.3143582046031952, |
| "learning_rate": 0.0005021013986013985, |
| "loss": 3.387, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.197448890442075, |
| "grad_norm": 0.3271821141242981, |
| "learning_rate": 0.0005019265734265733, |
| "loss": 3.3911, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.212010018055798, |
| "grad_norm": 0.35023316740989685, |
| "learning_rate": 0.0005017517482517483, |
| "loss": 3.3896, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.22657114566952, |
| "grad_norm": 0.33042287826538086, |
| "learning_rate": 0.0005015769230769231, |
| "loss": 3.3877, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.241132273283243, |
| "grad_norm": 0.3285730183124542, |
| "learning_rate": 0.0005014020979020979, |
| "loss": 3.3912, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.255693400896966, |
| "grad_norm": 0.3385370373725891, |
| "learning_rate": 0.0005012272727272727, |
| "loss": 3.3954, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.270254528510687, |
| "grad_norm": 0.32670000195503235, |
| "learning_rate": 0.0005010524475524476, |
| "loss": 3.3881, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.28481565612441, |
| "grad_norm": 0.31677448749542236, |
| "learning_rate": 0.0005008776223776223, |
| "loss": 3.4044, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.299376783738133, |
| "grad_norm": 0.31752389669418335, |
| "learning_rate": 0.0005007027972027972, |
| "loss": 3.3969, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.313937911351855, |
| "grad_norm": 0.33070990443229675, |
| "learning_rate": 0.000500527972027972, |
| "loss": 3.4068, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.328499038965578, |
| "grad_norm": 0.3155275881290436, |
| "learning_rate": 0.0005003531468531468, |
| "loss": 3.3974, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.3430601665793, |
| "grad_norm": 0.35018283128738403, |
| "learning_rate": 0.0005001783216783217, |
| "loss": 3.4101, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.357621294193022, |
| "grad_norm": 0.33324185013771057, |
| "learning_rate": 0.0005000034965034965, |
| "loss": 3.404, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.372182421806745, |
| "grad_norm": 0.3326118588447571, |
| "learning_rate": 0.0004998286713286713, |
| "loss": 3.4122, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.386743549420467, |
| "grad_norm": 0.334804892539978, |
| "learning_rate": 0.0004996538461538461, |
| "loss": 3.4134, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.40130467703419, |
| "grad_norm": 0.3175402581691742, |
| "learning_rate": 0.000499479020979021, |
| "loss": 3.4056, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.415865804647911, |
| "grad_norm": 0.33022984862327576, |
| "learning_rate": 0.0004993041958041958, |
| "loss": 3.4186, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.430426932261634, |
| "grad_norm": 0.32591524720191956, |
| "learning_rate": 0.0004991293706293706, |
| "loss": 3.413, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.444988059875357, |
| "grad_norm": 0.3372584879398346, |
| "learning_rate": 0.0004989545454545454, |
| "loss": 3.4066, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.444988059875357, |
| "eval_accuracy": 0.3666075875377053, |
| "eval_loss": 3.582721471786499, |
| "eval_runtime": 180.1244, |
| "eval_samples_per_second": 92.414, |
| "eval_steps_per_second": 5.779, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.45954918748908, |
| "grad_norm": 0.312492311000824, |
| "learning_rate": 0.0004987797202797203, |
| "loss": 3.4043, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.474110315102802, |
| "grad_norm": 0.3069634735584259, |
| "learning_rate": 0.0004986048951048951, |
| "loss": 3.4106, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.488671442716523, |
| "grad_norm": 0.3423140048980713, |
| "learning_rate": 0.0004984300699300699, |
| "loss": 3.4263, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.503232570330246, |
| "grad_norm": 0.3705313503742218, |
| "learning_rate": 0.0004982552447552448, |
| "loss": 3.422, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.517793697943969, |
| "grad_norm": 0.3448847532272339, |
| "learning_rate": 0.0004980804195804195, |
| "loss": 3.4247, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.532354825557691, |
| "grad_norm": 0.3401014804840088, |
| "learning_rate": 0.0004979055944055944, |
| "loss": 3.419, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.546915953171414, |
| "grad_norm": 0.37092649936676025, |
| "learning_rate": 0.0004977307692307692, |
| "loss": 3.4086, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.561477080785137, |
| "grad_norm": 0.3265298306941986, |
| "learning_rate": 0.000497555944055944, |
| "loss": 3.4265, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.576038208398858, |
| "grad_norm": 0.3344496190547943, |
| "learning_rate": 0.0004973811188811188, |
| "loss": 3.4203, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.59059933601258, |
| "grad_norm": 0.32499802112579346, |
| "learning_rate": 0.0004972062937062937, |
| "loss": 3.4212, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.605160463626303, |
| "grad_norm": 0.35400429368019104, |
| "learning_rate": 0.0004970314685314685, |
| "loss": 3.4141, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.619721591240026, |
| "grad_norm": 0.33860787749290466, |
| "learning_rate": 0.0004968566433566433, |
| "loss": 3.4319, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.634282718853749, |
| "grad_norm": 0.34178146719932556, |
| "learning_rate": 0.0004966818181818181, |
| "loss": 3.4165, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.64884384646747, |
| "grad_norm": 0.33078354597091675, |
| "learning_rate": 0.000496506993006993, |
| "loss": 3.4303, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.663404974081192, |
| "grad_norm": 0.3562380075454712, |
| "learning_rate": 0.0004963321678321678, |
| "loss": 3.4254, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.677966101694915, |
| "grad_norm": 0.33405157923698425, |
| "learning_rate": 0.0004961573426573426, |
| "loss": 3.4287, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.692527229308638, |
| "grad_norm": 0.3252856135368347, |
| "learning_rate": 0.0004959825174825175, |
| "loss": 3.4301, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.70708835692236, |
| "grad_norm": 0.33423078060150146, |
| "learning_rate": 0.0004958076923076923, |
| "loss": 3.4394, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.721649484536082, |
| "grad_norm": 0.3440803289413452, |
| "learning_rate": 0.0004956328671328671, |
| "loss": 3.4225, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.736210612149804, |
| "grad_norm": 0.3355342447757721, |
| "learning_rate": 0.0004954580419580419, |
| "loss": 3.4422, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.736210612149804, |
| "eval_accuracy": 0.3673996068236159, |
| "eval_loss": 3.5731475353240967, |
| "eval_runtime": 180.1693, |
| "eval_samples_per_second": 92.391, |
| "eval_steps_per_second": 5.778, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.750771739763527, |
| "grad_norm": 0.33982402086257935, |
| "learning_rate": 0.0004952832167832167, |
| "loss": 3.4286, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.76533286737725, |
| "grad_norm": 0.33567437529563904, |
| "learning_rate": 0.0004951083916083915, |
| "loss": 3.4282, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.779893994990973, |
| "grad_norm": 0.3140004277229309, |
| "learning_rate": 0.0004949335664335664, |
| "loss": 3.4373, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.794455122604695, |
| "grad_norm": 0.33392393589019775, |
| "learning_rate": 0.0004947587412587412, |
| "loss": 3.429, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.809016250218416, |
| "grad_norm": 0.3189204931259155, |
| "learning_rate": 0.000494583916083916, |
| "loss": 3.4201, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.82357737783214, |
| "grad_norm": 0.31288912892341614, |
| "learning_rate": 0.0004944090909090908, |
| "loss": 3.427, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.838138505445862, |
| "grad_norm": 0.33589813113212585, |
| "learning_rate": 0.0004942342657342657, |
| "loss": 3.4327, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.852699633059585, |
| "grad_norm": 0.34450462460517883, |
| "learning_rate": 0.0004940594405594405, |
| "loss": 3.4349, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.867260760673307, |
| "grad_norm": 0.34097838401794434, |
| "learning_rate": 0.0004938846153846153, |
| "loss": 3.431, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.881821888287028, |
| "grad_norm": 0.3667545020580292, |
| "learning_rate": 0.0004937097902097901, |
| "loss": 3.4459, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.896383015900751, |
| "grad_norm": 0.32320767641067505, |
| "learning_rate": 0.000493534965034965, |
| "loss": 3.4312, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.910944143514474, |
| "grad_norm": 0.3119420111179352, |
| "learning_rate": 0.0004933601398601398, |
| "loss": 3.426, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.925505271128197, |
| "grad_norm": 0.35779494047164917, |
| "learning_rate": 0.0004931853146853146, |
| "loss": 3.4295, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.94006639874192, |
| "grad_norm": 0.3556269705295563, |
| "learning_rate": 0.0004930104895104895, |
| "loss": 3.4477, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.95462752635564, |
| "grad_norm": 0.3329892158508301, |
| "learning_rate": 0.0004928356643356642, |
| "loss": 3.43, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.969188653969363, |
| "grad_norm": 0.35038620233535767, |
| "learning_rate": 0.0004926608391608391, |
| "loss": 3.4362, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.983749781583086, |
| "grad_norm": 0.31508538126945496, |
| "learning_rate": 0.0004924860139860139, |
| "loss": 3.4365, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.998310909196809, |
| "grad_norm": 0.30871662497520447, |
| "learning_rate": 0.0004923111888111887, |
| "loss": 3.4432, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.012813792300076, |
| "grad_norm": 0.34938541054725647, |
| "learning_rate": 0.0004921363636363635, |
| "loss": 3.3275, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.027374919913798, |
| "grad_norm": 0.3519437313079834, |
| "learning_rate": 0.0004919615384615384, |
| "loss": 3.3397, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.027374919913798, |
| "eval_accuracy": 0.3675191679953201, |
| "eval_loss": 3.5792672634124756, |
| "eval_runtime": 182.1642, |
| "eval_samples_per_second": 91.379, |
| "eval_steps_per_second": 5.715, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.041936047527521, |
| "grad_norm": 0.3590649366378784, |
| "learning_rate": 0.0004917867132867132, |
| "loss": 3.321, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.056497175141242, |
| "grad_norm": 0.35833194851875305, |
| "learning_rate": 0.000491611888111888, |
| "loss": 3.3403, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.071058302754965, |
| "grad_norm": 0.32010728120803833, |
| "learning_rate": 0.0004914370629370628, |
| "loss": 3.3411, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.085619430368688, |
| "grad_norm": 0.3303408920764923, |
| "learning_rate": 0.0004912622377622378, |
| "loss": 3.3471, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.10018055798241, |
| "grad_norm": 0.3390958309173584, |
| "learning_rate": 0.0004910874125874126, |
| "loss": 3.3446, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.114741685596133, |
| "grad_norm": 0.3788028359413147, |
| "learning_rate": 0.0004909125874125874, |
| "loss": 3.346, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.129302813209854, |
| "grad_norm": 0.34279879927635193, |
| "learning_rate": 0.0004907377622377623, |
| "loss": 3.3461, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.143863940823577, |
| "grad_norm": 0.3165462613105774, |
| "learning_rate": 0.0004905629370629371, |
| "loss": 3.3632, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.1584250684373, |
| "grad_norm": 0.3663565516471863, |
| "learning_rate": 0.0004903881118881119, |
| "loss": 3.3482, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.172986196051022, |
| "grad_norm": 0.3512609601020813, |
| "learning_rate": 0.0004902132867132867, |
| "loss": 3.3552, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.187547323664745, |
| "grad_norm": 0.35511720180511475, |
| "learning_rate": 0.0004900384615384615, |
| "loss": 3.3734, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.202108451278466, |
| "grad_norm": 0.3515777289867401, |
| "learning_rate": 0.0004898636363636363, |
| "loss": 3.3637, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.216669578892189, |
| "grad_norm": 0.38096532225608826, |
| "learning_rate": 0.0004896888111888112, |
| "loss": 3.3607, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.231230706505912, |
| "grad_norm": 0.3359524607658386, |
| "learning_rate": 0.000489513986013986, |
| "loss": 3.3676, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.245791834119634, |
| "grad_norm": 0.3147598206996918, |
| "learning_rate": 0.0004893391608391608, |
| "loss": 3.3719, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.260352961733357, |
| "grad_norm": 0.33265364170074463, |
| "learning_rate": 0.0004891643356643356, |
| "loss": 3.3647, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.27491408934708, |
| "grad_norm": 0.3475736081600189, |
| "learning_rate": 0.0004889895104895105, |
| "loss": 3.3814, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.2894752169608, |
| "grad_norm": 0.3379450738430023, |
| "learning_rate": 0.0004888146853146853, |
| "loss": 3.3719, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.304036344574524, |
| "grad_norm": 0.3429054319858551, |
| "learning_rate": 0.0004886398601398601, |
| "loss": 3.368, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.318597472188246, |
| "grad_norm": 0.36689573526382446, |
| "learning_rate": 0.000488465034965035, |
| "loss": 3.3756, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.318597472188246, |
| "eval_accuracy": 0.3671841145642906, |
| "eval_loss": 3.577822208404541, |
| "eval_runtime": 180.2267, |
| "eval_samples_per_second": 92.361, |
| "eval_steps_per_second": 5.776, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.333158599801969, |
| "grad_norm": 0.33105987310409546, |
| "learning_rate": 0.0004882902097902098, |
| "loss": 3.3843, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.347719727415692, |
| "grad_norm": 0.3265824019908905, |
| "learning_rate": 0.0004881153846153846, |
| "loss": 3.3865, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.362280855029413, |
| "grad_norm": 0.37859949469566345, |
| "learning_rate": 0.0004879405594405594, |
| "loss": 3.3717, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.376841982643136, |
| "grad_norm": 0.3668246865272522, |
| "learning_rate": 0.00048776573426573424, |
| "loss": 3.3762, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.391403110256858, |
| "grad_norm": 0.3174314796924591, |
| "learning_rate": 0.00048759090909090904, |
| "loss": 3.3872, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.405964237870581, |
| "grad_norm": 0.34313857555389404, |
| "learning_rate": 0.0004874160839160839, |
| "loss": 3.3886, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.420525365484304, |
| "grad_norm": 0.35954397916793823, |
| "learning_rate": 0.0004872412587412587, |
| "loss": 3.3928, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.435086493098025, |
| "grad_norm": 0.35825568437576294, |
| "learning_rate": 0.00048706643356643354, |
| "loss": 3.3814, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.449647620711747, |
| "grad_norm": 0.3450055420398712, |
| "learning_rate": 0.00048689160839160834, |
| "loss": 3.3919, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.46420874832547, |
| "grad_norm": 0.32993149757385254, |
| "learning_rate": 0.0004867167832167832, |
| "loss": 3.393, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.478769875939193, |
| "grad_norm": 0.3306158781051636, |
| "learning_rate": 0.00048654195804195794, |
| "loss": 3.3925, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.493331003552916, |
| "grad_norm": 0.3598116338253021, |
| "learning_rate": 0.00048636713286713285, |
| "loss": 3.3957, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.507892131166638, |
| "grad_norm": 0.3408449590206146, |
| "learning_rate": 0.0004861923076923077, |
| "loss": 3.3871, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.52245325878036, |
| "grad_norm": 0.3317490816116333, |
| "learning_rate": 0.00048601748251748245, |
| "loss": 3.3835, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.537014386394082, |
| "grad_norm": 0.3280400037765503, |
| "learning_rate": 0.0004858426573426573, |
| "loss": 3.3993, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.551575514007805, |
| "grad_norm": 0.33201780915260315, |
| "learning_rate": 0.0004856678321678321, |
| "loss": 3.4001, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.566136641621528, |
| "grad_norm": 0.3269020915031433, |
| "learning_rate": 0.00048549300699300696, |
| "loss": 3.4068, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.58069776923525, |
| "grad_norm": 0.31468507647514343, |
| "learning_rate": 0.00048531818181818176, |
| "loss": 3.4022, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.595258896848971, |
| "grad_norm": 0.32653799653053284, |
| "learning_rate": 0.0004851433566433566, |
| "loss": 3.3969, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.609820024462694, |
| "grad_norm": 0.36493203043937683, |
| "learning_rate": 0.0004849685314685314, |
| "loss": 3.3942, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.609820024462694, |
| "eval_accuracy": 0.36788220132690563, |
| "eval_loss": 3.569019317626953, |
| "eval_runtime": 179.8942, |
| "eval_samples_per_second": 92.532, |
| "eval_steps_per_second": 5.787, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.624381152076417, |
| "grad_norm": 0.3345858156681061, |
| "learning_rate": 0.00048479370629370627, |
| "loss": 3.4061, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.63894227969014, |
| "grad_norm": 0.38037094473838806, |
| "learning_rate": 0.00048461888111888106, |
| "loss": 3.4037, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.653503407303862, |
| "grad_norm": 0.3111010789871216, |
| "learning_rate": 0.0004844440559440559, |
| "loss": 3.3955, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.668064534917583, |
| "grad_norm": 0.35344934463500977, |
| "learning_rate": 0.0004842692307692307, |
| "loss": 3.3963, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.682625662531306, |
| "grad_norm": 0.3215346932411194, |
| "learning_rate": 0.00048409440559440557, |
| "loss": 3.3948, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.697186790145029, |
| "grad_norm": 0.36966708302497864, |
| "learning_rate": 0.0004839195804195803, |
| "loss": 3.3989, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.711747917758752, |
| "grad_norm": 0.3502499461174011, |
| "learning_rate": 0.0004837447552447552, |
| "loss": 3.4065, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.726309045372474, |
| "grad_norm": 0.3110143542289734, |
| "learning_rate": 0.0004835699300699301, |
| "loss": 3.4015, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.740870172986195, |
| "grad_norm": 0.3369152247905731, |
| "learning_rate": 0.0004833951048951048, |
| "loss": 3.407, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.755431300599918, |
| "grad_norm": 0.3321021497249603, |
| "learning_rate": 0.0004832202797202797, |
| "loss": 3.3996, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.76999242821364, |
| "grad_norm": 0.3521941602230072, |
| "learning_rate": 0.0004830454545454545, |
| "loss": 3.4016, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.784553555827364, |
| "grad_norm": 0.36469826102256775, |
| "learning_rate": 0.00048287062937062933, |
| "loss": 3.4082, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.799114683441086, |
| "grad_norm": 0.3464314937591553, |
| "learning_rate": 0.00048269580419580413, |
| "loss": 3.4125, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.813675811054807, |
| "grad_norm": 0.34648653864860535, |
| "learning_rate": 0.000482520979020979, |
| "loss": 3.4111, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.82823693866853, |
| "grad_norm": 0.31844958662986755, |
| "learning_rate": 0.0004823461538461538, |
| "loss": 3.4095, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.842798066282253, |
| "grad_norm": 0.341919481754303, |
| "learning_rate": 0.00048217132867132864, |
| "loss": 3.4021, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.857359193895975, |
| "grad_norm": 0.3782145380973816, |
| "learning_rate": 0.00048199650349650344, |
| "loss": 3.4122, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.871920321509698, |
| "grad_norm": 0.34616291522979736, |
| "learning_rate": 0.0004818216783216783, |
| "loss": 3.411, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.88648144912342, |
| "grad_norm": 0.3947765827178955, |
| "learning_rate": 0.0004816468531468531, |
| "loss": 3.4107, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.901042576737142, |
| "grad_norm": 0.3330315947532654, |
| "learning_rate": 0.00048147202797202795, |
| "loss": 3.412, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.901042576737142, |
| "eval_accuracy": 0.3685081046485901, |
| "eval_loss": 3.5606253147125244, |
| "eval_runtime": 181.06, |
| "eval_samples_per_second": 91.936, |
| "eval_steps_per_second": 5.749, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.915603704350865, |
| "grad_norm": 0.34899887442588806, |
| "learning_rate": 0.0004812972027972028, |
| "loss": 3.4039, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.930164831964587, |
| "grad_norm": 0.34163951873779297, |
| "learning_rate": 0.0004811223776223776, |
| "loss": 3.4078, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.94472595957831, |
| "grad_norm": 0.3454214334487915, |
| "learning_rate": 0.00048094755244755245, |
| "loss": 3.4091, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.959287087192033, |
| "grad_norm": 0.31334033608436584, |
| "learning_rate": 0.0004807727272727272, |
| "loss": 3.4132, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.973848214805754, |
| "grad_norm": 0.32300513982772827, |
| "learning_rate": 0.00048059790209790205, |
| "loss": 3.4173, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.988409342419477, |
| "grad_norm": 0.3137335181236267, |
| "learning_rate": 0.00048042307692307685, |
| "loss": 3.4163, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.002912225522744, |
| "grad_norm": 0.3487343192100525, |
| "learning_rate": 0.0004802482517482517, |
| "loss": 3.3853, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.017473353136467, |
| "grad_norm": 0.32562926411628723, |
| "learning_rate": 0.0004800734265734265, |
| "loss": 3.2863, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.03203448075019, |
| "grad_norm": 0.3302108645439148, |
| "learning_rate": 0.00047989860139860136, |
| "loss": 3.3014, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.046595608363912, |
| "grad_norm": 0.3371056616306305, |
| "learning_rate": 0.00047972377622377616, |
| "loss": 3.3122, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.061156735977635, |
| "grad_norm": 0.3660150468349457, |
| "learning_rate": 0.000479548951048951, |
| "loss": 3.3038, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.075717863591356, |
| "grad_norm": 0.3384579122066498, |
| "learning_rate": 0.0004793741258741258, |
| "loss": 3.3124, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.090278991205079, |
| "grad_norm": 0.34635335206985474, |
| "learning_rate": 0.00047919930069930067, |
| "loss": 3.3213, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.104840118818801, |
| "grad_norm": 0.340000182390213, |
| "learning_rate": 0.0004790244755244755, |
| "loss": 3.3186, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.119401246432524, |
| "grad_norm": 0.37846511602401733, |
| "learning_rate": 0.0004788496503496503, |
| "loss": 3.3359, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.133962374046247, |
| "grad_norm": 0.342523992061615, |
| "learning_rate": 0.0004786748251748252, |
| "loss": 3.3243, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.148523501659968, |
| "grad_norm": 0.377957284450531, |
| "learning_rate": 0.0004785, |
| "loss": 3.3363, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.16308462927369, |
| "grad_norm": 0.3483106791973114, |
| "learning_rate": 0.00047832517482517483, |
| "loss": 3.3395, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.177645756887413, |
| "grad_norm": 0.32820820808410645, |
| "learning_rate": 0.0004781503496503496, |
| "loss": 3.3285, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.192206884501136, |
| "grad_norm": 0.3250258266925812, |
| "learning_rate": 0.00047797552447552443, |
| "loss": 3.3372, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.192206884501136, |
| "eval_accuracy": 0.3681262612998239, |
| "eval_loss": 3.573594570159912, |
| "eval_runtime": 179.9972, |
| "eval_samples_per_second": 92.479, |
| "eval_steps_per_second": 5.783, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.206768012114859, |
| "grad_norm": 0.39293602108955383, |
| "learning_rate": 0.00047780069930069923, |
| "loss": 3.3373, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.221329139728581, |
| "grad_norm": 0.37482836842536926, |
| "learning_rate": 0.0004776258741258741, |
| "loss": 3.3499, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.235890267342302, |
| "grad_norm": 0.3627389073371887, |
| "learning_rate": 0.0004774510489510489, |
| "loss": 3.3476, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.250451394956025, |
| "grad_norm": 0.3211499750614166, |
| "learning_rate": 0.00047727622377622374, |
| "loss": 3.3421, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.265012522569748, |
| "grad_norm": 0.3497181236743927, |
| "learning_rate": 0.00047710139860139854, |
| "loss": 3.3506, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.27957365018347, |
| "grad_norm": 0.33958762884140015, |
| "learning_rate": 0.0004769265734265734, |
| "loss": 3.3526, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.294134777797193, |
| "grad_norm": 0.31388774514198303, |
| "learning_rate": 0.0004767517482517482, |
| "loss": 3.3638, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.308695905410914, |
| "grad_norm": 0.34293249249458313, |
| "learning_rate": 0.00047657692307692304, |
| "loss": 3.3474, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.323257033024637, |
| "grad_norm": 0.3444289267063141, |
| "learning_rate": 0.0004764020979020979, |
| "loss": 3.3606, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.33781816063836, |
| "grad_norm": 0.3576660454273224, |
| "learning_rate": 0.0004762272727272727, |
| "loss": 3.3696, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.352379288252083, |
| "grad_norm": 0.334441602230072, |
| "learning_rate": 0.00047605244755244755, |
| "loss": 3.3595, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.366940415865805, |
| "grad_norm": 0.3403428792953491, |
| "learning_rate": 0.00047587762237762235, |
| "loss": 3.3469, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.381501543479526, |
| "grad_norm": 0.32856133580207825, |
| "learning_rate": 0.0004757027972027972, |
| "loss": 3.358, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.396062671093249, |
| "grad_norm": 0.3596150875091553, |
| "learning_rate": 0.00047552797202797195, |
| "loss": 3.3608, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.410623798706972, |
| "grad_norm": 0.33391135931015015, |
| "learning_rate": 0.0004753531468531468, |
| "loss": 3.3602, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.425184926320695, |
| "grad_norm": 0.3472296893596649, |
| "learning_rate": 0.0004751783216783216, |
| "loss": 3.3669, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.439746053934417, |
| "grad_norm": 0.3308184742927551, |
| "learning_rate": 0.00047500349650349646, |
| "loss": 3.3665, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.454307181548138, |
| "grad_norm": 0.332075834274292, |
| "learning_rate": 0.00047482867132867126, |
| "loss": 3.3603, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.468868309161861, |
| "grad_norm": 0.368173211812973, |
| "learning_rate": 0.0004746538461538461, |
| "loss": 3.3632, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.483429436775584, |
| "grad_norm": 0.3679032027721405, |
| "learning_rate": 0.0004744790209790209, |
| "loss": 3.3716, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.483429436775584, |
| "eval_accuracy": 0.36859662929194625, |
| "eval_loss": 3.5674023628234863, |
| "eval_runtime": 180.0439, |
| "eval_samples_per_second": 92.455, |
| "eval_steps_per_second": 5.782, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.497990564389307, |
| "grad_norm": 0.3425818085670471, |
| "learning_rate": 0.00047430419580419576, |
| "loss": 3.3652, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.51255169200303, |
| "grad_norm": 0.3339255452156067, |
| "learning_rate": 0.0004741293706293706, |
| "loss": 3.3708, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.52711281961675, |
| "grad_norm": 0.33495861291885376, |
| "learning_rate": 0.0004739545454545454, |
| "loss": 3.3707, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.541673947230473, |
| "grad_norm": 0.3485237956047058, |
| "learning_rate": 0.00047377972027972027, |
| "loss": 3.3864, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.556235074844196, |
| "grad_norm": 0.3434908986091614, |
| "learning_rate": 0.00047360489510489507, |
| "loss": 3.3873, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.570796202457919, |
| "grad_norm": 0.3345359265804291, |
| "learning_rate": 0.0004734300699300699, |
| "loss": 3.361, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.585357330071641, |
| "grad_norm": 0.35131263732910156, |
| "learning_rate": 0.0004732552447552447, |
| "loss": 3.3731, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.599918457685362, |
| "grad_norm": 0.3793847858905792, |
| "learning_rate": 0.0004730804195804196, |
| "loss": 3.3689, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.614479585299085, |
| "grad_norm": 0.3411298096179962, |
| "learning_rate": 0.0004729055944055943, |
| "loss": 3.3782, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.629040712912808, |
| "grad_norm": 0.37617266178131104, |
| "learning_rate": 0.0004727307692307692, |
| "loss": 3.3711, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.64360184052653, |
| "grad_norm": 0.32526981830596924, |
| "learning_rate": 0.000472555944055944, |
| "loss": 3.3718, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.658162968140253, |
| "grad_norm": 0.30344685912132263, |
| "learning_rate": 0.00047238111888111883, |
| "loss": 3.3796, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.672724095753976, |
| "grad_norm": 0.3454796373844147, |
| "learning_rate": 0.00047220629370629363, |
| "loss": 3.3874, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.687285223367697, |
| "grad_norm": 0.3252149522304535, |
| "learning_rate": 0.0004720314685314685, |
| "loss": 3.3809, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.70184635098142, |
| "grad_norm": 0.3657645285129547, |
| "learning_rate": 0.0004718566433566433, |
| "loss": 3.391, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.716407478595142, |
| "grad_norm": 0.3494245409965515, |
| "learning_rate": 0.00047168181818181814, |
| "loss": 3.3812, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.730968606208865, |
| "grad_norm": 0.3405997157096863, |
| "learning_rate": 0.000471506993006993, |
| "loss": 3.3952, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.745529733822588, |
| "grad_norm": 0.32991763949394226, |
| "learning_rate": 0.0004713321678321678, |
| "loss": 3.3794, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.760090861436309, |
| "grad_norm": 0.327539324760437, |
| "learning_rate": 0.00047115734265734265, |
| "loss": 3.3939, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.774651989050032, |
| "grad_norm": 0.359293669462204, |
| "learning_rate": 0.00047098251748251745, |
| "loss": 3.3913, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.774651989050032, |
| "eval_accuracy": 0.3693203447029698, |
| "eval_loss": 3.5581202507019043, |
| "eval_runtime": 179.9534, |
| "eval_samples_per_second": 92.502, |
| "eval_steps_per_second": 5.785, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.789213116663754, |
| "grad_norm": 0.33391720056533813, |
| "learning_rate": 0.0004708076923076923, |
| "loss": 3.3913, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.803774244277477, |
| "grad_norm": 0.3419799208641052, |
| "learning_rate": 0.0004706328671328671, |
| "loss": 3.394, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.8183353718912, |
| "grad_norm": 0.33272436261177063, |
| "learning_rate": 0.00047045804195804195, |
| "loss": 3.3848, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.83289649950492, |
| "grad_norm": 0.3595719635486603, |
| "learning_rate": 0.0004702832167832167, |
| "loss": 3.3834, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.847457627118644, |
| "grad_norm": 0.34681352972984314, |
| "learning_rate": 0.00047010839160839155, |
| "loss": 3.3834, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.862018754732366, |
| "grad_norm": 0.3241344094276428, |
| "learning_rate": 0.00046993356643356635, |
| "loss": 3.3857, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.876579882346089, |
| "grad_norm": 0.39304202795028687, |
| "learning_rate": 0.0004697587412587412, |
| "loss": 3.3798, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.891141009959812, |
| "grad_norm": 0.3281121253967285, |
| "learning_rate": 0.000469583916083916, |
| "loss": 3.3913, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.905702137573535, |
| "grad_norm": 0.3333226144313812, |
| "learning_rate": 0.00046940909090909086, |
| "loss": 3.3931, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.920263265187256, |
| "grad_norm": 0.3577227294445038, |
| "learning_rate": 0.0004692342657342657, |
| "loss": 3.3859, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.934824392800978, |
| "grad_norm": 0.37690049409866333, |
| "learning_rate": 0.0004690594405594405, |
| "loss": 3.4, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.949385520414701, |
| "grad_norm": 0.3263942301273346, |
| "learning_rate": 0.00046888461538461537, |
| "loss": 3.3922, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.963946648028424, |
| "grad_norm": 0.34868118166923523, |
| "learning_rate": 0.00046870979020979017, |
| "loss": 3.3907, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.978507775642147, |
| "grad_norm": 0.3693598210811615, |
| "learning_rate": 0.000468534965034965, |
| "loss": 3.3969, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.993068903255867, |
| "grad_norm": 0.33616459369659424, |
| "learning_rate": 0.0004683601398601398, |
| "loss": 3.3917, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.007571786359136, |
| "grad_norm": 0.36423736810684204, |
| "learning_rate": 0.0004681853146853147, |
| "loss": 3.3301, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.022132913972857, |
| "grad_norm": 0.3345557451248169, |
| "learning_rate": 0.0004680104895104895, |
| "loss": 3.2934, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.03669404158658, |
| "grad_norm": 0.3328711986541748, |
| "learning_rate": 0.00046783566433566433, |
| "loss": 3.2877, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.051255169200303, |
| "grad_norm": 0.35611701011657715, |
| "learning_rate": 0.0004676608391608391, |
| "loss": 3.2932, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.065816296814026, |
| "grad_norm": 0.41214683651924133, |
| "learning_rate": 0.00046748601398601393, |
| "loss": 3.307, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.065816296814026, |
| "eval_accuracy": 0.3693281038350568, |
| "eval_loss": 3.5670840740203857, |
| "eval_runtime": 180.0481, |
| "eval_samples_per_second": 92.453, |
| "eval_steps_per_second": 5.782, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.080377424427748, |
| "grad_norm": 0.35001227259635925, |
| "learning_rate": 0.00046731118881118873, |
| "loss": 3.2886, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.09493855204147, |
| "grad_norm": 0.3704109489917755, |
| "learning_rate": 0.0004671363636363636, |
| "loss": 3.3065, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.109499679655192, |
| "grad_norm": 0.3530898094177246, |
| "learning_rate": 0.00046696153846153844, |
| "loss": 3.2973, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.124060807268915, |
| "grad_norm": 0.40628939867019653, |
| "learning_rate": 0.00046678671328671324, |
| "loss": 3.2947, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.138621934882638, |
| "grad_norm": 0.3742045760154724, |
| "learning_rate": 0.0004666118881118881, |
| "loss": 3.3097, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.15318306249636, |
| "grad_norm": 0.34569111466407776, |
| "learning_rate": 0.0004664370629370629, |
| "loss": 3.3157, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.167744190110081, |
| "grad_norm": 0.35447704792022705, |
| "learning_rate": 0.00046626223776223774, |
| "loss": 3.3011, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.182305317723804, |
| "grad_norm": 0.34649232029914856, |
| "learning_rate": 0.00046608741258741254, |
| "loss": 3.3254, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.196866445337527, |
| "grad_norm": 0.3378303647041321, |
| "learning_rate": 0.0004659125874125874, |
| "loss": 3.321, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.21142757295125, |
| "grad_norm": 0.35000166296958923, |
| "learning_rate": 0.0004657377622377622, |
| "loss": 3.3253, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.225988700564972, |
| "grad_norm": 0.3722868263721466, |
| "learning_rate": 0.00046556293706293705, |
| "loss": 3.3427, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.240549828178693, |
| "grad_norm": 0.35413050651550293, |
| "learning_rate": 0.00046538811188811185, |
| "loss": 3.3254, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.255110955792416, |
| "grad_norm": 0.34542974829673767, |
| "learning_rate": 0.0004652132867132867, |
| "loss": 3.3328, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.269672083406139, |
| "grad_norm": 0.33427903056144714, |
| "learning_rate": 0.00046503846153846145, |
| "loss": 3.3265, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.284233211019862, |
| "grad_norm": 0.38189569115638733, |
| "learning_rate": 0.0004648636363636363, |
| "loss": 3.3315, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.298794338633584, |
| "grad_norm": 0.36723101139068604, |
| "learning_rate": 0.0004646888111888111, |
| "loss": 3.34, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.313355466247307, |
| "grad_norm": 0.31660521030426025, |
| "learning_rate": 0.00046451398601398596, |
| "loss": 3.3279, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.327916593861028, |
| "grad_norm": 0.3575914204120636, |
| "learning_rate": 0.0004643391608391608, |
| "loss": 3.331, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.34247772147475, |
| "grad_norm": 0.32250308990478516, |
| "learning_rate": 0.0004641643356643356, |
| "loss": 3.3302, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.357038849088473, |
| "grad_norm": 0.34342795610427856, |
| "learning_rate": 0.00046398951048951046, |
| "loss": 3.3368, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.357038849088473, |
| "eval_accuracy": 0.36949939255400766, |
| "eval_loss": 3.5619232654571533, |
| "eval_runtime": 180.056, |
| "eval_samples_per_second": 92.449, |
| "eval_steps_per_second": 5.782, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.371599976702196, |
| "grad_norm": 0.3561156690120697, |
| "learning_rate": 0.00046381468531468526, |
| "loss": 3.335, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.386161104315919, |
| "grad_norm": 0.33757245540618896, |
| "learning_rate": 0.0004636398601398601, |
| "loss": 3.335, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.40072223192964, |
| "grad_norm": 0.3763889968395233, |
| "learning_rate": 0.0004634650349650349, |
| "loss": 3.3194, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.415283359543363, |
| "grad_norm": 0.36068034172058105, |
| "learning_rate": 0.00046329020979020977, |
| "loss": 3.3439, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.429844487157085, |
| "grad_norm": 0.3321940302848816, |
| "learning_rate": 0.00046311538461538457, |
| "loss": 3.3491, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.444405614770808, |
| "grad_norm": 0.35047703981399536, |
| "learning_rate": 0.0004629405594405594, |
| "loss": 3.3486, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.458966742384531, |
| "grad_norm": 0.3526565432548523, |
| "learning_rate": 0.0004627657342657342, |
| "loss": 3.3521, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.473527869998252, |
| "grad_norm": 0.3880484998226166, |
| "learning_rate": 0.0004625909090909091, |
| "loss": 3.3427, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.488088997611975, |
| "grad_norm": 0.32461223006248474, |
| "learning_rate": 0.0004624160839160838, |
| "loss": 3.3516, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.502650125225697, |
| "grad_norm": 0.33704331517219543, |
| "learning_rate": 0.0004622412587412587, |
| "loss": 3.3503, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.51721125283942, |
| "grad_norm": 0.35615167021751404, |
| "learning_rate": 0.00046206643356643353, |
| "loss": 3.357, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.531772380453143, |
| "grad_norm": 0.33560869097709656, |
| "learning_rate": 0.00046189160839160833, |
| "loss": 3.3519, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.546333508066864, |
| "grad_norm": 0.352498322725296, |
| "learning_rate": 0.0004617167832167832, |
| "loss": 3.3539, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.560894635680587, |
| "grad_norm": 0.3590864837169647, |
| "learning_rate": 0.000461541958041958, |
| "loss": 3.3621, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.57545576329431, |
| "grad_norm": 0.3814064562320709, |
| "learning_rate": 0.00046136713286713284, |
| "loss": 3.3601, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.590016890908032, |
| "grad_norm": 0.36403122544288635, |
| "learning_rate": 0.00046119230769230764, |
| "loss": 3.3568, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.604578018521755, |
| "grad_norm": 0.3343297243118286, |
| "learning_rate": 0.0004610174825174825, |
| "loss": 3.3541, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.619139146135478, |
| "grad_norm": 0.4333159327507019, |
| "learning_rate": 0.0004608426573426573, |
| "loss": 3.3586, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.633700273749199, |
| "grad_norm": 0.3147551715373993, |
| "learning_rate": 0.00046066783216783215, |
| "loss": 3.3557, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.648261401362921, |
| "grad_norm": 0.3541496992111206, |
| "learning_rate": 0.00046049300699300695, |
| "loss": 3.3631, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.648261401362921, |
| "eval_accuracy": 0.3697861277534044, |
| "eval_loss": 3.555816650390625, |
| "eval_runtime": 180.1897, |
| "eval_samples_per_second": 92.38, |
| "eval_steps_per_second": 5.777, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.662822528976644, |
| "grad_norm": 0.3832921087741852, |
| "learning_rate": 0.0004603181818181818, |
| "loss": 3.3628, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.677383656590367, |
| "grad_norm": 0.34815239906311035, |
| "learning_rate": 0.0004601433566433566, |
| "loss": 3.3639, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.69194478420409, |
| "grad_norm": 0.35218140482902527, |
| "learning_rate": 0.00045996853146853145, |
| "loss": 3.3652, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.70650591181781, |
| "grad_norm": 0.3666824698448181, |
| "learning_rate": 0.0004597937062937062, |
| "loss": 3.3647, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.721067039431533, |
| "grad_norm": 0.3536669909954071, |
| "learning_rate": 0.00045961888111888105, |
| "loss": 3.3739, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.735628167045256, |
| "grad_norm": 0.3629182279109955, |
| "learning_rate": 0.0004594440559440559, |
| "loss": 3.3653, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.750189294658979, |
| "grad_norm": 0.3446507453918457, |
| "learning_rate": 0.0004592692307692307, |
| "loss": 3.3658, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.764750422272702, |
| "grad_norm": 0.35143420100212097, |
| "learning_rate": 0.00045909440559440556, |
| "loss": 3.3779, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.779311549886422, |
| "grad_norm": 0.34043586254119873, |
| "learning_rate": 0.00045891958041958036, |
| "loss": 3.3545, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.793872677500145, |
| "grad_norm": 0.341762512922287, |
| "learning_rate": 0.0004587447552447552, |
| "loss": 3.3583, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.808433805113868, |
| "grad_norm": 0.3619687557220459, |
| "learning_rate": 0.00045856993006993, |
| "loss": 3.3629, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.82299493272759, |
| "grad_norm": 0.34980112314224243, |
| "learning_rate": 0.00045839510489510487, |
| "loss": 3.364, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.837556060341313, |
| "grad_norm": 0.3506343960762024, |
| "learning_rate": 0.00045822027972027967, |
| "loss": 3.3564, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.852117187955034, |
| "grad_norm": 0.3548135757446289, |
| "learning_rate": 0.0004580454545454545, |
| "loss": 3.3837, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.866678315568757, |
| "grad_norm": 0.3386966586112976, |
| "learning_rate": 0.0004578706293706293, |
| "loss": 3.3634, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.88123944318248, |
| "grad_norm": 0.3632442355155945, |
| "learning_rate": 0.0004576958041958042, |
| "loss": 3.3545, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.895800570796203, |
| "grad_norm": 0.3457760810852051, |
| "learning_rate": 0.000457520979020979, |
| "loss": 3.372, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.910361698409925, |
| "grad_norm": 0.3774667978286743, |
| "learning_rate": 0.00045734615384615383, |
| "loss": 3.377, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.924922826023646, |
| "grad_norm": 0.3599102199077606, |
| "learning_rate": 0.0004571713286713287, |
| "loss": 3.3598, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93948395363737, |
| "grad_norm": 0.35357990860939026, |
| "learning_rate": 0.00045699650349650343, |
| "loss": 3.3751, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93948395363737, |
| "eval_accuracy": 0.37036465334431523, |
| "eval_loss": 3.548415422439575, |
| "eval_runtime": 180.0425, |
| "eval_samples_per_second": 92.456, |
| "eval_steps_per_second": 5.782, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.954045081251092, |
| "grad_norm": 0.3612292408943176, |
| "learning_rate": 0.0004568216783216783, |
| "loss": 3.3754, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.968606208864815, |
| "grad_norm": 0.3789537250995636, |
| "learning_rate": 0.0004566468531468531, |
| "loss": 3.3636, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.983167336478537, |
| "grad_norm": 0.3364958167076111, |
| "learning_rate": 0.00045647202797202794, |
| "loss": 3.3729, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.99772846409226, |
| "grad_norm": 0.37100473046302795, |
| "learning_rate": 0.00045629720279720274, |
| "loss": 3.3771, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.012231347195527, |
| "grad_norm": 0.3618772029876709, |
| "learning_rate": 0.0004561223776223776, |
| "loss": 3.282, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.02679247480925, |
| "grad_norm": 0.4036857783794403, |
| "learning_rate": 0.0004559475524475524, |
| "loss": 3.2775, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.041353602422971, |
| "grad_norm": 0.36086776852607727, |
| "learning_rate": 0.00045577272727272724, |
| "loss": 3.2665, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.055914730036694, |
| "grad_norm": 0.36698007583618164, |
| "learning_rate": 0.00045559790209790204, |
| "loss": 3.2767, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.070475857650417, |
| "grad_norm": 0.36267364025115967, |
| "learning_rate": 0.0004554230769230769, |
| "loss": 3.268, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.08503698526414, |
| "grad_norm": 0.34971630573272705, |
| "learning_rate": 0.0004552482517482517, |
| "loss": 3.2801, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.099598112877862, |
| "grad_norm": 0.363832026720047, |
| "learning_rate": 0.00045507342657342655, |
| "loss": 3.284, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.114159240491583, |
| "grad_norm": 0.39120298624038696, |
| "learning_rate": 0.00045489860139860135, |
| "loss": 3.2931, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.128720368105306, |
| "grad_norm": 0.3486727476119995, |
| "learning_rate": 0.0004547237762237762, |
| "loss": 3.2846, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.143281495719028, |
| "grad_norm": 0.4164111614227295, |
| "learning_rate": 0.00045454895104895106, |
| "loss": 3.28, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.157842623332751, |
| "grad_norm": 0.3600572347640991, |
| "learning_rate": 0.0004543741258741258, |
| "loss": 3.2904, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.172403750946474, |
| "grad_norm": 0.36207276582717896, |
| "learning_rate": 0.00045419930069930066, |
| "loss": 3.2893, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.186964878560195, |
| "grad_norm": 0.33754491806030273, |
| "learning_rate": 0.00045402447552447546, |
| "loss": 3.2997, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.201526006173918, |
| "grad_norm": 0.3483685255050659, |
| "learning_rate": 0.0004538496503496503, |
| "loss": 3.3088, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.21608713378764, |
| "grad_norm": 0.3464941084384918, |
| "learning_rate": 0.0004536748251748251, |
| "loss": 3.309, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.230648261401363, |
| "grad_norm": 0.35594475269317627, |
| "learning_rate": 0.00045349999999999996, |
| "loss": 3.303, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.230648261401363, |
| "eval_accuracy": 0.36972064538109445, |
| "eval_loss": 3.5634684562683105, |
| "eval_runtime": 180.183, |
| "eval_samples_per_second": 92.384, |
| "eval_steps_per_second": 5.777, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.245209389015086, |
| "grad_norm": 0.3592406213283539, |
| "learning_rate": 0.00045332517482517476, |
| "loss": 3.3105, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.259770516628807, |
| "grad_norm": 0.36598271131515503, |
| "learning_rate": 0.0004531503496503496, |
| "loss": 3.3056, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.27433164424253, |
| "grad_norm": 0.35532549023628235, |
| "learning_rate": 0.0004529755244755244, |
| "loss": 3.3231, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.288892771856252, |
| "grad_norm": 0.35334861278533936, |
| "learning_rate": 0.00045280069930069927, |
| "loss": 3.3146, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.303453899469975, |
| "grad_norm": 0.35980355739593506, |
| "learning_rate": 0.00045262587412587407, |
| "loss": 3.3164, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.318015027083698, |
| "grad_norm": 0.34838441014289856, |
| "learning_rate": 0.0004524510489510489, |
| "loss": 3.3162, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.33257615469742, |
| "grad_norm": 0.3427560031414032, |
| "learning_rate": 0.0004522762237762238, |
| "loss": 3.3182, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.347137282311142, |
| "grad_norm": 0.3396528661251068, |
| "learning_rate": 0.0004521013986013986, |
| "loss": 3.3073, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.361698409924864, |
| "grad_norm": 0.3475770056247711, |
| "learning_rate": 0.00045192657342657343, |
| "loss": 3.3317, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.376259537538587, |
| "grad_norm": 0.362802118062973, |
| "learning_rate": 0.0004517517482517482, |
| "loss": 3.3121, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.39082066515231, |
| "grad_norm": 0.37369081377983093, |
| "learning_rate": 0.00045157692307692303, |
| "loss": 3.3281, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.405381792766033, |
| "grad_norm": 0.3561052083969116, |
| "learning_rate": 0.00045140209790209783, |
| "loss": 3.3152, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.419942920379754, |
| "grad_norm": 0.37191736698150635, |
| "learning_rate": 0.0004512272727272727, |
| "loss": 3.3133, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.434504047993476, |
| "grad_norm": 0.3548118770122528, |
| "learning_rate": 0.0004510524475524475, |
| "loss": 3.3277, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.449065175607199, |
| "grad_norm": 0.35695403814315796, |
| "learning_rate": 0.00045087762237762234, |
| "loss": 3.3261, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.463626303220922, |
| "grad_norm": 0.3630702793598175, |
| "learning_rate": 0.00045070279720279714, |
| "loss": 3.3222, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.478187430834645, |
| "grad_norm": 0.3870992660522461, |
| "learning_rate": 0.000450527972027972, |
| "loss": 3.3371, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.492748558448366, |
| "grad_norm": 0.35204771161079407, |
| "learning_rate": 0.0004503531468531468, |
| "loss": 3.3278, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.507309686062088, |
| "grad_norm": 0.37499335408210754, |
| "learning_rate": 0.00045017832167832165, |
| "loss": 3.3325, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.521870813675811, |
| "grad_norm": 0.354116827249527, |
| "learning_rate": 0.0004500034965034965, |
| "loss": 3.3382, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.521870813675811, |
| "eval_accuracy": 0.370408033946438, |
| "eval_loss": 3.5555574893951416, |
| "eval_runtime": 179.9813, |
| "eval_samples_per_second": 92.487, |
| "eval_steps_per_second": 5.784, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.536431941289534, |
| "grad_norm": 0.3633236885070801, |
| "learning_rate": 0.0004498286713286713, |
| "loss": 3.3301, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.550993068903256, |
| "grad_norm": 0.32828742265701294, |
| "learning_rate": 0.00044965384615384615, |
| "loss": 3.3394, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.565554196516977, |
| "grad_norm": 0.3750151991844177, |
| "learning_rate": 0.00044947902097902095, |
| "loss": 3.3416, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.5801153241307, |
| "grad_norm": 0.33759573101997375, |
| "learning_rate": 0.0004493041958041958, |
| "loss": 3.3351, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.594676451744423, |
| "grad_norm": 0.3236546516418457, |
| "learning_rate": 0.00044912937062937055, |
| "loss": 3.3266, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.609237579358146, |
| "grad_norm": 0.3640960156917572, |
| "learning_rate": 0.0004489545454545454, |
| "loss": 3.3463, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.623798706971868, |
| "grad_norm": 0.36271312832832336, |
| "learning_rate": 0.0004487797202797202, |
| "loss": 3.356, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.63835983458559, |
| "grad_norm": 0.3453165292739868, |
| "learning_rate": 0.00044860489510489506, |
| "loss": 3.335, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.652920962199312, |
| "grad_norm": 0.34455326199531555, |
| "learning_rate": 0.00044843006993006986, |
| "loss": 3.3423, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.667482089813035, |
| "grad_norm": 0.3475700616836548, |
| "learning_rate": 0.0004482552447552447, |
| "loss": 3.342, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.682043217426758, |
| "grad_norm": 0.34330064058303833, |
| "learning_rate": 0.0004480804195804195, |
| "loss": 3.3354, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.69660434504048, |
| "grad_norm": 0.3568446934223175, |
| "learning_rate": 0.00044790559440559437, |
| "loss": 3.3515, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.711165472654203, |
| "grad_norm": 0.3274061977863312, |
| "learning_rate": 0.00044773076923076917, |
| "loss": 3.3516, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.725726600267924, |
| "grad_norm": 0.33510422706604004, |
| "learning_rate": 0.000447555944055944, |
| "loss": 3.3373, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.740287727881647, |
| "grad_norm": 0.3329940438270569, |
| "learning_rate": 0.0004473811188811189, |
| "loss": 3.3375, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.75484885549537, |
| "grad_norm": 0.3572615087032318, |
| "learning_rate": 0.0004472062937062937, |
| "loss": 3.3476, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.769409983109092, |
| "grad_norm": 0.33176979422569275, |
| "learning_rate": 0.00044703146853146853, |
| "loss": 3.3381, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.783971110722815, |
| "grad_norm": 0.34815382957458496, |
| "learning_rate": 0.00044685664335664333, |
| "loss": 3.3555, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.798532238336536, |
| "grad_norm": 0.34616056084632874, |
| "learning_rate": 0.0004466818181818182, |
| "loss": 3.3469, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.813093365950259, |
| "grad_norm": 0.35044896602630615, |
| "learning_rate": 0.00044650699300699293, |
| "loss": 3.3602, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.813093365950259, |
| "eval_accuracy": 0.3704870360185965, |
| "eval_loss": 3.5500214099884033, |
| "eval_runtime": 179.959, |
| "eval_samples_per_second": 92.499, |
| "eval_steps_per_second": 5.785, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.827654493563982, |
| "grad_norm": 0.33117392659187317, |
| "learning_rate": 0.0004463321678321678, |
| "loss": 3.3572, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.842215621177704, |
| "grad_norm": 0.3368406891822815, |
| "learning_rate": 0.0004461573426573426, |
| "loss": 3.3564, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.856776748791427, |
| "grad_norm": 0.3537261188030243, |
| "learning_rate": 0.00044598251748251744, |
| "loss": 3.3551, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.871337876405148, |
| "grad_norm": 0.35541796684265137, |
| "learning_rate": 0.00044580769230769224, |
| "loss": 3.3549, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.88589900401887, |
| "grad_norm": 0.3518984913825989, |
| "learning_rate": 0.0004456328671328671, |
| "loss": 3.3613, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.900460131632594, |
| "grad_norm": 0.36018097400665283, |
| "learning_rate": 0.0004454580419580419, |
| "loss": 3.355, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.915021259246316, |
| "grad_norm": 0.35521966218948364, |
| "learning_rate": 0.00044528321678321674, |
| "loss": 3.3563, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.929582386860039, |
| "grad_norm": 0.33770403265953064, |
| "learning_rate": 0.0004451083916083916, |
| "loss": 3.3509, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.944143514473762, |
| "grad_norm": 0.34535059332847595, |
| "learning_rate": 0.0004449335664335664, |
| "loss": 3.3545, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.958704642087483, |
| "grad_norm": 0.3915956914424896, |
| "learning_rate": 0.00044475874125874125, |
| "loss": 3.3604, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.973265769701205, |
| "grad_norm": 0.3847052752971649, |
| "learning_rate": 0.00044458391608391605, |
| "loss": 3.3606, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.987826897314928, |
| "grad_norm": 0.36198189854621887, |
| "learning_rate": 0.0004444090909090909, |
| "loss": 3.36, |
| "step": 44600 |
| }, |
| { |
| "epoch": 13.002329780418195, |
| "grad_norm": 0.3606732487678528, |
| "learning_rate": 0.0004442342657342657, |
| "loss": 3.3451, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.016890908031918, |
| "grad_norm": 0.3492608666419983, |
| "learning_rate": 0.00044405944055944056, |
| "loss": 3.2394, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.031452035645641, |
| "grad_norm": 0.3639189600944519, |
| "learning_rate": 0.0004438846153846153, |
| "loss": 3.252, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.046013163259364, |
| "grad_norm": 0.3617708086967468, |
| "learning_rate": 0.00044370979020979016, |
| "loss": 3.2626, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.060574290873085, |
| "grad_norm": 0.3884221911430359, |
| "learning_rate": 0.00044353496503496496, |
| "loss": 3.2627, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.075135418486807, |
| "grad_norm": 0.35316526889801025, |
| "learning_rate": 0.0004433601398601398, |
| "loss": 3.2553, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.08969654610053, |
| "grad_norm": 0.34747469425201416, |
| "learning_rate": 0.0004431853146853146, |
| "loss": 3.274, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.104257673714253, |
| "grad_norm": 0.37390294671058655, |
| "learning_rate": 0.00044301048951048946, |
| "loss": 3.2629, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.104257673714253, |
| "eval_accuracy": 0.3701607997831205, |
| "eval_loss": 3.5621118545532227, |
| "eval_runtime": 179.9089, |
| "eval_samples_per_second": 92.525, |
| "eval_steps_per_second": 5.786, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.118818801327976, |
| "grad_norm": 0.35391858220100403, |
| "learning_rate": 0.00044283566433566426, |
| "loss": 3.2667, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.133379928941697, |
| "grad_norm": 0.3924326002597809, |
| "learning_rate": 0.0004426608391608391, |
| "loss": 3.2647, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.14794105655542, |
| "grad_norm": 0.35613200068473816, |
| "learning_rate": 0.00044248601398601397, |
| "loss": 3.287, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.162502184169142, |
| "grad_norm": 0.35485580563545227, |
| "learning_rate": 0.00044231118881118877, |
| "loss": 3.2745, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.177063311782865, |
| "grad_norm": 0.3689339756965637, |
| "learning_rate": 0.0004421363636363636, |
| "loss": 3.2729, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.191624439396588, |
| "grad_norm": 0.3808785080909729, |
| "learning_rate": 0.0004419615384615384, |
| "loss": 3.27, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.206185567010309, |
| "grad_norm": 0.33401966094970703, |
| "learning_rate": 0.0004417867132867133, |
| "loss": 3.2889, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.220746694624031, |
| "grad_norm": 0.3462887406349182, |
| "learning_rate": 0.0004416118881118881, |
| "loss": 3.2894, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.235307822237754, |
| "grad_norm": 0.36844760179519653, |
| "learning_rate": 0.00044143706293706293, |
| "loss": 3.2837, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.249868949851477, |
| "grad_norm": 0.3560846149921417, |
| "learning_rate": 0.0004412622377622377, |
| "loss": 3.2865, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.2644300774652, |
| "grad_norm": 0.3669024407863617, |
| "learning_rate": 0.00044108741258741253, |
| "loss": 3.2847, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.27899120507892, |
| "grad_norm": 0.367481529712677, |
| "learning_rate": 0.00044091258741258733, |
| "loss": 3.3048, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.293552332692643, |
| "grad_norm": 0.34471479058265686, |
| "learning_rate": 0.0004407377622377622, |
| "loss": 3.3218, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.308113460306366, |
| "grad_norm": 0.34463047981262207, |
| "learning_rate": 0.000440562937062937, |
| "loss": 3.3007, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.322674587920089, |
| "grad_norm": 0.3587283492088318, |
| "learning_rate": 0.00044038811188811184, |
| "loss": 3.303, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.337235715533811, |
| "grad_norm": 0.35901951789855957, |
| "learning_rate": 0.0004402132867132867, |
| "loss": 3.3063, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.351796843147532, |
| "grad_norm": 0.36252450942993164, |
| "learning_rate": 0.0004400384615384615, |
| "loss": 3.2963, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.366357970761255, |
| "grad_norm": 0.35850846767425537, |
| "learning_rate": 0.00043986363636363635, |
| "loss": 3.3059, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.380919098374978, |
| "grad_norm": 0.35532084107398987, |
| "learning_rate": 0.00043968881118881115, |
| "loss": 3.3093, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.3954802259887, |
| "grad_norm": 0.3536911904811859, |
| "learning_rate": 0.000439513986013986, |
| "loss": 3.3151, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.3954802259887, |
| "eval_accuracy": 0.37079269879778126, |
| "eval_loss": 3.5561933517456055, |
| "eval_runtime": 179.8711, |
| "eval_samples_per_second": 92.544, |
| "eval_steps_per_second": 5.787, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.410041353602423, |
| "grad_norm": 0.3350816071033478, |
| "learning_rate": 0.0004393391608391608, |
| "loss": 3.3044, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.424602481216146, |
| "grad_norm": 0.3656558394432068, |
| "learning_rate": 0.00043916433566433565, |
| "loss": 3.3146, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.439163608829867, |
| "grad_norm": 0.35285845398902893, |
| "learning_rate": 0.00043898951048951045, |
| "loss": 3.3154, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.45372473644359, |
| "grad_norm": 0.3440799415111542, |
| "learning_rate": 0.0004388146853146853, |
| "loss": 3.3168, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.468285864057313, |
| "grad_norm": 0.3636631667613983, |
| "learning_rate": 0.00043863986013986005, |
| "loss": 3.3105, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.482846991671035, |
| "grad_norm": 0.3539026379585266, |
| "learning_rate": 0.0004384650349650349, |
| "loss": 3.3087, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.497408119284758, |
| "grad_norm": 0.3460793197154999, |
| "learning_rate": 0.0004382902097902097, |
| "loss": 3.3097, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.51196924689848, |
| "grad_norm": 0.38852551579475403, |
| "learning_rate": 0.00043811538461538456, |
| "loss": 3.3115, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.526530374512202, |
| "grad_norm": 0.3664706349372864, |
| "learning_rate": 0.0004379405594405594, |
| "loss": 3.309, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.541091502125925, |
| "grad_norm": 0.3525027334690094, |
| "learning_rate": 0.0004377657342657342, |
| "loss": 3.3313, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.555652629739647, |
| "grad_norm": 0.3686428368091583, |
| "learning_rate": 0.00043759090909090907, |
| "loss": 3.3196, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.57021375735337, |
| "grad_norm": 0.383261114358902, |
| "learning_rate": 0.00043741608391608387, |
| "loss": 3.3254, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.584774884967091, |
| "grad_norm": 0.34970635175704956, |
| "learning_rate": 0.0004372412587412587, |
| "loss": 3.3304, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.599336012580814, |
| "grad_norm": 0.356056809425354, |
| "learning_rate": 0.0004370664335664335, |
| "loss": 3.3248, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.613897140194537, |
| "grad_norm": 0.36876165866851807, |
| "learning_rate": 0.0004368916083916084, |
| "loss": 3.3233, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.62845826780826, |
| "grad_norm": 0.3871111273765564, |
| "learning_rate": 0.0004367167832167832, |
| "loss": 3.3242, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.643019395421982, |
| "grad_norm": 0.40034401416778564, |
| "learning_rate": 0.00043654195804195803, |
| "loss": 3.3381, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.657580523035705, |
| "grad_norm": 0.3296981751918793, |
| "learning_rate": 0.00043636713286713283, |
| "loss": 3.3253, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.672141650649426, |
| "grad_norm": 0.35436248779296875, |
| "learning_rate": 0.0004361923076923077, |
| "loss": 3.3118, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.686702778263149, |
| "grad_norm": 0.380553662776947, |
| "learning_rate": 0.00043601748251748243, |
| "loss": 3.3245, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.686702778263149, |
| "eval_accuracy": 0.3712443743353304, |
| "eval_loss": 3.5467824935913086, |
| "eval_runtime": 179.9731, |
| "eval_samples_per_second": 92.492, |
| "eval_steps_per_second": 5.784, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.701263905876871, |
| "grad_norm": 0.34484803676605225, |
| "learning_rate": 0.00043584265734265734, |
| "loss": 3.3273, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.715825033490594, |
| "grad_norm": 0.3786901831626892, |
| "learning_rate": 0.0004356678321678321, |
| "loss": 3.3224, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.730386161104317, |
| "grad_norm": 0.3483482599258423, |
| "learning_rate": 0.00043549300699300694, |
| "loss": 3.3478, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.744947288718038, |
| "grad_norm": 0.3696334660053253, |
| "learning_rate": 0.0004353181818181818, |
| "loss": 3.3318, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.75950841633176, |
| "grad_norm": 0.38778334856033325, |
| "learning_rate": 0.0004351433566433566, |
| "loss": 3.3319, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.774069543945483, |
| "grad_norm": 0.3464931845664978, |
| "learning_rate": 0.00043496853146853144, |
| "loss": 3.3359, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.788630671559206, |
| "grad_norm": 0.3816988468170166, |
| "learning_rate": 0.00043479370629370624, |
| "loss": 3.3358, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.803191799172929, |
| "grad_norm": 0.34828609228134155, |
| "learning_rate": 0.0004346188811188811, |
| "loss": 3.3362, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.81775292678665, |
| "grad_norm": 0.35228267312049866, |
| "learning_rate": 0.0004344440559440559, |
| "loss": 3.3405, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.832314054400372, |
| "grad_norm": 0.3658430874347687, |
| "learning_rate": 0.00043426923076923075, |
| "loss": 3.3311, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.846875182014095, |
| "grad_norm": 0.3630894422531128, |
| "learning_rate": 0.00043409440559440555, |
| "loss": 3.3404, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.861436309627818, |
| "grad_norm": 0.3596256971359253, |
| "learning_rate": 0.0004339195804195804, |
| "loss": 3.332, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.87599743724154, |
| "grad_norm": 0.3426588177680969, |
| "learning_rate": 0.0004337447552447552, |
| "loss": 3.3357, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.890558564855262, |
| "grad_norm": 0.3674740791320801, |
| "learning_rate": 0.00043356993006993006, |
| "loss": 3.3371, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.905119692468984, |
| "grad_norm": 0.36069542169570923, |
| "learning_rate": 0.0004333951048951048, |
| "loss": 3.3486, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.919680820082707, |
| "grad_norm": 0.375957727432251, |
| "learning_rate": 0.0004332202797202797, |
| "loss": 3.341, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.93424194769643, |
| "grad_norm": 0.3572161793708801, |
| "learning_rate": 0.00043304545454545456, |
| "loss": 3.3271, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.948803075310153, |
| "grad_norm": 0.36845171451568604, |
| "learning_rate": 0.0004328706293706293, |
| "loss": 3.3298, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.963364202923874, |
| "grad_norm": 0.3810923397541046, |
| "learning_rate": 0.00043269580419580416, |
| "loss": 3.3447, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.977925330537596, |
| "grad_norm": 0.3576909601688385, |
| "learning_rate": 0.00043252097902097896, |
| "loss": 3.3368, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.977925330537596, |
| "eval_accuracy": 0.37160482128955363, |
| "eval_loss": 3.541656970977783, |
| "eval_runtime": 180.2138, |
| "eval_samples_per_second": 92.368, |
| "eval_steps_per_second": 5.776, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.992486458151319, |
| "grad_norm": 0.35949257016181946, |
| "learning_rate": 0.0004323461538461538, |
| "loss": 3.3326, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.006989341254586, |
| "grad_norm": 0.35736098885536194, |
| "learning_rate": 0.0004321713286713286, |
| "loss": 3.2806, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.021550468868309, |
| "grad_norm": 0.35906386375427246, |
| "learning_rate": 0.00043199650349650347, |
| "loss": 3.2416, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.036111596482032, |
| "grad_norm": 0.3441833555698395, |
| "learning_rate": 0.00043182167832167827, |
| "loss": 3.2384, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.050672724095755, |
| "grad_norm": 0.3421272933483124, |
| "learning_rate": 0.0004316468531468531, |
| "loss": 3.2419, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.065233851709475, |
| "grad_norm": 0.35532912611961365, |
| "learning_rate": 0.0004314720279720279, |
| "loss": 3.2461, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.079794979323198, |
| "grad_norm": 0.3772093951702118, |
| "learning_rate": 0.0004312972027972028, |
| "loss": 3.2402, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.094356106936921, |
| "grad_norm": 0.3355690836906433, |
| "learning_rate": 0.0004311223776223776, |
| "loss": 3.2531, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.108917234550644, |
| "grad_norm": 0.36676159501075745, |
| "learning_rate": 0.00043094755244755243, |
| "loss": 3.2522, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.123478362164366, |
| "grad_norm": 0.33616307377815247, |
| "learning_rate": 0.0004307727272727272, |
| "loss": 3.2605, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.13803948977809, |
| "grad_norm": 0.3770199716091156, |
| "learning_rate": 0.0004305979020979021, |
| "loss": 3.2598, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.15260061739181, |
| "grad_norm": 0.3645670711994171, |
| "learning_rate": 0.00043042307692307694, |
| "loss": 3.2451, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.167161745005533, |
| "grad_norm": 0.3539714515209198, |
| "learning_rate": 0.0004302482517482517, |
| "loss": 3.2714, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.181722872619256, |
| "grad_norm": 0.3534782826900482, |
| "learning_rate": 0.00043007342657342654, |
| "loss": 3.2721, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.196284000232978, |
| "grad_norm": 0.3444008231163025, |
| "learning_rate": 0.00042989860139860134, |
| "loss": 3.2671, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.210845127846701, |
| "grad_norm": 0.37874382734298706, |
| "learning_rate": 0.0004297237762237762, |
| "loss": 3.2699, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.225406255460422, |
| "grad_norm": 0.35406023263931274, |
| "learning_rate": 0.000429548951048951, |
| "loss": 3.2709, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.239967383074145, |
| "grad_norm": 0.3657627999782562, |
| "learning_rate": 0.00042937412587412585, |
| "loss": 3.2663, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.254528510687868, |
| "grad_norm": 0.35727784037590027, |
| "learning_rate": 0.00042919930069930065, |
| "loss": 3.2689, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.26908963830159, |
| "grad_norm": 0.3745940029621124, |
| "learning_rate": 0.0004290244755244755, |
| "loss": 3.2765, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.26908963830159, |
| "eval_accuracy": 0.37117383677090315, |
| "eval_loss": 3.5587430000305176, |
| "eval_runtime": 180.1241, |
| "eval_samples_per_second": 92.414, |
| "eval_steps_per_second": 5.779, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.283650765915313, |
| "grad_norm": 0.37137308716773987, |
| "learning_rate": 0.0004288496503496503, |
| "loss": 3.2887, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.298211893529034, |
| "grad_norm": 0.36955568194389343, |
| "learning_rate": 0.00042867482517482515, |
| "loss": 3.2872, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.312773021142757, |
| "grad_norm": 0.35592296719551086, |
| "learning_rate": 0.00042849999999999995, |
| "loss": 3.2817, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.32733414875648, |
| "grad_norm": 0.38243964314460754, |
| "learning_rate": 0.0004283251748251748, |
| "loss": 3.2665, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.341895276370202, |
| "grad_norm": 0.4379175901412964, |
| "learning_rate": 0.00042815034965034966, |
| "loss": 3.2826, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.356456403983925, |
| "grad_norm": 0.3918575048446655, |
| "learning_rate": 0.00042797552447552446, |
| "loss": 3.2772, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.371017531597648, |
| "grad_norm": 0.3655647337436676, |
| "learning_rate": 0.0004278006993006993, |
| "loss": 3.3001, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.385578659211369, |
| "grad_norm": 0.3602593243122101, |
| "learning_rate": 0.00042762587412587406, |
| "loss": 3.2914, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.400139786825092, |
| "grad_norm": 0.3734046220779419, |
| "learning_rate": 0.0004274510489510489, |
| "loss": 3.2921, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.414700914438814, |
| "grad_norm": 0.36270594596862793, |
| "learning_rate": 0.0004272762237762237, |
| "loss": 3.2896, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.429262042052537, |
| "grad_norm": 0.34769439697265625, |
| "learning_rate": 0.00042710139860139857, |
| "loss": 3.2928, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.44382316966626, |
| "grad_norm": 0.3424946665763855, |
| "learning_rate": 0.00042692657342657337, |
| "loss": 3.2828, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.45838429727998, |
| "grad_norm": 0.34357205033302307, |
| "learning_rate": 0.0004267517482517482, |
| "loss": 3.3029, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.472945424893704, |
| "grad_norm": 0.36078691482543945, |
| "learning_rate": 0.000426576923076923, |
| "loss": 3.2983, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.487506552507426, |
| "grad_norm": 0.3904469907283783, |
| "learning_rate": 0.0004264020979020979, |
| "loss": 3.3086, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.502067680121149, |
| "grad_norm": 0.35750025510787964, |
| "learning_rate": 0.0004262272727272727, |
| "loss": 3.3063, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.516628807734872, |
| "grad_norm": 0.35314109921455383, |
| "learning_rate": 0.00042605244755244753, |
| "loss": 3.2972, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.531189935348593, |
| "grad_norm": 0.39195117354393005, |
| "learning_rate": 0.00042587762237762233, |
| "loss": 3.3063, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.545751062962315, |
| "grad_norm": 0.3887055218219757, |
| "learning_rate": 0.0004257027972027972, |
| "loss": 3.3148, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.560312190576038, |
| "grad_norm": 0.3657190799713135, |
| "learning_rate": 0.00042552797202797204, |
| "loss": 3.3005, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.560312190576038, |
| "eval_accuracy": 0.3711953507280535, |
| "eval_loss": 3.5526821613311768, |
| "eval_runtime": 180.0112, |
| "eval_samples_per_second": 92.472, |
| "eval_steps_per_second": 5.783, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.574873318189761, |
| "grad_norm": 0.3578939139842987, |
| "learning_rate": 0.00042535314685314684, |
| "loss": 3.3041, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.589434445803484, |
| "grad_norm": 0.36270761489868164, |
| "learning_rate": 0.0004251783216783217, |
| "loss": 3.312, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.603995573417205, |
| "grad_norm": 0.3790474534034729, |
| "learning_rate": 0.00042500349650349643, |
| "loss": 3.3083, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.618556701030927, |
| "grad_norm": 0.34353137016296387, |
| "learning_rate": 0.0004248286713286713, |
| "loss": 3.3132, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.63311782864465, |
| "grad_norm": 0.35432496666908264, |
| "learning_rate": 0.0004246538461538461, |
| "loss": 3.3178, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.647678956258373, |
| "grad_norm": 0.3751126229763031, |
| "learning_rate": 0.00042447902097902094, |
| "loss": 3.3095, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.662240083872096, |
| "grad_norm": 0.3904288709163666, |
| "learning_rate": 0.00042430419580419574, |
| "loss": 3.315, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.676801211485817, |
| "grad_norm": 0.3363359570503235, |
| "learning_rate": 0.0004241293706293706, |
| "loss": 3.3121, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.69136233909954, |
| "grad_norm": 0.3706911504268646, |
| "learning_rate": 0.0004239545454545454, |
| "loss": 3.3076, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.705923466713262, |
| "grad_norm": 0.3716376721858978, |
| "learning_rate": 0.00042377972027972025, |
| "loss": 3.3189, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.720484594326985, |
| "grad_norm": 0.3527190387248993, |
| "learning_rate": 0.00042360489510489505, |
| "loss": 3.3206, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.735045721940708, |
| "grad_norm": 0.3800128102302551, |
| "learning_rate": 0.0004234300699300699, |
| "loss": 3.3231, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.749606849554429, |
| "grad_norm": 0.3714574873447418, |
| "learning_rate": 0.00042325524475524476, |
| "loss": 3.315, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.764167977168151, |
| "grad_norm": 0.3832313120365143, |
| "learning_rate": 0.00042308041958041956, |
| "loss": 3.3131, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.778729104781874, |
| "grad_norm": 0.3804624378681183, |
| "learning_rate": 0.0004229055944055944, |
| "loss": 3.3159, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.793290232395597, |
| "grad_norm": 0.33810165524482727, |
| "learning_rate": 0.0004227307692307692, |
| "loss": 3.3113, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.80785136000932, |
| "grad_norm": 0.36445289850234985, |
| "learning_rate": 0.00042255594405594406, |
| "loss": 3.3203, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.822412487623042, |
| "grad_norm": 0.36522355675697327, |
| "learning_rate": 0.0004223811188811188, |
| "loss": 3.3026, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.836973615236763, |
| "grad_norm": 0.34769731760025024, |
| "learning_rate": 0.00042220629370629366, |
| "loss": 3.3196, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.851534742850486, |
| "grad_norm": 0.3947177827358246, |
| "learning_rate": 0.00042203146853146846, |
| "loss": 3.3231, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.851534742850486, |
| "eval_accuracy": 0.3716891136790442, |
| "eval_loss": 3.541720390319824, |
| "eval_runtime": 179.9128, |
| "eval_samples_per_second": 92.523, |
| "eval_steps_per_second": 5.786, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.866095870464209, |
| "grad_norm": 0.39388659596443176, |
| "learning_rate": 0.0004218566433566433, |
| "loss": 3.3281, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.880656998077932, |
| "grad_norm": 0.3492530584335327, |
| "learning_rate": 0.0004216818181818181, |
| "loss": 3.3114, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.895218125691654, |
| "grad_norm": 0.37607342004776, |
| "learning_rate": 0.00042150699300699297, |
| "loss": 3.3227, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.909779253305375, |
| "grad_norm": 0.3827172517776489, |
| "learning_rate": 0.00042133216783216777, |
| "loss": 3.3262, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.924340380919098, |
| "grad_norm": 0.35901880264282227, |
| "learning_rate": 0.0004211573426573426, |
| "loss": 3.3353, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.93890150853282, |
| "grad_norm": 0.377532035112381, |
| "learning_rate": 0.0004209825174825175, |
| "loss": 3.331, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.953462636146543, |
| "grad_norm": 0.3667951822280884, |
| "learning_rate": 0.0004208076923076923, |
| "loss": 3.325, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.968023763760266, |
| "grad_norm": 0.36658504605293274, |
| "learning_rate": 0.00042063286713286713, |
| "loss": 3.3381, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.982584891373987, |
| "grad_norm": 0.33549827337265015, |
| "learning_rate": 0.00042045804195804193, |
| "loss": 3.3322, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.99714601898771, |
| "grad_norm": 0.36752671003341675, |
| "learning_rate": 0.0004202832167832168, |
| "loss": 3.3341, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.011648902090977, |
| "grad_norm": 0.34055018424987793, |
| "learning_rate": 0.0004201083916083916, |
| "loss": 3.2356, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.0262100297047, |
| "grad_norm": 0.3584866225719452, |
| "learning_rate": 0.00041993356643356644, |
| "loss": 3.2207, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.040771157318423, |
| "grad_norm": 0.35611316561698914, |
| "learning_rate": 0.0004197587412587412, |
| "loss": 3.221, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.055332284932145, |
| "grad_norm": 0.3985893130302429, |
| "learning_rate": 0.00041958391608391604, |
| "loss": 3.213, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.069893412545868, |
| "grad_norm": 0.3578147292137146, |
| "learning_rate": 0.00041940909090909084, |
| "loss": 3.2306, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.084454540159589, |
| "grad_norm": 0.3800044655799866, |
| "learning_rate": 0.0004192342657342657, |
| "loss": 3.2447, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.099015667773312, |
| "grad_norm": 0.3552968502044678, |
| "learning_rate": 0.0004190594405594405, |
| "loss": 3.2413, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.113576795387035, |
| "grad_norm": 0.37285658717155457, |
| "learning_rate": 0.00041888461538461535, |
| "loss": 3.2345, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.128137923000757, |
| "grad_norm": 0.38795363903045654, |
| "learning_rate": 0.00041870979020979015, |
| "loss": 3.2471, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.14269905061448, |
| "grad_norm": 0.3961765468120575, |
| "learning_rate": 0.000418534965034965, |
| "loss": 3.253, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.14269905061448, |
| "eval_accuracy": 0.371423657311583, |
| "eval_loss": 3.5584487915039062, |
| "eval_runtime": 180.216, |
| "eval_samples_per_second": 92.367, |
| "eval_steps_per_second": 5.776, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.157260178228203, |
| "grad_norm": 0.349721759557724, |
| "learning_rate": 0.00041836013986013985, |
| "loss": 3.2485, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.171821305841924, |
| "grad_norm": 0.37533050775527954, |
| "learning_rate": 0.00041818531468531465, |
| "loss": 3.2494, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.186382433455647, |
| "grad_norm": 0.37644854187965393, |
| "learning_rate": 0.0004180104895104895, |
| "loss": 3.2584, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.20094356106937, |
| "grad_norm": 0.3912580609321594, |
| "learning_rate": 0.0004178356643356643, |
| "loss": 3.257, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.215504688683092, |
| "grad_norm": 0.3808298707008362, |
| "learning_rate": 0.00041766083916083916, |
| "loss": 3.2662, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.230065816296815, |
| "grad_norm": 0.35927554965019226, |
| "learning_rate": 0.00041748601398601396, |
| "loss": 3.2615, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.244626943910536, |
| "grad_norm": 0.3501395285129547, |
| "learning_rate": 0.0004173111888111888, |
| "loss": 3.2642, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.259188071524258, |
| "grad_norm": 0.38749128580093384, |
| "learning_rate": 0.00041713636363636356, |
| "loss": 3.2639, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.273749199137981, |
| "grad_norm": 0.358185738325119, |
| "learning_rate": 0.0004169615384615384, |
| "loss": 3.2687, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.288310326751704, |
| "grad_norm": 0.3519742488861084, |
| "learning_rate": 0.0004167867132867132, |
| "loss": 3.2579, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.302871454365427, |
| "grad_norm": 0.37668633460998535, |
| "learning_rate": 0.00041661188811188807, |
| "loss": 3.2752, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.317432581979148, |
| "grad_norm": 0.39537209272384644, |
| "learning_rate": 0.00041643706293706287, |
| "loss": 3.2608, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.33199370959287, |
| "grad_norm": 0.3505506217479706, |
| "learning_rate": 0.0004162622377622377, |
| "loss": 3.2671, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.346554837206593, |
| "grad_norm": 0.4017491340637207, |
| "learning_rate": 0.0004160874125874126, |
| "loss": 3.2646, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.361115964820316, |
| "grad_norm": 0.3511753976345062, |
| "learning_rate": 0.0004159125874125874, |
| "loss": 3.268, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.375677092434039, |
| "grad_norm": 0.3840733468532562, |
| "learning_rate": 0.00041573776223776223, |
| "loss": 3.2698, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.39023822004776, |
| "grad_norm": 0.3798452615737915, |
| "learning_rate": 0.00041556293706293703, |
| "loss": 3.2844, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.404799347661482, |
| "grad_norm": 0.3574904203414917, |
| "learning_rate": 0.0004153881118881119, |
| "loss": 3.2712, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.419360475275205, |
| "grad_norm": 0.3655366599559784, |
| "learning_rate": 0.0004152132867132867, |
| "loss": 3.2807, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.433921602888928, |
| "grad_norm": 0.3935593366622925, |
| "learning_rate": 0.00041503846153846154, |
| "loss": 3.2694, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.433921602888928, |
| "eval_accuracy": 0.37167653448005467, |
| "eval_loss": 3.5547757148742676, |
| "eval_runtime": 180.1849, |
| "eval_samples_per_second": 92.383, |
| "eval_steps_per_second": 5.777, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.44848273050265, |
| "grad_norm": 0.3608306050300598, |
| "learning_rate": 0.00041486363636363634, |
| "loss": 3.2933, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.463043858116373, |
| "grad_norm": 0.3752822279930115, |
| "learning_rate": 0.0004146888111888112, |
| "loss": 3.2889, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.477604985730094, |
| "grad_norm": 0.3465368151664734, |
| "learning_rate": 0.00041451398601398593, |
| "loss": 3.2837, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.492166113343817, |
| "grad_norm": 0.36019545793533325, |
| "learning_rate": 0.0004143391608391608, |
| "loss": 3.2916, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.50672724095754, |
| "grad_norm": 0.39511221647262573, |
| "learning_rate": 0.0004141643356643356, |
| "loss": 3.2787, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.521288368571263, |
| "grad_norm": 0.35745346546173096, |
| "learning_rate": 0.00041398951048951044, |
| "loss": 3.2884, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.535849496184985, |
| "grad_norm": 0.37510353326797485, |
| "learning_rate": 0.00041381468531468524, |
| "loss": 3.2889, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.550410623798706, |
| "grad_norm": 0.38238877058029175, |
| "learning_rate": 0.0004136398601398601, |
| "loss": 3.2987, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.564971751412429, |
| "grad_norm": 0.37124359607696533, |
| "learning_rate": 0.00041346503496503495, |
| "loss": 3.2911, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.579532879026152, |
| "grad_norm": 0.35970422625541687, |
| "learning_rate": 0.00041329020979020975, |
| "loss": 3.2869, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.594094006639875, |
| "grad_norm": 0.36661380529403687, |
| "learning_rate": 0.0004131153846153846, |
| "loss": 3.2949, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.608655134253597, |
| "grad_norm": 0.3719494938850403, |
| "learning_rate": 0.0004129405594405594, |
| "loss": 3.2838, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.623216261867318, |
| "grad_norm": 0.4038849174976349, |
| "learning_rate": 0.00041276573426573426, |
| "loss": 3.2961, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.637777389481041, |
| "grad_norm": 0.3406437635421753, |
| "learning_rate": 0.00041259090909090906, |
| "loss": 3.2982, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.652338517094764, |
| "grad_norm": 0.3382225036621094, |
| "learning_rate": 0.0004124160839160839, |
| "loss": 3.3081, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.666899644708487, |
| "grad_norm": 0.34634169936180115, |
| "learning_rate": 0.0004122412587412587, |
| "loss": 3.3067, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.68146077232221, |
| "grad_norm": 0.3550468683242798, |
| "learning_rate": 0.00041206643356643356, |
| "loss": 3.3041, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.69602189993593, |
| "grad_norm": 0.37474149465560913, |
| "learning_rate": 0.0004118916083916083, |
| "loss": 3.3008, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.710583027549653, |
| "grad_norm": 0.39201828837394714, |
| "learning_rate": 0.00041171678321678316, |
| "loss": 3.2909, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.725144155163376, |
| "grad_norm": 0.3740415573120117, |
| "learning_rate": 0.00041154195804195796, |
| "loss": 3.3034, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.725144155163376, |
| "eval_accuracy": 0.37207154484084726, |
| "eval_loss": 3.5419719219207764, |
| "eval_runtime": 180.0153, |
| "eval_samples_per_second": 92.47, |
| "eval_steps_per_second": 5.783, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.739705282777098, |
| "grad_norm": 0.36284756660461426, |
| "learning_rate": 0.0004113671328671328, |
| "loss": 3.315, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.754266410390821, |
| "grad_norm": 0.3752084970474243, |
| "learning_rate": 0.00041119230769230767, |
| "loss": 3.3069, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.768827538004544, |
| "grad_norm": 0.3913860023021698, |
| "learning_rate": 0.00041101748251748247, |
| "loss": 3.3018, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.783388665618265, |
| "grad_norm": 0.35426411032676697, |
| "learning_rate": 0.0004108426573426573, |
| "loss": 3.3055, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.797949793231988, |
| "grad_norm": 0.358026385307312, |
| "learning_rate": 0.0004106678321678321, |
| "loss": 3.3072, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.81251092084571, |
| "grad_norm": 0.3560652434825897, |
| "learning_rate": 0.000410493006993007, |
| "loss": 3.3064, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.827072048459433, |
| "grad_norm": 0.3553317189216614, |
| "learning_rate": 0.0004103181818181818, |
| "loss": 3.3091, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.841633176073156, |
| "grad_norm": 0.3881910741329193, |
| "learning_rate": 0.00041014335664335663, |
| "loss": 3.31, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.856194303686877, |
| "grad_norm": 0.36161836981773376, |
| "learning_rate": 0.00040996853146853143, |
| "loss": 3.3085, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.8707554313006, |
| "grad_norm": 0.3497399091720581, |
| "learning_rate": 0.0004097937062937063, |
| "loss": 3.3154, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.885316558914322, |
| "grad_norm": 0.3460595905780792, |
| "learning_rate": 0.0004096188811188811, |
| "loss": 3.3137, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.899877686528045, |
| "grad_norm": 0.3706072270870209, |
| "learning_rate": 0.00040944405594405594, |
| "loss": 3.313, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.914438814141768, |
| "grad_norm": 0.3560197651386261, |
| "learning_rate": 0.0004092692307692307, |
| "loss": 3.316, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.928999941755489, |
| "grad_norm": 0.3823436200618744, |
| "learning_rate": 0.00040909440559440554, |
| "loss": 3.3201, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.943561069369212, |
| "grad_norm": 0.3746497333049774, |
| "learning_rate": 0.00040891958041958034, |
| "loss": 3.2996, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.958122196982934, |
| "grad_norm": 0.36520153284072876, |
| "learning_rate": 0.0004087447552447552, |
| "loss": 3.3044, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.972683324596657, |
| "grad_norm": 0.3529646098613739, |
| "learning_rate": 0.00040856993006993005, |
| "loss": 3.3194, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.98724445221038, |
| "grad_norm": 0.3613719940185547, |
| "learning_rate": 0.00040839510489510485, |
| "loss": 3.3077, |
| "step": 54900 |
| }, |
| { |
| "epoch": 16.001747335313645, |
| "grad_norm": 0.364214152097702, |
| "learning_rate": 0.0004082202797202797, |
| "loss": 3.302, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.01630846292737, |
| "grad_norm": 0.38465356826782227, |
| "learning_rate": 0.0004080454545454545, |
| "loss": 3.202, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.01630846292737, |
| "eval_accuracy": 0.3719789055062328, |
| "eval_loss": 3.5520362854003906, |
| "eval_runtime": 180.2449, |
| "eval_samples_per_second": 92.352, |
| "eval_steps_per_second": 5.775, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.03086959054109, |
| "grad_norm": 0.3448808193206787, |
| "learning_rate": 0.00040787062937062935, |
| "loss": 3.212, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.045430718154815, |
| "grad_norm": 0.385085791349411, |
| "learning_rate": 0.00040769580419580415, |
| "loss": 3.2028, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.059991845768536, |
| "grad_norm": 0.37901026010513306, |
| "learning_rate": 0.000407520979020979, |
| "loss": 3.2165, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.074552973382257, |
| "grad_norm": 0.37651047110557556, |
| "learning_rate": 0.0004073461538461538, |
| "loss": 3.2096, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.08911410099598, |
| "grad_norm": 0.41138216853141785, |
| "learning_rate": 0.00040717132867132866, |
| "loss": 3.2159, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.103675228609703, |
| "grad_norm": 0.36162111163139343, |
| "learning_rate": 0.00040699650349650346, |
| "loss": 3.2307, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.118236356223427, |
| "grad_norm": 0.3828912675380707, |
| "learning_rate": 0.0004068216783216783, |
| "loss": 3.2348, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.132797483837148, |
| "grad_norm": 0.3747362494468689, |
| "learning_rate": 0.00040664685314685306, |
| "loss": 3.236, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.14735861145087, |
| "grad_norm": 0.3754531443119049, |
| "learning_rate": 0.0004064720279720279, |
| "loss": 3.2281, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.161919739064594, |
| "grad_norm": 0.35862216353416443, |
| "learning_rate": 0.00040629720279720277, |
| "loss": 3.2412, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.176480866678315, |
| "grad_norm": 0.37285348773002625, |
| "learning_rate": 0.00040612237762237757, |
| "loss": 3.2374, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.19104199429204, |
| "grad_norm": 0.3793524205684662, |
| "learning_rate": 0.0004059475524475524, |
| "loss": 3.2366, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.20560312190576, |
| "grad_norm": 0.4028421938419342, |
| "learning_rate": 0.0004057727272727272, |
| "loss": 3.2367, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.22016424951948, |
| "grad_norm": 0.3684329688549042, |
| "learning_rate": 0.0004055979020979021, |
| "loss": 3.2366, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.234725377133206, |
| "grad_norm": 0.35635483264923096, |
| "learning_rate": 0.0004054230769230769, |
| "loss": 3.2607, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.249286504746927, |
| "grad_norm": 0.40113571286201477, |
| "learning_rate": 0.00040524825174825173, |
| "loss": 3.254, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.26384763236065, |
| "grad_norm": 0.3831492066383362, |
| "learning_rate": 0.00040507342657342653, |
| "loss": 3.2531, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.278408759974372, |
| "grad_norm": 0.39126092195510864, |
| "learning_rate": 0.0004048986013986014, |
| "loss": 3.2463, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.292969887588093, |
| "grad_norm": 0.3921174705028534, |
| "learning_rate": 0.0004047237762237762, |
| "loss": 3.2679, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.307531015201818, |
| "grad_norm": 0.3537350296974182, |
| "learning_rate": 0.00040454895104895104, |
| "loss": 3.2585, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.307531015201818, |
| "eval_accuracy": 0.3722745754637904, |
| "eval_loss": 3.550562620162964, |
| "eval_runtime": 179.9469, |
| "eval_samples_per_second": 92.505, |
| "eval_steps_per_second": 5.785, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.32209214281554, |
| "grad_norm": 0.34859004616737366, |
| "learning_rate": 0.00040437412587412583, |
| "loss": 3.26, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.336653270429263, |
| "grad_norm": 0.38539889454841614, |
| "learning_rate": 0.0004041993006993007, |
| "loss": 3.2614, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.351214398042984, |
| "grad_norm": 0.3837622404098511, |
| "learning_rate": 0.00040402447552447554, |
| "loss": 3.2734, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.36577552565671, |
| "grad_norm": 0.3635867238044739, |
| "learning_rate": 0.0004038496503496503, |
| "loss": 3.2611, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.38033665327043, |
| "grad_norm": 0.38138046860694885, |
| "learning_rate": 0.00040367482517482514, |
| "loss": 3.2671, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.39489778088415, |
| "grad_norm": 0.36557263135910034, |
| "learning_rate": 0.00040349999999999994, |
| "loss": 3.2711, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.409458908497875, |
| "grad_norm": 0.38369277119636536, |
| "learning_rate": 0.0004033251748251748, |
| "loss": 3.2599, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.424020036111596, |
| "grad_norm": 0.3768405318260193, |
| "learning_rate": 0.0004031503496503496, |
| "loss": 3.2758, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.43858116372532, |
| "grad_norm": 0.375188410282135, |
| "learning_rate": 0.00040297552447552445, |
| "loss": 3.2715, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.45314229133904, |
| "grad_norm": 0.3774068355560303, |
| "learning_rate": 0.00040280069930069925, |
| "loss": 3.2708, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.467703418952762, |
| "grad_norm": 0.3932843506336212, |
| "learning_rate": 0.0004026258741258741, |
| "loss": 3.271, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.482264546566487, |
| "grad_norm": 0.3809216320514679, |
| "learning_rate": 0.0004024510489510489, |
| "loss": 3.2672, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.496825674180208, |
| "grad_norm": 0.35790684819221497, |
| "learning_rate": 0.00040227622377622376, |
| "loss": 3.2728, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.511386801793932, |
| "grad_norm": 0.35489341616630554, |
| "learning_rate": 0.00040210139860139856, |
| "loss": 3.2741, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.525947929407653, |
| "grad_norm": 0.37867653369903564, |
| "learning_rate": 0.0004019265734265734, |
| "loss": 3.2832, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.540509057021374, |
| "grad_norm": 0.35731521248817444, |
| "learning_rate": 0.0004017517482517482, |
| "loss": 3.2739, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.5550701846351, |
| "grad_norm": 0.381649374961853, |
| "learning_rate": 0.00040157692307692306, |
| "loss": 3.2705, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.56963131224882, |
| "grad_norm": 0.35336828231811523, |
| "learning_rate": 0.0004014020979020979, |
| "loss": 3.2812, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.584192439862544, |
| "grad_norm": 0.38121795654296875, |
| "learning_rate": 0.00040122727272727266, |
| "loss": 3.2794, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.598753567476265, |
| "grad_norm": 0.3877357244491577, |
| "learning_rate": 0.0004010524475524475, |
| "loss": 3.2753, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.598753567476265, |
| "eval_accuracy": 0.3721695920554011, |
| "eval_loss": 3.545459270477295, |
| "eval_runtime": 179.9779, |
| "eval_samples_per_second": 92.489, |
| "eval_steps_per_second": 5.784, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.613314695089986, |
| "grad_norm": 0.3907957673072815, |
| "learning_rate": 0.0004008776223776223, |
| "loss": 3.2652, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.62787582270371, |
| "grad_norm": 0.38088274002075195, |
| "learning_rate": 0.00040070279720279717, |
| "loss": 3.2809, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.642436950317432, |
| "grad_norm": 0.3674221634864807, |
| "learning_rate": 0.00040052797202797197, |
| "loss": 3.2896, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.656998077931156, |
| "grad_norm": 0.38404926657676697, |
| "learning_rate": 0.0004003531468531468, |
| "loss": 3.2903, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.671559205544877, |
| "grad_norm": 0.3633009195327759, |
| "learning_rate": 0.0004001783216783216, |
| "loss": 3.279, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.6861203331586, |
| "grad_norm": 0.34845659136772156, |
| "learning_rate": 0.0004000034965034965, |
| "loss": 3.2839, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.700681460772323, |
| "grad_norm": 0.4066186249256134, |
| "learning_rate": 0.0003998286713286713, |
| "loss": 3.274, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.715242588386044, |
| "grad_norm": 0.3727223575115204, |
| "learning_rate": 0.00039965384615384613, |
| "loss": 3.2924, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.72980371599977, |
| "grad_norm": 0.9412374496459961, |
| "learning_rate": 0.00039947902097902093, |
| "loss": 3.3019, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.74436484361349, |
| "grad_norm": 0.3742312788963318, |
| "learning_rate": 0.0003993041958041958, |
| "loss": 3.288, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.75892597122721, |
| "grad_norm": 0.35423213243484497, |
| "learning_rate": 0.00039912937062937064, |
| "loss": 3.3062, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.773487098840935, |
| "grad_norm": 0.3780602216720581, |
| "learning_rate": 0.00039895454545454544, |
| "loss": 3.2955, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.788048226454656, |
| "grad_norm": 0.34438854455947876, |
| "learning_rate": 0.0003987797202797203, |
| "loss": 3.2837, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.80260935406838, |
| "grad_norm": 0.3713186979293823, |
| "learning_rate": 0.00039860489510489504, |
| "loss": 3.2867, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.8171704816821, |
| "grad_norm": 0.3602714240550995, |
| "learning_rate": 0.0003984300699300699, |
| "loss": 3.2944, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.831731609295822, |
| "grad_norm": 0.359958291053772, |
| "learning_rate": 0.0003982552447552447, |
| "loss": 3.2914, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.846292736909547, |
| "grad_norm": 0.3828895390033722, |
| "learning_rate": 0.00039808041958041955, |
| "loss": 3.2905, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.860853864523268, |
| "grad_norm": 0.3683166205883026, |
| "learning_rate": 0.00039790559440559435, |
| "loss": 3.2857, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.875414992136992, |
| "grad_norm": 0.35476696491241455, |
| "learning_rate": 0.0003977307692307692, |
| "loss": 3.3029, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.889976119750713, |
| "grad_norm": 0.3767741918563843, |
| "learning_rate": 0.000397555944055944, |
| "loss": 3.2954, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.889976119750713, |
| "eval_accuracy": 0.3727307183804199, |
| "eval_loss": 3.538306951522827, |
| "eval_runtime": 179.9534, |
| "eval_samples_per_second": 92.502, |
| "eval_steps_per_second": 5.785, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.904537247364434, |
| "grad_norm": 0.3530736267566681, |
| "learning_rate": 0.00039738111888111885, |
| "loss": 3.2834, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.91909837497816, |
| "grad_norm": 0.373714417219162, |
| "learning_rate": 0.00039720629370629365, |
| "loss": 3.3025, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.93365950259188, |
| "grad_norm": 0.3793703019618988, |
| "learning_rate": 0.0003970314685314685, |
| "loss": 3.299, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.948220630205604, |
| "grad_norm": 0.36771833896636963, |
| "learning_rate": 0.0003968566433566433, |
| "loss": 3.3048, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.962781757819325, |
| "grad_norm": 0.3570466637611389, |
| "learning_rate": 0.00039668181818181816, |
| "loss": 3.2936, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.977342885433046, |
| "grad_norm": 0.396435022354126, |
| "learning_rate": 0.000396506993006993, |
| "loss": 3.3104, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.99190401304677, |
| "grad_norm": 0.3816315531730652, |
| "learning_rate": 0.0003963321678321678, |
| "loss": 3.3037, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.006406896150036, |
| "grad_norm": 0.3699148893356323, |
| "learning_rate": 0.00039615734265734267, |
| "loss": 3.2599, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.02096802376376, |
| "grad_norm": 0.3788648545742035, |
| "learning_rate": 0.0003959825174825174, |
| "loss": 3.1858, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.03552915137748, |
| "grad_norm": 0.38128015398979187, |
| "learning_rate": 0.00039580769230769227, |
| "loss": 3.1884, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.050090278991206, |
| "grad_norm": 0.36501333117485046, |
| "learning_rate": 0.00039563286713286707, |
| "loss": 3.1948, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.064651406604927, |
| "grad_norm": 0.3572292625904083, |
| "learning_rate": 0.0003954580419580419, |
| "loss": 3.2079, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.07921253421865, |
| "grad_norm": 0.3647148907184601, |
| "learning_rate": 0.0003952832167832167, |
| "loss": 3.2029, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.093773661832373, |
| "grad_norm": 0.4238453209400177, |
| "learning_rate": 0.0003951083916083916, |
| "loss": 3.2016, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.108334789446094, |
| "grad_norm": 0.4407082498073578, |
| "learning_rate": 0.0003949335664335664, |
| "loss": 3.2037, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.122895917059818, |
| "grad_norm": 0.38033443689346313, |
| "learning_rate": 0.00039475874125874123, |
| "loss": 3.2164, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.13745704467354, |
| "grad_norm": 0.3846021890640259, |
| "learning_rate": 0.00039458391608391603, |
| "loss": 3.2216, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.152018172287264, |
| "grad_norm": 0.39829474687576294, |
| "learning_rate": 0.0003944090909090909, |
| "loss": 3.2221, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.166579299900985, |
| "grad_norm": 0.363840788602829, |
| "learning_rate": 0.00039423426573426573, |
| "loss": 3.207, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.181140427514705, |
| "grad_norm": 0.37014469504356384, |
| "learning_rate": 0.00039405944055944053, |
| "loss": 3.2297, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.181140427514705, |
| "eval_accuracy": 0.37208671041719915, |
| "eval_loss": 3.555042266845703, |
| "eval_runtime": 180.0923, |
| "eval_samples_per_second": 92.43, |
| "eval_steps_per_second": 5.78, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.19570155512843, |
| "grad_norm": 0.41102135181427, |
| "learning_rate": 0.0003938846153846154, |
| "loss": 3.2334, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.21026268274215, |
| "grad_norm": 0.3776380121707916, |
| "learning_rate": 0.0003937097902097902, |
| "loss": 3.2323, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.224823810355876, |
| "grad_norm": 0.37090641260147095, |
| "learning_rate": 0.00039353496503496504, |
| "loss": 3.2395, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.239384937969596, |
| "grad_norm": 0.35886919498443604, |
| "learning_rate": 0.0003933601398601398, |
| "loss": 3.238, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.253946065583317, |
| "grad_norm": 0.35070979595184326, |
| "learning_rate": 0.00039318531468531464, |
| "loss": 3.2485, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.268507193197042, |
| "grad_norm": 0.38137462735176086, |
| "learning_rate": 0.00039301048951048944, |
| "loss": 3.2334, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.283068320810763, |
| "grad_norm": 0.36624035239219666, |
| "learning_rate": 0.0003928356643356643, |
| "loss": 3.2441, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.297629448424487, |
| "grad_norm": 0.3852206766605377, |
| "learning_rate": 0.0003926608391608391, |
| "loss": 3.244, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.31219057603821, |
| "grad_norm": 0.37531208992004395, |
| "learning_rate": 0.00039248601398601395, |
| "loss": 3.2498, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.32675170365193, |
| "grad_norm": 0.3944702744483948, |
| "learning_rate": 0.00039231118881118875, |
| "loss": 3.2525, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.341312831265654, |
| "grad_norm": 0.397729754447937, |
| "learning_rate": 0.0003921363636363636, |
| "loss": 3.2451, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.355873958879375, |
| "grad_norm": 0.36204469203948975, |
| "learning_rate": 0.00039196153846153846, |
| "loss": 3.2406, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.3704350864931, |
| "grad_norm": 0.4126119315624237, |
| "learning_rate": 0.00039178671328671326, |
| "loss": 3.2493, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.38499621410682, |
| "grad_norm": 0.37658989429473877, |
| "learning_rate": 0.0003916118881118881, |
| "loss": 3.2598, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.39955734172054, |
| "grad_norm": 0.3899455666542053, |
| "learning_rate": 0.0003914370629370629, |
| "loss": 3.2431, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.414118469334266, |
| "grad_norm": 0.3752618134021759, |
| "learning_rate": 0.00039126223776223776, |
| "loss": 3.2488, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.428679596947987, |
| "grad_norm": 0.3970799744129181, |
| "learning_rate": 0.00039108741258741256, |
| "loss": 3.2614, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.44324072456171, |
| "grad_norm": 0.38133737444877625, |
| "learning_rate": 0.0003909125874125874, |
| "loss": 3.2644, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.457801852175432, |
| "grad_norm": 0.3630177974700928, |
| "learning_rate": 0.00039073776223776216, |
| "loss": 3.2532, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.472362979789153, |
| "grad_norm": 0.35856887698173523, |
| "learning_rate": 0.000390562937062937, |
| "loss": 3.2697, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.472362979789153, |
| "eval_accuracy": 0.37256001747450596, |
| "eval_loss": 3.5483932495117188, |
| "eval_runtime": 179.9066, |
| "eval_samples_per_second": 92.526, |
| "eval_steps_per_second": 5.786, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.486924107402878, |
| "grad_norm": 0.38552340865135193, |
| "learning_rate": 0.0003903881118881118, |
| "loss": 3.2658, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.5014852350166, |
| "grad_norm": 0.39752206206321716, |
| "learning_rate": 0.00039021328671328667, |
| "loss": 3.2575, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.516046362630323, |
| "grad_norm": 0.3888029456138611, |
| "learning_rate": 0.00039003846153846147, |
| "loss": 3.2813, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.530607490244044, |
| "grad_norm": 0.35686013102531433, |
| "learning_rate": 0.0003898636363636363, |
| "loss": 3.2702, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.545168617857765, |
| "grad_norm": 0.3533909022808075, |
| "learning_rate": 0.0003896888111888111, |
| "loss": 3.2613, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.55972974547149, |
| "grad_norm": 0.348493367433548, |
| "learning_rate": 0.000389513986013986, |
| "loss": 3.2744, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.57429087308521, |
| "grad_norm": 0.38434287905693054, |
| "learning_rate": 0.00038933916083916083, |
| "loss": 3.2616, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.588852000698935, |
| "grad_norm": 0.3869266211986542, |
| "learning_rate": 0.00038916433566433563, |
| "loss": 3.275, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.603413128312656, |
| "grad_norm": 0.3701704144477844, |
| "learning_rate": 0.0003889895104895105, |
| "loss": 3.2845, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.617974255926377, |
| "grad_norm": 0.3878682255744934, |
| "learning_rate": 0.0003888146853146853, |
| "loss": 3.2652, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.6325353835401, |
| "grad_norm": 0.39550337195396423, |
| "learning_rate": 0.00038863986013986014, |
| "loss": 3.2684, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.647096511153823, |
| "grad_norm": 0.4366273581981659, |
| "learning_rate": 0.00038846503496503494, |
| "loss": 3.2679, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.661657638767547, |
| "grad_norm": 0.3733770549297333, |
| "learning_rate": 0.0003882902097902098, |
| "loss": 3.2682, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.676218766381268, |
| "grad_norm": 0.38435491919517517, |
| "learning_rate": 0.00038811538461538454, |
| "loss": 3.2544, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.690779893994993, |
| "grad_norm": 0.3815951943397522, |
| "learning_rate": 0.0003879405594405594, |
| "loss": 3.2807, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.705341021608714, |
| "grad_norm": 0.3753691017627716, |
| "learning_rate": 0.0003877657342657342, |
| "loss": 3.2762, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.719902149222435, |
| "grad_norm": 0.42220115661621094, |
| "learning_rate": 0.00038759090909090905, |
| "loss": 3.2871, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.73446327683616, |
| "grad_norm": 0.35121020674705505, |
| "learning_rate": 0.00038741608391608384, |
| "loss": 3.2713, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.74902440444988, |
| "grad_norm": 0.3975067734718323, |
| "learning_rate": 0.0003872412587412587, |
| "loss": 3.2847, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.763585532063605, |
| "grad_norm": 0.3895809054374695, |
| "learning_rate": 0.00038706643356643355, |
| "loss": 3.2866, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.763585532063605, |
| "eval_accuracy": 0.3731351337498028, |
| "eval_loss": 3.5378761291503906, |
| "eval_runtime": 179.9947, |
| "eval_samples_per_second": 92.48, |
| "eval_steps_per_second": 5.784, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.778146659677326, |
| "grad_norm": 0.3391883373260498, |
| "learning_rate": 0.00038689160839160835, |
| "loss": 3.2825, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.792707787291047, |
| "grad_norm": 0.3976898789405823, |
| "learning_rate": 0.0003867167832167832, |
| "loss": 3.2761, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.80726891490477, |
| "grad_norm": 0.3589976727962494, |
| "learning_rate": 0.000386541958041958, |
| "loss": 3.2818, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.821830042518492, |
| "grad_norm": 0.38042929768562317, |
| "learning_rate": 0.00038636713286713286, |
| "loss": 3.2655, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.836391170132217, |
| "grad_norm": 0.374071329832077, |
| "learning_rate": 0.00038619230769230766, |
| "loss": 3.288, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.850952297745938, |
| "grad_norm": 0.3914439380168915, |
| "learning_rate": 0.0003860174825174825, |
| "loss": 3.2895, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.86551342535966, |
| "grad_norm": 0.38434526324272156, |
| "learning_rate": 0.0003858426573426573, |
| "loss": 3.2922, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.880074552973383, |
| "grad_norm": 0.36606523394584656, |
| "learning_rate": 0.00038566783216783217, |
| "loss": 3.2884, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.894635680587104, |
| "grad_norm": 0.3612377643585205, |
| "learning_rate": 0.0003854930069930069, |
| "loss": 3.2856, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.90919680820083, |
| "grad_norm": 0.3759946823120117, |
| "learning_rate": 0.00038531818181818177, |
| "loss": 3.2932, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.92375793581455, |
| "grad_norm": 0.3594622313976288, |
| "learning_rate": 0.00038514335664335657, |
| "loss": 3.2704, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.93831906342827, |
| "grad_norm": 0.3536679148674011, |
| "learning_rate": 0.0003849685314685314, |
| "loss": 3.2835, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.952880191041995, |
| "grad_norm": 0.49064597487449646, |
| "learning_rate": 0.0003847937062937062, |
| "loss": 3.2914, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.967441318655716, |
| "grad_norm": 0.38135209679603577, |
| "learning_rate": 0.0003846188811188811, |
| "loss": 3.2825, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.98200244626944, |
| "grad_norm": 0.37888818979263306, |
| "learning_rate": 0.00038444405594405593, |
| "loss": 3.294, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.99656357388316, |
| "grad_norm": 0.35670679807662964, |
| "learning_rate": 0.00038426923076923073, |
| "loss": 3.2805, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.01106645698643, |
| "grad_norm": 0.3818565607070923, |
| "learning_rate": 0.0003840944055944056, |
| "loss": 3.2114, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.02562758460015, |
| "grad_norm": 0.3703397512435913, |
| "learning_rate": 0.0003839195804195804, |
| "loss": 3.1789, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.040188712213872, |
| "grad_norm": 0.39804592728614807, |
| "learning_rate": 0.00038374475524475523, |
| "loss": 3.1832, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.054749839827597, |
| "grad_norm": 0.401012659072876, |
| "learning_rate": 0.00038356993006993003, |
| "loss": 3.1941, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.054749839827597, |
| "eval_accuracy": 0.3725934052550015, |
| "eval_loss": 3.549222230911255, |
| "eval_runtime": 180.2253, |
| "eval_samples_per_second": 92.362, |
| "eval_steps_per_second": 5.776, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.069310967441318, |
| "grad_norm": 0.4045586585998535, |
| "learning_rate": 0.0003833951048951049, |
| "loss": 3.1949, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.083872095055042, |
| "grad_norm": 0.3926238417625427, |
| "learning_rate": 0.0003832202797202797, |
| "loss": 3.2128, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.098433222668763, |
| "grad_norm": 0.407951295375824, |
| "learning_rate": 0.00038304545454545454, |
| "loss": 3.1859, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.112994350282484, |
| "grad_norm": 0.39534199237823486, |
| "learning_rate": 0.0003828706293706293, |
| "loss": 3.2046, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.12755547789621, |
| "grad_norm": 0.3941259980201721, |
| "learning_rate": 0.00038269580419580414, |
| "loss": 3.2065, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.14211660550993, |
| "grad_norm": 0.3844638168811798, |
| "learning_rate": 0.00038252097902097894, |
| "loss": 3.2095, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.156677733123654, |
| "grad_norm": 0.37141576409339905, |
| "learning_rate": 0.0003823461538461538, |
| "loss": 3.2036, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.171238860737375, |
| "grad_norm": 0.3663264811038971, |
| "learning_rate": 0.00038217132867132865, |
| "loss": 3.2138, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.185799988351096, |
| "grad_norm": 0.37277019023895264, |
| "learning_rate": 0.00038199650349650345, |
| "loss": 3.2184, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.20036111596482, |
| "grad_norm": 0.36672139167785645, |
| "learning_rate": 0.0003818216783216783, |
| "loss": 3.2182, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.214922243578542, |
| "grad_norm": 0.3598610758781433, |
| "learning_rate": 0.0003816468531468531, |
| "loss": 3.2258, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.229483371192266, |
| "grad_norm": 0.37559694051742554, |
| "learning_rate": 0.00038147202797202796, |
| "loss": 3.2361, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.244044498805987, |
| "grad_norm": 0.3849242627620697, |
| "learning_rate": 0.00038129720279720276, |
| "loss": 3.2326, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.25860562641971, |
| "grad_norm": 0.35836875438690186, |
| "learning_rate": 0.0003811223776223776, |
| "loss": 3.2255, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.273166754033433, |
| "grad_norm": 0.3678399920463562, |
| "learning_rate": 0.0003809475524475524, |
| "loss": 3.2378, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.287727881647154, |
| "grad_norm": 0.37946373224258423, |
| "learning_rate": 0.00038077272727272726, |
| "loss": 3.2287, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.30228900926088, |
| "grad_norm": 0.3561152219772339, |
| "learning_rate": 0.00038059790209790206, |
| "loss": 3.2255, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.3168501368746, |
| "grad_norm": 0.3992021977901459, |
| "learning_rate": 0.0003804230769230769, |
| "loss": 3.2304, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.33141126448832, |
| "grad_norm": 0.37258997559547424, |
| "learning_rate": 0.00038024825174825166, |
| "loss": 3.239, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.345972392102045, |
| "grad_norm": 0.37678083777427673, |
| "learning_rate": 0.0003800734265734265, |
| "loss": 3.2347, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.345972392102045, |
| "eval_accuracy": 0.3727190796822894, |
| "eval_loss": 3.547424077987671, |
| "eval_runtime": 180.3148, |
| "eval_samples_per_second": 92.316, |
| "eval_steps_per_second": 5.773, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.360533519715766, |
| "grad_norm": 0.4106750190258026, |
| "learning_rate": 0.0003798986013986013, |
| "loss": 3.2414, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.37509464732949, |
| "grad_norm": 0.4024544060230255, |
| "learning_rate": 0.00037972377622377617, |
| "loss": 3.2448, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.38965577494321, |
| "grad_norm": 0.36580199003219604, |
| "learning_rate": 0.000379548951048951, |
| "loss": 3.2383, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.404216902556932, |
| "grad_norm": 0.3975513279438019, |
| "learning_rate": 0.0003793741258741258, |
| "loss": 3.2537, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.418778030170657, |
| "grad_norm": 0.3643522262573242, |
| "learning_rate": 0.0003791993006993007, |
| "loss": 3.2468, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.433339157784378, |
| "grad_norm": 0.3544718623161316, |
| "learning_rate": 0.0003790244755244755, |
| "loss": 3.2346, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.447900285398102, |
| "grad_norm": 0.39411383867263794, |
| "learning_rate": 0.00037884965034965033, |
| "loss": 3.2436, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.462461413011823, |
| "grad_norm": 0.3775225281715393, |
| "learning_rate": 0.00037867482517482513, |
| "loss": 3.2485, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.477022540625548, |
| "grad_norm": 0.372683048248291, |
| "learning_rate": 0.0003785, |
| "loss": 3.2413, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.49158366823927, |
| "grad_norm": 0.38039228320121765, |
| "learning_rate": 0.0003783251748251748, |
| "loss": 3.2524, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.50614479585299, |
| "grad_norm": 0.39060574769973755, |
| "learning_rate": 0.00037815034965034964, |
| "loss": 3.2429, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.520705923466714, |
| "grad_norm": 0.3711771070957184, |
| "learning_rate": 0.00037797552447552444, |
| "loss": 3.2344, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.535267051080435, |
| "grad_norm": 0.3637911081314087, |
| "learning_rate": 0.0003778006993006993, |
| "loss": 3.25, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.54982817869416, |
| "grad_norm": 0.3562261462211609, |
| "learning_rate": 0.00037762587412587404, |
| "loss": 3.2543, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.56438930630788, |
| "grad_norm": 0.37300369143486023, |
| "learning_rate": 0.0003774510489510489, |
| "loss": 3.2443, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.5789504339216, |
| "grad_norm": 0.3933142125606537, |
| "learning_rate": 0.0003772762237762238, |
| "loss": 3.2484, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.593511561535326, |
| "grad_norm": 0.38338160514831543, |
| "learning_rate": 0.00037710139860139854, |
| "loss": 3.2543, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.608072689149047, |
| "grad_norm": 0.39572325348854065, |
| "learning_rate": 0.0003769265734265734, |
| "loss": 3.2654, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.62263381676277, |
| "grad_norm": 0.37693658471107483, |
| "learning_rate": 0.0003767517482517482, |
| "loss": 3.2641, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.637194944376493, |
| "grad_norm": 0.3730895221233368, |
| "learning_rate": 0.00037657692307692305, |
| "loss": 3.2498, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.637194944376493, |
| "eval_accuracy": 0.37294080275980573, |
| "eval_loss": 3.540463924407959, |
| "eval_runtime": 179.9256, |
| "eval_samples_per_second": 92.516, |
| "eval_steps_per_second": 5.786, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.651756071990214, |
| "grad_norm": 0.39404192566871643, |
| "learning_rate": 0.00037640209790209785, |
| "loss": 3.2589, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.666317199603938, |
| "grad_norm": 0.38090044260025024, |
| "learning_rate": 0.0003762272727272727, |
| "loss": 3.2701, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.68087832721766, |
| "grad_norm": 0.36398962140083313, |
| "learning_rate": 0.0003760524475524475, |
| "loss": 3.2568, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.695439454831384, |
| "grad_norm": 0.38197851181030273, |
| "learning_rate": 0.00037587762237762236, |
| "loss": 3.2634, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.710000582445105, |
| "grad_norm": 0.3825123608112335, |
| "learning_rate": 0.00037570279720279716, |
| "loss": 3.2636, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.724561710058826, |
| "grad_norm": 0.367422878742218, |
| "learning_rate": 0.000375527972027972, |
| "loss": 3.26, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.73912283767255, |
| "grad_norm": 0.38466405868530273, |
| "learning_rate": 0.0003753531468531468, |
| "loss": 3.2791, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.75368396528627, |
| "grad_norm": 0.4133056104183197, |
| "learning_rate": 0.00037517832167832167, |
| "loss": 3.2582, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.768245092899996, |
| "grad_norm": 0.3779293894767761, |
| "learning_rate": 0.0003750034965034965, |
| "loss": 3.2694, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.782806220513717, |
| "grad_norm": 0.3765803575515747, |
| "learning_rate": 0.00037482867132867127, |
| "loss": 3.2688, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.797367348127437, |
| "grad_norm": 0.37509140372276306, |
| "learning_rate": 0.0003746538461538462, |
| "loss": 3.2713, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.811928475741162, |
| "grad_norm": 0.40308573842048645, |
| "learning_rate": 0.0003744790209790209, |
| "loss": 3.2565, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.826489603354883, |
| "grad_norm": 0.3870610296726227, |
| "learning_rate": 0.0003743041958041958, |
| "loss": 3.2639, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.841050730968607, |
| "grad_norm": 0.3560023009777069, |
| "learning_rate": 0.0003741293706293706, |
| "loss": 3.2662, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.85561185858233, |
| "grad_norm": 0.38569357991218567, |
| "learning_rate": 0.0003739545454545454, |
| "loss": 3.2685, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.87017298619605, |
| "grad_norm": 0.36742493510246277, |
| "learning_rate": 0.0003737797202797202, |
| "loss": 3.2719, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.884734113809774, |
| "grad_norm": 0.3732217848300934, |
| "learning_rate": 0.0003736048951048951, |
| "loss": 3.2794, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.899295241423495, |
| "grad_norm": 0.3774060010910034, |
| "learning_rate": 0.0003734300699300699, |
| "loss": 3.2763, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.91385636903722, |
| "grad_norm": 0.38624700903892517, |
| "learning_rate": 0.00037325524475524473, |
| "loss": 3.2776, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.92841749665094, |
| "grad_norm": 0.37768304347991943, |
| "learning_rate": 0.00037308041958041953, |
| "loss": 3.2726, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.92841749665094, |
| "eval_accuracy": 0.3736417110249978, |
| "eval_loss": 3.532841444015503, |
| "eval_runtime": 180.0966, |
| "eval_samples_per_second": 92.428, |
| "eval_steps_per_second": 5.78, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.94297862426466, |
| "grad_norm": 0.39474961161613464, |
| "learning_rate": 0.0003729055944055944, |
| "loss": 3.2714, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.957539751878386, |
| "grad_norm": 0.3922153413295746, |
| "learning_rate": 0.0003727307692307692, |
| "loss": 3.2794, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.972100879492107, |
| "grad_norm": 0.3659467101097107, |
| "learning_rate": 0.00037255594405594404, |
| "loss": 3.2946, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.98666200710583, |
| "grad_norm": 0.3558950424194336, |
| "learning_rate": 0.0003723811188811189, |
| "loss": 3.2897, |
| "step": 65200 |
| }, |
| { |
| "epoch": 19.001164890209097, |
| "grad_norm": 0.3747306764125824, |
| "learning_rate": 0.00037220629370629364, |
| "loss": 3.2719, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.01572601782282, |
| "grad_norm": 0.4085351824760437, |
| "learning_rate": 0.00037203146853146855, |
| "loss": 3.1611, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.030287145436542, |
| "grad_norm": 0.4003465473651886, |
| "learning_rate": 0.0003718566433566433, |
| "loss": 3.1577, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.044848273050263, |
| "grad_norm": 0.38690558075904846, |
| "learning_rate": 0.00037168181818181815, |
| "loss": 3.1785, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.059409400663988, |
| "grad_norm": 0.38368141651153564, |
| "learning_rate": 0.00037150699300699295, |
| "loss": 3.1785, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.07397052827771, |
| "grad_norm": 0.3683834373950958, |
| "learning_rate": 0.0003713321678321678, |
| "loss": 3.1779, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.088531655891433, |
| "grad_norm": 0.3883166015148163, |
| "learning_rate": 0.0003711573426573426, |
| "loss": 3.1804, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.103092783505154, |
| "grad_norm": 0.41156354546546936, |
| "learning_rate": 0.00037098251748251746, |
| "loss": 3.1908, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.11765391111888, |
| "grad_norm": 0.4096542000770569, |
| "learning_rate": 0.00037080769230769226, |
| "loss": 3.1864, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.1322150387326, |
| "grad_norm": 0.38619449734687805, |
| "learning_rate": 0.0003706328671328671, |
| "loss": 3.1971, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.14677616634632, |
| "grad_norm": 0.41573604941368103, |
| "learning_rate": 0.0003704580419580419, |
| "loss": 3.2037, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.161337293960045, |
| "grad_norm": 0.4099518954753876, |
| "learning_rate": 0.00037028321678321676, |
| "loss": 3.1961, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.175898421573766, |
| "grad_norm": 0.3666977882385254, |
| "learning_rate": 0.0003701083916083916, |
| "loss": 3.2115, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.19045954918749, |
| "grad_norm": 0.40087178349494934, |
| "learning_rate": 0.0003699335664335664, |
| "loss": 3.1953, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.20502067680121, |
| "grad_norm": 0.3936769366264343, |
| "learning_rate": 0.00036975874125874127, |
| "loss": 3.213, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.219581804414933, |
| "grad_norm": 0.363978773355484, |
| "learning_rate": 0.00036958391608391607, |
| "loss": 3.2068, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.219581804414933, |
| "eval_accuracy": 0.37264948261872116, |
| "eval_loss": 3.553295135498047, |
| "eval_runtime": 179.9635, |
| "eval_samples_per_second": 92.497, |
| "eval_steps_per_second": 5.785, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.234142932028657, |
| "grad_norm": 0.3696822226047516, |
| "learning_rate": 0.0003694090909090909, |
| "loss": 3.2047, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.248704059642378, |
| "grad_norm": 0.3602693974971771, |
| "learning_rate": 0.00036923426573426567, |
| "loss": 3.2044, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.263265187256103, |
| "grad_norm": 0.37676116824150085, |
| "learning_rate": 0.0003690594405594405, |
| "loss": 3.221, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.277826314869824, |
| "grad_norm": 0.3772142827510834, |
| "learning_rate": 0.0003688846153846153, |
| "loss": 3.2148, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.292387442483545, |
| "grad_norm": 0.41931411623954773, |
| "learning_rate": 0.0003687097902097902, |
| "loss": 3.223, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.30694857009727, |
| "grad_norm": 0.40397313237190247, |
| "learning_rate": 0.000368534965034965, |
| "loss": 3.2234, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.32150969771099, |
| "grad_norm": 0.39478546380996704, |
| "learning_rate": 0.00036836013986013983, |
| "loss": 3.238, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.336070825324715, |
| "grad_norm": 0.41326457262039185, |
| "learning_rate": 0.00036818531468531463, |
| "loss": 3.228, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.350631952938436, |
| "grad_norm": 0.4092574715614319, |
| "learning_rate": 0.0003680104895104895, |
| "loss": 3.2262, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.365193080552157, |
| "grad_norm": 0.35985106229782104, |
| "learning_rate": 0.0003678356643356643, |
| "loss": 3.2233, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.37975420816588, |
| "grad_norm": 0.3727477490901947, |
| "learning_rate": 0.00036766083916083914, |
| "loss": 3.2334, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.394315335779602, |
| "grad_norm": 0.5783479809761047, |
| "learning_rate": 0.000367486013986014, |
| "loss": 3.2399, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.408876463393327, |
| "grad_norm": 0.42180201411247253, |
| "learning_rate": 0.0003673111888111888, |
| "loss": 3.2354, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.423437591007048, |
| "grad_norm": 0.38537582755088806, |
| "learning_rate": 0.00036713636363636365, |
| "loss": 3.2342, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.43799871862077, |
| "grad_norm": 0.3808097541332245, |
| "learning_rate": 0.00036696153846153844, |
| "loss": 3.2329, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.452559846234493, |
| "grad_norm": 0.38023439049720764, |
| "learning_rate": 0.0003667867132867133, |
| "loss": 3.2295, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.467120973848214, |
| "grad_norm": 0.37724730372428894, |
| "learning_rate": 0.00036661188811188804, |
| "loss": 3.2327, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.48168210146194, |
| "grad_norm": 0.3762873709201813, |
| "learning_rate": 0.0003664370629370629, |
| "loss": 3.2437, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.49624322907566, |
| "grad_norm": 0.3730737268924713, |
| "learning_rate": 0.0003662622377622377, |
| "loss": 3.2385, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.51080435668938, |
| "grad_norm": 0.4036095440387726, |
| "learning_rate": 0.00036608741258741255, |
| "loss": 3.2369, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.51080435668938, |
| "eval_accuracy": 0.37310950510139423, |
| "eval_loss": 3.546161651611328, |
| "eval_runtime": 179.8812, |
| "eval_samples_per_second": 92.539, |
| "eval_steps_per_second": 5.787, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.525365484303105, |
| "grad_norm": 0.3802689015865326, |
| "learning_rate": 0.00036591258741258735, |
| "loss": 3.2357, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.539926611916826, |
| "grad_norm": 0.3963395059108734, |
| "learning_rate": 0.0003657377622377622, |
| "loss": 3.2439, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.55448773953055, |
| "grad_norm": 0.3858702480792999, |
| "learning_rate": 0.000365562937062937, |
| "loss": 3.2407, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.56904886714427, |
| "grad_norm": 0.393506795167923, |
| "learning_rate": 0.00036538811188811186, |
| "loss": 3.2505, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.583609994757992, |
| "grad_norm": 0.3662310838699341, |
| "learning_rate": 0.0003652132867132867, |
| "loss": 3.2384, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.598171122371717, |
| "grad_norm": 0.39800354838371277, |
| "learning_rate": 0.0003650384615384615, |
| "loss": 3.2513, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.612732249985438, |
| "grad_norm": 0.39850059151649475, |
| "learning_rate": 0.00036486363636363637, |
| "loss": 3.2515, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.627293377599162, |
| "grad_norm": 0.40997257828712463, |
| "learning_rate": 0.00036468881118881117, |
| "loss": 3.2362, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.641854505212883, |
| "grad_norm": 0.3920000195503235, |
| "learning_rate": 0.000364513986013986, |
| "loss": 3.2478, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.656415632826604, |
| "grad_norm": 0.38390007615089417, |
| "learning_rate": 0.0003643391608391608, |
| "loss": 3.2479, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.67097676044033, |
| "grad_norm": 0.3826581537723541, |
| "learning_rate": 0.0003641643356643357, |
| "loss": 3.26, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.68553788805405, |
| "grad_norm": 0.3734751045703888, |
| "learning_rate": 0.0003639895104895104, |
| "loss": 3.2584, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.700099015667774, |
| "grad_norm": 0.44370555877685547, |
| "learning_rate": 0.0003638146853146853, |
| "loss": 3.2524, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.714660143281495, |
| "grad_norm": 0.35572460293769836, |
| "learning_rate": 0.00036363986013986007, |
| "loss": 3.2571, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.729221270895216, |
| "grad_norm": 0.3746419847011566, |
| "learning_rate": 0.0003634650349650349, |
| "loss": 3.2597, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.74378239850894, |
| "grad_norm": 0.39100751280784607, |
| "learning_rate": 0.0003632902097902097, |
| "loss": 3.2632, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.758343526122662, |
| "grad_norm": 0.39586180448532104, |
| "learning_rate": 0.0003631153846153846, |
| "loss": 3.2635, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.772904653736386, |
| "grad_norm": 0.3798498511314392, |
| "learning_rate": 0.00036294055944055943, |
| "loss": 3.2672, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.787465781350107, |
| "grad_norm": 0.3736896812915802, |
| "learning_rate": 0.00036276573426573423, |
| "loss": 3.2602, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.802026908963832, |
| "grad_norm": 0.39452555775642395, |
| "learning_rate": 0.0003625909090909091, |
| "loss": 3.2604, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.802026908963832, |
| "eval_accuracy": 0.37352250254111574, |
| "eval_loss": 3.5354180335998535, |
| "eval_runtime": 179.9898, |
| "eval_samples_per_second": 92.483, |
| "eval_steps_per_second": 5.784, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.816588036577553, |
| "grad_norm": 0.3736236095428467, |
| "learning_rate": 0.0003624160839160839, |
| "loss": 3.2579, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.831149164191274, |
| "grad_norm": 0.37088045477867126, |
| "learning_rate": 0.00036224125874125874, |
| "loss": 3.2629, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.845710291805, |
| "grad_norm": 0.4000867009162903, |
| "learning_rate": 0.00036206643356643354, |
| "loss": 3.2625, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.86027141941872, |
| "grad_norm": 0.40783992409706116, |
| "learning_rate": 0.0003618916083916084, |
| "loss": 3.2568, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.874832547032444, |
| "grad_norm": 0.3544476330280304, |
| "learning_rate": 0.0003617167832167832, |
| "loss": 3.2563, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.889393674646165, |
| "grad_norm": 0.38307687640190125, |
| "learning_rate": 0.00036154195804195805, |
| "loss": 3.2613, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.903954802259886, |
| "grad_norm": 0.3610667884349823, |
| "learning_rate": 0.0003613671328671328, |
| "loss": 3.2601, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.91851592987361, |
| "grad_norm": 0.3683680593967438, |
| "learning_rate": 0.00036119230769230765, |
| "loss": 3.2624, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.93307705748733, |
| "grad_norm": 0.38349440693855286, |
| "learning_rate": 0.00036101748251748245, |
| "loss": 3.2531, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.947638185101056, |
| "grad_norm": 0.3851879835128784, |
| "learning_rate": 0.0003608426573426573, |
| "loss": 3.2751, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.962199312714777, |
| "grad_norm": 0.3976530432701111, |
| "learning_rate": 0.0003606678321678321, |
| "loss": 3.2587, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.976760440328498, |
| "grad_norm": 0.3516232669353485, |
| "learning_rate": 0.00036049300699300696, |
| "loss": 3.2673, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.991321567942222, |
| "grad_norm": 0.3732779324054718, |
| "learning_rate": 0.0003603181818181818, |
| "loss": 3.2569, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.005824451045488, |
| "grad_norm": 0.357576847076416, |
| "learning_rate": 0.0003601433566433566, |
| "loss": 3.2126, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.020385578659212, |
| "grad_norm": 0.39621129631996155, |
| "learning_rate": 0.00035996853146853146, |
| "loss": 3.1568, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.034946706272933, |
| "grad_norm": 0.4111010730266571, |
| "learning_rate": 0.00035979370629370626, |
| "loss": 3.1612, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.049507833886658, |
| "grad_norm": 0.3604491055011749, |
| "learning_rate": 0.0003596188811188811, |
| "loss": 3.1672, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.06406896150038, |
| "grad_norm": 0.37328052520751953, |
| "learning_rate": 0.0003594440559440559, |
| "loss": 3.1773, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.0786300891141, |
| "grad_norm": 0.37750720977783203, |
| "learning_rate": 0.00035926923076923077, |
| "loss": 3.1798, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.093191216727824, |
| "grad_norm": 0.4066753685474396, |
| "learning_rate": 0.00035909440559440557, |
| "loss": 3.1803, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.093191216727824, |
| "eval_accuracy": 0.37312572874121247, |
| "eval_loss": 3.5512688159942627, |
| "eval_runtime": 180.0573, |
| "eval_samples_per_second": 92.448, |
| "eval_steps_per_second": 5.781, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.107752344341545, |
| "grad_norm": 0.38535457849502563, |
| "learning_rate": 0.0003589195804195804, |
| "loss": 3.1743, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.12231347195527, |
| "grad_norm": 0.38833922147750854, |
| "learning_rate": 0.00035874475524475517, |
| "loss": 3.1815, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.13687459956899, |
| "grad_norm": 0.37982314825057983, |
| "learning_rate": 0.00035856993006993, |
| "loss": 3.1688, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.15143572718271, |
| "grad_norm": 0.384846568107605, |
| "learning_rate": 0.0003583951048951048, |
| "loss": 3.1857, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.165996854796436, |
| "grad_norm": 0.3848235607147217, |
| "learning_rate": 0.0003582202797202797, |
| "loss": 3.204, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.180557982410157, |
| "grad_norm": 0.43032485246658325, |
| "learning_rate": 0.00035804545454545453, |
| "loss": 3.1907, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.19511911002388, |
| "grad_norm": 0.39198267459869385, |
| "learning_rate": 0.00035787062937062933, |
| "loss": 3.1987, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.209680237637603, |
| "grad_norm": 0.34576407074928284, |
| "learning_rate": 0.0003576958041958042, |
| "loss": 3.2077, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.224241365251324, |
| "grad_norm": 0.37564489245414734, |
| "learning_rate": 0.000357520979020979, |
| "loss": 3.1956, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.238802492865048, |
| "grad_norm": 0.37332504987716675, |
| "learning_rate": 0.00035734615384615384, |
| "loss": 3.2067, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.25336362047877, |
| "grad_norm": 0.4408362805843353, |
| "learning_rate": 0.00035717132867132864, |
| "loss": 3.201, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.267924748092494, |
| "grad_norm": 0.37921786308288574, |
| "learning_rate": 0.0003569965034965035, |
| "loss": 3.1996, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.282485875706215, |
| "grad_norm": 0.7169339060783386, |
| "learning_rate": 0.0003568216783216783, |
| "loss": 3.2101, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.297047003319935, |
| "grad_norm": 0.3850073516368866, |
| "learning_rate": 0.00035664685314685314, |
| "loss": 3.2164, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.31160813093366, |
| "grad_norm": 0.408514142036438, |
| "learning_rate": 0.00035647202797202794, |
| "loss": 3.2161, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.32616925854738, |
| "grad_norm": 0.41573452949523926, |
| "learning_rate": 0.0003562972027972028, |
| "loss": 3.21, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.340730386161106, |
| "grad_norm": 0.3953891694545746, |
| "learning_rate": 0.00035612237762237754, |
| "loss": 3.2192, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.355291513774826, |
| "grad_norm": 0.43931904435157776, |
| "learning_rate": 0.0003559475524475524, |
| "loss": 3.2121, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.369852641388547, |
| "grad_norm": 0.3625791072845459, |
| "learning_rate": 0.0003557727272727272, |
| "loss": 3.2153, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.384413769002272, |
| "grad_norm": 0.3787856698036194, |
| "learning_rate": 0.00035559790209790205, |
| "loss": 3.2253, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.384413769002272, |
| "eval_accuracy": 0.37321883832625646, |
| "eval_loss": 3.5476481914520264, |
| "eval_runtime": 180.044, |
| "eval_samples_per_second": 92.455, |
| "eval_steps_per_second": 5.782, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.398974896615993, |
| "grad_norm": 0.3990388512611389, |
| "learning_rate": 0.0003554230769230769, |
| "loss": 3.1984, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.413536024229717, |
| "grad_norm": 0.35478881001472473, |
| "learning_rate": 0.0003552482517482517, |
| "loss": 3.2165, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.42809715184344, |
| "grad_norm": 0.3720736801624298, |
| "learning_rate": 0.00035507342657342656, |
| "loss": 3.2285, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.442658279457163, |
| "grad_norm": 0.3901727795600891, |
| "learning_rate": 0.00035489860139860136, |
| "loss": 3.2368, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.457219407070884, |
| "grad_norm": 0.41906219720840454, |
| "learning_rate": 0.0003547237762237762, |
| "loss": 3.2368, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.471780534684605, |
| "grad_norm": 0.38884779810905457, |
| "learning_rate": 0.000354548951048951, |
| "loss": 3.2412, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.48634166229833, |
| "grad_norm": 0.40126678347587585, |
| "learning_rate": 0.00035437412587412587, |
| "loss": 3.2387, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.50090278991205, |
| "grad_norm": 0.3941691517829895, |
| "learning_rate": 0.00035419930069930067, |
| "loss": 3.2224, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.51546391752577, |
| "grad_norm": 0.37078821659088135, |
| "learning_rate": 0.0003540244755244755, |
| "loss": 3.2275, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.530025045139496, |
| "grad_norm": 0.3779580891132355, |
| "learning_rate": 0.0003538496503496503, |
| "loss": 3.2293, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.544586172753217, |
| "grad_norm": 0.36630627512931824, |
| "learning_rate": 0.0003536748251748252, |
| "loss": 3.2287, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.55914730036694, |
| "grad_norm": 0.3747677206993103, |
| "learning_rate": 0.0003534999999999999, |
| "loss": 3.2385, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.573708427980662, |
| "grad_norm": 0.3941807150840759, |
| "learning_rate": 0.00035332517482517477, |
| "loss": 3.2407, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.588269555594387, |
| "grad_norm": 0.3974025249481201, |
| "learning_rate": 0.0003531503496503496, |
| "loss": 3.2267, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.602830683208108, |
| "grad_norm": 0.4064212143421173, |
| "learning_rate": 0.0003529755244755244, |
| "loss": 3.2327, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.61739181082183, |
| "grad_norm": 0.3866769075393677, |
| "learning_rate": 0.0003528006993006993, |
| "loss": 3.2241, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.631952938435553, |
| "grad_norm": 0.39001014828681946, |
| "learning_rate": 0.0003526258741258741, |
| "loss": 3.2476, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.646514066049274, |
| "grad_norm": 0.37435194849967957, |
| "learning_rate": 0.00035245104895104893, |
| "loss": 3.2246, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.661075193663, |
| "grad_norm": 0.4112047553062439, |
| "learning_rate": 0.00035227622377622373, |
| "loss": 3.2407, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.67563632127672, |
| "grad_norm": 0.41346442699432373, |
| "learning_rate": 0.0003521013986013986, |
| "loss": 3.2326, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.67563632127672, |
| "eval_accuracy": 0.37384086208189743, |
| "eval_loss": 3.53576922416687, |
| "eval_runtime": 180.2484, |
| "eval_samples_per_second": 92.35, |
| "eval_steps_per_second": 5.775, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.69019744889044, |
| "grad_norm": 0.38038039207458496, |
| "learning_rate": 0.0003519265734265734, |
| "loss": 3.2461, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.704758576504165, |
| "grad_norm": 0.41925546526908875, |
| "learning_rate": 0.00035175174825174824, |
| "loss": 3.2411, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.719319704117886, |
| "grad_norm": 0.3877270817756653, |
| "learning_rate": 0.00035157692307692304, |
| "loss": 3.2507, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.73388083173161, |
| "grad_norm": 0.3802727460861206, |
| "learning_rate": 0.0003514020979020979, |
| "loss": 3.2471, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.74844195934533, |
| "grad_norm": 0.3646993637084961, |
| "learning_rate": 0.0003512272727272727, |
| "loss": 3.2445, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.763003086959053, |
| "grad_norm": 0.3975246250629425, |
| "learning_rate": 0.00035105244755244755, |
| "loss": 3.2471, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.777564214572777, |
| "grad_norm": 0.3957580626010895, |
| "learning_rate": 0.0003508776223776223, |
| "loss": 3.2481, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.792125342186498, |
| "grad_norm": 0.3608238101005554, |
| "learning_rate": 0.00035070279720279715, |
| "loss": 3.254, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.806686469800223, |
| "grad_norm": 0.3858623206615448, |
| "learning_rate": 0.000350527972027972, |
| "loss": 3.2498, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.821247597413944, |
| "grad_norm": 0.43309536576271057, |
| "learning_rate": 0.0003503531468531468, |
| "loss": 3.2472, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.835808725027665, |
| "grad_norm": 0.38652944564819336, |
| "learning_rate": 0.00035017832167832166, |
| "loss": 3.2466, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.85036985264139, |
| "grad_norm": 0.40087568759918213, |
| "learning_rate": 0.00035000349650349645, |
| "loss": 3.2468, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.86493098025511, |
| "grad_norm": 0.37535202503204346, |
| "learning_rate": 0.0003498286713286713, |
| "loss": 3.2431, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.879492107868835, |
| "grad_norm": 0.3938947916030884, |
| "learning_rate": 0.0003496538461538461, |
| "loss": 3.2476, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.894053235482556, |
| "grad_norm": 0.38487690687179565, |
| "learning_rate": 0.00034947902097902096, |
| "loss": 3.2556, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.908614363096277, |
| "grad_norm": 0.3644219636917114, |
| "learning_rate": 0.00034930419580419576, |
| "loss": 3.2536, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.92317549071, |
| "grad_norm": 0.40566596388816833, |
| "learning_rate": 0.0003491293706293706, |
| "loss": 3.244, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.937736618323722, |
| "grad_norm": 0.38698315620422363, |
| "learning_rate": 0.0003489545454545454, |
| "loss": 3.2499, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.952297745937447, |
| "grad_norm": 0.38761574029922485, |
| "learning_rate": 0.00034877972027972027, |
| "loss": 3.2502, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.966858873551168, |
| "grad_norm": 0.3744429647922516, |
| "learning_rate": 0.00034860489510489507, |
| "loss": 3.2448, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.966858873551168, |
| "eval_accuracy": 0.3739019946377343, |
| "eval_loss": 3.5312864780426025, |
| "eval_runtime": 179.9319, |
| "eval_samples_per_second": 92.513, |
| "eval_steps_per_second": 5.786, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.98142000116489, |
| "grad_norm": 0.3960542380809784, |
| "learning_rate": 0.0003484300699300699, |
| "loss": 3.2572, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.995981128778613, |
| "grad_norm": 0.3759651482105255, |
| "learning_rate": 0.0003482552447552448, |
| "loss": 3.2621, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.01048401188188, |
| "grad_norm": 0.4319317936897278, |
| "learning_rate": 0.0003480804195804195, |
| "loss": 3.1813, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.025045139495603, |
| "grad_norm": 0.39148691296577454, |
| "learning_rate": 0.0003479055944055944, |
| "loss": 3.1588, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.039606267109324, |
| "grad_norm": 0.38446807861328125, |
| "learning_rate": 0.0003477307692307692, |
| "loss": 3.1545, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.05416739472305, |
| "grad_norm": 0.38231173157691956, |
| "learning_rate": 0.00034755594405594403, |
| "loss": 3.1551, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.06872852233677, |
| "grad_norm": 0.4030628204345703, |
| "learning_rate": 0.00034738111888111883, |
| "loss": 3.1649, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.08328964995049, |
| "grad_norm": 0.4177522659301758, |
| "learning_rate": 0.0003472062937062937, |
| "loss": 3.1709, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.097850777564215, |
| "grad_norm": 0.5332360863685608, |
| "learning_rate": 0.0003470314685314685, |
| "loss": 3.1681, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.112411905177936, |
| "grad_norm": 0.40137720108032227, |
| "learning_rate": 0.00034685664335664334, |
| "loss": 3.165, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.12697303279166, |
| "grad_norm": 0.37777307629585266, |
| "learning_rate": 0.00034668181818181814, |
| "loss": 3.1715, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.14153416040538, |
| "grad_norm": 0.3991614878177643, |
| "learning_rate": 0.000346506993006993, |
| "loss": 3.1793, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.156095288019102, |
| "grad_norm": 0.4044230878353119, |
| "learning_rate": 0.0003463321678321678, |
| "loss": 3.1783, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.170656415632827, |
| "grad_norm": 0.37559133768081665, |
| "learning_rate": 0.00034615734265734264, |
| "loss": 3.1877, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.185217543246548, |
| "grad_norm": 0.4068480134010315, |
| "learning_rate": 0.0003459825174825175, |
| "loss": 3.1784, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.199778670860272, |
| "grad_norm": 0.39489054679870605, |
| "learning_rate": 0.0003458076923076923, |
| "loss": 3.1859, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.214339798473993, |
| "grad_norm": 0.3678929805755615, |
| "learning_rate": 0.00034563286713286715, |
| "loss": 3.1896, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.228900926087718, |
| "grad_norm": 0.3867121636867523, |
| "learning_rate": 0.0003454580419580419, |
| "loss": 3.1851, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.24346205370144, |
| "grad_norm": 0.393509179353714, |
| "learning_rate": 0.00034528321678321675, |
| "loss": 3.1862, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.25802318131516, |
| "grad_norm": 0.3877445161342621, |
| "learning_rate": 0.00034510839160839155, |
| "loss": 3.1927, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.25802318131516, |
| "eval_accuracy": 0.37313654450109135, |
| "eval_loss": 3.548133373260498, |
| "eval_runtime": 180.2174, |
| "eval_samples_per_second": 92.366, |
| "eval_steps_per_second": 5.776, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.272584308928884, |
| "grad_norm": 0.38138648867607117, |
| "learning_rate": 0.0003449335664335664, |
| "loss": 3.1927, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.287145436542605, |
| "grad_norm": 0.4013443887233734, |
| "learning_rate": 0.0003447587412587412, |
| "loss": 3.1796, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.30170656415633, |
| "grad_norm": 0.3730858564376831, |
| "learning_rate": 0.00034458391608391606, |
| "loss": 3.1864, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.31626769177005, |
| "grad_norm": 0.36887383460998535, |
| "learning_rate": 0.00034440909090909086, |
| "loss": 3.1944, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.330828819383772, |
| "grad_norm": 0.40235355496406555, |
| "learning_rate": 0.0003442342657342657, |
| "loss": 3.1998, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.345389946997496, |
| "grad_norm": 0.393684446811676, |
| "learning_rate": 0.0003440594405594405, |
| "loss": 3.203, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.359951074611217, |
| "grad_norm": 0.393527090549469, |
| "learning_rate": 0.00034388461538461537, |
| "loss": 3.2127, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.374512202224942, |
| "grad_norm": 0.3892517685890198, |
| "learning_rate": 0.00034370979020979017, |
| "loss": 3.2144, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.389073329838663, |
| "grad_norm": 0.38523000478744507, |
| "learning_rate": 0.000343534965034965, |
| "loss": 3.2097, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.403634457452384, |
| "grad_norm": 0.4060019552707672, |
| "learning_rate": 0.0003433601398601399, |
| "loss": 3.2145, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.41819558506611, |
| "grad_norm": 0.42627066373825073, |
| "learning_rate": 0.0003431853146853147, |
| "loss": 3.2152, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.43275671267983, |
| "grad_norm": 0.41233697533607483, |
| "learning_rate": 0.0003430104895104895, |
| "loss": 3.2101, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.447317840293554, |
| "grad_norm": 0.3984028398990631, |
| "learning_rate": 0.00034283566433566427, |
| "loss": 3.207, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.461878967907275, |
| "grad_norm": 0.378518670797348, |
| "learning_rate": 0.0003426608391608391, |
| "loss": 3.2125, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.476440095520996, |
| "grad_norm": 0.3638949394226074, |
| "learning_rate": 0.0003424860139860139, |
| "loss": 3.2306, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.49100122313472, |
| "grad_norm": 0.3892297148704529, |
| "learning_rate": 0.0003423111888111888, |
| "loss": 3.2221, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.50556235074844, |
| "grad_norm": 0.38714274764060974, |
| "learning_rate": 0.0003421363636363636, |
| "loss": 3.2147, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.520123478362166, |
| "grad_norm": 0.38122016191482544, |
| "learning_rate": 0.00034196153846153843, |
| "loss": 3.2185, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.534684605975887, |
| "grad_norm": 0.4093562066555023, |
| "learning_rate": 0.00034178671328671323, |
| "loss": 3.2267, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.549245733589608, |
| "grad_norm": 0.40201225876808167, |
| "learning_rate": 0.0003416118881118881, |
| "loss": 3.2163, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.549245733589608, |
| "eval_accuracy": 0.3737187145328309, |
| "eval_loss": 3.542142152786255, |
| "eval_runtime": 180.2089, |
| "eval_samples_per_second": 92.371, |
| "eval_steps_per_second": 5.777, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.563806861203332, |
| "grad_norm": 0.4206041395664215, |
| "learning_rate": 0.0003414370629370629, |
| "loss": 3.2186, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.578367988817053, |
| "grad_norm": 0.3935268521308899, |
| "learning_rate": 0.00034126223776223774, |
| "loss": 3.2307, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.592929116430778, |
| "grad_norm": 0.3997489809989929, |
| "learning_rate": 0.0003410874125874126, |
| "loss": 3.2391, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.6074902440445, |
| "grad_norm": 0.4133063852787018, |
| "learning_rate": 0.0003409125874125874, |
| "loss": 3.2339, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.62205137165822, |
| "grad_norm": 0.4046327471733093, |
| "learning_rate": 0.00034073776223776225, |
| "loss": 3.2291, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.636612499271944, |
| "grad_norm": 0.4158601462841034, |
| "learning_rate": 0.00034056293706293705, |
| "loss": 3.2306, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.651173626885665, |
| "grad_norm": 0.3953965902328491, |
| "learning_rate": 0.0003403881118881119, |
| "loss": 3.228, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.66573475449939, |
| "grad_norm": 0.378416508436203, |
| "learning_rate": 0.00034021328671328665, |
| "loss": 3.236, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.68029588211311, |
| "grad_norm": 0.37443292140960693, |
| "learning_rate": 0.0003400384615384615, |
| "loss": 3.2254, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.69485700972683, |
| "grad_norm": 0.3993610143661499, |
| "learning_rate": 0.0003398636363636363, |
| "loss": 3.2341, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.709418137340556, |
| "grad_norm": 0.39983636140823364, |
| "learning_rate": 0.00033968881118881115, |
| "loss": 3.2342, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.723979264954277, |
| "grad_norm": 0.4082450270652771, |
| "learning_rate": 0.00033951398601398595, |
| "loss": 3.2375, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.738540392568, |
| "grad_norm": 0.40965044498443604, |
| "learning_rate": 0.0003393391608391608, |
| "loss": 3.2148, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.753101520181723, |
| "grad_norm": 0.39464494585990906, |
| "learning_rate": 0.0003391643356643356, |
| "loss": 3.2354, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.767662647795444, |
| "grad_norm": 0.38399943709373474, |
| "learning_rate": 0.00033898951048951046, |
| "loss": 3.2329, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.782223775409168, |
| "grad_norm": 0.3806881904602051, |
| "learning_rate": 0.00033881468531468526, |
| "loss": 3.2315, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.79678490302289, |
| "grad_norm": 0.3800226151943207, |
| "learning_rate": 0.0003386398601398601, |
| "loss": 3.2366, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.811346030636614, |
| "grad_norm": 0.3831939995288849, |
| "learning_rate": 0.00033846503496503497, |
| "loss": 3.2355, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.825907158250335, |
| "grad_norm": 0.36772599816322327, |
| "learning_rate": 0.00033829020979020977, |
| "loss": 3.2456, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.840468285864056, |
| "grad_norm": 0.4026773273944855, |
| "learning_rate": 0.0003381153846153846, |
| "loss": 3.2369, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.840468285864056, |
| "eval_accuracy": 0.37462712080004645, |
| "eval_loss": 3.5301737785339355, |
| "eval_runtime": 179.9683, |
| "eval_samples_per_second": 92.494, |
| "eval_steps_per_second": 5.784, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.85502941347778, |
| "grad_norm": 0.4003564119338989, |
| "learning_rate": 0.0003379405594405594, |
| "loss": 3.2435, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.8695905410915, |
| "grad_norm": 0.4154500365257263, |
| "learning_rate": 0.0003377657342657343, |
| "loss": 3.2278, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.884151668705226, |
| "grad_norm": 0.4204762279987335, |
| "learning_rate": 0.000337590909090909, |
| "loss": 3.2371, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.898712796318947, |
| "grad_norm": 0.38466429710388184, |
| "learning_rate": 0.0003374160839160839, |
| "loss": 3.2451, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.91327392393267, |
| "grad_norm": 0.38914236426353455, |
| "learning_rate": 0.0003372412587412587, |
| "loss": 3.2487, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.927835051546392, |
| "grad_norm": 0.4058581292629242, |
| "learning_rate": 0.00033706643356643353, |
| "loss": 3.2456, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.942396179160113, |
| "grad_norm": 0.3948516547679901, |
| "learning_rate": 0.00033689160839160833, |
| "loss": 3.2483, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.956957306773838, |
| "grad_norm": 0.4105282425880432, |
| "learning_rate": 0.0003367167832167832, |
| "loss": 3.2346, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.97151843438756, |
| "grad_norm": 0.4124220013618469, |
| "learning_rate": 0.000336541958041958, |
| "loss": 3.2427, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.986079562001283, |
| "grad_norm": 0.4113534688949585, |
| "learning_rate": 0.00033636713286713284, |
| "loss": 3.2458, |
| "step": 75500 |
| }, |
| { |
| "epoch": 22.00058244510455, |
| "grad_norm": 0.41743507981300354, |
| "learning_rate": 0.0003361923076923077, |
| "loss": 3.2468, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.015143572718273, |
| "grad_norm": 0.3947121500968933, |
| "learning_rate": 0.0003360174825174825, |
| "loss": 3.1412, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.029704700331994, |
| "grad_norm": 0.4013170599937439, |
| "learning_rate": 0.00033584265734265734, |
| "loss": 3.147, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.044265827945715, |
| "grad_norm": 0.41499465703964233, |
| "learning_rate": 0.00033566783216783214, |
| "loss": 3.1406, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.05882695555944, |
| "grad_norm": 0.40082693099975586, |
| "learning_rate": 0.000335493006993007, |
| "loss": 3.1622, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.07338808317316, |
| "grad_norm": 0.3774999976158142, |
| "learning_rate": 0.0003353181818181818, |
| "loss": 3.1378, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.087949210786885, |
| "grad_norm": 0.3857954144477844, |
| "learning_rate": 0.00033514335664335665, |
| "loss": 3.1499, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.102510338400606, |
| "grad_norm": 0.4899783730506897, |
| "learning_rate": 0.0003349685314685314, |
| "loss": 3.1543, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.117071466014327, |
| "grad_norm": 0.4249260425567627, |
| "learning_rate": 0.00033479370629370625, |
| "loss": 3.1797, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.13163259362805, |
| "grad_norm": 0.410011887550354, |
| "learning_rate": 0.00033461888111888105, |
| "loss": 3.1542, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.13163259362805, |
| "eval_accuracy": 0.37328361532292215, |
| "eval_loss": 3.5537054538726807, |
| "eval_runtime": 180.0603, |
| "eval_samples_per_second": 92.447, |
| "eval_steps_per_second": 5.781, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.146193721241772, |
| "grad_norm": 0.3888453245162964, |
| "learning_rate": 0.0003344440559440559, |
| "loss": 3.1632, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.160754848855497, |
| "grad_norm": 0.3912050426006317, |
| "learning_rate": 0.0003342692307692307, |
| "loss": 3.183, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.175315976469218, |
| "grad_norm": 0.38971278071403503, |
| "learning_rate": 0.00033409440559440556, |
| "loss": 3.1864, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.18987710408294, |
| "grad_norm": 0.3725743293762207, |
| "learning_rate": 0.00033391958041958036, |
| "loss": 3.1715, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.204438231696663, |
| "grad_norm": 0.4129214286804199, |
| "learning_rate": 0.0003337447552447552, |
| "loss": 3.1842, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.218999359310384, |
| "grad_norm": 0.40630948543548584, |
| "learning_rate": 0.00033356993006993007, |
| "loss": 3.1818, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.23356048692411, |
| "grad_norm": 0.4003860354423523, |
| "learning_rate": 0.00033339510489510487, |
| "loss": 3.1635, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.24812161453783, |
| "grad_norm": 0.3969387114048004, |
| "learning_rate": 0.0003332202797202797, |
| "loss": 3.1821, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.26268274215155, |
| "grad_norm": 0.3908751606941223, |
| "learning_rate": 0.0003330454545454545, |
| "loss": 3.1798, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.277243869765275, |
| "grad_norm": 0.3970881700515747, |
| "learning_rate": 0.0003328706293706294, |
| "loss": 3.1837, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.291804997378996, |
| "grad_norm": 0.3967888653278351, |
| "learning_rate": 0.00033269580419580417, |
| "loss": 3.1872, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.30636612499272, |
| "grad_norm": 0.42406654357910156, |
| "learning_rate": 0.000332520979020979, |
| "loss": 3.1921, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.32092725260644, |
| "grad_norm": 0.4066693186759949, |
| "learning_rate": 0.00033234615384615377, |
| "loss": 3.2, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.335488380220163, |
| "grad_norm": 0.39060643315315247, |
| "learning_rate": 0.0003321713286713286, |
| "loss": 3.2104, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.350049507833887, |
| "grad_norm": 0.39368969202041626, |
| "learning_rate": 0.0003319965034965034, |
| "loss": 3.2015, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.364610635447608, |
| "grad_norm": 0.40363943576812744, |
| "learning_rate": 0.0003318216783216783, |
| "loss": 3.2003, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.379171763061333, |
| "grad_norm": 0.38878241181373596, |
| "learning_rate": 0.0003316468531468531, |
| "loss": 3.2079, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.393732890675054, |
| "grad_norm": 0.37092626094818115, |
| "learning_rate": 0.00033147202797202793, |
| "loss": 3.2027, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.408294018288775, |
| "grad_norm": 0.4058542847633362, |
| "learning_rate": 0.0003312972027972028, |
| "loss": 3.1978, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.4228551459025, |
| "grad_norm": 0.3866056203842163, |
| "learning_rate": 0.0003311223776223776, |
| "loss": 3.1967, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.4228551459025, |
| "eval_accuracy": 0.3737928965380869, |
| "eval_loss": 3.5445046424865723, |
| "eval_runtime": 258.1775, |
| "eval_samples_per_second": 64.475, |
| "eval_steps_per_second": 4.032, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.43741627351622, |
| "grad_norm": 0.3999904692173004, |
| "learning_rate": 0.00033094755244755244, |
| "loss": 3.2007, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.451977401129945, |
| "grad_norm": 0.45506981015205383, |
| "learning_rate": 0.00033077272727272724, |
| "loss": 3.211, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.466538528743666, |
| "grad_norm": 0.3958829641342163, |
| "learning_rate": 0.0003305979020979021, |
| "loss": 3.2046, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.481099656357387, |
| "grad_norm": 0.3806569278240204, |
| "learning_rate": 0.0003304230769230769, |
| "loss": 3.2117, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.49566078397111, |
| "grad_norm": 0.419170081615448, |
| "learning_rate": 0.00033024825174825175, |
| "loss": 3.2058, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.510221911584832, |
| "grad_norm": 0.39194509387016296, |
| "learning_rate": 0.00033007342657342655, |
| "loss": 3.2101, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.524783039198557, |
| "grad_norm": 0.3864322006702423, |
| "learning_rate": 0.0003298986013986014, |
| "loss": 3.2044, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.539344166812278, |
| "grad_norm": 0.3918672502040863, |
| "learning_rate": 0.00032972377622377615, |
| "loss": 3.2037, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.553905294426002, |
| "grad_norm": 0.40433594584465027, |
| "learning_rate": 0.000329548951048951, |
| "loss": 3.2124, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.568466422039723, |
| "grad_norm": 0.41127893328666687, |
| "learning_rate": 0.0003293741258741258, |
| "loss": 3.215, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.583027549653444, |
| "grad_norm": 0.40741223096847534, |
| "learning_rate": 0.00032919930069930065, |
| "loss": 3.2154, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.59758867726717, |
| "grad_norm": 0.3917607069015503, |
| "learning_rate": 0.0003290244755244755, |
| "loss": 3.2064, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.61214980488089, |
| "grad_norm": 0.4020385146141052, |
| "learning_rate": 0.0003288496503496503, |
| "loss": 3.2151, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.626710932494614, |
| "grad_norm": 0.5136081576347351, |
| "learning_rate": 0.00032867482517482516, |
| "loss": 3.2245, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.641272060108335, |
| "grad_norm": 0.3728886842727661, |
| "learning_rate": 0.00032849999999999996, |
| "loss": 3.2206, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.655833187722056, |
| "grad_norm": 0.3799057602882385, |
| "learning_rate": 0.0003283251748251748, |
| "loss": 3.2144, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.67039431533578, |
| "grad_norm": 0.39716431498527527, |
| "learning_rate": 0.0003281503496503496, |
| "loss": 3.2191, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.6849554429495, |
| "grad_norm": 0.43552517890930176, |
| "learning_rate": 0.00032797552447552447, |
| "loss": 3.2231, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.699516570563226, |
| "grad_norm": 0.3908248841762543, |
| "learning_rate": 0.00032780069930069927, |
| "loss": 3.212, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.714077698176947, |
| "grad_norm": 0.4287446141242981, |
| "learning_rate": 0.0003276258741258741, |
| "loss": 3.2073, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.714077698176947, |
| "eval_accuracy": 0.3741561649948872, |
| "eval_loss": 3.538102626800537, |
| "eval_runtime": 180.7115, |
| "eval_samples_per_second": 92.114, |
| "eval_steps_per_second": 5.761, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.728638825790668, |
| "grad_norm": 0.40964144468307495, |
| "learning_rate": 0.0003274510489510489, |
| "loss": 3.218, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.743199953404392, |
| "grad_norm": 0.4104141592979431, |
| "learning_rate": 0.0003272762237762238, |
| "loss": 3.22, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.757761081018113, |
| "grad_norm": 0.41202884912490845, |
| "learning_rate": 0.0003271013986013985, |
| "loss": 3.2137, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.772322208631838, |
| "grad_norm": 0.41015103459358215, |
| "learning_rate": 0.0003269265734265734, |
| "loss": 3.2242, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.78688333624556, |
| "grad_norm": 0.40442413091659546, |
| "learning_rate": 0.0003267517482517482, |
| "loss": 3.2366, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.80144446385928, |
| "grad_norm": 0.42656955122947693, |
| "learning_rate": 0.00032657692307692303, |
| "loss": 3.2278, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.816005591473004, |
| "grad_norm": 0.40416911244392395, |
| "learning_rate": 0.0003264020979020979, |
| "loss": 3.2191, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.830566719086725, |
| "grad_norm": 0.3903436064720154, |
| "learning_rate": 0.0003262272727272727, |
| "loss": 3.2222, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.84512784670045, |
| "grad_norm": 0.39566561579704285, |
| "learning_rate": 0.00032605244755244754, |
| "loss": 3.2269, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.85968897431417, |
| "grad_norm": 0.391445130109787, |
| "learning_rate": 0.00032587762237762234, |
| "loss": 3.2487, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.874250101927892, |
| "grad_norm": 0.40493929386138916, |
| "learning_rate": 0.0003257027972027972, |
| "loss": 3.2305, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.888811229541616, |
| "grad_norm": 0.3763491213321686, |
| "learning_rate": 0.000325527972027972, |
| "loss": 3.2321, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.903372357155337, |
| "grad_norm": 0.4233284890651703, |
| "learning_rate": 0.00032535314685314684, |
| "loss": 3.2314, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.917933484769062, |
| "grad_norm": 0.3961465656757355, |
| "learning_rate": 0.00032517832167832164, |
| "loss": 3.2319, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.932494612382783, |
| "grad_norm": 0.383890300989151, |
| "learning_rate": 0.0003250034965034965, |
| "loss": 3.2315, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.947055739996504, |
| "grad_norm": 0.41498807072639465, |
| "learning_rate": 0.0003248286713286713, |
| "loss": 3.237, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.96161686761023, |
| "grad_norm": 0.40998226404190063, |
| "learning_rate": 0.00032465384615384615, |
| "loss": 3.2289, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.97617799522395, |
| "grad_norm": 0.44798022508621216, |
| "learning_rate": 0.0003244790209790209, |
| "loss": 3.2423, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.990739122837674, |
| "grad_norm": 0.4027538299560547, |
| "learning_rate": 0.00032430419580419575, |
| "loss": 3.2385, |
| "step": 78950 |
| }, |
| { |
| "epoch": 23.00524200594094, |
| "grad_norm": 0.41473162174224854, |
| "learning_rate": 0.00032412937062937066, |
| "loss": 3.1855, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.00524200594094, |
| "eval_accuracy": 0.3737847847181777, |
| "eval_loss": 3.5444626808166504, |
| "eval_runtime": 180.4791, |
| "eval_samples_per_second": 92.232, |
| "eval_steps_per_second": 5.768, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.019803133554664, |
| "grad_norm": 0.38057205080986023, |
| "learning_rate": 0.0003239545454545454, |
| "loss": 3.1333, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.034364261168385, |
| "grad_norm": 0.3964289724826813, |
| "learning_rate": 0.00032377972027972026, |
| "loss": 3.1432, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.048925388782106, |
| "grad_norm": 0.4040731191635132, |
| "learning_rate": 0.00032360489510489506, |
| "loss": 3.1318, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.06348651639583, |
| "grad_norm": 0.4074184000492096, |
| "learning_rate": 0.0003234300699300699, |
| "loss": 3.1446, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.07804764400955, |
| "grad_norm": 0.4043462872505188, |
| "learning_rate": 0.0003232552447552447, |
| "loss": 3.142, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.092608771623276, |
| "grad_norm": 0.44761183857917786, |
| "learning_rate": 0.00032308041958041957, |
| "loss": 3.1496, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.107169899236997, |
| "grad_norm": 0.41128572821617126, |
| "learning_rate": 0.00032290559440559437, |
| "loss": 3.1416, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.121731026850718, |
| "grad_norm": 0.3760201036930084, |
| "learning_rate": 0.0003227307692307692, |
| "loss": 3.1562, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.136292154464442, |
| "grad_norm": 0.4055956304073334, |
| "learning_rate": 0.000322555944055944, |
| "loss": 3.1609, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.150853282078163, |
| "grad_norm": 0.40147146582603455, |
| "learning_rate": 0.00032238111888111887, |
| "loss": 3.163, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.165414409691888, |
| "grad_norm": 0.4069676101207733, |
| "learning_rate": 0.00032220629370629367, |
| "loss": 3.156, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.17997553730561, |
| "grad_norm": 0.44056233763694763, |
| "learning_rate": 0.0003220314685314685, |
| "loss": 3.1695, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.19453666491933, |
| "grad_norm": 0.39475882053375244, |
| "learning_rate": 0.00032185664335664327, |
| "loss": 3.1703, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.209097792533054, |
| "grad_norm": 0.4088168442249298, |
| "learning_rate": 0.0003216818181818181, |
| "loss": 3.1671, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.223658920146775, |
| "grad_norm": 0.43437495827674866, |
| "learning_rate": 0.00032150699300699303, |
| "loss": 3.1639, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.2382200477605, |
| "grad_norm": 0.40435701608657837, |
| "learning_rate": 0.0003213321678321678, |
| "loss": 3.1699, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.25278117537422, |
| "grad_norm": 0.41126659512519836, |
| "learning_rate": 0.00032115734265734263, |
| "loss": 3.1715, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.26734230298794, |
| "grad_norm": 0.4153859317302704, |
| "learning_rate": 0.00032098251748251743, |
| "loss": 3.1737, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.281903430601666, |
| "grad_norm": 0.37753936648368835, |
| "learning_rate": 0.0003208076923076923, |
| "loss": 3.1805, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.296464558215387, |
| "grad_norm": 0.41824862360954285, |
| "learning_rate": 0.0003206328671328671, |
| "loss": 3.173, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.296464558215387, |
| "eval_accuracy": 0.37383815814192767, |
| "eval_loss": 3.547642469406128, |
| "eval_runtime": 206.8945, |
| "eval_samples_per_second": 80.456, |
| "eval_steps_per_second": 5.032, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.31102568582911, |
| "grad_norm": 0.42979976534843445, |
| "learning_rate": 0.00032045804195804194, |
| "loss": 3.1465, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.325586813442833, |
| "grad_norm": 0.40402212738990784, |
| "learning_rate": 0.00032028321678321674, |
| "loss": 3.1415, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.340147941056557, |
| "grad_norm": 0.42350292205810547, |
| "learning_rate": 0.0003201083916083916, |
| "loss": 3.1344, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.354709068670278, |
| "grad_norm": 0.4050774574279785, |
| "learning_rate": 0.0003199335664335664, |
| "loss": 3.1469, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.369270196284, |
| "grad_norm": 0.41722771525382996, |
| "learning_rate": 0.00031975874125874125, |
| "loss": 3.1484, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.383831323897724, |
| "grad_norm": 0.43232372403144836, |
| "learning_rate": 0.00031958391608391605, |
| "loss": 3.1458, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.398392451511445, |
| "grad_norm": 0.3975783586502075, |
| "learning_rate": 0.0003194090909090909, |
| "loss": 3.1604, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.41295357912517, |
| "grad_norm": 0.41612717509269714, |
| "learning_rate": 0.00031923426573426576, |
| "loss": 3.1515, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.42751470673889, |
| "grad_norm": 0.41064780950546265, |
| "learning_rate": 0.0003190594405594405, |
| "loss": 3.159, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.44207583435261, |
| "grad_norm": 0.4188465178012848, |
| "learning_rate": 0.0003188846153846154, |
| "loss": 3.162, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.456636961966336, |
| "grad_norm": 0.41556617617607117, |
| "learning_rate": 0.00031870979020979015, |
| "loss": 3.1652, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.471198089580056, |
| "grad_norm": 0.4153840243816376, |
| "learning_rate": 0.000318534965034965, |
| "loss": 3.1641, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.48575921719378, |
| "grad_norm": 0.4001832902431488, |
| "learning_rate": 0.0003183601398601398, |
| "loss": 3.1678, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.500320344807502, |
| "grad_norm": 0.4026460647583008, |
| "learning_rate": 0.00031818531468531466, |
| "loss": 3.1778, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.514881472421223, |
| "grad_norm": 0.3965749442577362, |
| "learning_rate": 0.00031801048951048946, |
| "loss": 3.1777, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.529442600034947, |
| "grad_norm": 0.38197195529937744, |
| "learning_rate": 0.0003178356643356643, |
| "loss": 3.1625, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.54400372764867, |
| "grad_norm": 0.4331737756729126, |
| "learning_rate": 0.0003176608391608391, |
| "loss": 3.1818, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.558564855262393, |
| "grad_norm": 0.4490604102611542, |
| "learning_rate": 0.00031748601398601397, |
| "loss": 3.1665, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.573125982876114, |
| "grad_norm": 0.4250028431415558, |
| "learning_rate": 0.00031731118881118877, |
| "loss": 3.18, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.587687110489835, |
| "grad_norm": 0.3894713222980499, |
| "learning_rate": 0.0003171363636363636, |
| "loss": 3.1695, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.587687110489835, |
| "eval_accuracy": 0.3735602401380843, |
| "eval_loss": 3.550976037979126, |
| "eval_runtime": 81.7574, |
| "eval_samples_per_second": 203.602, |
| "eval_steps_per_second": 12.733, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.60224823810356, |
| "grad_norm": 0.40171271562576294, |
| "learning_rate": 0.0003169615384615385, |
| "loss": 3.1697, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.61680936571728, |
| "grad_norm": 0.429794043302536, |
| "learning_rate": 0.0003167867132867133, |
| "loss": 3.1802, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.631370493331005, |
| "grad_norm": 0.39102861285209656, |
| "learning_rate": 0.00031661188811188813, |
| "loss": 3.1805, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.645931620944726, |
| "grad_norm": 0.3841477930545807, |
| "learning_rate": 0.0003164370629370629, |
| "loss": 3.1848, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.660492748558447, |
| "grad_norm": 0.3811715841293335, |
| "learning_rate": 0.0003162622377622378, |
| "loss": 3.1774, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.67505387617217, |
| "grad_norm": 0.4145857095718384, |
| "learning_rate": 0.00031608741258741253, |
| "loss": 3.1768, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.689615003785892, |
| "grad_norm": 0.38658639788627625, |
| "learning_rate": 0.0003159125874125874, |
| "loss": 3.1839, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.704176131399617, |
| "grad_norm": 0.40219464898109436, |
| "learning_rate": 0.0003157377622377622, |
| "loss": 3.197, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.718737259013338, |
| "grad_norm": 0.38494277000427246, |
| "learning_rate": 0.00031556293706293704, |
| "loss": 3.1961, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.73329838662706, |
| "grad_norm": 0.4261615574359894, |
| "learning_rate": 0.00031538811188811184, |
| "loss": 3.1931, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.747859514240783, |
| "grad_norm": 0.4387470483779907, |
| "learning_rate": 0.0003152132867132867, |
| "loss": 3.2004, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.762420641854504, |
| "grad_norm": 0.39953136444091797, |
| "learning_rate": 0.0003150384615384615, |
| "loss": 3.1929, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.77698176946823, |
| "grad_norm": 0.4131433665752411, |
| "learning_rate": 0.00031486363636363634, |
| "loss": 3.193, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.79154289708195, |
| "grad_norm": 0.4012967050075531, |
| "learning_rate": 0.00031468881118881114, |
| "loss": 3.2044, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.80610402469567, |
| "grad_norm": 0.40840378403663635, |
| "learning_rate": 0.000314513986013986, |
| "loss": 3.1983, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.820665152309395, |
| "grad_norm": 0.3806869685649872, |
| "learning_rate": 0.00031433916083916085, |
| "loss": 3.2068, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.835226279923116, |
| "grad_norm": 0.45317861437797546, |
| "learning_rate": 0.00031416433566433565, |
| "loss": 3.2069, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.84978740753684, |
| "grad_norm": 0.3802683353424072, |
| "learning_rate": 0.0003139895104895105, |
| "loss": 3.1981, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.86434853515056, |
| "grad_norm": 0.40308499336242676, |
| "learning_rate": 0.00031381468531468525, |
| "loss": 3.2093, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.878909662764286, |
| "grad_norm": 0.44168829917907715, |
| "learning_rate": 0.00031363986013986016, |
| "loss": 3.2074, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.878909662764286, |
| "eval_accuracy": 0.37422940649928416, |
| "eval_loss": 3.539874792098999, |
| "eval_runtime": 82.063, |
| "eval_samples_per_second": 202.844, |
| "eval_steps_per_second": 12.685, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.893470790378007, |
| "grad_norm": 0.3938729166984558, |
| "learning_rate": 0.0003134650349650349, |
| "loss": 3.2018, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.908031917991728, |
| "grad_norm": 0.39335423707962036, |
| "learning_rate": 0.00031329020979020976, |
| "loss": 3.1935, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.922593045605453, |
| "grad_norm": 0.4227752089500427, |
| "learning_rate": 0.00031311538461538456, |
| "loss": 3.2114, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.937154173219174, |
| "grad_norm": 0.4210541546344757, |
| "learning_rate": 0.0003129405594405594, |
| "loss": 3.2036, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.951715300832895, |
| "grad_norm": 0.4006504714488983, |
| "learning_rate": 0.0003127657342657342, |
| "loss": 3.2099, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.96627642844662, |
| "grad_norm": 0.42231711745262146, |
| "learning_rate": 0.00031259090909090907, |
| "loss": 3.2127, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.98083755606034, |
| "grad_norm": 0.40426918864250183, |
| "learning_rate": 0.00031241608391608386, |
| "loss": 3.2118, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.995398683674065, |
| "grad_norm": 0.3873956501483917, |
| "learning_rate": 0.0003122412587412587, |
| "loss": 3.2157, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.010192789329604, |
| "grad_norm": 0.40731847286224365, |
| "learning_rate": 0.00031206643356643357, |
| "loss": 3.2204, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.02475391694333, |
| "grad_norm": 0.42244619131088257, |
| "learning_rate": 0.00031189160839160837, |
| "loss": 3.139, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.03931504455705, |
| "grad_norm": 0.42944249510765076, |
| "learning_rate": 0.0003117167832167832, |
| "loss": 3.1336, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.053876172170774, |
| "grad_norm": 0.43737539649009705, |
| "learning_rate": 0.000311541958041958, |
| "loss": 3.1371, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.068437299784495, |
| "grad_norm": 0.427998811006546, |
| "learning_rate": 0.0003113671328671329, |
| "loss": 3.1349, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.082998427398216, |
| "grad_norm": 0.3977920413017273, |
| "learning_rate": 0.0003111923076923076, |
| "loss": 3.1462, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.09755955501194, |
| "grad_norm": 0.3784102499485016, |
| "learning_rate": 0.00031101748251748253, |
| "loss": 3.1369, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.11212068262566, |
| "grad_norm": 0.4208683371543884, |
| "learning_rate": 0.0003108426573426573, |
| "loss": 3.1477, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.126681810239386, |
| "grad_norm": 0.4010569751262665, |
| "learning_rate": 0.00031066783216783213, |
| "loss": 3.1567, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.141242937853107, |
| "grad_norm": 0.4046182632446289, |
| "learning_rate": 0.00031049300699300693, |
| "loss": 3.161, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.15580406546683, |
| "grad_norm": 0.41489896178245544, |
| "learning_rate": 0.0003103181818181818, |
| "loss": 3.1549, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.170365193080553, |
| "grad_norm": 0.403430700302124, |
| "learning_rate": 0.0003101433566433566, |
| "loss": 3.1651, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.170365193080553, |
| "eval_accuracy": 0.3737176564693645, |
| "eval_loss": 3.5497021675109863, |
| "eval_runtime": 81.8557, |
| "eval_samples_per_second": 203.358, |
| "eval_steps_per_second": 12.718, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.184926320694274, |
| "grad_norm": 0.4056961238384247, |
| "learning_rate": 0.00030996853146853144, |
| "loss": 3.1536, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.199487448308, |
| "grad_norm": 0.4257747530937195, |
| "learning_rate": 0.00030979370629370624, |
| "loss": 3.1699, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.21404857592172, |
| "grad_norm": 0.42535319924354553, |
| "learning_rate": 0.0003096188811188811, |
| "loss": 3.1596, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.22860970353544, |
| "grad_norm": 0.4171734154224396, |
| "learning_rate": 0.00030944405594405595, |
| "loss": 3.1696, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.243170831149165, |
| "grad_norm": 0.41859862208366394, |
| "learning_rate": 0.00030926923076923075, |
| "loss": 3.1671, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.257731958762886, |
| "grad_norm": 0.4126744270324707, |
| "learning_rate": 0.0003090944055944056, |
| "loss": 3.1722, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.27229308637661, |
| "grad_norm": 0.42095598578453064, |
| "learning_rate": 0.0003089195804195804, |
| "loss": 3.165, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.28685421399033, |
| "grad_norm": 0.43283501267433167, |
| "learning_rate": 0.00030874475524475525, |
| "loss": 3.175, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.301415341604052, |
| "grad_norm": 0.39688780903816223, |
| "learning_rate": 0.00030856993006993, |
| "loss": 3.1682, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.315976469217777, |
| "grad_norm": 0.3968777060508728, |
| "learning_rate": 0.0003083951048951049, |
| "loss": 3.1674, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.330537596831498, |
| "grad_norm": 0.39567282795906067, |
| "learning_rate": 0.00030822027972027965, |
| "loss": 3.1916, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.345098724445222, |
| "grad_norm": 0.4216485619544983, |
| "learning_rate": 0.0003080454545454545, |
| "loss": 3.184, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.359659852058943, |
| "grad_norm": 0.41604211926460266, |
| "learning_rate": 0.0003078706293706293, |
| "loss": 3.1747, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.374220979672664, |
| "grad_norm": 0.40126150846481323, |
| "learning_rate": 0.00030769580419580416, |
| "loss": 3.1773, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.38878210728639, |
| "grad_norm": 0.4066073000431061, |
| "learning_rate": 0.00030752097902097896, |
| "loss": 3.1995, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.40334323490011, |
| "grad_norm": 0.4371321499347687, |
| "learning_rate": 0.0003073461538461538, |
| "loss": 3.1987, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.417904362513834, |
| "grad_norm": 0.420232892036438, |
| "learning_rate": 0.00030717132867132867, |
| "loss": 3.1883, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.432465490127555, |
| "grad_norm": 0.41062405705451965, |
| "learning_rate": 0.00030699650349650347, |
| "loss": 3.1898, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.44702661774128, |
| "grad_norm": 0.4123108685016632, |
| "learning_rate": 0.0003068216783216783, |
| "loss": 3.1817, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.461587745355, |
| "grad_norm": 0.39547696709632874, |
| "learning_rate": 0.0003066468531468531, |
| "loss": 3.1884, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.461587745355, |
| "eval_accuracy": 0.37415040442712566, |
| "eval_loss": 3.542630672454834, |
| "eval_runtime": 82.7561, |
| "eval_samples_per_second": 201.145, |
| "eval_steps_per_second": 12.579, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.47614887296872, |
| "grad_norm": 0.40193748474121094, |
| "learning_rate": 0.000306472027972028, |
| "loss": 3.192, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.490710000582446, |
| "grad_norm": 0.42589059472084045, |
| "learning_rate": 0.0003062972027972028, |
| "loss": 3.1965, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.505271128196167, |
| "grad_norm": 0.4336428642272949, |
| "learning_rate": 0.00030612237762237763, |
| "loss": 3.196, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.51983225580989, |
| "grad_norm": 0.37640851736068726, |
| "learning_rate": 0.0003059475524475524, |
| "loss": 3.1951, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.534393383423613, |
| "grad_norm": 0.4630868136882782, |
| "learning_rate": 0.0003057727272727273, |
| "loss": 3.2039, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.548954511037334, |
| "grad_norm": 0.38848793506622314, |
| "learning_rate": 0.00030559790209790203, |
| "loss": 3.1917, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.563515638651058, |
| "grad_norm": 0.4076870381832123, |
| "learning_rate": 0.0003054230769230769, |
| "loss": 3.1941, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.57807676626478, |
| "grad_norm": 0.40952327847480774, |
| "learning_rate": 0.0003052482517482517, |
| "loss": 3.1948, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.592637893878504, |
| "grad_norm": 0.4265903830528259, |
| "learning_rate": 0.00030507342657342654, |
| "loss": 3.2098, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.607199021492224, |
| "grad_norm": 0.39746084809303284, |
| "learning_rate": 0.00030489860139860134, |
| "loss": 3.1988, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.621760149105945, |
| "grad_norm": 0.41020631790161133, |
| "learning_rate": 0.0003047237762237762, |
| "loss": 3.1983, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.63632127671967, |
| "grad_norm": 0.4352176785469055, |
| "learning_rate": 0.00030454895104895104, |
| "loss": 3.2109, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.65088240433339, |
| "grad_norm": 0.40543854236602783, |
| "learning_rate": 0.00030437412587412584, |
| "loss": 3.2079, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.665443531947115, |
| "grad_norm": 0.40363264083862305, |
| "learning_rate": 0.0003041993006993007, |
| "loss": 3.2035, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.680004659560836, |
| "grad_norm": 0.38873913884162903, |
| "learning_rate": 0.0003040244755244755, |
| "loss": 3.2219, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.694565787174557, |
| "grad_norm": 0.40062999725341797, |
| "learning_rate": 0.00030384965034965035, |
| "loss": 3.2052, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.709126914788282, |
| "grad_norm": 0.43961188197135925, |
| "learning_rate": 0.00030367482517482515, |
| "loss": 3.2001, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.723688042402003, |
| "grad_norm": 0.420628160238266, |
| "learning_rate": 0.0003035, |
| "loss": 3.1965, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.738249170015727, |
| "grad_norm": 0.3856954276561737, |
| "learning_rate": 0.00030332517482517475, |
| "loss": 3.202, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.75281029762945, |
| "grad_norm": 0.4050898849964142, |
| "learning_rate": 0.00030315034965034966, |
| "loss": 3.207, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.75281029762945, |
| "eval_accuracy": 0.3746136011001979, |
| "eval_loss": 3.537879705429077, |
| "eval_runtime": 82.7982, |
| "eval_samples_per_second": 201.043, |
| "eval_steps_per_second": 12.573, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.76737142524317, |
| "grad_norm": 0.41925495862960815, |
| "learning_rate": 0.0003029755244755244, |
| "loss": 3.2085, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.781932552856894, |
| "grad_norm": 0.40448760986328125, |
| "learning_rate": 0.00030280069930069926, |
| "loss": 3.1937, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.796493680470615, |
| "grad_norm": 0.3652881979942322, |
| "learning_rate": 0.00030262587412587406, |
| "loss": 3.2102, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.81105480808434, |
| "grad_norm": 0.4352279603481293, |
| "learning_rate": 0.0003024510489510489, |
| "loss": 3.209, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.82561593569806, |
| "grad_norm": 0.43612954020500183, |
| "learning_rate": 0.00030227622377622377, |
| "loss": 3.2085, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.84017706331178, |
| "grad_norm": 0.38125696778297424, |
| "learning_rate": 0.00030210139860139856, |
| "loss": 3.1942, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.854738190925506, |
| "grad_norm": 0.4056960642337799, |
| "learning_rate": 0.0003019265734265734, |
| "loss": 3.2125, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.869299318539227, |
| "grad_norm": 0.42750799655914307, |
| "learning_rate": 0.0003017517482517482, |
| "loss": 3.2227, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.88386044615295, |
| "grad_norm": 0.3985252380371094, |
| "learning_rate": 0.00030157692307692307, |
| "loss": 3.2074, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.898421573766672, |
| "grad_norm": 0.4346800446510315, |
| "learning_rate": 0.00030140209790209787, |
| "loss": 3.2159, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.912982701380393, |
| "grad_norm": 0.4201732575893402, |
| "learning_rate": 0.0003012272727272727, |
| "loss": 3.2167, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.927543828994118, |
| "grad_norm": 0.426582932472229, |
| "learning_rate": 0.0003010524475524475, |
| "loss": 3.1992, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.94210495660784, |
| "grad_norm": 0.38945046067237854, |
| "learning_rate": 0.0003008776223776224, |
| "loss": 3.2155, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.956666084221563, |
| "grad_norm": 0.4330475628376007, |
| "learning_rate": 0.0003007027972027972, |
| "loss": 3.2257, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.971227211835284, |
| "grad_norm": 0.508823812007904, |
| "learning_rate": 0.00030052797202797203, |
| "loss": 3.2257, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.985788339449005, |
| "grad_norm": 0.4129977822303772, |
| "learning_rate": 0.0003003531468531468, |
| "loss": 3.233, |
| "step": 85800 |
| }, |
| { |
| "epoch": 25.000291222552274, |
| "grad_norm": 0.4007333815097809, |
| "learning_rate": 0.00030017832167832163, |
| "loss": 3.2076, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.014852350165995, |
| "grad_norm": 0.39630234241485596, |
| "learning_rate": 0.0003000034965034965, |
| "loss": 3.1072, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.02941347777972, |
| "grad_norm": 0.4062051773071289, |
| "learning_rate": 0.0002998286713286713, |
| "loss": 3.1216, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.04397460539344, |
| "grad_norm": 0.43086424469947815, |
| "learning_rate": 0.00029965384615384614, |
| "loss": 3.1365, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.04397460539344, |
| "eval_accuracy": 0.37406634716284987, |
| "eval_loss": 3.552361011505127, |
| "eval_runtime": 82.808, |
| "eval_samples_per_second": 201.019, |
| "eval_steps_per_second": 12.571, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.058535733007165, |
| "grad_norm": 0.4218362271785736, |
| "learning_rate": 0.00029947902097902094, |
| "loss": 3.1264, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.073096860620886, |
| "grad_norm": 0.4121156632900238, |
| "learning_rate": 0.0002993041958041958, |
| "loss": 3.1337, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.087657988234607, |
| "grad_norm": 0.4042889177799225, |
| "learning_rate": 0.0002991293706293706, |
| "loss": 3.1322, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.10221911584833, |
| "grad_norm": 0.41994380950927734, |
| "learning_rate": 0.0002989545454545454, |
| "loss": 3.1307, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.116780243462053, |
| "grad_norm": 0.45464181900024414, |
| "learning_rate": 0.00029877972027972025, |
| "loss": 3.1424, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.131341371075777, |
| "grad_norm": 0.41389626264572144, |
| "learning_rate": 0.0002986048951048951, |
| "loss": 3.1396, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.145902498689498, |
| "grad_norm": 0.4256459176540375, |
| "learning_rate": 0.0002984300699300699, |
| "loss": 3.1451, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.160463626303223, |
| "grad_norm": 0.39612478017807007, |
| "learning_rate": 0.00029825524475524475, |
| "loss": 3.1497, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.175024753916944, |
| "grad_norm": 0.4280693531036377, |
| "learning_rate": 0.00029808041958041955, |
| "loss": 3.1546, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.189585881530665, |
| "grad_norm": 0.42289748787879944, |
| "learning_rate": 0.0002979055944055944, |
| "loss": 3.1347, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.20414700914439, |
| "grad_norm": 0.4312131404876709, |
| "learning_rate": 0.0002977307692307692, |
| "loss": 3.1444, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.21870813675811, |
| "grad_norm": 0.413555771112442, |
| "learning_rate": 0.000297555944055944, |
| "loss": 3.1534, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.233269264371835, |
| "grad_norm": 0.40471503138542175, |
| "learning_rate": 0.00029738111888111886, |
| "loss": 3.1589, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.247830391985556, |
| "grad_norm": 0.4393641948699951, |
| "learning_rate": 0.00029720629370629366, |
| "loss": 3.1654, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.262391519599277, |
| "grad_norm": 0.39511838555336, |
| "learning_rate": 0.0002970314685314685, |
| "loss": 3.1683, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.276952647213, |
| "grad_norm": 0.4135552942752838, |
| "learning_rate": 0.0002968566433566433, |
| "loss": 3.1727, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.291513774826722, |
| "grad_norm": 0.45105600357055664, |
| "learning_rate": 0.00029668181818181817, |
| "loss": 3.1548, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.306074902440447, |
| "grad_norm": 0.4085216522216797, |
| "learning_rate": 0.00029650699300699297, |
| "loss": 3.1717, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.320636030054168, |
| "grad_norm": 0.39837518334388733, |
| "learning_rate": 0.0002963321678321678, |
| "loss": 3.1575, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.33519715766789, |
| "grad_norm": 0.41157785058021545, |
| "learning_rate": 0.0002961573426573426, |
| "loss": 3.1642, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.33519715766789, |
| "eval_accuracy": 0.37407457654536636, |
| "eval_loss": 3.5455355644226074, |
| "eval_runtime": 82.7908, |
| "eval_samples_per_second": 201.061, |
| "eval_steps_per_second": 12.574, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.349758285281613, |
| "grad_norm": 0.44607582688331604, |
| "learning_rate": 0.0002959825174825175, |
| "loss": 3.1656, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.364319412895334, |
| "grad_norm": 0.42980557680130005, |
| "learning_rate": 0.0002958076923076923, |
| "loss": 3.1759, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.37888054050906, |
| "grad_norm": 0.4200369119644165, |
| "learning_rate": 0.00029563286713286713, |
| "loss": 3.1824, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.39344166812278, |
| "grad_norm": 0.45044153928756714, |
| "learning_rate": 0.00029545804195804193, |
| "loss": 3.1724, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.4080027957365, |
| "grad_norm": 0.4264580011367798, |
| "learning_rate": 0.0002952832167832168, |
| "loss": 3.1855, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.422563923350225, |
| "grad_norm": 0.41655462980270386, |
| "learning_rate": 0.0002951083916083916, |
| "loss": 3.1829, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.437125050963946, |
| "grad_norm": 0.3989514410495758, |
| "learning_rate": 0.0002949335664335664, |
| "loss": 3.1645, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.45168617857767, |
| "grad_norm": 0.41328999400138855, |
| "learning_rate": 0.00029475874125874124, |
| "loss": 3.1918, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.46624730619139, |
| "grad_norm": 0.429002583026886, |
| "learning_rate": 0.00029458391608391604, |
| "loss": 3.186, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.480808433805112, |
| "grad_norm": 0.43216243386268616, |
| "learning_rate": 0.0002944090909090909, |
| "loss": 3.1853, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.495369561418837, |
| "grad_norm": 0.4262638986110687, |
| "learning_rate": 0.0002942342657342657, |
| "loss": 3.1803, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.509930689032558, |
| "grad_norm": 0.4065530300140381, |
| "learning_rate": 0.00029405944055944054, |
| "loss": 3.1769, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.524491816646282, |
| "grad_norm": 0.4066266715526581, |
| "learning_rate": 0.0002938846153846154, |
| "loss": 3.1746, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.539052944260003, |
| "grad_norm": 0.44327476620674133, |
| "learning_rate": 0.0002937097902097902, |
| "loss": 3.1743, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.553614071873724, |
| "grad_norm": 0.4258955419063568, |
| "learning_rate": 0.000293534965034965, |
| "loss": 3.1687, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.56817519948745, |
| "grad_norm": 0.41590726375579834, |
| "learning_rate": 0.00029336013986013985, |
| "loss": 3.1766, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.58273632710117, |
| "grad_norm": 0.43293097615242004, |
| "learning_rate": 0.00029318531468531465, |
| "loss": 3.1894, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.597297454714894, |
| "grad_norm": 0.37935930490493774, |
| "learning_rate": 0.0002930104895104895, |
| "loss": 3.1825, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.611858582328615, |
| "grad_norm": 0.4388798475265503, |
| "learning_rate": 0.0002928356643356643, |
| "loss": 3.195, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.626419709942336, |
| "grad_norm": 0.41763705015182495, |
| "learning_rate": 0.00029266083916083916, |
| "loss": 3.1958, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.626419709942336, |
| "eval_accuracy": 0.37490797786907426, |
| "eval_loss": 3.540461778640747, |
| "eval_runtime": 82.6732, |
| "eval_samples_per_second": 201.347, |
| "eval_steps_per_second": 12.592, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.64098083755606, |
| "grad_norm": 0.3861890435218811, |
| "learning_rate": 0.00029248601398601396, |
| "loss": 3.1964, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.655541965169782, |
| "grad_norm": 0.4253334105014801, |
| "learning_rate": 0.00029231118881118876, |
| "loss": 3.1809, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.670103092783506, |
| "grad_norm": 0.4184607267379761, |
| "learning_rate": 0.0002921363636363636, |
| "loss": 3.1805, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.684664220397227, |
| "grad_norm": 0.4241752028465271, |
| "learning_rate": 0.0002919615384615384, |
| "loss": 3.1938, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.69922534801095, |
| "grad_norm": 0.4125079810619354, |
| "learning_rate": 0.00029178671328671326, |
| "loss": 3.1899, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.713786475624673, |
| "grad_norm": 0.4324009418487549, |
| "learning_rate": 0.00029161188811188806, |
| "loss": 3.1993, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.728347603238394, |
| "grad_norm": 0.4532955586910248, |
| "learning_rate": 0.0002914370629370629, |
| "loss": 3.2014, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.74290873085212, |
| "grad_norm": 0.4195028245449066, |
| "learning_rate": 0.00029126223776223777, |
| "loss": 3.2034, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.75746985846584, |
| "grad_norm": 0.3960408568382263, |
| "learning_rate": 0.00029108741258741257, |
| "loss": 3.1918, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.772030986079564, |
| "grad_norm": 0.425977885723114, |
| "learning_rate": 0.00029091258741258737, |
| "loss": 3.2127, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.786592113693285, |
| "grad_norm": 0.41860243678092957, |
| "learning_rate": 0.0002907377622377622, |
| "loss": 3.2075, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.801153241307006, |
| "grad_norm": 0.44775453209877014, |
| "learning_rate": 0.000290562937062937, |
| "loss": 3.2117, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.81571436892073, |
| "grad_norm": 0.4702814221382141, |
| "learning_rate": 0.0002903881118881119, |
| "loss": 3.1964, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.83027549653445, |
| "grad_norm": 0.4235730469226837, |
| "learning_rate": 0.0002902132867132867, |
| "loss": 3.1933, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.844836624148176, |
| "grad_norm": 0.41908684372901917, |
| "learning_rate": 0.00029003846153846153, |
| "loss": 3.2097, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.859397751761897, |
| "grad_norm": 0.45353618264198303, |
| "learning_rate": 0.00028986363636363633, |
| "loss": 3.204, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.873958879375618, |
| "grad_norm": 0.4325787425041199, |
| "learning_rate": 0.00028968881118881113, |
| "loss": 3.2003, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.888520006989342, |
| "grad_norm": 0.4323851466178894, |
| "learning_rate": 0.000289513986013986, |
| "loss": 3.2142, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.903081134603063, |
| "grad_norm": 0.43925732374191284, |
| "learning_rate": 0.0002893391608391608, |
| "loss": 3.2089, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.917642262216788, |
| "grad_norm": 0.42061591148376465, |
| "learning_rate": 0.00028916433566433564, |
| "loss": 3.2107, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.917642262216788, |
| "eval_accuracy": 0.3751083045520477, |
| "eval_loss": 3.5316367149353027, |
| "eval_runtime": 82.7165, |
| "eval_samples_per_second": 201.242, |
| "eval_steps_per_second": 12.585, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.93220338983051, |
| "grad_norm": 0.4129398763179779, |
| "learning_rate": 0.0002889895104895105, |
| "loss": 3.2124, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.94676451744423, |
| "grad_norm": 0.4748251438140869, |
| "learning_rate": 0.0002888146853146853, |
| "loss": 3.1991, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.961325645057954, |
| "grad_norm": 0.41654056310653687, |
| "learning_rate": 0.00028863986013986015, |
| "loss": 3.1942, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.975886772671675, |
| "grad_norm": 0.40318265557289124, |
| "learning_rate": 0.00028846503496503495, |
| "loss": 3.2088, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.9904479002854, |
| "grad_norm": 0.3837660253047943, |
| "learning_rate": 0.00028829020979020975, |
| "loss": 3.2013, |
| "step": 89250 |
| }, |
| { |
| "epoch": 26.004950783388665, |
| "grad_norm": 0.45651721954345703, |
| "learning_rate": 0.0002881153846153846, |
| "loss": 3.1696, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.01951191100239, |
| "grad_norm": 0.39994147419929504, |
| "learning_rate": 0.0002879405594405594, |
| "loss": 3.1018, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.03407303861611, |
| "grad_norm": 0.4511447250843048, |
| "learning_rate": 0.00028776573426573425, |
| "loss": 3.1111, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.04863416622983, |
| "grad_norm": 0.4027986526489258, |
| "learning_rate": 0.00028759090909090905, |
| "loss": 3.1179, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.063195293843556, |
| "grad_norm": 0.3995853364467621, |
| "learning_rate": 0.0002874160839160839, |
| "loss": 3.1177, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.077756421457277, |
| "grad_norm": 0.4248117208480835, |
| "learning_rate": 0.0002872412587412587, |
| "loss": 3.1186, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.092317549071, |
| "grad_norm": 0.4385637938976288, |
| "learning_rate": 0.0002870664335664335, |
| "loss": 3.1367, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.106878676684723, |
| "grad_norm": 0.47063812613487244, |
| "learning_rate": 0.00028689160839160836, |
| "loss": 3.1228, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.121439804298443, |
| "grad_norm": 0.42873483896255493, |
| "learning_rate": 0.0002867167832167832, |
| "loss": 3.1245, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.136000931912168, |
| "grad_norm": 0.41910821199417114, |
| "learning_rate": 0.000286541958041958, |
| "loss": 3.1387, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.15056205952589, |
| "grad_norm": 0.4321070909500122, |
| "learning_rate": 0.00028636713286713287, |
| "loss": 3.1381, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.165123187139613, |
| "grad_norm": 0.43659085035324097, |
| "learning_rate": 0.00028619230769230767, |
| "loss": 3.1349, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.179684314753334, |
| "grad_norm": 0.41032034158706665, |
| "learning_rate": 0.0002860174825174825, |
| "loss": 3.1311, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.194245442367055, |
| "grad_norm": 0.409869909286499, |
| "learning_rate": 0.0002858426573426573, |
| "loss": 3.1373, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.20880656998078, |
| "grad_norm": 0.4248732626438141, |
| "learning_rate": 0.0002856678321678321, |
| "loss": 3.1475, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.20880656998078, |
| "eval_accuracy": 0.3740578826551186, |
| "eval_loss": 3.5519931316375732, |
| "eval_runtime": 82.6882, |
| "eval_samples_per_second": 201.311, |
| "eval_steps_per_second": 12.589, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.2233676975945, |
| "grad_norm": 0.4216873049736023, |
| "learning_rate": 0.000285493006993007, |
| "loss": 3.1525, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.237928825208225, |
| "grad_norm": 0.4195588529109955, |
| "learning_rate": 0.0002853181818181818, |
| "loss": 3.1503, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.252489952821946, |
| "grad_norm": 0.429360955953598, |
| "learning_rate": 0.00028514335664335663, |
| "loss": 3.1632, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.267051080435667, |
| "grad_norm": 0.4333534836769104, |
| "learning_rate": 0.00028496853146853143, |
| "loss": 3.1473, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.281612208049392, |
| "grad_norm": 0.4616968631744385, |
| "learning_rate": 0.0002847937062937063, |
| "loss": 3.141, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.296173335663113, |
| "grad_norm": 0.3975614309310913, |
| "learning_rate": 0.0002846188811188811, |
| "loss": 3.1487, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.310734463276837, |
| "grad_norm": 0.42888548970222473, |
| "learning_rate": 0.0002844440559440559, |
| "loss": 3.1559, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.32529559089056, |
| "grad_norm": 0.4494994580745697, |
| "learning_rate": 0.00028426923076923074, |
| "loss": 3.1558, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.33985671850428, |
| "grad_norm": 0.4605661630630493, |
| "learning_rate": 0.0002840944055944056, |
| "loss": 3.1717, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.354417846118004, |
| "grad_norm": 0.42073243856430054, |
| "learning_rate": 0.0002839195804195804, |
| "loss": 3.1623, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.368978973731725, |
| "grad_norm": 0.414485901594162, |
| "learning_rate": 0.00028374475524475524, |
| "loss": 3.1586, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.38354010134545, |
| "grad_norm": 0.40136295557022095, |
| "learning_rate": 0.00028356993006993004, |
| "loss": 3.1555, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.39810122895917, |
| "grad_norm": 0.4165910482406616, |
| "learning_rate": 0.0002833951048951049, |
| "loss": 3.1698, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.41266235657289, |
| "grad_norm": 0.4253171980381012, |
| "learning_rate": 0.0002832202797202797, |
| "loss": 3.1737, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.427223484186616, |
| "grad_norm": 0.4556134343147278, |
| "learning_rate": 0.0002830454545454545, |
| "loss": 3.1707, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.441784611800337, |
| "grad_norm": 0.4397638142108917, |
| "learning_rate": 0.00028287062937062935, |
| "loss": 3.1767, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.45634573941406, |
| "grad_norm": 0.44042325019836426, |
| "learning_rate": 0.00028269580419580415, |
| "loss": 3.1534, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.470906867027782, |
| "grad_norm": 0.4332970678806305, |
| "learning_rate": 0.000282520979020979, |
| "loss": 3.1692, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.485467994641503, |
| "grad_norm": 0.42552515864372253, |
| "learning_rate": 0.0002823461538461538, |
| "loss": 3.1658, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.500029122255228, |
| "grad_norm": 0.4197435975074768, |
| "learning_rate": 0.00028217132867132866, |
| "loss": 3.1867, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.500029122255228, |
| "eval_accuracy": 0.3747208181981273, |
| "eval_loss": 3.541072130203247, |
| "eval_runtime": 82.4899, |
| "eval_samples_per_second": 201.794, |
| "eval_steps_per_second": 12.62, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.51459024986895, |
| "grad_norm": 0.4575478732585907, |
| "learning_rate": 0.00028199650349650346, |
| "loss": 3.1719, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.529151377482673, |
| "grad_norm": 0.4428056478500366, |
| "learning_rate": 0.0002818216783216783, |
| "loss": 3.1651, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.543712505096394, |
| "grad_norm": 0.4040013551712036, |
| "learning_rate": 0.0002816468531468531, |
| "loss": 3.1653, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.55827363271012, |
| "grad_norm": 0.41876381635665894, |
| "learning_rate": 0.00028147202797202796, |
| "loss": 3.1777, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.57283476032384, |
| "grad_norm": 0.4380807876586914, |
| "learning_rate": 0.00028129720279720276, |
| "loss": 3.179, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.58739588793756, |
| "grad_norm": 0.44236987829208374, |
| "learning_rate": 0.0002811223776223776, |
| "loss": 3.1661, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.601957015551285, |
| "grad_norm": 0.40528810024261475, |
| "learning_rate": 0.0002809475524475524, |
| "loss": 3.1786, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.616518143165006, |
| "grad_norm": 0.43165895342826843, |
| "learning_rate": 0.00028077272727272727, |
| "loss": 3.1717, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.63107927077873, |
| "grad_norm": 0.40146100521087646, |
| "learning_rate": 0.00028059790209790207, |
| "loss": 3.1875, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.64564039839245, |
| "grad_norm": 0.4229898750782013, |
| "learning_rate": 0.00028042307692307687, |
| "loss": 3.1842, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.660201526006173, |
| "grad_norm": 0.4406355321407318, |
| "learning_rate": 0.0002802482517482517, |
| "loss": 3.1988, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.674762653619897, |
| "grad_norm": 0.4173825681209564, |
| "learning_rate": 0.0002800734265734265, |
| "loss": 3.1822, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.689323781233618, |
| "grad_norm": 0.4346403181552887, |
| "learning_rate": 0.0002798986013986014, |
| "loss": 3.1852, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.703884908847343, |
| "grad_norm": 0.4205593764781952, |
| "learning_rate": 0.0002797237762237762, |
| "loss": 3.1764, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.718446036461064, |
| "grad_norm": 0.447220116853714, |
| "learning_rate": 0.00027954895104895103, |
| "loss": 3.2009, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.733007164074785, |
| "grad_norm": 0.4073473811149597, |
| "learning_rate": 0.0002793741258741259, |
| "loss": 3.1775, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.74756829168851, |
| "grad_norm": 0.44910600781440735, |
| "learning_rate": 0.0002791993006993007, |
| "loss": 3.1971, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.76212941930223, |
| "grad_norm": 0.40810632705688477, |
| "learning_rate": 0.0002790244755244755, |
| "loss": 3.197, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.776690546915955, |
| "grad_norm": 0.4296828508377075, |
| "learning_rate": 0.00027884965034965034, |
| "loss": 3.1934, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.791251674529676, |
| "grad_norm": 0.43995600938796997, |
| "learning_rate": 0.00027867482517482514, |
| "loss": 3.1844, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.791251674529676, |
| "eval_accuracy": 0.37478265612960854, |
| "eval_loss": 3.5354270935058594, |
| "eval_runtime": 81.6538, |
| "eval_samples_per_second": 203.861, |
| "eval_steps_per_second": 12.749, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.805812802143397, |
| "grad_norm": 0.41159945726394653, |
| "learning_rate": 0.0002785, |
| "loss": 3.1856, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.82037392975712, |
| "grad_norm": 0.401716023683548, |
| "learning_rate": 0.0002783251748251748, |
| "loss": 3.2011, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.834935057370842, |
| "grad_norm": 0.3977109491825104, |
| "learning_rate": 0.00027815034965034965, |
| "loss": 3.1897, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.849496184984567, |
| "grad_norm": 0.4292439818382263, |
| "learning_rate": 0.00027797552447552445, |
| "loss": 3.1993, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.864057312598288, |
| "grad_norm": 0.4346993565559387, |
| "learning_rate": 0.00027780069930069925, |
| "loss": 3.1883, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.87861844021201, |
| "grad_norm": 0.4218614101409912, |
| "learning_rate": 0.0002776258741258741, |
| "loss": 3.204, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.893179567825733, |
| "grad_norm": 0.4299354553222656, |
| "learning_rate": 0.0002774510489510489, |
| "loss": 3.1988, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.907740695439454, |
| "grad_norm": 0.4194084107875824, |
| "learning_rate": 0.00027727622377622375, |
| "loss": 3.1933, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.92230182305318, |
| "grad_norm": 0.410491943359375, |
| "learning_rate": 0.00027710139860139855, |
| "loss": 3.1968, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.9368629506669, |
| "grad_norm": 0.4403439462184906, |
| "learning_rate": 0.0002769265734265734, |
| "loss": 3.2031, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.95142407828062, |
| "grad_norm": 0.4328434467315674, |
| "learning_rate": 0.00027675174825174826, |
| "loss": 3.207, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.965985205894345, |
| "grad_norm": 0.40403029322624207, |
| "learning_rate": 0.00027657692307692306, |
| "loss": 3.1974, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.980546333508066, |
| "grad_norm": 0.4548574686050415, |
| "learning_rate": 0.00027640209790209786, |
| "loss": 3.1991, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.99510746112179, |
| "grad_norm": 0.4638030230998993, |
| "learning_rate": 0.0002762272727272727, |
| "loss": 3.1911, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.009610344225056, |
| "grad_norm": 0.424076646566391, |
| "learning_rate": 0.0002760524475524475, |
| "loss": 3.1293, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.02417147183878, |
| "grad_norm": 0.41390353441238403, |
| "learning_rate": 0.00027587762237762237, |
| "loss": 3.1001, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.0387325994525, |
| "grad_norm": 0.4083455502986908, |
| "learning_rate": 0.00027570279720279717, |
| "loss": 3.0931, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.053293727066222, |
| "grad_norm": 0.41164708137512207, |
| "learning_rate": 0.000275527972027972, |
| "loss": 3.1185, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.067854854679947, |
| "grad_norm": 0.49643316864967346, |
| "learning_rate": 0.0002753531468531468, |
| "loss": 3.1136, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.082415982293668, |
| "grad_norm": 0.44062769412994385, |
| "learning_rate": 0.0002751783216783216, |
| "loss": 3.1078, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.082415982293668, |
| "eval_accuracy": 0.37448780911030266, |
| "eval_loss": 3.5474536418914795, |
| "eval_runtime": 81.7612, |
| "eval_samples_per_second": 203.593, |
| "eval_steps_per_second": 12.732, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.096977109907392, |
| "grad_norm": 0.41481348872184753, |
| "learning_rate": 0.0002750034965034965, |
| "loss": 3.1283, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.111538237521113, |
| "grad_norm": 0.4204113483428955, |
| "learning_rate": 0.0002748286713286713, |
| "loss": 3.121, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.126099365134834, |
| "grad_norm": 0.43113580346107483, |
| "learning_rate": 0.00027465384615384613, |
| "loss": 3.1167, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.14066049274856, |
| "grad_norm": 0.43483778834342957, |
| "learning_rate": 0.000274479020979021, |
| "loss": 3.1375, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.15522162036228, |
| "grad_norm": 0.4796471893787384, |
| "learning_rate": 0.0002743041958041958, |
| "loss": 3.1282, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.169782747976004, |
| "grad_norm": 0.45119085907936096, |
| "learning_rate": 0.00027412937062937064, |
| "loss": 3.1271, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.184343875589725, |
| "grad_norm": 0.46973350644111633, |
| "learning_rate": 0.00027395454545454544, |
| "loss": 3.1219, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.19890500320345, |
| "grad_norm": 0.42338910698890686, |
| "learning_rate": 0.00027377972027972024, |
| "loss": 3.1338, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.21346613081717, |
| "grad_norm": 0.4591929614543915, |
| "learning_rate": 0.0002736048951048951, |
| "loss": 3.1385, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.228027258430892, |
| "grad_norm": 0.41916024684906006, |
| "learning_rate": 0.0002734300699300699, |
| "loss": 3.1302, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.242588386044616, |
| "grad_norm": 0.42721858620643616, |
| "learning_rate": 0.00027325524475524474, |
| "loss": 3.149, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.257149513658337, |
| "grad_norm": 0.4636271595954895, |
| "learning_rate": 0.00027308041958041954, |
| "loss": 3.1617, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.271710641272062, |
| "grad_norm": 0.4328298568725586, |
| "learning_rate": 0.0002729055944055944, |
| "loss": 3.1509, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.286271768885783, |
| "grad_norm": 0.4406587779521942, |
| "learning_rate": 0.0002727307692307692, |
| "loss": 3.1364, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.300832896499504, |
| "grad_norm": 0.408686101436615, |
| "learning_rate": 0.000272555944055944, |
| "loss": 3.1508, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.31539402411323, |
| "grad_norm": 0.45478031039237976, |
| "learning_rate": 0.00027238111888111885, |
| "loss": 3.1467, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.32995515172695, |
| "grad_norm": 0.41115689277648926, |
| "learning_rate": 0.0002722062937062937, |
| "loss": 3.1499, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.344516279340674, |
| "grad_norm": 0.432817667722702, |
| "learning_rate": 0.0002720314685314685, |
| "loss": 3.1521, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.359077406954395, |
| "grad_norm": 0.4235577881336212, |
| "learning_rate": 0.00027185664335664336, |
| "loss": 3.1439, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.373638534568116, |
| "grad_norm": 0.4289872646331787, |
| "learning_rate": 0.00027168181818181816, |
| "loss": 3.1532, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.373638534568116, |
| "eval_accuracy": 0.37467637953253813, |
| "eval_loss": 3.5424959659576416, |
| "eval_runtime": 82.2934, |
| "eval_samples_per_second": 202.276, |
| "eval_steps_per_second": 12.65, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.38819966218184, |
| "grad_norm": 0.43689417839050293, |
| "learning_rate": 0.000271506993006993, |
| "loss": 3.1525, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.40276078979556, |
| "grad_norm": 0.4556712806224823, |
| "learning_rate": 0.0002713321678321678, |
| "loss": 3.1604, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.417321917409286, |
| "grad_norm": 0.43220189213752747, |
| "learning_rate": 0.0002711573426573426, |
| "loss": 3.1683, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.431883045023007, |
| "grad_norm": 0.41929247975349426, |
| "learning_rate": 0.00027098251748251746, |
| "loss": 3.1629, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.446444172636728, |
| "grad_norm": 0.4319346249103546, |
| "learning_rate": 0.00027080769230769226, |
| "loss": 3.1534, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.461005300250452, |
| "grad_norm": 0.42514413595199585, |
| "learning_rate": 0.0002706328671328671, |
| "loss": 3.154, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.475566427864173, |
| "grad_norm": 0.4643436670303345, |
| "learning_rate": 0.0002704580419580419, |
| "loss": 3.1634, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.490127555477898, |
| "grad_norm": 0.45777031779289246, |
| "learning_rate": 0.00027028321678321677, |
| "loss": 3.1696, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.50468868309162, |
| "grad_norm": 0.4447330832481384, |
| "learning_rate": 0.00027010839160839157, |
| "loss": 3.1694, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.51924981070534, |
| "grad_norm": 0.42576029896736145, |
| "learning_rate": 0.00026993356643356637, |
| "loss": 3.1608, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.533810938319064, |
| "grad_norm": 0.41965171694755554, |
| "learning_rate": 0.0002697587412587412, |
| "loss": 3.1566, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.548372065932785, |
| "grad_norm": 0.4206099510192871, |
| "learning_rate": 0.0002695839160839161, |
| "loss": 3.1636, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.56293319354651, |
| "grad_norm": 0.43519139289855957, |
| "learning_rate": 0.0002694090909090909, |
| "loss": 3.1783, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.57749432116023, |
| "grad_norm": 0.4283556640148163, |
| "learning_rate": 0.00026923426573426573, |
| "loss": 3.1618, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.59205544877395, |
| "grad_norm": 0.43282249569892883, |
| "learning_rate": 0.00026905944055944053, |
| "loss": 3.163, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.606616576387676, |
| "grad_norm": 0.43075206875801086, |
| "learning_rate": 0.0002688846153846154, |
| "loss": 3.1781, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.621177704001397, |
| "grad_norm": 0.4051018953323364, |
| "learning_rate": 0.0002687097902097902, |
| "loss": 3.1652, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.63573883161512, |
| "grad_norm": 0.4258587062358856, |
| "learning_rate": 0.000268534965034965, |
| "loss": 3.1652, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.650299959228843, |
| "grad_norm": 0.4083760678768158, |
| "learning_rate": 0.00026836013986013984, |
| "loss": 3.1739, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.664861086842564, |
| "grad_norm": 0.4240263104438782, |
| "learning_rate": 0.00026818531468531464, |
| "loss": 3.1733, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.664861086842564, |
| "eval_accuracy": 0.3748955162326921, |
| "eval_loss": 3.539768934249878, |
| "eval_runtime": 82.2815, |
| "eval_samples_per_second": 202.306, |
| "eval_steps_per_second": 12.652, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.679422214456288, |
| "grad_norm": 0.4019402861595154, |
| "learning_rate": 0.0002680104895104895, |
| "loss": 3.1769, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.69398334207001, |
| "grad_norm": 0.44093072414398193, |
| "learning_rate": 0.0002678356643356643, |
| "loss": 3.1778, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.708544469683734, |
| "grad_norm": 0.436570405960083, |
| "learning_rate": 0.00026766083916083915, |
| "loss": 3.1911, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.723105597297454, |
| "grad_norm": 0.41168585419654846, |
| "learning_rate": 0.00026748601398601395, |
| "loss": 3.1752, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.737666724911175, |
| "grad_norm": 0.4103345274925232, |
| "learning_rate": 0.0002673111888111888, |
| "loss": 3.1855, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.7522278525249, |
| "grad_norm": 0.4220221936702728, |
| "learning_rate": 0.0002671363636363636, |
| "loss": 3.1724, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.76678898013862, |
| "grad_norm": 0.42229747772216797, |
| "learning_rate": 0.00026696153846153845, |
| "loss": 3.1782, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.781350107752345, |
| "grad_norm": 0.4067949652671814, |
| "learning_rate": 0.00026678671328671325, |
| "loss": 3.18, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.795911235366066, |
| "grad_norm": 0.4630569815635681, |
| "learning_rate": 0.0002666118881118881, |
| "loss": 3.186, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.810472362979787, |
| "grad_norm": 0.41654670238494873, |
| "learning_rate": 0.0002664370629370629, |
| "loss": 3.1864, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.825033490593512, |
| "grad_norm": 0.42629358172416687, |
| "learning_rate": 0.00026626223776223776, |
| "loss": 3.1739, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.839594618207233, |
| "grad_norm": 0.44692865014076233, |
| "learning_rate": 0.00026608741258741256, |
| "loss": 3.1736, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.854155745820957, |
| "grad_norm": 0.4193972945213318, |
| "learning_rate": 0.00026591258741258736, |
| "loss": 3.1685, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.86871687343468, |
| "grad_norm": 0.4250575304031372, |
| "learning_rate": 0.0002657377622377622, |
| "loss": 3.1931, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.883278001048403, |
| "grad_norm": 0.43862655758857727, |
| "learning_rate": 0.000265562937062937, |
| "loss": 3.1969, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.897839128662124, |
| "grad_norm": 0.4448447823524475, |
| "learning_rate": 0.00026538811188811187, |
| "loss": 3.1822, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.912400256275845, |
| "grad_norm": 0.42465585470199585, |
| "learning_rate": 0.00026521328671328667, |
| "loss": 3.181, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.92696138388957, |
| "grad_norm": 0.4636482000350952, |
| "learning_rate": 0.0002650384615384615, |
| "loss": 3.1999, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.94152251150329, |
| "grad_norm": 0.4286040663719177, |
| "learning_rate": 0.0002648636363636364, |
| "loss": 3.1893, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.956083639117015, |
| "grad_norm": 0.44037359952926636, |
| "learning_rate": 0.0002646888111888112, |
| "loss": 3.1887, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.956083639117015, |
| "eval_accuracy": 0.3753692935404285, |
| "eval_loss": 3.5321953296661377, |
| "eval_runtime": 81.3353, |
| "eval_samples_per_second": 204.659, |
| "eval_steps_per_second": 12.799, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.970644766730736, |
| "grad_norm": 0.43227526545524597, |
| "learning_rate": 0.000264513986013986, |
| "loss": 3.1867, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.985205894344457, |
| "grad_norm": 0.4277183413505554, |
| "learning_rate": 0.00026433916083916083, |
| "loss": 3.183, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.99976702195818, |
| "grad_norm": 0.4404087960720062, |
| "learning_rate": 0.00026416433566433563, |
| "loss": 3.1978, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.014269905061447, |
| "grad_norm": 0.42884597182273865, |
| "learning_rate": 0.0002639895104895105, |
| "loss": 3.0877, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.02883103267517, |
| "grad_norm": 0.42914730310440063, |
| "learning_rate": 0.0002638146853146853, |
| "loss": 3.1017, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.043392160288892, |
| "grad_norm": 0.4513637125492096, |
| "learning_rate": 0.00026363986013986014, |
| "loss": 3.1012, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.057953287902617, |
| "grad_norm": 0.44458386301994324, |
| "learning_rate": 0.00026346503496503494, |
| "loss": 3.0968, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.072514415516338, |
| "grad_norm": 0.46024274826049805, |
| "learning_rate": 0.00026329020979020974, |
| "loss": 3.1038, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.08707554313006, |
| "grad_norm": 0.4524226784706116, |
| "learning_rate": 0.0002631153846153846, |
| "loss": 3.1049, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.101636670743783, |
| "grad_norm": 0.4207121431827545, |
| "learning_rate": 0.0002629405594405594, |
| "loss": 3.1161, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.116197798357504, |
| "grad_norm": 0.42661893367767334, |
| "learning_rate": 0.00026276573426573424, |
| "loss": 3.127, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.13075892597123, |
| "grad_norm": 0.4063734710216522, |
| "learning_rate": 0.00026259090909090904, |
| "loss": 3.115, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.14532005358495, |
| "grad_norm": 0.4538852870464325, |
| "learning_rate": 0.0002624160839160839, |
| "loss": 3.1236, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.15988118119867, |
| "grad_norm": 0.4559941291809082, |
| "learning_rate": 0.00026224125874125875, |
| "loss": 3.1279, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.174442308812395, |
| "grad_norm": 0.4624997079372406, |
| "learning_rate": 0.00026206643356643355, |
| "loss": 3.1312, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.189003436426116, |
| "grad_norm": 0.42149782180786133, |
| "learning_rate": 0.00026189160839160835, |
| "loss": 3.1174, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.20356456403984, |
| "grad_norm": 0.4477824866771698, |
| "learning_rate": 0.0002617167832167832, |
| "loss": 3.1143, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.21812569165356, |
| "grad_norm": 0.4365858733654022, |
| "learning_rate": 0.000261541958041958, |
| "loss": 3.1245, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.232686819267283, |
| "grad_norm": 0.45397719740867615, |
| "learning_rate": 0.00026136713286713286, |
| "loss": 3.1354, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.247247946881007, |
| "grad_norm": 0.4588843584060669, |
| "learning_rate": 0.00026119230769230766, |
| "loss": 3.1166, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.247247946881007, |
| "eval_accuracy": 0.3744686464052999, |
| "eval_loss": 3.5470144748687744, |
| "eval_runtime": 81.536, |
| "eval_samples_per_second": 204.155, |
| "eval_steps_per_second": 12.767, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.261809074494728, |
| "grad_norm": 0.43315234780311584, |
| "learning_rate": 0.0002610174825174825, |
| "loss": 3.1325, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.276370202108453, |
| "grad_norm": 0.4766865670681, |
| "learning_rate": 0.0002608426573426573, |
| "loss": 3.1142, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.290931329722174, |
| "grad_norm": 0.42569443583488464, |
| "learning_rate": 0.0002606678321678321, |
| "loss": 3.1412, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.305492457335895, |
| "grad_norm": 0.4787962734699249, |
| "learning_rate": 0.00026049300699300696, |
| "loss": 3.1288, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.32005358494962, |
| "grad_norm": 0.44054627418518066, |
| "learning_rate": 0.00026031818181818176, |
| "loss": 3.1451, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.33461471256334, |
| "grad_norm": 0.461443692445755, |
| "learning_rate": 0.0002601433566433566, |
| "loss": 3.1529, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.349175840177065, |
| "grad_norm": 0.4544363021850586, |
| "learning_rate": 0.00025996853146853147, |
| "loss": 3.1242, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.363736967790786, |
| "grad_norm": 0.4331625699996948, |
| "learning_rate": 0.00025979370629370627, |
| "loss": 3.1569, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.378298095404507, |
| "grad_norm": 0.4085220396518707, |
| "learning_rate": 0.0002596188811188811, |
| "loss": 3.1509, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.39285922301823, |
| "grad_norm": 0.4554603397846222, |
| "learning_rate": 0.0002594440559440559, |
| "loss": 3.1441, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.407420350631952, |
| "grad_norm": 0.454208105802536, |
| "learning_rate": 0.0002592692307692307, |
| "loss": 3.1456, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.421981478245677, |
| "grad_norm": 0.4616270661354065, |
| "learning_rate": 0.0002590944055944056, |
| "loss": 3.1652, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.436542605859398, |
| "grad_norm": 0.4600011110305786, |
| "learning_rate": 0.0002589195804195804, |
| "loss": 3.1565, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.45110373347312, |
| "grad_norm": 0.446938693523407, |
| "learning_rate": 0.00025874475524475523, |
| "loss": 3.1558, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.465664861086843, |
| "grad_norm": 0.4358912706375122, |
| "learning_rate": 0.00025856993006993003, |
| "loss": 3.1655, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.480225988700564, |
| "grad_norm": 0.44656306505203247, |
| "learning_rate": 0.0002583951048951049, |
| "loss": 3.1476, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.49478711631429, |
| "grad_norm": 0.4129399359226227, |
| "learning_rate": 0.0002582202797202797, |
| "loss": 3.1637, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.50934824392801, |
| "grad_norm": 0.4618586003780365, |
| "learning_rate": 0.0002580454545454545, |
| "loss": 3.1453, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.523909371541734, |
| "grad_norm": 0.46807023882865906, |
| "learning_rate": 0.00025787062937062934, |
| "loss": 3.1519, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.538470499155455, |
| "grad_norm": 0.4363339841365814, |
| "learning_rate": 0.0002576958041958042, |
| "loss": 3.1646, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.538470499155455, |
| "eval_accuracy": 0.3745487065409248, |
| "eval_loss": 3.5429527759552, |
| "eval_runtime": 81.363, |
| "eval_samples_per_second": 204.589, |
| "eval_steps_per_second": 12.795, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.553031626769176, |
| "grad_norm": 0.4522702693939209, |
| "learning_rate": 0.000257520979020979, |
| "loss": 3.1598, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.5675927543829, |
| "grad_norm": 0.43922194838523865, |
| "learning_rate": 0.00025734615384615385, |
| "loss": 3.1418, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.58215388199662, |
| "grad_norm": 0.44358423352241516, |
| "learning_rate": 0.00025717132867132865, |
| "loss": 3.1619, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.596715009610342, |
| "grad_norm": 0.4296855628490448, |
| "learning_rate": 0.0002569965034965035, |
| "loss": 3.1705, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.611276137224067, |
| "grad_norm": 0.43030625581741333, |
| "learning_rate": 0.0002568216783216783, |
| "loss": 3.1628, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.625837264837788, |
| "grad_norm": 0.425258070230484, |
| "learning_rate": 0.0002566468531468531, |
| "loss": 3.1731, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.640398392451512, |
| "grad_norm": 0.47003525495529175, |
| "learning_rate": 0.00025647202797202795, |
| "loss": 3.1659, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.654959520065233, |
| "grad_norm": 0.426637202501297, |
| "learning_rate": 0.00025629720279720275, |
| "loss": 3.1575, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.669520647678958, |
| "grad_norm": 0.440075159072876, |
| "learning_rate": 0.0002561223776223776, |
| "loss": 3.1691, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.68408177529268, |
| "grad_norm": 0.45423394441604614, |
| "learning_rate": 0.0002559475524475524, |
| "loss": 3.1599, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.6986429029064, |
| "grad_norm": 0.431527316570282, |
| "learning_rate": 0.00025577272727272726, |
| "loss": 3.1697, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.713204030520124, |
| "grad_norm": 0.4657762944698334, |
| "learning_rate": 0.00025559790209790206, |
| "loss": 3.167, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.727765158133845, |
| "grad_norm": 0.4670034348964691, |
| "learning_rate": 0.00025542307692307686, |
| "loss": 3.1668, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.74232628574757, |
| "grad_norm": 0.523735761642456, |
| "learning_rate": 0.00025524825174825177, |
| "loss": 3.1551, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.75688741336129, |
| "grad_norm": 0.45341548323631287, |
| "learning_rate": 0.00025507342657342657, |
| "loss": 3.1744, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.771448540975012, |
| "grad_norm": 0.41971510648727417, |
| "learning_rate": 0.00025489860139860137, |
| "loss": 3.1783, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.786009668588736, |
| "grad_norm": 0.4568953812122345, |
| "learning_rate": 0.0002547237762237762, |
| "loss": 3.1742, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.800570796202457, |
| "grad_norm": 0.4538373053073883, |
| "learning_rate": 0.000254548951048951, |
| "loss": 3.1766, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.815131923816182, |
| "grad_norm": 0.47364869713783264, |
| "learning_rate": 0.0002543741258741259, |
| "loss": 3.1756, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.829693051429903, |
| "grad_norm": 0.43546101450920105, |
| "learning_rate": 0.0002541993006993007, |
| "loss": 3.1702, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.829693051429903, |
| "eval_accuracy": 0.37518742418681356, |
| "eval_loss": 3.5357120037078857, |
| "eval_runtime": 81.4049, |
| "eval_samples_per_second": 204.484, |
| "eval_steps_per_second": 12.788, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.844254179043624, |
| "grad_norm": 0.4798898994922638, |
| "learning_rate": 0.0002540244755244755, |
| "loss": 3.1676, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.85881530665735, |
| "grad_norm": 0.4351198375225067, |
| "learning_rate": 0.00025384965034965033, |
| "loss": 3.1894, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.87337643427107, |
| "grad_norm": 0.4484253525733948, |
| "learning_rate": 0.00025367482517482513, |
| "loss": 3.1747, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.887937561884794, |
| "grad_norm": 0.4140309989452362, |
| "learning_rate": 0.0002535, |
| "loss": 3.1752, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.902498689498515, |
| "grad_norm": 0.4411883056163788, |
| "learning_rate": 0.0002533251748251748, |
| "loss": 3.1738, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.917059817112236, |
| "grad_norm": 0.42821288108825684, |
| "learning_rate": 0.00025315034965034964, |
| "loss": 3.1796, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.93162094472596, |
| "grad_norm": 0.47841882705688477, |
| "learning_rate": 0.00025297552447552444, |
| "loss": 3.1883, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.94618207233968, |
| "grad_norm": 0.43780389428138733, |
| "learning_rate": 0.0002528006993006993, |
| "loss": 3.174, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.960743199953406, |
| "grad_norm": 0.4515286684036255, |
| "learning_rate": 0.00025262587412587414, |
| "loss": 3.1741, |
| "step": 99450 |
| }, |
| { |
| "epoch": 28.975304327567127, |
| "grad_norm": 0.45508626103401184, |
| "learning_rate": 0.00025245104895104894, |
| "loss": 3.1814, |
| "step": 99500 |
| }, |
| { |
| "epoch": 28.989865455180848, |
| "grad_norm": 0.43479517102241516, |
| "learning_rate": 0.00025227622377622374, |
| "loss": 3.1834, |
| "step": 99550 |
| }, |
| { |
| "epoch": 29.004368338284117, |
| "grad_norm": 0.42603936791419983, |
| "learning_rate": 0.0002521013986013986, |
| "loss": 3.1514, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.018929465897838, |
| "grad_norm": 0.46239781379699707, |
| "learning_rate": 0.0002519265734265734, |
| "loss": 3.0857, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.033490593511562, |
| "grad_norm": 0.43879106640815735, |
| "learning_rate": 0.00025175174825174825, |
| "loss": 3.0812, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.048051721125283, |
| "grad_norm": 0.4821576476097107, |
| "learning_rate": 0.00025157692307692305, |
| "loss": 3.0982, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.062612848739008, |
| "grad_norm": 0.4344748854637146, |
| "learning_rate": 0.0002514020979020979, |
| "loss": 3.0943, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.07717397635273, |
| "grad_norm": 0.47556647658348083, |
| "learning_rate": 0.0002512272727272727, |
| "loss": 3.1009, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.09173510396645, |
| "grad_norm": 0.4649229049682617, |
| "learning_rate": 0.0002510524475524475, |
| "loss": 3.1046, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.106296231580174, |
| "grad_norm": 0.47019723057746887, |
| "learning_rate": 0.00025087762237762236, |
| "loss": 3.1, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.120857359193895, |
| "grad_norm": 0.4474315941333771, |
| "learning_rate": 0.00025070279720279716, |
| "loss": 3.1108, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.120857359193895, |
| "eval_accuracy": 0.3746137186628053, |
| "eval_loss": 3.5503060817718506, |
| "eval_runtime": 81.4135, |
| "eval_samples_per_second": 204.462, |
| "eval_steps_per_second": 12.787, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.120857359193895, |
| "step": 100000, |
| "total_flos": 2.090205609984e+18, |
| "train_loss": 0.6336879605102539, |
| "train_runtime": 14849.3402, |
| "train_samples_per_second": 924.957, |
| "train_steps_per_second": 11.563 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171700, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 20 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.090205609984e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|