diff --git "a/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json" "b/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json" @@ -0,0 +1,16433 @@ +{ + "best_global_step": 96000, + "best_metric": 3.529139995574951, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_2128/checkpoint-40000", + "epoch": 32.03290814840701, + "eval_steps": 1000, + "global_step": 110000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014561127613722406, + "grad_norm": 1.0827219486236572, + "learning_rate": 0.000294, + "loss": 8.4429, + "step": 50 + }, + { + "epoch": 0.029122255227444813, + "grad_norm": 0.9950307011604309, + "learning_rate": 0.0005939999999999999, + "loss": 6.7538, + "step": 100 + }, + { + "epoch": 0.04368338284116722, + "grad_norm": 0.40593206882476807, + "learning_rate": 0.0005998286713286713, + "loss": 6.3529, + "step": 150 + }, + { + "epoch": 0.058244510454889625, + "grad_norm": 0.5188880562782288, + "learning_rate": 0.0005996538461538461, + "loss": 6.1387, + "step": 200 + }, + { + "epoch": 0.07280563806861204, + "grad_norm": 0.5219882726669312, + "learning_rate": 0.0005994790209790209, + "loss": 5.9936, + "step": 250 + }, + { + "epoch": 0.08736676568233444, + "grad_norm": 0.4981943368911743, + "learning_rate": 0.0005993041958041958, + "loss": 5.8475, + "step": 300 + }, + { + "epoch": 0.10192789329605685, + "grad_norm": 0.419317364692688, + "learning_rate": 0.0005991293706293705, + "loss": 5.731, + "step": 350 + }, + { + "epoch": 0.11648902090977925, + "grad_norm": 0.4203638732433319, + "learning_rate": 0.0005989545454545454, + "loss": 5.6254, + "step": 400 + }, + { + "epoch": 0.13105014852350166, + "grad_norm": 0.5592066645622253, + "learning_rate": 0.0005987797202797202, + "loss": 5.5068, + "step": 450 + }, + { + "epoch": 0.14561127613722408, + "grad_norm": 0.465763658285141, + "learning_rate": 0.000598604895104895, + "loss": 5.4002, + "step": 500 + }, + { + "epoch": 0.16017240375094646, + "grad_norm": 0.4648076295852661, + "learning_rate": 0.0005984300699300698, + "loss": 5.3308, + "step": 550 + }, + { + "epoch": 0.17473353136466888, + "grad_norm": 0.49379315972328186, + "learning_rate": 0.0005982552447552447, + "loss": 5.2609, + "step": 600 + }, + { + "epoch": 0.1892946589783913, + "grad_norm": 0.4596584439277649, + "learning_rate": 0.0005980804195804195, + "loss": 5.1905, + "step": 650 + }, + { + "epoch": 0.2038557865921137, + "grad_norm": 0.40508726239204407, + "learning_rate": 0.0005979055944055943, + "loss": 5.1331, + "step": 700 + }, + { + "epoch": 0.2184169142058361, + "grad_norm": 0.3763967752456665, + "learning_rate": 0.0005977307692307691, + "loss": 5.0753, + "step": 750 + }, + { + "epoch": 0.2329780418195585, + "grad_norm": 0.4820829927921295, + "learning_rate": 0.000597555944055944, + "loss": 5.0201, + "step": 800 + }, + { + "epoch": 0.24753916943328091, + "grad_norm": 0.4278320074081421, + "learning_rate": 0.0005973811188811188, + "loss": 4.9577, + "step": 850 + }, + { + "epoch": 0.2621002970470033, + "grad_norm": 0.4293597340583801, + "learning_rate": 0.0005972062937062936, + "loss": 4.9213, + "step": 900 + }, + { + "epoch": 0.27666142466072574, + "grad_norm": 0.43348240852355957, + "learning_rate": 0.0005970314685314685, + "loss": 4.8786, + "step": 950 + }, + { + "epoch": 0.29122255227444815, + "grad_norm": 0.4520808160305023, + "learning_rate": 0.0005968566433566433, + "loss": 4.8181, + "step": 1000 + }, + { + "epoch": 0.29122255227444815, + "eval_accuracy": 0.2556966724844482, + "eval_loss": 4.745121955871582, + "eval_runtime": 179.4217, + "eval_samples_per_second": 92.776, + "eval_steps_per_second": 5.802, + "step": 1000 + }, + { + "epoch": 0.30578367988817057, + "grad_norm": 0.45122841000556946, + "learning_rate": 0.0005966818181818181, + "loss": 4.7877, + "step": 1050 + }, + { + "epoch": 0.3203448075018929, + "grad_norm": 0.47879835963249207, + "learning_rate": 0.0005965069930069929, + "loss": 4.7428, + "step": 1100 + }, + { + "epoch": 0.33490593511561534, + "grad_norm": 0.4831642210483551, + "learning_rate": 0.0005963321678321677, + "loss": 4.6996, + "step": 1150 + }, + { + "epoch": 0.34946706272933775, + "grad_norm": 0.4561481177806854, + "learning_rate": 0.0005961573426573425, + "loss": 4.6659, + "step": 1200 + }, + { + "epoch": 0.36402819034306017, + "grad_norm": 0.4561339020729065, + "learning_rate": 0.0005959825174825174, + "loss": 4.6367, + "step": 1250 + }, + { + "epoch": 0.3785893179567826, + "grad_norm": 0.4436923861503601, + "learning_rate": 0.0005958076923076922, + "loss": 4.6064, + "step": 1300 + }, + { + "epoch": 0.393150445570505, + "grad_norm": 0.46087032556533813, + "learning_rate": 0.000595632867132867, + "loss": 4.5797, + "step": 1350 + }, + { + "epoch": 0.4077115731842274, + "grad_norm": 0.47251585125923157, + "learning_rate": 0.0005954580419580418, + "loss": 4.547, + "step": 1400 + }, + { + "epoch": 0.4222727007979498, + "grad_norm": 0.43149644136428833, + "learning_rate": 0.0005952832167832168, + "loss": 4.5216, + "step": 1450 + }, + { + "epoch": 0.4368338284116722, + "grad_norm": 0.3600349724292755, + "learning_rate": 0.0005951083916083916, + "loss": 4.5129, + "step": 1500 + }, + { + "epoch": 0.4513949560253946, + "grad_norm": 0.42545634508132935, + "learning_rate": 0.0005949335664335664, + "loss": 4.478, + "step": 1550 + }, + { + "epoch": 0.465956083639117, + "grad_norm": 0.4261489808559418, + "learning_rate": 0.0005947587412587413, + "loss": 4.466, + "step": 1600 + }, + { + "epoch": 0.4805172112528394, + "grad_norm": 0.382684588432312, + "learning_rate": 0.0005945839160839161, + "loss": 4.4463, + "step": 1650 + }, + { + "epoch": 0.49507833886656183, + "grad_norm": 0.4798526465892792, + "learning_rate": 0.0005944090909090909, + "loss": 4.419, + "step": 1700 + }, + { + "epoch": 0.5096394664802842, + "grad_norm": 0.4271828830242157, + "learning_rate": 0.0005942342657342657, + "loss": 4.4065, + "step": 1750 + }, + { + "epoch": 0.5242005940940067, + "grad_norm": 0.4648028016090393, + "learning_rate": 0.0005940594405594406, + "loss": 4.389, + "step": 1800 + }, + { + "epoch": 0.5387617217077291, + "grad_norm": 0.46727654337882996, + "learning_rate": 0.0005938846153846153, + "loss": 4.3739, + "step": 1850 + }, + { + "epoch": 0.5533228493214515, + "grad_norm": 0.4359632432460785, + "learning_rate": 0.0005937097902097902, + "loss": 4.3727, + "step": 1900 + }, + { + "epoch": 0.5678839769351739, + "grad_norm": 0.39883190393447876, + "learning_rate": 0.000593534965034965, + "loss": 4.3559, + "step": 1950 + }, + { + "epoch": 0.5824451045488963, + "grad_norm": 0.4254516661167145, + "learning_rate": 0.0005933601398601398, + "loss": 4.3438, + "step": 2000 + }, + { + "epoch": 0.5824451045488963, + "eval_accuracy": 0.29953294727340574, + "eval_loss": 4.282804489135742, + "eval_runtime": 179.6292, + "eval_samples_per_second": 92.669, + "eval_steps_per_second": 5.795, + "step": 2000 + }, + { + "epoch": 0.5970062321626187, + "grad_norm": 0.39681392908096313, + "learning_rate": 0.0005931853146853146, + "loss": 4.3252, + "step": 2050 + }, + { + "epoch": 0.6115673597763411, + "grad_norm": 0.36488792300224304, + "learning_rate": 0.0005930104895104895, + "loss": 4.3158, + "step": 2100 + }, + { + "epoch": 0.6261284873900634, + "grad_norm": 0.4375183582305908, + "learning_rate": 0.0005928356643356643, + "loss": 4.299, + "step": 2150 + }, + { + "epoch": 0.6406896150037859, + "grad_norm": 0.38287097215652466, + "learning_rate": 0.0005926608391608391, + "loss": 4.2941, + "step": 2200 + }, + { + "epoch": 0.6552507426175083, + "grad_norm": 0.3945271968841553, + "learning_rate": 0.000592486013986014, + "loss": 4.2685, + "step": 2250 + }, + { + "epoch": 0.6698118702312307, + "grad_norm": 0.3807995617389679, + "learning_rate": 0.0005923111888111888, + "loss": 4.2773, + "step": 2300 + }, + { + "epoch": 0.6843729978449531, + "grad_norm": 0.3736141324043274, + "learning_rate": 0.0005921363636363636, + "loss": 4.2439, + "step": 2350 + }, + { + "epoch": 0.6989341254586755, + "grad_norm": 0.37925609946250916, + "learning_rate": 0.0005919615384615384, + "loss": 4.2377, + "step": 2400 + }, + { + "epoch": 0.7134952530723979, + "grad_norm": 0.40228238701820374, + "learning_rate": 0.0005917867132867133, + "loss": 4.2397, + "step": 2450 + }, + { + "epoch": 0.7280563806861203, + "grad_norm": 0.3505542278289795, + "learning_rate": 0.0005916118881118881, + "loss": 4.2359, + "step": 2500 + }, + { + "epoch": 0.7426175082998427, + "grad_norm": 0.40058302879333496, + "learning_rate": 0.0005914370629370629, + "loss": 4.2241, + "step": 2550 + }, + { + "epoch": 0.7571786359135652, + "grad_norm": 0.3788367509841919, + "learning_rate": 0.0005912622377622377, + "loss": 4.2107, + "step": 2600 + }, + { + "epoch": 0.7717397635272876, + "grad_norm": 0.3747999668121338, + "learning_rate": 0.0005910874125874125, + "loss": 4.2, + "step": 2650 + }, + { + "epoch": 0.78630089114101, + "grad_norm": 0.40086600184440613, + "learning_rate": 0.0005909125874125873, + "loss": 4.1915, + "step": 2700 + }, + { + "epoch": 0.8008620187547324, + "grad_norm": 0.36495792865753174, + "learning_rate": 0.0005907377622377622, + "loss": 4.1941, + "step": 2750 + }, + { + "epoch": 0.8154231463684548, + "grad_norm": 0.3766659200191498, + "learning_rate": 0.000590562937062937, + "loss": 4.1739, + "step": 2800 + }, + { + "epoch": 0.8299842739821772, + "grad_norm": 0.3640320301055908, + "learning_rate": 0.0005903881118881118, + "loss": 4.1626, + "step": 2850 + }, + { + "epoch": 0.8445454015958996, + "grad_norm": 0.3703969717025757, + "learning_rate": 0.0005902132867132867, + "loss": 4.1557, + "step": 2900 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 0.3352505564689636, + "learning_rate": 0.0005900384615384615, + "loss": 4.1426, + "step": 2950 + }, + { + "epoch": 0.8736676568233444, + "grad_norm": 0.3644249141216278, + "learning_rate": 0.0005898636363636363, + "loss": 4.1483, + "step": 3000 + }, + { + "epoch": 0.8736676568233444, + "eval_accuracy": 0.31578797630784283, + "eval_loss": 4.0948615074157715, + "eval_runtime": 179.7252, + "eval_samples_per_second": 92.619, + "eval_steps_per_second": 5.792, + "step": 3000 + }, + { + "epoch": 0.8882287844370668, + "grad_norm": 0.3400101065635681, + "learning_rate": 0.0005896888111888111, + "loss": 4.1436, + "step": 3050 + }, + { + "epoch": 0.9027899120507892, + "grad_norm": 0.3571796417236328, + "learning_rate": 0.000589513986013986, + "loss": 4.1302, + "step": 3100 + }, + { + "epoch": 0.9173510396645116, + "grad_norm": 0.34732452034950256, + "learning_rate": 0.0005893391608391608, + "loss": 4.1203, + "step": 3150 + }, + { + "epoch": 0.931912167278234, + "grad_norm": 0.36288565397262573, + "learning_rate": 0.0005891643356643356, + "loss": 4.1241, + "step": 3200 + }, + { + "epoch": 0.9464732948919564, + "grad_norm": 0.34131136536598206, + "learning_rate": 0.0005889895104895104, + "loss": 4.1136, + "step": 3250 + }, + { + "epoch": 0.9610344225056788, + "grad_norm": 0.35798367857933044, + "learning_rate": 0.0005888146853146853, + "loss": 4.1029, + "step": 3300 + }, + { + "epoch": 0.9755955501194012, + "grad_norm": 0.3709186613559723, + "learning_rate": 0.00058863986013986, + "loss": 4.0891, + "step": 3350 + }, + { + "epoch": 0.9901566777331237, + "grad_norm": 0.3378744423389435, + "learning_rate": 0.0005884650349650349, + "loss": 4.0959, + "step": 3400 + }, + { + "epoch": 1.004659560836391, + "grad_norm": 0.3469085097312927, + "learning_rate": 0.0005882902097902097, + "loss": 4.0733, + "step": 3450 + }, + { + "epoch": 1.0192206884501136, + "grad_norm": 0.3355250954627991, + "learning_rate": 0.0005881153846153845, + "loss": 4.0135, + "step": 3500 + }, + { + "epoch": 1.033781816063836, + "grad_norm": 0.34765860438346863, + "learning_rate": 0.0005879405594405594, + "loss": 4.0131, + "step": 3550 + }, + { + "epoch": 1.0483429436775584, + "grad_norm": 0.3484998941421509, + "learning_rate": 0.0005877657342657342, + "loss": 4.0352, + "step": 3600 + }, + { + "epoch": 1.0629040712912807, + "grad_norm": 0.34341979026794434, + "learning_rate": 0.000587590909090909, + "loss": 4.0047, + "step": 3650 + }, + { + "epoch": 1.0774651989050033, + "grad_norm": 0.36538752913475037, + "learning_rate": 0.0005874160839160838, + "loss": 4.0016, + "step": 3700 + }, + { + "epoch": 1.0920263265187256, + "grad_norm": 0.3458220064640045, + "learning_rate": 0.0005872412587412587, + "loss": 4.0163, + "step": 3750 + }, + { + "epoch": 1.106587454132448, + "grad_norm": 0.3493204414844513, + "learning_rate": 0.0005870664335664335, + "loss": 4.0035, + "step": 3800 + }, + { + "epoch": 1.1211485817461704, + "grad_norm": 0.3274590075016022, + "learning_rate": 0.0005868916083916083, + "loss": 4.0167, + "step": 3850 + }, + { + "epoch": 1.135709709359893, + "grad_norm": 0.3461831510066986, + "learning_rate": 0.0005867167832167831, + "loss": 3.99, + "step": 3900 + }, + { + "epoch": 1.1502708369736152, + "grad_norm": 0.3442121148109436, + "learning_rate": 0.000586541958041958, + "loss": 3.9825, + "step": 3950 + }, + { + "epoch": 1.1648319645873377, + "grad_norm": 0.3337996006011963, + "learning_rate": 0.0005863671328671328, + "loss": 3.9794, + "step": 4000 + }, + { + "epoch": 1.1648319645873377, + "eval_accuracy": 0.32491283320475906, + "eval_loss": 3.989677906036377, + "eval_runtime": 179.7843, + "eval_samples_per_second": 92.589, + "eval_steps_per_second": 5.79, + "step": 4000 + }, + { + "epoch": 1.17939309220106, + "grad_norm": 0.33036714792251587, + "learning_rate": 0.0005861923076923076, + "loss": 3.9821, + "step": 4050 + }, + { + "epoch": 1.1939542198147826, + "grad_norm": 0.33033114671707153, + "learning_rate": 0.0005860174825174824, + "loss": 3.9925, + "step": 4100 + }, + { + "epoch": 1.2085153474285049, + "grad_norm": 0.3445809781551361, + "learning_rate": 0.0005858426573426573, + "loss": 3.9873, + "step": 4150 + }, + { + "epoch": 1.2230764750422272, + "grad_norm": 0.32692384719848633, + "learning_rate": 0.000585667832167832, + "loss": 3.9814, + "step": 4200 + }, + { + "epoch": 1.2376376026559497, + "grad_norm": 0.3487424850463867, + "learning_rate": 0.000585493006993007, + "loss": 3.9712, + "step": 4250 + }, + { + "epoch": 1.2521987302696722, + "grad_norm": 0.345749169588089, + "learning_rate": 0.0005853181818181817, + "loss": 3.9784, + "step": 4300 + }, + { + "epoch": 1.2667598578833945, + "grad_norm": 0.36335498094558716, + "learning_rate": 0.0005851433566433565, + "loss": 3.9808, + "step": 4350 + }, + { + "epoch": 1.2813209854971168, + "grad_norm": 0.31872642040252686, + "learning_rate": 0.0005849685314685315, + "loss": 3.9746, + "step": 4400 + }, + { + "epoch": 1.2958821131108393, + "grad_norm": 0.357146680355072, + "learning_rate": 0.0005847937062937063, + "loss": 3.9645, + "step": 4450 + }, + { + "epoch": 1.3104432407245616, + "grad_norm": 0.325870543718338, + "learning_rate": 0.0005846188811188811, + "loss": 3.9639, + "step": 4500 + }, + { + "epoch": 1.3250043683382842, + "grad_norm": 0.3136429488658905, + "learning_rate": 0.0005844440559440559, + "loss": 3.9582, + "step": 4550 + }, + { + "epoch": 1.3395654959520065, + "grad_norm": 0.35432639718055725, + "learning_rate": 0.0005842692307692308, + "loss": 3.9456, + "step": 4600 + }, + { + "epoch": 1.354126623565729, + "grad_norm": 0.3514183759689331, + "learning_rate": 0.0005840944055944056, + "loss": 3.9475, + "step": 4650 + }, + { + "epoch": 1.3686877511794513, + "grad_norm": 0.33868497610092163, + "learning_rate": 0.0005839195804195804, + "loss": 3.9486, + "step": 4700 + }, + { + "epoch": 1.3832488787931738, + "grad_norm": 0.3391216993331909, + "learning_rate": 0.0005837447552447552, + "loss": 3.9525, + "step": 4750 + }, + { + "epoch": 1.3978100064068961, + "grad_norm": 0.34010815620422363, + "learning_rate": 0.0005835699300699301, + "loss": 3.947, + "step": 4800 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 0.3243875205516815, + "learning_rate": 0.0005833951048951048, + "loss": 3.9515, + "step": 4850 + }, + { + "epoch": 1.426932261634341, + "grad_norm": 0.35085731744766235, + "learning_rate": 0.0005832202797202797, + "loss": 3.9402, + "step": 4900 + }, + { + "epoch": 1.4414933892480635, + "grad_norm": 0.34375637769699097, + "learning_rate": 0.0005830454545454546, + "loss": 3.9424, + "step": 4950 + }, + { + "epoch": 1.4560545168617858, + "grad_norm": 0.3360918164253235, + "learning_rate": 0.0005828706293706293, + "loss": 3.946, + "step": 5000 + }, + { + "epoch": 1.4560545168617858, + "eval_accuracy": 0.3315629972163526, + "eval_loss": 3.9168527126312256, + "eval_runtime": 179.6234, + "eval_samples_per_second": 92.672, + "eval_steps_per_second": 5.795, + "step": 5000 + }, + { + "epoch": 1.470615644475508, + "grad_norm": 0.3384229838848114, + "learning_rate": 0.0005826958041958042, + "loss": 3.9282, + "step": 5050 + }, + { + "epoch": 1.4851767720892306, + "grad_norm": 0.3160015642642975, + "learning_rate": 0.000582520979020979, + "loss": 3.9223, + "step": 5100 + }, + { + "epoch": 1.4997378997029531, + "grad_norm": 0.31337279081344604, + "learning_rate": 0.0005823461538461538, + "loss": 3.9139, + "step": 5150 + }, + { + "epoch": 1.5142990273166754, + "grad_norm": 0.3430108428001404, + "learning_rate": 0.0005821713286713286, + "loss": 3.9192, + "step": 5200 + }, + { + "epoch": 1.5288601549303977, + "grad_norm": 0.32244783639907837, + "learning_rate": 0.0005819965034965035, + "loss": 3.9181, + "step": 5250 + }, + { + "epoch": 1.5434212825441203, + "grad_norm": 0.32754674553871155, + "learning_rate": 0.0005818216783216783, + "loss": 3.9076, + "step": 5300 + }, + { + "epoch": 1.5579824101578428, + "grad_norm": 0.3257962167263031, + "learning_rate": 0.0005816468531468531, + "loss": 3.9091, + "step": 5350 + }, + { + "epoch": 1.572543537771565, + "grad_norm": 0.319021999835968, + "learning_rate": 0.0005814720279720279, + "loss": 3.8997, + "step": 5400 + }, + { + "epoch": 1.5871046653852874, + "grad_norm": 0.34583571553230286, + "learning_rate": 0.0005812972027972028, + "loss": 3.9082, + "step": 5450 + }, + { + "epoch": 1.6016657929990097, + "grad_norm": 0.31768912076950073, + "learning_rate": 0.0005811223776223776, + "loss": 3.9119, + "step": 5500 + }, + { + "epoch": 1.6162269206127322, + "grad_norm": 0.30981358885765076, + "learning_rate": 0.0005809475524475524, + "loss": 3.8991, + "step": 5550 + }, + { + "epoch": 1.6307880482264547, + "grad_norm": 0.3583605885505676, + "learning_rate": 0.0005807727272727272, + "loss": 3.8898, + "step": 5600 + }, + { + "epoch": 1.645349175840177, + "grad_norm": 0.35432425141334534, + "learning_rate": 0.0005805979020979021, + "loss": 3.9067, + "step": 5650 + }, + { + "epoch": 1.6599103034538993, + "grad_norm": 0.32656440138816833, + "learning_rate": 0.0005804230769230769, + "loss": 3.8878, + "step": 5700 + }, + { + "epoch": 1.6744714310676219, + "grad_norm": 0.32895249128341675, + "learning_rate": 0.0005802482517482517, + "loss": 3.8858, + "step": 5750 + }, + { + "epoch": 1.6890325586813444, + "grad_norm": 0.3573879897594452, + "learning_rate": 0.0005800734265734265, + "loss": 3.8995, + "step": 5800 + }, + { + "epoch": 1.7035936862950667, + "grad_norm": 0.3116515278816223, + "learning_rate": 0.0005798986013986013, + "loss": 3.8855, + "step": 5850 + }, + { + "epoch": 1.718154813908789, + "grad_norm": 0.32921165227890015, + "learning_rate": 0.0005797237762237762, + "loss": 3.8858, + "step": 5900 + }, + { + "epoch": 1.7327159415225115, + "grad_norm": 0.32322996854782104, + "learning_rate": 0.000579548951048951, + "loss": 3.8747, + "step": 5950 + }, + { + "epoch": 1.747277069136234, + "grad_norm": 0.3198484778404236, + "learning_rate": 0.0005793741258741258, + "loss": 3.8796, + "step": 6000 + }, + { + "epoch": 1.747277069136234, + "eval_accuracy": 0.33665428105410394, + "eval_loss": 3.859868049621582, + "eval_runtime": 179.7598, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 6000 + }, + { + "epoch": 1.7618381967499563, + "grad_norm": 0.32858818769454956, + "learning_rate": 0.0005791993006993006, + "loss": 3.8737, + "step": 6050 + }, + { + "epoch": 1.7763993243636786, + "grad_norm": 0.31307506561279297, + "learning_rate": 0.0005790244755244755, + "loss": 3.8731, + "step": 6100 + }, + { + "epoch": 1.7909604519774012, + "grad_norm": 0.32378000020980835, + "learning_rate": 0.0005788496503496503, + "loss": 3.8751, + "step": 6150 + }, + { + "epoch": 1.8055215795911237, + "grad_norm": 0.3218482434749603, + "learning_rate": 0.0005786748251748251, + "loss": 3.8731, + "step": 6200 + }, + { + "epoch": 1.820082707204846, + "grad_norm": 0.3510587215423584, + "learning_rate": 0.0005784999999999999, + "loss": 3.8621, + "step": 6250 + }, + { + "epoch": 1.8346438348185683, + "grad_norm": 0.32646113634109497, + "learning_rate": 0.0005783251748251748, + "loss": 3.8652, + "step": 6300 + }, + { + "epoch": 1.8492049624322906, + "grad_norm": 0.34067031741142273, + "learning_rate": 0.0005781503496503496, + "loss": 3.8638, + "step": 6350 + }, + { + "epoch": 1.8637660900460131, + "grad_norm": 0.327680766582489, + "learning_rate": 0.0005779755244755244, + "loss": 3.8617, + "step": 6400 + }, + { + "epoch": 1.8783272176597356, + "grad_norm": 0.31625163555145264, + "learning_rate": 0.0005778006993006993, + "loss": 3.8561, + "step": 6450 + }, + { + "epoch": 1.892888345273458, + "grad_norm": 0.312741219997406, + "learning_rate": 0.000577625874125874, + "loss": 3.842, + "step": 6500 + }, + { + "epoch": 1.9074494728871803, + "grad_norm": 0.32632362842559814, + "learning_rate": 0.0005774510489510489, + "loss": 3.8528, + "step": 6550 + }, + { + "epoch": 1.9220106005009028, + "grad_norm": 0.32156306505203247, + "learning_rate": 0.0005772762237762237, + "loss": 3.8587, + "step": 6600 + }, + { + "epoch": 1.9365717281146253, + "grad_norm": 0.3177630305290222, + "learning_rate": 0.0005771013986013985, + "loss": 3.8592, + "step": 6650 + }, + { + "epoch": 1.9511328557283476, + "grad_norm": 0.3381432890892029, + "learning_rate": 0.0005769265734265733, + "loss": 3.8487, + "step": 6700 + }, + { + "epoch": 1.96569398334207, + "grad_norm": 0.31193795800209045, + "learning_rate": 0.0005767517482517482, + "loss": 3.8599, + "step": 6750 + }, + { + "epoch": 1.9802551109557924, + "grad_norm": 0.33586713671684265, + "learning_rate": 0.000576576923076923, + "loss": 3.8439, + "step": 6800 + }, + { + "epoch": 1.994816238569515, + "grad_norm": 0.3259575068950653, + "learning_rate": 0.0005764020979020978, + "loss": 3.8482, + "step": 6850 + }, + { + "epoch": 2.009319121672782, + "grad_norm": 0.3125501275062561, + "learning_rate": 0.0005762272727272726, + "loss": 3.7815, + "step": 6900 + }, + { + "epoch": 2.023880249286505, + "grad_norm": 0.3336809575557709, + "learning_rate": 0.0005760524475524475, + "loss": 3.7473, + "step": 6950 + }, + { + "epoch": 2.038441376900227, + "grad_norm": 0.3166639804840088, + "learning_rate": 0.0005758776223776223, + "loss": 3.7474, + "step": 7000 + }, + { + "epoch": 2.038441376900227, + "eval_accuracy": 0.34135384628406934, + "eval_loss": 3.8145618438720703, + "eval_runtime": 179.8334, + "eval_samples_per_second": 92.563, + "eval_steps_per_second": 5.789, + "step": 7000 + }, + { + "epoch": 2.0530025045139495, + "grad_norm": 0.3304164409637451, + "learning_rate": 0.0005757027972027971, + "loss": 3.7559, + "step": 7050 + }, + { + "epoch": 2.067563632127672, + "grad_norm": 0.35328182578086853, + "learning_rate": 0.000575527972027972, + "loss": 3.741, + "step": 7100 + }, + { + "epoch": 2.0821247597413945, + "grad_norm": 0.3486672043800354, + "learning_rate": 0.0005753531468531468, + "loss": 3.751, + "step": 7150 + }, + { + "epoch": 2.096685887355117, + "grad_norm": 0.32075631618499756, + "learning_rate": 0.0005751783216783216, + "loss": 3.7516, + "step": 7200 + }, + { + "epoch": 2.111247014968839, + "grad_norm": 0.3235573172569275, + "learning_rate": 0.0005750034965034964, + "loss": 3.7561, + "step": 7250 + }, + { + "epoch": 2.1258081425825615, + "grad_norm": 0.32960283756256104, + "learning_rate": 0.0005748286713286712, + "loss": 3.7471, + "step": 7300 + }, + { + "epoch": 2.140369270196284, + "grad_norm": 0.3249431848526001, + "learning_rate": 0.000574653846153846, + "loss": 3.7479, + "step": 7350 + }, + { + "epoch": 2.1549303978100065, + "grad_norm": 0.32068416476249695, + "learning_rate": 0.000574479020979021, + "loss": 3.7515, + "step": 7400 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.35874906182289124, + "learning_rate": 0.0005743041958041958, + "loss": 3.7665, + "step": 7450 + }, + { + "epoch": 2.184052653037451, + "grad_norm": 0.34327706694602966, + "learning_rate": 0.0005741293706293706, + "loss": 3.7511, + "step": 7500 + }, + { + "epoch": 2.198613780651174, + "grad_norm": 0.3151525855064392, + "learning_rate": 0.0005739545454545454, + "loss": 3.7454, + "step": 7550 + }, + { + "epoch": 2.213174908264896, + "grad_norm": 0.3023368716239929, + "learning_rate": 0.0005737797202797203, + "loss": 3.7624, + "step": 7600 + }, + { + "epoch": 2.2277360358786185, + "grad_norm": 0.3228301703929901, + "learning_rate": 0.0005736048951048951, + "loss": 3.7529, + "step": 7650 + }, + { + "epoch": 2.2422971634923408, + "grad_norm": 0.33145347237586975, + "learning_rate": 0.0005734300699300699, + "loss": 3.76, + "step": 7700 + }, + { + "epoch": 2.256858291106063, + "grad_norm": 0.31790366768836975, + "learning_rate": 0.0005732552447552448, + "loss": 3.7657, + "step": 7750 + }, + { + "epoch": 2.271419418719786, + "grad_norm": 0.32009178400039673, + "learning_rate": 0.0005730804195804196, + "loss": 3.7592, + "step": 7800 + }, + { + "epoch": 2.285980546333508, + "grad_norm": 0.31966885924339294, + "learning_rate": 0.0005729055944055944, + "loss": 3.7606, + "step": 7850 + }, + { + "epoch": 2.3005416739472304, + "grad_norm": 0.3291054368019104, + "learning_rate": 0.0005727307692307692, + "loss": 3.7479, + "step": 7900 + }, + { + "epoch": 2.3151028015609527, + "grad_norm": 0.33194002509117126, + "learning_rate": 0.0005725559440559441, + "loss": 3.757, + "step": 7950 + }, + { + "epoch": 2.3296639291746755, + "grad_norm": 0.30678218603134155, + "learning_rate": 0.0005723811188811188, + "loss": 3.7545, + "step": 8000 + }, + { + "epoch": 2.3296639291746755, + "eval_accuracy": 0.34440224469340025, + "eval_loss": 3.782811164855957, + "eval_runtime": 179.7459, + "eval_samples_per_second": 92.609, + "eval_steps_per_second": 5.792, + "step": 8000 + }, + { + "epoch": 2.3442250567883978, + "grad_norm": 0.31450313329696655, + "learning_rate": 0.0005722062937062937, + "loss": 3.7648, + "step": 8050 + }, + { + "epoch": 2.35878618440212, + "grad_norm": 0.3125315308570862, + "learning_rate": 0.0005720314685314685, + "loss": 3.7461, + "step": 8100 + }, + { + "epoch": 2.3733473120158424, + "grad_norm": 0.3463304936885834, + "learning_rate": 0.0005718566433566433, + "loss": 3.7542, + "step": 8150 + }, + { + "epoch": 2.387908439629565, + "grad_norm": 0.3375414311885834, + "learning_rate": 0.0005716818181818181, + "loss": 3.7424, + "step": 8200 + }, + { + "epoch": 2.4024695672432874, + "grad_norm": 0.3216915428638458, + "learning_rate": 0.000571506993006993, + "loss": 3.7559, + "step": 8250 + }, + { + "epoch": 2.4170306948570097, + "grad_norm": 0.37400275468826294, + "learning_rate": 0.0005713321678321678, + "loss": 3.7556, + "step": 8300 + }, + { + "epoch": 2.431591822470732, + "grad_norm": 0.3273051977157593, + "learning_rate": 0.0005711573426573426, + "loss": 3.7541, + "step": 8350 + }, + { + "epoch": 2.4461529500844543, + "grad_norm": 0.31118476390838623, + "learning_rate": 0.0005709825174825175, + "loss": 3.7479, + "step": 8400 + }, + { + "epoch": 2.460714077698177, + "grad_norm": 0.33436667919158936, + "learning_rate": 0.0005708076923076923, + "loss": 3.7398, + "step": 8450 + }, + { + "epoch": 2.4752752053118994, + "grad_norm": 0.32443201541900635, + "learning_rate": 0.0005706328671328671, + "loss": 3.7483, + "step": 8500 + }, + { + "epoch": 2.4898363329256217, + "grad_norm": 0.3430940806865692, + "learning_rate": 0.0005704580419580419, + "loss": 3.75, + "step": 8550 + }, + { + "epoch": 2.5043974605393444, + "grad_norm": 0.31686174869537354, + "learning_rate": 0.0005702832167832168, + "loss": 3.7418, + "step": 8600 + }, + { + "epoch": 2.5189585881530667, + "grad_norm": 0.3173408508300781, + "learning_rate": 0.0005701083916083916, + "loss": 3.7437, + "step": 8650 + }, + { + "epoch": 2.533519715766789, + "grad_norm": 0.3175743818283081, + "learning_rate": 0.0005699335664335664, + "loss": 3.7417, + "step": 8700 + }, + { + "epoch": 2.5480808433805113, + "grad_norm": 0.3153781592845917, + "learning_rate": 0.0005697587412587412, + "loss": 3.7459, + "step": 8750 + }, + { + "epoch": 2.5626419709942336, + "grad_norm": 0.3198295831680298, + "learning_rate": 0.000569583916083916, + "loss": 3.7524, + "step": 8800 + }, + { + "epoch": 2.5772030986079564, + "grad_norm": 0.31497374176979065, + "learning_rate": 0.0005694090909090908, + "loss": 3.7366, + "step": 8850 + }, + { + "epoch": 2.5917642262216787, + "grad_norm": 0.3190245032310486, + "learning_rate": 0.0005692342657342657, + "loss": 3.7408, + "step": 8900 + }, + { + "epoch": 2.606325353835401, + "grad_norm": 0.3084900975227356, + "learning_rate": 0.0005690594405594405, + "loss": 3.7355, + "step": 8950 + }, + { + "epoch": 2.6208864814491233, + "grad_norm": 0.3053756356239319, + "learning_rate": 0.0005688846153846153, + "loss": 3.7487, + "step": 9000 + }, + { + "epoch": 2.6208864814491233, + "eval_accuracy": 0.34699638118781967, + "eval_loss": 3.7568321228027344, + "eval_runtime": 179.8658, + "eval_samples_per_second": 92.547, + "eval_steps_per_second": 5.788, + "step": 9000 + }, + { + "epoch": 2.6354476090628456, + "grad_norm": 0.3176893889904022, + "learning_rate": 0.0005687097902097901, + "loss": 3.7455, + "step": 9050 + }, + { + "epoch": 2.6500087366765683, + "grad_norm": 0.3208650052547455, + "learning_rate": 0.000568534965034965, + "loss": 3.7457, + "step": 9100 + }, + { + "epoch": 2.6645698642902906, + "grad_norm": 0.3182576596736908, + "learning_rate": 0.0005683601398601398, + "loss": 3.7312, + "step": 9150 + }, + { + "epoch": 2.679130991904013, + "grad_norm": 0.31629255414009094, + "learning_rate": 0.0005681853146853146, + "loss": 3.7155, + "step": 9200 + }, + { + "epoch": 2.6936921195177357, + "grad_norm": 0.33148428797721863, + "learning_rate": 0.0005680104895104895, + "loss": 3.7379, + "step": 9250 + }, + { + "epoch": 2.708253247131458, + "grad_norm": 0.3020288646221161, + "learning_rate": 0.0005678356643356643, + "loss": 3.7264, + "step": 9300 + }, + { + "epoch": 2.7228143747451803, + "grad_norm": 0.34346917271614075, + "learning_rate": 0.0005676608391608391, + "loss": 3.7374, + "step": 9350 + }, + { + "epoch": 2.7373755023589026, + "grad_norm": 0.31063133478164673, + "learning_rate": 0.0005674860139860139, + "loss": 3.7298, + "step": 9400 + }, + { + "epoch": 2.751936629972625, + "grad_norm": 0.31841859221458435, + "learning_rate": 0.0005673111888111888, + "loss": 3.7237, + "step": 9450 + }, + { + "epoch": 2.7664977575863476, + "grad_norm": 0.3212113082408905, + "learning_rate": 0.0005671363636363635, + "loss": 3.7389, + "step": 9500 + }, + { + "epoch": 2.78105888520007, + "grad_norm": 0.319784551858902, + "learning_rate": 0.0005669615384615384, + "loss": 3.7401, + "step": 9550 + }, + { + "epoch": 2.7956200128137922, + "grad_norm": 0.31253302097320557, + "learning_rate": 0.0005667867132867132, + "loss": 3.7299, + "step": 9600 + }, + { + "epoch": 2.8101811404275145, + "grad_norm": 0.3241884708404541, + "learning_rate": 0.000566611888111888, + "loss": 3.7281, + "step": 9650 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.3327905833721161, + "learning_rate": 0.0005664370629370628, + "loss": 3.7403, + "step": 9700 + }, + { + "epoch": 2.8393033956549596, + "grad_norm": 0.33363252878189087, + "learning_rate": 0.0005662622377622377, + "loss": 3.7429, + "step": 9750 + }, + { + "epoch": 2.853864523268682, + "grad_norm": 0.3250058591365814, + "learning_rate": 0.0005660874125874125, + "loss": 3.7313, + "step": 9800 + }, + { + "epoch": 2.868425650882404, + "grad_norm": 0.3366358280181885, + "learning_rate": 0.0005659125874125873, + "loss": 3.732, + "step": 9850 + }, + { + "epoch": 2.882986778496127, + "grad_norm": 0.3395000100135803, + "learning_rate": 0.0005657377622377622, + "loss": 3.7283, + "step": 9900 + }, + { + "epoch": 2.8975479061098492, + "grad_norm": 0.30396348237991333, + "learning_rate": 0.000565562937062937, + "loss": 3.7282, + "step": 9950 + }, + { + "epoch": 2.9121090337235715, + "grad_norm": 0.310280442237854, + "learning_rate": 0.0005653881118881118, + "loss": 3.7222, + "step": 10000 + }, + { + "epoch": 2.9121090337235715, + "eval_accuracy": 0.3495220962447447, + "eval_loss": 3.729001045227051, + "eval_runtime": 179.9042, + "eval_samples_per_second": 92.527, + "eval_steps_per_second": 5.786, + "step": 10000 + }, + { + "epoch": 2.926670161337294, + "grad_norm": 0.3250355124473572, + "learning_rate": 0.0005652132867132866, + "loss": 3.7166, + "step": 10050 + }, + { + "epoch": 2.941231288951016, + "grad_norm": 0.30567246675491333, + "learning_rate": 0.0005650384615384615, + "loss": 3.7325, + "step": 10100 + }, + { + "epoch": 2.955792416564739, + "grad_norm": 0.34791237115859985, + "learning_rate": 0.0005648636363636363, + "loss": 3.7056, + "step": 10150 + }, + { + "epoch": 2.970353544178461, + "grad_norm": 0.31332409381866455, + "learning_rate": 0.0005646888111888111, + "loss": 3.7251, + "step": 10200 + }, + { + "epoch": 2.9849146717921835, + "grad_norm": 0.2971247136592865, + "learning_rate": 0.000564513986013986, + "loss": 3.7126, + "step": 10250 + }, + { + "epoch": 2.9994757994059063, + "grad_norm": 0.32203900814056396, + "learning_rate": 0.0005643391608391607, + "loss": 3.7198, + "step": 10300 + }, + { + "epoch": 3.0139786825091734, + "grad_norm": 0.3143203854560852, + "learning_rate": 0.0005641643356643355, + "loss": 3.6176, + "step": 10350 + }, + { + "epoch": 3.0285398101228957, + "grad_norm": 0.33899393677711487, + "learning_rate": 0.0005639895104895105, + "loss": 3.6184, + "step": 10400 + }, + { + "epoch": 3.0431009377366185, + "grad_norm": 0.33629149198532104, + "learning_rate": 0.0005638146853146853, + "loss": 3.6218, + "step": 10450 + }, + { + "epoch": 3.057662065350341, + "grad_norm": 0.33977800607681274, + "learning_rate": 0.0005636398601398601, + "loss": 3.6169, + "step": 10500 + }, + { + "epoch": 3.072223192964063, + "grad_norm": 0.3242505192756653, + "learning_rate": 0.000563465034965035, + "loss": 3.6248, + "step": 10550 + }, + { + "epoch": 3.0867843205777854, + "grad_norm": 0.33569052815437317, + "learning_rate": 0.0005632902097902098, + "loss": 3.6438, + "step": 10600 + }, + { + "epoch": 3.101345448191508, + "grad_norm": 0.3249237835407257, + "learning_rate": 0.0005631153846153846, + "loss": 3.6286, + "step": 10650 + }, + { + "epoch": 3.1159065758052304, + "grad_norm": 0.3126699924468994, + "learning_rate": 0.0005629405594405594, + "loss": 3.6282, + "step": 10700 + }, + { + "epoch": 3.1304677034189528, + "grad_norm": 0.3072546720504761, + "learning_rate": 0.0005627657342657343, + "loss": 3.6303, + "step": 10750 + }, + { + "epoch": 3.145028831032675, + "grad_norm": 0.30215486884117126, + "learning_rate": 0.0005625909090909091, + "loss": 3.6246, + "step": 10800 + }, + { + "epoch": 3.1595899586463974, + "grad_norm": 0.30103379487991333, + "learning_rate": 0.0005624160839160839, + "loss": 3.632, + "step": 10850 + }, + { + "epoch": 3.17415108626012, + "grad_norm": 0.40593844652175903, + "learning_rate": 0.0005622412587412587, + "loss": 3.6411, + "step": 10900 + }, + { + "epoch": 3.1887122138738424, + "grad_norm": 0.30845344066619873, + "learning_rate": 0.0005620664335664336, + "loss": 3.6405, + "step": 10950 + }, + { + "epoch": 3.2032733414875647, + "grad_norm": 0.31571993231773376, + "learning_rate": 0.0005618916083916083, + "loss": 3.6434, + "step": 11000 + }, + { + "epoch": 3.2032733414875647, + "eval_accuracy": 0.3514291968616427, + "eval_loss": 3.715721607208252, + "eval_runtime": 179.7371, + "eval_samples_per_second": 92.613, + "eval_steps_per_second": 5.792, + "step": 11000 + }, + { + "epoch": 3.217834469101287, + "grad_norm": 0.3318782448768616, + "learning_rate": 0.0005617167832167832, + "loss": 3.6317, + "step": 11050 + }, + { + "epoch": 3.2323955967150098, + "grad_norm": 0.33287033438682556, + "learning_rate": 0.000561541958041958, + "loss": 3.6503, + "step": 11100 + }, + { + "epoch": 3.246956724328732, + "grad_norm": 0.3447157144546509, + "learning_rate": 0.0005613671328671328, + "loss": 3.6464, + "step": 11150 + }, + { + "epoch": 3.2615178519424544, + "grad_norm": 0.31866371631622314, + "learning_rate": 0.0005611923076923077, + "loss": 3.6459, + "step": 11200 + }, + { + "epoch": 3.2760789795561767, + "grad_norm": 0.3190111517906189, + "learning_rate": 0.0005610174825174825, + "loss": 3.6375, + "step": 11250 + }, + { + "epoch": 3.2906401071698994, + "grad_norm": 0.3384534418582916, + "learning_rate": 0.0005608426573426573, + "loss": 3.6297, + "step": 11300 + }, + { + "epoch": 3.3052012347836217, + "grad_norm": 0.3122884929180145, + "learning_rate": 0.0005606678321678321, + "loss": 3.6488, + "step": 11350 + }, + { + "epoch": 3.319762362397344, + "grad_norm": 0.3280264139175415, + "learning_rate": 0.000560493006993007, + "loss": 3.6409, + "step": 11400 + }, + { + "epoch": 3.3343234900110663, + "grad_norm": 0.3291660249233246, + "learning_rate": 0.0005603181818181818, + "loss": 3.6371, + "step": 11450 + }, + { + "epoch": 3.3488846176247886, + "grad_norm": 0.3122524619102478, + "learning_rate": 0.0005601433566433566, + "loss": 3.6437, + "step": 11500 + }, + { + "epoch": 3.3634457452385114, + "grad_norm": 0.3195066452026367, + "learning_rate": 0.0005599685314685314, + "loss": 3.654, + "step": 11550 + }, + { + "epoch": 3.3780068728522337, + "grad_norm": 0.32396697998046875, + "learning_rate": 0.0005597937062937063, + "loss": 3.6451, + "step": 11600 + }, + { + "epoch": 3.392568000465956, + "grad_norm": 0.31407713890075684, + "learning_rate": 0.0005596188811188811, + "loss": 3.6336, + "step": 11650 + }, + { + "epoch": 3.4071291280796787, + "grad_norm": 0.31519898772239685, + "learning_rate": 0.0005594440559440559, + "loss": 3.6432, + "step": 11700 + }, + { + "epoch": 3.421690255693401, + "grad_norm": 0.33295854926109314, + "learning_rate": 0.0005592692307692307, + "loss": 3.6271, + "step": 11750 + }, + { + "epoch": 3.4362513833071233, + "grad_norm": 0.3175846338272095, + "learning_rate": 0.0005590944055944055, + "loss": 3.6419, + "step": 11800 + }, + { + "epoch": 3.4508125109208456, + "grad_norm": 0.3179056942462921, + "learning_rate": 0.0005589195804195803, + "loss": 3.649, + "step": 11850 + }, + { + "epoch": 3.465373638534568, + "grad_norm": 0.31343457102775574, + "learning_rate": 0.0005587447552447552, + "loss": 3.6439, + "step": 11900 + }, + { + "epoch": 3.4799347661482907, + "grad_norm": 0.3348383903503418, + "learning_rate": 0.00055856993006993, + "loss": 3.6389, + "step": 11950 + }, + { + "epoch": 3.494495893762013, + "grad_norm": 0.33012107014656067, + "learning_rate": 0.0005583951048951048, + "loss": 3.6493, + "step": 12000 + }, + { + "epoch": 3.494495893762013, + "eval_accuracy": 0.3528794491862669, + "eval_loss": 3.7000977993011475, + "eval_runtime": 179.7672, + "eval_samples_per_second": 92.598, + "eval_steps_per_second": 5.791, + "step": 12000 + }, + { + "epoch": 3.5090570213757353, + "grad_norm": 0.30933046340942383, + "learning_rate": 0.0005582202797202797, + "loss": 3.6413, + "step": 12050 + }, + { + "epoch": 3.523618148989458, + "grad_norm": 0.3057238757610321, + "learning_rate": 0.0005580454545454545, + "loss": 3.6378, + "step": 12100 + }, + { + "epoch": 3.53817927660318, + "grad_norm": 0.3380361497402191, + "learning_rate": 0.0005578706293706293, + "loss": 3.6462, + "step": 12150 + }, + { + "epoch": 3.5527404042169026, + "grad_norm": 0.32907187938690186, + "learning_rate": 0.0005576958041958041, + "loss": 3.6464, + "step": 12200 + }, + { + "epoch": 3.567301531830625, + "grad_norm": 0.3162597417831421, + "learning_rate": 0.000557520979020979, + "loss": 3.656, + "step": 12250 + }, + { + "epoch": 3.5818626594443472, + "grad_norm": 0.3106593191623688, + "learning_rate": 0.0005573461538461538, + "loss": 3.6515, + "step": 12300 + }, + { + "epoch": 3.59642378705807, + "grad_norm": 0.29408252239227295, + "learning_rate": 0.0005571713286713286, + "loss": 3.6489, + "step": 12350 + }, + { + "epoch": 3.6109849146717923, + "grad_norm": 0.3639216721057892, + "learning_rate": 0.0005569965034965034, + "loss": 3.6424, + "step": 12400 + }, + { + "epoch": 3.6255460422855146, + "grad_norm": 0.31863993406295776, + "learning_rate": 0.0005568216783216783, + "loss": 3.6446, + "step": 12450 + }, + { + "epoch": 3.640107169899237, + "grad_norm": 0.3066108822822571, + "learning_rate": 0.000556646853146853, + "loss": 3.6432, + "step": 12500 + }, + { + "epoch": 3.654668297512959, + "grad_norm": 0.30826711654663086, + "learning_rate": 0.0005564720279720279, + "loss": 3.6457, + "step": 12550 + }, + { + "epoch": 3.669229425126682, + "grad_norm": 0.3210170567035675, + "learning_rate": 0.0005562972027972027, + "loss": 3.6411, + "step": 12600 + }, + { + "epoch": 3.6837905527404042, + "grad_norm": 0.31402987241744995, + "learning_rate": 0.0005561223776223775, + "loss": 3.6542, + "step": 12650 + }, + { + "epoch": 3.6983516803541265, + "grad_norm": 0.33224406838417053, + "learning_rate": 0.0005559475524475524, + "loss": 3.6385, + "step": 12700 + }, + { + "epoch": 3.7129128079678493, + "grad_norm": 0.3081912398338318, + "learning_rate": 0.0005557727272727272, + "loss": 3.6361, + "step": 12750 + }, + { + "epoch": 3.7274739355815716, + "grad_norm": 0.31198635697364807, + "learning_rate": 0.000555597902097902, + "loss": 3.6456, + "step": 12800 + }, + { + "epoch": 3.742035063195294, + "grad_norm": 0.31249940395355225, + "learning_rate": 0.0005554230769230768, + "loss": 3.6301, + "step": 12850 + }, + { + "epoch": 3.756596190809016, + "grad_norm": 0.29419270157814026, + "learning_rate": 0.0005552482517482517, + "loss": 3.6319, + "step": 12900 + }, + { + "epoch": 3.7711573184227385, + "grad_norm": 0.3123679459095001, + "learning_rate": 0.0005550734265734265, + "loss": 3.6439, + "step": 12950 + }, + { + "epoch": 3.7857184460364612, + "grad_norm": 0.3085649013519287, + "learning_rate": 0.0005548986013986013, + "loss": 3.6429, + "step": 13000 + }, + { + "epoch": 3.7857184460364612, + "eval_accuracy": 0.3545970388800704, + "eval_loss": 3.6809747219085693, + "eval_runtime": 179.6902, + "eval_samples_per_second": 92.637, + "eval_steps_per_second": 5.793, + "step": 13000 + }, + { + "epoch": 3.8002795736501835, + "grad_norm": 0.3226883113384247, + "learning_rate": 0.0005547237762237761, + "loss": 3.644, + "step": 13050 + }, + { + "epoch": 3.814840701263906, + "grad_norm": 0.32543593645095825, + "learning_rate": 0.000554548951048951, + "loss": 3.6455, + "step": 13100 + }, + { + "epoch": 3.829401828877628, + "grad_norm": 0.313363254070282, + "learning_rate": 0.0005543741258741258, + "loss": 3.647, + "step": 13150 + }, + { + "epoch": 3.8439629564913504, + "grad_norm": 0.3085945248603821, + "learning_rate": 0.0005541993006993006, + "loss": 3.6409, + "step": 13200 + }, + { + "epoch": 3.858524084105073, + "grad_norm": 0.32422712445259094, + "learning_rate": 0.0005540244755244756, + "loss": 3.6415, + "step": 13250 + }, + { + "epoch": 3.8730852117187955, + "grad_norm": 0.31334224343299866, + "learning_rate": 0.0005538496503496502, + "loss": 3.6376, + "step": 13300 + }, + { + "epoch": 3.887646339332518, + "grad_norm": 0.3215864598751068, + "learning_rate": 0.0005536748251748252, + "loss": 3.6382, + "step": 13350 + }, + { + "epoch": 3.9022074669462405, + "grad_norm": 0.32258346676826477, + "learning_rate": 0.0005535, + "loss": 3.6334, + "step": 13400 + }, + { + "epoch": 3.916768594559963, + "grad_norm": 0.32085853815078735, + "learning_rate": 0.0005533251748251748, + "loss": 3.6264, + "step": 13450 + }, + { + "epoch": 3.931329722173685, + "grad_norm": 0.30639684200286865, + "learning_rate": 0.0005531503496503496, + "loss": 3.6552, + "step": 13500 + }, + { + "epoch": 3.9458908497874075, + "grad_norm": 0.31769323348999023, + "learning_rate": 0.0005529755244755245, + "loss": 3.6226, + "step": 13550 + }, + { + "epoch": 3.9604519774011298, + "grad_norm": 0.31194061040878296, + "learning_rate": 0.0005528006993006993, + "loss": 3.6418, + "step": 13600 + }, + { + "epoch": 3.9750131050148525, + "grad_norm": 0.326402485370636, + "learning_rate": 0.0005526258741258741, + "loss": 3.6404, + "step": 13650 + }, + { + "epoch": 3.989574232628575, + "grad_norm": 0.3246409595012665, + "learning_rate": 0.0005524510489510489, + "loss": 3.6294, + "step": 13700 + }, + { + "epoch": 4.004077115731842, + "grad_norm": 0.32423749566078186, + "learning_rate": 0.0005522762237762238, + "loss": 3.6146, + "step": 13750 + }, + { + "epoch": 4.018638243345564, + "grad_norm": 0.311954140663147, + "learning_rate": 0.0005521013986013986, + "loss": 3.5222, + "step": 13800 + }, + { + "epoch": 4.033199370959287, + "grad_norm": 0.31119635701179504, + "learning_rate": 0.0005519265734265734, + "loss": 3.5373, + "step": 13850 + }, + { + "epoch": 4.04776049857301, + "grad_norm": 0.31895068287849426, + "learning_rate": 0.0005517517482517482, + "loss": 3.5214, + "step": 13900 + }, + { + "epoch": 4.062321626186732, + "grad_norm": 0.34818094968795776, + "learning_rate": 0.0005515769230769231, + "loss": 3.5465, + "step": 13950 + }, + { + "epoch": 4.076882753800454, + "grad_norm": 0.33164742588996887, + "learning_rate": 0.0005514020979020979, + "loss": 3.5427, + "step": 14000 + }, + { + "epoch": 4.076882753800454, + "eval_accuracy": 0.3561751993215227, + "eval_loss": 3.6737630367279053, + "eval_runtime": 179.6974, + "eval_samples_per_second": 92.633, + "eval_steps_per_second": 5.793, + "step": 14000 + }, + { + "epoch": 4.091443881414177, + "grad_norm": 0.3311789333820343, + "learning_rate": 0.0005512272727272727, + "loss": 3.5457, + "step": 14050 + }, + { + "epoch": 4.106005009027899, + "grad_norm": 0.3225516378879547, + "learning_rate": 0.0005510524475524475, + "loss": 3.5393, + "step": 14100 + }, + { + "epoch": 4.120566136641622, + "grad_norm": 0.3110713064670563, + "learning_rate": 0.0005508776223776223, + "loss": 3.5559, + "step": 14150 + }, + { + "epoch": 4.135127264255344, + "grad_norm": 0.32352516055107117, + "learning_rate": 0.0005507027972027972, + "loss": 3.557, + "step": 14200 + }, + { + "epoch": 4.149688391869066, + "grad_norm": 0.32771018147468567, + "learning_rate": 0.000550527972027972, + "loss": 3.5614, + "step": 14250 + }, + { + "epoch": 4.164249519482789, + "grad_norm": 0.3170819580554962, + "learning_rate": 0.0005503531468531468, + "loss": 3.5519, + "step": 14300 + }, + { + "epoch": 4.178810647096511, + "grad_norm": 0.3334265947341919, + "learning_rate": 0.0005501783216783216, + "loss": 3.5502, + "step": 14350 + }, + { + "epoch": 4.193371774710234, + "grad_norm": 0.30677902698516846, + "learning_rate": 0.0005500034965034965, + "loss": 3.574, + "step": 14400 + }, + { + "epoch": 4.207932902323956, + "grad_norm": 0.33088985085487366, + "learning_rate": 0.0005498286713286713, + "loss": 3.5655, + "step": 14450 + }, + { + "epoch": 4.222494029937678, + "grad_norm": 0.31959256529808044, + "learning_rate": 0.0005496538461538461, + "loss": 3.5559, + "step": 14500 + }, + { + "epoch": 4.237055157551401, + "grad_norm": 0.31475120782852173, + "learning_rate": 0.0005494790209790209, + "loss": 3.559, + "step": 14550 + }, + { + "epoch": 4.251616285165123, + "grad_norm": 0.3372187912464142, + "learning_rate": 0.0005493041958041958, + "loss": 3.568, + "step": 14600 + }, + { + "epoch": 4.266177412778846, + "grad_norm": 0.3159469962120056, + "learning_rate": 0.0005491293706293706, + "loss": 3.5742, + "step": 14650 + }, + { + "epoch": 4.280738540392568, + "grad_norm": 0.34496167302131653, + "learning_rate": 0.0005489545454545454, + "loss": 3.569, + "step": 14700 + }, + { + "epoch": 4.29529966800629, + "grad_norm": 0.3201475441455841, + "learning_rate": 0.0005487797202797203, + "loss": 3.573, + "step": 14750 + }, + { + "epoch": 4.309860795620013, + "grad_norm": 0.3239315450191498, + "learning_rate": 0.000548604895104895, + "loss": 3.577, + "step": 14800 + }, + { + "epoch": 4.324421923233735, + "grad_norm": 0.30931442975997925, + "learning_rate": 0.0005484300699300699, + "loss": 3.5692, + "step": 14850 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 0.3285701870918274, + "learning_rate": 0.0005482552447552447, + "loss": 3.566, + "step": 14900 + }, + { + "epoch": 4.35354417846118, + "grad_norm": 0.325842022895813, + "learning_rate": 0.0005480804195804195, + "loss": 3.5647, + "step": 14950 + }, + { + "epoch": 4.368105306074902, + "grad_norm": 0.3167710304260254, + "learning_rate": 0.0005479055944055943, + "loss": 3.5735, + "step": 15000 + }, + { + "epoch": 4.368105306074902, + "eval_accuracy": 0.3571037087946, + "eval_loss": 3.659233570098877, + "eval_runtime": 179.796, + "eval_samples_per_second": 92.583, + "eval_steps_per_second": 5.79, + "step": 15000 + }, + { + "epoch": 4.382666433688625, + "grad_norm": 0.3091343939304352, + "learning_rate": 0.0005477307692307692, + "loss": 3.5822, + "step": 15050 + }, + { + "epoch": 4.397227561302348, + "grad_norm": 0.33039334416389465, + "learning_rate": 0.000547555944055944, + "loss": 3.586, + "step": 15100 + }, + { + "epoch": 4.41178868891607, + "grad_norm": 0.30892929434776306, + "learning_rate": 0.0005473811188811188, + "loss": 3.5716, + "step": 15150 + }, + { + "epoch": 4.426349816529792, + "grad_norm": 0.3354114592075348, + "learning_rate": 0.0005472062937062936, + "loss": 3.5646, + "step": 15200 + }, + { + "epoch": 4.440910944143514, + "grad_norm": 0.3432832360267639, + "learning_rate": 0.0005470314685314685, + "loss": 3.5779, + "step": 15250 + }, + { + "epoch": 4.455472071757237, + "grad_norm": 0.3167623281478882, + "learning_rate": 0.0005468566433566433, + "loss": 3.5657, + "step": 15300 + }, + { + "epoch": 4.47003319937096, + "grad_norm": 0.3280886113643646, + "learning_rate": 0.0005466818181818181, + "loss": 3.5732, + "step": 15350 + }, + { + "epoch": 4.4845943269846815, + "grad_norm": 0.3291832208633423, + "learning_rate": 0.000546506993006993, + "loss": 3.5683, + "step": 15400 + }, + { + "epoch": 4.499155454598404, + "grad_norm": 0.31101885437965393, + "learning_rate": 0.0005463321678321678, + "loss": 3.5722, + "step": 15450 + }, + { + "epoch": 4.513716582212126, + "grad_norm": 0.3118363320827484, + "learning_rate": 0.0005461573426573426, + "loss": 3.5855, + "step": 15500 + }, + { + "epoch": 4.528277709825849, + "grad_norm": 0.31627270579338074, + "learning_rate": 0.0005459825174825174, + "loss": 3.5871, + "step": 15550 + }, + { + "epoch": 4.542838837439572, + "grad_norm": 0.32284530997276306, + "learning_rate": 0.0005458076923076922, + "loss": 3.5754, + "step": 15600 + }, + { + "epoch": 4.5573999650532935, + "grad_norm": 0.32503610849380493, + "learning_rate": 0.000545632867132867, + "loss": 3.569, + "step": 15650 + }, + { + "epoch": 4.571961092667016, + "grad_norm": 0.3345843553543091, + "learning_rate": 0.0005454580419580419, + "loss": 3.566, + "step": 15700 + }, + { + "epoch": 4.586522220280738, + "grad_norm": 0.31699925661087036, + "learning_rate": 0.0005452832167832167, + "loss": 3.5757, + "step": 15750 + }, + { + "epoch": 4.601083347894461, + "grad_norm": 0.3411146402359009, + "learning_rate": 0.0005451083916083915, + "loss": 3.5894, + "step": 15800 + }, + { + "epoch": 4.615644475508184, + "grad_norm": 0.31675615906715393, + "learning_rate": 0.0005449335664335663, + "loss": 3.5752, + "step": 15850 + }, + { + "epoch": 4.630205603121905, + "grad_norm": 0.3413219153881073, + "learning_rate": 0.0005447587412587412, + "loss": 3.5711, + "step": 15900 + }, + { + "epoch": 4.644766730735628, + "grad_norm": 0.3177620470523834, + "learning_rate": 0.000544583916083916, + "loss": 3.5798, + "step": 15950 + }, + { + "epoch": 4.659327858349351, + "grad_norm": 0.31724312901496887, + "learning_rate": 0.0005444090909090908, + "loss": 3.5796, + "step": 16000 + }, + { + "epoch": 4.659327858349351, + "eval_accuracy": 0.35869268499593115, + "eval_loss": 3.648486375808716, + "eval_runtime": 179.9066, + "eval_samples_per_second": 92.526, + "eval_steps_per_second": 5.786, + "step": 16000 + }, + { + "epoch": 4.673888985963073, + "grad_norm": 0.32944586873054504, + "learning_rate": 0.0005442342657342657, + "loss": 3.5742, + "step": 16050 + }, + { + "epoch": 4.6884501135767955, + "grad_norm": 0.320095956325531, + "learning_rate": 0.0005440594405594405, + "loss": 3.5843, + "step": 16100 + }, + { + "epoch": 4.703011241190518, + "grad_norm": 0.3284047245979309, + "learning_rate": 0.0005438846153846153, + "loss": 3.566, + "step": 16150 + }, + { + "epoch": 4.71757236880424, + "grad_norm": 0.338379830121994, + "learning_rate": 0.0005437097902097901, + "loss": 3.5667, + "step": 16200 + }, + { + "epoch": 4.732133496417963, + "grad_norm": 0.3109598159790039, + "learning_rate": 0.0005435349650349651, + "loss": 3.5742, + "step": 16250 + }, + { + "epoch": 4.746694624031685, + "grad_norm": 0.30519962310791016, + "learning_rate": 0.0005433601398601397, + "loss": 3.5789, + "step": 16300 + }, + { + "epoch": 4.7612557516454075, + "grad_norm": 0.3150230944156647, + "learning_rate": 0.0005431853146853147, + "loss": 3.5769, + "step": 16350 + }, + { + "epoch": 4.77581687925913, + "grad_norm": 0.29910922050476074, + "learning_rate": 0.0005430104895104895, + "loss": 3.5761, + "step": 16400 + }, + { + "epoch": 4.790378006872852, + "grad_norm": 0.3157634437084198, + "learning_rate": 0.0005428356643356643, + "loss": 3.5709, + "step": 16450 + }, + { + "epoch": 4.804939134486575, + "grad_norm": 0.3214448094367981, + "learning_rate": 0.0005426608391608391, + "loss": 3.5804, + "step": 16500 + }, + { + "epoch": 4.819500262100297, + "grad_norm": 0.31892773509025574, + "learning_rate": 0.000542486013986014, + "loss": 3.5899, + "step": 16550 + }, + { + "epoch": 4.834061389714019, + "grad_norm": 0.3179968595504761, + "learning_rate": 0.0005423111888111888, + "loss": 3.5709, + "step": 16600 + }, + { + "epoch": 4.848622517327742, + "grad_norm": 0.33231818675994873, + "learning_rate": 0.0005421363636363636, + "loss": 3.5737, + "step": 16650 + }, + { + "epoch": 4.863183644941464, + "grad_norm": 0.30390241742134094, + "learning_rate": 0.0005419615384615385, + "loss": 3.572, + "step": 16700 + }, + { + "epoch": 4.877744772555187, + "grad_norm": 0.3263714909553528, + "learning_rate": 0.0005417867132867133, + "loss": 3.5714, + "step": 16750 + }, + { + "epoch": 4.892305900168909, + "grad_norm": 0.31608420610427856, + "learning_rate": 0.0005416118881118881, + "loss": 3.573, + "step": 16800 + }, + { + "epoch": 4.906867027782631, + "grad_norm": 0.3054676353931427, + "learning_rate": 0.0005414370629370629, + "loss": 3.5793, + "step": 16850 + }, + { + "epoch": 4.921428155396354, + "grad_norm": 0.3099980354309082, + "learning_rate": 0.0005412622377622378, + "loss": 3.5697, + "step": 16900 + }, + { + "epoch": 4.935989283010076, + "grad_norm": 0.29981857538223267, + "learning_rate": 0.0005410874125874126, + "loss": 3.5735, + "step": 16950 + }, + { + "epoch": 4.950550410623799, + "grad_norm": 0.3208276033401489, + "learning_rate": 0.0005409125874125874, + "loss": 3.5819, + "step": 17000 + }, + { + "epoch": 4.950550410623799, + "eval_accuracy": 0.3599148658622406, + "eval_loss": 3.634756326675415, + "eval_runtime": 179.7751, + "eval_samples_per_second": 92.593, + "eval_steps_per_second": 5.791, + "step": 17000 + }, + { + "epoch": 4.9651115382375215, + "grad_norm": 0.310529500246048, + "learning_rate": 0.0005407377622377622, + "loss": 3.5832, + "step": 17050 + }, + { + "epoch": 4.979672665851243, + "grad_norm": 0.32999780774116516, + "learning_rate": 0.000540562937062937, + "loss": 3.5711, + "step": 17100 + }, + { + "epoch": 4.994233793464966, + "grad_norm": 0.3354627192020416, + "learning_rate": 0.0005403881118881118, + "loss": 3.5734, + "step": 17150 + }, + { + "epoch": 5.008736676568233, + "grad_norm": 0.35508137941360474, + "learning_rate": 0.0005402132867132867, + "loss": 3.5155, + "step": 17200 + }, + { + "epoch": 5.023297804181956, + "grad_norm": 0.31227484345436096, + "learning_rate": 0.0005400384615384615, + "loss": 3.4713, + "step": 17250 + }, + { + "epoch": 5.037858931795678, + "grad_norm": 0.31459367275238037, + "learning_rate": 0.0005398636363636363, + "loss": 3.48, + "step": 17300 + }, + { + "epoch": 5.052420059409401, + "grad_norm": 0.31045621633529663, + "learning_rate": 0.0005396888111888111, + "loss": 3.4772, + "step": 17350 + }, + { + "epoch": 5.066981187023123, + "grad_norm": 0.3227365016937256, + "learning_rate": 0.000539513986013986, + "loss": 3.4702, + "step": 17400 + }, + { + "epoch": 5.081542314636845, + "grad_norm": 0.30600887537002563, + "learning_rate": 0.0005393391608391608, + "loss": 3.4776, + "step": 17450 + }, + { + "epoch": 5.096103442250568, + "grad_norm": 0.3312874138355255, + "learning_rate": 0.0005391643356643356, + "loss": 3.4876, + "step": 17500 + }, + { + "epoch": 5.110664569864291, + "grad_norm": 0.3330562114715576, + "learning_rate": 0.0005389895104895105, + "loss": 3.4802, + "step": 17550 + }, + { + "epoch": 5.125225697478013, + "grad_norm": 0.32655513286590576, + "learning_rate": 0.0005388146853146853, + "loss": 3.4899, + "step": 17600 + }, + { + "epoch": 5.139786825091735, + "grad_norm": 0.34551799297332764, + "learning_rate": 0.0005386398601398601, + "loss": 3.493, + "step": 17650 + }, + { + "epoch": 5.154347952705457, + "grad_norm": 0.3142414093017578, + "learning_rate": 0.0005384650349650349, + "loss": 3.5019, + "step": 17700 + }, + { + "epoch": 5.16890908031918, + "grad_norm": 0.3235276937484741, + "learning_rate": 0.0005382902097902098, + "loss": 3.4889, + "step": 17750 + }, + { + "epoch": 5.183470207932903, + "grad_norm": 0.3249594569206238, + "learning_rate": 0.0005381153846153845, + "loss": 3.4947, + "step": 17800 + }, + { + "epoch": 5.1980313355466246, + "grad_norm": 0.32166171073913574, + "learning_rate": 0.0005379405594405594, + "loss": 3.5064, + "step": 17850 + }, + { + "epoch": 5.212592463160347, + "grad_norm": 0.3284703195095062, + "learning_rate": 0.0005377657342657342, + "loss": 3.5105, + "step": 17900 + }, + { + "epoch": 5.227153590774069, + "grad_norm": 0.32744383811950684, + "learning_rate": 0.000537590909090909, + "loss": 3.5143, + "step": 17950 + }, + { + "epoch": 5.241714718387792, + "grad_norm": 0.312739759683609, + "learning_rate": 0.0005374160839160838, + "loss": 3.5007, + "step": 18000 + }, + { + "epoch": 5.241714718387792, + "eval_accuracy": 0.36018232079402723, + "eval_loss": 3.6365652084350586, + "eval_runtime": 179.8304, + "eval_samples_per_second": 92.565, + "eval_steps_per_second": 5.789, + "step": 18000 + }, + { + "epoch": 5.256275846001515, + "grad_norm": 0.31837671995162964, + "learning_rate": 0.0005372412587412587, + "loss": 3.5128, + "step": 18050 + }, + { + "epoch": 5.2708369736152365, + "grad_norm": 0.33519458770751953, + "learning_rate": 0.0005370664335664335, + "loss": 3.5119, + "step": 18100 + }, + { + "epoch": 5.285398101228959, + "grad_norm": 0.34740373492240906, + "learning_rate": 0.0005368916083916083, + "loss": 3.5228, + "step": 18150 + }, + { + "epoch": 5.299959228842681, + "grad_norm": 0.34328994154930115, + "learning_rate": 0.0005367167832167832, + "loss": 3.5142, + "step": 18200 + }, + { + "epoch": 5.314520356456404, + "grad_norm": 0.3207642436027527, + "learning_rate": 0.000536541958041958, + "loss": 3.5114, + "step": 18250 + }, + { + "epoch": 5.329081484070127, + "grad_norm": 0.335101455450058, + "learning_rate": 0.0005363671328671328, + "loss": 3.5175, + "step": 18300 + }, + { + "epoch": 5.3436426116838485, + "grad_norm": 0.34362977743148804, + "learning_rate": 0.0005361923076923076, + "loss": 3.519, + "step": 18350 + }, + { + "epoch": 5.358203739297571, + "grad_norm": 0.3147866725921631, + "learning_rate": 0.0005360174825174825, + "loss": 3.5155, + "step": 18400 + }, + { + "epoch": 5.372764866911294, + "grad_norm": 0.33346375823020935, + "learning_rate": 0.0005358426573426573, + "loss": 3.5163, + "step": 18450 + }, + { + "epoch": 5.387325994525016, + "grad_norm": 0.3331373631954193, + "learning_rate": 0.0005356678321678321, + "loss": 3.5133, + "step": 18500 + }, + { + "epoch": 5.401887122138739, + "grad_norm": 0.3066289722919464, + "learning_rate": 0.0005354930069930069, + "loss": 3.5202, + "step": 18550 + }, + { + "epoch": 5.41644824975246, + "grad_norm": 0.32293954491615295, + "learning_rate": 0.0005353181818181817, + "loss": 3.5251, + "step": 18600 + }, + { + "epoch": 5.431009377366183, + "grad_norm": 0.33153200149536133, + "learning_rate": 0.0005351433566433565, + "loss": 3.5089, + "step": 18650 + }, + { + "epoch": 5.445570504979906, + "grad_norm": 0.32844340801239014, + "learning_rate": 0.0005349685314685314, + "loss": 3.5175, + "step": 18700 + }, + { + "epoch": 5.460131632593628, + "grad_norm": 0.33013710379600525, + "learning_rate": 0.0005347937062937062, + "loss": 3.5155, + "step": 18750 + }, + { + "epoch": 5.4746927602073505, + "grad_norm": 0.318752259016037, + "learning_rate": 0.000534618881118881, + "loss": 3.5226, + "step": 18800 + }, + { + "epoch": 5.489253887821073, + "grad_norm": 0.3632429242134094, + "learning_rate": 0.0005344440559440559, + "loss": 3.5256, + "step": 18850 + }, + { + "epoch": 5.503815015434795, + "grad_norm": 0.31200987100601196, + "learning_rate": 0.0005342692307692307, + "loss": 3.5277, + "step": 18900 + }, + { + "epoch": 5.518376143048518, + "grad_norm": 0.35066500306129456, + "learning_rate": 0.0005340944055944055, + "loss": 3.5224, + "step": 18950 + }, + { + "epoch": 5.53293727066224, + "grad_norm": 0.3067936301231384, + "learning_rate": 0.0005339195804195803, + "loss": 3.5156, + "step": 19000 + }, + { + "epoch": 5.53293727066224, + "eval_accuracy": 0.3610321808827682, + "eval_loss": 3.6285228729248047, + "eval_runtime": 180.0932, + "eval_samples_per_second": 92.43, + "eval_steps_per_second": 5.78, + "step": 19000 + }, + { + "epoch": 5.5474983982759625, + "grad_norm": 0.3141394853591919, + "learning_rate": 0.0005337447552447552, + "loss": 3.5173, + "step": 19050 + }, + { + "epoch": 5.562059525889685, + "grad_norm": 0.334416925907135, + "learning_rate": 0.00053356993006993, + "loss": 3.5189, + "step": 19100 + }, + { + "epoch": 5.576620653503407, + "grad_norm": 0.3050374686717987, + "learning_rate": 0.0005333951048951048, + "loss": 3.5142, + "step": 19150 + }, + { + "epoch": 5.59118178111713, + "grad_norm": 0.33711856603622437, + "learning_rate": 0.0005332202797202796, + "loss": 3.5282, + "step": 19200 + }, + { + "epoch": 5.605742908730852, + "grad_norm": 0.34378382563591003, + "learning_rate": 0.0005330454545454546, + "loss": 3.5195, + "step": 19250 + }, + { + "epoch": 5.620304036344574, + "grad_norm": 0.3297707736492157, + "learning_rate": 0.0005328706293706292, + "loss": 3.532, + "step": 19300 + }, + { + "epoch": 5.634865163958297, + "grad_norm": 0.33016687631607056, + "learning_rate": 0.0005326958041958042, + "loss": 3.5425, + "step": 19350 + }, + { + "epoch": 5.649426291572019, + "grad_norm": 0.34170061349868774, + "learning_rate": 0.000532520979020979, + "loss": 3.5282, + "step": 19400 + }, + { + "epoch": 5.663987419185742, + "grad_norm": 0.3264179825782776, + "learning_rate": 0.0005323461538461538, + "loss": 3.5302, + "step": 19450 + }, + { + "epoch": 5.6785485467994645, + "grad_norm": 0.3002929091453552, + "learning_rate": 0.0005321713286713287, + "loss": 3.5267, + "step": 19500 + }, + { + "epoch": 5.693109674413186, + "grad_norm": 0.35670411586761475, + "learning_rate": 0.0005319965034965035, + "loss": 3.5173, + "step": 19550 + }, + { + "epoch": 5.707670802026909, + "grad_norm": 0.3164016902446747, + "learning_rate": 0.0005318216783216783, + "loss": 3.5437, + "step": 19600 + }, + { + "epoch": 5.722231929640631, + "grad_norm": 0.3452078700065613, + "learning_rate": 0.0005316468531468531, + "loss": 3.5239, + "step": 19650 + }, + { + "epoch": 5.736793057254354, + "grad_norm": 0.3179798424243927, + "learning_rate": 0.000531472027972028, + "loss": 3.5254, + "step": 19700 + }, + { + "epoch": 5.7513541848680765, + "grad_norm": 0.32574138045310974, + "learning_rate": 0.0005312972027972028, + "loss": 3.5335, + "step": 19750 + }, + { + "epoch": 5.765915312481798, + "grad_norm": 0.32392826676368713, + "learning_rate": 0.0005311223776223776, + "loss": 3.5268, + "step": 19800 + }, + { + "epoch": 5.780476440095521, + "grad_norm": 0.34594979882240295, + "learning_rate": 0.0005309475524475524, + "loss": 3.534, + "step": 19850 + }, + { + "epoch": 5.795037567709244, + "grad_norm": 0.31376367807388306, + "learning_rate": 0.0005307727272727273, + "loss": 3.5306, + "step": 19900 + }, + { + "epoch": 5.809598695322966, + "grad_norm": 0.32450011372566223, + "learning_rate": 0.0005305979020979021, + "loss": 3.5337, + "step": 19950 + }, + { + "epoch": 5.824159822936688, + "grad_norm": 0.30886128544807434, + "learning_rate": 0.0005304230769230769, + "loss": 3.5239, + "step": 20000 + }, + { + "epoch": 5.824159822936688, + "eval_accuracy": 0.3622778742705534, + "eval_loss": 3.6140716075897217, + "eval_runtime": 180.1478, + "eval_samples_per_second": 92.402, + "eval_steps_per_second": 5.779, + "step": 20000 + }, + { + "epoch": 5.83872095055041, + "grad_norm": 0.32404589653015137, + "learning_rate": 0.0005302482517482517, + "loss": 3.5409, + "step": 20050 + }, + { + "epoch": 5.853282078164133, + "grad_norm": 0.30877238512039185, + "learning_rate": 0.0005300734265734265, + "loss": 3.5373, + "step": 20100 + }, + { + "epoch": 5.867843205777856, + "grad_norm": 0.31356489658355713, + "learning_rate": 0.0005298986013986013, + "loss": 3.5219, + "step": 20150 + }, + { + "epoch": 5.882404333391578, + "grad_norm": 0.30876606702804565, + "learning_rate": 0.0005297237762237762, + "loss": 3.529, + "step": 20200 + }, + { + "epoch": 5.8969654610053, + "grad_norm": 0.3364260494709015, + "learning_rate": 0.000529548951048951, + "loss": 3.5252, + "step": 20250 + }, + { + "epoch": 5.911526588619022, + "grad_norm": 0.3011105954647064, + "learning_rate": 0.0005293741258741258, + "loss": 3.5244, + "step": 20300 + }, + { + "epoch": 5.926087716232745, + "grad_norm": 0.31753775477409363, + "learning_rate": 0.0005291993006993007, + "loss": 3.5309, + "step": 20350 + }, + { + "epoch": 5.940648843846468, + "grad_norm": 0.3421807289123535, + "learning_rate": 0.0005290244755244755, + "loss": 3.537, + "step": 20400 + }, + { + "epoch": 5.95520997146019, + "grad_norm": 0.3219417631626129, + "learning_rate": 0.0005288496503496503, + "loss": 3.5311, + "step": 20450 + }, + { + "epoch": 5.969771099073912, + "grad_norm": 0.3096925616264343, + "learning_rate": 0.0005286748251748251, + "loss": 3.5334, + "step": 20500 + }, + { + "epoch": 5.984332226687634, + "grad_norm": 0.3308550715446472, + "learning_rate": 0.0005285, + "loss": 3.5199, + "step": 20550 + }, + { + "epoch": 5.998893354301357, + "grad_norm": 0.31948336958885193, + "learning_rate": 0.0005283251748251748, + "loss": 3.5393, + "step": 20600 + }, + { + "epoch": 6.013396237404625, + "grad_norm": 0.31365492939949036, + "learning_rate": 0.0005281503496503496, + "loss": 3.432, + "step": 20650 + }, + { + "epoch": 6.027957365018347, + "grad_norm": 0.32687506079673767, + "learning_rate": 0.0005279755244755244, + "loss": 3.4276, + "step": 20700 + }, + { + "epoch": 6.04251849263207, + "grad_norm": 0.32380980253219604, + "learning_rate": 0.0005278006993006993, + "loss": 3.4312, + "step": 20750 + }, + { + "epoch": 6.0570796202457915, + "grad_norm": 0.3151368498802185, + "learning_rate": 0.000527625874125874, + "loss": 3.4158, + "step": 20800 + }, + { + "epoch": 6.071640747859514, + "grad_norm": 0.315514475107193, + "learning_rate": 0.0005274510489510489, + "loss": 3.4395, + "step": 20850 + }, + { + "epoch": 6.086201875473237, + "grad_norm": 0.32791003584861755, + "learning_rate": 0.0005272762237762238, + "loss": 3.4373, + "step": 20900 + }, + { + "epoch": 6.100763003086959, + "grad_norm": 0.3153580129146576, + "learning_rate": 0.0005271013986013985, + "loss": 3.4479, + "step": 20950 + }, + { + "epoch": 6.115324130700682, + "grad_norm": 0.34948551654815674, + "learning_rate": 0.0005269265734265734, + "loss": 3.4463, + "step": 21000 + }, + { + "epoch": 6.115324130700682, + "eval_accuracy": 0.3622545968742924, + "eval_loss": 3.6173741817474365, + "eval_runtime": 179.8785, + "eval_samples_per_second": 92.54, + "eval_steps_per_second": 5.787, + "step": 21000 + }, + { + "epoch": 6.1298852583144035, + "grad_norm": 0.3471393883228302, + "learning_rate": 0.0005267517482517482, + "loss": 3.4418, + "step": 21050 + }, + { + "epoch": 6.144446385928126, + "grad_norm": 0.32299190759658813, + "learning_rate": 0.000526576923076923, + "loss": 3.462, + "step": 21100 + }, + { + "epoch": 6.159007513541849, + "grad_norm": 0.3276447355747223, + "learning_rate": 0.0005264020979020978, + "loss": 3.4441, + "step": 21150 + }, + { + "epoch": 6.173568641155571, + "grad_norm": 0.3275761604309082, + "learning_rate": 0.0005262272727272727, + "loss": 3.4414, + "step": 21200 + }, + { + "epoch": 6.1881297687692935, + "grad_norm": 0.32831233739852905, + "learning_rate": 0.0005260524475524475, + "loss": 3.4573, + "step": 21250 + }, + { + "epoch": 6.202690896383016, + "grad_norm": 0.32581037282943726, + "learning_rate": 0.0005258776223776223, + "loss": 3.4413, + "step": 21300 + }, + { + "epoch": 6.217252023996738, + "grad_norm": 0.3218664228916168, + "learning_rate": 0.0005257027972027971, + "loss": 3.4494, + "step": 21350 + }, + { + "epoch": 6.231813151610461, + "grad_norm": 0.34039339423179626, + "learning_rate": 0.000525527972027972, + "loss": 3.458, + "step": 21400 + }, + { + "epoch": 6.246374279224183, + "grad_norm": 0.3327193260192871, + "learning_rate": 0.0005253531468531468, + "loss": 3.4557, + "step": 21450 + }, + { + "epoch": 6.2609354068379055, + "grad_norm": 0.3233095705509186, + "learning_rate": 0.0005251783216783216, + "loss": 3.4511, + "step": 21500 + }, + { + "epoch": 6.275496534451628, + "grad_norm": 0.3496866822242737, + "learning_rate": 0.0005250034965034965, + "loss": 3.4622, + "step": 21550 + }, + { + "epoch": 6.29005766206535, + "grad_norm": 0.3645714521408081, + "learning_rate": 0.0005248286713286712, + "loss": 3.4517, + "step": 21600 + }, + { + "epoch": 6.304618789679073, + "grad_norm": 0.3256557881832123, + "learning_rate": 0.0005246538461538461, + "loss": 3.4739, + "step": 21650 + }, + { + "epoch": 6.319179917292795, + "grad_norm": 0.3235686719417572, + "learning_rate": 0.0005244790209790209, + "loss": 3.4725, + "step": 21700 + }, + { + "epoch": 6.3337410449065175, + "grad_norm": 0.3351970911026001, + "learning_rate": 0.0005243041958041957, + "loss": 3.4857, + "step": 21750 + }, + { + "epoch": 6.34830217252024, + "grad_norm": 0.3423496186733246, + "learning_rate": 0.0005241293706293705, + "loss": 3.4746, + "step": 21800 + }, + { + "epoch": 6.362863300133962, + "grad_norm": 0.3310966491699219, + "learning_rate": 0.0005239545454545454, + "loss": 3.4763, + "step": 21850 + }, + { + "epoch": 6.377424427747685, + "grad_norm": 0.31002819538116455, + "learning_rate": 0.0005237797202797202, + "loss": 3.4714, + "step": 21900 + }, + { + "epoch": 6.391985555361408, + "grad_norm": 0.3289186358451843, + "learning_rate": 0.000523604895104895, + "loss": 3.4637, + "step": 21950 + }, + { + "epoch": 6.406546682975129, + "grad_norm": 0.3141127824783325, + "learning_rate": 0.0005234300699300698, + "loss": 3.4779, + "step": 22000 + }, + { + "epoch": 6.406546682975129, + "eval_accuracy": 0.36285122710673956, + "eval_loss": 3.610785722732544, + "eval_runtime": 179.7092, + "eval_samples_per_second": 92.627, + "eval_steps_per_second": 5.793, + "step": 22000 + }, + { + "epoch": 6.421107810588852, + "grad_norm": 0.3150128722190857, + "learning_rate": 0.0005232552447552447, + "loss": 3.4842, + "step": 22050 + }, + { + "epoch": 6.435668938202574, + "grad_norm": 0.3259349465370178, + "learning_rate": 0.0005230804195804195, + "loss": 3.4848, + "step": 22100 + }, + { + "epoch": 6.450230065816297, + "grad_norm": 0.32301968336105347, + "learning_rate": 0.0005229055944055943, + "loss": 3.4818, + "step": 22150 + }, + { + "epoch": 6.4647911934300195, + "grad_norm": 0.3123028874397278, + "learning_rate": 0.0005227307692307691, + "loss": 3.4914, + "step": 22200 + }, + { + "epoch": 6.479352321043741, + "grad_norm": 0.3286699652671814, + "learning_rate": 0.0005225559440559441, + "loss": 3.4875, + "step": 22250 + }, + { + "epoch": 6.493913448657464, + "grad_norm": 0.3313329517841339, + "learning_rate": 0.0005223811188811189, + "loss": 3.4791, + "step": 22300 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 0.31018057465553284, + "learning_rate": 0.0005222062937062937, + "loss": 3.4807, + "step": 22350 + }, + { + "epoch": 6.523035703884909, + "grad_norm": 0.32716143131256104, + "learning_rate": 0.0005220314685314686, + "loss": 3.4846, + "step": 22400 + }, + { + "epoch": 6.5375968314986315, + "grad_norm": 0.3213047981262207, + "learning_rate": 0.0005218566433566433, + "loss": 3.4959, + "step": 22450 + }, + { + "epoch": 6.552157959112353, + "grad_norm": 0.3478303849697113, + "learning_rate": 0.0005216818181818182, + "loss": 3.4794, + "step": 22500 + }, + { + "epoch": 6.566719086726076, + "grad_norm": 0.334625780582428, + "learning_rate": 0.000521506993006993, + "loss": 3.4993, + "step": 22550 + }, + { + "epoch": 6.581280214339799, + "grad_norm": 0.3324287235736847, + "learning_rate": 0.0005213321678321678, + "loss": 3.4947, + "step": 22600 + }, + { + "epoch": 6.595841341953521, + "grad_norm": 0.3208302855491638, + "learning_rate": 0.0005211573426573426, + "loss": 3.4816, + "step": 22650 + }, + { + "epoch": 6.610402469567243, + "grad_norm": 0.3206283152103424, + "learning_rate": 0.0005209825174825175, + "loss": 3.4811, + "step": 22700 + }, + { + "epoch": 6.624963597180965, + "grad_norm": 0.3405255377292633, + "learning_rate": 0.0005208076923076923, + "loss": 3.4839, + "step": 22750 + }, + { + "epoch": 6.639524724794688, + "grad_norm": 0.33559542894363403, + "learning_rate": 0.0005206328671328671, + "loss": 3.4962, + "step": 22800 + }, + { + "epoch": 6.654085852408411, + "grad_norm": 0.3277864456176758, + "learning_rate": 0.0005204580419580419, + "loss": 3.4831, + "step": 22850 + }, + { + "epoch": 6.668646980022133, + "grad_norm": 0.3352718949317932, + "learning_rate": 0.0005202832167832168, + "loss": 3.4782, + "step": 22900 + }, + { + "epoch": 6.683208107635855, + "grad_norm": 0.31568098068237305, + "learning_rate": 0.0005201083916083916, + "loss": 3.4802, + "step": 22950 + }, + { + "epoch": 6.697769235249577, + "grad_norm": 0.3398934602737427, + "learning_rate": 0.0005199335664335664, + "loss": 3.4888, + "step": 23000 + }, + { + "epoch": 6.697769235249577, + "eval_accuracy": 0.3639860589557666, + "eval_loss": 3.598484992980957, + "eval_runtime": 179.7416, + "eval_samples_per_second": 92.611, + "eval_steps_per_second": 5.792, + "step": 23000 + }, + { + "epoch": 6.7123303628633, + "grad_norm": 0.30721819400787354, + "learning_rate": 0.0005197587412587413, + "loss": 3.4866, + "step": 23050 + }, + { + "epoch": 6.726891490477023, + "grad_norm": 0.3224666714668274, + "learning_rate": 0.0005195839160839161, + "loss": 3.4968, + "step": 23100 + }, + { + "epoch": 6.741452618090745, + "grad_norm": 0.32522931694984436, + "learning_rate": 0.0005194090909090909, + "loss": 3.4878, + "step": 23150 + }, + { + "epoch": 6.756013745704467, + "grad_norm": 0.31341007351875305, + "learning_rate": 0.0005192342657342657, + "loss": 3.4833, + "step": 23200 + }, + { + "epoch": 6.77057487331819, + "grad_norm": 0.3186572790145874, + "learning_rate": 0.0005190594405594405, + "loss": 3.4975, + "step": 23250 + }, + { + "epoch": 6.785136000931912, + "grad_norm": 0.33995872735977173, + "learning_rate": 0.0005188846153846153, + "loss": 3.4878, + "step": 23300 + }, + { + "epoch": 6.799697128545635, + "grad_norm": 0.3231462836265564, + "learning_rate": 0.0005187097902097902, + "loss": 3.4897, + "step": 23350 + }, + { + "epoch": 6.814258256159357, + "grad_norm": 0.31064069271087646, + "learning_rate": 0.000518534965034965, + "loss": 3.4984, + "step": 23400 + }, + { + "epoch": 6.828819383773079, + "grad_norm": 0.31749048829078674, + "learning_rate": 0.0005183601398601398, + "loss": 3.4867, + "step": 23450 + }, + { + "epoch": 6.843380511386802, + "grad_norm": 0.31053680181503296, + "learning_rate": 0.0005181853146853146, + "loss": 3.4937, + "step": 23500 + }, + { + "epoch": 6.857941639000524, + "grad_norm": 0.3226015269756317, + "learning_rate": 0.0005180104895104895, + "loss": 3.4918, + "step": 23550 + }, + { + "epoch": 6.872502766614247, + "grad_norm": 0.3255876302719116, + "learning_rate": 0.0005178356643356643, + "loss": 3.4998, + "step": 23600 + }, + { + "epoch": 6.887063894227969, + "grad_norm": 0.32611915469169617, + "learning_rate": 0.0005176608391608391, + "loss": 3.4874, + "step": 23650 + }, + { + "epoch": 6.901625021841691, + "grad_norm": 0.3349880874156952, + "learning_rate": 0.000517486013986014, + "loss": 3.4898, + "step": 23700 + }, + { + "epoch": 6.916186149455414, + "grad_norm": 0.32357269525527954, + "learning_rate": 0.0005173111888111888, + "loss": 3.4919, + "step": 23750 + }, + { + "epoch": 6.930747277069136, + "grad_norm": 0.30893370509147644, + "learning_rate": 0.0005171363636363636, + "loss": 3.488, + "step": 23800 + }, + { + "epoch": 6.945308404682859, + "grad_norm": 0.34728315472602844, + "learning_rate": 0.0005169615384615384, + "loss": 3.4851, + "step": 23850 + }, + { + "epoch": 6.959869532296581, + "grad_norm": 0.34141796827316284, + "learning_rate": 0.0005167867132867133, + "loss": 3.4924, + "step": 23900 + }, + { + "epoch": 6.974430659910303, + "grad_norm": 0.33731377124786377, + "learning_rate": 0.000516611888111888, + "loss": 3.4936, + "step": 23950 + }, + { + "epoch": 6.988991787524026, + "grad_norm": 0.330599308013916, + "learning_rate": 0.0005164370629370629, + "loss": 3.4999, + "step": 24000 + }, + { + "epoch": 6.988991787524026, + "eval_accuracy": 0.36480441226573007, + "eval_loss": 3.590554714202881, + "eval_runtime": 179.6543, + "eval_samples_per_second": 92.656, + "eval_steps_per_second": 5.794, + "step": 24000 + }, + { + "epoch": 7.003494670627293, + "grad_norm": 0.3449358642101288, + "learning_rate": 0.0005162622377622377, + "loss": 3.471, + "step": 24050 + }, + { + "epoch": 7.018055798241016, + "grad_norm": 0.35293149948120117, + "learning_rate": 0.0005160874125874125, + "loss": 3.3888, + "step": 24100 + }, + { + "epoch": 7.032616925854738, + "grad_norm": 0.3265637755393982, + "learning_rate": 0.0005159125874125873, + "loss": 3.3925, + "step": 24150 + }, + { + "epoch": 7.0471780534684605, + "grad_norm": 0.32121822237968445, + "learning_rate": 0.0005157377622377622, + "loss": 3.3897, + "step": 24200 + }, + { + "epoch": 7.061739181082183, + "grad_norm": 0.3485367000102997, + "learning_rate": 0.000515562937062937, + "loss": 3.3965, + "step": 24250 + }, + { + "epoch": 7.076300308695905, + "grad_norm": 0.32369834184646606, + "learning_rate": 0.0005153881118881118, + "loss": 3.4083, + "step": 24300 + }, + { + "epoch": 7.090861436309628, + "grad_norm": 0.3367840349674225, + "learning_rate": 0.0005152132867132867, + "loss": 3.4072, + "step": 24350 + }, + { + "epoch": 7.105422563923351, + "grad_norm": 0.3350302278995514, + "learning_rate": 0.0005150384615384615, + "loss": 3.403, + "step": 24400 + }, + { + "epoch": 7.1199836915370724, + "grad_norm": 0.3556578755378723, + "learning_rate": 0.0005148636363636363, + "loss": 3.3993, + "step": 24450 + }, + { + "epoch": 7.134544819150795, + "grad_norm": 0.33493995666503906, + "learning_rate": 0.0005146888111888111, + "loss": 3.4033, + "step": 24500 + }, + { + "epoch": 7.149105946764517, + "grad_norm": 0.3266991674900055, + "learning_rate": 0.000514513986013986, + "loss": 3.4133, + "step": 24550 + }, + { + "epoch": 7.16366707437824, + "grad_norm": 0.33190712332725525, + "learning_rate": 0.0005143391608391608, + "loss": 3.4191, + "step": 24600 + }, + { + "epoch": 7.1782282019919625, + "grad_norm": 0.33754125237464905, + "learning_rate": 0.0005141643356643356, + "loss": 3.4116, + "step": 24650 + }, + { + "epoch": 7.192789329605684, + "grad_norm": 0.3015083074569702, + "learning_rate": 0.0005139895104895104, + "loss": 3.4225, + "step": 24700 + }, + { + "epoch": 7.207350457219407, + "grad_norm": 0.3270661532878876, + "learning_rate": 0.0005138146853146852, + "loss": 3.4205, + "step": 24750 + }, + { + "epoch": 7.22191158483313, + "grad_norm": 0.3491705656051636, + "learning_rate": 0.00051363986013986, + "loss": 3.4145, + "step": 24800 + }, + { + "epoch": 7.236472712446852, + "grad_norm": 0.3363984525203705, + "learning_rate": 0.0005134650349650349, + "loss": 3.4213, + "step": 24850 + }, + { + "epoch": 7.2510338400605745, + "grad_norm": 0.33105769753456116, + "learning_rate": 0.0005132902097902097, + "loss": 3.4229, + "step": 24900 + }, + { + "epoch": 7.265594967674296, + "grad_norm": 0.3505908250808716, + "learning_rate": 0.0005131153846153845, + "loss": 3.4278, + "step": 24950 + }, + { + "epoch": 7.280156095288019, + "grad_norm": 0.3380582332611084, + "learning_rate": 0.0005129405594405594, + "loss": 3.428, + "step": 25000 + }, + { + "epoch": 7.280156095288019, + "eval_accuracy": 0.36480793914395143, + "eval_loss": 3.5974769592285156, + "eval_runtime": 179.8567, + "eval_samples_per_second": 92.551, + "eval_steps_per_second": 5.788, + "step": 25000 + }, + { + "epoch": 7.294717222901742, + "grad_norm": 0.31806254386901855, + "learning_rate": 0.0005127657342657342, + "loss": 3.4173, + "step": 25050 + }, + { + "epoch": 7.309278350515464, + "grad_norm": 0.3278155028820038, + "learning_rate": 0.000512590909090909, + "loss": 3.4293, + "step": 25100 + }, + { + "epoch": 7.3238394781291865, + "grad_norm": 0.31251752376556396, + "learning_rate": 0.0005124160839160838, + "loss": 3.4386, + "step": 25150 + }, + { + "epoch": 7.338400605742908, + "grad_norm": 0.3372874855995178, + "learning_rate": 0.0005122412587412588, + "loss": 3.4216, + "step": 25200 + }, + { + "epoch": 7.352961733356631, + "grad_norm": 0.32962003350257874, + "learning_rate": 0.0005120664335664336, + "loss": 3.4316, + "step": 25250 + }, + { + "epoch": 7.367522860970354, + "grad_norm": 0.3354533612728119, + "learning_rate": 0.0005118916083916084, + "loss": 3.4331, + "step": 25300 + }, + { + "epoch": 7.382083988584076, + "grad_norm": 0.32760855555534363, + "learning_rate": 0.0005117167832167832, + "loss": 3.4298, + "step": 25350 + }, + { + "epoch": 7.396645116197798, + "grad_norm": 0.323398232460022, + "learning_rate": 0.0005115419580419581, + "loss": 3.4329, + "step": 25400 + }, + { + "epoch": 7.411206243811521, + "grad_norm": 0.3129633665084839, + "learning_rate": 0.0005113671328671328, + "loss": 3.4451, + "step": 25450 + }, + { + "epoch": 7.425767371425243, + "grad_norm": 0.308672159910202, + "learning_rate": 0.0005111923076923077, + "loss": 3.4402, + "step": 25500 + }, + { + "epoch": 7.440328499038966, + "grad_norm": 0.3408229649066925, + "learning_rate": 0.0005110174825174825, + "loss": 3.4397, + "step": 25550 + }, + { + "epoch": 7.454889626652688, + "grad_norm": 0.320758581161499, + "learning_rate": 0.0005108426573426573, + "loss": 3.4447, + "step": 25600 + }, + { + "epoch": 7.46945075426641, + "grad_norm": 0.33821046352386475, + "learning_rate": 0.0005106678321678321, + "loss": 3.4399, + "step": 25650 + }, + { + "epoch": 7.484011881880133, + "grad_norm": 0.32798120379447937, + "learning_rate": 0.000510493006993007, + "loss": 3.4441, + "step": 25700 + }, + { + "epoch": 7.498573009493855, + "grad_norm": 0.36191534996032715, + "learning_rate": 0.0005103181818181818, + "loss": 3.4465, + "step": 25750 + }, + { + "epoch": 7.513134137107578, + "grad_norm": 0.333870530128479, + "learning_rate": 0.0005101433566433566, + "loss": 3.4483, + "step": 25800 + }, + { + "epoch": 7.5276952647213005, + "grad_norm": 0.3584294319152832, + "learning_rate": 0.0005099685314685315, + "loss": 3.4472, + "step": 25850 + }, + { + "epoch": 7.542256392335022, + "grad_norm": 0.3232259750366211, + "learning_rate": 0.0005097937062937063, + "loss": 3.4551, + "step": 25900 + }, + { + "epoch": 7.556817519948745, + "grad_norm": 0.34521010518074036, + "learning_rate": 0.0005096188811188811, + "loss": 3.4492, + "step": 25950 + }, + { + "epoch": 7.571378647562467, + "grad_norm": 0.3537822365760803, + "learning_rate": 0.0005094440559440559, + "loss": 3.4552, + "step": 26000 + }, + { + "epoch": 7.571378647562467, + "eval_accuracy": 0.36535131351525596, + "eval_loss": 3.590632200241089, + "eval_runtime": 179.7252, + "eval_samples_per_second": 92.619, + "eval_steps_per_second": 5.792, + "step": 26000 + }, + { + "epoch": 7.58593977517619, + "grad_norm": 0.3477293848991394, + "learning_rate": 0.0005092692307692308, + "loss": 3.4485, + "step": 26050 + }, + { + "epoch": 7.600500902789912, + "grad_norm": 0.3164335787296295, + "learning_rate": 0.0005090944055944056, + "loss": 3.4537, + "step": 26100 + }, + { + "epoch": 7.615062030403634, + "grad_norm": 0.31365999579429626, + "learning_rate": 0.0005089195804195804, + "loss": 3.4563, + "step": 26150 + }, + { + "epoch": 7.629623158017357, + "grad_norm": 0.33597031235694885, + "learning_rate": 0.0005087447552447552, + "loss": 3.4469, + "step": 26200 + }, + { + "epoch": 7.644184285631079, + "grad_norm": 0.33030572533607483, + "learning_rate": 0.00050856993006993, + "loss": 3.4471, + "step": 26250 + }, + { + "epoch": 7.658745413244802, + "grad_norm": 0.34268873929977417, + "learning_rate": 0.0005083951048951048, + "loss": 3.4566, + "step": 26300 + }, + { + "epoch": 7.673306540858524, + "grad_norm": 0.34644824266433716, + "learning_rate": 0.0005082202797202797, + "loss": 3.4572, + "step": 26350 + }, + { + "epoch": 7.687867668472246, + "grad_norm": 0.3286401331424713, + "learning_rate": 0.0005080454545454545, + "loss": 3.4614, + "step": 26400 + }, + { + "epoch": 7.702428796085969, + "grad_norm": 0.3406911790370941, + "learning_rate": 0.0005078706293706293, + "loss": 3.4613, + "step": 26450 + }, + { + "epoch": 7.716989923699691, + "grad_norm": 0.32939502596855164, + "learning_rate": 0.0005076958041958042, + "loss": 3.4677, + "step": 26500 + }, + { + "epoch": 7.731551051313414, + "grad_norm": 0.33044230937957764, + "learning_rate": 0.000507520979020979, + "loss": 3.4601, + "step": 26550 + }, + { + "epoch": 7.746112178927136, + "grad_norm": 0.315995454788208, + "learning_rate": 0.0005073461538461538, + "loss": 3.4459, + "step": 26600 + }, + { + "epoch": 7.760673306540858, + "grad_norm": 0.35745933651924133, + "learning_rate": 0.0005071713286713286, + "loss": 3.4574, + "step": 26650 + }, + { + "epoch": 7.775234434154581, + "grad_norm": 0.3426244258880615, + "learning_rate": 0.0005069965034965035, + "loss": 3.4537, + "step": 26700 + }, + { + "epoch": 7.789795561768304, + "grad_norm": 0.3141034245491028, + "learning_rate": 0.0005068216783216783, + "loss": 3.4541, + "step": 26750 + }, + { + "epoch": 7.8043566893820255, + "grad_norm": 0.34187954664230347, + "learning_rate": 0.0005066468531468531, + "loss": 3.4703, + "step": 26800 + }, + { + "epoch": 7.818917816995748, + "grad_norm": 0.32608917355537415, + "learning_rate": 0.0005064720279720279, + "loss": 3.4433, + "step": 26850 + }, + { + "epoch": 7.833478944609471, + "grad_norm": 0.30253276228904724, + "learning_rate": 0.0005062972027972028, + "loss": 3.4582, + "step": 26900 + }, + { + "epoch": 7.848040072223193, + "grad_norm": 0.3292168378829956, + "learning_rate": 0.0005061223776223775, + "loss": 3.4609, + "step": 26950 + }, + { + "epoch": 7.862601199836916, + "grad_norm": 0.3352425992488861, + "learning_rate": 0.0005059475524475524, + "loss": 3.4646, + "step": 27000 + }, + { + "epoch": 7.862601199836916, + "eval_accuracy": 0.3661527378097569, + "eval_loss": 3.5805842876434326, + "eval_runtime": 179.8829, + "eval_samples_per_second": 92.538, + "eval_steps_per_second": 5.787, + "step": 27000 + }, + { + "epoch": 7.8771623274506375, + "grad_norm": 0.33013811707496643, + "learning_rate": 0.0005057727272727272, + "loss": 3.4593, + "step": 27050 + }, + { + "epoch": 7.89172345506436, + "grad_norm": 0.3326588273048401, + "learning_rate": 0.000505597902097902, + "loss": 3.4569, + "step": 27100 + }, + { + "epoch": 7.906284582678083, + "grad_norm": 0.3302481472492218, + "learning_rate": 0.0005054230769230769, + "loss": 3.4571, + "step": 27150 + }, + { + "epoch": 7.920845710291805, + "grad_norm": 0.3329846262931824, + "learning_rate": 0.0005052482517482517, + "loss": 3.471, + "step": 27200 + }, + { + "epoch": 7.935406837905528, + "grad_norm": 0.3458568751811981, + "learning_rate": 0.0005050734265734265, + "loss": 3.4765, + "step": 27250 + }, + { + "epoch": 7.9499679655192494, + "grad_norm": 0.3226156532764435, + "learning_rate": 0.0005048986013986013, + "loss": 3.4703, + "step": 27300 + }, + { + "epoch": 7.964529093132972, + "grad_norm": 0.34230631589889526, + "learning_rate": 0.0005047237762237762, + "loss": 3.4688, + "step": 27350 + }, + { + "epoch": 7.979090220746695, + "grad_norm": 0.31827977299690247, + "learning_rate": 0.000504548951048951, + "loss": 3.4583, + "step": 27400 + }, + { + "epoch": 7.993651348360417, + "grad_norm": 0.32115548849105835, + "learning_rate": 0.0005043741258741258, + "loss": 3.4671, + "step": 27450 + }, + { + "epoch": 8.008154231463685, + "grad_norm": 0.3317052721977234, + "learning_rate": 0.0005041993006993006, + "loss": 3.4071, + "step": 27500 + }, + { + "epoch": 8.022715359077408, + "grad_norm": 0.3432307541370392, + "learning_rate": 0.0005040244755244755, + "loss": 3.3524, + "step": 27550 + }, + { + "epoch": 8.037276486691129, + "grad_norm": 0.3754727244377136, + "learning_rate": 0.0005038496503496503, + "loss": 3.3459, + "step": 27600 + }, + { + "epoch": 8.051837614304851, + "grad_norm": 0.35126733779907227, + "learning_rate": 0.0005036748251748251, + "loss": 3.3583, + "step": 27650 + }, + { + "epoch": 8.066398741918574, + "grad_norm": 0.3542656898498535, + "learning_rate": 0.0005034999999999999, + "loss": 3.3705, + "step": 27700 + }, + { + "epoch": 8.080959869532297, + "grad_norm": 0.34104204177856445, + "learning_rate": 0.0005033251748251747, + "loss": 3.3645, + "step": 27750 + }, + { + "epoch": 8.09552099714602, + "grad_norm": 0.34891462326049805, + "learning_rate": 0.0005031503496503496, + "loss": 3.3766, + "step": 27800 + }, + { + "epoch": 8.11008212475974, + "grad_norm": 0.3348483145236969, + "learning_rate": 0.0005029755244755244, + "loss": 3.3758, + "step": 27850 + }, + { + "epoch": 8.124643252373463, + "grad_norm": 0.35943523049354553, + "learning_rate": 0.0005028006993006992, + "loss": 3.3722, + "step": 27900 + }, + { + "epoch": 8.139204379987186, + "grad_norm": 0.360538125038147, + "learning_rate": 0.000502625874125874, + "loss": 3.3764, + "step": 27950 + }, + { + "epoch": 8.153765507600909, + "grad_norm": 0.34357813000679016, + "learning_rate": 0.000502451048951049, + "loss": 3.3921, + "step": 28000 + }, + { + "epoch": 8.153765507600909, + "eval_accuracy": 0.3661058303294128, + "eval_loss": 3.58715558052063, + "eval_runtime": 180.3137, + "eval_samples_per_second": 92.317, + "eval_steps_per_second": 5.773, + "step": 28000 + }, + { + "epoch": 8.168326635214632, + "grad_norm": 0.3614567220211029, + "learning_rate": 0.0005022762237762237, + "loss": 3.3796, + "step": 28050 + }, + { + "epoch": 8.182887762828354, + "grad_norm": 0.33387571573257446, + "learning_rate": 0.0005021013986013985, + "loss": 3.383, + "step": 28100 + }, + { + "epoch": 8.197448890442075, + "grad_norm": 0.3599357008934021, + "learning_rate": 0.0005019265734265733, + "loss": 3.3902, + "step": 28150 + }, + { + "epoch": 8.212010018055798, + "grad_norm": 0.3254016041755676, + "learning_rate": 0.0005017517482517483, + "loss": 3.3856, + "step": 28200 + }, + { + "epoch": 8.22657114566952, + "grad_norm": 0.3269076347351074, + "learning_rate": 0.0005015769230769231, + "loss": 3.3963, + "step": 28250 + }, + { + "epoch": 8.241132273283243, + "grad_norm": 0.3196601867675781, + "learning_rate": 0.0005014020979020979, + "loss": 3.3973, + "step": 28300 + }, + { + "epoch": 8.255693400896966, + "grad_norm": 0.3544836640357971, + "learning_rate": 0.0005012272727272727, + "loss": 3.3985, + "step": 28350 + }, + { + "epoch": 8.270254528510687, + "grad_norm": 0.33133646845817566, + "learning_rate": 0.0005010524475524476, + "loss": 3.3971, + "step": 28400 + }, + { + "epoch": 8.28481565612441, + "grad_norm": 0.365125834941864, + "learning_rate": 0.0005008776223776223, + "loss": 3.3979, + "step": 28450 + }, + { + "epoch": 8.299376783738133, + "grad_norm": 0.3482271730899811, + "learning_rate": 0.0005007027972027972, + "loss": 3.4087, + "step": 28500 + }, + { + "epoch": 8.313937911351855, + "grad_norm": 0.3457016050815582, + "learning_rate": 0.000500527972027972, + "loss": 3.4072, + "step": 28550 + }, + { + "epoch": 8.328499038965578, + "grad_norm": 0.3350307047367096, + "learning_rate": 0.0005003531468531468, + "loss": 3.3984, + "step": 28600 + }, + { + "epoch": 8.3430601665793, + "grad_norm": 0.33122938871383667, + "learning_rate": 0.0005001783216783217, + "loss": 3.4036, + "step": 28650 + }, + { + "epoch": 8.357621294193022, + "grad_norm": 0.3646140992641449, + "learning_rate": 0.0005000034965034965, + "loss": 3.4019, + "step": 28700 + }, + { + "epoch": 8.372182421806745, + "grad_norm": 0.339650422334671, + "learning_rate": 0.0004998286713286713, + "loss": 3.3992, + "step": 28750 + }, + { + "epoch": 8.386743549420467, + "grad_norm": 0.31742623448371887, + "learning_rate": 0.0004996538461538461, + "loss": 3.404, + "step": 28800 + }, + { + "epoch": 8.40130467703419, + "grad_norm": 0.3145395815372467, + "learning_rate": 0.000499479020979021, + "loss": 3.3995, + "step": 28850 + }, + { + "epoch": 8.415865804647911, + "grad_norm": 0.34881776571273804, + "learning_rate": 0.0004993041958041958, + "loss": 3.4046, + "step": 28900 + }, + { + "epoch": 8.430426932261634, + "grad_norm": 0.3403722941875458, + "learning_rate": 0.0004991293706293706, + "loss": 3.4167, + "step": 28950 + }, + { + "epoch": 8.444988059875357, + "grad_norm": 0.3250523507595062, + "learning_rate": 0.0004989545454545454, + "loss": 3.4086, + "step": 29000 + }, + { + "epoch": 8.444988059875357, + "eval_accuracy": 0.3662244510002579, + "eval_loss": 3.5857577323913574, + "eval_runtime": 179.7516, + "eval_samples_per_second": 92.606, + "eval_steps_per_second": 5.791, + "step": 29000 + }, + { + "epoch": 8.45954918748908, + "grad_norm": 0.3275490403175354, + "learning_rate": 0.0004987797202797203, + "loss": 3.4166, + "step": 29050 + }, + { + "epoch": 8.474110315102802, + "grad_norm": 0.31486833095550537, + "learning_rate": 0.0004986048951048951, + "loss": 3.4041, + "step": 29100 + }, + { + "epoch": 8.488671442716523, + "grad_norm": 0.3729318082332611, + "learning_rate": 0.0004984300699300699, + "loss": 3.4167, + "step": 29150 + }, + { + "epoch": 8.503232570330246, + "grad_norm": 0.3305770456790924, + "learning_rate": 0.0004982552447552448, + "loss": 3.4228, + "step": 29200 + }, + { + "epoch": 8.517793697943969, + "grad_norm": 0.3442740738391876, + "learning_rate": 0.0004980804195804195, + "loss": 3.406, + "step": 29250 + }, + { + "epoch": 8.532354825557691, + "grad_norm": 0.32196056842803955, + "learning_rate": 0.0004979055944055944, + "loss": 3.4296, + "step": 29300 + }, + { + "epoch": 8.546915953171414, + "grad_norm": 0.3387078642845154, + "learning_rate": 0.0004977307692307692, + "loss": 3.4227, + "step": 29350 + }, + { + "epoch": 8.561477080785137, + "grad_norm": 0.32302534580230713, + "learning_rate": 0.000497555944055944, + "loss": 3.414, + "step": 29400 + }, + { + "epoch": 8.576038208398858, + "grad_norm": 0.3491160571575165, + "learning_rate": 0.0004973811188811188, + "loss": 3.4214, + "step": 29450 + }, + { + "epoch": 8.59059933601258, + "grad_norm": 0.32889190316200256, + "learning_rate": 0.0004972062937062937, + "loss": 3.4281, + "step": 29500 + }, + { + "epoch": 8.605160463626303, + "grad_norm": 0.32402417063713074, + "learning_rate": 0.0004970314685314685, + "loss": 3.4171, + "step": 29550 + }, + { + "epoch": 8.619721591240026, + "grad_norm": 0.3430418074131012, + "learning_rate": 0.0004968566433566433, + "loss": 3.4293, + "step": 29600 + }, + { + "epoch": 8.634282718853749, + "grad_norm": 0.34214910864830017, + "learning_rate": 0.0004966818181818181, + "loss": 3.4223, + "step": 29650 + }, + { + "epoch": 8.64884384646747, + "grad_norm": 0.3425740897655487, + "learning_rate": 0.000496506993006993, + "loss": 3.4194, + "step": 29700 + }, + { + "epoch": 8.663404974081192, + "grad_norm": 0.34497156739234924, + "learning_rate": 0.0004963321678321678, + "loss": 3.4311, + "step": 29750 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 0.35663503408432007, + "learning_rate": 0.0004961573426573426, + "loss": 3.431, + "step": 29800 + }, + { + "epoch": 8.692527229308638, + "grad_norm": 0.34114986658096313, + "learning_rate": 0.0004959825174825175, + "loss": 3.4287, + "step": 29850 + }, + { + "epoch": 8.70708835692236, + "grad_norm": 0.3398053050041199, + "learning_rate": 0.0004958076923076923, + "loss": 3.4288, + "step": 29900 + }, + { + "epoch": 8.721649484536082, + "grad_norm": 0.34339818358421326, + "learning_rate": 0.0004956328671328671, + "loss": 3.4414, + "step": 29950 + }, + { + "epoch": 8.736210612149804, + "grad_norm": 0.3127419352531433, + "learning_rate": 0.0004954580419580419, + "loss": 3.4342, + "step": 30000 + }, + { + "epoch": 8.736210612149804, + "eval_accuracy": 0.36726194101037535, + "eval_loss": 3.5715725421905518, + "eval_runtime": 179.7611, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 30000 + }, + { + "epoch": 8.750771739763527, + "grad_norm": 0.3249102532863617, + "learning_rate": 0.0004952832167832167, + "loss": 3.428, + "step": 30050 + }, + { + "epoch": 8.76533286737725, + "grad_norm": 0.34389105439186096, + "learning_rate": 0.0004951083916083915, + "loss": 3.4286, + "step": 30100 + }, + { + "epoch": 8.779893994990973, + "grad_norm": 0.34607070684432983, + "learning_rate": 0.0004949335664335664, + "loss": 3.4325, + "step": 30150 + }, + { + "epoch": 8.794455122604695, + "grad_norm": 0.33967599272727966, + "learning_rate": 0.0004947587412587412, + "loss": 3.4188, + "step": 30200 + }, + { + "epoch": 8.809016250218416, + "grad_norm": 0.34365391731262207, + "learning_rate": 0.000494583916083916, + "loss": 3.4258, + "step": 30250 + }, + { + "epoch": 8.82357737783214, + "grad_norm": 0.31158357858657837, + "learning_rate": 0.0004944090909090908, + "loss": 3.4302, + "step": 30300 + }, + { + "epoch": 8.838138505445862, + "grad_norm": 0.3425881564617157, + "learning_rate": 0.0004942342657342657, + "loss": 3.4471, + "step": 30350 + }, + { + "epoch": 8.852699633059585, + "grad_norm": 0.33694136142730713, + "learning_rate": 0.0004940594405594405, + "loss": 3.4363, + "step": 30400 + }, + { + "epoch": 8.867260760673307, + "grad_norm": 0.33916687965393066, + "learning_rate": 0.0004938846153846153, + "loss": 3.4398, + "step": 30450 + }, + { + "epoch": 8.881821888287028, + "grad_norm": 0.3424004912376404, + "learning_rate": 0.0004937097902097901, + "loss": 3.4373, + "step": 30500 + }, + { + "epoch": 8.896383015900751, + "grad_norm": 0.3579810857772827, + "learning_rate": 0.000493534965034965, + "loss": 3.4423, + "step": 30550 + }, + { + "epoch": 8.910944143514474, + "grad_norm": 0.35978007316589355, + "learning_rate": 0.0004933601398601398, + "loss": 3.4223, + "step": 30600 + }, + { + "epoch": 8.925505271128197, + "grad_norm": 0.34889093041419983, + "learning_rate": 0.0004931853146853146, + "loss": 3.4384, + "step": 30650 + }, + { + "epoch": 8.94006639874192, + "grad_norm": 0.3178730010986328, + "learning_rate": 0.0004930104895104895, + "loss": 3.4316, + "step": 30700 + }, + { + "epoch": 8.95462752635564, + "grad_norm": 0.3225439190864563, + "learning_rate": 0.0004928356643356642, + "loss": 3.4376, + "step": 30750 + }, + { + "epoch": 8.969188653969363, + "grad_norm": 0.32753077149391174, + "learning_rate": 0.0004926608391608391, + "loss": 3.4457, + "step": 30800 + }, + { + "epoch": 8.983749781583086, + "grad_norm": 0.3687169551849365, + "learning_rate": 0.0004924860139860139, + "loss": 3.4323, + "step": 30850 + }, + { + "epoch": 8.998310909196809, + "grad_norm": 0.3431978225708008, + "learning_rate": 0.0004923111888111887, + "loss": 3.4443, + "step": 30900 + }, + { + "epoch": 9.012813792300076, + "grad_norm": 0.3386369049549103, + "learning_rate": 0.0004921363636363635, + "loss": 3.3368, + "step": 30950 + }, + { + "epoch": 9.027374919913798, + "grad_norm": 0.35466766357421875, + "learning_rate": 0.0004919615384615384, + "loss": 3.3258, + "step": 31000 + }, + { + "epoch": 9.027374919913798, + "eval_accuracy": 0.36759758225444167, + "eval_loss": 3.57794451713562, + "eval_runtime": 184.3404, + "eval_samples_per_second": 90.3, + "eval_steps_per_second": 5.647, + "step": 31000 + }, + { + "epoch": 9.041936047527521, + "grad_norm": 0.3356529176235199, + "learning_rate": 0.0004917867132867132, + "loss": 3.3184, + "step": 31050 + }, + { + "epoch": 9.056497175141242, + "grad_norm": 0.3212270140647888, + "learning_rate": 0.000491611888111888, + "loss": 3.3383, + "step": 31100 + }, + { + "epoch": 9.071058302754965, + "grad_norm": 0.3324335813522339, + "learning_rate": 0.0004914370629370628, + "loss": 3.3392, + "step": 31150 + }, + { + "epoch": 9.085619430368688, + "grad_norm": 0.32331228256225586, + "learning_rate": 0.0004912622377622378, + "loss": 3.3521, + "step": 31200 + }, + { + "epoch": 9.10018055798241, + "grad_norm": 0.31954678893089294, + "learning_rate": 0.0004910874125874126, + "loss": 3.3435, + "step": 31250 + }, + { + "epoch": 9.114741685596133, + "grad_norm": 0.32974445819854736, + "learning_rate": 0.0004909125874125874, + "loss": 3.3501, + "step": 31300 + }, + { + "epoch": 9.129302813209854, + "grad_norm": 0.35506731271743774, + "learning_rate": 0.0004907377622377623, + "loss": 3.3467, + "step": 31350 + }, + { + "epoch": 9.143863940823577, + "grad_norm": 0.32969748973846436, + "learning_rate": 0.0004905629370629371, + "loss": 3.357, + "step": 31400 + }, + { + "epoch": 9.1584250684373, + "grad_norm": 0.3305834233760834, + "learning_rate": 0.0004903881118881119, + "loss": 3.3573, + "step": 31450 + }, + { + "epoch": 9.172986196051022, + "grad_norm": 0.33574923872947693, + "learning_rate": 0.0004902132867132867, + "loss": 3.3572, + "step": 31500 + }, + { + "epoch": 9.187547323664745, + "grad_norm": 0.32476624846458435, + "learning_rate": 0.0004900384615384615, + "loss": 3.3454, + "step": 31550 + }, + { + "epoch": 9.202108451278466, + "grad_norm": 0.36604878306388855, + "learning_rate": 0.0004898636363636363, + "loss": 3.3598, + "step": 31600 + }, + { + "epoch": 9.216669578892189, + "grad_norm": 0.3407774567604065, + "learning_rate": 0.0004896888111888112, + "loss": 3.3655, + "step": 31650 + }, + { + "epoch": 9.231230706505912, + "grad_norm": 0.3136043846607208, + "learning_rate": 0.000489513986013986, + "loss": 3.3658, + "step": 31700 + }, + { + "epoch": 9.245791834119634, + "grad_norm": 0.34752407670021057, + "learning_rate": 0.0004893391608391608, + "loss": 3.374, + "step": 31750 + }, + { + "epoch": 9.260352961733357, + "grad_norm": 0.33697524666786194, + "learning_rate": 0.0004891643356643356, + "loss": 3.3715, + "step": 31800 + }, + { + "epoch": 9.27491408934708, + "grad_norm": 0.3399849832057953, + "learning_rate": 0.0004889895104895105, + "loss": 3.3781, + "step": 31850 + }, + { + "epoch": 9.2894752169608, + "grad_norm": 0.32320427894592285, + "learning_rate": 0.0004888146853146853, + "loss": 3.3606, + "step": 31900 + }, + { + "epoch": 9.304036344574524, + "grad_norm": 0.3273387849330902, + "learning_rate": 0.0004886398601398601, + "loss": 3.3726, + "step": 31950 + }, + { + "epoch": 9.318597472188246, + "grad_norm": 0.33997225761413574, + "learning_rate": 0.000488465034965035, + "loss": 3.3831, + "step": 32000 + }, + { + "epoch": 9.318597472188246, + "eval_accuracy": 0.3672809861527707, + "eval_loss": 3.577822685241699, + "eval_runtime": 186.0219, + "eval_samples_per_second": 89.484, + "eval_steps_per_second": 5.596, + "step": 32000 + }, + { + "epoch": 9.333158599801969, + "grad_norm": 0.3397623896598816, + "learning_rate": 0.0004882902097902098, + "loss": 3.3802, + "step": 32050 + }, + { + "epoch": 9.347719727415692, + "grad_norm": 0.3965780735015869, + "learning_rate": 0.0004881153846153846, + "loss": 3.3961, + "step": 32100 + }, + { + "epoch": 9.362280855029413, + "grad_norm": 0.32509127259254456, + "learning_rate": 0.0004879405594405594, + "loss": 3.392, + "step": 32150 + }, + { + "epoch": 9.376841982643136, + "grad_norm": 0.3580123484134674, + "learning_rate": 0.00048776573426573424, + "loss": 3.3685, + "step": 32200 + }, + { + "epoch": 9.391403110256858, + "grad_norm": 0.33572641015052795, + "learning_rate": 0.00048759090909090904, + "loss": 3.3738, + "step": 32250 + }, + { + "epoch": 9.405964237870581, + "grad_norm": 0.34592849016189575, + "learning_rate": 0.0004874160839160839, + "loss": 3.3792, + "step": 32300 + }, + { + "epoch": 9.420525365484304, + "grad_norm": 0.39023056626319885, + "learning_rate": 0.0004872412587412587, + "loss": 3.3712, + "step": 32350 + }, + { + "epoch": 9.435086493098025, + "grad_norm": 0.3557857871055603, + "learning_rate": 0.00048706643356643354, + "loss": 3.3959, + "step": 32400 + }, + { + "epoch": 9.449647620711747, + "grad_norm": 0.3627590537071228, + "learning_rate": 0.00048689160839160834, + "loss": 3.3798, + "step": 32450 + }, + { + "epoch": 9.46420874832547, + "grad_norm": 0.34032562375068665, + "learning_rate": 0.0004867167832167832, + "loss": 3.3949, + "step": 32500 + }, + { + "epoch": 9.478769875939193, + "grad_norm": 0.32405319809913635, + "learning_rate": 0.00048654195804195794, + "loss": 3.382, + "step": 32550 + }, + { + "epoch": 9.493331003552916, + "grad_norm": 0.34905362129211426, + "learning_rate": 0.00048636713286713285, + "loss": 3.3953, + "step": 32600 + }, + { + "epoch": 9.507892131166638, + "grad_norm": 0.3418472409248352, + "learning_rate": 0.0004861923076923077, + "loss": 3.3972, + "step": 32650 + }, + { + "epoch": 9.52245325878036, + "grad_norm": 0.3480176031589508, + "learning_rate": 0.00048601748251748245, + "loss": 3.3983, + "step": 32700 + }, + { + "epoch": 9.537014386394082, + "grad_norm": 0.3377174139022827, + "learning_rate": 0.0004858426573426573, + "loss": 3.3821, + "step": 32750 + }, + { + "epoch": 9.551575514007805, + "grad_norm": 0.3357522487640381, + "learning_rate": 0.0004856678321678321, + "loss": 3.3975, + "step": 32800 + }, + { + "epoch": 9.566136641621528, + "grad_norm": 0.32841238379478455, + "learning_rate": 0.00048549300699300696, + "loss": 3.3976, + "step": 32850 + }, + { + "epoch": 9.58069776923525, + "grad_norm": 0.33749887347221375, + "learning_rate": 0.00048531818181818176, + "loss": 3.4136, + "step": 32900 + }, + { + "epoch": 9.595258896848971, + "grad_norm": 0.3626416325569153, + "learning_rate": 0.0004851433566433566, + "loss": 3.3985, + "step": 32950 + }, + { + "epoch": 9.609820024462694, + "grad_norm": 0.36860400438308716, + "learning_rate": 0.0004849685314685314, + "loss": 3.3965, + "step": 33000 + }, + { + "epoch": 9.609820024462694, + "eval_accuracy": 0.36782729958925975, + "eval_loss": 3.5715696811676025, + "eval_runtime": 183.923, + "eval_samples_per_second": 90.505, + "eval_steps_per_second": 5.66, + "step": 33000 + }, + { + "epoch": 9.624381152076417, + "grad_norm": 0.3426574468612671, + "learning_rate": 0.00048479370629370627, + "loss": 3.4014, + "step": 33050 + }, + { + "epoch": 9.63894227969014, + "grad_norm": 0.3384750783443451, + "learning_rate": 0.00048461888111888106, + "loss": 3.4112, + "step": 33100 + }, + { + "epoch": 9.653503407303862, + "grad_norm": 0.35202690958976746, + "learning_rate": 0.0004844440559440559, + "loss": 3.4129, + "step": 33150 + }, + { + "epoch": 9.668064534917583, + "grad_norm": 0.355497270822525, + "learning_rate": 0.0004842692307692307, + "loss": 3.3892, + "step": 33200 + }, + { + "epoch": 9.682625662531306, + "grad_norm": 0.32850146293640137, + "learning_rate": 0.00048409440559440557, + "loss": 3.4, + "step": 33250 + }, + { + "epoch": 9.697186790145029, + "grad_norm": 0.3368713855743408, + "learning_rate": 0.0004839195804195803, + "loss": 3.4127, + "step": 33300 + }, + { + "epoch": 9.711747917758752, + "grad_norm": 0.3568696677684784, + "learning_rate": 0.0004837447552447552, + "loss": 3.3944, + "step": 33350 + }, + { + "epoch": 9.726309045372474, + "grad_norm": 0.32732048630714417, + "learning_rate": 0.0004835699300699301, + "loss": 3.4031, + "step": 33400 + }, + { + "epoch": 9.740870172986195, + "grad_norm": 0.3446010649204254, + "learning_rate": 0.0004833951048951048, + "loss": 3.4169, + "step": 33450 + }, + { + "epoch": 9.755431300599918, + "grad_norm": 0.32168522477149963, + "learning_rate": 0.0004832202797202797, + "loss": 3.4106, + "step": 33500 + }, + { + "epoch": 9.76999242821364, + "grad_norm": 0.35548439621925354, + "learning_rate": 0.0004830454545454545, + "loss": 3.3974, + "step": 33550 + }, + { + "epoch": 9.784553555827364, + "grad_norm": 0.3315522074699402, + "learning_rate": 0.00048287062937062933, + "loss": 3.4138, + "step": 33600 + }, + { + "epoch": 9.799114683441086, + "grad_norm": 0.33013713359832764, + "learning_rate": 0.00048269580419580413, + "loss": 3.4092, + "step": 33650 + }, + { + "epoch": 9.813675811054807, + "grad_norm": 0.34848856925964355, + "learning_rate": 0.000482520979020979, + "loss": 3.4104, + "step": 33700 + }, + { + "epoch": 9.82823693866853, + "grad_norm": 0.32687628269195557, + "learning_rate": 0.0004823461538461538, + "loss": 3.4018, + "step": 33750 + }, + { + "epoch": 9.842798066282253, + "grad_norm": 0.36140188574790955, + "learning_rate": 0.00048217132867132864, + "loss": 3.4006, + "step": 33800 + }, + { + "epoch": 9.857359193895975, + "grad_norm": 0.32018741965293884, + "learning_rate": 0.00048199650349650344, + "loss": 3.4013, + "step": 33850 + }, + { + "epoch": 9.871920321509698, + "grad_norm": 0.34114909172058105, + "learning_rate": 0.0004818216783216783, + "loss": 3.3959, + "step": 33900 + }, + { + "epoch": 9.88648144912342, + "grad_norm": 0.32743898034095764, + "learning_rate": 0.0004816468531468531, + "loss": 3.4191, + "step": 33950 + }, + { + "epoch": 9.901042576737142, + "grad_norm": 0.37873852252960205, + "learning_rate": 0.00048147202797202795, + "loss": 3.4187, + "step": 34000 + }, + { + "epoch": 9.901042576737142, + "eval_accuracy": 0.36834492774954836, + "eval_loss": 3.561105728149414, + "eval_runtime": 179.7864, + "eval_samples_per_second": 92.588, + "eval_steps_per_second": 5.79, + "step": 34000 + }, + { + "epoch": 9.915603704350865, + "grad_norm": 0.33194535970687866, + "learning_rate": 0.0004812972027972028, + "loss": 3.4104, + "step": 34050 + }, + { + "epoch": 9.930164831964587, + "grad_norm": 0.3524761497974396, + "learning_rate": 0.0004811223776223776, + "loss": 3.4132, + "step": 34100 + }, + { + "epoch": 9.94472595957831, + "grad_norm": 0.3482424020767212, + "learning_rate": 0.00048094755244755245, + "loss": 3.4059, + "step": 34150 + }, + { + "epoch": 9.959287087192033, + "grad_norm": 0.350705087184906, + "learning_rate": 0.0004807727272727272, + "loss": 3.4047, + "step": 34200 + }, + { + "epoch": 9.973848214805754, + "grad_norm": 0.33391574025154114, + "learning_rate": 0.00048059790209790205, + "loss": 3.4213, + "step": 34250 + }, + { + "epoch": 9.988409342419477, + "grad_norm": 0.3514692187309265, + "learning_rate": 0.00048042307692307685, + "loss": 3.4138, + "step": 34300 + }, + { + "epoch": 10.002912225522744, + "grad_norm": 0.3422977030277252, + "learning_rate": 0.0004802482517482517, + "loss": 3.3925, + "step": 34350 + }, + { + "epoch": 10.017473353136467, + "grad_norm": 0.33694586157798767, + "learning_rate": 0.0004800734265734265, + "loss": 3.296, + "step": 34400 + }, + { + "epoch": 10.03203448075019, + "grad_norm": 0.3611753284931183, + "learning_rate": 0.00047989860139860136, + "loss": 3.2954, + "step": 34450 + }, + { + "epoch": 10.046595608363912, + "grad_norm": 0.3551093637943268, + "learning_rate": 0.00047972377622377616, + "loss": 3.2935, + "step": 34500 + }, + { + "epoch": 10.061156735977635, + "grad_norm": 0.37375402450561523, + "learning_rate": 0.000479548951048951, + "loss": 3.3175, + "step": 34550 + }, + { + "epoch": 10.075717863591356, + "grad_norm": 0.365528404712677, + "learning_rate": 0.0004793741258741258, + "loss": 3.3185, + "step": 34600 + }, + { + "epoch": 10.090278991205079, + "grad_norm": 0.35895606875419617, + "learning_rate": 0.00047919930069930067, + "loss": 3.3174, + "step": 34650 + }, + { + "epoch": 10.104840118818801, + "grad_norm": 0.32946503162384033, + "learning_rate": 0.0004790244755244755, + "loss": 3.32, + "step": 34700 + }, + { + "epoch": 10.119401246432524, + "grad_norm": 0.33243829011917114, + "learning_rate": 0.0004788496503496503, + "loss": 3.3142, + "step": 34750 + }, + { + "epoch": 10.133962374046247, + "grad_norm": 0.3511507511138916, + "learning_rate": 0.0004786748251748252, + "loss": 3.3314, + "step": 34800 + }, + { + "epoch": 10.148523501659968, + "grad_norm": 0.3584575653076172, + "learning_rate": 0.0004785, + "loss": 3.3345, + "step": 34850 + }, + { + "epoch": 10.16308462927369, + "grad_norm": 0.3298545479774475, + "learning_rate": 0.00047832517482517483, + "loss": 3.339, + "step": 34900 + }, + { + "epoch": 10.177645756887413, + "grad_norm": 0.3483952581882477, + "learning_rate": 0.0004781503496503496, + "loss": 3.324, + "step": 34950 + }, + { + "epoch": 10.192206884501136, + "grad_norm": 0.3524647057056427, + "learning_rate": 0.00047797552447552443, + "loss": 3.344, + "step": 35000 + }, + { + "epoch": 10.192206884501136, + "eval_accuracy": 0.3683687929588463, + "eval_loss": 3.5726046562194824, + "eval_runtime": 180.0575, + "eval_samples_per_second": 92.448, + "eval_steps_per_second": 5.781, + "step": 35000 + }, + { + "epoch": 10.206768012114859, + "grad_norm": 0.3376518189907074, + "learning_rate": 0.00047780069930069923, + "loss": 3.3347, + "step": 35050 + }, + { + "epoch": 10.221329139728581, + "grad_norm": 0.3457695543766022, + "learning_rate": 0.0004776258741258741, + "loss": 3.3483, + "step": 35100 + }, + { + "epoch": 10.235890267342302, + "grad_norm": 0.37430188059806824, + "learning_rate": 0.0004774510489510489, + "loss": 3.3552, + "step": 35150 + }, + { + "epoch": 10.250451394956025, + "grad_norm": 0.3510351777076721, + "learning_rate": 0.00047727622377622374, + "loss": 3.3549, + "step": 35200 + }, + { + "epoch": 10.265012522569748, + "grad_norm": 0.37889590859413147, + "learning_rate": 0.00047710139860139854, + "loss": 3.3416, + "step": 35250 + }, + { + "epoch": 10.27957365018347, + "grad_norm": 0.3422775864601135, + "learning_rate": 0.0004769265734265734, + "loss": 3.3383, + "step": 35300 + }, + { + "epoch": 10.294134777797193, + "grad_norm": 0.38626229763031006, + "learning_rate": 0.0004767517482517482, + "loss": 3.3461, + "step": 35350 + }, + { + "epoch": 10.308695905410914, + "grad_norm": 0.3493908643722534, + "learning_rate": 0.00047657692307692304, + "loss": 3.3535, + "step": 35400 + }, + { + "epoch": 10.323257033024637, + "grad_norm": 0.35432669520378113, + "learning_rate": 0.0004764020979020979, + "loss": 3.3555, + "step": 35450 + }, + { + "epoch": 10.33781816063836, + "grad_norm": 0.3410918116569519, + "learning_rate": 0.0004762272727272727, + "loss": 3.3469, + "step": 35500 + }, + { + "epoch": 10.352379288252083, + "grad_norm": 0.36023515462875366, + "learning_rate": 0.00047605244755244755, + "loss": 3.3584, + "step": 35550 + }, + { + "epoch": 10.366940415865805, + "grad_norm": 0.3287743330001831, + "learning_rate": 0.00047587762237762235, + "loss": 3.3674, + "step": 35600 + }, + { + "epoch": 10.381501543479526, + "grad_norm": 0.3435341715812683, + "learning_rate": 0.0004757027972027972, + "loss": 3.3646, + "step": 35650 + }, + { + "epoch": 10.396062671093249, + "grad_norm": 0.3478064239025116, + "learning_rate": 0.00047552797202797195, + "loss": 3.3675, + "step": 35700 + }, + { + "epoch": 10.410623798706972, + "grad_norm": 0.3641142249107361, + "learning_rate": 0.0004753531468531468, + "loss": 3.3591, + "step": 35750 + }, + { + "epoch": 10.425184926320695, + "grad_norm": 0.3393605649471283, + "learning_rate": 0.0004751783216783216, + "loss": 3.3615, + "step": 35800 + }, + { + "epoch": 10.439746053934417, + "grad_norm": 0.38742467761039734, + "learning_rate": 0.00047500349650349646, + "loss": 3.3701, + "step": 35850 + }, + { + "epoch": 10.454307181548138, + "grad_norm": 0.37009376287460327, + "learning_rate": 0.00047482867132867126, + "loss": 3.3576, + "step": 35900 + }, + { + "epoch": 10.468868309161861, + "grad_norm": 0.36964377760887146, + "learning_rate": 0.0004746538461538461, + "loss": 3.3546, + "step": 35950 + }, + { + "epoch": 10.483429436775584, + "grad_norm": 0.3347964882850647, + "learning_rate": 0.0004744790209790209, + "loss": 3.3789, + "step": 36000 + }, + { + "epoch": 10.483429436775584, + "eval_accuracy": 0.3682949636414124, + "eval_loss": 3.5694682598114014, + "eval_runtime": 179.8767, + "eval_samples_per_second": 92.541, + "eval_steps_per_second": 5.787, + "step": 36000 + }, + { + "epoch": 10.497990564389307, + "grad_norm": 0.3568975329399109, + "learning_rate": 0.00047430419580419576, + "loss": 3.3692, + "step": 36050 + }, + { + "epoch": 10.51255169200303, + "grad_norm": 0.3243386447429657, + "learning_rate": 0.0004741293706293706, + "loss": 3.3693, + "step": 36100 + }, + { + "epoch": 10.52711281961675, + "grad_norm": 0.3336549997329712, + "learning_rate": 0.0004739545454545454, + "loss": 3.3785, + "step": 36150 + }, + { + "epoch": 10.541673947230473, + "grad_norm": 0.3561848998069763, + "learning_rate": 0.00047377972027972027, + "loss": 3.3691, + "step": 36200 + }, + { + "epoch": 10.556235074844196, + "grad_norm": 0.356851726770401, + "learning_rate": 0.00047360489510489507, + "loss": 3.3643, + "step": 36250 + }, + { + "epoch": 10.570796202457919, + "grad_norm": 0.33825376629829407, + "learning_rate": 0.0004734300699300699, + "loss": 3.3913, + "step": 36300 + }, + { + "epoch": 10.585357330071641, + "grad_norm": 0.3185909390449524, + "learning_rate": 0.0004732552447552447, + "loss": 3.3784, + "step": 36350 + }, + { + "epoch": 10.599918457685362, + "grad_norm": 0.3499145805835724, + "learning_rate": 0.0004730804195804196, + "loss": 3.379, + "step": 36400 + }, + { + "epoch": 10.614479585299085, + "grad_norm": NaN, + "learning_rate": 0.0004729055944055943, + "loss": 3.3796, + "step": 36450 + }, + { + "epoch": 10.629040712912808, + "grad_norm": 0.34612196683883667, + "learning_rate": 0.0004727307692307692, + "loss": 3.3845, + "step": 36500 + }, + { + "epoch": 10.64360184052653, + "grad_norm": 0.3680227994918823, + "learning_rate": 0.000472555944055944, + "loss": 3.3761, + "step": 36550 + }, + { + "epoch": 10.658162968140253, + "grad_norm": 0.36743855476379395, + "learning_rate": 0.00047238111888111883, + "loss": 3.3684, + "step": 36600 + }, + { + "epoch": 10.672724095753976, + "grad_norm": 0.33822494745254517, + "learning_rate": 0.00047220629370629363, + "loss": 3.3854, + "step": 36650 + }, + { + "epoch": 10.687285223367697, + "grad_norm": 0.3840745687484741, + "learning_rate": 0.0004720314685314685, + "loss": 3.3676, + "step": 36700 + }, + { + "epoch": 10.70184635098142, + "grad_norm": 0.3411411941051483, + "learning_rate": 0.0004718566433566433, + "loss": 3.3732, + "step": 36750 + }, + { + "epoch": 10.716407478595142, + "grad_norm": 0.3389338552951813, + "learning_rate": 0.00047168181818181814, + "loss": 3.3848, + "step": 36800 + }, + { + "epoch": 10.730968606208865, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.000471506993006993, + "loss": 3.3761, + "step": 36850 + }, + { + "epoch": 10.745529733822588, + "grad_norm": 0.35158175230026245, + "learning_rate": 0.0004713321678321678, + "loss": 3.3823, + "step": 36900 + }, + { + "epoch": 10.760090861436309, + "grad_norm": 0.3377283811569214, + "learning_rate": 0.00047115734265734265, + "loss": 3.4041, + "step": 36950 + }, + { + "epoch": 10.774651989050032, + "grad_norm": 0.3622257113456726, + "learning_rate": 0.00047098251748251745, + "loss": 3.4005, + "step": 37000 + }, + { + "epoch": 10.774651989050032, + "eval_accuracy": 0.36925674333237796, + "eval_loss": 3.5602827072143555, + "eval_runtime": 179.7742, + "eval_samples_per_second": 92.594, + "eval_steps_per_second": 5.791, + "step": 37000 + }, + { + "epoch": 10.789213116663754, + "grad_norm": 0.3160146474838257, + "learning_rate": 0.0004708076923076923, + "loss": 3.3912, + "step": 37050 + }, + { + "epoch": 10.803774244277477, + "grad_norm": 0.3281710147857666, + "learning_rate": 0.0004706328671328671, + "loss": 3.3967, + "step": 37100 + }, + { + "epoch": 10.8183353718912, + "grad_norm": 0.3841243386268616, + "learning_rate": 0.00047045804195804195, + "loss": 3.3979, + "step": 37150 + }, + { + "epoch": 10.83289649950492, + "grad_norm": 0.33870750665664673, + "learning_rate": 0.0004702832167832167, + "loss": 3.3879, + "step": 37200 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 0.34300288558006287, + "learning_rate": 0.00047010839160839155, + "loss": 3.3959, + "step": 37250 + }, + { + "epoch": 10.862018754732366, + "grad_norm": 0.35645779967308044, + "learning_rate": 0.00046993356643356635, + "loss": 3.3974, + "step": 37300 + }, + { + "epoch": 10.876579882346089, + "grad_norm": 0.35175466537475586, + "learning_rate": 0.0004697587412587412, + "loss": 3.3888, + "step": 37350 + }, + { + "epoch": 10.891141009959812, + "grad_norm": 0.3222729563713074, + "learning_rate": 0.000469583916083916, + "loss": 3.3981, + "step": 37400 + }, + { + "epoch": 10.905702137573535, + "grad_norm": 0.32414838671684265, + "learning_rate": 0.00046940909090909086, + "loss": 3.391, + "step": 37450 + }, + { + "epoch": 10.920263265187256, + "grad_norm": 0.34362274408340454, + "learning_rate": 0.0004692342657342657, + "loss": 3.3832, + "step": 37500 + }, + { + "epoch": 10.934824392800978, + "grad_norm": 0.3682989776134491, + "learning_rate": 0.0004690594405594405, + "loss": 3.3894, + "step": 37550 + }, + { + "epoch": 10.949385520414701, + "grad_norm": 0.3218347728252411, + "learning_rate": 0.00046888461538461537, + "loss": 3.3901, + "step": 37600 + }, + { + "epoch": 10.963946648028424, + "grad_norm": 0.37173983454704285, + "learning_rate": 0.00046870979020979017, + "loss": 3.3884, + "step": 37650 + }, + { + "epoch": 10.978507775642147, + "grad_norm": 0.37107351422309875, + "learning_rate": 0.000468534965034965, + "loss": 3.3922, + "step": 37700 + }, + { + "epoch": 10.993068903255867, + "grad_norm": 0.35204780101776123, + "learning_rate": 0.0004683601398601398, + "loss": 3.3753, + "step": 37750 + }, + { + "epoch": 11.007571786359136, + "grad_norm": 0.33582913875579834, + "learning_rate": 0.0004681853146853147, + "loss": 3.3251, + "step": 37800 + }, + { + "epoch": 11.022132913972857, + "grad_norm": 0.33512166142463684, + "learning_rate": 0.0004680104895104895, + "loss": 3.2653, + "step": 37850 + }, + { + "epoch": 11.03669404158658, + "grad_norm": 0.3530137240886688, + "learning_rate": 0.00046783566433566433, + "loss": 3.2888, + "step": 37900 + }, + { + "epoch": 11.051255169200303, + "grad_norm": 0.3322924077510834, + "learning_rate": 0.0004676608391608391, + "loss": 3.304, + "step": 37950 + }, + { + "epoch": 11.065816296814026, + "grad_norm": 0.34434235095977783, + "learning_rate": 0.00046748601398601393, + "loss": 3.2886, + "step": 38000 + }, + { + "epoch": 11.065816296814026, + "eval_accuracy": 0.36925674333237796, + "eval_loss": 3.5678048133850098, + "eval_runtime": 179.7265, + "eval_samples_per_second": 92.618, + "eval_steps_per_second": 5.792, + "step": 38000 + }, + { + "epoch": 11.080377424427748, + "grad_norm": 0.3417208790779114, + "learning_rate": 0.00046731118881118873, + "loss": 3.2852, + "step": 38050 + }, + { + "epoch": 11.09493855204147, + "grad_norm": 0.36699753999710083, + "learning_rate": 0.0004671363636363636, + "loss": 3.2885, + "step": 38100 + }, + { + "epoch": 11.109499679655192, + "grad_norm": 0.336487740278244, + "learning_rate": 0.00046696153846153844, + "loss": 3.309, + "step": 38150 + }, + { + "epoch": 11.124060807268915, + "grad_norm": 0.35883960127830505, + "learning_rate": 0.00046678671328671324, + "loss": 3.3019, + "step": 38200 + }, + { + "epoch": 11.138621934882638, + "grad_norm": 0.3807854950428009, + "learning_rate": 0.0004666118881118881, + "loss": 3.3034, + "step": 38250 + }, + { + "epoch": 11.15318306249636, + "grad_norm": 0.3594328463077545, + "learning_rate": 0.0004664370629370629, + "loss": 3.2984, + "step": 38300 + }, + { + "epoch": 11.167744190110081, + "grad_norm": 0.344275563955307, + "learning_rate": 0.00046626223776223774, + "loss": 3.3229, + "step": 38350 + }, + { + "epoch": 11.182305317723804, + "grad_norm": 0.3437211811542511, + "learning_rate": 0.00046608741258741254, + "loss": 3.3241, + "step": 38400 + }, + { + "epoch": 11.196866445337527, + "grad_norm": 0.3443619906902313, + "learning_rate": 0.0004659125874125874, + "loss": 3.319, + "step": 38450 + }, + { + "epoch": 11.21142757295125, + "grad_norm": 0.3490242063999176, + "learning_rate": 0.0004657377622377622, + "loss": 3.3212, + "step": 38500 + }, + { + "epoch": 11.225988700564972, + "grad_norm": 0.33340850472450256, + "learning_rate": 0.00046556293706293705, + "loss": 3.3279, + "step": 38550 + }, + { + "epoch": 11.240549828178693, + "grad_norm": 0.33329835534095764, + "learning_rate": 0.00046538811188811185, + "loss": 3.3262, + "step": 38600 + }, + { + "epoch": 11.255110955792416, + "grad_norm": 0.3586530387401581, + "learning_rate": 0.0004652132867132867, + "loss": 3.3209, + "step": 38650 + }, + { + "epoch": 11.269672083406139, + "grad_norm": 0.3424071669578552, + "learning_rate": 0.00046503846153846145, + "loss": 3.3249, + "step": 38700 + }, + { + "epoch": 11.284233211019862, + "grad_norm": 0.36820656061172485, + "learning_rate": 0.0004648636363636363, + "loss": 3.3411, + "step": 38750 + }, + { + "epoch": 11.298794338633584, + "grad_norm": 0.33335718512535095, + "learning_rate": 0.0004646888111888111, + "loss": 3.3367, + "step": 38800 + }, + { + "epoch": 11.313355466247307, + "grad_norm": 0.3624469041824341, + "learning_rate": 0.00046451398601398596, + "loss": 3.3382, + "step": 38850 + }, + { + "epoch": 11.327916593861028, + "grad_norm": 0.3541378974914551, + "learning_rate": 0.0004643391608391608, + "loss": 3.335, + "step": 38900 + }, + { + "epoch": 11.34247772147475, + "grad_norm": 0.3461047112941742, + "learning_rate": 0.0004641643356643356, + "loss": 3.3381, + "step": 38950 + }, + { + "epoch": 11.357038849088473, + "grad_norm": 0.33990010619163513, + "learning_rate": 0.00046398951048951046, + "loss": 3.339, + "step": 39000 + }, + { + "epoch": 11.357038849088473, + "eval_accuracy": 0.36928096122949794, + "eval_loss": 3.5659894943237305, + "eval_runtime": 179.756, + "eval_samples_per_second": 92.603, + "eval_steps_per_second": 5.791, + "step": 39000 + }, + { + "epoch": 11.371599976702196, + "grad_norm": 0.32082730531692505, + "learning_rate": 0.00046381468531468526, + "loss": 3.3427, + "step": 39050 + }, + { + "epoch": 11.386161104315919, + "grad_norm": 0.34358900785446167, + "learning_rate": 0.0004636398601398601, + "loss": 3.3326, + "step": 39100 + }, + { + "epoch": 11.40072223192964, + "grad_norm": 0.3438430428504944, + "learning_rate": 0.0004634650349650349, + "loss": 3.3397, + "step": 39150 + }, + { + "epoch": 11.415283359543363, + "grad_norm": 0.33695173263549805, + "learning_rate": 0.00046329020979020977, + "loss": 3.3419, + "step": 39200 + }, + { + "epoch": 11.429844487157085, + "grad_norm": 0.35443180799484253, + "learning_rate": 0.00046311538461538457, + "loss": 3.3384, + "step": 39250 + }, + { + "epoch": 11.444405614770808, + "grad_norm": 0.3469848930835724, + "learning_rate": 0.0004629405594405594, + "loss": 3.3428, + "step": 39300 + }, + { + "epoch": 11.458966742384531, + "grad_norm": 0.3489019274711609, + "learning_rate": 0.0004627657342657342, + "loss": 3.3367, + "step": 39350 + }, + { + "epoch": 11.473527869998252, + "grad_norm": 0.34575167298316956, + "learning_rate": 0.0004625909090909091, + "loss": 3.3421, + "step": 39400 + }, + { + "epoch": 11.488088997611975, + "grad_norm": 0.36594027280807495, + "learning_rate": 0.0004624160839160838, + "loss": 3.3443, + "step": 39450 + }, + { + "epoch": 11.502650125225697, + "grad_norm": 0.37035486102104187, + "learning_rate": 0.0004622412587412587, + "loss": 3.3556, + "step": 39500 + }, + { + "epoch": 11.51721125283942, + "grad_norm": 0.378604918718338, + "learning_rate": 0.00046206643356643353, + "loss": 3.3449, + "step": 39550 + }, + { + "epoch": 11.531772380453143, + "grad_norm": 0.37700214982032776, + "learning_rate": 0.00046189160839160833, + "loss": 3.3588, + "step": 39600 + }, + { + "epoch": 11.546333508066864, + "grad_norm": 0.3387846350669861, + "learning_rate": 0.0004617167832167832, + "loss": 3.3551, + "step": 39650 + }, + { + "epoch": 11.560894635680587, + "grad_norm": 0.35091421008110046, + "learning_rate": 0.000461541958041958, + "loss": 3.3484, + "step": 39700 + }, + { + "epoch": 11.57545576329431, + "grad_norm": 0.36825278401374817, + "learning_rate": 0.00046136713286713284, + "loss": 3.3578, + "step": 39750 + }, + { + "epoch": 11.590016890908032, + "grad_norm": 0.3793783485889435, + "learning_rate": 0.00046119230769230764, + "loss": 3.3567, + "step": 39800 + }, + { + "epoch": 11.604578018521755, + "grad_norm": 0.35660460591316223, + "learning_rate": 0.0004610174825174825, + "loss": 3.3665, + "step": 39850 + }, + { + "epoch": 11.619139146135478, + "grad_norm": 0.3568241000175476, + "learning_rate": 0.0004608426573426573, + "loss": 3.3687, + "step": 39900 + }, + { + "epoch": 11.633700273749199, + "grad_norm": 0.3692575991153717, + "learning_rate": 0.00046066783216783215, + "loss": 3.3489, + "step": 39950 + }, + { + "epoch": 11.648261401362921, + "grad_norm": 0.33610156178474426, + "learning_rate": 0.00046049300699300695, + "loss": 3.3622, + "step": 40000 + }, + { + "epoch": 11.648261401362921, + "eval_accuracy": 0.36969924898655154, + "eval_loss": 3.5569655895233154, + "eval_runtime": 179.7612, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 40000 + }, + { + "epoch": 11.662822528976644, + "grad_norm": 0.35730060935020447, + "learning_rate": 0.0004603181818181818, + "loss": 3.3747, + "step": 40050 + }, + { + "epoch": 11.677383656590367, + "grad_norm": 0.36511674523353577, + "learning_rate": 0.0004601433566433566, + "loss": 3.3653, + "step": 40100 + }, + { + "epoch": 11.69194478420409, + "grad_norm": 0.3742559254169464, + "learning_rate": 0.00045996853146853145, + "loss": 3.3673, + "step": 40150 + }, + { + "epoch": 11.70650591181781, + "grad_norm": 0.3337925970554352, + "learning_rate": 0.0004597937062937062, + "loss": 3.353, + "step": 40200 + }, + { + "epoch": 11.721067039431533, + "grad_norm": 0.3480316996574402, + "learning_rate": 0.00045961888111888105, + "loss": 3.3657, + "step": 40250 + }, + { + "epoch": 11.735628167045256, + "grad_norm": 0.34601667523384094, + "learning_rate": 0.0004594440559440559, + "loss": 3.3716, + "step": 40300 + }, + { + "epoch": 11.750189294658979, + "grad_norm": 0.34064266085624695, + "learning_rate": 0.0004592692307692307, + "loss": 3.3759, + "step": 40350 + }, + { + "epoch": 11.764750422272702, + "grad_norm": 0.34361889958381653, + "learning_rate": 0.00045909440559440556, + "loss": 3.3829, + "step": 40400 + }, + { + "epoch": 11.779311549886422, + "grad_norm": 0.36990559101104736, + "learning_rate": 0.00045891958041958036, + "loss": 3.3704, + "step": 40450 + }, + { + "epoch": 11.793872677500145, + "grad_norm": 0.3608870804309845, + "learning_rate": 0.0004587447552447552, + "loss": 3.3708, + "step": 40500 + }, + { + "epoch": 11.808433805113868, + "grad_norm": 0.3519209027290344, + "learning_rate": 0.00045856993006993, + "loss": 3.3659, + "step": 40550 + }, + { + "epoch": 11.82299493272759, + "grad_norm": 0.353716641664505, + "learning_rate": 0.00045839510489510487, + "loss": 3.3706, + "step": 40600 + }, + { + "epoch": 11.837556060341313, + "grad_norm": 0.3455187976360321, + "learning_rate": 0.00045822027972027967, + "loss": 3.3705, + "step": 40650 + }, + { + "epoch": 11.852117187955034, + "grad_norm": 0.373958945274353, + "learning_rate": 0.0004580454545454545, + "loss": 3.348, + "step": 40700 + }, + { + "epoch": 11.866678315568757, + "grad_norm": 0.3595150411128998, + "learning_rate": 0.0004578706293706293, + "loss": 3.3672, + "step": 40750 + }, + { + "epoch": 11.88123944318248, + "grad_norm": 0.36767902970314026, + "learning_rate": 0.0004576958041958042, + "loss": 3.3728, + "step": 40800 + }, + { + "epoch": 11.895800570796203, + "grad_norm": 0.35420218110084534, + "learning_rate": 0.000457520979020979, + "loss": 3.3674, + "step": 40850 + }, + { + "epoch": 11.910361698409925, + "grad_norm": 0.35077446699142456, + "learning_rate": 0.00045734615384615383, + "loss": 3.3652, + "step": 40900 + }, + { + "epoch": 11.924922826023646, + "grad_norm": 0.3696039021015167, + "learning_rate": 0.0004571713286713287, + "loss": 3.3689, + "step": 40950 + }, + { + "epoch": 11.93948395363737, + "grad_norm": 0.34140545129776, + "learning_rate": 0.00045699650349650343, + "loss": 3.3645, + "step": 41000 + }, + { + "epoch": 11.93948395363737, + "eval_accuracy": 0.3705252438659946, + "eval_loss": 3.5469377040863037, + "eval_runtime": 179.8742, + "eval_samples_per_second": 92.542, + "eval_steps_per_second": 5.787, + "step": 41000 + }, + { + "epoch": 11.954045081251092, + "grad_norm": 0.35556402802467346, + "learning_rate": 0.0004568216783216783, + "loss": 3.3857, + "step": 41050 + }, + { + "epoch": 11.968606208864815, + "grad_norm": 0.3346519470214844, + "learning_rate": 0.0004566468531468531, + "loss": 3.3626, + "step": 41100 + }, + { + "epoch": 11.983167336478537, + "grad_norm": 0.34058094024658203, + "learning_rate": 0.00045647202797202794, + "loss": 3.3772, + "step": 41150 + }, + { + "epoch": 11.99772846409226, + "grad_norm": 0.38102081418037415, + "learning_rate": 0.00045629720279720274, + "loss": 3.3671, + "step": 41200 + }, + { + "epoch": 12.012231347195527, + "grad_norm": 0.3588341772556305, + "learning_rate": 0.0004561223776223776, + "loss": 3.2736, + "step": 41250 + }, + { + "epoch": 12.02679247480925, + "grad_norm": 0.3291144073009491, + "learning_rate": 0.0004559475524475524, + "loss": 3.2687, + "step": 41300 + }, + { + "epoch": 12.041353602422971, + "grad_norm": 0.35584986209869385, + "learning_rate": 0.00045577272727272724, + "loss": 3.271, + "step": 41350 + }, + { + "epoch": 12.055914730036694, + "grad_norm": 0.3618343770503998, + "learning_rate": 0.00045559790209790204, + "loss": 3.2689, + "step": 41400 + }, + { + "epoch": 12.070475857650417, + "grad_norm": 0.3511015772819519, + "learning_rate": 0.0004554230769230769, + "loss": 3.2771, + "step": 41450 + }, + { + "epoch": 12.08503698526414, + "grad_norm": 0.3445127308368683, + "learning_rate": 0.0004552482517482517, + "loss": 3.2766, + "step": 41500 + }, + { + "epoch": 12.099598112877862, + "grad_norm": 0.34342440962791443, + "learning_rate": 0.00045507342657342655, + "loss": 3.2763, + "step": 41550 + }, + { + "epoch": 12.114159240491583, + "grad_norm": 0.3772423565387726, + "learning_rate": 0.00045489860139860135, + "loss": 3.2767, + "step": 41600 + }, + { + "epoch": 12.128720368105306, + "grad_norm": 0.3567655086517334, + "learning_rate": 0.0004547237762237762, + "loss": 3.2699, + "step": 41650 + }, + { + "epoch": 12.143281495719028, + "grad_norm": 0.3559315800666809, + "learning_rate": 0.00045454895104895106, + "loss": 3.2895, + "step": 41700 + }, + { + "epoch": 12.157842623332751, + "grad_norm": 0.3517071604728699, + "learning_rate": 0.0004543741258741258, + "loss": 3.2963, + "step": 41750 + }, + { + "epoch": 12.172403750946474, + "grad_norm": 0.3621581792831421, + "learning_rate": 0.00045419930069930066, + "loss": 3.2909, + "step": 41800 + }, + { + "epoch": 12.186964878560195, + "grad_norm": 0.34336772561073303, + "learning_rate": 0.00045402447552447546, + "loss": 3.3055, + "step": 41850 + }, + { + "epoch": 12.201526006173918, + "grad_norm": 0.36337751150131226, + "learning_rate": 0.0004538496503496503, + "loss": 3.303, + "step": 41900 + }, + { + "epoch": 12.21608713378764, + "grad_norm": 0.38408976793289185, + "learning_rate": 0.0004536748251748251, + "loss": 3.32, + "step": 41950 + }, + { + "epoch": 12.230648261401363, + "grad_norm": 0.3528490960597992, + "learning_rate": 0.00045349999999999996, + "loss": 3.3026, + "step": 42000 + }, + { + "epoch": 12.230648261401363, + "eval_accuracy": 0.3697736661170223, + "eval_loss": 3.5663251876831055, + "eval_runtime": 179.7473, + "eval_samples_per_second": 92.608, + "eval_steps_per_second": 5.791, + "step": 42000 + }, + { + "epoch": 12.245209389015086, + "grad_norm": 0.36504805088043213, + "learning_rate": 0.00045332517482517476, + "loss": 3.3043, + "step": 42050 + }, + { + "epoch": 12.259770516628807, + "grad_norm": 0.35969120264053345, + "learning_rate": 0.0004531503496503496, + "loss": 3.3094, + "step": 42100 + }, + { + "epoch": 12.27433164424253, + "grad_norm": 0.3558993339538574, + "learning_rate": 0.0004529755244755244, + "loss": 3.3273, + "step": 42150 + }, + { + "epoch": 12.288892771856252, + "grad_norm": 0.3570861220359802, + "learning_rate": 0.00045280069930069927, + "loss": 3.3146, + "step": 42200 + }, + { + "epoch": 12.303453899469975, + "grad_norm": 0.3580907881259918, + "learning_rate": 0.00045262587412587407, + "loss": 3.3042, + "step": 42250 + }, + { + "epoch": 12.318015027083698, + "grad_norm": 0.36477571725845337, + "learning_rate": 0.0004524510489510489, + "loss": 3.315, + "step": 42300 + }, + { + "epoch": 12.33257615469742, + "grad_norm": 0.33544719219207764, + "learning_rate": 0.0004522762237762238, + "loss": 3.3091, + "step": 42350 + }, + { + "epoch": 12.347137282311142, + "grad_norm": 0.34085631370544434, + "learning_rate": 0.0004521013986013986, + "loss": 3.3134, + "step": 42400 + }, + { + "epoch": 12.361698409924864, + "grad_norm": 0.3568131625652313, + "learning_rate": 0.00045192657342657343, + "loss": 3.324, + "step": 42450 + }, + { + "epoch": 12.376259537538587, + "grad_norm": 0.33206743001937866, + "learning_rate": 0.0004517517482517482, + "loss": 3.3225, + "step": 42500 + }, + { + "epoch": 12.39082066515231, + "grad_norm": 0.37920984625816345, + "learning_rate": 0.00045157692307692303, + "loss": 3.3209, + "step": 42550 + }, + { + "epoch": 12.405381792766033, + "grad_norm": 0.3489953577518463, + "learning_rate": 0.00045140209790209783, + "loss": 3.3243, + "step": 42600 + }, + { + "epoch": 12.419942920379754, + "grad_norm": 0.340498149394989, + "learning_rate": 0.0004512272727272727, + "loss": 3.3187, + "step": 42650 + }, + { + "epoch": 12.434504047993476, + "grad_norm": 0.3989890515804291, + "learning_rate": 0.0004510524475524475, + "loss": 3.3103, + "step": 42700 + }, + { + "epoch": 12.449065175607199, + "grad_norm": 0.3404858112335205, + "learning_rate": 0.00045087762237762234, + "loss": 3.3273, + "step": 42750 + }, + { + "epoch": 12.463626303220922, + "grad_norm": 0.3558301329612732, + "learning_rate": 0.00045070279720279714, + "loss": 3.3207, + "step": 42800 + }, + { + "epoch": 12.478187430834645, + "grad_norm": 0.34800827503204346, + "learning_rate": 0.000450527972027972, + "loss": 3.3293, + "step": 42850 + }, + { + "epoch": 12.492748558448366, + "grad_norm": 0.3653114438056946, + "learning_rate": 0.0004503531468531468, + "loss": 3.3484, + "step": 42900 + }, + { + "epoch": 12.507309686062088, + "grad_norm": 0.3844183087348938, + "learning_rate": 0.00045017832167832165, + "loss": 3.3311, + "step": 42950 + }, + { + "epoch": 12.521870813675811, + "grad_norm": 0.3569987118244171, + "learning_rate": 0.0004500034965034965, + "loss": 3.3243, + "step": 43000 + }, + { + "epoch": 12.521870813675811, + "eval_accuracy": 0.37050173134451886, + "eval_loss": 3.554635524749756, + "eval_runtime": 179.7072, + "eval_samples_per_second": 92.628, + "eval_steps_per_second": 5.793, + "step": 43000 + }, + { + "epoch": 12.536431941289534, + "grad_norm": 0.348994642496109, + "learning_rate": 0.0004498286713286713, + "loss": 3.3379, + "step": 43050 + }, + { + "epoch": 12.550993068903256, + "grad_norm": 0.3396040201187134, + "learning_rate": 0.00044965384615384615, + "loss": 3.3401, + "step": 43100 + }, + { + "epoch": 12.565554196516977, + "grad_norm": 0.3896959722042084, + "learning_rate": 0.00044947902097902095, + "loss": 3.3389, + "step": 43150 + }, + { + "epoch": 12.5801153241307, + "grad_norm": 0.3507692813873291, + "learning_rate": 0.0004493041958041958, + "loss": 3.3419, + "step": 43200 + }, + { + "epoch": 12.594676451744423, + "grad_norm": 0.38180142641067505, + "learning_rate": 0.00044912937062937055, + "loss": 3.3378, + "step": 43250 + }, + { + "epoch": 12.609237579358146, + "grad_norm": 0.3205597996711731, + "learning_rate": 0.0004489545454545454, + "loss": 3.3532, + "step": 43300 + }, + { + "epoch": 12.623798706971868, + "grad_norm": 0.3707713782787323, + "learning_rate": 0.0004487797202797202, + "loss": 3.335, + "step": 43350 + }, + { + "epoch": 12.63835983458559, + "grad_norm": 0.3585401773452759, + "learning_rate": 0.00044860489510489506, + "loss": 3.3376, + "step": 43400 + }, + { + "epoch": 12.652920962199312, + "grad_norm": 0.36372795701026917, + "learning_rate": 0.00044843006993006986, + "loss": 3.3392, + "step": 43450 + }, + { + "epoch": 12.667482089813035, + "grad_norm": 0.36102354526519775, + "learning_rate": 0.0004482552447552447, + "loss": 3.3409, + "step": 43500 + }, + { + "epoch": 12.682043217426758, + "grad_norm": 0.36197811365127563, + "learning_rate": 0.0004480804195804195, + "loss": 3.3425, + "step": 43550 + }, + { + "epoch": 12.69660434504048, + "grad_norm": 0.3554995357990265, + "learning_rate": 0.00044790559440559437, + "loss": 3.3442, + "step": 43600 + }, + { + "epoch": 12.711165472654203, + "grad_norm": 0.35034361481666565, + "learning_rate": 0.00044773076923076917, + "loss": 3.3537, + "step": 43650 + }, + { + "epoch": 12.725726600267924, + "grad_norm": 0.3263697326183319, + "learning_rate": 0.000447555944055944, + "loss": 3.3484, + "step": 43700 + }, + { + "epoch": 12.740287727881647, + "grad_norm": 0.333683043718338, + "learning_rate": 0.0004473811188811189, + "loss": 3.3595, + "step": 43750 + }, + { + "epoch": 12.75484885549537, + "grad_norm": 0.3857356607913971, + "learning_rate": 0.0004472062937062937, + "loss": 3.3512, + "step": 43800 + }, + { + "epoch": 12.769409983109092, + "grad_norm": 0.36390504240989685, + "learning_rate": 0.00044703146853146853, + "loss": 3.3423, + "step": 43850 + }, + { + "epoch": 12.783971110722815, + "grad_norm": 0.361526221036911, + "learning_rate": 0.00044685664335664333, + "loss": 3.3576, + "step": 43900 + }, + { + "epoch": 12.798532238336536, + "grad_norm": 0.3464201092720032, + "learning_rate": 0.0004466818181818182, + "loss": 3.3507, + "step": 43950 + }, + { + "epoch": 12.813093365950259, + "grad_norm": 0.3635178506374359, + "learning_rate": 0.00044650699300699293, + "loss": 3.3648, + "step": 44000 + }, + { + "epoch": 12.813093365950259, + "eval_accuracy": 0.37106509135907784, + "eval_loss": 3.5451130867004395, + "eval_runtime": 179.8666, + "eval_samples_per_second": 92.546, + "eval_steps_per_second": 5.788, + "step": 44000 + }, + { + "epoch": 12.827654493563982, + "grad_norm": 0.35989975929260254, + "learning_rate": 0.0004463321678321678, + "loss": 3.3498, + "step": 44050 + }, + { + "epoch": 12.842215621177704, + "grad_norm": 0.36528387665748596, + "learning_rate": 0.0004461573426573426, + "loss": 3.3446, + "step": 44100 + }, + { + "epoch": 12.856776748791427, + "grad_norm": 0.3401133716106415, + "learning_rate": 0.00044598251748251744, + "loss": 3.3641, + "step": 44150 + }, + { + "epoch": 12.871337876405148, + "grad_norm": 0.34729644656181335, + "learning_rate": 0.00044580769230769224, + "loss": 3.3605, + "step": 44200 + }, + { + "epoch": 12.88589900401887, + "grad_norm": 0.3324040174484253, + "learning_rate": 0.0004456328671328671, + "loss": 3.3516, + "step": 44250 + }, + { + "epoch": 12.900460131632594, + "grad_norm": 0.3577946126461029, + "learning_rate": 0.0004454580419580419, + "loss": 3.3671, + "step": 44300 + }, + { + "epoch": 12.915021259246316, + "grad_norm": 0.3214498460292816, + "learning_rate": 0.00044528321678321674, + "loss": 3.3511, + "step": 44350 + }, + { + "epoch": 12.929582386860039, + "grad_norm": 0.35906174778938293, + "learning_rate": 0.0004451083916083916, + "loss": 3.3554, + "step": 44400 + }, + { + "epoch": 12.944143514473762, + "grad_norm": 0.3261829912662506, + "learning_rate": 0.0004449335664335664, + "loss": 3.3469, + "step": 44450 + }, + { + "epoch": 12.958704642087483, + "grad_norm": 0.340356707572937, + "learning_rate": 0.00044475874125874125, + "loss": 3.3583, + "step": 44500 + }, + { + "epoch": 12.973265769701205, + "grad_norm": 0.33837756514549255, + "learning_rate": 0.00044458391608391605, + "loss": 3.3421, + "step": 44550 + }, + { + "epoch": 12.987826897314928, + "grad_norm": 0.3522256910800934, + "learning_rate": 0.0004444090909090909, + "loss": 3.364, + "step": 44600 + }, + { + "epoch": 13.002329780418195, + "grad_norm": 0.3823665678501129, + "learning_rate": 0.0004442342657342657, + "loss": 3.3249, + "step": 44650 + }, + { + "epoch": 13.016890908031918, + "grad_norm": 0.337575763463974, + "learning_rate": 0.00044405944055944056, + "loss": 3.237, + "step": 44700 + }, + { + "epoch": 13.031452035645641, + "grad_norm": 0.37781283259391785, + "learning_rate": 0.0004438846153846153, + "loss": 3.2572, + "step": 44750 + }, + { + "epoch": 13.046013163259364, + "grad_norm": 0.3561868667602539, + "learning_rate": 0.00044370979020979016, + "loss": 3.2568, + "step": 44800 + }, + { + "epoch": 13.060574290873085, + "grad_norm": 0.34880560636520386, + "learning_rate": 0.00044353496503496496, + "loss": 3.2554, + "step": 44850 + }, + { + "epoch": 13.075135418486807, + "grad_norm": 0.35558071732521057, + "learning_rate": 0.0004433601398601398, + "loss": 3.2505, + "step": 44900 + }, + { + "epoch": 13.08969654610053, + "grad_norm": 0.3940425217151642, + "learning_rate": 0.0004431853146853146, + "loss": 3.2628, + "step": 44950 + }, + { + "epoch": 13.104257673714253, + "grad_norm": 0.37770476937294006, + "learning_rate": 0.00044301048951048946, + "loss": 3.275, + "step": 45000 + }, + { + "epoch": 13.104257673714253, + "eval_accuracy": 0.3701498664606343, + "eval_loss": 3.5611326694488525, + "eval_runtime": 179.8018, + "eval_samples_per_second": 92.58, + "eval_steps_per_second": 5.79, + "step": 45000 + }, + { + "epoch": 13.118818801327976, + "grad_norm": 0.4152877926826477, + "learning_rate": 0.00044283566433566426, + "loss": 3.2714, + "step": 45050 + }, + { + "epoch": 13.133379928941697, + "grad_norm": 0.381516695022583, + "learning_rate": 0.0004426608391608391, + "loss": 3.2652, + "step": 45100 + }, + { + "epoch": 13.14794105655542, + "grad_norm": 0.3559730350971222, + "learning_rate": 0.00044248601398601397, + "loss": 3.2787, + "step": 45150 + }, + { + "epoch": 13.162502184169142, + "grad_norm": 0.36056405305862427, + "learning_rate": 0.00044231118881118877, + "loss": 3.2815, + "step": 45200 + }, + { + "epoch": 13.177063311782865, + "grad_norm": 0.36023563146591187, + "learning_rate": 0.0004421363636363636, + "loss": 3.2925, + "step": 45250 + }, + { + "epoch": 13.191624439396588, + "grad_norm": 0.3763374388217926, + "learning_rate": 0.0004419615384615384, + "loss": 3.2771, + "step": 45300 + }, + { + "epoch": 13.206185567010309, + "grad_norm": 0.36706557869911194, + "learning_rate": 0.0004417867132867133, + "loss": 3.2913, + "step": 45350 + }, + { + "epoch": 13.220746694624031, + "grad_norm": 0.36076489090919495, + "learning_rate": 0.0004416118881118881, + "loss": 3.2803, + "step": 45400 + }, + { + "epoch": 13.235307822237754, + "grad_norm": 0.34233537316322327, + "learning_rate": 0.00044143706293706293, + "loss": 3.2837, + "step": 45450 + }, + { + "epoch": 13.249868949851477, + "grad_norm": 0.37531542778015137, + "learning_rate": 0.0004412622377622377, + "loss": 3.3033, + "step": 45500 + }, + { + "epoch": 13.2644300774652, + "grad_norm": 0.3751412034034729, + "learning_rate": 0.00044108741258741253, + "loss": 3.2912, + "step": 45550 + }, + { + "epoch": 13.27899120507892, + "grad_norm": 0.3610355854034424, + "learning_rate": 0.00044091258741258733, + "loss": 3.299, + "step": 45600 + }, + { + "epoch": 13.293552332692643, + "grad_norm": 0.3749397397041321, + "learning_rate": 0.0004407377622377622, + "loss": 3.294, + "step": 45650 + }, + { + "epoch": 13.308113460306366, + "grad_norm": 0.3670322895050049, + "learning_rate": 0.000440562937062937, + "loss": 3.2975, + "step": 45700 + }, + { + "epoch": 13.322674587920089, + "grad_norm": 0.3655760884284973, + "learning_rate": 0.00044038811188811184, + "loss": 3.2943, + "step": 45750 + }, + { + "epoch": 13.337235715533811, + "grad_norm": 0.3786638379096985, + "learning_rate": 0.0004402132867132867, + "loss": 3.2848, + "step": 45800 + }, + { + "epoch": 13.351796843147532, + "grad_norm": 0.34307000041007996, + "learning_rate": 0.0004400384615384615, + "loss": 3.2926, + "step": 45850 + }, + { + "epoch": 13.366357970761255, + "grad_norm": 0.3485225439071655, + "learning_rate": 0.00043986363636363635, + "loss": 3.2964, + "step": 45900 + }, + { + "epoch": 13.380919098374978, + "grad_norm": 0.34821778535842896, + "learning_rate": 0.00043968881118881115, + "loss": 3.3167, + "step": 45950 + }, + { + "epoch": 13.3954802259887, + "grad_norm": 0.3911685347557068, + "learning_rate": 0.000439513986013986, + "loss": 3.3082, + "step": 46000 + }, + { + "epoch": 13.3954802259887, + "eval_accuracy": 0.3704263737131891, + "eval_loss": 3.556065082550049, + "eval_runtime": 179.915, + "eval_samples_per_second": 92.521, + "eval_steps_per_second": 5.786, + "step": 46000 + }, + { + "epoch": 13.410041353602423, + "grad_norm": 0.3521997928619385, + "learning_rate": 0.0004393391608391608, + "loss": 3.3085, + "step": 46050 + }, + { + "epoch": 13.424602481216146, + "grad_norm": 0.3501444160938263, + "learning_rate": 0.00043916433566433565, + "loss": 3.3154, + "step": 46100 + }, + { + "epoch": 13.439163608829867, + "grad_norm": 0.3678217828273773, + "learning_rate": 0.00043898951048951045, + "loss": 3.3044, + "step": 46150 + }, + { + "epoch": 13.45372473644359, + "grad_norm": 0.397447407245636, + "learning_rate": 0.0004388146853146853, + "loss": 3.3181, + "step": 46200 + }, + { + "epoch": 13.468285864057313, + "grad_norm": 0.3521028757095337, + "learning_rate": 0.00043863986013986005, + "loss": 3.3053, + "step": 46250 + }, + { + "epoch": 13.482846991671035, + "grad_norm": 0.3721345365047455, + "learning_rate": 0.0004384650349650349, + "loss": 3.3111, + "step": 46300 + }, + { + "epoch": 13.497408119284758, + "grad_norm": 0.3641456365585327, + "learning_rate": 0.0004382902097902097, + "loss": 3.3039, + "step": 46350 + }, + { + "epoch": 13.51196924689848, + "grad_norm": 0.3499159812927246, + "learning_rate": 0.00043811538461538456, + "loss": 3.3111, + "step": 46400 + }, + { + "epoch": 13.526530374512202, + "grad_norm": 0.35478129982948303, + "learning_rate": 0.0004379405594405594, + "loss": 3.3228, + "step": 46450 + }, + { + "epoch": 13.541091502125925, + "grad_norm": 0.3691585958003998, + "learning_rate": 0.0004377657342657342, + "loss": 3.3195, + "step": 46500 + }, + { + "epoch": 13.555652629739647, + "grad_norm": 0.37163329124450684, + "learning_rate": 0.00043759090909090907, + "loss": 3.3317, + "step": 46550 + }, + { + "epoch": 13.57021375735337, + "grad_norm": 0.3679807782173157, + "learning_rate": 0.00043741608391608387, + "loss": 3.3153, + "step": 46600 + }, + { + "epoch": 13.584774884967091, + "grad_norm": 0.34902745485305786, + "learning_rate": 0.0004372412587412587, + "loss": 3.3289, + "step": 46650 + }, + { + "epoch": 13.599336012580814, + "grad_norm": 0.4036466181278229, + "learning_rate": 0.0004370664335664335, + "loss": 3.3076, + "step": 46700 + }, + { + "epoch": 13.613897140194537, + "grad_norm": 0.35974839329719543, + "learning_rate": 0.0004368916083916084, + "loss": 3.3236, + "step": 46750 + }, + { + "epoch": 13.62845826780826, + "grad_norm": 0.35530760884284973, + "learning_rate": 0.0004367167832167832, + "loss": 3.3195, + "step": 46800 + }, + { + "epoch": 13.643019395421982, + "grad_norm": 0.3408248722553253, + "learning_rate": 0.00043654195804195803, + "loss": 3.3229, + "step": 46850 + }, + { + "epoch": 13.657580523035705, + "grad_norm": 0.3587759733200073, + "learning_rate": 0.00043636713286713283, + "loss": 3.3369, + "step": 46900 + }, + { + "epoch": 13.672141650649426, + "grad_norm": 0.37035682797431946, + "learning_rate": 0.0004361923076923077, + "loss": 3.3287, + "step": 46950 + }, + { + "epoch": 13.686702778263149, + "grad_norm": 0.3677375316619873, + "learning_rate": 0.00043601748251748243, + "loss": 3.3173, + "step": 47000 + }, + { + "epoch": 13.686702778263149, + "eval_accuracy": 0.3709606957637255, + "eval_loss": 3.548353672027588, + "eval_runtime": 179.8113, + "eval_samples_per_second": 92.575, + "eval_steps_per_second": 5.789, + "step": 47000 + }, + { + "epoch": 13.701263905876871, + "grad_norm": 0.3480958640575409, + "learning_rate": 0.00043584265734265734, + "loss": 3.3378, + "step": 47050 + }, + { + "epoch": 13.715825033490594, + "grad_norm": 0.3933262825012207, + "learning_rate": 0.0004356678321678321, + "loss": 3.3263, + "step": 47100 + }, + { + "epoch": 13.730386161104317, + "grad_norm": 0.3582457900047302, + "learning_rate": 0.00043549300699300694, + "loss": 3.3258, + "step": 47150 + }, + { + "epoch": 13.744947288718038, + "grad_norm": 0.3504882752895355, + "learning_rate": 0.0004353181818181818, + "loss": 3.3322, + "step": 47200 + }, + { + "epoch": 13.75950841633176, + "grad_norm": 0.38114842772483826, + "learning_rate": 0.0004351433566433566, + "loss": 3.3381, + "step": 47250 + }, + { + "epoch": 13.774069543945483, + "grad_norm": 0.3709891736507416, + "learning_rate": 0.00043496853146853144, + "loss": 3.328, + "step": 47300 + }, + { + "epoch": 13.788630671559206, + "grad_norm": 0.3773655295372009, + "learning_rate": 0.00043479370629370624, + "loss": 3.3324, + "step": 47350 + }, + { + "epoch": 13.803191799172929, + "grad_norm": 0.34999653697013855, + "learning_rate": 0.0004346188811188811, + "loss": 3.3365, + "step": 47400 + }, + { + "epoch": 13.81775292678665, + "grad_norm": 0.3433198630809784, + "learning_rate": 0.0004344440559440559, + "loss": 3.328, + "step": 47450 + }, + { + "epoch": 13.832314054400372, + "grad_norm": 0.3613443970680237, + "learning_rate": 0.00043426923076923075, + "loss": 3.3452, + "step": 47500 + }, + { + "epoch": 13.846875182014095, + "grad_norm": 0.3868599832057953, + "learning_rate": 0.00043409440559440555, + "loss": 3.3437, + "step": 47550 + }, + { + "epoch": 13.861436309627818, + "grad_norm": 0.3586479127407074, + "learning_rate": 0.0004339195804195804, + "loss": 3.3467, + "step": 47600 + }, + { + "epoch": 13.87599743724154, + "grad_norm": 0.38852712512016296, + "learning_rate": 0.0004337447552447552, + "loss": 3.3367, + "step": 47650 + }, + { + "epoch": 13.890558564855262, + "grad_norm": 0.3898317515850067, + "learning_rate": 0.00043356993006993006, + "loss": 3.3341, + "step": 47700 + }, + { + "epoch": 13.905119692468984, + "grad_norm": 0.3417160212993622, + "learning_rate": 0.0004333951048951048, + "loss": 3.342, + "step": 47750 + }, + { + "epoch": 13.919680820082707, + "grad_norm": 0.36018186807632446, + "learning_rate": 0.0004332202797202797, + "loss": 3.3444, + "step": 47800 + }, + { + "epoch": 13.93424194769643, + "grad_norm": 0.3359622061252594, + "learning_rate": 0.00043304545454545456, + "loss": 3.3351, + "step": 47850 + }, + { + "epoch": 13.948803075310153, + "grad_norm": 0.35396841168403625, + "learning_rate": 0.0004328706293706293, + "loss": 3.3447, + "step": 47900 + }, + { + "epoch": 13.963364202923874, + "grad_norm": 0.3624133765697479, + "learning_rate": 0.00043269580419580416, + "loss": 3.3484, + "step": 47950 + }, + { + "epoch": 13.977925330537596, + "grad_norm": 0.36250928044319153, + "learning_rate": 0.00043252097902097896, + "loss": 3.3449, + "step": 48000 + }, + { + "epoch": 13.977925330537596, + "eval_accuracy": 0.37173649140981785, + "eval_loss": 3.539720058441162, + "eval_runtime": 179.6484, + "eval_samples_per_second": 92.659, + "eval_steps_per_second": 5.795, + "step": 48000 + }, + { + "epoch": 13.992486458151319, + "grad_norm": 0.3764212727546692, + "learning_rate": 0.0004323461538461538, + "loss": 3.3403, + "step": 48050 + }, + { + "epoch": 14.006989341254586, + "grad_norm": 0.353367418050766, + "learning_rate": 0.0004321713286713286, + "loss": 3.2937, + "step": 48100 + }, + { + "epoch": 14.021550468868309, + "grad_norm": 0.371685266494751, + "learning_rate": 0.00043199650349650347, + "loss": 3.2138, + "step": 48150 + }, + { + "epoch": 14.036111596482032, + "grad_norm": 0.3745143711566925, + "learning_rate": 0.00043182167832167827, + "loss": 3.2321, + "step": 48200 + }, + { + "epoch": 14.050672724095755, + "grad_norm": 0.3856935203075409, + "learning_rate": 0.0004316468531468531, + "loss": 3.2316, + "step": 48250 + }, + { + "epoch": 14.065233851709475, + "grad_norm": 0.36421141028404236, + "learning_rate": 0.0004314720279720279, + "loss": 3.2413, + "step": 48300 + }, + { + "epoch": 14.079794979323198, + "grad_norm": 0.3954959809780121, + "learning_rate": 0.0004312972027972028, + "loss": 3.2449, + "step": 48350 + }, + { + "epoch": 14.094356106936921, + "grad_norm": 0.36783069372177124, + "learning_rate": 0.0004311223776223776, + "loss": 3.2469, + "step": 48400 + }, + { + "epoch": 14.108917234550644, + "grad_norm": 0.3712175786495209, + "learning_rate": 0.00043094755244755243, + "loss": 3.2541, + "step": 48450 + }, + { + "epoch": 14.123478362164366, + "grad_norm": 0.3833962678909302, + "learning_rate": 0.0004307727272727272, + "loss": 3.2665, + "step": 48500 + }, + { + "epoch": 14.13803948977809, + "grad_norm": 0.3755820095539093, + "learning_rate": 0.0004305979020979021, + "loss": 3.2583, + "step": 48550 + }, + { + "epoch": 14.15260061739181, + "grad_norm": 0.3721693158149719, + "learning_rate": 0.00043042307692307694, + "loss": 3.2461, + "step": 48600 + }, + { + "epoch": 14.167161745005533, + "grad_norm": 0.36444807052612305, + "learning_rate": 0.0004302482517482517, + "loss": 3.2672, + "step": 48650 + }, + { + "epoch": 14.181722872619256, + "grad_norm": 0.3308829367160797, + "learning_rate": 0.00043007342657342654, + "loss": 3.2678, + "step": 48700 + }, + { + "epoch": 14.196284000232978, + "grad_norm": 0.3922617435455322, + "learning_rate": 0.00042989860139860134, + "loss": 3.2674, + "step": 48750 + }, + { + "epoch": 14.210845127846701, + "grad_norm": 0.3621092140674591, + "learning_rate": 0.0004297237762237762, + "loss": 3.2708, + "step": 48800 + }, + { + "epoch": 14.225406255460422, + "grad_norm": 0.36468708515167236, + "learning_rate": 0.000429548951048951, + "loss": 3.2829, + "step": 48850 + }, + { + "epoch": 14.239967383074145, + "grad_norm": 0.37252795696258545, + "learning_rate": 0.00042937412587412585, + "loss": 3.2624, + "step": 48900 + }, + { + "epoch": 14.254528510687868, + "grad_norm": 0.38541239500045776, + "learning_rate": 0.00042919930069930065, + "loss": 3.2773, + "step": 48950 + }, + { + "epoch": 14.26908963830159, + "grad_norm": 0.35144972801208496, + "learning_rate": 0.0004290244755244755, + "loss": 3.2796, + "step": 49000 + }, + { + "epoch": 14.26908963830159, + "eval_accuracy": 0.3708721711203693, + "eval_loss": 3.5575828552246094, + "eval_runtime": 179.8637, + "eval_samples_per_second": 92.548, + "eval_steps_per_second": 5.788, + "step": 49000 + }, + { + "epoch": 14.283650765915313, + "grad_norm": 0.36003535985946655, + "learning_rate": 0.0004288496503496503, + "loss": 3.2822, + "step": 49050 + }, + { + "epoch": 14.298211893529034, + "grad_norm": 0.3763592839241028, + "learning_rate": 0.00042867482517482515, + "loss": 3.2788, + "step": 49100 + }, + { + "epoch": 14.312773021142757, + "grad_norm": 0.35325881838798523, + "learning_rate": 0.00042849999999999995, + "loss": 3.2867, + "step": 49150 + }, + { + "epoch": 14.32733414875648, + "grad_norm": 0.36146456003189087, + "learning_rate": 0.0004283251748251748, + "loss": 3.2838, + "step": 49200 + }, + { + "epoch": 14.341895276370202, + "grad_norm": 0.35914602875709534, + "learning_rate": 0.00042815034965034966, + "loss": 3.2862, + "step": 49250 + }, + { + "epoch": 14.356456403983925, + "grad_norm": 0.35315200686454773, + "learning_rate": 0.00042797552447552446, + "loss": 3.2912, + "step": 49300 + }, + { + "epoch": 14.371017531597648, + "grad_norm": 0.355522483587265, + "learning_rate": 0.0004278006993006993, + "loss": 3.2925, + "step": 49350 + }, + { + "epoch": 14.385578659211369, + "grad_norm": 0.3386734426021576, + "learning_rate": 0.00042762587412587406, + "loss": 3.2745, + "step": 49400 + }, + { + "epoch": 14.400139786825092, + "grad_norm": 0.37271982431411743, + "learning_rate": 0.0004274510489510489, + "loss": 3.2914, + "step": 49450 + }, + { + "epoch": 14.414700914438814, + "grad_norm": 0.3474613130092621, + "learning_rate": 0.0004272762237762237, + "loss": 3.2917, + "step": 49500 + }, + { + "epoch": 14.429262042052537, + "grad_norm": 0.37980303168296814, + "learning_rate": 0.00042710139860139857, + "loss": 3.2942, + "step": 49550 + }, + { + "epoch": 14.44382316966626, + "grad_norm": 0.3631215989589691, + "learning_rate": 0.00042692657342657337, + "loss": 3.292, + "step": 49600 + }, + { + "epoch": 14.45838429727998, + "grad_norm": 0.3490867614746094, + "learning_rate": 0.0004267517482517482, + "loss": 3.2975, + "step": 49650 + }, + { + "epoch": 14.472945424893704, + "grad_norm": 0.3449762761592865, + "learning_rate": 0.000426576923076923, + "loss": 3.2993, + "step": 49700 + }, + { + "epoch": 14.487506552507426, + "grad_norm": 0.3923116624355316, + "learning_rate": 0.0004264020979020979, + "loss": 3.3112, + "step": 49750 + }, + { + "epoch": 14.502067680121149, + "grad_norm": 0.35419827699661255, + "learning_rate": 0.0004262272727272727, + "loss": 3.2972, + "step": 49800 + }, + { + "epoch": 14.516628807734872, + "grad_norm": 0.3563985824584961, + "learning_rate": 0.00042605244755244753, + "loss": 3.3037, + "step": 49850 + }, + { + "epoch": 14.531189935348593, + "grad_norm": 0.37263786792755127, + "learning_rate": 0.00042587762237762233, + "loss": 3.3085, + "step": 49900 + }, + { + "epoch": 14.545751062962315, + "grad_norm": 0.3686966896057129, + "learning_rate": 0.0004257027972027972, + "loss": 3.3032, + "step": 49950 + }, + { + "epoch": 14.560312190576038, + "grad_norm": 0.35801100730895996, + "learning_rate": 0.00042552797202797204, + "loss": 3.3156, + "step": 50000 + }, + { + "epoch": 14.560312190576038, + "eval_accuracy": 0.3712233306286096, + "eval_loss": 3.5494096279144287, + "eval_runtime": 179.8895, + "eval_samples_per_second": 92.535, + "eval_steps_per_second": 5.787, + "step": 50000 + }, + { + "epoch": 14.574873318189761, + "grad_norm": 0.36634013056755066, + "learning_rate": 0.00042535314685314684, + "loss": 3.3055, + "step": 50050 + }, + { + "epoch": 14.589434445803484, + "grad_norm": 0.38294774293899536, + "learning_rate": 0.0004251783216783217, + "loss": 3.2938, + "step": 50100 + }, + { + "epoch": 14.603995573417205, + "grad_norm": 0.36177265644073486, + "learning_rate": 0.00042500349650349643, + "loss": 3.3063, + "step": 50150 + }, + { + "epoch": 14.618556701030927, + "grad_norm": 0.38831570744514465, + "learning_rate": 0.0004248286713286713, + "loss": 3.3124, + "step": 50200 + }, + { + "epoch": 14.63311782864465, + "grad_norm": 0.3788811266422272, + "learning_rate": 0.0004246538461538461, + "loss": 3.3095, + "step": 50250 + }, + { + "epoch": 14.647678956258373, + "grad_norm": 0.37422165274620056, + "learning_rate": 0.00042447902097902094, + "loss": 3.3222, + "step": 50300 + }, + { + "epoch": 14.662240083872096, + "grad_norm": 0.34806740283966064, + "learning_rate": 0.00042430419580419574, + "loss": 3.3083, + "step": 50350 + }, + { + "epoch": 14.676801211485817, + "grad_norm": 0.38060638308525085, + "learning_rate": 0.0004241293706293706, + "loss": 3.3136, + "step": 50400 + }, + { + "epoch": 14.69136233909954, + "grad_norm": 0.35740432143211365, + "learning_rate": 0.0004239545454545454, + "loss": 3.318, + "step": 50450 + }, + { + "epoch": 14.705923466713262, + "grad_norm": 0.3618071377277374, + "learning_rate": 0.00042377972027972025, + "loss": 3.3257, + "step": 50500 + }, + { + "epoch": 14.720484594326985, + "grad_norm": 0.5806776285171509, + "learning_rate": 0.00042360489510489505, + "loss": 3.3143, + "step": 50550 + }, + { + "epoch": 14.735045721940708, + "grad_norm": 0.38723915815353394, + "learning_rate": 0.0004234300699300699, + "loss": 3.3116, + "step": 50600 + }, + { + "epoch": 14.749606849554429, + "grad_norm": 0.3388756215572357, + "learning_rate": 0.00042325524475524476, + "loss": 3.3198, + "step": 50650 + }, + { + "epoch": 14.764167977168151, + "grad_norm": 0.3440285325050354, + "learning_rate": 0.00042308041958041956, + "loss": 3.3098, + "step": 50700 + }, + { + "epoch": 14.778729104781874, + "grad_norm": 0.37004444003105164, + "learning_rate": 0.0004229055944055944, + "loss": 3.329, + "step": 50750 + }, + { + "epoch": 14.793290232395597, + "grad_norm": 0.34428054094314575, + "learning_rate": 0.0004227307692307692, + "loss": 3.3177, + "step": 50800 + }, + { + "epoch": 14.80785136000932, + "grad_norm": 0.36497217416763306, + "learning_rate": 0.00042255594405594406, + "loss": 3.3178, + "step": 50850 + }, + { + "epoch": 14.822412487623042, + "grad_norm": 0.3842722475528717, + "learning_rate": 0.0004223811188811188, + "loss": 3.309, + "step": 50900 + }, + { + "epoch": 14.836973615236763, + "grad_norm": 0.35773956775665283, + "learning_rate": 0.00042220629370629366, + "loss": 3.3148, + "step": 50950 + }, + { + "epoch": 14.851534742850486, + "grad_norm": 0.3422044813632965, + "learning_rate": 0.00042203146853146846, + "loss": 3.3248, + "step": 51000 + }, + { + "epoch": 14.851534742850486, + "eval_accuracy": 0.37198043382012874, + "eval_loss": 3.540163516998291, + "eval_runtime": 179.888, + "eval_samples_per_second": 92.535, + "eval_steps_per_second": 5.787, + "step": 51000 + }, + { + "epoch": 14.866095870464209, + "grad_norm": 0.34656912088394165, + "learning_rate": 0.0004218566433566433, + "loss": 3.3221, + "step": 51050 + }, + { + "epoch": 14.880656998077932, + "grad_norm": 0.3491133749485016, + "learning_rate": 0.0004216818181818181, + "loss": 3.3193, + "step": 51100 + }, + { + "epoch": 14.895218125691654, + "grad_norm": 0.3695593774318695, + "learning_rate": 0.00042150699300699297, + "loss": 3.3278, + "step": 51150 + }, + { + "epoch": 14.909779253305375, + "grad_norm": 0.3540794253349304, + "learning_rate": 0.00042133216783216777, + "loss": 3.3186, + "step": 51200 + }, + { + "epoch": 14.924340380919098, + "grad_norm": 0.3513336479663849, + "learning_rate": 0.0004211573426573426, + "loss": 3.329, + "step": 51250 + }, + { + "epoch": 14.93890150853282, + "grad_norm": 0.3728017807006836, + "learning_rate": 0.0004209825174825175, + "loss": 3.337, + "step": 51300 + }, + { + "epoch": 14.953462636146543, + "grad_norm": 0.3794797956943512, + "learning_rate": 0.0004208076923076923, + "loss": 3.3271, + "step": 51350 + }, + { + "epoch": 14.968023763760266, + "grad_norm": 0.3826712667942047, + "learning_rate": 0.00042063286713286713, + "loss": 3.3207, + "step": 51400 + }, + { + "epoch": 14.982584891373987, + "grad_norm": 0.3683113753795624, + "learning_rate": 0.00042045804195804193, + "loss": 3.3222, + "step": 51450 + }, + { + "epoch": 14.99714601898771, + "grad_norm": 0.3468119204044342, + "learning_rate": 0.0004202832167832168, + "loss": 3.3369, + "step": 51500 + }, + { + "epoch": 15.011648902090977, + "grad_norm": 0.3682543635368347, + "learning_rate": 0.0004201083916083916, + "loss": 3.2329, + "step": 51550 + }, + { + "epoch": 15.0262100297047, + "grad_norm": 0.38179928064346313, + "learning_rate": 0.00041993356643356644, + "loss": 3.2202, + "step": 51600 + }, + { + "epoch": 15.040771157318423, + "grad_norm": 0.3646683692932129, + "learning_rate": 0.0004197587412587412, + "loss": 3.2306, + "step": 51650 + }, + { + "epoch": 15.055332284932145, + "grad_norm": 0.37061500549316406, + "learning_rate": 0.00041958391608391604, + "loss": 3.2339, + "step": 51700 + }, + { + "epoch": 15.069893412545868, + "grad_norm": 0.4042431116104126, + "learning_rate": 0.00041940909090909084, + "loss": 3.242, + "step": 51750 + }, + { + "epoch": 15.084454540159589, + "grad_norm": 0.3703981339931488, + "learning_rate": 0.0004192342657342657, + "loss": 3.2247, + "step": 51800 + }, + { + "epoch": 15.099015667773312, + "grad_norm": 0.38997241854667664, + "learning_rate": 0.0004190594405594405, + "loss": 3.2418, + "step": 51850 + }, + { + "epoch": 15.113576795387035, + "grad_norm": 0.3963125944137573, + "learning_rate": 0.00041888461538461535, + "loss": 3.2503, + "step": 51900 + }, + { + "epoch": 15.128137923000757, + "grad_norm": 0.3787480294704437, + "learning_rate": 0.00041870979020979015, + "loss": 3.2543, + "step": 51950 + }, + { + "epoch": 15.14269905061448, + "grad_norm": 0.3614787459373474, + "learning_rate": 0.000418534965034965, + "loss": 3.2411, + "step": 52000 + }, + { + "epoch": 15.14269905061448, + "eval_accuracy": 0.3713345448551899, + "eval_loss": 3.558533191680908, + "eval_runtime": 179.6973, + "eval_samples_per_second": 92.634, + "eval_steps_per_second": 5.793, + "step": 52000 + }, + { + "epoch": 15.157260178228203, + "grad_norm": 0.37162527441978455, + "learning_rate": 0.00041836013986013985, + "loss": 3.2484, + "step": 52050 + }, + { + "epoch": 15.171821305841924, + "grad_norm": 0.37970441579818726, + "learning_rate": 0.00041818531468531465, + "loss": 3.251, + "step": 52100 + }, + { + "epoch": 15.186382433455647, + "grad_norm": 0.39577314257621765, + "learning_rate": 0.0004180104895104895, + "loss": 3.2534, + "step": 52150 + }, + { + "epoch": 15.20094356106937, + "grad_norm": 0.39917322993278503, + "learning_rate": 0.0004178356643356643, + "loss": 3.2581, + "step": 52200 + }, + { + "epoch": 15.215504688683092, + "grad_norm": 0.3855136036872864, + "learning_rate": 0.00041766083916083916, + "loss": 3.2488, + "step": 52250 + }, + { + "epoch": 15.230065816296815, + "grad_norm": 0.3883339762687683, + "learning_rate": 0.00041748601398601396, + "loss": 3.2616, + "step": 52300 + }, + { + "epoch": 15.244626943910536, + "grad_norm": 0.3921905755996704, + "learning_rate": 0.0004173111888111888, + "loss": 3.2664, + "step": 52350 + }, + { + "epoch": 15.259188071524258, + "grad_norm": 0.3932233154773712, + "learning_rate": 0.00041713636363636356, + "loss": 3.2652, + "step": 52400 + }, + { + "epoch": 15.273749199137981, + "grad_norm": 0.35905158519744873, + "learning_rate": 0.0004169615384615384, + "loss": 3.2578, + "step": 52450 + }, + { + "epoch": 15.288310326751704, + "grad_norm": 0.3927863538265228, + "learning_rate": 0.0004167867132867132, + "loss": 3.2658, + "step": 52500 + }, + { + "epoch": 15.302871454365427, + "grad_norm": 0.3803752660751343, + "learning_rate": 0.00041661188811188807, + "loss": 3.2791, + "step": 52550 + }, + { + "epoch": 15.317432581979148, + "grad_norm": 0.37040677666664124, + "learning_rate": 0.00041643706293706287, + "loss": 3.2675, + "step": 52600 + }, + { + "epoch": 15.33199370959287, + "grad_norm": 0.3753259778022766, + "learning_rate": 0.0004162622377622377, + "loss": 3.2588, + "step": 52650 + }, + { + "epoch": 15.346554837206593, + "grad_norm": 0.333647757768631, + "learning_rate": 0.0004160874125874126, + "loss": 3.2638, + "step": 52700 + }, + { + "epoch": 15.361115964820316, + "grad_norm": 0.4069964587688446, + "learning_rate": 0.0004159125874125874, + "loss": 3.2788, + "step": 52750 + }, + { + "epoch": 15.375677092434039, + "grad_norm": 0.3561253547668457, + "learning_rate": 0.00041573776223776223, + "loss": 3.269, + "step": 52800 + }, + { + "epoch": 15.39023822004776, + "grad_norm": 0.3721391558647156, + "learning_rate": 0.00041556293706293703, + "loss": 3.2708, + "step": 52850 + }, + { + "epoch": 15.404799347661482, + "grad_norm": 0.3592582941055298, + "learning_rate": 0.0004153881118881119, + "loss": 3.2836, + "step": 52900 + }, + { + "epoch": 15.419360475275205, + "grad_norm": 0.3861062824726105, + "learning_rate": 0.0004152132867132867, + "loss": 3.2757, + "step": 52950 + }, + { + "epoch": 15.433921602888928, + "grad_norm": 0.35728558897972107, + "learning_rate": 0.00041503846153846154, + "loss": 3.2784, + "step": 53000 + }, + { + "epoch": 15.433921602888928, + "eval_accuracy": 0.3720729555921358, + "eval_loss": 3.5504865646362305, + "eval_runtime": 179.8078, + "eval_samples_per_second": 92.577, + "eval_steps_per_second": 5.79, + "step": 53000 + }, + { + "epoch": 15.44848273050265, + "grad_norm": 0.36359742283821106, + "learning_rate": 0.00041486363636363634, + "loss": 3.2761, + "step": 53050 + }, + { + "epoch": 15.463043858116373, + "grad_norm": 0.375919908285141, + "learning_rate": 0.0004146888111888112, + "loss": 3.2761, + "step": 53100 + }, + { + "epoch": 15.477604985730094, + "grad_norm": 0.3810451924800873, + "learning_rate": 0.00041451398601398593, + "loss": 3.2829, + "step": 53150 + }, + { + "epoch": 15.492166113343817, + "grad_norm": 0.3776414394378662, + "learning_rate": 0.0004143391608391608, + "loss": 3.3008, + "step": 53200 + }, + { + "epoch": 15.50672724095754, + "grad_norm": 0.4121120274066925, + "learning_rate": 0.0004141643356643356, + "loss": 3.2937, + "step": 53250 + }, + { + "epoch": 15.521288368571263, + "grad_norm": 0.37099769711494446, + "learning_rate": 0.00041398951048951044, + "loss": 3.2832, + "step": 53300 + }, + { + "epoch": 15.535849496184985, + "grad_norm": 0.3595023453235626, + "learning_rate": 0.00041381468531468524, + "loss": 3.2857, + "step": 53350 + }, + { + "epoch": 15.550410623798706, + "grad_norm": 0.377890408039093, + "learning_rate": 0.0004136398601398601, + "loss": 3.2899, + "step": 53400 + }, + { + "epoch": 15.564971751412429, + "grad_norm": 0.37558192014694214, + "learning_rate": 0.00041346503496503495, + "loss": 3.298, + "step": 53450 + }, + { + "epoch": 15.579532879026152, + "grad_norm": 0.38341572880744934, + "learning_rate": 0.00041329020979020975, + "loss": 3.297, + "step": 53500 + }, + { + "epoch": 15.594094006639875, + "grad_norm": 0.3785656690597534, + "learning_rate": 0.0004131153846153846, + "loss": 3.2965, + "step": 53550 + }, + { + "epoch": 15.608655134253597, + "grad_norm": 0.3675081133842468, + "learning_rate": 0.0004129405594405594, + "loss": 3.2942, + "step": 53600 + }, + { + "epoch": 15.623216261867318, + "grad_norm": 0.35816362500190735, + "learning_rate": 0.00041276573426573426, + "loss": 3.2965, + "step": 53650 + }, + { + "epoch": 15.637777389481041, + "grad_norm": 0.3761393129825592, + "learning_rate": 0.00041259090909090906, + "loss": 3.2902, + "step": 53700 + }, + { + "epoch": 15.652338517094764, + "grad_norm": 0.4039056897163391, + "learning_rate": 0.0004124160839160839, + "loss": 3.2882, + "step": 53750 + }, + { + "epoch": 15.666899644708487, + "grad_norm": 0.3910631537437439, + "learning_rate": 0.0004122412587412587, + "loss": 3.2949, + "step": 53800 + }, + { + "epoch": 15.68146077232221, + "grad_norm": 0.38190576434135437, + "learning_rate": 0.00041206643356643356, + "loss": 3.3012, + "step": 53850 + }, + { + "epoch": 15.69602189993593, + "grad_norm": 0.34941723942756653, + "learning_rate": 0.0004118916083916083, + "loss": 3.293, + "step": 53900 + }, + { + "epoch": 15.710583027549653, + "grad_norm": 0.3839094936847687, + "learning_rate": 0.00041171678321678316, + "loss": 3.3087, + "step": 53950 + }, + { + "epoch": 15.725144155163376, + "grad_norm": 0.41564396023750305, + "learning_rate": 0.00041154195804195796, + "loss": 3.2982, + "step": 54000 + }, + { + "epoch": 15.725144155163376, + "eval_accuracy": 0.3723717997400926, + "eval_loss": 3.539506673812866, + "eval_runtime": 179.7562, + "eval_samples_per_second": 92.603, + "eval_steps_per_second": 5.791, + "step": 54000 + }, + { + "epoch": 15.739705282777098, + "grad_norm": 0.3858591616153717, + "learning_rate": 0.0004113671328671328, + "loss": 3.3119, + "step": 54050 + }, + { + "epoch": 15.754266410390821, + "grad_norm": 0.39614084362983704, + "learning_rate": 0.00041119230769230767, + "loss": 3.2928, + "step": 54100 + }, + { + "epoch": 15.768827538004544, + "grad_norm": 0.3449946641921997, + "learning_rate": 0.00041101748251748247, + "loss": 3.3069, + "step": 54150 + }, + { + "epoch": 15.783388665618265, + "grad_norm": 0.3629119098186493, + "learning_rate": 0.0004108426573426573, + "loss": 3.3068, + "step": 54200 + }, + { + "epoch": 15.797949793231988, + "grad_norm": 0.36593127250671387, + "learning_rate": 0.0004106678321678321, + "loss": 3.2962, + "step": 54250 + }, + { + "epoch": 15.81251092084571, + "grad_norm": 0.38235387206077576, + "learning_rate": 0.000410493006993007, + "loss": 3.3174, + "step": 54300 + }, + { + "epoch": 15.827072048459433, + "grad_norm": 0.3705120086669922, + "learning_rate": 0.0004103181818181818, + "loss": 3.3, + "step": 54350 + }, + { + "epoch": 15.841633176073156, + "grad_norm": 0.3669654428958893, + "learning_rate": 0.00041014335664335663, + "loss": 3.3017, + "step": 54400 + }, + { + "epoch": 15.856194303686877, + "grad_norm": 0.36617377400398254, + "learning_rate": 0.00040996853146853143, + "loss": 3.3104, + "step": 54450 + }, + { + "epoch": 15.8707554313006, + "grad_norm": 0.3692075312137604, + "learning_rate": 0.0004097937062937063, + "loss": 3.3003, + "step": 54500 + }, + { + "epoch": 15.885316558914322, + "grad_norm": 0.35389435291290283, + "learning_rate": 0.0004096188811188811, + "loss": 3.3191, + "step": 54550 + }, + { + "epoch": 15.899877686528045, + "grad_norm": 0.36336150765419006, + "learning_rate": 0.00040944405594405594, + "loss": 3.3095, + "step": 54600 + }, + { + "epoch": 15.914438814141768, + "grad_norm": 0.3665081262588501, + "learning_rate": 0.0004092692307692307, + "loss": 3.3147, + "step": 54650 + }, + { + "epoch": 15.928999941755489, + "grad_norm": 0.3585745394229889, + "learning_rate": 0.00040909440559440554, + "loss": 3.3144, + "step": 54700 + }, + { + "epoch": 15.943561069369212, + "grad_norm": 0.3691560924053192, + "learning_rate": 0.00040891958041958034, + "loss": 3.3034, + "step": 54750 + }, + { + "epoch": 15.958122196982934, + "grad_norm": 0.38911911845207214, + "learning_rate": 0.0004087447552447552, + "loss": 3.3129, + "step": 54800 + }, + { + "epoch": 15.972683324596657, + "grad_norm": 0.3935786783695221, + "learning_rate": 0.00040856993006993005, + "loss": 3.3115, + "step": 54850 + }, + { + "epoch": 15.98724445221038, + "grad_norm": 0.35671958327293396, + "learning_rate": 0.00040839510489510485, + "loss": 3.3134, + "step": 54900 + }, + { + "epoch": 16.001747335313645, + "grad_norm": 0.3811984360218048, + "learning_rate": 0.0004082202797202797, + "loss": 3.3019, + "step": 54950 + }, + { + "epoch": 16.01630846292737, + "grad_norm": 0.3744508922100067, + "learning_rate": 0.0004080454545454545, + "loss": 3.2021, + "step": 55000 + }, + { + "epoch": 16.01630846292737, + "eval_accuracy": 0.37187298159698456, + "eval_loss": 3.5502474308013916, + "eval_runtime": 179.8056, + "eval_samples_per_second": 92.578, + "eval_steps_per_second": 5.79, + "step": 55000 + }, + { + "epoch": 16.03086959054109, + "grad_norm": 0.3818357586860657, + "learning_rate": 0.00040787062937062935, + "loss": 3.1967, + "step": 55050 + }, + { + "epoch": 16.045430718154815, + "grad_norm": 0.38330283761024475, + "learning_rate": 0.00040769580419580415, + "loss": 3.2128, + "step": 55100 + }, + { + "epoch": 16.059991845768536, + "grad_norm": 0.4017890989780426, + "learning_rate": 0.000407520979020979, + "loss": 3.2118, + "step": 55150 + }, + { + "epoch": 16.074552973382257, + "grad_norm": 0.3700309693813324, + "learning_rate": 0.0004073461538461538, + "loss": 3.2218, + "step": 55200 + }, + { + "epoch": 16.08911410099598, + "grad_norm": 0.38918474316596985, + "learning_rate": 0.00040717132867132866, + "loss": 3.226, + "step": 55250 + }, + { + "epoch": 16.103675228609703, + "grad_norm": 0.3746088743209839, + "learning_rate": 0.00040699650349650346, + "loss": 3.2235, + "step": 55300 + }, + { + "epoch": 16.118236356223427, + "grad_norm": 0.3673935532569885, + "learning_rate": 0.0004068216783216783, + "loss": 3.2331, + "step": 55350 + }, + { + "epoch": 16.132797483837148, + "grad_norm": 0.3882383704185486, + "learning_rate": 0.00040664685314685306, + "loss": 3.2304, + "step": 55400 + }, + { + "epoch": 16.14735861145087, + "grad_norm": 0.39152103662490845, + "learning_rate": 0.0004064720279720279, + "loss": 3.2356, + "step": 55450 + }, + { + "epoch": 16.161919739064594, + "grad_norm": 0.36934158205986023, + "learning_rate": 0.00040629720279720277, + "loss": 3.2385, + "step": 55500 + }, + { + "epoch": 16.176480866678315, + "grad_norm": 0.37601685523986816, + "learning_rate": 0.00040612237762237757, + "loss": 3.2393, + "step": 55550 + }, + { + "epoch": 16.19104199429204, + "grad_norm": 0.3896099328994751, + "learning_rate": 0.0004059475524475524, + "loss": 3.2378, + "step": 55600 + }, + { + "epoch": 16.20560312190576, + "grad_norm": 0.4255818724632263, + "learning_rate": 0.0004057727272727272, + "loss": 3.2477, + "step": 55650 + }, + { + "epoch": 16.22016424951948, + "grad_norm": 0.3997867703437805, + "learning_rate": 0.0004055979020979021, + "loss": 3.248, + "step": 55700 + }, + { + "epoch": 16.234725377133206, + "grad_norm": 0.3857765197753906, + "learning_rate": 0.0004054230769230769, + "loss": 3.2516, + "step": 55750 + }, + { + "epoch": 16.249286504746927, + "grad_norm": 0.3792831003665924, + "learning_rate": 0.00040524825174825173, + "loss": 3.25, + "step": 55800 + }, + { + "epoch": 16.26384763236065, + "grad_norm": 0.3990461230278015, + "learning_rate": 0.00040507342657342653, + "loss": 3.2654, + "step": 55850 + }, + { + "epoch": 16.278408759974372, + "grad_norm": 0.3778679072856903, + "learning_rate": 0.0004048986013986014, + "loss": 3.2594, + "step": 55900 + }, + { + "epoch": 16.292969887588093, + "grad_norm": 0.38567569851875305, + "learning_rate": 0.0004047237762237762, + "loss": 3.2516, + "step": 55950 + }, + { + "epoch": 16.307531015201818, + "grad_norm": 0.3942672908306122, + "learning_rate": 0.00040454895104895104, + "loss": 3.2393, + "step": 56000 + }, + { + "epoch": 16.307531015201818, + "eval_accuracy": 0.37220392033675576, + "eval_loss": 3.5524516105651855, + "eval_runtime": 179.7486, + "eval_samples_per_second": 92.607, + "eval_steps_per_second": 5.791, + "step": 56000 + }, + { + "epoch": 16.32209214281554, + "grad_norm": 0.36694231629371643, + "learning_rate": 0.00040437412587412583, + "loss": 3.2515, + "step": 56050 + }, + { + "epoch": 16.336653270429263, + "grad_norm": 0.36499130725860596, + "learning_rate": 0.0004041993006993007, + "loss": 3.2496, + "step": 56100 + }, + { + "epoch": 16.351214398042984, + "grad_norm": 0.3922365605831146, + "learning_rate": 0.00040402447552447554, + "loss": 3.2439, + "step": 56150 + }, + { + "epoch": 16.36577552565671, + "grad_norm": 0.3942036032676697, + "learning_rate": 0.0004038496503496503, + "loss": 3.2654, + "step": 56200 + }, + { + "epoch": 16.38033665327043, + "grad_norm": 0.3666972219944, + "learning_rate": 0.00040367482517482514, + "loss": 3.2619, + "step": 56250 + }, + { + "epoch": 16.39489778088415, + "grad_norm": 0.37733784317970276, + "learning_rate": 0.00040349999999999994, + "loss": 3.2648, + "step": 56300 + }, + { + "epoch": 16.409458908497875, + "grad_norm": 0.3841692805290222, + "learning_rate": 0.0004033251748251748, + "loss": 3.2814, + "step": 56350 + }, + { + "epoch": 16.424020036111596, + "grad_norm": 0.3820192217826843, + "learning_rate": 0.0004031503496503496, + "loss": 3.2705, + "step": 56400 + }, + { + "epoch": 16.43858116372532, + "grad_norm": 0.3828584551811218, + "learning_rate": 0.00040297552447552445, + "loss": 3.2728, + "step": 56450 + }, + { + "epoch": 16.45314229133904, + "grad_norm": 0.4050110876560211, + "learning_rate": 0.00040280069930069925, + "loss": 3.2735, + "step": 56500 + }, + { + "epoch": 16.467703418952762, + "grad_norm": 0.38829973340034485, + "learning_rate": 0.0004026258741258741, + "loss": 3.2735, + "step": 56550 + }, + { + "epoch": 16.482264546566487, + "grad_norm": 0.39007800817489624, + "learning_rate": 0.0004024510489510489, + "loss": 3.2612, + "step": 56600 + }, + { + "epoch": 16.496825674180208, + "grad_norm": 0.38671746850013733, + "learning_rate": 0.00040227622377622376, + "loss": 3.271, + "step": 56650 + }, + { + "epoch": 16.511386801793932, + "grad_norm": 0.43336355686187744, + "learning_rate": 0.00040210139860139856, + "loss": 3.2684, + "step": 56700 + }, + { + "epoch": 16.525947929407653, + "grad_norm": 0.36696234345436096, + "learning_rate": 0.0004019265734265734, + "loss": 3.2739, + "step": 56750 + }, + { + "epoch": 16.540509057021374, + "grad_norm": 0.3764398396015167, + "learning_rate": 0.0004017517482517482, + "loss": 3.2852, + "step": 56800 + }, + { + "epoch": 16.5550701846351, + "grad_norm": 0.3709736764431, + "learning_rate": 0.00040157692307692306, + "loss": 3.2853, + "step": 56850 + }, + { + "epoch": 16.56963131224882, + "grad_norm": 0.37827807664871216, + "learning_rate": 0.0004014020979020979, + "loss": 3.2614, + "step": 56900 + }, + { + "epoch": 16.584192439862544, + "grad_norm": 0.34628504514694214, + "learning_rate": 0.00040122727272727266, + "loss": 3.2794, + "step": 56950 + }, + { + "epoch": 16.598753567476265, + "grad_norm": 0.37481993436813354, + "learning_rate": 0.0004010524475524475, + "loss": 3.2732, + "step": 57000 + }, + { + "epoch": 16.598753567476265, + "eval_accuracy": 0.37256272141447566, + "eval_loss": 3.544576406478882, + "eval_runtime": 179.8109, + "eval_samples_per_second": 92.575, + "eval_steps_per_second": 5.789, + "step": 57000 + }, + { + "epoch": 16.613314695089986, + "grad_norm": 0.3837030231952667, + "learning_rate": 0.0004008776223776223, + "loss": 3.2913, + "step": 57050 + }, + { + "epoch": 16.62787582270371, + "grad_norm": 0.34924599528312683, + "learning_rate": 0.00040070279720279717, + "loss": 3.2843, + "step": 57100 + }, + { + "epoch": 16.642436950317432, + "grad_norm": 0.37274640798568726, + "learning_rate": 0.00040052797202797197, + "loss": 3.2824, + "step": 57150 + }, + { + "epoch": 16.656998077931156, + "grad_norm": 0.3472432792186737, + "learning_rate": 0.0004003531468531468, + "loss": 3.2826, + "step": 57200 + }, + { + "epoch": 16.671559205544877, + "grad_norm": 0.3804296553134918, + "learning_rate": 0.0004001783216783216, + "loss": 3.2813, + "step": 57250 + }, + { + "epoch": 16.6861203331586, + "grad_norm": 0.36797961592674255, + "learning_rate": 0.0004000034965034965, + "loss": 3.291, + "step": 57300 + }, + { + "epoch": 16.700681460772323, + "grad_norm": 0.3804137408733368, + "learning_rate": 0.0003998286713286713, + "loss": 3.2863, + "step": 57350 + }, + { + "epoch": 16.715242588386044, + "grad_norm": 0.3731144070625305, + "learning_rate": 0.00039965384615384613, + "loss": 3.302, + "step": 57400 + }, + { + "epoch": 16.72980371599977, + "grad_norm": 0.3816506266593933, + "learning_rate": 0.00039947902097902093, + "loss": 3.2834, + "step": 57450 + }, + { + "epoch": 16.74436484361349, + "grad_norm": 0.38200855255126953, + "learning_rate": 0.0003993041958041958, + "loss": 3.2961, + "step": 57500 + }, + { + "epoch": 16.75892597122721, + "grad_norm": 0.4178982377052307, + "learning_rate": 0.00039912937062937064, + "loss": 3.2958, + "step": 57550 + }, + { + "epoch": 16.773487098840935, + "grad_norm": 0.37672388553619385, + "learning_rate": 0.00039895454545454544, + "loss": 3.2928, + "step": 57600 + }, + { + "epoch": 16.788048226454656, + "grad_norm": 0.3759998381137848, + "learning_rate": 0.0003987797202797203, + "loss": 3.2941, + "step": 57650 + }, + { + "epoch": 16.80260935406838, + "grad_norm": 0.3826366066932678, + "learning_rate": 0.00039860489510489504, + "loss": 3.2902, + "step": 57700 + }, + { + "epoch": 16.8171704816821, + "grad_norm": 0.3507887125015259, + "learning_rate": 0.0003984300699300699, + "loss": 3.285, + "step": 57750 + }, + { + "epoch": 16.831731609295822, + "grad_norm": 0.3854213058948517, + "learning_rate": 0.0003982552447552447, + "loss": 3.2929, + "step": 57800 + }, + { + "epoch": 16.846292736909547, + "grad_norm": 0.37187182903289795, + "learning_rate": 0.00039808041958041955, + "loss": 3.2943, + "step": 57850 + }, + { + "epoch": 16.860853864523268, + "grad_norm": 0.38784974813461304, + "learning_rate": 0.00039790559440559435, + "loss": 3.2884, + "step": 57900 + }, + { + "epoch": 16.875414992136992, + "grad_norm": 0.3870326280593872, + "learning_rate": 0.0003977307692307692, + "loss": 3.2948, + "step": 57950 + }, + { + "epoch": 16.889976119750713, + "grad_norm": 0.35052213072776794, + "learning_rate": 0.000397555944055944, + "loss": 3.2869, + "step": 58000 + }, + { + "epoch": 16.889976119750713, + "eval_accuracy": 0.3729087081679913, + "eval_loss": 3.5367329120635986, + "eval_runtime": 179.95, + "eval_samples_per_second": 92.503, + "eval_steps_per_second": 5.785, + "step": 58000 + }, + { + "epoch": 16.904537247364434, + "grad_norm": 0.3954494297504425, + "learning_rate": 0.00039738111888111885, + "loss": 3.306, + "step": 58050 + }, + { + "epoch": 16.91909837497816, + "grad_norm": 0.3943544030189514, + "learning_rate": 0.00039720629370629365, + "loss": 3.293, + "step": 58100 + }, + { + "epoch": 16.93365950259188, + "grad_norm": 0.35315266251564026, + "learning_rate": 0.0003970314685314685, + "loss": 3.3036, + "step": 58150 + }, + { + "epoch": 16.948220630205604, + "grad_norm": 0.37690219283103943, + "learning_rate": 0.0003968566433566433, + "loss": 3.2983, + "step": 58200 + }, + { + "epoch": 16.962781757819325, + "grad_norm": 0.35092926025390625, + "learning_rate": 0.00039668181818181816, + "loss": 3.299, + "step": 58250 + }, + { + "epoch": 16.977342885433046, + "grad_norm": 0.3741217851638794, + "learning_rate": 0.000396506993006993, + "loss": 3.2969, + "step": 58300 + }, + { + "epoch": 16.99190401304677, + "grad_norm": 0.3852684199810028, + "learning_rate": 0.0003963321678321678, + "loss": 3.2911, + "step": 58350 + }, + { + "epoch": 17.006406896150036, + "grad_norm": 0.3808322846889496, + "learning_rate": 0.00039615734265734267, + "loss": 3.2516, + "step": 58400 + }, + { + "epoch": 17.02096802376376, + "grad_norm": 0.38001880049705505, + "learning_rate": 0.0003959825174825174, + "loss": 3.1904, + "step": 58450 + }, + { + "epoch": 17.03552915137748, + "grad_norm": 0.426193505525589, + "learning_rate": 0.00039580769230769227, + "loss": 3.1828, + "step": 58500 + }, + { + "epoch": 17.050090278991206, + "grad_norm": 0.4035714268684387, + "learning_rate": 0.00039563286713286707, + "loss": 3.2003, + "step": 58550 + }, + { + "epoch": 17.064651406604927, + "grad_norm": 0.40296247601509094, + "learning_rate": 0.0003954580419580419, + "loss": 3.2132, + "step": 58600 + }, + { + "epoch": 17.07921253421865, + "grad_norm": 0.36768728494644165, + "learning_rate": 0.0003952832167832167, + "loss": 3.2112, + "step": 58650 + }, + { + "epoch": 17.093773661832373, + "grad_norm": 0.38769692182540894, + "learning_rate": 0.0003951083916083916, + "loss": 3.2056, + "step": 58700 + }, + { + "epoch": 17.108334789446094, + "grad_norm": 0.5081406831741333, + "learning_rate": 0.0003949335664335664, + "loss": 3.2071, + "step": 58750 + }, + { + "epoch": 17.122895917059818, + "grad_norm": 0.3892911374568939, + "learning_rate": 0.00039475874125874123, + "loss": 3.2145, + "step": 58800 + }, + { + "epoch": 17.13745704467354, + "grad_norm": 0.3744818866252899, + "learning_rate": 0.00039458391608391603, + "loss": 3.2318, + "step": 58850 + }, + { + "epoch": 17.152018172287264, + "grad_norm": 0.3952346742153168, + "learning_rate": 0.0003944090909090909, + "loss": 3.2336, + "step": 58900 + }, + { + "epoch": 17.166579299900985, + "grad_norm": 0.38604477047920227, + "learning_rate": 0.00039423426573426573, + "loss": 3.2199, + "step": 58950 + }, + { + "epoch": 17.181140427514705, + "grad_norm": 0.3790982663631439, + "learning_rate": 0.00039405944055944053, + "loss": 3.2329, + "step": 59000 + }, + { + "epoch": 17.181140427514705, + "eval_accuracy": 0.37229738260962186, + "eval_loss": 3.550474166870117, + "eval_runtime": 179.7974, + "eval_samples_per_second": 92.582, + "eval_steps_per_second": 5.79, + "step": 59000 + }, + { + "epoch": 17.19570155512843, + "grad_norm": 0.3766101598739624, + "learning_rate": 0.0003938846153846154, + "loss": 3.2224, + "step": 59050 + }, + { + "epoch": 17.21026268274215, + "grad_norm": 0.3996836245059967, + "learning_rate": 0.0003937097902097902, + "loss": 3.2276, + "step": 59100 + }, + { + "epoch": 17.224823810355876, + "grad_norm": 0.3663976490497589, + "learning_rate": 0.00039353496503496504, + "loss": 3.2365, + "step": 59150 + }, + { + "epoch": 17.239384937969596, + "grad_norm": 0.3746318221092224, + "learning_rate": 0.0003933601398601398, + "loss": 3.2349, + "step": 59200 + }, + { + "epoch": 17.253946065583317, + "grad_norm": 0.3788388669490814, + "learning_rate": 0.00039318531468531464, + "loss": 3.2399, + "step": 59250 + }, + { + "epoch": 17.268507193197042, + "grad_norm": 0.38114452362060547, + "learning_rate": 0.00039301048951048944, + "loss": 3.2377, + "step": 59300 + }, + { + "epoch": 17.283068320810763, + "grad_norm": 0.38223931193351746, + "learning_rate": 0.0003928356643356643, + "loss": 3.2375, + "step": 59350 + }, + { + "epoch": 17.297629448424487, + "grad_norm": 0.3718387484550476, + "learning_rate": 0.0003926608391608391, + "loss": 3.2493, + "step": 59400 + }, + { + "epoch": 17.31219057603821, + "grad_norm": 0.4343428313732147, + "learning_rate": 0.00039248601398601395, + "loss": 3.2366, + "step": 59450 + }, + { + "epoch": 17.32675170365193, + "grad_norm": 0.40242546796798706, + "learning_rate": 0.00039231118881118875, + "loss": 3.2394, + "step": 59500 + }, + { + "epoch": 17.341312831265654, + "grad_norm": 0.3776035010814667, + "learning_rate": 0.0003921363636363636, + "loss": 3.2384, + "step": 59550 + }, + { + "epoch": 17.355873958879375, + "grad_norm": 0.3844600021839142, + "learning_rate": 0.00039196153846153846, + "loss": 3.2412, + "step": 59600 + }, + { + "epoch": 17.3704350864931, + "grad_norm": 0.374520480632782, + "learning_rate": 0.00039178671328671326, + "loss": 3.2549, + "step": 59650 + }, + { + "epoch": 17.38499621410682, + "grad_norm": 0.38012269139289856, + "learning_rate": 0.0003916118881118881, + "loss": 3.2481, + "step": 59700 + }, + { + "epoch": 17.39955734172054, + "grad_norm": 0.37167397141456604, + "learning_rate": 0.0003914370629370629, + "loss": 3.2559, + "step": 59750 + }, + { + "epoch": 17.414118469334266, + "grad_norm": 0.42666929960250854, + "learning_rate": 0.00039126223776223776, + "loss": 3.2479, + "step": 59800 + }, + { + "epoch": 17.428679596947987, + "grad_norm": 0.37930217385292053, + "learning_rate": 0.00039108741258741256, + "loss": 3.2507, + "step": 59850 + }, + { + "epoch": 17.44324072456171, + "grad_norm": 0.3739822506904602, + "learning_rate": 0.0003909125874125874, + "loss": 3.2631, + "step": 59900 + }, + { + "epoch": 17.457801852175432, + "grad_norm": 0.36338284611701965, + "learning_rate": 0.00039073776223776216, + "loss": 3.2534, + "step": 59950 + }, + { + "epoch": 17.472362979789153, + "grad_norm": 0.3643838167190552, + "learning_rate": 0.000390562937062937, + "loss": 3.2627, + "step": 60000 + }, + { + "epoch": 17.472362979789153, + "eval_accuracy": 0.372753407963644, + "eval_loss": 3.54595685005188, + "eval_runtime": 179.9118, + "eval_samples_per_second": 92.523, + "eval_steps_per_second": 5.786, + "step": 60000 + }, + { + "epoch": 17.486924107402878, + "grad_norm": 0.36303991079330444, + "learning_rate": 0.0003903881118881118, + "loss": 3.2644, + "step": 60050 + }, + { + "epoch": 17.5014852350166, + "grad_norm": 0.3679315149784088, + "learning_rate": 0.00039021328671328667, + "loss": 3.2594, + "step": 60100 + }, + { + "epoch": 17.516046362630323, + "grad_norm": 0.3789927363395691, + "learning_rate": 0.00039003846153846147, + "loss": 3.2575, + "step": 60150 + }, + { + "epoch": 17.530607490244044, + "grad_norm": 0.380993127822876, + "learning_rate": 0.0003898636363636363, + "loss": 3.2592, + "step": 60200 + }, + { + "epoch": 17.545168617857765, + "grad_norm": 0.36757829785346985, + "learning_rate": 0.0003896888111888111, + "loss": 3.2626, + "step": 60250 + }, + { + "epoch": 17.55972974547149, + "grad_norm": 0.39095836877822876, + "learning_rate": 0.000389513986013986, + "loss": 3.2766, + "step": 60300 + }, + { + "epoch": 17.57429087308521, + "grad_norm": 0.3795633912086487, + "learning_rate": 0.00038933916083916083, + "loss": 3.2637, + "step": 60350 + }, + { + "epoch": 17.588852000698935, + "grad_norm": 0.3707791268825531, + "learning_rate": 0.00038916433566433563, + "loss": 3.276, + "step": 60400 + }, + { + "epoch": 17.603413128312656, + "grad_norm": 0.38798531889915466, + "learning_rate": 0.0003889895104895105, + "loss": 3.2725, + "step": 60450 + }, + { + "epoch": 17.617974255926377, + "grad_norm": 0.3834350109100342, + "learning_rate": 0.0003888146853146853, + "loss": 3.2726, + "step": 60500 + }, + { + "epoch": 17.6325353835401, + "grad_norm": 0.36958327889442444, + "learning_rate": 0.00038863986013986014, + "loss": 3.2846, + "step": 60550 + }, + { + "epoch": 17.647096511153823, + "grad_norm": 0.4254196584224701, + "learning_rate": 0.00038846503496503494, + "loss": 3.265, + "step": 60600 + }, + { + "epoch": 17.661657638767547, + "grad_norm": 0.35333532094955444, + "learning_rate": 0.0003882902097902098, + "loss": 3.2839, + "step": 60650 + }, + { + "epoch": 17.676218766381268, + "grad_norm": 0.4240613579750061, + "learning_rate": 0.00038811538461538454, + "loss": 3.2719, + "step": 60700 + }, + { + "epoch": 17.690779893994993, + "grad_norm": 0.4057902991771698, + "learning_rate": 0.0003879405594405594, + "loss": 3.2586, + "step": 60750 + }, + { + "epoch": 17.705341021608714, + "grad_norm": 0.42154860496520996, + "learning_rate": 0.0003877657342657342, + "loss": 3.2783, + "step": 60800 + }, + { + "epoch": 17.719902149222435, + "grad_norm": 0.37844952940940857, + "learning_rate": 0.00038759090909090905, + "loss": 3.271, + "step": 60850 + }, + { + "epoch": 17.73446327683616, + "grad_norm": 0.38174551725387573, + "learning_rate": 0.00038741608391608384, + "loss": 3.275, + "step": 60900 + }, + { + "epoch": 17.74902440444988, + "grad_norm": 0.36706846952438354, + "learning_rate": 0.0003872412587412587, + "loss": 3.2777, + "step": 60950 + }, + { + "epoch": 17.763585532063605, + "grad_norm": 0.37525689601898193, + "learning_rate": 0.00038706643356643355, + "loss": 3.2649, + "step": 61000 + }, + { + "epoch": 17.763585532063605, + "eval_accuracy": 0.3734166961944749, + "eval_loss": 3.5382072925567627, + "eval_runtime": 179.9566, + "eval_samples_per_second": 92.5, + "eval_steps_per_second": 5.785, + "step": 61000 + }, + { + "epoch": 17.778146659677326, + "grad_norm": 0.40704724192619324, + "learning_rate": 0.00038689160839160835, + "loss": 3.2755, + "step": 61050 + }, + { + "epoch": 17.792707787291047, + "grad_norm": 0.3714872896671295, + "learning_rate": 0.0003867167832167832, + "loss": 3.2766, + "step": 61100 + }, + { + "epoch": 17.80726891490477, + "grad_norm": 0.3932179808616638, + "learning_rate": 0.000386541958041958, + "loss": 3.2844, + "step": 61150 + }, + { + "epoch": 17.821830042518492, + "grad_norm": 0.42021384835243225, + "learning_rate": 0.00038636713286713286, + "loss": 3.2882, + "step": 61200 + }, + { + "epoch": 17.836391170132217, + "grad_norm": 0.39331310987472534, + "learning_rate": 0.00038619230769230766, + "loss": 3.2748, + "step": 61250 + }, + { + "epoch": 17.850952297745938, + "grad_norm": 0.36521244049072266, + "learning_rate": 0.0003860174825174825, + "loss": 3.2788, + "step": 61300 + }, + { + "epoch": 17.86551342535966, + "grad_norm": 0.3976779580116272, + "learning_rate": 0.0003858426573426573, + "loss": 3.2742, + "step": 61350 + }, + { + "epoch": 17.880074552973383, + "grad_norm": 0.37855014204978943, + "learning_rate": 0.00038566783216783217, + "loss": 3.3008, + "step": 61400 + }, + { + "epoch": 17.894635680587104, + "grad_norm": 0.40165817737579346, + "learning_rate": 0.0003854930069930069, + "loss": 3.2812, + "step": 61450 + }, + { + "epoch": 17.90919680820083, + "grad_norm": 0.3996519148349762, + "learning_rate": 0.00038531818181818177, + "loss": 3.2909, + "step": 61500 + }, + { + "epoch": 17.92375793581455, + "grad_norm": 0.37104541063308716, + "learning_rate": 0.00038514335664335657, + "loss": 3.2924, + "step": 61550 + }, + { + "epoch": 17.93831906342827, + "grad_norm": 0.37623950839042664, + "learning_rate": 0.0003849685314685314, + "loss": 3.2858, + "step": 61600 + }, + { + "epoch": 17.952880191041995, + "grad_norm": 0.393968790769577, + "learning_rate": 0.0003847937062937062, + "loss": 3.2893, + "step": 61650 + }, + { + "epoch": 17.967441318655716, + "grad_norm": 0.42417454719543457, + "learning_rate": 0.0003846188811188811, + "loss": 3.2834, + "step": 61700 + }, + { + "epoch": 17.98200244626944, + "grad_norm": 0.36883294582366943, + "learning_rate": 0.00038444405594405593, + "loss": 3.2832, + "step": 61750 + }, + { + "epoch": 17.99656357388316, + "grad_norm": 0.3445383608341217, + "learning_rate": 0.00038426923076923073, + "loss": 3.2962, + "step": 61800 + }, + { + "epoch": 18.01106645698643, + "grad_norm": 0.38899731636047363, + "learning_rate": 0.0003840944055944056, + "loss": 3.201, + "step": 61850 + }, + { + "epoch": 18.02562758460015, + "grad_norm": 0.389384001493454, + "learning_rate": 0.0003839195804195804, + "loss": 3.1656, + "step": 61900 + }, + { + "epoch": 18.040188712213872, + "grad_norm": 0.3927420973777771, + "learning_rate": 0.00038374475524475523, + "loss": 3.1787, + "step": 61950 + }, + { + "epoch": 18.054749839827597, + "grad_norm": 0.37237685918807983, + "learning_rate": 0.00038356993006993003, + "loss": 3.1859, + "step": 62000 + }, + { + "epoch": 18.054749839827597, + "eval_accuracy": 0.3727140244901721, + "eval_loss": 3.55122709274292, + "eval_runtime": 179.7522, + "eval_samples_per_second": 92.605, + "eval_steps_per_second": 5.791, + "step": 62000 + }, + { + "epoch": 18.069310967441318, + "grad_norm": 0.3972647488117218, + "learning_rate": 0.0003833951048951049, + "loss": 3.1965, + "step": 62050 + }, + { + "epoch": 18.083872095055042, + "grad_norm": 0.413581520318985, + "learning_rate": 0.0003832202797202797, + "loss": 3.1931, + "step": 62100 + }, + { + "epoch": 18.098433222668763, + "grad_norm": 0.3956906199455261, + "learning_rate": 0.00038304545454545454, + "loss": 3.2077, + "step": 62150 + }, + { + "epoch": 18.112994350282484, + "grad_norm": 0.4045657217502594, + "learning_rate": 0.0003828706293706293, + "loss": 3.1915, + "step": 62200 + }, + { + "epoch": 18.12755547789621, + "grad_norm": 0.37881526350975037, + "learning_rate": 0.00038269580419580414, + "loss": 3.2048, + "step": 62250 + }, + { + "epoch": 18.14211660550993, + "grad_norm": 0.4110286235809326, + "learning_rate": 0.00038252097902097894, + "loss": 3.206, + "step": 62300 + }, + { + "epoch": 18.156677733123654, + "grad_norm": 0.3890237510204315, + "learning_rate": 0.0003823461538461538, + "loss": 3.2024, + "step": 62350 + }, + { + "epoch": 18.171238860737375, + "grad_norm": 0.3655931055545807, + "learning_rate": 0.00038217132867132865, + "loss": 3.2136, + "step": 62400 + }, + { + "epoch": 18.185799988351096, + "grad_norm": 0.41427183151245117, + "learning_rate": 0.00038199650349650345, + "loss": 3.2244, + "step": 62450 + }, + { + "epoch": 18.20036111596482, + "grad_norm": 0.4075052738189697, + "learning_rate": 0.0003818216783216783, + "loss": 3.2025, + "step": 62500 + }, + { + "epoch": 18.214922243578542, + "grad_norm": 0.376781702041626, + "learning_rate": 0.0003816468531468531, + "loss": 3.2109, + "step": 62550 + }, + { + "epoch": 18.229483371192266, + "grad_norm": 0.3829306662082672, + "learning_rate": 0.00038147202797202796, + "loss": 3.236, + "step": 62600 + }, + { + "epoch": 18.244044498805987, + "grad_norm": 0.4079030156135559, + "learning_rate": 0.00038129720279720276, + "loss": 3.2274, + "step": 62650 + }, + { + "epoch": 18.25860562641971, + "grad_norm": 0.4116332530975342, + "learning_rate": 0.0003811223776223776, + "loss": 3.2281, + "step": 62700 + }, + { + "epoch": 18.273166754033433, + "grad_norm": 0.36425337195396423, + "learning_rate": 0.0003809475524475524, + "loss": 3.2316, + "step": 62750 + }, + { + "epoch": 18.287727881647154, + "grad_norm": 0.3588009178638458, + "learning_rate": 0.00038077272727272726, + "loss": 3.237, + "step": 62800 + }, + { + "epoch": 18.30228900926088, + "grad_norm": 0.38670557737350464, + "learning_rate": 0.00038059790209790206, + "loss": 3.2351, + "step": 62850 + }, + { + "epoch": 18.3168501368746, + "grad_norm": 0.3748593032360077, + "learning_rate": 0.0003804230769230769, + "loss": 3.2318, + "step": 62900 + }, + { + "epoch": 18.33141126448832, + "grad_norm": 0.3898875117301941, + "learning_rate": 0.00038024825174825166, + "loss": 3.2381, + "step": 62950 + }, + { + "epoch": 18.345972392102045, + "grad_norm": 0.40329083800315857, + "learning_rate": 0.0003800734265734265, + "loss": 3.2295, + "step": 63000 + }, + { + "epoch": 18.345972392102045, + "eval_accuracy": 0.3723687431123007, + "eval_loss": 3.5525670051574707, + "eval_runtime": 179.8597, + "eval_samples_per_second": 92.55, + "eval_steps_per_second": 5.788, + "step": 63000 + }, + { + "epoch": 18.360533519715766, + "grad_norm": 0.37379077076911926, + "learning_rate": 0.0003798986013986013, + "loss": 3.2425, + "step": 63050 + }, + { + "epoch": 18.37509464732949, + "grad_norm": 0.38135528564453125, + "learning_rate": 0.00037972377622377617, + "loss": 3.2355, + "step": 63100 + }, + { + "epoch": 18.38965577494321, + "grad_norm": 0.3832929730415344, + "learning_rate": 0.000379548951048951, + "loss": 3.2411, + "step": 63150 + }, + { + "epoch": 18.404216902556932, + "grad_norm": 0.38185015320777893, + "learning_rate": 0.0003793741258741258, + "loss": 3.2397, + "step": 63200 + }, + { + "epoch": 18.418778030170657, + "grad_norm": 0.40947502851486206, + "learning_rate": 0.0003791993006993007, + "loss": 3.2413, + "step": 63250 + }, + { + "epoch": 18.433339157784378, + "grad_norm": 0.4148065745830536, + "learning_rate": 0.0003790244755244755, + "loss": 3.2406, + "step": 63300 + }, + { + "epoch": 18.447900285398102, + "grad_norm": 0.37160590291023254, + "learning_rate": 0.00037884965034965033, + "loss": 3.2575, + "step": 63350 + }, + { + "epoch": 18.462461413011823, + "grad_norm": 0.3780807852745056, + "learning_rate": 0.00037867482517482513, + "loss": 3.2459, + "step": 63400 + }, + { + "epoch": 18.477022540625548, + "grad_norm": 0.37864765524864197, + "learning_rate": 0.0003785, + "loss": 3.235, + "step": 63450 + }, + { + "epoch": 18.49158366823927, + "grad_norm": 0.393699049949646, + "learning_rate": 0.0003783251748251748, + "loss": 3.2458, + "step": 63500 + }, + { + "epoch": 18.50614479585299, + "grad_norm": 0.40041160583496094, + "learning_rate": 0.00037815034965034964, + "loss": 3.2509, + "step": 63550 + }, + { + "epoch": 18.520705923466714, + "grad_norm": 0.4013485014438629, + "learning_rate": 0.00037797552447552444, + "loss": 3.2516, + "step": 63600 + }, + { + "epoch": 18.535267051080435, + "grad_norm": 0.3757307231426239, + "learning_rate": 0.0003778006993006993, + "loss": 3.2674, + "step": 63650 + }, + { + "epoch": 18.54982817869416, + "grad_norm": 0.392844557762146, + "learning_rate": 0.00037762587412587404, + "loss": 3.2657, + "step": 63700 + }, + { + "epoch": 18.56438930630788, + "grad_norm": 0.425006628036499, + "learning_rate": 0.0003774510489510489, + "loss": 3.2543, + "step": 63750 + }, + { + "epoch": 18.5789504339216, + "grad_norm": 0.3740367591381073, + "learning_rate": 0.0003772762237762238, + "loss": 3.251, + "step": 63800 + }, + { + "epoch": 18.593511561535326, + "grad_norm": 0.40788206458091736, + "learning_rate": 0.00037710139860139854, + "loss": 3.2649, + "step": 63850 + }, + { + "epoch": 18.608072689149047, + "grad_norm": 0.3888489007949829, + "learning_rate": 0.0003769265734265734, + "loss": 3.2551, + "step": 63900 + }, + { + "epoch": 18.62263381676277, + "grad_norm": 0.3995554447174072, + "learning_rate": 0.0003767517482517482, + "loss": 3.2607, + "step": 63950 + }, + { + "epoch": 18.637194944376493, + "grad_norm": 0.3996911942958832, + "learning_rate": 0.00037657692307692305, + "loss": 3.253, + "step": 64000 + }, + { + "epoch": 18.637194944376493, + "eval_accuracy": 0.3733275837380818, + "eval_loss": 3.5378425121307373, + "eval_runtime": 179.5439, + "eval_samples_per_second": 92.713, + "eval_steps_per_second": 5.798, + "step": 64000 + }, + { + "epoch": 18.651756071990214, + "grad_norm": 0.39029937982559204, + "learning_rate": 0.00037640209790209785, + "loss": 3.2797, + "step": 64050 + }, + { + "epoch": 18.666317199603938, + "grad_norm": 0.37387433648109436, + "learning_rate": 0.0003762272727272727, + "loss": 3.2686, + "step": 64100 + }, + { + "epoch": 18.68087832721766, + "grad_norm": 0.37229660153388977, + "learning_rate": 0.0003760524475524475, + "loss": 3.2552, + "step": 64150 + }, + { + "epoch": 18.695439454831384, + "grad_norm": 0.38790515065193176, + "learning_rate": 0.00037587762237762236, + "loss": 3.2541, + "step": 64200 + }, + { + "epoch": 18.710000582445105, + "grad_norm": 0.4044416844844818, + "learning_rate": 0.00037570279720279716, + "loss": 3.2744, + "step": 64250 + }, + { + "epoch": 18.724561710058826, + "grad_norm": 0.39733484387397766, + "learning_rate": 0.000375527972027972, + "loss": 3.2686, + "step": 64300 + }, + { + "epoch": 18.73912283767255, + "grad_norm": 0.38480284810066223, + "learning_rate": 0.0003753531468531468, + "loss": 3.2625, + "step": 64350 + }, + { + "epoch": 18.75368396528627, + "grad_norm": 0.4033631682395935, + "learning_rate": 0.00037517832167832167, + "loss": 3.2693, + "step": 64400 + }, + { + "epoch": 18.768245092899996, + "grad_norm": 0.41612687706947327, + "learning_rate": 0.0003750034965034965, + "loss": 3.2638, + "step": 64450 + }, + { + "epoch": 18.782806220513717, + "grad_norm": 0.40416860580444336, + "learning_rate": 0.00037482867132867127, + "loss": 3.2583, + "step": 64500 + }, + { + "epoch": 18.797367348127437, + "grad_norm": 0.3601404130458832, + "learning_rate": 0.0003746538461538462, + "loss": 3.2644, + "step": 64550 + }, + { + "epoch": 18.811928475741162, + "grad_norm": 0.3742053508758545, + "learning_rate": 0.0003744790209790209, + "loss": 3.2667, + "step": 64600 + }, + { + "epoch": 18.826489603354883, + "grad_norm": 0.37909430265426636, + "learning_rate": 0.0003743041958041958, + "loss": 3.2681, + "step": 64650 + }, + { + "epoch": 18.841050730968607, + "grad_norm": 0.40138882398605347, + "learning_rate": 0.0003741293706293706, + "loss": 3.2699, + "step": 64700 + }, + { + "epoch": 18.85561185858233, + "grad_norm": 0.42691561579704285, + "learning_rate": 0.0003739545454545454, + "loss": 3.2697, + "step": 64750 + }, + { + "epoch": 18.87017298619605, + "grad_norm": 0.378101110458374, + "learning_rate": 0.0003737797202797202, + "loss": 3.2754, + "step": 64800 + }, + { + "epoch": 18.884734113809774, + "grad_norm": 0.40638959407806396, + "learning_rate": 0.0003736048951048951, + "loss": 3.2699, + "step": 64850 + }, + { + "epoch": 18.899295241423495, + "grad_norm": 0.371693879365921, + "learning_rate": 0.0003734300699300699, + "loss": 3.2768, + "step": 64900 + }, + { + "epoch": 18.91385636903722, + "grad_norm": 0.38783252239227295, + "learning_rate": 0.00037325524475524473, + "loss": 3.2732, + "step": 64950 + }, + { + "epoch": 18.92841749665094, + "grad_norm": 0.3685288727283478, + "learning_rate": 0.00037308041958041953, + "loss": 3.2776, + "step": 65000 + }, + { + "epoch": 18.92841749665094, + "eval_accuracy": 0.3736479418431889, + "eval_loss": 3.5299794673919678, + "eval_runtime": 179.7636, + "eval_samples_per_second": 92.599, + "eval_steps_per_second": 5.791, + "step": 65000 + }, + { + "epoch": 18.94297862426466, + "grad_norm": 0.39574894309043884, + "learning_rate": 0.0003729055944055944, + "loss": 3.2666, + "step": 65050 + }, + { + "epoch": 18.957539751878386, + "grad_norm": 0.3749750852584839, + "learning_rate": 0.0003727307692307692, + "loss": 3.272, + "step": 65100 + }, + { + "epoch": 18.972100879492107, + "grad_norm": 0.41102007031440735, + "learning_rate": 0.00037255594405594404, + "loss": 3.2792, + "step": 65150 + }, + { + "epoch": 18.98666200710583, + "grad_norm": 0.39592432975769043, + "learning_rate": 0.0003723811188811189, + "loss": 3.2632, + "step": 65200 + }, + { + "epoch": 19.001164890209097, + "grad_norm": 0.4055386781692505, + "learning_rate": 0.00037220629370629364, + "loss": 3.272, + "step": 65250 + }, + { + "epoch": 19.01572601782282, + "grad_norm": 0.4150431752204895, + "learning_rate": 0.00037203146853146855, + "loss": 3.1721, + "step": 65300 + }, + { + "epoch": 19.030287145436542, + "grad_norm": 0.36770737171173096, + "learning_rate": 0.0003718566433566433, + "loss": 3.1721, + "step": 65350 + }, + { + "epoch": 19.044848273050263, + "grad_norm": 0.38085243105888367, + "learning_rate": 0.00037168181818181815, + "loss": 3.1797, + "step": 65400 + }, + { + "epoch": 19.059409400663988, + "grad_norm": 0.4273834228515625, + "learning_rate": 0.00037150699300699295, + "loss": 3.1857, + "step": 65450 + }, + { + "epoch": 19.07397052827771, + "grad_norm": 0.395258367061615, + "learning_rate": 0.0003713321678321678, + "loss": 3.1715, + "step": 65500 + }, + { + "epoch": 19.088531655891433, + "grad_norm": 0.44449496269226074, + "learning_rate": 0.0003711573426573426, + "loss": 3.1888, + "step": 65550 + }, + { + "epoch": 19.103092783505154, + "grad_norm": 0.4193888306617737, + "learning_rate": 0.00037098251748251746, + "loss": 3.2022, + "step": 65600 + }, + { + "epoch": 19.11765391111888, + "grad_norm": 0.42027559876441956, + "learning_rate": 0.00037080769230769226, + "loss": 3.1978, + "step": 65650 + }, + { + "epoch": 19.1322150387326, + "grad_norm": 0.37804079055786133, + "learning_rate": 0.0003706328671328671, + "loss": 3.1873, + "step": 65700 + }, + { + "epoch": 19.14677616634632, + "grad_norm": 0.3857940137386322, + "learning_rate": 0.0003704580419580419, + "loss": 3.1962, + "step": 65750 + }, + { + "epoch": 19.161337293960045, + "grad_norm": 0.3942515254020691, + "learning_rate": 0.00037028321678321676, + "loss": 3.2013, + "step": 65800 + }, + { + "epoch": 19.175898421573766, + "grad_norm": 0.41163113713264465, + "learning_rate": 0.0003701083916083916, + "loss": 3.2061, + "step": 65850 + }, + { + "epoch": 19.19045954918749, + "grad_norm": 0.3958335220813751, + "learning_rate": 0.0003699335664335664, + "loss": 3.1975, + "step": 65900 + }, + { + "epoch": 19.20502067680121, + "grad_norm": 0.4099127948284149, + "learning_rate": 0.00036975874125874127, + "loss": 3.1951, + "step": 65950 + }, + { + "epoch": 19.219581804414933, + "grad_norm": 0.3889888525009155, + "learning_rate": 0.00036958391608391607, + "loss": 3.2092, + "step": 66000 + }, + { + "epoch": 19.219581804414933, + "eval_accuracy": 0.37289683434464604, + "eval_loss": 3.550581455230713, + "eval_runtime": 179.8286, + "eval_samples_per_second": 92.566, + "eval_steps_per_second": 5.789, + "step": 66000 + }, + { + "epoch": 19.234142932028657, + "grad_norm": 0.43066680431365967, + "learning_rate": 0.0003694090909090909, + "loss": 3.2092, + "step": 66050 + }, + { + "epoch": 19.248704059642378, + "grad_norm": 0.39301857352256775, + "learning_rate": 0.00036923426573426567, + "loss": 3.2095, + "step": 66100 + }, + { + "epoch": 19.263265187256103, + "grad_norm": 0.39563480019569397, + "learning_rate": 0.0003690594405594405, + "loss": 3.2217, + "step": 66150 + }, + { + "epoch": 19.277826314869824, + "grad_norm": 0.4087700843811035, + "learning_rate": 0.0003688846153846153, + "loss": 3.2138, + "step": 66200 + }, + { + "epoch": 19.292387442483545, + "grad_norm": 0.39682114124298096, + "learning_rate": 0.0003687097902097902, + "loss": 3.2162, + "step": 66250 + }, + { + "epoch": 19.30694857009727, + "grad_norm": 0.4453403949737549, + "learning_rate": 0.000368534965034965, + "loss": 3.2261, + "step": 66300 + }, + { + "epoch": 19.32150969771099, + "grad_norm": 0.3561858534812927, + "learning_rate": 0.00036836013986013983, + "loss": 3.2254, + "step": 66350 + }, + { + "epoch": 19.336070825324715, + "grad_norm": 0.3951205313205719, + "learning_rate": 0.00036818531468531463, + "loss": 3.225, + "step": 66400 + }, + { + "epoch": 19.350631952938436, + "grad_norm": 0.4216971695423126, + "learning_rate": 0.0003680104895104895, + "loss": 3.2389, + "step": 66450 + }, + { + "epoch": 19.365193080552157, + "grad_norm": 0.37545275688171387, + "learning_rate": 0.0003678356643356643, + "loss": 3.218, + "step": 66500 + }, + { + "epoch": 19.37975420816588, + "grad_norm": 0.41713809967041016, + "learning_rate": 0.00036766083916083914, + "loss": 3.2371, + "step": 66550 + }, + { + "epoch": 19.394315335779602, + "grad_norm": 0.41936424374580383, + "learning_rate": 0.000367486013986014, + "loss": 3.2311, + "step": 66600 + }, + { + "epoch": 19.408876463393327, + "grad_norm": 0.4337376058101654, + "learning_rate": 0.0003673111888111888, + "loss": 3.2338, + "step": 66650 + }, + { + "epoch": 19.423437591007048, + "grad_norm": 0.3993789553642273, + "learning_rate": 0.00036713636363636365, + "loss": 3.2278, + "step": 66700 + }, + { + "epoch": 19.43799871862077, + "grad_norm": 0.40916818380355835, + "learning_rate": 0.00036696153846153844, + "loss": 3.2349, + "step": 66750 + }, + { + "epoch": 19.452559846234493, + "grad_norm": 0.41507256031036377, + "learning_rate": 0.0003667867132867133, + "loss": 3.2452, + "step": 66800 + }, + { + "epoch": 19.467120973848214, + "grad_norm": 0.407402366399765, + "learning_rate": 0.00036661188811188804, + "loss": 3.2273, + "step": 66850 + }, + { + "epoch": 19.48168210146194, + "grad_norm": 0.393828809261322, + "learning_rate": 0.0003664370629370629, + "loss": 3.2292, + "step": 66900 + }, + { + "epoch": 19.49624322907566, + "grad_norm": 0.3818491995334625, + "learning_rate": 0.0003662622377622377, + "loss": 3.2383, + "step": 66950 + }, + { + "epoch": 19.51080435668938, + "grad_norm": 0.3816803991794586, + "learning_rate": 0.00036608741258741255, + "loss": 3.2554, + "step": 67000 + }, + { + "epoch": 19.51080435668938, + "eval_accuracy": 0.3730255653997258, + "eval_loss": 3.5415730476379395, + "eval_runtime": 179.7245, + "eval_samples_per_second": 92.62, + "eval_steps_per_second": 5.792, + "step": 67000 + }, + { + "epoch": 19.525365484303105, + "grad_norm": 0.41924670338630676, + "learning_rate": 0.00036591258741258735, + "loss": 3.2414, + "step": 67050 + }, + { + "epoch": 19.539926611916826, + "grad_norm": 0.38373491168022156, + "learning_rate": 0.0003657377622377622, + "loss": 3.2434, + "step": 67100 + }, + { + "epoch": 19.55448773953055, + "grad_norm": 0.39270487427711487, + "learning_rate": 0.000365562937062937, + "loss": 3.2356, + "step": 67150 + }, + { + "epoch": 19.56904886714427, + "grad_norm": 0.39708688855171204, + "learning_rate": 0.00036538811188811186, + "loss": 3.2399, + "step": 67200 + }, + { + "epoch": 19.583609994757992, + "grad_norm": 0.4171540439128876, + "learning_rate": 0.0003652132867132867, + "loss": 3.2436, + "step": 67250 + }, + { + "epoch": 19.598171122371717, + "grad_norm": 0.39421361684799194, + "learning_rate": 0.0003650384615384615, + "loss": 3.2415, + "step": 67300 + }, + { + "epoch": 19.612732249985438, + "grad_norm": 0.39570003747940063, + "learning_rate": 0.00036486363636363637, + "loss": 3.2317, + "step": 67350 + }, + { + "epoch": 19.627293377599162, + "grad_norm": 0.40717577934265137, + "learning_rate": 0.00036468881118881117, + "loss": 3.242, + "step": 67400 + }, + { + "epoch": 19.641854505212883, + "grad_norm": 0.41312843561172485, + "learning_rate": 0.000364513986013986, + "loss": 3.2523, + "step": 67450 + }, + { + "epoch": 19.656415632826604, + "grad_norm": 0.430477499961853, + "learning_rate": 0.0003643391608391608, + "loss": 3.2413, + "step": 67500 + }, + { + "epoch": 19.67097676044033, + "grad_norm": 0.3858359456062317, + "learning_rate": 0.0003641643356643357, + "loss": 3.2544, + "step": 67550 + }, + { + "epoch": 19.68553788805405, + "grad_norm": 0.38897889852523804, + "learning_rate": 0.0003639895104895104, + "loss": 3.2468, + "step": 67600 + }, + { + "epoch": 19.700099015667774, + "grad_norm": 0.40000104904174805, + "learning_rate": 0.0003638146853146853, + "loss": 3.26, + "step": 67650 + }, + { + "epoch": 19.714660143281495, + "grad_norm": 0.3826768398284912, + "learning_rate": 0.00036363986013986007, + "loss": 3.2475, + "step": 67700 + }, + { + "epoch": 19.729221270895216, + "grad_norm": 0.41855689883232117, + "learning_rate": 0.0003634650349650349, + "loss": 3.2509, + "step": 67750 + }, + { + "epoch": 19.74378239850894, + "grad_norm": 0.39253532886505127, + "learning_rate": 0.0003632902097902097, + "loss": 3.2466, + "step": 67800 + }, + { + "epoch": 19.758343526122662, + "grad_norm": 0.3928729295730591, + "learning_rate": 0.0003631153846153846, + "loss": 3.2505, + "step": 67850 + }, + { + "epoch": 19.772904653736386, + "grad_norm": 0.3860240876674652, + "learning_rate": 0.00036294055944055943, + "loss": 3.2621, + "step": 67900 + }, + { + "epoch": 19.787465781350107, + "grad_norm": 0.41534480452537537, + "learning_rate": 0.00036276573426573423, + "loss": 3.2547, + "step": 67950 + }, + { + "epoch": 19.802026908963832, + "grad_norm": 0.3703750967979431, + "learning_rate": 0.0003625909090909091, + "loss": 3.2672, + "step": 68000 + }, + { + "epoch": 19.802026908963832, + "eval_accuracy": 0.3736522916596619, + "eval_loss": 3.534688949584961, + "eval_runtime": 179.6625, + "eval_samples_per_second": 92.651, + "eval_steps_per_second": 5.794, + "step": 68000 + }, + { + "epoch": 19.816588036577553, + "grad_norm": 0.43742990493774414, + "learning_rate": 0.0003624160839160839, + "loss": 3.2588, + "step": 68050 + }, + { + "epoch": 19.831149164191274, + "grad_norm": 0.38435590267181396, + "learning_rate": 0.00036224125874125874, + "loss": 3.2622, + "step": 68100 + }, + { + "epoch": 19.845710291805, + "grad_norm": 0.40424367785453796, + "learning_rate": 0.00036206643356643354, + "loss": 3.2583, + "step": 68150 + }, + { + "epoch": 19.86027141941872, + "grad_norm": 0.40375852584838867, + "learning_rate": 0.0003618916083916084, + "loss": 3.259, + "step": 68200 + }, + { + "epoch": 19.874832547032444, + "grad_norm": 0.4463939666748047, + "learning_rate": 0.0003617167832167832, + "loss": 3.2576, + "step": 68250 + }, + { + "epoch": 19.889393674646165, + "grad_norm": 0.4056410491466522, + "learning_rate": 0.00036154195804195805, + "loss": 3.2519, + "step": 68300 + }, + { + "epoch": 19.903954802259886, + "grad_norm": 0.37983453273773193, + "learning_rate": 0.0003613671328671328, + "loss": 3.2664, + "step": 68350 + }, + { + "epoch": 19.91851592987361, + "grad_norm": 0.38492605090141296, + "learning_rate": 0.00036119230769230765, + "loss": 3.256, + "step": 68400 + }, + { + "epoch": 19.93307705748733, + "grad_norm": 0.3811017870903015, + "learning_rate": 0.00036101748251748245, + "loss": 3.263, + "step": 68450 + }, + { + "epoch": 19.947638185101056, + "grad_norm": 0.38036519289016724, + "learning_rate": 0.0003608426573426573, + "loss": 3.267, + "step": 68500 + }, + { + "epoch": 19.962199312714777, + "grad_norm": 0.3989908695220947, + "learning_rate": 0.0003606678321678321, + "loss": 3.2585, + "step": 68550 + }, + { + "epoch": 19.976760440328498, + "grad_norm": 0.40518391132354736, + "learning_rate": 0.00036049300699300696, + "loss": 3.2629, + "step": 68600 + }, + { + "epoch": 19.991321567942222, + "grad_norm": 0.4135623276233673, + "learning_rate": 0.0003603181818181818, + "loss": 3.268, + "step": 68650 + }, + { + "epoch": 20.005824451045488, + "grad_norm": 0.41751164197921753, + "learning_rate": 0.0003601433566433566, + "loss": 3.2203, + "step": 68700 + }, + { + "epoch": 20.020385578659212, + "grad_norm": 0.3791915774345398, + "learning_rate": 0.00035996853146853146, + "loss": 3.1551, + "step": 68750 + }, + { + "epoch": 20.034946706272933, + "grad_norm": 0.4008273184299469, + "learning_rate": 0.00035979370629370626, + "loss": 3.1711, + "step": 68800 + }, + { + "epoch": 20.049507833886658, + "grad_norm": 0.3967386484146118, + "learning_rate": 0.0003596188811188811, + "loss": 3.1668, + "step": 68850 + }, + { + "epoch": 20.06406896150038, + "grad_norm": 0.40993747115135193, + "learning_rate": 0.0003594440559440559, + "loss": 3.1773, + "step": 68900 + }, + { + "epoch": 20.0786300891141, + "grad_norm": 0.42113885283470154, + "learning_rate": 0.00035926923076923077, + "loss": 3.1719, + "step": 68950 + }, + { + "epoch": 20.093191216727824, + "grad_norm": 0.41880732774734497, + "learning_rate": 0.00035909440559440557, + "loss": 3.178, + "step": 69000 + }, + { + "epoch": 20.093191216727824, + "eval_accuracy": 0.3726620618177107, + "eval_loss": 3.5489065647125244, + "eval_runtime": 179.7764, + "eval_samples_per_second": 92.593, + "eval_steps_per_second": 5.791, + "step": 69000 + }, + { + "epoch": 20.107752344341545, + "grad_norm": 0.44065818190574646, + "learning_rate": 0.0003589195804195804, + "loss": 3.1677, + "step": 69050 + }, + { + "epoch": 20.12231347195527, + "grad_norm": 0.4093584418296814, + "learning_rate": 0.00035874475524475517, + "loss": 3.1905, + "step": 69100 + }, + { + "epoch": 20.13687459956899, + "grad_norm": 0.4320305585861206, + "learning_rate": 0.00035856993006993, + "loss": 3.185, + "step": 69150 + }, + { + "epoch": 20.15143572718271, + "grad_norm": 0.4164067208766937, + "learning_rate": 0.0003583951048951048, + "loss": 3.1896, + "step": 69200 + }, + { + "epoch": 20.165996854796436, + "grad_norm": 0.3870382606983185, + "learning_rate": 0.0003582202797202797, + "loss": 3.1809, + "step": 69250 + }, + { + "epoch": 20.180557982410157, + "grad_norm": 0.3869068920612335, + "learning_rate": 0.00035804545454545453, + "loss": 3.1896, + "step": 69300 + }, + { + "epoch": 20.19511911002388, + "grad_norm": 0.3980240225791931, + "learning_rate": 0.00035787062937062933, + "loss": 3.2019, + "step": 69350 + }, + { + "epoch": 20.209680237637603, + "grad_norm": 0.3959798812866211, + "learning_rate": 0.0003576958041958042, + "loss": 3.1961, + "step": 69400 + }, + { + "epoch": 20.224241365251324, + "grad_norm": 0.40293267369270325, + "learning_rate": 0.000357520979020979, + "loss": 3.1988, + "step": 69450 + }, + { + "epoch": 20.238802492865048, + "grad_norm": 0.4127791225910187, + "learning_rate": 0.00035734615384615384, + "loss": 3.1983, + "step": 69500 + }, + { + "epoch": 20.25336362047877, + "grad_norm": 0.4141997992992401, + "learning_rate": 0.00035717132867132864, + "loss": 3.2069, + "step": 69550 + }, + { + "epoch": 20.267924748092494, + "grad_norm": 0.39670079946517944, + "learning_rate": 0.0003569965034965035, + "loss": 3.1993, + "step": 69600 + }, + { + "epoch": 20.282485875706215, + "grad_norm": 0.41046348214149475, + "learning_rate": 0.0003568216783216783, + "loss": 3.2115, + "step": 69650 + }, + { + "epoch": 20.297047003319935, + "grad_norm": 0.3998822271823883, + "learning_rate": 0.00035664685314685314, + "loss": 3.2161, + "step": 69700 + }, + { + "epoch": 20.31160813093366, + "grad_norm": 0.3757505714893341, + "learning_rate": 0.00035647202797202794, + "loss": 3.224, + "step": 69750 + }, + { + "epoch": 20.32616925854738, + "grad_norm": 0.42314520478248596, + "learning_rate": 0.0003562972027972028, + "loss": 3.2191, + "step": 69800 + }, + { + "epoch": 20.340730386161106, + "grad_norm": 0.372994065284729, + "learning_rate": 0.00035612237762237754, + "loss": 3.2216, + "step": 69850 + }, + { + "epoch": 20.355291513774826, + "grad_norm": 0.39920496940612793, + "learning_rate": 0.0003559475524475524, + "loss": 3.211, + "step": 69900 + }, + { + "epoch": 20.369852641388547, + "grad_norm": 0.40215423703193665, + "learning_rate": 0.0003557727272727272, + "loss": 3.205, + "step": 69950 + }, + { + "epoch": 20.384413769002272, + "grad_norm": 0.40181687474250793, + "learning_rate": 0.00035559790209790205, + "loss": 3.22, + "step": 70000 + }, + { + "epoch": 20.384413769002272, + "eval_accuracy": 0.37305248723681556, + "eval_loss": 3.5457870960235596, + "eval_runtime": 179.9649, + "eval_samples_per_second": 92.496, + "eval_steps_per_second": 5.784, + "step": 70000 + }, + { + "epoch": 20.398974896615993, + "grad_norm": 0.4172757863998413, + "learning_rate": 0.0003554230769230769, + "loss": 3.221, + "step": 70050 + }, + { + "epoch": 20.413536024229717, + "grad_norm": 0.3744981586933136, + "learning_rate": 0.0003552482517482517, + "loss": 3.2222, + "step": 70100 + }, + { + "epoch": 20.42809715184344, + "grad_norm": 0.45279887318611145, + "learning_rate": 0.00035507342657342656, + "loss": 3.2334, + "step": 70150 + }, + { + "epoch": 20.442658279457163, + "grad_norm": 0.41247794032096863, + "learning_rate": 0.00035489860139860136, + "loss": 3.2169, + "step": 70200 + }, + { + "epoch": 20.457219407070884, + "grad_norm": 0.381062388420105, + "learning_rate": 0.0003547237762237762, + "loss": 3.2149, + "step": 70250 + }, + { + "epoch": 20.471780534684605, + "grad_norm": 0.4296295642852783, + "learning_rate": 0.000354548951048951, + "loss": 3.23, + "step": 70300 + }, + { + "epoch": 20.48634166229833, + "grad_norm": 0.406954824924469, + "learning_rate": 0.00035437412587412587, + "loss": 3.2253, + "step": 70350 + }, + { + "epoch": 20.50090278991205, + "grad_norm": 0.4118635058403015, + "learning_rate": 0.00035419930069930067, + "loss": 3.2125, + "step": 70400 + }, + { + "epoch": 20.51546391752577, + "grad_norm": 0.44677120447158813, + "learning_rate": 0.0003540244755244755, + "loss": 3.2222, + "step": 70450 + }, + { + "epoch": 20.530025045139496, + "grad_norm": 0.38376492261886597, + "learning_rate": 0.0003538496503496503, + "loss": 3.2291, + "step": 70500 + }, + { + "epoch": 20.544586172753217, + "grad_norm": 0.3942374885082245, + "learning_rate": 0.0003536748251748252, + "loss": 3.2361, + "step": 70550 + }, + { + "epoch": 20.55914730036694, + "grad_norm": 0.39628666639328003, + "learning_rate": 0.0003534999999999999, + "loss": 3.2164, + "step": 70600 + }, + { + "epoch": 20.573708427980662, + "grad_norm": 0.4207809567451477, + "learning_rate": 0.00035332517482517477, + "loss": 3.2193, + "step": 70650 + }, + { + "epoch": 20.588269555594387, + "grad_norm": 0.40318813920021057, + "learning_rate": 0.0003531503496503496, + "loss": 3.2358, + "step": 70700 + }, + { + "epoch": 20.602830683208108, + "grad_norm": 0.39456906914711, + "learning_rate": 0.0003529755244755244, + "loss": 3.2453, + "step": 70750 + }, + { + "epoch": 20.61739181082183, + "grad_norm": 0.4417160749435425, + "learning_rate": 0.0003528006993006993, + "loss": 3.2363, + "step": 70800 + }, + { + "epoch": 20.631952938435553, + "grad_norm": 0.41225501894950867, + "learning_rate": 0.0003526258741258741, + "loss": 3.2298, + "step": 70850 + }, + { + "epoch": 20.646514066049274, + "grad_norm": 0.39469778537750244, + "learning_rate": 0.00035245104895104893, + "loss": 3.241, + "step": 70900 + }, + { + "epoch": 20.661075193663, + "grad_norm": 0.37514132261276245, + "learning_rate": 0.00035227622377622373, + "loss": 3.242, + "step": 70950 + }, + { + "epoch": 20.67563632127672, + "grad_norm": 0.4397062659263611, + "learning_rate": 0.0003521013986013986, + "loss": 3.2396, + "step": 71000 + }, + { + "epoch": 20.67563632127672, + "eval_accuracy": 0.3737319991074647, + "eval_loss": 3.5377063751220703, + "eval_runtime": 179.6736, + "eval_samples_per_second": 92.646, + "eval_steps_per_second": 5.794, + "step": 71000 + }, + { + "epoch": 20.69019744889044, + "grad_norm": 0.4205426573753357, + "learning_rate": 0.0003519265734265734, + "loss": 3.2439, + "step": 71050 + }, + { + "epoch": 20.704758576504165, + "grad_norm": 0.3931397497653961, + "learning_rate": 0.00035175174825174824, + "loss": 3.245, + "step": 71100 + }, + { + "epoch": 20.719319704117886, + "grad_norm": 0.41281893849372864, + "learning_rate": 0.00035157692307692304, + "loss": 3.2373, + "step": 71150 + }, + { + "epoch": 20.73388083173161, + "grad_norm": 0.38877737522125244, + "learning_rate": 0.0003514020979020979, + "loss": 3.2437, + "step": 71200 + }, + { + "epoch": 20.74844195934533, + "grad_norm": 0.3918205797672272, + "learning_rate": 0.0003512272727272727, + "loss": 3.2329, + "step": 71250 + }, + { + "epoch": 20.763003086959053, + "grad_norm": 0.4087960720062256, + "learning_rate": 0.00035105244755244755, + "loss": 3.2433, + "step": 71300 + }, + { + "epoch": 20.777564214572777, + "grad_norm": 0.37705087661743164, + "learning_rate": 0.0003508776223776223, + "loss": 3.2449, + "step": 71350 + }, + { + "epoch": 20.792125342186498, + "grad_norm": 0.4165264666080475, + "learning_rate": 0.00035070279720279715, + "loss": 3.2455, + "step": 71400 + }, + { + "epoch": 20.806686469800223, + "grad_norm": 0.3990912437438965, + "learning_rate": 0.000350527972027972, + "loss": 3.252, + "step": 71450 + }, + { + "epoch": 20.821247597413944, + "grad_norm": 0.4242381453514099, + "learning_rate": 0.0003503531468531468, + "loss": 3.2341, + "step": 71500 + }, + { + "epoch": 20.835808725027665, + "grad_norm": 0.39090588688850403, + "learning_rate": 0.00035017832167832166, + "loss": 3.2474, + "step": 71550 + }, + { + "epoch": 20.85036985264139, + "grad_norm": 0.38443315029144287, + "learning_rate": 0.00035000349650349645, + "loss": 3.2553, + "step": 71600 + }, + { + "epoch": 20.86493098025511, + "grad_norm": 0.3968809247016907, + "learning_rate": 0.0003498286713286713, + "loss": 3.2484, + "step": 71650 + }, + { + "epoch": 20.879492107868835, + "grad_norm": 0.4066787660121918, + "learning_rate": 0.0003496538461538461, + "loss": 3.2533, + "step": 71700 + }, + { + "epoch": 20.894053235482556, + "grad_norm": 0.4234001040458679, + "learning_rate": 0.00034947902097902096, + "loss": 3.2429, + "step": 71750 + }, + { + "epoch": 20.908614363096277, + "grad_norm": 0.37977680563926697, + "learning_rate": 0.00034930419580419576, + "loss": 3.2499, + "step": 71800 + }, + { + "epoch": 20.92317549071, + "grad_norm": 0.435090571641922, + "learning_rate": 0.0003491293706293706, + "loss": 3.2546, + "step": 71850 + }, + { + "epoch": 20.937736618323722, + "grad_norm": 0.41951361298561096, + "learning_rate": 0.0003489545454545454, + "loss": 3.2535, + "step": 71900 + }, + { + "epoch": 20.952297745937447, + "grad_norm": 0.40699416399002075, + "learning_rate": 0.00034877972027972027, + "loss": 3.253, + "step": 71950 + }, + { + "epoch": 20.966858873551168, + "grad_norm": 0.43681085109710693, + "learning_rate": 0.00034860489510489507, + "loss": 3.2587, + "step": 72000 + }, + { + "epoch": 20.966858873551168, + "eval_accuracy": 0.37395536806148433, + "eval_loss": 3.530937910079956, + "eval_runtime": 179.6973, + "eval_samples_per_second": 92.634, + "eval_steps_per_second": 5.793, + "step": 72000 + }, + { + "epoch": 20.98142000116489, + "grad_norm": 0.40609315037727356, + "learning_rate": 0.0003484300699300699, + "loss": 3.2681, + "step": 72050 + }, + { + "epoch": 20.995981128778613, + "grad_norm": 0.3836963176727295, + "learning_rate": 0.0003482552447552448, + "loss": 3.2512, + "step": 72100 + }, + { + "epoch": 21.01048401188188, + "grad_norm": 0.42151519656181335, + "learning_rate": 0.0003480804195804195, + "loss": 3.1914, + "step": 72150 + }, + { + "epoch": 21.025045139495603, + "grad_norm": 0.39965352416038513, + "learning_rate": 0.0003479055944055944, + "loss": 3.1574, + "step": 72200 + }, + { + "epoch": 21.039606267109324, + "grad_norm": 0.39035817980766296, + "learning_rate": 0.0003477307692307692, + "loss": 3.1594, + "step": 72250 + }, + { + "epoch": 21.05416739472305, + "grad_norm": 0.43687039613723755, + "learning_rate": 0.00034755594405594403, + "loss": 3.1593, + "step": 72300 + }, + { + "epoch": 21.06872852233677, + "grad_norm": 0.43884846568107605, + "learning_rate": 0.00034738111888111883, + "loss": 3.1601, + "step": 72350 + }, + { + "epoch": 21.08328964995049, + "grad_norm": 0.4526152014732361, + "learning_rate": 0.0003472062937062937, + "loss": 3.1574, + "step": 72400 + }, + { + "epoch": 21.097850777564215, + "grad_norm": 0.4564531445503235, + "learning_rate": 0.0003470314685314685, + "loss": 3.1738, + "step": 72450 + }, + { + "epoch": 21.112411905177936, + "grad_norm": 0.3975127339363098, + "learning_rate": 0.00034685664335664334, + "loss": 3.1673, + "step": 72500 + }, + { + "epoch": 21.12697303279166, + "grad_norm": 0.4158114194869995, + "learning_rate": 0.00034668181818181814, + "loss": 3.1641, + "step": 72550 + }, + { + "epoch": 21.14153416040538, + "grad_norm": 0.40853938460350037, + "learning_rate": 0.000346506993006993, + "loss": 3.1721, + "step": 72600 + }, + { + "epoch": 21.156095288019102, + "grad_norm": 0.4014092981815338, + "learning_rate": 0.0003463321678321678, + "loss": 3.1741, + "step": 72650 + }, + { + "epoch": 21.170656415632827, + "grad_norm": 0.4352574944496155, + "learning_rate": 0.00034615734265734264, + "loss": 3.1833, + "step": 72700 + }, + { + "epoch": 21.185217543246548, + "grad_norm": 0.3742693066596985, + "learning_rate": 0.0003459825174825175, + "loss": 3.1811, + "step": 72750 + }, + { + "epoch": 21.199778670860272, + "grad_norm": 0.38294392824172974, + "learning_rate": 0.0003458076923076923, + "loss": 3.2017, + "step": 72800 + }, + { + "epoch": 21.214339798473993, + "grad_norm": 0.4095962643623352, + "learning_rate": 0.00034563286713286715, + "loss": 3.1751, + "step": 72850 + }, + { + "epoch": 21.228900926087718, + "grad_norm": 0.4042835831642151, + "learning_rate": 0.0003454580419580419, + "loss": 3.1879, + "step": 72900 + }, + { + "epoch": 21.24346205370144, + "grad_norm": 0.3950826823711395, + "learning_rate": 0.00034528321678321675, + "loss": 3.1906, + "step": 72950 + }, + { + "epoch": 21.25802318131516, + "grad_norm": 0.387794554233551, + "learning_rate": 0.00034510839160839155, + "loss": 3.1993, + "step": 73000 + }, + { + "epoch": 21.25802318131516, + "eval_accuracy": 0.37353320073838725, + "eval_loss": 3.5477094650268555, + "eval_runtime": 180.0332, + "eval_samples_per_second": 92.461, + "eval_steps_per_second": 5.782, + "step": 73000 + }, + { + "epoch": 21.272584308928884, + "grad_norm": 0.40166184306144714, + "learning_rate": 0.0003449335664335664, + "loss": 3.1922, + "step": 73050 + }, + { + "epoch": 21.287145436542605, + "grad_norm": 0.40153566002845764, + "learning_rate": 0.0003447587412587412, + "loss": 3.1882, + "step": 73100 + }, + { + "epoch": 21.30170656415633, + "grad_norm": 0.39476677775382996, + "learning_rate": 0.00034458391608391606, + "loss": 3.2036, + "step": 73150 + }, + { + "epoch": 21.31626769177005, + "grad_norm": 0.4329933524131775, + "learning_rate": 0.00034440909090909086, + "loss": 3.1981, + "step": 73200 + }, + { + "epoch": 21.330828819383772, + "grad_norm": 0.4002084732055664, + "learning_rate": 0.0003442342657342657, + "loss": 3.2047, + "step": 73250 + }, + { + "epoch": 21.345389946997496, + "grad_norm": 0.42490726709365845, + "learning_rate": 0.0003440594405594405, + "loss": 3.1977, + "step": 73300 + }, + { + "epoch": 21.359951074611217, + "grad_norm": 0.3994070887565613, + "learning_rate": 0.00034388461538461537, + "loss": 3.2103, + "step": 73350 + }, + { + "epoch": 21.374512202224942, + "grad_norm": 0.399056613445282, + "learning_rate": 0.00034370979020979017, + "loss": 3.2099, + "step": 73400 + }, + { + "epoch": 21.389073329838663, + "grad_norm": 0.39403849840164185, + "learning_rate": 0.000343534965034965, + "loss": 3.2125, + "step": 73450 + }, + { + "epoch": 21.403634457452384, + "grad_norm": 0.4174330532550812, + "learning_rate": 0.0003433601398601399, + "loss": 3.2097, + "step": 73500 + }, + { + "epoch": 21.41819558506611, + "grad_norm": 0.4052690863609314, + "learning_rate": 0.0003431853146853147, + "loss": 3.2114, + "step": 73550 + }, + { + "epoch": 21.43275671267983, + "grad_norm": 0.3890800178050995, + "learning_rate": 0.0003430104895104895, + "loss": 3.2104, + "step": 73600 + }, + { + "epoch": 21.447317840293554, + "grad_norm": 0.4229435324668884, + "learning_rate": 0.00034283566433566427, + "loss": 3.2104, + "step": 73650 + }, + { + "epoch": 21.461878967907275, + "grad_norm": 0.39021503925323486, + "learning_rate": 0.0003426608391608391, + "loss": 3.2126, + "step": 73700 + }, + { + "epoch": 21.476440095520996, + "grad_norm": 0.38516533374786377, + "learning_rate": 0.0003424860139860139, + "loss": 3.2203, + "step": 73750 + }, + { + "epoch": 21.49100122313472, + "grad_norm": 0.39541691541671753, + "learning_rate": 0.0003423111888111888, + "loss": 3.2282, + "step": 73800 + }, + { + "epoch": 21.50556235074844, + "grad_norm": 0.3969518840312958, + "learning_rate": 0.0003421363636363636, + "loss": 3.2145, + "step": 73850 + }, + { + "epoch": 21.520123478362166, + "grad_norm": 0.4029087722301483, + "learning_rate": 0.00034196153846153843, + "loss": 3.229, + "step": 73900 + }, + { + "epoch": 21.534684605975887, + "grad_norm": 0.4271200895309448, + "learning_rate": 0.00034178671328671323, + "loss": 3.2216, + "step": 73950 + }, + { + "epoch": 21.549245733589608, + "grad_norm": 0.4122142493724823, + "learning_rate": 0.0003416118881118881, + "loss": 3.2144, + "step": 74000 + }, + { + "epoch": 21.549245733589608, + "eval_accuracy": 0.37348217856678484, + "eval_loss": 3.5398857593536377, + "eval_runtime": 179.8183, + "eval_samples_per_second": 92.571, + "eval_steps_per_second": 5.789, + "step": 74000 + }, + { + "epoch": 21.563806861203332, + "grad_norm": 0.37525999546051025, + "learning_rate": 0.0003414370629370629, + "loss": 3.2275, + "step": 74050 + }, + { + "epoch": 21.578367988817053, + "grad_norm": 0.42617496848106384, + "learning_rate": 0.00034126223776223774, + "loss": 3.2097, + "step": 74100 + }, + { + "epoch": 21.592929116430778, + "grad_norm": 0.41545069217681885, + "learning_rate": 0.0003410874125874126, + "loss": 3.2252, + "step": 74150 + }, + { + "epoch": 21.6074902440445, + "grad_norm": 0.3902042508125305, + "learning_rate": 0.0003409125874125874, + "loss": 3.2274, + "step": 74200 + }, + { + "epoch": 21.62205137165822, + "grad_norm": 0.3986893892288208, + "learning_rate": 0.00034073776223776225, + "loss": 3.2347, + "step": 74250 + }, + { + "epoch": 21.636612499271944, + "grad_norm": 0.3976801335811615, + "learning_rate": 0.00034056293706293705, + "loss": 3.2224, + "step": 74300 + }, + { + "epoch": 21.651173626885665, + "grad_norm": 0.41786202788352966, + "learning_rate": 0.0003403881118881119, + "loss": 3.2307, + "step": 74350 + }, + { + "epoch": 21.66573475449939, + "grad_norm": 0.43465039134025574, + "learning_rate": 0.00034021328671328665, + "loss": 3.2236, + "step": 74400 + }, + { + "epoch": 21.68029588211311, + "grad_norm": 0.3973064720630646, + "learning_rate": 0.0003400384615384615, + "loss": 3.2318, + "step": 74450 + }, + { + "epoch": 21.69485700972683, + "grad_norm": 0.4066312909126282, + "learning_rate": 0.0003398636363636363, + "loss": 3.2169, + "step": 74500 + }, + { + "epoch": 21.709418137340556, + "grad_norm": 0.41648659110069275, + "learning_rate": 0.00033968881118881115, + "loss": 3.2215, + "step": 74550 + }, + { + "epoch": 21.723979264954277, + "grad_norm": 0.40081432461738586, + "learning_rate": 0.00033951398601398595, + "loss": 3.2344, + "step": 74600 + }, + { + "epoch": 21.738540392568, + "grad_norm": 0.3875221610069275, + "learning_rate": 0.0003393391608391608, + "loss": 3.2356, + "step": 74650 + }, + { + "epoch": 21.753101520181723, + "grad_norm": 0.38530296087265015, + "learning_rate": 0.0003391643356643356, + "loss": 3.23, + "step": 74700 + }, + { + "epoch": 21.767662647795444, + "grad_norm": 0.39854732155799866, + "learning_rate": 0.00033898951048951046, + "loss": 3.2332, + "step": 74750 + }, + { + "epoch": 21.782223775409168, + "grad_norm": 0.4148358404636383, + "learning_rate": 0.00033881468531468526, + "loss": 3.2353, + "step": 74800 + }, + { + "epoch": 21.79678490302289, + "grad_norm": 0.3934561014175415, + "learning_rate": 0.0003386398601398601, + "loss": 3.2407, + "step": 74850 + }, + { + "epoch": 21.811346030636614, + "grad_norm": 0.414039671421051, + "learning_rate": 0.00033846503496503497, + "loss": 3.2336, + "step": 74900 + }, + { + "epoch": 21.825907158250335, + "grad_norm": 0.39278024435043335, + "learning_rate": 0.00033829020979020977, + "loss": 3.2404, + "step": 74950 + }, + { + "epoch": 21.840468285864056, + "grad_norm": 0.42450013756752014, + "learning_rate": 0.0003381153846153846, + "loss": 3.2363, + "step": 75000 + }, + { + "epoch": 21.840468285864056, + "eval_accuracy": 0.3742865419264702, + "eval_loss": 3.531919479370117, + "eval_runtime": 179.9028, + "eval_samples_per_second": 92.528, + "eval_steps_per_second": 5.786, + "step": 75000 + }, + { + "epoch": 21.85502941347778, + "grad_norm": 0.3962397873401642, + "learning_rate": 0.0003379405594405594, + "loss": 3.2311, + "step": 75050 + }, + { + "epoch": 21.8695905410915, + "grad_norm": 0.4015287756919861, + "learning_rate": 0.0003377657342657343, + "loss": 3.2483, + "step": 75100 + }, + { + "epoch": 21.884151668705226, + "grad_norm": 0.4102756977081299, + "learning_rate": 0.000337590909090909, + "loss": 3.2399, + "step": 75150 + }, + { + "epoch": 21.898712796318947, + "grad_norm": 0.4179084599018097, + "learning_rate": 0.0003374160839160839, + "loss": 3.2523, + "step": 75200 + }, + { + "epoch": 21.91327392393267, + "grad_norm": 0.42700445652008057, + "learning_rate": 0.0003372412587412587, + "loss": 3.2424, + "step": 75250 + }, + { + "epoch": 21.927835051546392, + "grad_norm": 0.4214681386947632, + "learning_rate": 0.00033706643356643353, + "loss": 3.2407, + "step": 75300 + }, + { + "epoch": 21.942396179160113, + "grad_norm": 0.43577635288238525, + "learning_rate": 0.00033689160839160833, + "loss": 3.2301, + "step": 75350 + }, + { + "epoch": 21.956957306773838, + "grad_norm": 0.4034266173839569, + "learning_rate": 0.0003367167832167832, + "loss": 3.2496, + "step": 75400 + }, + { + "epoch": 21.97151843438756, + "grad_norm": 0.40454426407814026, + "learning_rate": 0.000336541958041958, + "loss": 3.2426, + "step": 75450 + }, + { + "epoch": 21.986079562001283, + "grad_norm": 0.4080248475074768, + "learning_rate": 0.00033636713286713284, + "loss": 3.2446, + "step": 75500 + }, + { + "epoch": 22.00058244510455, + "grad_norm": 0.4675055742263794, + "learning_rate": 0.0003361923076923077, + "loss": 3.2306, + "step": 75550 + }, + { + "epoch": 22.015143572718273, + "grad_norm": 0.43351686000823975, + "learning_rate": 0.0003360174825174825, + "loss": 3.1425, + "step": 75600 + }, + { + "epoch": 22.029704700331994, + "grad_norm": 0.4114014506340027, + "learning_rate": 0.00033584265734265734, + "loss": 3.1431, + "step": 75650 + }, + { + "epoch": 22.044265827945715, + "grad_norm": 0.40984776616096497, + "learning_rate": 0.00033566783216783214, + "loss": 3.1486, + "step": 75700 + }, + { + "epoch": 22.05882695555944, + "grad_norm": 0.3943866193294525, + "learning_rate": 0.000335493006993007, + "loss": 3.1488, + "step": 75750 + }, + { + "epoch": 22.07338808317316, + "grad_norm": 0.44786471128463745, + "learning_rate": 0.0003353181818181818, + "loss": 3.1494, + "step": 75800 + }, + { + "epoch": 22.087949210786885, + "grad_norm": 0.42942312359809875, + "learning_rate": 0.00033514335664335665, + "loss": 3.1642, + "step": 75850 + }, + { + "epoch": 22.102510338400606, + "grad_norm": 0.40645474195480347, + "learning_rate": 0.0003349685314685314, + "loss": 3.1632, + "step": 75900 + }, + { + "epoch": 22.117071466014327, + "grad_norm": 0.41174495220184326, + "learning_rate": 0.00033479370629370625, + "loss": 3.1626, + "step": 75950 + }, + { + "epoch": 22.13163259362805, + "grad_norm": 0.41596174240112305, + "learning_rate": 0.00033461888111888105, + "loss": 3.1602, + "step": 76000 + }, + { + "epoch": 22.13163259362805, + "eval_accuracy": 0.37349816708138833, + "eval_loss": 3.5504817962646484, + "eval_runtime": 203.6866, + "eval_samples_per_second": 81.724, + "eval_steps_per_second": 5.111, + "step": 76000 + }, + { + "epoch": 22.146193721241772, + "grad_norm": 0.4284675419330597, + "learning_rate": 0.0003344440559440559, + "loss": 3.1731, + "step": 76050 + }, + { + "epoch": 22.160754848855497, + "grad_norm": 0.39755699038505554, + "learning_rate": 0.0003342692307692307, + "loss": 3.1743, + "step": 76100 + }, + { + "epoch": 22.175315976469218, + "grad_norm": 0.41730695962905884, + "learning_rate": 0.00033409440559440556, + "loss": 3.1718, + "step": 76150 + }, + { + "epoch": 22.18987710408294, + "grad_norm": 0.42204800248146057, + "learning_rate": 0.00033391958041958036, + "loss": 3.1784, + "step": 76200 + }, + { + "epoch": 22.204438231696663, + "grad_norm": 0.42969995737075806, + "learning_rate": 0.0003337447552447552, + "loss": 3.176, + "step": 76250 + }, + { + "epoch": 22.218999359310384, + "grad_norm": 0.39928796887397766, + "learning_rate": 0.00033356993006993007, + "loss": 3.1751, + "step": 76300 + }, + { + "epoch": 22.23356048692411, + "grad_norm": 0.44051143527030945, + "learning_rate": 0.00033339510489510487, + "loss": 3.1718, + "step": 76350 + }, + { + "epoch": 22.24812161453783, + "grad_norm": 0.4043222963809967, + "learning_rate": 0.0003332202797202797, + "loss": 3.1673, + "step": 76400 + }, + { + "epoch": 22.26268274215155, + "grad_norm": 0.4258318841457367, + "learning_rate": 0.0003330454545454545, + "loss": 3.1765, + "step": 76450 + }, + { + "epoch": 22.277243869765275, + "grad_norm": 0.41968485713005066, + "learning_rate": 0.0003328706293706294, + "loss": 3.187, + "step": 76500 + }, + { + "epoch": 22.291804997378996, + "grad_norm": 0.41846907138824463, + "learning_rate": 0.00033269580419580417, + "loss": 3.1817, + "step": 76550 + }, + { + "epoch": 22.30636612499272, + "grad_norm": 0.42832231521606445, + "learning_rate": 0.000332520979020979, + "loss": 3.1935, + "step": 76600 + }, + { + "epoch": 22.32092725260644, + "grad_norm": 0.4351508319377899, + "learning_rate": 0.00033234615384615377, + "loss": 3.1889, + "step": 76650 + }, + { + "epoch": 22.335488380220163, + "grad_norm": 0.45581719279289246, + "learning_rate": 0.0003321713286713286, + "loss": 3.1928, + "step": 76700 + }, + { + "epoch": 22.350049507833887, + "grad_norm": 0.40861231088638306, + "learning_rate": 0.0003319965034965034, + "loss": 3.1976, + "step": 76750 + }, + { + "epoch": 22.364610635447608, + "grad_norm": 0.45059776306152344, + "learning_rate": 0.0003318216783216783, + "loss": 3.2065, + "step": 76800 + }, + { + "epoch": 22.379171763061333, + "grad_norm": 0.41020745038986206, + "learning_rate": 0.0003316468531468531, + "loss": 3.1938, + "step": 76850 + }, + { + "epoch": 22.393732890675054, + "grad_norm": 0.43098288774490356, + "learning_rate": 0.00033147202797202793, + "loss": 3.199, + "step": 76900 + }, + { + "epoch": 22.408294018288775, + "grad_norm": 0.4355289340019226, + "learning_rate": 0.0003312972027972028, + "loss": 3.2009, + "step": 76950 + }, + { + "epoch": 22.4228551459025, + "grad_norm": 0.4274457097053528, + "learning_rate": 0.0003311223776223776, + "loss": 3.1957, + "step": 77000 + }, + { + "epoch": 22.4228551459025, + "eval_accuracy": 0.37429806306199337, + "eval_loss": 3.5408267974853516, + "eval_runtime": 209.8461, + "eval_samples_per_second": 79.325, + "eval_steps_per_second": 4.961, + "step": 77000 + }, + { + "epoch": 22.43741627351622, + "grad_norm": 0.4628225266933441, + "learning_rate": 0.00033094755244755244, + "loss": 3.2019, + "step": 77050 + }, + { + "epoch": 22.451977401129945, + "grad_norm": 0.40951475501060486, + "learning_rate": 0.00033077272727272724, + "loss": 3.2059, + "step": 77100 + }, + { + "epoch": 22.466538528743666, + "grad_norm": 0.4228519797325134, + "learning_rate": 0.0003305979020979021, + "loss": 3.196, + "step": 77150 + }, + { + "epoch": 22.481099656357387, + "grad_norm": 0.39467620849609375, + "learning_rate": 0.0003304230769230769, + "loss": 3.2024, + "step": 77200 + }, + { + "epoch": 22.49566078397111, + "grad_norm": 0.41270583868026733, + "learning_rate": 0.00033024825174825175, + "loss": 3.1946, + "step": 77250 + }, + { + "epoch": 22.510221911584832, + "grad_norm": 0.44770345091819763, + "learning_rate": 0.00033007342657342655, + "loss": 3.1961, + "step": 77300 + }, + { + "epoch": 22.524783039198557, + "grad_norm": 0.41133832931518555, + "learning_rate": 0.0003298986013986014, + "loss": 3.2093, + "step": 77350 + }, + { + "epoch": 22.539344166812278, + "grad_norm": 0.40371203422546387, + "learning_rate": 0.00032972377622377615, + "loss": 3.217, + "step": 77400 + }, + { + "epoch": 22.553905294426002, + "grad_norm": 0.4077425003051758, + "learning_rate": 0.000329548951048951, + "loss": 3.2111, + "step": 77450 + }, + { + "epoch": 22.568466422039723, + "grad_norm": 0.42343926429748535, + "learning_rate": 0.0003293741258741258, + "loss": 3.2109, + "step": 77500 + }, + { + "epoch": 22.583027549653444, + "grad_norm": 0.4148765802383423, + "learning_rate": 0.00032919930069930065, + "loss": 3.2178, + "step": 77550 + }, + { + "epoch": 22.59758867726717, + "grad_norm": 0.4103222191333771, + "learning_rate": 0.0003290244755244755, + "loss": 3.2097, + "step": 77600 + }, + { + "epoch": 22.61214980488089, + "grad_norm": 0.44261687994003296, + "learning_rate": 0.0003288496503496503, + "loss": 3.2068, + "step": 77650 + }, + { + "epoch": 22.626710932494614, + "grad_norm": 0.41240397095680237, + "learning_rate": 0.00032867482517482516, + "loss": 3.2092, + "step": 77700 + }, + { + "epoch": 22.641272060108335, + "grad_norm": 0.40966787934303284, + "learning_rate": 0.00032849999999999996, + "loss": 3.2144, + "step": 77750 + }, + { + "epoch": 22.655833187722056, + "grad_norm": 0.39857950806617737, + "learning_rate": 0.0003283251748251748, + "loss": 3.2212, + "step": 77800 + }, + { + "epoch": 22.67039431533578, + "grad_norm": 0.4325350522994995, + "learning_rate": 0.0003281503496503496, + "loss": 3.2199, + "step": 77850 + }, + { + "epoch": 22.6849554429495, + "grad_norm": 0.4444861114025116, + "learning_rate": 0.00032797552447552447, + "loss": 3.2181, + "step": 77900 + }, + { + "epoch": 22.699516570563226, + "grad_norm": 0.39821311831474304, + "learning_rate": 0.00032780069930069927, + "loss": 3.2296, + "step": 77950 + }, + { + "epoch": 22.714077698176947, + "grad_norm": 0.4173693060874939, + "learning_rate": 0.0003276258741258741, + "loss": 3.2258, + "step": 78000 + }, + { + "epoch": 22.714077698176947, + "eval_accuracy": 0.3743928185235406, + "eval_loss": 3.53715443611145, + "eval_runtime": 179.68, + "eval_samples_per_second": 92.642, + "eval_steps_per_second": 5.794, + "step": 78000 + }, + { + "epoch": 22.728638825790668, + "grad_norm": 0.4090815782546997, + "learning_rate": 0.0003274510489510489, + "loss": 3.225, + "step": 78050 + }, + { + "epoch": 22.743199953404392, + "grad_norm": 0.3890690803527832, + "learning_rate": 0.0003272762237762238, + "loss": 3.2165, + "step": 78100 + }, + { + "epoch": 22.757761081018113, + "grad_norm": 0.4822121858596802, + "learning_rate": 0.0003271013986013985, + "loss": 3.2231, + "step": 78150 + }, + { + "epoch": 22.772322208631838, + "grad_norm": 0.4495407044887543, + "learning_rate": 0.0003269265734265734, + "loss": 3.2311, + "step": 78200 + }, + { + "epoch": 22.78688333624556, + "grad_norm": 0.42291054129600525, + "learning_rate": 0.0003267517482517482, + "loss": 3.2286, + "step": 78250 + }, + { + "epoch": 22.80144446385928, + "grad_norm": 0.43875908851623535, + "learning_rate": 0.00032657692307692303, + "loss": 3.2133, + "step": 78300 + }, + { + "epoch": 22.816005591473004, + "grad_norm": 0.39126718044281006, + "learning_rate": 0.0003264020979020979, + "loss": 3.2198, + "step": 78350 + }, + { + "epoch": 22.830566719086725, + "grad_norm": 0.39724770188331604, + "learning_rate": 0.0003262272727272727, + "loss": 3.2341, + "step": 78400 + }, + { + "epoch": 22.84512784670045, + "grad_norm": 0.40128400921821594, + "learning_rate": 0.00032605244755244754, + "loss": 3.2236, + "step": 78450 + }, + { + "epoch": 22.85968897431417, + "grad_norm": 0.42113637924194336, + "learning_rate": 0.00032587762237762234, + "loss": 3.2344, + "step": 78500 + }, + { + "epoch": 22.874250101927892, + "grad_norm": 0.3953815698623657, + "learning_rate": 0.0003257027972027972, + "loss": 3.2346, + "step": 78550 + }, + { + "epoch": 22.888811229541616, + "grad_norm": 0.40962186455726624, + "learning_rate": 0.000325527972027972, + "loss": 3.2355, + "step": 78600 + }, + { + "epoch": 22.903372357155337, + "grad_norm": 0.41248080134391785, + "learning_rate": 0.00032535314685314684, + "loss": 3.2279, + "step": 78650 + }, + { + "epoch": 22.917933484769062, + "grad_norm": 0.3938314914703369, + "learning_rate": 0.00032517832167832164, + "loss": 3.2427, + "step": 78700 + }, + { + "epoch": 22.932494612382783, + "grad_norm": 0.4408927857875824, + "learning_rate": 0.0003250034965034965, + "loss": 3.2283, + "step": 78750 + }, + { + "epoch": 22.947055739996504, + "grad_norm": 0.4145766496658325, + "learning_rate": 0.0003248286713286713, + "loss": 3.2249, + "step": 78800 + }, + { + "epoch": 22.96161686761023, + "grad_norm": 0.39652279019355774, + "learning_rate": 0.00032465384615384615, + "loss": 3.2319, + "step": 78850 + }, + { + "epoch": 22.97617799522395, + "grad_norm": 0.39880838990211487, + "learning_rate": 0.0003244790209790209, + "loss": 3.2527, + "step": 78900 + }, + { + "epoch": 22.990739122837674, + "grad_norm": 0.42180368304252625, + "learning_rate": 0.00032430419580419575, + "loss": 3.258, + "step": 78950 + }, + { + "epoch": 23.00524200594094, + "grad_norm": 0.45832839608192444, + "learning_rate": 0.00032412937062937066, + "loss": 3.1934, + "step": 79000 + }, + { + "epoch": 23.00524200594094, + "eval_accuracy": 0.37386425704076576, + "eval_loss": 3.5435686111450195, + "eval_runtime": 204.0982, + "eval_samples_per_second": 81.559, + "eval_steps_per_second": 5.1, + "step": 79000 + }, + { + "epoch": 23.019803133554664, + "grad_norm": 0.43102172017097473, + "learning_rate": 0.0003239545454545454, + "loss": 3.1394, + "step": 79050 + }, + { + "epoch": 23.034364261168385, + "grad_norm": 0.4099760949611664, + "learning_rate": 0.00032377972027972026, + "loss": 3.1418, + "step": 79100 + }, + { + "epoch": 23.048925388782106, + "grad_norm": 0.4064216911792755, + "learning_rate": 0.00032360489510489506, + "loss": 3.1343, + "step": 79150 + }, + { + "epoch": 23.06348651639583, + "grad_norm": 0.41040489077568054, + "learning_rate": 0.0003234300699300699, + "loss": 3.1439, + "step": 79200 + }, + { + "epoch": 23.07804764400955, + "grad_norm": 0.40389707684516907, + "learning_rate": 0.0003232552447552447, + "loss": 3.1514, + "step": 79250 + }, + { + "epoch": 23.092608771623276, + "grad_norm": 0.44415801763534546, + "learning_rate": 0.00032308041958041957, + "loss": 3.1359, + "step": 79300 + }, + { + "epoch": 23.107169899236997, + "grad_norm": 0.4046581983566284, + "learning_rate": 0.00032290559440559437, + "loss": 3.1528, + "step": 79350 + }, + { + "epoch": 23.121731026850718, + "grad_norm": 0.4351769983768463, + "learning_rate": 0.0003227307692307692, + "loss": 3.1586, + "step": 79400 + }, + { + "epoch": 23.136292154464442, + "grad_norm": 0.45303016901016235, + "learning_rate": 0.000322555944055944, + "loss": 3.1487, + "step": 79450 + }, + { + "epoch": 23.150853282078163, + "grad_norm": 0.4389405846595764, + "learning_rate": 0.00032238111888111887, + "loss": 3.1571, + "step": 79500 + }, + { + "epoch": 23.165414409691888, + "grad_norm": 0.425350546836853, + "learning_rate": 0.00032220629370629367, + "loss": 3.1615, + "step": 79550 + }, + { + "epoch": 23.17997553730561, + "grad_norm": 0.4540613293647766, + "learning_rate": 0.0003220314685314685, + "loss": 3.1741, + "step": 79600 + }, + { + "epoch": 23.19453666491933, + "grad_norm": 0.44594645500183105, + "learning_rate": 0.00032185664335664327, + "loss": 3.1742, + "step": 79650 + }, + { + "epoch": 23.209097792533054, + "grad_norm": 0.4110971689224243, + "learning_rate": 0.0003216818181818181, + "loss": 3.1683, + "step": 79700 + }, + { + "epoch": 23.223658920146775, + "grad_norm": 0.4295291602611542, + "learning_rate": 0.00032150699300699303, + "loss": 3.1743, + "step": 79750 + }, + { + "epoch": 23.2382200477605, + "grad_norm": 0.4476734399795532, + "learning_rate": 0.0003213321678321678, + "loss": 3.1666, + "step": 79800 + }, + { + "epoch": 23.25278117537422, + "grad_norm": 0.42437073588371277, + "learning_rate": 0.00032115734265734263, + "loss": 3.1704, + "step": 79850 + }, + { + "epoch": 23.26734230298794, + "grad_norm": 0.41974642872810364, + "learning_rate": 0.00032098251748251743, + "loss": 3.1811, + "step": 79900 + }, + { + "epoch": 23.281903430601666, + "grad_norm": 0.42279666662216187, + "learning_rate": 0.0003208076923076923, + "loss": 3.1837, + "step": 79950 + }, + { + "epoch": 23.296464558215387, + "grad_norm": 0.4157053828239441, + "learning_rate": 0.0003206328671328671, + "loss": 3.183, + "step": 80000 + }, + { + "epoch": 23.296464558215387, + "eval_accuracy": 0.37421988392808647, + "eval_loss": 3.5450761318206787, + "eval_runtime": 924.0109, + "eval_samples_per_second": 18.015, + "eval_steps_per_second": 1.127, + "step": 80000 + }, + { + "epoch": 23.31102568582911, + "grad_norm": 0.4075993001461029, + "learning_rate": 0.00032045804195804194, + "loss": 3.1425, + "step": 80050 + }, + { + "epoch": 23.325586813442833, + "grad_norm": 0.435520738363266, + "learning_rate": 0.00032028321678321674, + "loss": 3.1409, + "step": 80100 + }, + { + "epoch": 23.340147941056557, + "grad_norm": 0.44043847918510437, + "learning_rate": 0.0003201083916083916, + "loss": 3.1473, + "step": 80150 + }, + { + "epoch": 23.354709068670278, + "grad_norm": 0.406053751707077, + "learning_rate": 0.0003199335664335664, + "loss": 3.149, + "step": 80200 + }, + { + "epoch": 23.369270196284, + "grad_norm": 0.423145055770874, + "learning_rate": 0.00031975874125874125, + "loss": 3.1526, + "step": 80250 + }, + { + "epoch": 23.383831323897724, + "grad_norm": 0.4597681164741516, + "learning_rate": 0.00031958391608391605, + "loss": 3.1547, + "step": 80300 + }, + { + "epoch": 23.398392451511445, + "grad_norm": 0.41860097646713257, + "learning_rate": 0.0003194090909090909, + "loss": 3.1435, + "step": 80350 + }, + { + "epoch": 23.41295357912517, + "grad_norm": 0.4147387742996216, + "learning_rate": 0.00031923426573426576, + "loss": 3.1565, + "step": 80400 + }, + { + "epoch": 23.42751470673889, + "grad_norm": 0.43652892112731934, + "learning_rate": 0.0003190594405594405, + "loss": 3.155, + "step": 80450 + }, + { + "epoch": 23.44207583435261, + "grad_norm": 0.43268704414367676, + "learning_rate": 0.0003188846153846154, + "loss": 3.1696, + "step": 80500 + }, + { + "epoch": 23.456636961966336, + "grad_norm": 0.4125154912471771, + "learning_rate": 0.00031870979020979015, + "loss": 3.1671, + "step": 80550 + }, + { + "epoch": 23.471198089580056, + "grad_norm": 0.436293363571167, + "learning_rate": 0.000318534965034965, + "loss": 3.1677, + "step": 80600 + }, + { + "epoch": 23.48575921719378, + "grad_norm": 0.44055047631263733, + "learning_rate": 0.0003183601398601398, + "loss": 3.168, + "step": 80650 + }, + { + "epoch": 23.500320344807502, + "grad_norm": 0.4154927432537079, + "learning_rate": 0.00031818531468531466, + "loss": 3.1656, + "step": 80700 + }, + { + "epoch": 23.514881472421223, + "grad_norm": 0.4015692174434662, + "learning_rate": 0.00031801048951048946, + "loss": 3.1698, + "step": 80750 + }, + { + "epoch": 23.529442600034947, + "grad_norm": 0.462243914604187, + "learning_rate": 0.0003178356643356643, + "loss": 3.1756, + "step": 80800 + }, + { + "epoch": 23.54400372764867, + "grad_norm": 0.3996904194355011, + "learning_rate": 0.0003176608391608391, + "loss": 3.1665, + "step": 80850 + }, + { + "epoch": 23.558564855262393, + "grad_norm": 0.4055446982383728, + "learning_rate": 0.00031748601398601397, + "loss": 3.1848, + "step": 80900 + }, + { + "epoch": 23.573125982876114, + "grad_norm": 0.41298335790634155, + "learning_rate": 0.00031731118881118877, + "loss": 3.1912, + "step": 80950 + }, + { + "epoch": 23.587687110489835, + "grad_norm": 0.447311669588089, + "learning_rate": 0.0003171363636363636, + "loss": 3.182, + "step": 81000 + }, + { + "epoch": 23.587687110489835, + "eval_accuracy": 0.3740028633548653, + "eval_loss": 3.5492684841156006, + "eval_runtime": 80.4358, + "eval_samples_per_second": 206.948, + "eval_steps_per_second": 12.942, + "step": 81000 + }, + { + "epoch": 23.60224823810356, + "grad_norm": 0.42425885796546936, + "learning_rate": 0.0003169615384615385, + "loss": 3.1837, + "step": 81050 + }, + { + "epoch": 23.61680936571728, + "grad_norm": 0.40934208035469055, + "learning_rate": 0.0003167867132867133, + "loss": 3.1819, + "step": 81100 + }, + { + "epoch": 23.631370493331005, + "grad_norm": 0.4387608766555786, + "learning_rate": 0.00031661188811188813, + "loss": 3.1764, + "step": 81150 + }, + { + "epoch": 23.645931620944726, + "grad_norm": 0.3999146521091461, + "learning_rate": 0.0003164370629370629, + "loss": 3.1878, + "step": 81200 + }, + { + "epoch": 23.660492748558447, + "grad_norm": 0.4529256820678711, + "learning_rate": 0.0003162622377622378, + "loss": 3.179, + "step": 81250 + }, + { + "epoch": 23.67505387617217, + "grad_norm": 0.4573615491390228, + "learning_rate": 0.00031608741258741253, + "loss": 3.1939, + "step": 81300 + }, + { + "epoch": 23.689615003785892, + "grad_norm": 0.43424728512763977, + "learning_rate": 0.0003159125874125874, + "loss": 3.1878, + "step": 81350 + }, + { + "epoch": 23.704176131399617, + "grad_norm": 0.4275583326816559, + "learning_rate": 0.0003157377622377622, + "loss": 3.1769, + "step": 81400 + }, + { + "epoch": 23.718737259013338, + "grad_norm": 0.46233272552490234, + "learning_rate": 0.00031556293706293704, + "loss": 3.1916, + "step": 81450 + }, + { + "epoch": 23.73329838662706, + "grad_norm": 0.4323495030403137, + "learning_rate": 0.00031538811188811184, + "loss": 3.2002, + "step": 81500 + }, + { + "epoch": 23.747859514240783, + "grad_norm": 0.4355345666408539, + "learning_rate": 0.0003152132867132867, + "loss": 3.1995, + "step": 81550 + }, + { + "epoch": 23.762420641854504, + "grad_norm": 0.45010533928871155, + "learning_rate": 0.0003150384615384615, + "loss": 3.1912, + "step": 81600 + }, + { + "epoch": 23.77698176946823, + "grad_norm": 0.41172489523887634, + "learning_rate": 0.00031486363636363634, + "loss": 3.1936, + "step": 81650 + }, + { + "epoch": 23.79154289708195, + "grad_norm": 0.4244674742221832, + "learning_rate": 0.00031468881118881114, + "loss": 3.2111, + "step": 81700 + }, + { + "epoch": 23.80610402469567, + "grad_norm": 0.45701536536216736, + "learning_rate": 0.000314513986013986, + "loss": 3.2003, + "step": 81750 + }, + { + "epoch": 23.820665152309395, + "grad_norm": 0.4102361798286438, + "learning_rate": 0.00031433916083916085, + "loss": 3.1871, + "step": 81800 + }, + { + "epoch": 23.835226279923116, + "grad_norm": 0.4519789516925812, + "learning_rate": 0.00031416433566433565, + "loss": 3.199, + "step": 81850 + }, + { + "epoch": 23.84978740753684, + "grad_norm": 0.42273131012916565, + "learning_rate": 0.0003139895104895105, + "loss": 3.189, + "step": 81900 + }, + { + "epoch": 23.86434853515056, + "grad_norm": 0.39564603567123413, + "learning_rate": 0.00031381468531468525, + "loss": 3.2031, + "step": 81950 + }, + { + "epoch": 23.878909662764286, + "grad_norm": 0.4374904930591583, + "learning_rate": 0.00031363986013986016, + "loss": 3.2035, + "step": 82000 + }, + { + "epoch": 23.878909662764286, + "eval_accuracy": 0.37404953570999466, + "eval_loss": 3.541025400161743, + "eval_runtime": 80.2075, + "eval_samples_per_second": 207.537, + "eval_steps_per_second": 12.979, + "step": 82000 + }, + { + "epoch": 23.893470790378007, + "grad_norm": 0.41857510805130005, + "learning_rate": 0.0003134650349650349, + "loss": 3.2, + "step": 82050 + }, + { + "epoch": 23.908031917991728, + "grad_norm": 0.43600863218307495, + "learning_rate": 0.00031329020979020976, + "loss": 3.2011, + "step": 82100 + }, + { + "epoch": 23.922593045605453, + "grad_norm": 0.4216744899749756, + "learning_rate": 0.00031311538461538456, + "loss": 3.2035, + "step": 82150 + }, + { + "epoch": 23.937154173219174, + "grad_norm": 0.411630243062973, + "learning_rate": 0.0003129405594405594, + "loss": 3.2061, + "step": 82200 + }, + { + "epoch": 23.951715300832895, + "grad_norm": 0.438644140958786, + "learning_rate": 0.0003127657342657342, + "loss": 3.2096, + "step": 82250 + }, + { + "epoch": 23.96627642844662, + "grad_norm": 0.41276854276657104, + "learning_rate": 0.00031259090909090907, + "loss": 3.2023, + "step": 82300 + }, + { + "epoch": 23.98083755606034, + "grad_norm": 0.4013693928718567, + "learning_rate": 0.00031241608391608386, + "loss": 3.2011, + "step": 82350 + }, + { + "epoch": 23.995398683674065, + "grad_norm": 0.4175088703632355, + "learning_rate": 0.0003122412587412587, + "loss": 3.2096, + "step": 82400 + }, + { + "epoch": 24.010192789329604, + "grad_norm": 0.3980584442615509, + "learning_rate": 0.00031206643356643357, + "loss": 3.2127, + "step": 82450 + }, + { + "epoch": 24.02475391694333, + "grad_norm": 0.40012335777282715, + "learning_rate": 0.00031189160839160837, + "loss": 3.1319, + "step": 82500 + }, + { + "epoch": 24.03931504455705, + "grad_norm": 0.45777830481529236, + "learning_rate": 0.0003117167832167832, + "loss": 3.1226, + "step": 82550 + }, + { + "epoch": 24.053876172170774, + "grad_norm": 0.44569453597068787, + "learning_rate": 0.000311541958041958, + "loss": 3.1373, + "step": 82600 + }, + { + "epoch": 24.068437299784495, + "grad_norm": 0.4426841735839844, + "learning_rate": 0.0003113671328671329, + "loss": 3.1402, + "step": 82650 + }, + { + "epoch": 24.082998427398216, + "grad_norm": 0.4288923442363739, + "learning_rate": 0.0003111923076923076, + "loss": 3.1401, + "step": 82700 + }, + { + "epoch": 24.09755955501194, + "grad_norm": 0.45076754689216614, + "learning_rate": 0.00031101748251748253, + "loss": 3.1331, + "step": 82750 + }, + { + "epoch": 24.11212068262566, + "grad_norm": 0.4328001141548157, + "learning_rate": 0.0003108426573426573, + "loss": 3.1486, + "step": 82800 + }, + { + "epoch": 24.126681810239386, + "grad_norm": 0.4192425608634949, + "learning_rate": 0.00031066783216783213, + "loss": 3.1511, + "step": 82850 + }, + { + "epoch": 24.141242937853107, + "grad_norm": 0.432831346988678, + "learning_rate": 0.00031049300699300693, + "loss": 3.1592, + "step": 82900 + }, + { + "epoch": 24.15580406546683, + "grad_norm": 0.4342346787452698, + "learning_rate": 0.0003103181818181818, + "loss": 3.1628, + "step": 82950 + }, + { + "epoch": 24.170365193080553, + "grad_norm": 0.4602639973163605, + "learning_rate": 0.0003101433566433566, + "loss": 3.1533, + "step": 83000 + }, + { + "epoch": 24.170365193080553, + "eval_accuracy": 0.3738702527337421, + "eval_loss": 3.5505053997039795, + "eval_runtime": 81.647, + "eval_samples_per_second": 203.878, + "eval_steps_per_second": 12.75, + "step": 83000 + }, + { + "epoch": 24.184926320694274, + "grad_norm": 0.42068788409233093, + "learning_rate": 0.00030996853146853144, + "loss": 3.1589, + "step": 83050 + }, + { + "epoch": 24.199487448308, + "grad_norm": 0.4067046344280243, + "learning_rate": 0.00030979370629370624, + "loss": 3.1652, + "step": 83100 + }, + { + "epoch": 24.21404857592172, + "grad_norm": 0.43839457631111145, + "learning_rate": 0.0003096188811188811, + "loss": 3.1579, + "step": 83150 + }, + { + "epoch": 24.22860970353544, + "grad_norm": 0.439429372549057, + "learning_rate": 0.00030944405594405595, + "loss": 3.1643, + "step": 83200 + }, + { + "epoch": 24.243170831149165, + "grad_norm": 0.41719990968704224, + "learning_rate": 0.00030926923076923075, + "loss": 3.1661, + "step": 83250 + }, + { + "epoch": 24.257731958762886, + "grad_norm": 0.44479116797447205, + "learning_rate": 0.0003090944055944056, + "loss": 3.154, + "step": 83300 + }, + { + "epoch": 24.27229308637661, + "grad_norm": 0.4219229519367218, + "learning_rate": 0.0003089195804195804, + "loss": 3.1805, + "step": 83350 + }, + { + "epoch": 24.28685421399033, + "grad_norm": 0.42215579748153687, + "learning_rate": 0.00030874475524475525, + "loss": 3.1685, + "step": 83400 + }, + { + "epoch": 24.301415341604052, + "grad_norm": 0.40854838490486145, + "learning_rate": 0.00030856993006993, + "loss": 3.1756, + "step": 83450 + }, + { + "epoch": 24.315976469217777, + "grad_norm": 0.4285014271736145, + "learning_rate": 0.0003083951048951049, + "loss": 3.1724, + "step": 83500 + }, + { + "epoch": 24.330537596831498, + "grad_norm": 0.44286397099494934, + "learning_rate": 0.00030822027972027965, + "loss": 3.1803, + "step": 83550 + }, + { + "epoch": 24.345098724445222, + "grad_norm": 0.4677809476852417, + "learning_rate": 0.0003080454545454545, + "loss": 3.175, + "step": 83600 + }, + { + "epoch": 24.359659852058943, + "grad_norm": 0.44377604126930237, + "learning_rate": 0.0003078706293706293, + "loss": 3.1706, + "step": 83650 + }, + { + "epoch": 24.374220979672664, + "grad_norm": 0.42284804582595825, + "learning_rate": 0.00030769580419580416, + "loss": 3.1799, + "step": 83700 + }, + { + "epoch": 24.38878210728639, + "grad_norm": 0.4500799775123596, + "learning_rate": 0.00030752097902097896, + "loss": 3.1823, + "step": 83750 + }, + { + "epoch": 24.40334323490011, + "grad_norm": 0.43594327569007874, + "learning_rate": 0.0003073461538461538, + "loss": 3.1918, + "step": 83800 + }, + { + "epoch": 24.417904362513834, + "grad_norm": 0.487556517124176, + "learning_rate": 0.00030717132867132867, + "loss": 3.1907, + "step": 83850 + }, + { + "epoch": 24.432465490127555, + "grad_norm": 0.4531272053718567, + "learning_rate": 0.00030699650349650347, + "loss": 3.2009, + "step": 83900 + }, + { + "epoch": 24.44702661774128, + "grad_norm": 0.4245949387550354, + "learning_rate": 0.0003068216783216783, + "loss": 3.1898, + "step": 83950 + }, + { + "epoch": 24.461587745355, + "grad_norm": 0.4252057373523712, + "learning_rate": 0.0003066468531468531, + "loss": 3.1841, + "step": 84000 + }, + { + "epoch": 24.461587745355, + "eval_accuracy": 0.3741536961801322, + "eval_loss": 3.5409674644470215, + "eval_runtime": 81.8197, + "eval_samples_per_second": 203.447, + "eval_steps_per_second": 12.723, + "step": 84000 + }, + { + "epoch": 24.47614887296872, + "grad_norm": 0.42920979857444763, + "learning_rate": 0.000306472027972028, + "loss": 3.1956, + "step": 84050 + }, + { + "epoch": 24.490710000582446, + "grad_norm": 0.43671944737434387, + "learning_rate": 0.0003062972027972028, + "loss": 3.184, + "step": 84100 + }, + { + "epoch": 24.505271128196167, + "grad_norm": 0.4089336693286896, + "learning_rate": 0.00030612237762237763, + "loss": 3.186, + "step": 84150 + }, + { + "epoch": 24.51983225580989, + "grad_norm": 0.4428396224975586, + "learning_rate": 0.0003059475524475524, + "loss": 3.2047, + "step": 84200 + }, + { + "epoch": 24.534393383423613, + "grad_norm": 0.419969767332077, + "learning_rate": 0.0003057727272727273, + "loss": 3.1867, + "step": 84250 + }, + { + "epoch": 24.548954511037334, + "grad_norm": 0.40729889273643494, + "learning_rate": 0.00030559790209790203, + "loss": 3.2113, + "step": 84300 + }, + { + "epoch": 24.563515638651058, + "grad_norm": 0.45397377014160156, + "learning_rate": 0.0003054230769230769, + "loss": 3.1974, + "step": 84350 + }, + { + "epoch": 24.57807676626478, + "grad_norm": 0.4194105863571167, + "learning_rate": 0.0003052482517482517, + "loss": 3.2091, + "step": 84400 + }, + { + "epoch": 24.592637893878504, + "grad_norm": 0.4194582402706146, + "learning_rate": 0.00030507342657342654, + "loss": 3.1941, + "step": 84450 + }, + { + "epoch": 24.607199021492224, + "grad_norm": 0.40591809153556824, + "learning_rate": 0.00030489860139860134, + "loss": 3.1961, + "step": 84500 + }, + { + "epoch": 24.621760149105945, + "grad_norm": 0.4254491627216339, + "learning_rate": 0.0003047237762237762, + "loss": 3.2122, + "step": 84550 + }, + { + "epoch": 24.63632127671967, + "grad_norm": 0.4454302489757538, + "learning_rate": 0.00030454895104895104, + "loss": 3.2067, + "step": 84600 + }, + { + "epoch": 24.65088240433339, + "grad_norm": 0.4131137430667877, + "learning_rate": 0.00030437412587412584, + "loss": 3.2015, + "step": 84650 + }, + { + "epoch": 24.665443531947115, + "grad_norm": 0.4385775923728943, + "learning_rate": 0.0003041993006993007, + "loss": 3.1937, + "step": 84700 + }, + { + "epoch": 24.680004659560836, + "grad_norm": 0.42315569519996643, + "learning_rate": 0.0003040244755244755, + "loss": 3.2043, + "step": 84750 + }, + { + "epoch": 24.694565787174557, + "grad_norm": 0.42199307680130005, + "learning_rate": 0.00030384965034965035, + "loss": 3.2053, + "step": 84800 + }, + { + "epoch": 24.709126914788282, + "grad_norm": 0.4560045599937439, + "learning_rate": 0.00030367482517482515, + "loss": 3.2146, + "step": 84850 + }, + { + "epoch": 24.723688042402003, + "grad_norm": 0.4480164349079132, + "learning_rate": 0.0003035, + "loss": 3.189, + "step": 84900 + }, + { + "epoch": 24.738249170015727, + "grad_norm": 0.45350882411003113, + "learning_rate": 0.00030332517482517475, + "loss": 3.2061, + "step": 84950 + }, + { + "epoch": 24.75281029762945, + "grad_norm": 0.4041248857975006, + "learning_rate": 0.00030315034965034966, + "loss": 3.1957, + "step": 85000 + }, + { + "epoch": 24.75281029762945, + "eval_accuracy": 0.37445571451848825, + "eval_loss": 3.537726640701294, + "eval_runtime": 81.755, + "eval_samples_per_second": 203.608, + "eval_steps_per_second": 12.733, + "step": 85000 + }, + { + "epoch": 24.76737142524317, + "grad_norm": 0.4523855447769165, + "learning_rate": 0.0003029755244755244, + "loss": 3.2056, + "step": 85050 + }, + { + "epoch": 24.781932552856894, + "grad_norm": 0.39477360248565674, + "learning_rate": 0.00030280069930069926, + "loss": 3.2026, + "step": 85100 + }, + { + "epoch": 24.796493680470615, + "grad_norm": 0.4192032814025879, + "learning_rate": 0.00030262587412587406, + "loss": 3.2077, + "step": 85150 + }, + { + "epoch": 24.81105480808434, + "grad_norm": 0.44437819719314575, + "learning_rate": 0.0003024510489510489, + "loss": 3.2079, + "step": 85200 + }, + { + "epoch": 24.82561593569806, + "grad_norm": 0.454645037651062, + "learning_rate": 0.00030227622377622377, + "loss": 3.2258, + "step": 85250 + }, + { + "epoch": 24.84017706331178, + "grad_norm": 0.4044639766216278, + "learning_rate": 0.00030210139860139856, + "loss": 3.2006, + "step": 85300 + }, + { + "epoch": 24.854738190925506, + "grad_norm": 0.3925352990627289, + "learning_rate": 0.0003019265734265734, + "loss": 3.2019, + "step": 85350 + }, + { + "epoch": 24.869299318539227, + "grad_norm": 0.4119468331336975, + "learning_rate": 0.0003017517482517482, + "loss": 3.2163, + "step": 85400 + }, + { + "epoch": 24.88386044615295, + "grad_norm": 0.4199124574661255, + "learning_rate": 0.00030157692307692307, + "loss": 3.2147, + "step": 85450 + }, + { + "epoch": 24.898421573766672, + "grad_norm": 0.435290664434433, + "learning_rate": 0.00030140209790209787, + "loss": 3.219, + "step": 85500 + }, + { + "epoch": 24.912982701380393, + "grad_norm": 0.4147189259529114, + "learning_rate": 0.0003012272727272727, + "loss": 3.209, + "step": 85550 + }, + { + "epoch": 24.927543828994118, + "grad_norm": 0.4204513132572174, + "learning_rate": 0.0003010524475524475, + "loss": 3.2265, + "step": 85600 + }, + { + "epoch": 24.94210495660784, + "grad_norm": 0.4550034701824188, + "learning_rate": 0.0003008776223776224, + "loss": 3.2328, + "step": 85650 + }, + { + "epoch": 24.956666084221563, + "grad_norm": 0.4185858964920044, + "learning_rate": 0.0003007027972027972, + "loss": 3.2214, + "step": 85700 + }, + { + "epoch": 24.971227211835284, + "grad_norm": 0.44467946887016296, + "learning_rate": 0.00030052797202797203, + "loss": 3.2178, + "step": 85750 + }, + { + "epoch": 24.985788339449005, + "grad_norm": 0.4270879626274109, + "learning_rate": 0.0003003531468531468, + "loss": 3.2316, + "step": 85800 + }, + { + "epoch": 25.000291222552274, + "grad_norm": 0.42271360754966736, + "learning_rate": 0.00030017832167832163, + "loss": 3.2205, + "step": 85850 + }, + { + "epoch": 25.014852350165995, + "grad_norm": 0.4256053566932678, + "learning_rate": 0.0003000034965034965, + "loss": 3.1192, + "step": 85900 + }, + { + "epoch": 25.02941347777972, + "grad_norm": 0.4629895091056824, + "learning_rate": 0.0002998286713286713, + "loss": 3.1012, + "step": 85950 + }, + { + "epoch": 25.04397460539344, + "grad_norm": 0.465717613697052, + "learning_rate": 0.00029965384615384614, + "loss": 3.1198, + "step": 86000 + }, + { + "epoch": 25.04397460539344, + "eval_accuracy": 0.3741750925746752, + "eval_loss": 3.5509843826293945, + "eval_runtime": 81.7203, + "eval_samples_per_second": 203.695, + "eval_steps_per_second": 12.739, + "step": 86000 + }, + { + "epoch": 25.058535733007165, + "grad_norm": 0.4088675379753113, + "learning_rate": 0.00029947902097902094, + "loss": 3.1269, + "step": 86050 + }, + { + "epoch": 25.073096860620886, + "grad_norm": 0.46389642357826233, + "learning_rate": 0.0002993041958041958, + "loss": 3.134, + "step": 86100 + }, + { + "epoch": 25.087657988234607, + "grad_norm": 0.44134199619293213, + "learning_rate": 0.0002991293706293706, + "loss": 3.129, + "step": 86150 + }, + { + "epoch": 25.10221911584833, + "grad_norm": 0.4049500524997711, + "learning_rate": 0.0002989545454545454, + "loss": 3.139, + "step": 86200 + }, + { + "epoch": 25.116780243462053, + "grad_norm": 0.4358852803707123, + "learning_rate": 0.00029877972027972025, + "loss": 3.1309, + "step": 86250 + }, + { + "epoch": 25.131341371075777, + "grad_norm": 0.4504960775375366, + "learning_rate": 0.0002986048951048951, + "loss": 3.1381, + "step": 86300 + }, + { + "epoch": 25.145902498689498, + "grad_norm": 0.4623953402042389, + "learning_rate": 0.0002984300699300699, + "loss": 3.138, + "step": 86350 + }, + { + "epoch": 25.160463626303223, + "grad_norm": 0.4341890811920166, + "learning_rate": 0.00029825524475524475, + "loss": 3.1464, + "step": 86400 + }, + { + "epoch": 25.175024753916944, + "grad_norm": 0.4155861437320709, + "learning_rate": 0.00029808041958041955, + "loss": 3.1528, + "step": 86450 + }, + { + "epoch": 25.189585881530665, + "grad_norm": 0.43031230568885803, + "learning_rate": 0.0002979055944055944, + "loss": 3.1627, + "step": 86500 + }, + { + "epoch": 25.20414700914439, + "grad_norm": 0.42100533843040466, + "learning_rate": 0.0002977307692307692, + "loss": 3.1528, + "step": 86550 + }, + { + "epoch": 25.21870813675811, + "grad_norm": 0.45310893654823303, + "learning_rate": 0.000297555944055944, + "loss": 3.1452, + "step": 86600 + }, + { + "epoch": 25.233269264371835, + "grad_norm": 0.43537208437919617, + "learning_rate": 0.00029738111888111886, + "loss": 3.1542, + "step": 86650 + }, + { + "epoch": 25.247830391985556, + "grad_norm": 0.4328289031982422, + "learning_rate": 0.00029720629370629366, + "loss": 3.1518, + "step": 86700 + }, + { + "epoch": 25.262391519599277, + "grad_norm": 0.4341360032558441, + "learning_rate": 0.0002970314685314685, + "loss": 3.1585, + "step": 86750 + }, + { + "epoch": 25.276952647213, + "grad_norm": 0.42555415630340576, + "learning_rate": 0.0002968566433566433, + "loss": 3.1713, + "step": 86800 + }, + { + "epoch": 25.291513774826722, + "grad_norm": 0.4444054663181305, + "learning_rate": 0.00029668181818181817, + "loss": 3.167, + "step": 86850 + }, + { + "epoch": 25.306074902440447, + "grad_norm": 0.4327361285686493, + "learning_rate": 0.00029650699300699297, + "loss": 3.161, + "step": 86900 + }, + { + "epoch": 25.320636030054168, + "grad_norm": 0.4254668354988098, + "learning_rate": 0.0002963321678321678, + "loss": 3.1667, + "step": 86950 + }, + { + "epoch": 25.33519715766789, + "grad_norm": 0.4326876997947693, + "learning_rate": 0.0002961573426573426, + "loss": 3.1721, + "step": 87000 + }, + { + "epoch": 25.33519715766789, + "eval_accuracy": 0.374514613384785, + "eval_loss": 3.5434107780456543, + "eval_runtime": 81.7833, + "eval_samples_per_second": 203.538, + "eval_steps_per_second": 12.729, + "step": 87000 + }, + { + "epoch": 25.349758285281613, + "grad_norm": 0.42808645963668823, + "learning_rate": 0.0002959825174825175, + "loss": 3.1783, + "step": 87050 + }, + { + "epoch": 25.364319412895334, + "grad_norm": 0.43070703744888306, + "learning_rate": 0.0002958076923076923, + "loss": 3.1639, + "step": 87100 + }, + { + "epoch": 25.37888054050906, + "grad_norm": 0.4267195463180542, + "learning_rate": 0.00029563286713286713, + "loss": 3.1752, + "step": 87150 + }, + { + "epoch": 25.39344166812278, + "grad_norm": 0.4095643162727356, + "learning_rate": 0.00029545804195804193, + "loss": 3.1775, + "step": 87200 + }, + { + "epoch": 25.4080027957365, + "grad_norm": 0.43954792618751526, + "learning_rate": 0.0002952832167832168, + "loss": 3.174, + "step": 87250 + }, + { + "epoch": 25.422563923350225, + "grad_norm": 0.42223548889160156, + "learning_rate": 0.0002951083916083916, + "loss": 3.1674, + "step": 87300 + }, + { + "epoch": 25.437125050963946, + "grad_norm": 0.41911086440086365, + "learning_rate": 0.0002949335664335664, + "loss": 3.1803, + "step": 87350 + }, + { + "epoch": 25.45168617857767, + "grad_norm": 0.42769038677215576, + "learning_rate": 0.00029475874125874124, + "loss": 3.1638, + "step": 87400 + }, + { + "epoch": 25.46624730619139, + "grad_norm": 0.4431706964969635, + "learning_rate": 0.00029458391608391604, + "loss": 3.1688, + "step": 87450 + }, + { + "epoch": 25.480808433805112, + "grad_norm": 0.4112902879714966, + "learning_rate": 0.0002944090909090909, + "loss": 3.1988, + "step": 87500 + }, + { + "epoch": 25.495369561418837, + "grad_norm": 0.42165741324424744, + "learning_rate": 0.0002942342657342657, + "loss": 3.1791, + "step": 87550 + }, + { + "epoch": 25.509930689032558, + "grad_norm": 0.4093731939792633, + "learning_rate": 0.00029405944055944054, + "loss": 3.1786, + "step": 87600 + }, + { + "epoch": 25.524491816646282, + "grad_norm": 0.4248475432395935, + "learning_rate": 0.0002938846153846154, + "loss": 3.184, + "step": 87650 + }, + { + "epoch": 25.539052944260003, + "grad_norm": 0.42811527848243713, + "learning_rate": 0.0002937097902097902, + "loss": 3.1883, + "step": 87700 + }, + { + "epoch": 25.553614071873724, + "grad_norm": 0.44041386246681213, + "learning_rate": 0.000293534965034965, + "loss": 3.1955, + "step": 87750 + }, + { + "epoch": 25.56817519948745, + "grad_norm": 0.4249640107154846, + "learning_rate": 0.00029336013986013985, + "loss": 3.1888, + "step": 87800 + }, + { + "epoch": 25.58273632710117, + "grad_norm": 0.4414690136909485, + "learning_rate": 0.00029318531468531465, + "loss": 3.201, + "step": 87850 + }, + { + "epoch": 25.597297454714894, + "grad_norm": 0.45681437849998474, + "learning_rate": 0.0002930104895104895, + "loss": 3.1867, + "step": 87900 + }, + { + "epoch": 25.611858582328615, + "grad_norm": 0.41531237959861755, + "learning_rate": 0.0002928356643356643, + "loss": 3.1941, + "step": 87950 + }, + { + "epoch": 25.626419709942336, + "grad_norm": 0.4413832426071167, + "learning_rate": 0.00029266083916083916, + "loss": 3.1847, + "step": 88000 + }, + { + "epoch": 25.626419709942336, + "eval_accuracy": 0.37498650969080327, + "eval_loss": 3.533761501312256, + "eval_runtime": 81.8224, + "eval_samples_per_second": 203.441, + "eval_steps_per_second": 12.723, + "step": 88000 + }, + { + "epoch": 25.64098083755606, + "grad_norm": 0.422968327999115, + "learning_rate": 0.00029248601398601396, + "loss": 3.1973, + "step": 88050 + }, + { + "epoch": 25.655541965169782, + "grad_norm": 0.4194158911705017, + "learning_rate": 0.00029231118881118876, + "loss": 3.2006, + "step": 88100 + }, + { + "epoch": 25.670103092783506, + "grad_norm": 0.42744433879852295, + "learning_rate": 0.0002921363636363636, + "loss": 3.1893, + "step": 88150 + }, + { + "epoch": 25.684664220397227, + "grad_norm": 0.4123415946960449, + "learning_rate": 0.0002919615384615384, + "loss": 3.2009, + "step": 88200 + }, + { + "epoch": 25.69922534801095, + "grad_norm": 0.4174706041812897, + "learning_rate": 0.00029178671328671326, + "loss": 3.1865, + "step": 88250 + }, + { + "epoch": 25.713786475624673, + "grad_norm": 0.4339861571788788, + "learning_rate": 0.00029161188811188806, + "loss": 3.1874, + "step": 88300 + }, + { + "epoch": 25.728347603238394, + "grad_norm": 0.4318697154521942, + "learning_rate": 0.0002914370629370629, + "loss": 3.1928, + "step": 88350 + }, + { + "epoch": 25.74290873085212, + "grad_norm": 0.4214717149734497, + "learning_rate": 0.00029126223776223777, + "loss": 3.1945, + "step": 88400 + }, + { + "epoch": 25.75746985846584, + "grad_norm": 0.45795249938964844, + "learning_rate": 0.00029108741258741257, + "loss": 3.1971, + "step": 88450 + }, + { + "epoch": 25.772030986079564, + "grad_norm": 0.4096623361110687, + "learning_rate": 0.00029091258741258737, + "loss": 3.2021, + "step": 88500 + }, + { + "epoch": 25.786592113693285, + "grad_norm": 0.4361957311630249, + "learning_rate": 0.0002907377622377622, + "loss": 3.2035, + "step": 88550 + }, + { + "epoch": 25.801153241307006, + "grad_norm": 0.44716793298721313, + "learning_rate": 0.000290562937062937, + "loss": 3.1993, + "step": 88600 + }, + { + "epoch": 25.81571436892073, + "grad_norm": 0.4102550148963928, + "learning_rate": 0.0002903881118881119, + "loss": 3.1993, + "step": 88650 + }, + { + "epoch": 25.83027549653445, + "grad_norm": 0.4365271329879761, + "learning_rate": 0.0002902132867132867, + "loss": 3.1944, + "step": 88700 + }, + { + "epoch": 25.844836624148176, + "grad_norm": 0.44143885374069214, + "learning_rate": 0.00029003846153846153, + "loss": 3.2149, + "step": 88750 + }, + { + "epoch": 25.859397751761897, + "grad_norm": 0.4330017864704132, + "learning_rate": 0.00028986363636363633, + "loss": 3.2087, + "step": 88800 + }, + { + "epoch": 25.873958879375618, + "grad_norm": 0.41513675451278687, + "learning_rate": 0.00028968881118881113, + "loss": 3.1979, + "step": 88850 + }, + { + "epoch": 25.888520006989342, + "grad_norm": 0.45582523941993713, + "learning_rate": 0.000289513986013986, + "loss": 3.1936, + "step": 88900 + }, + { + "epoch": 25.903081134603063, + "grad_norm": 0.43732136487960815, + "learning_rate": 0.0002893391608391608, + "loss": 3.2025, + "step": 88950 + }, + { + "epoch": 25.917642262216788, + "grad_norm": 0.4278513491153717, + "learning_rate": 0.00028916433566433564, + "loss": 3.2055, + "step": 89000 + }, + { + "epoch": 25.917642262216788, + "eval_accuracy": 0.37510172104603445, + "eval_loss": 3.530043840408325, + "eval_runtime": 81.6291, + "eval_samples_per_second": 203.922, + "eval_steps_per_second": 12.753, + "step": 89000 + }, + { + "epoch": 25.93220338983051, + "grad_norm": 0.42948028445243835, + "learning_rate": 0.0002889895104895105, + "loss": 3.201, + "step": 89050 + }, + { + "epoch": 25.94676451744423, + "grad_norm": 0.415725976228714, + "learning_rate": 0.0002888146853146853, + "loss": 3.1962, + "step": 89100 + }, + { + "epoch": 25.961325645057954, + "grad_norm": 0.4632090628147125, + "learning_rate": 0.00028863986013986015, + "loss": 3.1898, + "step": 89150 + }, + { + "epoch": 25.975886772671675, + "grad_norm": 0.42952993512153625, + "learning_rate": 0.00028846503496503495, + "loss": 3.1991, + "step": 89200 + }, + { + "epoch": 25.9904479002854, + "grad_norm": 0.440868616104126, + "learning_rate": 0.00028829020979020975, + "loss": 3.1998, + "step": 89250 + }, + { + "epoch": 26.004950783388665, + "grad_norm": 0.40771979093551636, + "learning_rate": 0.0002881153846153846, + "loss": 3.1739, + "step": 89300 + }, + { + "epoch": 26.01951191100239, + "grad_norm": 0.4043092727661133, + "learning_rate": 0.0002879405594405594, + "loss": 3.1053, + "step": 89350 + }, + { + "epoch": 26.03407303861611, + "grad_norm": 0.4216521680355072, + "learning_rate": 0.00028776573426573425, + "loss": 3.1138, + "step": 89400 + }, + { + "epoch": 26.04863416622983, + "grad_norm": 0.45783573389053345, + "learning_rate": 0.00028759090909090905, + "loss": 3.1126, + "step": 89450 + }, + { + "epoch": 26.063195293843556, + "grad_norm": 0.4315570890903473, + "learning_rate": 0.0002874160839160839, + "loss": 3.1133, + "step": 89500 + }, + { + "epoch": 26.077756421457277, + "grad_norm": 0.45492488145828247, + "learning_rate": 0.0002872412587412587, + "loss": 3.1265, + "step": 89550 + }, + { + "epoch": 26.092317549071, + "grad_norm": 0.44426026940345764, + "learning_rate": 0.0002870664335664335, + "loss": 3.127, + "step": 89600 + }, + { + "epoch": 26.106878676684723, + "grad_norm": 0.4351698160171509, + "learning_rate": 0.00028689160839160836, + "loss": 3.1366, + "step": 89650 + }, + { + "epoch": 26.121439804298443, + "grad_norm": 0.4132281243801117, + "learning_rate": 0.0002867167832167832, + "loss": 3.1335, + "step": 89700 + }, + { + "epoch": 26.136000931912168, + "grad_norm": 0.42897453904151917, + "learning_rate": 0.000286541958041958, + "loss": 3.1487, + "step": 89750 + }, + { + "epoch": 26.15056205952589, + "grad_norm": 0.42802971601486206, + "learning_rate": 0.00028636713286713287, + "loss": 3.1416, + "step": 89800 + }, + { + "epoch": 26.165123187139613, + "grad_norm": 0.4399700462818146, + "learning_rate": 0.00028619230769230767, + "loss": 3.1312, + "step": 89850 + }, + { + "epoch": 26.179684314753334, + "grad_norm": 0.444806843996048, + "learning_rate": 0.0002860174825174825, + "loss": 3.1316, + "step": 89900 + }, + { + "epoch": 26.194245442367055, + "grad_norm": 0.4164326786994934, + "learning_rate": 0.0002858426573426573, + "loss": 3.1531, + "step": 89950 + }, + { + "epoch": 26.20880656998078, + "grad_norm": 0.47919753193855286, + "learning_rate": 0.0002856678321678321, + "loss": 3.1387, + "step": 90000 + }, + { + "epoch": 26.20880656998078, + "eval_accuracy": 0.3742905390551211, + "eval_loss": 3.5481929779052734, + "eval_runtime": 81.7603, + "eval_samples_per_second": 203.595, + "eval_steps_per_second": 12.732, + "step": 90000 + }, + { + "epoch": 26.2233676975945, + "grad_norm": 0.41104620695114136, + "learning_rate": 0.000285493006993007, + "loss": 3.1324, + "step": 90050 + }, + { + "epoch": 26.237928825208225, + "grad_norm": 0.42326322197914124, + "learning_rate": 0.0002853181818181818, + "loss": 3.1438, + "step": 90100 + }, + { + "epoch": 26.252489952821946, + "grad_norm": 0.44323936104774475, + "learning_rate": 0.00028514335664335663, + "loss": 3.1566, + "step": 90150 + }, + { + "epoch": 26.267051080435667, + "grad_norm": 0.44852617383003235, + "learning_rate": 0.00028496853146853143, + "loss": 3.1535, + "step": 90200 + }, + { + "epoch": 26.281612208049392, + "grad_norm": 0.4421542286872864, + "learning_rate": 0.0002847937062937063, + "loss": 3.1578, + "step": 90250 + }, + { + "epoch": 26.296173335663113, + "grad_norm": 0.45970240235328674, + "learning_rate": 0.0002846188811188811, + "loss": 3.1538, + "step": 90300 + }, + { + "epoch": 26.310734463276837, + "grad_norm": 0.48239070177078247, + "learning_rate": 0.0002844440559440559, + "loss": 3.1532, + "step": 90350 + }, + { + "epoch": 26.32529559089056, + "grad_norm": 0.42468351125717163, + "learning_rate": 0.00028426923076923074, + "loss": 3.1495, + "step": 90400 + }, + { + "epoch": 26.33985671850428, + "grad_norm": 0.4387561082839966, + "learning_rate": 0.0002840944055944056, + "loss": 3.1623, + "step": 90450 + }, + { + "epoch": 26.354417846118004, + "grad_norm": 0.43278950452804565, + "learning_rate": 0.0002839195804195804, + "loss": 3.1566, + "step": 90500 + }, + { + "epoch": 26.368978973731725, + "grad_norm": 0.4446139931678772, + "learning_rate": 0.00028374475524475524, + "loss": 3.168, + "step": 90550 + }, + { + "epoch": 26.38354010134545, + "grad_norm": 0.4542044401168823, + "learning_rate": 0.00028356993006993004, + "loss": 3.1542, + "step": 90600 + }, + { + "epoch": 26.39810122895917, + "grad_norm": 0.4319124221801758, + "learning_rate": 0.0002833951048951049, + "loss": 3.1641, + "step": 90650 + }, + { + "epoch": 26.41266235657289, + "grad_norm": 0.40921565890312195, + "learning_rate": 0.0002832202797202797, + "loss": 3.1606, + "step": 90700 + }, + { + "epoch": 26.427223484186616, + "grad_norm": 0.4638734757900238, + "learning_rate": 0.0002830454545454545, + "loss": 3.1725, + "step": 90750 + }, + { + "epoch": 26.441784611800337, + "grad_norm": 0.43601810932159424, + "learning_rate": 0.00028287062937062935, + "loss": 3.159, + "step": 90800 + }, + { + "epoch": 26.45634573941406, + "grad_norm": 0.4467451870441437, + "learning_rate": 0.00028269580419580415, + "loss": 3.1726, + "step": 90850 + }, + { + "epoch": 26.470906867027782, + "grad_norm": 0.4496610760688782, + "learning_rate": 0.000282520979020979, + "loss": 3.1601, + "step": 90900 + }, + { + "epoch": 26.485467994641503, + "grad_norm": 0.43101951479911804, + "learning_rate": 0.0002823461538461538, + "loss": 3.1734, + "step": 90950 + }, + { + "epoch": 26.500029122255228, + "grad_norm": 0.4648696482181549, + "learning_rate": 0.00028217132867132866, + "loss": 3.1767, + "step": 91000 + }, + { + "epoch": 26.500029122255228, + "eval_accuracy": 0.3745937330195509, + "eval_loss": 3.539630889892578, + "eval_runtime": 81.2454, + "eval_samples_per_second": 204.885, + "eval_steps_per_second": 12.813, + "step": 91000 + }, + { + "epoch": 26.51459024986895, + "grad_norm": 0.4609490931034088, + "learning_rate": 0.00028199650349650346, + "loss": 3.1757, + "step": 91050 + }, + { + "epoch": 26.529151377482673, + "grad_norm": 0.4083974361419678, + "learning_rate": 0.0002818216783216783, + "loss": 3.1822, + "step": 91100 + }, + { + "epoch": 26.543712505096394, + "grad_norm": 0.4239809215068817, + "learning_rate": 0.0002816468531468531, + "loss": 3.1685, + "step": 91150 + }, + { + "epoch": 26.55827363271012, + "grad_norm": 0.4246228337287903, + "learning_rate": 0.00028147202797202796, + "loss": 3.177, + "step": 91200 + }, + { + "epoch": 26.57283476032384, + "grad_norm": 0.4565730690956116, + "learning_rate": 0.00028129720279720276, + "loss": 3.191, + "step": 91250 + }, + { + "epoch": 26.58739588793756, + "grad_norm": 0.4267881512641907, + "learning_rate": 0.0002811223776223776, + "loss": 3.1774, + "step": 91300 + }, + { + "epoch": 26.601957015551285, + "grad_norm": 0.418037474155426, + "learning_rate": 0.0002809475524475524, + "loss": 3.1792, + "step": 91350 + }, + { + "epoch": 26.616518143165006, + "grad_norm": 0.4386206567287445, + "learning_rate": 0.00028077272727272727, + "loss": 3.1748, + "step": 91400 + }, + { + "epoch": 26.63107927077873, + "grad_norm": 0.47127339243888855, + "learning_rate": 0.00028059790209790207, + "loss": 3.1774, + "step": 91450 + }, + { + "epoch": 26.64564039839245, + "grad_norm": 0.43801349401474, + "learning_rate": 0.00028042307692307687, + "loss": 3.1796, + "step": 91500 + }, + { + "epoch": 26.660201526006173, + "grad_norm": 0.4200217127799988, + "learning_rate": 0.0002802482517482517, + "loss": 3.1718, + "step": 91550 + }, + { + "epoch": 26.674762653619897, + "grad_norm": 0.4898620545864105, + "learning_rate": 0.0002800734265734265, + "loss": 3.1915, + "step": 91600 + }, + { + "epoch": 26.689323781233618, + "grad_norm": 0.4548785090446472, + "learning_rate": 0.0002798986013986014, + "loss": 3.1842, + "step": 91650 + }, + { + "epoch": 26.703884908847343, + "grad_norm": 0.42794618010520935, + "learning_rate": 0.0002797237762237762, + "loss": 3.1773, + "step": 91700 + }, + { + "epoch": 26.718446036461064, + "grad_norm": 0.4374929368495941, + "learning_rate": 0.00027954895104895103, + "loss": 3.1894, + "step": 91750 + }, + { + "epoch": 26.733007164074785, + "grad_norm": 0.46282780170440674, + "learning_rate": 0.0002793741258741259, + "loss": 3.1916, + "step": 91800 + }, + { + "epoch": 26.74756829168851, + "grad_norm": 0.413688987493515, + "learning_rate": 0.0002791993006993007, + "loss": 3.195, + "step": 91850 + }, + { + "epoch": 26.76212941930223, + "grad_norm": 0.43770772218704224, + "learning_rate": 0.0002790244755244755, + "loss": 3.183, + "step": 91900 + }, + { + "epoch": 26.776690546915955, + "grad_norm": 0.43841320276260376, + "learning_rate": 0.00027884965034965034, + "loss": 3.1939, + "step": 91950 + }, + { + "epoch": 26.791251674529676, + "grad_norm": 0.4358888566493988, + "learning_rate": 0.00027867482517482514, + "loss": 3.1893, + "step": 92000 + }, + { + "epoch": 26.791251674529676, + "eval_accuracy": 0.37525760906341865, + "eval_loss": 3.532517194747925, + "eval_runtime": 80.5056, + "eval_samples_per_second": 206.768, + "eval_steps_per_second": 12.931, + "step": 92000 + }, + { + "epoch": 26.805812802143397, + "grad_norm": 0.43341144919395447, + "learning_rate": 0.0002785, + "loss": 3.1936, + "step": 92050 + }, + { + "epoch": 26.82037392975712, + "grad_norm": 0.41889843344688416, + "learning_rate": 0.0002783251748251748, + "loss": 3.1841, + "step": 92100 + }, + { + "epoch": 26.834935057370842, + "grad_norm": 0.39708298444747925, + "learning_rate": 0.00027815034965034965, + "loss": 3.1769, + "step": 92150 + }, + { + "epoch": 26.849496184984567, + "grad_norm": 0.41780245304107666, + "learning_rate": 0.00027797552447552445, + "loss": 3.1899, + "step": 92200 + }, + { + "epoch": 26.864057312598288, + "grad_norm": 0.42979490756988525, + "learning_rate": 0.00027780069930069925, + "loss": 3.1975, + "step": 92250 + }, + { + "epoch": 26.87861844021201, + "grad_norm": 0.44316792488098145, + "learning_rate": 0.0002776258741258741, + "loss": 3.1838, + "step": 92300 + }, + { + "epoch": 26.893179567825733, + "grad_norm": 0.4461120367050171, + "learning_rate": 0.0002774510489510489, + "loss": 3.1812, + "step": 92350 + }, + { + "epoch": 26.907740695439454, + "grad_norm": 0.4495375156402588, + "learning_rate": 0.00027727622377622375, + "loss": 3.1994, + "step": 92400 + }, + { + "epoch": 26.92230182305318, + "grad_norm": 0.43633168935775757, + "learning_rate": 0.00027710139860139855, + "loss": 3.1962, + "step": 92450 + }, + { + "epoch": 26.9368629506669, + "grad_norm": 0.4445933401584625, + "learning_rate": 0.0002769265734265734, + "loss": 3.1962, + "step": 92500 + }, + { + "epoch": 26.95142407828062, + "grad_norm": 0.4221359193325043, + "learning_rate": 0.00027675174825174826, + "loss": 3.1908, + "step": 92550 + }, + { + "epoch": 26.965985205894345, + "grad_norm": 0.4284641146659851, + "learning_rate": 0.00027657692307692306, + "loss": 3.2026, + "step": 92600 + }, + { + "epoch": 26.980546333508066, + "grad_norm": 0.428451806306839, + "learning_rate": 0.00027640209790209786, + "loss": 3.2005, + "step": 92650 + }, + { + "epoch": 26.99510746112179, + "grad_norm": 0.4614730477333069, + "learning_rate": 0.0002762272727272727, + "loss": 3.1913, + "step": 92700 + }, + { + "epoch": 27.009610344225056, + "grad_norm": 0.43082916736602783, + "learning_rate": 0.0002760524475524475, + "loss": 3.141, + "step": 92750 + }, + { + "epoch": 27.02417147183878, + "grad_norm": 0.4590015113353729, + "learning_rate": 0.00027587762237762237, + "loss": 3.0963, + "step": 92800 + }, + { + "epoch": 27.0387325994525, + "grad_norm": 0.4303778111934662, + "learning_rate": 0.00027570279720279717, + "loss": 3.1013, + "step": 92850 + }, + { + "epoch": 27.053293727066222, + "grad_norm": 0.47176748514175415, + "learning_rate": 0.000275527972027972, + "loss": 3.1143, + "step": 92900 + }, + { + "epoch": 27.067854854679947, + "grad_norm": 0.4238791763782501, + "learning_rate": 0.0002753531468531468, + "loss": 3.1279, + "step": 92950 + }, + { + "epoch": 27.082415982293668, + "grad_norm": 0.46568360924720764, + "learning_rate": 0.0002751783216783216, + "loss": 3.1194, + "step": 93000 + }, + { + "epoch": 27.082415982293668, + "eval_accuracy": 0.3744267941170731, + "eval_loss": 3.5482656955718994, + "eval_runtime": 80.1211, + "eval_samples_per_second": 207.761, + "eval_steps_per_second": 12.993, + "step": 93000 + }, + { + "epoch": 27.096977109907392, + "grad_norm": 0.4266977608203888, + "learning_rate": 0.0002750034965034965, + "loss": 3.1087, + "step": 93050 + }, + { + "epoch": 27.111538237521113, + "grad_norm": 0.4253590404987335, + "learning_rate": 0.0002748286713286713, + "loss": 3.1223, + "step": 93100 + }, + { + "epoch": 27.126099365134834, + "grad_norm": 0.47660887241363525, + "learning_rate": 0.00027465384615384613, + "loss": 3.1237, + "step": 93150 + }, + { + "epoch": 27.14066049274856, + "grad_norm": 0.4275610148906708, + "learning_rate": 0.000274479020979021, + "loss": 3.1237, + "step": 93200 + }, + { + "epoch": 27.15522162036228, + "grad_norm": 0.4558410346508026, + "learning_rate": 0.0002743041958041958, + "loss": 3.1228, + "step": 93250 + }, + { + "epoch": 27.169782747976004, + "grad_norm": 0.4413674473762512, + "learning_rate": 0.00027412937062937064, + "loss": 3.1332, + "step": 93300 + }, + { + "epoch": 27.184343875589725, + "grad_norm": 0.43724343180656433, + "learning_rate": 0.00027395454545454544, + "loss": 3.1409, + "step": 93350 + }, + { + "epoch": 27.19890500320345, + "grad_norm": 0.4715915024280548, + "learning_rate": 0.00027377972027972024, + "loss": 3.1249, + "step": 93400 + }, + { + "epoch": 27.21346613081717, + "grad_norm": 0.43383172154426575, + "learning_rate": 0.0002736048951048951, + "loss": 3.1361, + "step": 93450 + }, + { + "epoch": 27.228027258430892, + "grad_norm": 0.4315720200538635, + "learning_rate": 0.0002734300699300699, + "loss": 3.1299, + "step": 93500 + }, + { + "epoch": 27.242588386044616, + "grad_norm": 0.47162047028541565, + "learning_rate": 0.00027325524475524474, + "loss": 3.1471, + "step": 93550 + }, + { + "epoch": 27.257149513658337, + "grad_norm": 0.44325777888298035, + "learning_rate": 0.00027308041958041954, + "loss": 3.1333, + "step": 93600 + }, + { + "epoch": 27.271710641272062, + "grad_norm": 0.43961724638938904, + "learning_rate": 0.0002729055944055944, + "loss": 3.1455, + "step": 93650 + }, + { + "epoch": 27.286271768885783, + "grad_norm": 0.4498037099838257, + "learning_rate": 0.0002727307692307692, + "loss": 3.1441, + "step": 93700 + }, + { + "epoch": 27.300832896499504, + "grad_norm": 0.43977901339530945, + "learning_rate": 0.000272555944055944, + "loss": 3.1543, + "step": 93750 + }, + { + "epoch": 27.31539402411323, + "grad_norm": 0.44024720788002014, + "learning_rate": 0.00027238111888111885, + "loss": 3.1355, + "step": 93800 + }, + { + "epoch": 27.32995515172695, + "grad_norm": 0.4535662531852722, + "learning_rate": 0.0002722062937062937, + "loss": 3.1427, + "step": 93850 + }, + { + "epoch": 27.344516279340674, + "grad_norm": 0.46223586797714233, + "learning_rate": 0.0002720314685314685, + "loss": 3.1344, + "step": 93900 + }, + { + "epoch": 27.359077406954395, + "grad_norm": 0.44665613770484924, + "learning_rate": 0.00027185664335664336, + "loss": 3.1499, + "step": 93950 + }, + { + "epoch": 27.373638534568116, + "grad_norm": 0.41817039251327515, + "learning_rate": 0.00027168181818181816, + "loss": 3.146, + "step": 94000 + }, + { + "epoch": 27.373638534568116, + "eval_accuracy": 0.3747087092495673, + "eval_loss": 3.545140027999878, + "eval_runtime": 80.3242, + "eval_samples_per_second": 207.235, + "eval_steps_per_second": 12.96, + "step": 94000 + }, + { + "epoch": 27.38819966218184, + "grad_norm": 0.4406947195529938, + "learning_rate": 0.000271506993006993, + "loss": 3.1496, + "step": 94050 + }, + { + "epoch": 27.40276078979556, + "grad_norm": 0.44461363554000854, + "learning_rate": 0.0002713321678321678, + "loss": 3.1488, + "step": 94100 + }, + { + "epoch": 27.417321917409286, + "grad_norm": 0.437137246131897, + "learning_rate": 0.0002711573426573426, + "loss": 3.1507, + "step": 94150 + }, + { + "epoch": 27.431883045023007, + "grad_norm": 0.43815845251083374, + "learning_rate": 0.00027098251748251746, + "loss": 3.1596, + "step": 94200 + }, + { + "epoch": 27.446444172636728, + "grad_norm": 0.44346702098846436, + "learning_rate": 0.00027080769230769226, + "loss": 3.1428, + "step": 94250 + }, + { + "epoch": 27.461005300250452, + "grad_norm": 0.45342469215393066, + "learning_rate": 0.0002706328671328671, + "loss": 3.1459, + "step": 94300 + }, + { + "epoch": 27.475566427864173, + "grad_norm": 0.431231826543808, + "learning_rate": 0.0002704580419580419, + "loss": 3.1567, + "step": 94350 + }, + { + "epoch": 27.490127555477898, + "grad_norm": 0.48505812883377075, + "learning_rate": 0.00027028321678321677, + "loss": 3.1563, + "step": 94400 + }, + { + "epoch": 27.50468868309162, + "grad_norm": 0.4472305178642273, + "learning_rate": 0.00027010839160839157, + "loss": 3.1576, + "step": 94450 + }, + { + "epoch": 27.51924981070534, + "grad_norm": 0.43050718307495117, + "learning_rate": 0.00026993356643356637, + "loss": 3.1602, + "step": 94500 + }, + { + "epoch": 27.533810938319064, + "grad_norm": 0.4264449179172516, + "learning_rate": 0.0002697587412587412, + "loss": 3.1585, + "step": 94550 + }, + { + "epoch": 27.548372065932785, + "grad_norm": 0.4252346158027649, + "learning_rate": 0.0002695839160839161, + "loss": 3.1651, + "step": 94600 + }, + { + "epoch": 27.56293319354651, + "grad_norm": 0.4452660381793976, + "learning_rate": 0.0002694090909090909, + "loss": 3.1683, + "step": 94650 + }, + { + "epoch": 27.57749432116023, + "grad_norm": 0.4676416516304016, + "learning_rate": 0.00026923426573426573, + "loss": 3.1722, + "step": 94700 + }, + { + "epoch": 27.59205544877395, + "grad_norm": 0.4459725022315979, + "learning_rate": 0.00026905944055944053, + "loss": 3.1786, + "step": 94750 + }, + { + "epoch": 27.606616576387676, + "grad_norm": 0.43419715762138367, + "learning_rate": 0.0002688846153846154, + "loss": 3.1628, + "step": 94800 + }, + { + "epoch": 27.621177704001397, + "grad_norm": 0.4442163109779358, + "learning_rate": 0.0002687097902097902, + "loss": 3.1747, + "step": 94850 + }, + { + "epoch": 27.63573883161512, + "grad_norm": 0.4572891592979431, + "learning_rate": 0.000268534965034965, + "loss": 3.1667, + "step": 94900 + }, + { + "epoch": 27.650299959228843, + "grad_norm": 0.4328926205635071, + "learning_rate": 0.00026836013986013984, + "loss": 3.1682, + "step": 94950 + }, + { + "epoch": 27.664861086842564, + "grad_norm": 0.4510846734046936, + "learning_rate": 0.00026818531468531464, + "loss": 3.1737, + "step": 95000 + }, + { + "epoch": 27.664861086842564, + "eval_accuracy": 0.3751563876584656, + "eval_loss": 3.538522720336914, + "eval_runtime": 80.1488, + "eval_samples_per_second": 207.689, + "eval_steps_per_second": 12.988, + "step": 95000 + }, + { + "epoch": 27.679422214456288, + "grad_norm": 0.46054887771606445, + "learning_rate": 0.0002680104895104895, + "loss": 3.1805, + "step": 95050 + }, + { + "epoch": 27.69398334207001, + "grad_norm": 0.44147053360939026, + "learning_rate": 0.0002678356643356643, + "loss": 3.1851, + "step": 95100 + }, + { + "epoch": 27.708544469683734, + "grad_norm": 0.424288272857666, + "learning_rate": 0.00026766083916083915, + "loss": 3.1731, + "step": 95150 + }, + { + "epoch": 27.723105597297454, + "grad_norm": 0.4261254072189331, + "learning_rate": 0.00026748601398601395, + "loss": 3.1827, + "step": 95200 + }, + { + "epoch": 27.737666724911175, + "grad_norm": 0.4687895178794861, + "learning_rate": 0.0002673111888111888, + "loss": 3.1915, + "step": 95250 + }, + { + "epoch": 27.7522278525249, + "grad_norm": 0.4428876042366028, + "learning_rate": 0.0002671363636363636, + "loss": 3.1873, + "step": 95300 + }, + { + "epoch": 27.76678898013862, + "grad_norm": 0.4460248649120331, + "learning_rate": 0.00026696153846153845, + "loss": 3.1759, + "step": 95350 + }, + { + "epoch": 27.781350107752345, + "grad_norm": 0.45214036107063293, + "learning_rate": 0.00026678671328671325, + "loss": 3.183, + "step": 95400 + }, + { + "epoch": 27.795911235366066, + "grad_norm": 0.4358888864517212, + "learning_rate": 0.0002666118881118881, + "loss": 3.1758, + "step": 95450 + }, + { + "epoch": 27.810472362979787, + "grad_norm": 0.4265310764312744, + "learning_rate": 0.0002664370629370629, + "loss": 3.1986, + "step": 95500 + }, + { + "epoch": 27.825033490593512, + "grad_norm": 0.4481215178966522, + "learning_rate": 0.00026626223776223776, + "loss": 3.172, + "step": 95550 + }, + { + "epoch": 27.839594618207233, + "grad_norm": 0.42975902557373047, + "learning_rate": 0.00026608741258741256, + "loss": 3.1959, + "step": 95600 + }, + { + "epoch": 27.854155745820957, + "grad_norm": 0.4381426274776459, + "learning_rate": 0.00026591258741258736, + "loss": 3.1879, + "step": 95650 + }, + { + "epoch": 27.86871687343468, + "grad_norm": 0.42458629608154297, + "learning_rate": 0.0002657377622377622, + "loss": 3.1921, + "step": 95700 + }, + { + "epoch": 27.883278001048403, + "grad_norm": 0.4169227182865143, + "learning_rate": 0.000265562937062937, + "loss": 3.1862, + "step": 95750 + }, + { + "epoch": 27.897839128662124, + "grad_norm": 0.45456182956695557, + "learning_rate": 0.00026538811188811187, + "loss": 3.184, + "step": 95800 + }, + { + "epoch": 27.912400256275845, + "grad_norm": 0.4321167469024658, + "learning_rate": 0.00026521328671328667, + "loss": 3.184, + "step": 95850 + }, + { + "epoch": 27.92696138388957, + "grad_norm": 0.48365333676338196, + "learning_rate": 0.0002650384615384615, + "loss": 3.1859, + "step": 95900 + }, + { + "epoch": 27.94152251150329, + "grad_norm": 0.47256672382354736, + "learning_rate": 0.0002648636363636364, + "loss": 3.1861, + "step": 95950 + }, + { + "epoch": 27.956083639117015, + "grad_norm": 0.46820148825645447, + "learning_rate": 0.0002646888111888112, + "loss": 3.1959, + "step": 96000 + }, + { + "epoch": 27.956083639117015, + "eval_accuracy": 0.3755887829284046, + "eval_loss": 3.529139995574951, + "eval_runtime": 80.2502, + "eval_samples_per_second": 207.426, + "eval_steps_per_second": 12.972, + "step": 96000 + }, + { + "epoch": 27.970644766730736, + "grad_norm": 0.42508140206336975, + "learning_rate": 0.000264513986013986, + "loss": 3.1972, + "step": 96050 + }, + { + "epoch": 27.985205894344457, + "grad_norm": 0.41875147819519043, + "learning_rate": 0.00026433916083916083, + "loss": 3.191, + "step": 96100 + }, + { + "epoch": 27.99976702195818, + "grad_norm": 0.4364998936653137, + "learning_rate": 0.00026416433566433563, + "loss": 3.2029, + "step": 96150 + }, + { + "epoch": 28.014269905061447, + "grad_norm": 0.4501911699771881, + "learning_rate": 0.0002639895104895105, + "loss": 3.0886, + "step": 96200 + }, + { + "epoch": 28.02883103267517, + "grad_norm": 0.4514945149421692, + "learning_rate": 0.0002638146853146853, + "loss": 3.0965, + "step": 96250 + }, + { + "epoch": 28.043392160288892, + "grad_norm": 0.4345646798610687, + "learning_rate": 0.00026363986013986014, + "loss": 3.0975, + "step": 96300 + }, + { + "epoch": 28.057953287902617, + "grad_norm": 0.4820605516433716, + "learning_rate": 0.00026346503496503494, + "loss": 3.1039, + "step": 96350 + }, + { + "epoch": 28.072514415516338, + "grad_norm": 0.4498361647129059, + "learning_rate": 0.00026329020979020974, + "loss": 3.1032, + "step": 96400 + }, + { + "epoch": 28.08707554313006, + "grad_norm": 0.4363076984882355, + "learning_rate": 0.0002631153846153846, + "loss": 3.1, + "step": 96450 + }, + { + "epoch": 28.101636670743783, + "grad_norm": 0.4439297020435333, + "learning_rate": 0.0002629405594405594, + "loss": 3.1075, + "step": 96500 + }, + { + "epoch": 28.116197798357504, + "grad_norm": 0.4495980739593506, + "learning_rate": 0.00026276573426573424, + "loss": 3.1011, + "step": 96550 + }, + { + "epoch": 28.13075892597123, + "grad_norm": 0.4267682731151581, + "learning_rate": 0.00026259090909090904, + "loss": 3.1196, + "step": 96600 + }, + { + "epoch": 28.14532005358495, + "grad_norm": 0.45746278762817383, + "learning_rate": 0.0002624160839160839, + "loss": 3.1064, + "step": 96650 + }, + { + "epoch": 28.15988118119867, + "grad_norm": 0.4494858682155609, + "learning_rate": 0.00026224125874125875, + "loss": 3.1358, + "step": 96700 + }, + { + "epoch": 28.174442308812395, + "grad_norm": 0.4345605671405792, + "learning_rate": 0.00026206643356643355, + "loss": 3.1165, + "step": 96750 + }, + { + "epoch": 28.189003436426116, + "grad_norm": 0.4938182532787323, + "learning_rate": 0.00026189160839160835, + "loss": 3.1316, + "step": 96800 + }, + { + "epoch": 28.20356456403984, + "grad_norm": 0.4320889711380005, + "learning_rate": 0.0002617167832167832, + "loss": 3.1396, + "step": 96850 + }, + { + "epoch": 28.21812569165356, + "grad_norm": 0.4658863842487335, + "learning_rate": 0.000261541958041958, + "loss": 3.1285, + "step": 96900 + }, + { + "epoch": 28.232686819267283, + "grad_norm": 0.4570460319519043, + "learning_rate": 0.00026136713286713286, + "loss": 3.1254, + "step": 96950 + }, + { + "epoch": 28.247247946881007, + "grad_norm": 0.428600013256073, + "learning_rate": 0.00026119230769230766, + "loss": 3.1176, + "step": 97000 + }, + { + "epoch": 28.247247946881007, + "eval_accuracy": 0.37457974306927283, + "eval_loss": 3.549133539199829, + "eval_runtime": 80.2696, + "eval_samples_per_second": 207.376, + "eval_steps_per_second": 12.969, + "step": 97000 + }, + { + "epoch": 28.261809074494728, + "grad_norm": 0.4301176965236664, + "learning_rate": 0.0002610174825174825, + "loss": 3.1302, + "step": 97050 + }, + { + "epoch": 28.276370202108453, + "grad_norm": 0.4620283544063568, + "learning_rate": 0.0002608426573426573, + "loss": 3.1245, + "step": 97100 + }, + { + "epoch": 28.290931329722174, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.0002606678321678321, + "loss": 3.1322, + "step": 97150 + }, + { + "epoch": 28.305492457335895, + "grad_norm": 0.4671400189399719, + "learning_rate": 0.00026049300699300696, + "loss": 3.1412, + "step": 97200 + }, + { + "epoch": 28.32005358494962, + "grad_norm": 0.47812679409980774, + "learning_rate": 0.00026031818181818176, + "loss": 3.1458, + "step": 97250 + }, + { + "epoch": 28.33461471256334, + "grad_norm": 0.4420274794101715, + "learning_rate": 0.0002601433566433566, + "loss": 3.1302, + "step": 97300 + }, + { + "epoch": 28.349175840177065, + "grad_norm": 0.44124430418014526, + "learning_rate": 0.00025996853146853147, + "loss": 3.1475, + "step": 97350 + }, + { + "epoch": 28.363736967790786, + "grad_norm": 0.44124090671539307, + "learning_rate": 0.00025979370629370627, + "loss": 3.14, + "step": 97400 + }, + { + "epoch": 28.378298095404507, + "grad_norm": 0.4414704740047455, + "learning_rate": 0.0002596188811188811, + "loss": 3.1497, + "step": 97450 + }, + { + "epoch": 28.39285922301823, + "grad_norm": 0.46400997042655945, + "learning_rate": 0.0002594440559440559, + "loss": 3.1499, + "step": 97500 + }, + { + "epoch": 28.407420350631952, + "grad_norm": 0.4541986286640167, + "learning_rate": 0.0002592692307692307, + "loss": 3.1394, + "step": 97550 + }, + { + "epoch": 28.421981478245677, + "grad_norm": 0.441388338804245, + "learning_rate": 0.0002590944055944056, + "loss": 3.146, + "step": 97600 + }, + { + "epoch": 28.436542605859398, + "grad_norm": 0.44498082995414734, + "learning_rate": 0.0002589195804195804, + "loss": 3.1438, + "step": 97650 + }, + { + "epoch": 28.45110373347312, + "grad_norm": 0.4672899842262268, + "learning_rate": 0.00025874475524475523, + "loss": 3.1599, + "step": 97700 + }, + { + "epoch": 28.465664861086843, + "grad_norm": 0.4834342300891876, + "learning_rate": 0.00025856993006993003, + "loss": 3.1441, + "step": 97750 + }, + { + "epoch": 28.480225988700564, + "grad_norm": 0.47349074482917786, + "learning_rate": 0.0002583951048951049, + "loss": 3.1528, + "step": 97800 + }, + { + "epoch": 28.49478711631429, + "grad_norm": 0.4718509614467621, + "learning_rate": 0.0002582202797202797, + "loss": 3.1566, + "step": 97850 + }, + { + "epoch": 28.50934824392801, + "grad_norm": 0.4297626316547394, + "learning_rate": 0.0002580454545454545, + "loss": 3.1597, + "step": 97900 + }, + { + "epoch": 28.523909371541734, + "grad_norm": 0.44546571373939514, + "learning_rate": 0.00025787062937062934, + "loss": 3.1509, + "step": 97950 + }, + { + "epoch": 28.538470499155455, + "grad_norm": 0.4164583683013916, + "learning_rate": 0.0002576958041958042, + "loss": 3.1518, + "step": 98000 + }, + { + "epoch": 28.538470499155455, + "eval_accuracy": 0.3749666416101563, + "eval_loss": 3.5419790744781494, + "eval_runtime": 80.1957, + "eval_samples_per_second": 207.567, + "eval_steps_per_second": 12.981, + "step": 98000 + }, + { + "epoch": 28.553031626769176, + "grad_norm": 0.4906649589538574, + "learning_rate": 0.000257520979020979, + "loss": 3.1513, + "step": 98050 + }, + { + "epoch": 28.5675927543829, + "grad_norm": 0.44152840971946716, + "learning_rate": 0.00025734615384615385, + "loss": 3.1588, + "step": 98100 + }, + { + "epoch": 28.58215388199662, + "grad_norm": 0.4337315261363983, + "learning_rate": 0.00025717132867132865, + "loss": 3.1519, + "step": 98150 + }, + { + "epoch": 28.596715009610342, + "grad_norm": 0.47403421998023987, + "learning_rate": 0.0002569965034965035, + "loss": 3.1559, + "step": 98200 + }, + { + "epoch": 28.611276137224067, + "grad_norm": 0.4357426166534424, + "learning_rate": 0.0002568216783216783, + "loss": 3.1732, + "step": 98250 + }, + { + "epoch": 28.625837264837788, + "grad_norm": 0.46377402544021606, + "learning_rate": 0.0002566468531468531, + "loss": 3.1715, + "step": 98300 + }, + { + "epoch": 28.640398392451512, + "grad_norm": 0.46978867053985596, + "learning_rate": 0.00025647202797202795, + "loss": 3.179, + "step": 98350 + }, + { + "epoch": 28.654959520065233, + "grad_norm": 0.434087872505188, + "learning_rate": 0.00025629720279720275, + "loss": 3.1732, + "step": 98400 + }, + { + "epoch": 28.669520647678958, + "grad_norm": 0.4620792865753174, + "learning_rate": 0.0002561223776223776, + "loss": 3.1526, + "step": 98450 + }, + { + "epoch": 28.68408177529268, + "grad_norm": 0.4453633725643158, + "learning_rate": 0.0002559475524475524, + "loss": 3.1641, + "step": 98500 + }, + { + "epoch": 28.6986429029064, + "grad_norm": 0.4334501624107361, + "learning_rate": 0.00025577272727272726, + "loss": 3.1682, + "step": 98550 + }, + { + "epoch": 28.713204030520124, + "grad_norm": 0.4319615662097931, + "learning_rate": 0.00025559790209790206, + "loss": 3.1638, + "step": 98600 + }, + { + "epoch": 28.727765158133845, + "grad_norm": 0.4342224895954132, + "learning_rate": 0.00025542307692307686, + "loss": 3.177, + "step": 98650 + }, + { + "epoch": 28.74232628574757, + "grad_norm": 0.46854275465011597, + "learning_rate": 0.00025524825174825177, + "loss": 3.1656, + "step": 98700 + }, + { + "epoch": 28.75688741336129, + "grad_norm": 0.4229051172733307, + "learning_rate": 0.00025507342657342657, + "loss": 3.1708, + "step": 98750 + }, + { + "epoch": 28.771448540975012, + "grad_norm": 0.4276522099971771, + "learning_rate": 0.00025489860139860137, + "loss": 3.1722, + "step": 98800 + }, + { + "epoch": 28.786009668588736, + "grad_norm": 0.43881532549858093, + "learning_rate": 0.0002547237762237762, + "loss": 3.1635, + "step": 98850 + }, + { + "epoch": 28.800570796202457, + "grad_norm": 0.44678550958633423, + "learning_rate": 0.000254548951048951, + "loss": 3.183, + "step": 98900 + }, + { + "epoch": 28.815131923816182, + "grad_norm": 0.43881022930145264, + "learning_rate": 0.0002543741258741259, + "loss": 3.1755, + "step": 98950 + }, + { + "epoch": 28.829693051429903, + "grad_norm": 0.45341137051582336, + "learning_rate": 0.0002541993006993007, + "loss": 3.1707, + "step": 99000 + }, + { + "epoch": 28.829693051429903, + "eval_accuracy": 0.37557420516508966, + "eval_loss": 3.5332367420196533, + "eval_runtime": 80.287, + "eval_samples_per_second": 207.331, + "eval_steps_per_second": 12.966, + "step": 99000 + }, + { + "epoch": 28.844254179043624, + "grad_norm": 0.4361846446990967, + "learning_rate": 0.0002540244755244755, + "loss": 3.1643, + "step": 99050 + }, + { + "epoch": 28.85881530665735, + "grad_norm": 0.44548898935317993, + "learning_rate": 0.00025384965034965033, + "loss": 3.1791, + "step": 99100 + }, + { + "epoch": 28.87337643427107, + "grad_norm": 0.45371025800704956, + "learning_rate": 0.00025367482517482513, + "loss": 3.1785, + "step": 99150 + }, + { + "epoch": 28.887937561884794, + "grad_norm": 0.4528813660144806, + "learning_rate": 0.0002535, + "loss": 3.1653, + "step": 99200 + }, + { + "epoch": 28.902498689498515, + "grad_norm": 0.464300274848938, + "learning_rate": 0.0002533251748251748, + "loss": 3.1823, + "step": 99250 + }, + { + "epoch": 28.917059817112236, + "grad_norm": 0.4450153410434723, + "learning_rate": 0.00025315034965034964, + "loss": 3.1798, + "step": 99300 + }, + { + "epoch": 28.93162094472596, + "grad_norm": 0.43621668219566345, + "learning_rate": 0.00025297552447552444, + "loss": 3.18, + "step": 99350 + }, + { + "epoch": 28.94618207233968, + "grad_norm": 0.43210044503211975, + "learning_rate": 0.0002528006993006993, + "loss": 3.178, + "step": 99400 + }, + { + "epoch": 28.960743199953406, + "grad_norm": 0.44858497381210327, + "learning_rate": 0.00025262587412587414, + "loss": 3.1737, + "step": 99450 + }, + { + "epoch": 28.975304327567127, + "grad_norm": 0.42016497254371643, + "learning_rate": 0.00025245104895104894, + "loss": 3.1891, + "step": 99500 + }, + { + "epoch": 28.989865455180848, + "grad_norm": 0.457013875246048, + "learning_rate": 0.00025227622377622374, + "loss": 3.1854, + "step": 99550 + }, + { + "epoch": 29.004368338284117, + "grad_norm": 0.4366385042667389, + "learning_rate": 0.0002521013986013986, + "loss": 3.156, + "step": 99600 + }, + { + "epoch": 29.018929465897838, + "grad_norm": 0.4410390853881836, + "learning_rate": 0.0002519265734265734, + "loss": 3.0916, + "step": 99650 + }, + { + "epoch": 29.033490593511562, + "grad_norm": 0.4760029911994934, + "learning_rate": 0.00025175174825174825, + "loss": 3.0856, + "step": 99700 + }, + { + "epoch": 29.048051721125283, + "grad_norm": 0.4599713981151581, + "learning_rate": 0.00025157692307692305, + "loss": 3.0921, + "step": 99750 + }, + { + "epoch": 29.062612848739008, + "grad_norm": 0.4557690918445587, + "learning_rate": 0.0002514020979020979, + "loss": 3.0904, + "step": 99800 + }, + { + "epoch": 29.07717397635273, + "grad_norm": 0.46123120188713074, + "learning_rate": 0.0002512272727272727, + "loss": 3.0978, + "step": 99850 + }, + { + "epoch": 29.09173510396645, + "grad_norm": 0.47671201825141907, + "learning_rate": 0.0002510524475524475, + "loss": 3.1055, + "step": 99900 + }, + { + "epoch": 29.106296231580174, + "grad_norm": 0.465494841337204, + "learning_rate": 0.00025087762237762236, + "loss": 3.1087, + "step": 99950 + }, + { + "epoch": 29.120857359193895, + "grad_norm": 0.4717271029949188, + "learning_rate": 0.00025070279720279716, + "loss": 3.1001, + "step": 100000 + }, + { + "epoch": 29.120857359193895, + "eval_accuracy": 0.3747177615703355, + "eval_loss": 3.5496444702148438, + "eval_runtime": 80.0651, + "eval_samples_per_second": 207.906, + "eval_steps_per_second": 13.002, + "step": 100000 + }, + { + "epoch": 29.13541848680762, + "grad_norm": 0.4883511960506439, + "learning_rate": 0.000250527972027972, + "loss": 3.0934, + "step": 100050 + }, + { + "epoch": 29.14997961442134, + "grad_norm": 0.46403738856315613, + "learning_rate": 0.00025035314685314686, + "loss": 3.1031, + "step": 100100 + }, + { + "epoch": 29.16454074203506, + "grad_norm": 0.44273096323013306, + "learning_rate": 0.00025017832167832166, + "loss": 3.115, + "step": 100150 + }, + { + "epoch": 29.179101869648786, + "grad_norm": 0.44171518087387085, + "learning_rate": 0.0002500034965034965, + "loss": 3.1134, + "step": 100200 + }, + { + "epoch": 29.193662997262507, + "grad_norm": 0.4661337733268738, + "learning_rate": 0.0002498286713286713, + "loss": 3.1086, + "step": 100250 + }, + { + "epoch": 29.20822412487623, + "grad_norm": 0.4632498621940613, + "learning_rate": 0.0002496538461538461, + "loss": 3.1203, + "step": 100300 + }, + { + "epoch": 29.222785252489953, + "grad_norm": 0.44383248686790466, + "learning_rate": 0.00024947902097902097, + "loss": 3.1172, + "step": 100350 + }, + { + "epoch": 29.237346380103673, + "grad_norm": 0.4826676845550537, + "learning_rate": 0.00024930419580419577, + "loss": 3.1148, + "step": 100400 + }, + { + "epoch": 29.251907507717398, + "grad_norm": 0.4492054879665375, + "learning_rate": 0.0002491293706293706, + "loss": 3.1228, + "step": 100450 + }, + { + "epoch": 29.26646863533112, + "grad_norm": 0.4762059450149536, + "learning_rate": 0.0002489545454545454, + "loss": 3.1254, + "step": 100500 + }, + { + "epoch": 29.281029762944843, + "grad_norm": 0.44990840554237366, + "learning_rate": 0.0002487797202797203, + "loss": 3.1239, + "step": 100550 + }, + { + "epoch": 29.295590890558564, + "grad_norm": 0.4655245840549469, + "learning_rate": 0.0002486048951048951, + "loss": 3.1264, + "step": 100600 + }, + { + "epoch": 29.31015201817229, + "grad_norm": 0.4644462764263153, + "learning_rate": 0.0002484300699300699, + "loss": 3.1268, + "step": 100650 + }, + { + "epoch": 29.32471314578601, + "grad_norm": 0.4611171782016754, + "learning_rate": 0.00024825524475524473, + "loss": 3.1363, + "step": 100700 + }, + { + "epoch": 29.33927427339973, + "grad_norm": 0.4311342239379883, + "learning_rate": 0.00024808041958041953, + "loss": 3.1333, + "step": 100750 + }, + { + "epoch": 29.353835401013455, + "grad_norm": 0.4353893995285034, + "learning_rate": 0.0002479055944055944, + "loss": 3.1292, + "step": 100800 + }, + { + "epoch": 29.368396528627176, + "grad_norm": 0.44847074151039124, + "learning_rate": 0.00024773076923076924, + "loss": 3.1416, + "step": 100850 + }, + { + "epoch": 29.3829576562409, + "grad_norm": 0.47350096702575684, + "learning_rate": 0.00024755594405594404, + "loss": 3.142, + "step": 100900 + }, + { + "epoch": 29.397518783854622, + "grad_norm": 0.4682367444038391, + "learning_rate": 0.0002473811188811189, + "loss": 3.1421, + "step": 100950 + }, + { + "epoch": 29.412079911468343, + "grad_norm": 0.4489864408969879, + "learning_rate": 0.0002472062937062937, + "loss": 3.1358, + "step": 101000 + }, + { + "epoch": 29.412079911468343, + "eval_accuracy": 0.374867536332136, + "eval_loss": 3.5422158241271973, + "eval_runtime": 80.1833, + "eval_samples_per_second": 207.599, + "eval_steps_per_second": 12.983, + "step": 101000 + }, + { + "epoch": 29.426641039082067, + "grad_norm": 0.44524815678596497, + "learning_rate": 0.0002470314685314685, + "loss": 3.1352, + "step": 101050 + }, + { + "epoch": 29.44120216669579, + "grad_norm": 0.4737188518047333, + "learning_rate": 0.00024685664335664335, + "loss": 3.1558, + "step": 101100 + }, + { + "epoch": 29.455763294309513, + "grad_norm": 0.42699918150901794, + "learning_rate": 0.00024668181818181815, + "loss": 3.1368, + "step": 101150 + }, + { + "epoch": 29.470324421923234, + "grad_norm": 0.4792460501194, + "learning_rate": 0.000246506993006993, + "loss": 3.1303, + "step": 101200 + }, + { + "epoch": 29.484885549536955, + "grad_norm": 0.4665248394012451, + "learning_rate": 0.0002463321678321678, + "loss": 3.1459, + "step": 101250 + }, + { + "epoch": 29.49944667715068, + "grad_norm": 0.46637818217277527, + "learning_rate": 0.00024615734265734265, + "loss": 3.1432, + "step": 101300 + }, + { + "epoch": 29.5140078047644, + "grad_norm": 0.44114309549331665, + "learning_rate": 0.00024598251748251745, + "loss": 3.1468, + "step": 101350 + }, + { + "epoch": 29.528568932378125, + "grad_norm": 0.42972317337989807, + "learning_rate": 0.00024580769230769225, + "loss": 3.1451, + "step": 101400 + }, + { + "epoch": 29.543130059991846, + "grad_norm": 0.4702269732952118, + "learning_rate": 0.0002456328671328671, + "loss": 3.1502, + "step": 101450 + }, + { + "epoch": 29.557691187605567, + "grad_norm": 0.4438580274581909, + "learning_rate": 0.00024545804195804196, + "loss": 3.1464, + "step": 101500 + }, + { + "epoch": 29.57225231521929, + "grad_norm": 0.4511452913284302, + "learning_rate": 0.00024528321678321676, + "loss": 3.1685, + "step": 101550 + }, + { + "epoch": 29.586813442833012, + "grad_norm": 0.47536587715148926, + "learning_rate": 0.0002451083916083916, + "loss": 3.1497, + "step": 101600 + }, + { + "epoch": 29.601374570446737, + "grad_norm": 0.45455479621887207, + "learning_rate": 0.0002449335664335664, + "loss": 3.1497, + "step": 101650 + }, + { + "epoch": 29.615935698060458, + "grad_norm": 0.44872206449508667, + "learning_rate": 0.00024475874125874127, + "loss": 3.1571, + "step": 101700 + }, + { + "epoch": 29.63049682567418, + "grad_norm": 0.445295512676239, + "learning_rate": 0.00024458391608391607, + "loss": 3.1487, + "step": 101750 + }, + { + "epoch": 29.645057953287903, + "grad_norm": 0.441080778837204, + "learning_rate": 0.00024440909090909087, + "loss": 3.1633, + "step": 101800 + }, + { + "epoch": 29.659619080901624, + "grad_norm": 0.46299949288368225, + "learning_rate": 0.0002442342657342657, + "loss": 3.1565, + "step": 101850 + }, + { + "epoch": 29.67418020851535, + "grad_norm": 0.47477272152900696, + "learning_rate": 0.00024405944055944052, + "loss": 3.1519, + "step": 101900 + }, + { + "epoch": 29.68874133612907, + "grad_norm": 0.49416160583496094, + "learning_rate": 0.00024388461538461535, + "loss": 3.1549, + "step": 101950 + }, + { + "epoch": 29.70330246374279, + "grad_norm": 0.42633551359176636, + "learning_rate": 0.00024370979020979017, + "loss": 3.1647, + "step": 102000 + }, + { + "epoch": 29.70330246374279, + "eval_accuracy": 0.37523515460540935, + "eval_loss": 3.538114070892334, + "eval_runtime": 80.0602, + "eval_samples_per_second": 207.919, + "eval_steps_per_second": 13.003, + "step": 102000 + }, + { + "epoch": 29.717863591356515, + "grad_norm": 0.44595080614089966, + "learning_rate": 0.000243534965034965, + "loss": 3.1493, + "step": 102050 + }, + { + "epoch": 29.732424718970236, + "grad_norm": 0.4457579553127289, + "learning_rate": 0.00024336013986013983, + "loss": 3.1555, + "step": 102100 + }, + { + "epoch": 29.74698584658396, + "grad_norm": 0.4522918164730072, + "learning_rate": 0.00024318531468531468, + "loss": 3.1614, + "step": 102150 + }, + { + "epoch": 29.76154697419768, + "grad_norm": 0.4831245541572571, + "learning_rate": 0.0002430104895104895, + "loss": 3.1669, + "step": 102200 + }, + { + "epoch": 29.776108101811403, + "grad_norm": 0.4426896274089813, + "learning_rate": 0.00024283566433566434, + "loss": 3.1637, + "step": 102250 + }, + { + "epoch": 29.790669229425127, + "grad_norm": 0.4679453372955322, + "learning_rate": 0.00024266083916083916, + "loss": 3.143, + "step": 102300 + }, + { + "epoch": 29.805230357038848, + "grad_norm": 0.46347737312316895, + "learning_rate": 0.00024248601398601396, + "loss": 3.1639, + "step": 102350 + }, + { + "epoch": 29.819791484652573, + "grad_norm": 0.43473029136657715, + "learning_rate": 0.0002423111888111888, + "loss": 3.1701, + "step": 102400 + }, + { + "epoch": 29.834352612266294, + "grad_norm": 0.476441353559494, + "learning_rate": 0.00024213636363636362, + "loss": 3.1764, + "step": 102450 + }, + { + "epoch": 29.848913739880015, + "grad_norm": 0.4864274263381958, + "learning_rate": 0.00024196153846153844, + "loss": 3.175, + "step": 102500 + }, + { + "epoch": 29.86347486749374, + "grad_norm": 0.4513048827648163, + "learning_rate": 0.00024178671328671327, + "loss": 3.1733, + "step": 102550 + }, + { + "epoch": 29.87803599510746, + "grad_norm": 0.45398372411727905, + "learning_rate": 0.0002416118881118881, + "loss": 3.1719, + "step": 102600 + }, + { + "epoch": 29.892597122721185, + "grad_norm": 0.46702417731285095, + "learning_rate": 0.0002414370629370629, + "loss": 3.1643, + "step": 102650 + }, + { + "epoch": 29.907158250334906, + "grad_norm": 0.4604516625404358, + "learning_rate": 0.00024126223776223772, + "loss": 3.1713, + "step": 102700 + }, + { + "epoch": 29.921719377948627, + "grad_norm": 0.4356239438056946, + "learning_rate": 0.00024108741258741255, + "loss": 3.1608, + "step": 102750 + }, + { + "epoch": 29.93628050556235, + "grad_norm": 0.4850304126739502, + "learning_rate": 0.00024091258741258738, + "loss": 3.161, + "step": 102800 + }, + { + "epoch": 29.950841633176072, + "grad_norm": 0.44736698269844055, + "learning_rate": 0.00024073776223776223, + "loss": 3.1603, + "step": 102850 + }, + { + "epoch": 29.965402760789797, + "grad_norm": 0.4641413986682892, + "learning_rate": 0.00024056293706293706, + "loss": 3.1901, + "step": 102900 + }, + { + "epoch": 29.979963888403518, + "grad_norm": 0.4486852288246155, + "learning_rate": 0.00024038811188811188, + "loss": 3.1877, + "step": 102950 + }, + { + "epoch": 29.994525016017242, + "grad_norm": 0.49141523241996765, + "learning_rate": 0.0002402132867132867, + "loss": 3.1656, + "step": 103000 + }, + { + "epoch": 29.994525016017242, + "eval_accuracy": 0.37589832527363287, + "eval_loss": 3.530726432800293, + "eval_runtime": 80.1567, + "eval_samples_per_second": 207.668, + "eval_steps_per_second": 12.987, + "step": 103000 + }, + { + "epoch": 30.009027899120507, + "grad_norm": 0.45091068744659424, + "learning_rate": 0.00024003846153846154, + "loss": 3.1075, + "step": 103050 + }, + { + "epoch": 30.023589026734232, + "grad_norm": 0.49528542160987854, + "learning_rate": 0.00023986363636363634, + "loss": 3.0731, + "step": 103100 + }, + { + "epoch": 30.038150154347953, + "grad_norm": 0.4450590908527374, + "learning_rate": 0.00023968881118881116, + "loss": 3.0853, + "step": 103150 + }, + { + "epoch": 30.052711281961674, + "grad_norm": 0.47571080923080444, + "learning_rate": 0.000239513986013986, + "loss": 3.0899, + "step": 103200 + }, + { + "epoch": 30.0672724095754, + "grad_norm": 0.4562253952026367, + "learning_rate": 0.00023933916083916082, + "loss": 3.084, + "step": 103250 + }, + { + "epoch": 30.08183353718912, + "grad_norm": 0.5024160742759705, + "learning_rate": 0.00023916433566433564, + "loss": 3.086, + "step": 103300 + }, + { + "epoch": 30.096394664802844, + "grad_norm": 0.4524730145931244, + "learning_rate": 0.00023898951048951047, + "loss": 3.0895, + "step": 103350 + }, + { + "epoch": 30.110955792416565, + "grad_norm": 0.47511914372444153, + "learning_rate": 0.00023881468531468527, + "loss": 3.1031, + "step": 103400 + }, + { + "epoch": 30.125516920030286, + "grad_norm": 0.4788551330566406, + "learning_rate": 0.0002386398601398601, + "loss": 3.0969, + "step": 103450 + }, + { + "epoch": 30.14007804764401, + "grad_norm": 0.47147780656814575, + "learning_rate": 0.00023846503496503492, + "loss": 3.1024, + "step": 103500 + }, + { + "epoch": 30.15463917525773, + "grad_norm": 0.4830690324306488, + "learning_rate": 0.00023829020979020978, + "loss": 3.1028, + "step": 103550 + }, + { + "epoch": 30.169200302871456, + "grad_norm": 0.43729934096336365, + "learning_rate": 0.0002381153846153846, + "loss": 3.0977, + "step": 103600 + }, + { + "epoch": 30.183761430485177, + "grad_norm": 0.4548850655555725, + "learning_rate": 0.00023794055944055943, + "loss": 3.1001, + "step": 103650 + }, + { + "epoch": 30.198322558098898, + "grad_norm": 0.45362600684165955, + "learning_rate": 0.00023776573426573426, + "loss": 3.1122, + "step": 103700 + }, + { + "epoch": 30.212883685712622, + "grad_norm": 0.48883917927742004, + "learning_rate": 0.00023759090909090909, + "loss": 3.1036, + "step": 103750 + }, + { + "epoch": 30.227444813326343, + "grad_norm": 0.4714140295982361, + "learning_rate": 0.0002374160839160839, + "loss": 3.12, + "step": 103800 + }, + { + "epoch": 30.242005940940068, + "grad_norm": 0.45753130316734314, + "learning_rate": 0.0002372412587412587, + "loss": 3.1131, + "step": 103850 + }, + { + "epoch": 30.25656706855379, + "grad_norm": 0.4977312684059143, + "learning_rate": 0.00023706643356643354, + "loss": 3.116, + "step": 103900 + }, + { + "epoch": 30.27112819616751, + "grad_norm": 0.4684438109397888, + "learning_rate": 0.00023689160839160837, + "loss": 3.1174, + "step": 103950 + }, + { + "epoch": 30.285689323781234, + "grad_norm": 0.5155289173126221, + "learning_rate": 0.0002367167832167832, + "loss": 3.1186, + "step": 104000 + }, + { + "epoch": 30.285689323781234, + "eval_accuracy": 0.3748508424418882, + "eval_loss": 3.547801971435547, + "eval_runtime": 80.077, + "eval_samples_per_second": 207.875, + "eval_steps_per_second": 13.0, + "step": 104000 + }, + { + "epoch": 30.300250451394955, + "grad_norm": 0.46649569272994995, + "learning_rate": 0.00023654195804195802, + "loss": 3.1257, + "step": 104050 + }, + { + "epoch": 30.31481157900868, + "grad_norm": 0.4594707489013672, + "learning_rate": 0.00023636713286713285, + "loss": 3.1221, + "step": 104100 + }, + { + "epoch": 30.3293727066224, + "grad_norm": 0.46418672800064087, + "learning_rate": 0.00023619230769230765, + "loss": 3.1134, + "step": 104150 + }, + { + "epoch": 30.343933834236122, + "grad_norm": 0.4466555714607239, + "learning_rate": 0.00023601748251748247, + "loss": 3.1242, + "step": 104200 + }, + { + "epoch": 30.358494961849846, + "grad_norm": 0.43967121839523315, + "learning_rate": 0.00023584265734265733, + "loss": 3.1281, + "step": 104250 + }, + { + "epoch": 30.373056089463567, + "grad_norm": 0.4631558656692505, + "learning_rate": 0.00023566783216783215, + "loss": 3.1192, + "step": 104300 + }, + { + "epoch": 30.387617217077292, + "grad_norm": 0.4498026371002197, + "learning_rate": 0.00023549300699300698, + "loss": 3.1147, + "step": 104350 + }, + { + "epoch": 30.402178344691013, + "grad_norm": 0.4484608471393585, + "learning_rate": 0.0002353181818181818, + "loss": 3.1144, + "step": 104400 + }, + { + "epoch": 30.416739472304734, + "grad_norm": 0.4739231765270233, + "learning_rate": 0.00023514335664335663, + "loss": 3.1434, + "step": 104450 + }, + { + "epoch": 30.43130059991846, + "grad_norm": 0.46061864495277405, + "learning_rate": 0.00023496853146853146, + "loss": 3.1278, + "step": 104500 + }, + { + "epoch": 30.44586172753218, + "grad_norm": 0.46449682116508484, + "learning_rate": 0.0002347937062937063, + "loss": 3.1416, + "step": 104550 + }, + { + "epoch": 30.460422855145904, + "grad_norm": 0.48044365644454956, + "learning_rate": 0.0002346188811188811, + "loss": 3.1387, + "step": 104600 + }, + { + "epoch": 30.474983982759625, + "grad_norm": 0.4746876060962677, + "learning_rate": 0.0002344440559440559, + "loss": 3.1409, + "step": 104650 + }, + { + "epoch": 30.489545110373346, + "grad_norm": 0.4719257354736328, + "learning_rate": 0.00023426923076923074, + "loss": 3.1386, + "step": 104700 + }, + { + "epoch": 30.50410623798707, + "grad_norm": 0.4413412809371948, + "learning_rate": 0.00023409440559440557, + "loss": 3.1297, + "step": 104750 + }, + { + "epoch": 30.51866736560079, + "grad_norm": 0.4679592251777649, + "learning_rate": 0.0002339195804195804, + "loss": 3.1452, + "step": 104800 + }, + { + "epoch": 30.533228493214516, + "grad_norm": 0.4566100835800171, + "learning_rate": 0.00023374475524475522, + "loss": 3.1453, + "step": 104850 + }, + { + "epoch": 30.547789620828237, + "grad_norm": 0.47029706835746765, + "learning_rate": 0.00023356993006993002, + "loss": 3.1506, + "step": 104900 + }, + { + "epoch": 30.562350748441958, + "grad_norm": 0.4564160406589508, + "learning_rate": 0.0002333951048951049, + "loss": 3.1419, + "step": 104950 + }, + { + "epoch": 30.576911876055682, + "grad_norm": 0.4709474742412567, + "learning_rate": 0.0002332202797202797, + "loss": 3.146, + "step": 105000 + }, + { + "epoch": 30.576911876055682, + "eval_accuracy": 0.3752152865247623, + "eval_loss": 3.5399692058563232, + "eval_runtime": 80.0664, + "eval_samples_per_second": 207.902, + "eval_steps_per_second": 13.002, + "step": 105000 + }, + { + "epoch": 30.591473003669403, + "grad_norm": 0.46142178773880005, + "learning_rate": 0.00023304545454545453, + "loss": 3.1563, + "step": 105050 + }, + { + "epoch": 30.606034131283128, + "grad_norm": 0.46358320116996765, + "learning_rate": 0.00023287062937062935, + "loss": 3.1385, + "step": 105100 + }, + { + "epoch": 30.62059525889685, + "grad_norm": 0.4504646360874176, + "learning_rate": 0.00023269580419580418, + "loss": 3.1471, + "step": 105150 + }, + { + "epoch": 30.635156386510573, + "grad_norm": 0.4714089632034302, + "learning_rate": 0.000232520979020979, + "loss": 3.1404, + "step": 105200 + }, + { + "epoch": 30.649717514124294, + "grad_norm": 0.4826344847679138, + "learning_rate": 0.00023234615384615384, + "loss": 3.1385, + "step": 105250 + }, + { + "epoch": 30.664278641738015, + "grad_norm": 0.4676797688007355, + "learning_rate": 0.00023217132867132866, + "loss": 3.158, + "step": 105300 + }, + { + "epoch": 30.67883976935174, + "grad_norm": 0.4501577317714691, + "learning_rate": 0.00023199650349650346, + "loss": 3.1386, + "step": 105350 + }, + { + "epoch": 30.69340089696546, + "grad_norm": 0.4723682701587677, + "learning_rate": 0.0002318216783216783, + "loss": 3.15, + "step": 105400 + }, + { + "epoch": 30.707962024579185, + "grad_norm": 0.48127326369285583, + "learning_rate": 0.00023164685314685312, + "loss": 3.1437, + "step": 105450 + }, + { + "epoch": 30.722523152192906, + "grad_norm": 0.4738757908344269, + "learning_rate": 0.00023147202797202794, + "loss": 3.144, + "step": 105500 + }, + { + "epoch": 30.737084279806627, + "grad_norm": 0.47248905897140503, + "learning_rate": 0.00023129720279720277, + "loss": 3.1454, + "step": 105550 + }, + { + "epoch": 30.75164540742035, + "grad_norm": 0.4754147231578827, + "learning_rate": 0.0002311223776223776, + "loss": 3.1639, + "step": 105600 + }, + { + "epoch": 30.766206535034073, + "grad_norm": 0.44767364859580994, + "learning_rate": 0.00023094755244755245, + "loss": 3.1555, + "step": 105650 + }, + { + "epoch": 30.780767662647797, + "grad_norm": 0.46636345982551575, + "learning_rate": 0.00023077272727272728, + "loss": 3.1481, + "step": 105700 + }, + { + "epoch": 30.795328790261518, + "grad_norm": 0.4550693929195404, + "learning_rate": 0.00023059790209790208, + "loss": 3.1523, + "step": 105750 + }, + { + "epoch": 30.80988991787524, + "grad_norm": 0.4555010199546814, + "learning_rate": 0.0002304230769230769, + "loss": 3.1685, + "step": 105800 + }, + { + "epoch": 30.824451045488964, + "grad_norm": 0.45012399554252625, + "learning_rate": 0.00023024825174825173, + "loss": 3.1568, + "step": 105850 + }, + { + "epoch": 30.839012173102684, + "grad_norm": 0.45288777351379395, + "learning_rate": 0.00023007342657342656, + "loss": 3.1639, + "step": 105900 + }, + { + "epoch": 30.85357330071641, + "grad_norm": 0.4628019332885742, + "learning_rate": 0.00022989860139860138, + "loss": 3.1595, + "step": 105950 + }, + { + "epoch": 30.86813442833013, + "grad_norm": 0.46279942989349365, + "learning_rate": 0.0002297237762237762, + "loss": 3.1564, + "step": 106000 + }, + { + "epoch": 30.86813442833013, + "eval_accuracy": 0.375562213779137, + "eval_loss": 3.534041166305542, + "eval_runtime": 80.0374, + "eval_samples_per_second": 207.978, + "eval_steps_per_second": 13.006, + "step": 106000 + }, + { + "epoch": 30.88269555594385, + "grad_norm": 0.5104995369911194, + "learning_rate": 0.00022954895104895104, + "loss": 3.1669, + "step": 106050 + }, + { + "epoch": 30.897256683557575, + "grad_norm": 0.4528321921825409, + "learning_rate": 0.00022937412587412584, + "loss": 3.1579, + "step": 106100 + }, + { + "epoch": 30.911817811171296, + "grad_norm": 0.5047301650047302, + "learning_rate": 0.00022919930069930066, + "loss": 3.1495, + "step": 106150 + }, + { + "epoch": 30.92637893878502, + "grad_norm": 0.4677218496799469, + "learning_rate": 0.0002290244755244755, + "loss": 3.1518, + "step": 106200 + }, + { + "epoch": 30.940940066398742, + "grad_norm": 0.4636547267436981, + "learning_rate": 0.00022884965034965032, + "loss": 3.1625, + "step": 106250 + }, + { + "epoch": 30.955501194012463, + "grad_norm": 0.4806773066520691, + "learning_rate": 0.00022867482517482517, + "loss": 3.1572, + "step": 106300 + }, + { + "epoch": 30.970062321626187, + "grad_norm": 0.446891188621521, + "learning_rate": 0.0002285, + "loss": 3.174, + "step": 106350 + }, + { + "epoch": 30.98462344923991, + "grad_norm": 0.4715198874473572, + "learning_rate": 0.00022832517482517482, + "loss": 3.1645, + "step": 106400 + }, + { + "epoch": 30.999184576853633, + "grad_norm": 0.47789454460144043, + "learning_rate": 0.00022815034965034965, + "loss": 3.1705, + "step": 106450 + }, + { + "epoch": 31.0136874599569, + "grad_norm": 0.47417598962783813, + "learning_rate": 0.00022797552447552445, + "loss": 3.0832, + "step": 106500 + }, + { + "epoch": 31.028248587570623, + "grad_norm": 0.4684296250343323, + "learning_rate": 0.00022780069930069928, + "loss": 3.0746, + "step": 106550 + }, + { + "epoch": 31.042809715184344, + "grad_norm": 0.48018065094947815, + "learning_rate": 0.0002276258741258741, + "loss": 3.0733, + "step": 106600 + }, + { + "epoch": 31.057370842798065, + "grad_norm": 0.45850443840026855, + "learning_rate": 0.00022745104895104893, + "loss": 3.0727, + "step": 106650 + }, + { + "epoch": 31.07193197041179, + "grad_norm": 0.48787158727645874, + "learning_rate": 0.00022727622377622376, + "loss": 3.0854, + "step": 106700 + }, + { + "epoch": 31.08649309802551, + "grad_norm": 0.4667603671550751, + "learning_rate": 0.00022710139860139858, + "loss": 3.0871, + "step": 106750 + }, + { + "epoch": 31.101054225639235, + "grad_norm": 0.47687122225761414, + "learning_rate": 0.0002269265734265734, + "loss": 3.0864, + "step": 106800 + }, + { + "epoch": 31.115615353252956, + "grad_norm": 0.5060867667198181, + "learning_rate": 0.0002267517482517482, + "loss": 3.0812, + "step": 106850 + }, + { + "epoch": 31.130176480866677, + "grad_norm": 0.45447468757629395, + "learning_rate": 0.00022657692307692304, + "loss": 3.1017, + "step": 106900 + }, + { + "epoch": 31.1447376084804, + "grad_norm": 0.45640525221824646, + "learning_rate": 0.00022640209790209787, + "loss": 3.0849, + "step": 106950 + }, + { + "epoch": 31.159298736094122, + "grad_norm": 0.47501760721206665, + "learning_rate": 0.00022622727272727272, + "loss": 3.1052, + "step": 107000 + }, + { + "epoch": 31.159298736094122, + "eval_accuracy": 0.37493631045745257, + "eval_loss": 3.5480411052703857, + "eval_runtime": 80.1188, + "eval_samples_per_second": 207.766, + "eval_steps_per_second": 12.993, + "step": 107000 + }, + { + "epoch": 31.173859863707847, + "grad_norm": 0.4731229543685913, + "learning_rate": 0.00022605244755244755, + "loss": 3.0897, + "step": 107050 + }, + { + "epoch": 31.188420991321568, + "grad_norm": 0.43716487288475037, + "learning_rate": 0.00022587762237762237, + "loss": 3.0957, + "step": 107100 + }, + { + "epoch": 31.20298211893529, + "grad_norm": 0.46976327896118164, + "learning_rate": 0.0002257027972027972, + "loss": 3.0931, + "step": 107150 + }, + { + "epoch": 31.217543246549013, + "grad_norm": 0.48122093081474304, + "learning_rate": 0.00022552797202797203, + "loss": 3.1004, + "step": 107200 + }, + { + "epoch": 31.232104374162734, + "grad_norm": 0.48461681604385376, + "learning_rate": 0.00022535314685314683, + "loss": 3.0987, + "step": 107250 + }, + { + "epoch": 31.24666550177646, + "grad_norm": 0.47971364855766296, + "learning_rate": 0.00022517832167832165, + "loss": 3.1053, + "step": 107300 + }, + { + "epoch": 31.26122662939018, + "grad_norm": 0.48928123712539673, + "learning_rate": 0.00022500349650349648, + "loss": 3.111, + "step": 107350 + }, + { + "epoch": 31.2757877570039, + "grad_norm": 0.4729492962360382, + "learning_rate": 0.0002248286713286713, + "loss": 3.1053, + "step": 107400 + }, + { + "epoch": 31.290348884617625, + "grad_norm": 0.4527254104614258, + "learning_rate": 0.00022465384615384613, + "loss": 3.1115, + "step": 107450 + }, + { + "epoch": 31.304910012231346, + "grad_norm": 0.515654444694519, + "learning_rate": 0.00022447902097902096, + "loss": 3.1026, + "step": 107500 + }, + { + "epoch": 31.31947113984507, + "grad_norm": 0.46846842765808105, + "learning_rate": 0.0002243041958041958, + "loss": 3.1171, + "step": 107550 + }, + { + "epoch": 31.33403226745879, + "grad_norm": 0.4890732765197754, + "learning_rate": 0.00022412937062937059, + "loss": 3.1184, + "step": 107600 + }, + { + "epoch": 31.348593395072513, + "grad_norm": 0.48433053493499756, + "learning_rate": 0.0002239545454545454, + "loss": 3.1206, + "step": 107650 + }, + { + "epoch": 31.363154522686237, + "grad_norm": 0.46679550409317017, + "learning_rate": 0.00022377972027972027, + "loss": 3.1204, + "step": 107700 + }, + { + "epoch": 31.377715650299958, + "grad_norm": 0.4765881896018982, + "learning_rate": 0.0002236048951048951, + "loss": 3.1142, + "step": 107750 + }, + { + "epoch": 31.392276777913683, + "grad_norm": 0.4770815968513489, + "learning_rate": 0.00022343006993006992, + "loss": 3.1075, + "step": 107800 + }, + { + "epoch": 31.406837905527404, + "grad_norm": 0.44743603467941284, + "learning_rate": 0.00022325524475524475, + "loss": 3.1302, + "step": 107850 + }, + { + "epoch": 31.421399033141128, + "grad_norm": 0.464642733335495, + "learning_rate": 0.00022308041958041957, + "loss": 3.1183, + "step": 107900 + }, + { + "epoch": 31.43596016075485, + "grad_norm": 0.4641686975955963, + "learning_rate": 0.0002229055944055944, + "loss": 3.1279, + "step": 107950 + }, + { + "epoch": 31.45052128836857, + "grad_norm": 0.4865407943725586, + "learning_rate": 0.0002227307692307692, + "loss": 3.1158, + "step": 108000 + }, + { + "epoch": 31.45052128836857, + "eval_accuracy": 0.3751401640186473, + "eval_loss": 3.5436835289001465, + "eval_runtime": 80.2711, + "eval_samples_per_second": 207.372, + "eval_steps_per_second": 12.969, + "step": 108000 + }, + { + "epoch": 31.465082415982295, + "grad_norm": 0.48484715819358826, + "learning_rate": 0.00022255594405594403, + "loss": 3.1279, + "step": 108050 + }, + { + "epoch": 31.479643543596016, + "grad_norm": 0.49689507484436035, + "learning_rate": 0.00022238111888111885, + "loss": 3.1381, + "step": 108100 + }, + { + "epoch": 31.49420467120974, + "grad_norm": 0.48077550530433655, + "learning_rate": 0.00022220629370629368, + "loss": 3.1143, + "step": 108150 + }, + { + "epoch": 31.50876579882346, + "grad_norm": 0.46697720885276794, + "learning_rate": 0.0002220314685314685, + "loss": 3.1276, + "step": 108200 + }, + { + "epoch": 31.523326926437182, + "grad_norm": 0.450339138507843, + "learning_rate": 0.00022185664335664333, + "loss": 3.127, + "step": 108250 + }, + { + "epoch": 31.537888054050907, + "grad_norm": 0.48000800609588623, + "learning_rate": 0.00022168181818181816, + "loss": 3.1256, + "step": 108300 + }, + { + "epoch": 31.552449181664628, + "grad_norm": 0.45078784227371216, + "learning_rate": 0.00022150699300699296, + "loss": 3.1354, + "step": 108350 + }, + { + "epoch": 31.567010309278352, + "grad_norm": 0.5220247507095337, + "learning_rate": 0.00022133216783216782, + "loss": 3.1319, + "step": 108400 + }, + { + "epoch": 31.581571436892073, + "grad_norm": 0.47364139556884766, + "learning_rate": 0.00022115734265734264, + "loss": 3.138, + "step": 108450 + }, + { + "epoch": 31.596132564505794, + "grad_norm": 0.5164151191711426, + "learning_rate": 0.00022098251748251747, + "loss": 3.1304, + "step": 108500 + }, + { + "epoch": 31.61069369211952, + "grad_norm": 0.48746195435523987, + "learning_rate": 0.0002208076923076923, + "loss": 3.1443, + "step": 108550 + }, + { + "epoch": 31.62525481973324, + "grad_norm": 0.4802226424217224, + "learning_rate": 0.00022063286713286712, + "loss": 3.1355, + "step": 108600 + }, + { + "epoch": 31.639815947346964, + "grad_norm": 0.4758455455303192, + "learning_rate": 0.00022045804195804195, + "loss": 3.1396, + "step": 108650 + }, + { + "epoch": 31.654377074960685, + "grad_norm": 0.5246302485466003, + "learning_rate": 0.00022028321678321678, + "loss": 3.129, + "step": 108700 + }, + { + "epoch": 31.668938202574406, + "grad_norm": 0.530346691608429, + "learning_rate": 0.00022010839160839158, + "loss": 3.1192, + "step": 108750 + }, + { + "epoch": 31.68349933018813, + "grad_norm": 0.5023282170295715, + "learning_rate": 0.0002199335664335664, + "loss": 3.1508, + "step": 108800 + }, + { + "epoch": 31.69806045780185, + "grad_norm": 0.4601937532424927, + "learning_rate": 0.00021975874125874123, + "loss": 3.1397, + "step": 108850 + }, + { + "epoch": 31.712621585415576, + "grad_norm": 0.461478590965271, + "learning_rate": 0.00021958391608391606, + "loss": 3.146, + "step": 108900 + }, + { + "epoch": 31.727182713029297, + "grad_norm": 0.4497092068195343, + "learning_rate": 0.00021940909090909088, + "loss": 3.1376, + "step": 108950 + }, + { + "epoch": 31.741743840643018, + "grad_norm": 0.4443832039833069, + "learning_rate": 0.0002192342657342657, + "loss": 3.1372, + "step": 109000 + }, + { + "epoch": 31.741743840643018, + "eval_accuracy": 0.37569035702117987, + "eval_loss": 3.5374155044555664, + "eval_runtime": 80.5794, + "eval_samples_per_second": 206.579, + "eval_steps_per_second": 12.919, + "step": 109000 + }, + { + "epoch": 31.756304968256742, + "grad_norm": 0.4915781319141388, + "learning_rate": 0.00021905944055944054, + "loss": 3.1481, + "step": 109050 + }, + { + "epoch": 31.770866095870463, + "grad_norm": 0.469692587852478, + "learning_rate": 0.0002188846153846154, + "loss": 3.1523, + "step": 109100 + }, + { + "epoch": 31.785427223484188, + "grad_norm": 0.4651733636856079, + "learning_rate": 0.0002187097902097902, + "loss": 3.1517, + "step": 109150 + }, + { + "epoch": 31.79998835109791, + "grad_norm": 0.46926674246788025, + "learning_rate": 0.00021853496503496502, + "loss": 3.1418, + "step": 109200 + }, + { + "epoch": 31.81454947871163, + "grad_norm": 0.47990190982818604, + "learning_rate": 0.00021836013986013984, + "loss": 3.1477, + "step": 109250 + }, + { + "epoch": 31.829110606325354, + "grad_norm": 0.46162647008895874, + "learning_rate": 0.00021818531468531467, + "loss": 3.1466, + "step": 109300 + }, + { + "epoch": 31.843671733939075, + "grad_norm": 0.47720441222190857, + "learning_rate": 0.0002180104895104895, + "loss": 3.1627, + "step": 109350 + }, + { + "epoch": 31.8582328615528, + "grad_norm": 0.47837400436401367, + "learning_rate": 0.00021783566433566432, + "loss": 3.1456, + "step": 109400 + }, + { + "epoch": 31.87279398916652, + "grad_norm": 0.491738885641098, + "learning_rate": 0.00021766083916083915, + "loss": 3.1459, + "step": 109450 + }, + { + "epoch": 31.887355116780242, + "grad_norm": 0.48230868577957153, + "learning_rate": 0.00021748601398601395, + "loss": 3.1453, + "step": 109500 + }, + { + "epoch": 31.901916244393966, + "grad_norm": 0.476310133934021, + "learning_rate": 0.00021731118881118878, + "loss": 3.1492, + "step": 109550 + }, + { + "epoch": 31.916477372007687, + "grad_norm": 0.4597647786140442, + "learning_rate": 0.0002171363636363636, + "loss": 3.1604, + "step": 109600 + }, + { + "epoch": 31.931038499621412, + "grad_norm": 0.45928624272346497, + "learning_rate": 0.00021696153846153843, + "loss": 3.1674, + "step": 109650 + }, + { + "epoch": 31.945599627235133, + "grad_norm": 0.4845750033855438, + "learning_rate": 0.00021678671328671326, + "loss": 3.1557, + "step": 109700 + }, + { + "epoch": 31.960160754848857, + "grad_norm": 0.46560871601104736, + "learning_rate": 0.00021661188811188808, + "loss": 3.1529, + "step": 109750 + }, + { + "epoch": 31.97472188246258, + "grad_norm": 0.44942712783813477, + "learning_rate": 0.00021643706293706294, + "loss": 3.1677, + "step": 109800 + }, + { + "epoch": 31.9892830100763, + "grad_norm": 0.4887540340423584, + "learning_rate": 0.00021626223776223777, + "loss": 3.1662, + "step": 109850 + }, + { + "epoch": 32.00378589317957, + "grad_norm": 0.4859778881072998, + "learning_rate": 0.00021608741258741256, + "loss": 3.1318, + "step": 109900 + }, + { + "epoch": 32.01834702079329, + "grad_norm": 0.47409725189208984, + "learning_rate": 0.0002159125874125874, + "loss": 3.0582, + "step": 109950 + }, + { + "epoch": 32.03290814840701, + "grad_norm": 0.5324541926383972, + "learning_rate": 0.00021573776223776222, + "loss": 3.0647, + "step": 110000 + }, + { + "epoch": 32.03290814840701, + "eval_accuracy": 0.3751038371729673, + "eval_loss": 3.548638343811035, + "eval_runtime": 80.3445, + "eval_samples_per_second": 207.183, + "eval_steps_per_second": 12.957, + "step": 110000 + } + ], + "logging_steps": 50, + "max_steps": 171700, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 14 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.29922355806208e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}