| { |
| "best_global_step": 103000, |
| "best_metric": 3.5314321517944336, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_drop_frequency_1032/checkpoint-40000", |
| "epoch": 35.8183353718912, |
| "eval_steps": 1000, |
| "global_step": 123000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014561127613722406, |
| "grad_norm": 0.6528840065002441, |
| "learning_rate": 0.000294, |
| "loss": 8.4698, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029122255227444813, |
| "grad_norm": 0.8257938027381897, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7174, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04368338284116722, |
| "grad_norm": 0.5600097179412842, |
| "learning_rate": 0.0005998286713286713, |
| "loss": 6.351, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058244510454889625, |
| "grad_norm": 0.5472339391708374, |
| "learning_rate": 0.0005996538461538461, |
| "loss": 6.1467, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07280563806861204, |
| "grad_norm": 0.5507671236991882, |
| "learning_rate": 0.0005994790209790209, |
| "loss": 5.9855, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08736676568233444, |
| "grad_norm": 0.4078548550605774, |
| "learning_rate": 0.0005993041958041958, |
| "loss": 5.8572, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10192789329605685, |
| "grad_norm": 0.48085489869117737, |
| "learning_rate": 0.0005991293706293705, |
| "loss": 5.7475, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11648902090977925, |
| "grad_norm": 0.5217859745025635, |
| "learning_rate": 0.0005989545454545454, |
| "loss": 5.61, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13105014852350166, |
| "grad_norm": 0.4224102199077606, |
| "learning_rate": 0.0005987797202797202, |
| "loss": 5.5122, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14561127613722408, |
| "grad_norm": 0.5438912510871887, |
| "learning_rate": 0.000598604895104895, |
| "loss": 5.3894, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16017240375094646, |
| "grad_norm": 0.4599864184856415, |
| "learning_rate": 0.0005984300699300698, |
| "loss": 5.3147, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17473353136466888, |
| "grad_norm": 0.4337705671787262, |
| "learning_rate": 0.0005982552447552447, |
| "loss": 5.2455, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1892946589783913, |
| "grad_norm": 0.42485231161117554, |
| "learning_rate": 0.0005980804195804195, |
| "loss": 5.1733, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2038557865921137, |
| "grad_norm": 0.46135449409484863, |
| "learning_rate": 0.0005979055944055943, |
| "loss": 5.1193, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2184169142058361, |
| "grad_norm": 0.5277227163314819, |
| "learning_rate": 0.0005977307692307691, |
| "loss": 5.0636, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2329780418195585, |
| "grad_norm": 0.4671933352947235, |
| "learning_rate": 0.000597555944055944, |
| "loss": 5.0196, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24753916943328091, |
| "grad_norm": 0.45326927304267883, |
| "learning_rate": 0.0005973811188811188, |
| "loss": 4.9593, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2621002970470033, |
| "grad_norm": 0.4231816530227661, |
| "learning_rate": 0.0005972062937062936, |
| "loss": 4.907, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27666142466072574, |
| "grad_norm": 0.5522693395614624, |
| "learning_rate": 0.0005970314685314685, |
| "loss": 4.8631, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29122255227444815, |
| "grad_norm": 0.4881018400192261, |
| "learning_rate": 0.0005968566433566433, |
| "loss": 4.8179, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.29122255227444815, |
| "eval_accuracy": 0.2567770805396475, |
| "eval_loss": 4.738797664642334, |
| "eval_runtime": 82.9407, |
| "eval_samples_per_second": 200.637, |
| "eval_steps_per_second": 12.551, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30578367988817057, |
| "grad_norm": 0.4916388690471649, |
| "learning_rate": 0.0005966818181818181, |
| "loss": 4.7603, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3203448075018929, |
| "grad_norm": 0.44584327936172485, |
| "learning_rate": 0.0005965069930069929, |
| "loss": 4.7204, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.33490593511561534, |
| "grad_norm": 0.42427363991737366, |
| "learning_rate": 0.0005963321678321677, |
| "loss": 4.7079, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.34946706272933775, |
| "grad_norm": 0.4405299723148346, |
| "learning_rate": 0.0005961573426573425, |
| "loss": 4.6493, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36402819034306017, |
| "grad_norm": 0.44946354627609253, |
| "learning_rate": 0.0005959825174825174, |
| "loss": 4.6272, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3785893179567826, |
| "grad_norm": 0.43205249309539795, |
| "learning_rate": 0.0005958076923076922, |
| "loss": 4.5963, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.393150445570505, |
| "grad_norm": 0.4678141176700592, |
| "learning_rate": 0.000595632867132867, |
| "loss": 4.5641, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.4077115731842274, |
| "grad_norm": 0.440606027841568, |
| "learning_rate": 0.0005954580419580418, |
| "loss": 4.5378, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4222727007979498, |
| "grad_norm": 0.4175941050052643, |
| "learning_rate": 0.0005952832167832168, |
| "loss": 4.5121, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4368338284116722, |
| "grad_norm": 0.3882256746292114, |
| "learning_rate": 0.0005951083916083916, |
| "loss": 4.5025, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4513949560253946, |
| "grad_norm": 0.41249504685401917, |
| "learning_rate": 0.0005949335664335664, |
| "loss": 4.4752, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.465956083639117, |
| "grad_norm": 0.4133797287940979, |
| "learning_rate": 0.0005947587412587413, |
| "loss": 4.4642, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4805172112528394, |
| "grad_norm": 0.42349982261657715, |
| "learning_rate": 0.0005945839160839161, |
| "loss": 4.4517, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49507833886656183, |
| "grad_norm": 0.4470789134502411, |
| "learning_rate": 0.0005944090909090909, |
| "loss": 4.4508, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5096394664802842, |
| "grad_norm": 0.4773712158203125, |
| "learning_rate": 0.0005942342657342657, |
| "loss": 4.4264, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5242005940940067, |
| "grad_norm": 0.44834038615226746, |
| "learning_rate": 0.0005940594405594406, |
| "loss": 4.4079, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5387617217077291, |
| "grad_norm": 0.3919208347797394, |
| "learning_rate": 0.0005938846153846153, |
| "loss": 4.3795, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5533228493214515, |
| "grad_norm": 0.43790921568870544, |
| "learning_rate": 0.0005937097902097902, |
| "loss": 4.3684, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5678839769351739, |
| "grad_norm": 0.4102521240711212, |
| "learning_rate": 0.000593534965034965, |
| "loss": 4.3512, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5824451045488963, |
| "grad_norm": 0.37008747458457947, |
| "learning_rate": 0.0005933601398601398, |
| "loss": 4.3391, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5824451045488963, |
| "eval_accuracy": 0.2992115881941556, |
| "eval_loss": 4.283796310424805, |
| "eval_runtime": 82.289, |
| "eval_samples_per_second": 202.226, |
| "eval_steps_per_second": 12.651, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5970062321626187, |
| "grad_norm": 0.42243221402168274, |
| "learning_rate": 0.0005931853146853146, |
| "loss": 4.3304, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6115673597763411, |
| "grad_norm": 0.37911084294319153, |
| "learning_rate": 0.0005930104895104895, |
| "loss": 4.3009, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6261284873900634, |
| "grad_norm": 0.40207958221435547, |
| "learning_rate": 0.0005928356643356643, |
| "loss": 4.3084, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6406896150037859, |
| "grad_norm": 0.3750835061073303, |
| "learning_rate": 0.0005926608391608391, |
| "loss": 4.2849, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6552507426175083, |
| "grad_norm": 0.3993470370769501, |
| "learning_rate": 0.000592486013986014, |
| "loss": 4.2729, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6698118702312307, |
| "grad_norm": 0.3905385434627533, |
| "learning_rate": 0.0005923111888111888, |
| "loss": 4.2668, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6843729978449531, |
| "grad_norm": 0.378936231136322, |
| "learning_rate": 0.0005921363636363636, |
| "loss": 4.2614, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6989341254586755, |
| "grad_norm": 0.38004130125045776, |
| "learning_rate": 0.0005919615384615384, |
| "loss": 4.2571, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7134952530723979, |
| "grad_norm": 0.3870111107826233, |
| "learning_rate": 0.0005917867132867133, |
| "loss": 4.2385, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7280563806861203, |
| "grad_norm": 0.4324592351913452, |
| "learning_rate": 0.0005916118881118881, |
| "loss": 4.2402, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7426175082998427, |
| "grad_norm": 0.38840192556381226, |
| "learning_rate": 0.0005914370629370629, |
| "loss": 4.2224, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7571786359135652, |
| "grad_norm": 0.38530442118644714, |
| "learning_rate": 0.0005912622377622377, |
| "loss": 4.2124, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7717397635272876, |
| "grad_norm": 0.40686488151550293, |
| "learning_rate": 0.0005910874125874125, |
| "loss": 4.2037, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.78630089114101, |
| "grad_norm": 0.38555827736854553, |
| "learning_rate": 0.0005909125874125873, |
| "loss": 4.1955, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8008620187547324, |
| "grad_norm": 0.36234381794929504, |
| "learning_rate": 0.0005907377622377622, |
| "loss": 4.1821, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8154231463684548, |
| "grad_norm": 0.3602573275566101, |
| "learning_rate": 0.000590562937062937, |
| "loss": 4.1828, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8299842739821772, |
| "grad_norm": 0.3877962827682495, |
| "learning_rate": 0.0005903881118881118, |
| "loss": 4.1582, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8445454015958996, |
| "grad_norm": 0.4010036587715149, |
| "learning_rate": 0.0005902132867132867, |
| "loss": 4.1698, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8591065292096219, |
| "grad_norm": 0.40551772713661194, |
| "learning_rate": 0.0005900384615384615, |
| "loss": 4.1533, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8736676568233444, |
| "grad_norm": 0.3658730685710907, |
| "learning_rate": 0.0005898636363636363, |
| "loss": 4.1469, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8736676568233444, |
| "eval_accuracy": 0.3146345567869235, |
| "eval_loss": 4.099434852600098, |
| "eval_runtime": 82.0319, |
| "eval_samples_per_second": 202.86, |
| "eval_steps_per_second": 12.69, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8882287844370668, |
| "grad_norm": 0.388008177280426, |
| "learning_rate": 0.0005896888111888111, |
| "loss": 4.1328, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9027899120507892, |
| "grad_norm": 0.39945200085639954, |
| "learning_rate": 0.000589513986013986, |
| "loss": 4.1308, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9173510396645116, |
| "grad_norm": 0.3612997829914093, |
| "learning_rate": 0.0005893391608391608, |
| "loss": 4.1223, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.931912167278234, |
| "grad_norm": 0.3395882248878479, |
| "learning_rate": 0.0005891643356643356, |
| "loss": 4.1291, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9464732948919564, |
| "grad_norm": 0.34821709990501404, |
| "learning_rate": 0.0005889895104895104, |
| "loss": 4.1156, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9610344225056788, |
| "grad_norm": 0.3562611937522888, |
| "learning_rate": 0.0005888146853146853, |
| "loss": 4.1067, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9755955501194012, |
| "grad_norm": 0.34223735332489014, |
| "learning_rate": 0.00058863986013986, |
| "loss": 4.098, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9901566777331237, |
| "grad_norm": 0.35366857051849365, |
| "learning_rate": 0.0005884650349650349, |
| "loss": 4.0923, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.004659560836391, |
| "grad_norm": 0.34298276901245117, |
| "learning_rate": 0.0005882902097902097, |
| "loss": 4.0716, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0192206884501136, |
| "grad_norm": 0.35829463601112366, |
| "learning_rate": 0.0005881153846153845, |
| "loss": 4.0165, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.033781816063836, |
| "grad_norm": 0.36702677607536316, |
| "learning_rate": 0.0005879405594405594, |
| "loss": 4.0234, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0483429436775584, |
| "grad_norm": 0.3444264531135559, |
| "learning_rate": 0.0005877657342657342, |
| "loss": 4.0037, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0629040712912807, |
| "grad_norm": 0.3652048707008362, |
| "learning_rate": 0.000587590909090909, |
| "loss": 4.0106, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0774651989050033, |
| "grad_norm": 0.34887561202049255, |
| "learning_rate": 0.0005874160839160838, |
| "loss": 4.012, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0920263265187256, |
| "grad_norm": 0.365772008895874, |
| "learning_rate": 0.0005872412587412587, |
| "loss": 4.0089, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.106587454132448, |
| "grad_norm": 0.3509705364704132, |
| "learning_rate": 0.0005870664335664335, |
| "loss": 4.0032, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.1211485817461704, |
| "grad_norm": 0.3509422838687897, |
| "learning_rate": 0.0005868916083916083, |
| "loss": 3.9953, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.135709709359893, |
| "grad_norm": 0.35106027126312256, |
| "learning_rate": 0.0005867167832167831, |
| "loss": 4.0127, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1502708369736152, |
| "grad_norm": 0.33764806389808655, |
| "learning_rate": 0.000586541958041958, |
| "loss": 3.9874, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.1648319645873377, |
| "grad_norm": 0.34795883297920227, |
| "learning_rate": 0.0005863671328671328, |
| "loss": 3.9916, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1648319645873377, |
| "eval_accuracy": 0.3247409229391345, |
| "eval_loss": 3.9897139072418213, |
| "eval_runtime": 81.9187, |
| "eval_samples_per_second": 203.14, |
| "eval_steps_per_second": 12.708, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.17939309220106, |
| "grad_norm": 0.3436771035194397, |
| "learning_rate": 0.0005861923076923076, |
| "loss": 3.9808, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1939542198147826, |
| "grad_norm": 0.33662286400794983, |
| "learning_rate": 0.0005860174825174824, |
| "loss": 3.97, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2085153474285049, |
| "grad_norm": 0.3770606219768524, |
| "learning_rate": 0.0005858426573426573, |
| "loss": 3.9846, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2230764750422272, |
| "grad_norm": 0.33492302894592285, |
| "learning_rate": 0.000585667832167832, |
| "loss": 3.9782, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2376376026559497, |
| "grad_norm": 0.3708723783493042, |
| "learning_rate": 0.000585493006993007, |
| "loss": 3.9727, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2521987302696722, |
| "grad_norm": 0.3522159457206726, |
| "learning_rate": 0.0005853181818181817, |
| "loss": 3.973, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2667598578833945, |
| "grad_norm": 0.36696356534957886, |
| "learning_rate": 0.0005851433566433565, |
| "loss": 3.974, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2813209854971168, |
| "grad_norm": 0.34658706188201904, |
| "learning_rate": 0.0005849685314685315, |
| "loss": 3.9553, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2958821131108393, |
| "grad_norm": 0.34904006123542786, |
| "learning_rate": 0.0005847937062937063, |
| "loss": 3.959, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3104432407245616, |
| "grad_norm": 0.3611108362674713, |
| "learning_rate": 0.0005846188811188811, |
| "loss": 3.9435, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3250043683382842, |
| "grad_norm": 0.35322946310043335, |
| "learning_rate": 0.0005844440559440559, |
| "loss": 3.9656, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3395654959520065, |
| "grad_norm": 0.3196225166320801, |
| "learning_rate": 0.0005842692307692308, |
| "loss": 3.9568, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.354126623565729, |
| "grad_norm": 0.3687885105609894, |
| "learning_rate": 0.0005840944055944056, |
| "loss": 3.9468, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.3686877511794513, |
| "grad_norm": 0.3646402359008789, |
| "learning_rate": 0.0005839195804195804, |
| "loss": 3.9572, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.3832488787931738, |
| "grad_norm": 0.3606205880641937, |
| "learning_rate": 0.0005837447552447552, |
| "loss": 3.9502, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3978100064068961, |
| "grad_norm": 0.33462458848953247, |
| "learning_rate": 0.0005835699300699301, |
| "loss": 3.9463, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4123711340206184, |
| "grad_norm": 0.33761417865753174, |
| "learning_rate": 0.0005833951048951048, |
| "loss": 3.9387, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.426932261634341, |
| "grad_norm": 0.32925212383270264, |
| "learning_rate": 0.0005832202797202797, |
| "loss": 3.9335, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4414933892480635, |
| "grad_norm": 0.3408794105052948, |
| "learning_rate": 0.0005830454545454546, |
| "loss": 3.934, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4560545168617858, |
| "grad_norm": 0.35612738132476807, |
| "learning_rate": 0.0005828706293706293, |
| "loss": 3.9303, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4560545168617858, |
| "eval_accuracy": 0.3320795041977169, |
| "eval_loss": 3.9140219688415527, |
| "eval_runtime": 82.1598, |
| "eval_samples_per_second": 202.544, |
| "eval_steps_per_second": 12.67, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.470615644475508, |
| "grad_norm": 0.34724774956703186, |
| "learning_rate": 0.0005826958041958042, |
| "loss": 3.9387, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4851767720892306, |
| "grad_norm": 0.32402661442756653, |
| "learning_rate": 0.000582520979020979, |
| "loss": 3.9247, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4997378997029531, |
| "grad_norm": 0.3497373163700104, |
| "learning_rate": 0.0005823461538461538, |
| "loss": 3.9217, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5142990273166754, |
| "grad_norm": 0.3167015016078949, |
| "learning_rate": 0.0005821713286713286, |
| "loss": 3.9281, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5288601549303977, |
| "grad_norm": 0.32632240653038025, |
| "learning_rate": 0.0005819965034965035, |
| "loss": 3.9098, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5434212825441203, |
| "grad_norm": 0.33851656317710876, |
| "learning_rate": 0.0005818216783216783, |
| "loss": 3.9135, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5579824101578428, |
| "grad_norm": 0.35444602370262146, |
| "learning_rate": 0.0005816468531468531, |
| "loss": 3.9145, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.572543537771565, |
| "grad_norm": 0.3512258231639862, |
| "learning_rate": 0.0005814720279720279, |
| "loss": 3.9173, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.5871046653852874, |
| "grad_norm": 0.3554958403110504, |
| "learning_rate": 0.0005812972027972028, |
| "loss": 3.9097, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6016657929990097, |
| "grad_norm": 0.3344559967517853, |
| "learning_rate": 0.0005811223776223776, |
| "loss": 3.8982, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6162269206127322, |
| "grad_norm": 0.3254737854003906, |
| "learning_rate": 0.0005809475524475524, |
| "loss": 3.9036, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6307880482264547, |
| "grad_norm": 0.3269593119621277, |
| "learning_rate": 0.0005807727272727272, |
| "loss": 3.8826, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.645349175840177, |
| "grad_norm": 0.3281262516975403, |
| "learning_rate": 0.0005805979020979021, |
| "loss": 3.8894, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6599103034538993, |
| "grad_norm": 0.3467496633529663, |
| "learning_rate": 0.0005804230769230769, |
| "loss": 3.8941, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6744714310676219, |
| "grad_norm": 0.33242523670196533, |
| "learning_rate": 0.0005802482517482517, |
| "loss": 3.893, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6890325586813444, |
| "grad_norm": 0.34757599234580994, |
| "learning_rate": 0.0005800734265734265, |
| "loss": 3.8859, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7035936862950667, |
| "grad_norm": 0.3275822699069977, |
| "learning_rate": 0.0005798986013986013, |
| "loss": 3.8957, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.718154813908789, |
| "grad_norm": 0.3242318332195282, |
| "learning_rate": 0.0005797237762237762, |
| "loss": 3.881, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7327159415225115, |
| "grad_norm": 0.32875534892082214, |
| "learning_rate": 0.000579548951048951, |
| "loss": 3.8828, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.747277069136234, |
| "grad_norm": 0.33540111780166626, |
| "learning_rate": 0.0005793741258741258, |
| "loss": 3.8779, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.747277069136234, |
| "eval_accuracy": 0.336951351264901, |
| "eval_loss": 3.8589026927948, |
| "eval_runtime": 81.894, |
| "eval_samples_per_second": 203.202, |
| "eval_steps_per_second": 12.712, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7618381967499563, |
| "grad_norm": 0.3311355412006378, |
| "learning_rate": 0.0005791993006993006, |
| "loss": 3.8786, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7763993243636786, |
| "grad_norm": 0.3496015667915344, |
| "learning_rate": 0.0005790244755244755, |
| "loss": 3.865, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7909604519774012, |
| "grad_norm": 0.3370971977710724, |
| "learning_rate": 0.0005788496503496503, |
| "loss": 3.8654, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8055215795911237, |
| "grad_norm": 0.33096763491630554, |
| "learning_rate": 0.0005786748251748251, |
| "loss": 3.8695, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.820082707204846, |
| "grad_norm": 0.3417201340198517, |
| "learning_rate": 0.0005784999999999999, |
| "loss": 3.8653, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8346438348185683, |
| "grad_norm": 0.3253554105758667, |
| "learning_rate": 0.0005783251748251748, |
| "loss": 3.8751, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8492049624322906, |
| "grad_norm": 0.33386585116386414, |
| "learning_rate": 0.0005781503496503496, |
| "loss": 3.8571, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8637660900460131, |
| "grad_norm": 0.33774393796920776, |
| "learning_rate": 0.0005779755244755244, |
| "loss": 3.8516, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8783272176597356, |
| "grad_norm": 0.3423512876033783, |
| "learning_rate": 0.0005778006993006993, |
| "loss": 3.8532, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.892888345273458, |
| "grad_norm": 0.33588406443595886, |
| "learning_rate": 0.000577625874125874, |
| "loss": 3.8642, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9074494728871803, |
| "grad_norm": 0.32922130823135376, |
| "learning_rate": 0.0005774510489510489, |
| "loss": 3.8582, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9220106005009028, |
| "grad_norm": 0.3173973560333252, |
| "learning_rate": 0.0005772762237762237, |
| "loss": 3.8563, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9365717281146253, |
| "grad_norm": 0.3350159525871277, |
| "learning_rate": 0.0005771013986013985, |
| "loss": 3.851, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9511328557283476, |
| "grad_norm": 0.3206890821456909, |
| "learning_rate": 0.0005769265734265733, |
| "loss": 3.8535, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.96569398334207, |
| "grad_norm": 0.3488020598888397, |
| "learning_rate": 0.0005767517482517482, |
| "loss": 3.8442, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9802551109557924, |
| "grad_norm": 0.324398934841156, |
| "learning_rate": 0.000576576923076923, |
| "loss": 3.8537, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.994816238569515, |
| "grad_norm": 0.34111925959587097, |
| "learning_rate": 0.0005764020979020978, |
| "loss": 3.8494, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.009319121672782, |
| "grad_norm": 0.3338935375213623, |
| "learning_rate": 0.0005762272727272726, |
| "loss": 3.7817, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.023880249286505, |
| "grad_norm": 0.3760268986225128, |
| "learning_rate": 0.0005760524475524475, |
| "loss": 3.7569, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.038441376900227, |
| "grad_norm": 0.3327067494392395, |
| "learning_rate": 0.0005758776223776223, |
| "loss": 3.7486, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.038441376900227, |
| "eval_accuracy": 0.3417364110593327, |
| "eval_loss": 3.8123481273651123, |
| "eval_runtime": 82.0725, |
| "eval_samples_per_second": 202.76, |
| "eval_steps_per_second": 12.684, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0530025045139495, |
| "grad_norm": 0.33140629529953003, |
| "learning_rate": 0.0005757027972027971, |
| "loss": 3.7404, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.067563632127672, |
| "grad_norm": 0.3301616311073303, |
| "learning_rate": 0.000575527972027972, |
| "loss": 3.7554, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0821247597413945, |
| "grad_norm": 0.33339133858680725, |
| "learning_rate": 0.0005753531468531468, |
| "loss": 3.7459, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.096685887355117, |
| "grad_norm": 0.32956090569496155, |
| "learning_rate": 0.0005751783216783216, |
| "loss": 3.7591, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.111247014968839, |
| "grad_norm": 0.3433052599430084, |
| "learning_rate": 0.0005750034965034964, |
| "loss": 3.7543, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.1258081425825615, |
| "grad_norm": 0.3369249999523163, |
| "learning_rate": 0.0005748286713286712, |
| "loss": 3.7375, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.140369270196284, |
| "grad_norm": 0.3354310095310211, |
| "learning_rate": 0.000574653846153846, |
| "loss": 3.7632, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1549303978100065, |
| "grad_norm": 0.33519676327705383, |
| "learning_rate": 0.000574479020979021, |
| "loss": 3.75, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.169491525423729, |
| "grad_norm": 0.327425479888916, |
| "learning_rate": 0.0005743041958041958, |
| "loss": 3.757, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.184052653037451, |
| "grad_norm": 0.33982619643211365, |
| "learning_rate": 0.0005741293706293706, |
| "loss": 3.7605, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.198613780651174, |
| "grad_norm": 0.3179401457309723, |
| "learning_rate": 0.0005739545454545454, |
| "loss": 3.7467, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.213174908264896, |
| "grad_norm": 0.33316200971603394, |
| "learning_rate": 0.0005737797202797203, |
| "loss": 3.769, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2277360358786185, |
| "grad_norm": 0.3287079632282257, |
| "learning_rate": 0.0005736048951048951, |
| "loss": 3.7543, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.2422971634923408, |
| "grad_norm": 0.3334376811981201, |
| "learning_rate": 0.0005734300699300699, |
| "loss": 3.7539, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.256858291106063, |
| "grad_norm": 0.332804799079895, |
| "learning_rate": 0.0005732552447552448, |
| "loss": 3.7412, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.271419418719786, |
| "grad_norm": 0.3318295180797577, |
| "learning_rate": 0.0005730804195804196, |
| "loss": 3.7679, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.285980546333508, |
| "grad_norm": 0.3402138948440552, |
| "learning_rate": 0.0005729055944055944, |
| "loss": 3.756, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.3005416739472304, |
| "grad_norm": 0.3430437445640564, |
| "learning_rate": 0.0005727307692307692, |
| "loss": 3.7571, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.3151028015609527, |
| "grad_norm": 0.3436347544193268, |
| "learning_rate": 0.0005725559440559441, |
| "loss": 3.7457, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.3296639291746755, |
| "grad_norm": 0.3539259433746338, |
| "learning_rate": 0.0005723811188811188, |
| "loss": 3.7408, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3296639291746755, |
| "eval_accuracy": 0.3445159557460172, |
| "eval_loss": 3.7837913036346436, |
| "eval_runtime": 82.0688, |
| "eval_samples_per_second": 202.769, |
| "eval_steps_per_second": 12.684, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3442250567883978, |
| "grad_norm": 0.3337029218673706, |
| "learning_rate": 0.0005722062937062937, |
| "loss": 3.7384, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.35878618440212, |
| "grad_norm": 0.3322373032569885, |
| "learning_rate": 0.0005720314685314685, |
| "loss": 3.7419, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.3733473120158424, |
| "grad_norm": 0.3534303903579712, |
| "learning_rate": 0.0005718566433566433, |
| "loss": 3.7587, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.387908439629565, |
| "grad_norm": 0.3298768103122711, |
| "learning_rate": 0.0005716818181818181, |
| "loss": 3.7613, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4024695672432874, |
| "grad_norm": 0.33706793189048767, |
| "learning_rate": 0.000571506993006993, |
| "loss": 3.7462, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4170306948570097, |
| "grad_norm": 0.31929516792297363, |
| "learning_rate": 0.0005713321678321678, |
| "loss": 3.7511, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.431591822470732, |
| "grad_norm": 0.33855506777763367, |
| "learning_rate": 0.0005711573426573426, |
| "loss": 3.756, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4461529500844543, |
| "grad_norm": 0.32105639576911926, |
| "learning_rate": 0.0005709825174825175, |
| "loss": 3.7517, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.460714077698177, |
| "grad_norm": 0.33733922243118286, |
| "learning_rate": 0.0005708076923076923, |
| "loss": 3.7469, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4752752053118994, |
| "grad_norm": 0.33622434735298157, |
| "learning_rate": 0.0005706328671328671, |
| "loss": 3.7427, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4898363329256217, |
| "grad_norm": 0.353163480758667, |
| "learning_rate": 0.0005704580419580419, |
| "loss": 3.7481, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.5043974605393444, |
| "grad_norm": 0.37388384342193604, |
| "learning_rate": 0.0005702832167832168, |
| "loss": 3.7407, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5189585881530667, |
| "grad_norm": 0.3204995393753052, |
| "learning_rate": 0.0005701083916083916, |
| "loss": 3.7369, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.533519715766789, |
| "grad_norm": 0.33969059586524963, |
| "learning_rate": 0.0005699335664335664, |
| "loss": 3.7397, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.5480808433805113, |
| "grad_norm": 0.36448442935943604, |
| "learning_rate": 0.0005697587412587412, |
| "loss": 3.7486, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.5626419709942336, |
| "grad_norm": 0.3244781494140625, |
| "learning_rate": 0.000569583916083916, |
| "loss": 3.7457, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5772030986079564, |
| "grad_norm": 0.3253434896469116, |
| "learning_rate": 0.0005694090909090908, |
| "loss": 3.7361, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.5917642262216787, |
| "grad_norm": 0.32862237095832825, |
| "learning_rate": 0.0005692342657342657, |
| "loss": 3.7415, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.606325353835401, |
| "grad_norm": 0.34222903847694397, |
| "learning_rate": 0.0005690594405594405, |
| "loss": 3.7302, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6208864814491233, |
| "grad_norm": 0.3521084487438202, |
| "learning_rate": 0.0005688846153846153, |
| "loss": 3.7353, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6208864814491233, |
| "eval_accuracy": 0.34751599655249904, |
| "eval_loss": 3.7540175914764404, |
| "eval_runtime": 82.1431, |
| "eval_samples_per_second": 202.585, |
| "eval_steps_per_second": 12.673, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6354476090628456, |
| "grad_norm": 0.32190045714378357, |
| "learning_rate": 0.0005687097902097901, |
| "loss": 3.7463, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6500087366765683, |
| "grad_norm": 0.35038506984710693, |
| "learning_rate": 0.000568534965034965, |
| "loss": 3.7463, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6645698642902906, |
| "grad_norm": 0.32920852303504944, |
| "learning_rate": 0.0005683601398601398, |
| "loss": 3.7322, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.679130991904013, |
| "grad_norm": 0.34264495968818665, |
| "learning_rate": 0.0005681853146853146, |
| "loss": 3.7255, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.6936921195177357, |
| "grad_norm": 0.3350652754306793, |
| "learning_rate": 0.0005680104895104895, |
| "loss": 3.7382, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.708253247131458, |
| "grad_norm": 0.352579265832901, |
| "learning_rate": 0.0005678356643356643, |
| "loss": 3.7282, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.7228143747451803, |
| "grad_norm": 0.330732136964798, |
| "learning_rate": 0.0005676608391608391, |
| "loss": 3.7407, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.7373755023589026, |
| "grad_norm": 0.3589532971382141, |
| "learning_rate": 0.0005674860139860139, |
| "loss": 3.7369, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.751936629972625, |
| "grad_norm": 0.32775962352752686, |
| "learning_rate": 0.0005673111888111888, |
| "loss": 3.7366, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.7664977575863476, |
| "grad_norm": 0.32425662875175476, |
| "learning_rate": 0.0005671363636363635, |
| "loss": 3.7095, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.78105888520007, |
| "grad_norm": 0.3335295021533966, |
| "learning_rate": 0.0005669615384615384, |
| "loss": 3.7193, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7956200128137922, |
| "grad_norm": 0.30232927203178406, |
| "learning_rate": 0.0005667867132867132, |
| "loss": 3.7327, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.8101811404275145, |
| "grad_norm": 0.3307936191558838, |
| "learning_rate": 0.000566611888111888, |
| "loss": 3.7352, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.824742268041237, |
| "grad_norm": 0.3367597460746765, |
| "learning_rate": 0.0005664370629370628, |
| "loss": 3.7185, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8393033956549596, |
| "grad_norm": 0.32896873354911804, |
| "learning_rate": 0.0005662622377622377, |
| "loss": 3.7263, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.853864523268682, |
| "grad_norm": 0.34071600437164307, |
| "learning_rate": 0.0005660874125874125, |
| "loss": 3.7331, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.868425650882404, |
| "grad_norm": 0.32482579350471497, |
| "learning_rate": 0.0005659125874125873, |
| "loss": 3.7181, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.882986778496127, |
| "grad_norm": 0.3404524326324463, |
| "learning_rate": 0.0005657377622377622, |
| "loss": 3.7237, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.8975479061098492, |
| "grad_norm": 0.33561643958091736, |
| "learning_rate": 0.000565562937062937, |
| "loss": 3.7209, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.9121090337235715, |
| "grad_norm": 0.3168312907218933, |
| "learning_rate": 0.0005653881118881118, |
| "loss": 3.7216, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9121090337235715, |
| "eval_accuracy": 0.35001836291685673, |
| "eval_loss": 3.726482629776001, |
| "eval_runtime": 82.3859, |
| "eval_samples_per_second": 201.988, |
| "eval_steps_per_second": 12.636, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.926670161337294, |
| "grad_norm": 0.3269791901111603, |
| "learning_rate": 0.0005652132867132866, |
| "loss": 3.7298, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.941231288951016, |
| "grad_norm": 0.324239581823349, |
| "learning_rate": 0.0005650384615384615, |
| "loss": 3.7301, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.955792416564739, |
| "grad_norm": 0.31391772627830505, |
| "learning_rate": 0.0005648636363636363, |
| "loss": 3.7229, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.970353544178461, |
| "grad_norm": 0.3260219395160675, |
| "learning_rate": 0.0005646888111888111, |
| "loss": 3.7196, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.9849146717921835, |
| "grad_norm": 0.32373833656311035, |
| "learning_rate": 0.000564513986013986, |
| "loss": 3.7202, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9994757994059063, |
| "grad_norm": 0.3273918926715851, |
| "learning_rate": 0.0005643391608391607, |
| "loss": 3.7113, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0139786825091734, |
| "grad_norm": 0.33086708188056946, |
| "learning_rate": 0.0005641643356643355, |
| "loss": 3.6129, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0285398101228957, |
| "grad_norm": 0.3447768986225128, |
| "learning_rate": 0.0005639895104895105, |
| "loss": 3.6121, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.0431009377366185, |
| "grad_norm": 0.33584967255592346, |
| "learning_rate": 0.0005638146853146853, |
| "loss": 3.6127, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.057662065350341, |
| "grad_norm": 0.32875823974609375, |
| "learning_rate": 0.0005636398601398601, |
| "loss": 3.6041, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.072223192964063, |
| "grad_norm": 0.3671276271343231, |
| "learning_rate": 0.000563465034965035, |
| "loss": 3.6296, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0867843205777854, |
| "grad_norm": 0.32451075315475464, |
| "learning_rate": 0.0005632902097902098, |
| "loss": 3.6296, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.101345448191508, |
| "grad_norm": 0.32295992970466614, |
| "learning_rate": 0.0005631153846153846, |
| "loss": 3.6212, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.1159065758052304, |
| "grad_norm": 0.3424418568611145, |
| "learning_rate": 0.0005629405594405594, |
| "loss": 3.6273, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.1304677034189528, |
| "grad_norm": 0.32966750860214233, |
| "learning_rate": 0.0005627657342657343, |
| "loss": 3.6275, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.145028831032675, |
| "grad_norm": 0.3598492443561554, |
| "learning_rate": 0.0005625909090909091, |
| "loss": 3.6326, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1595899586463974, |
| "grad_norm": 0.3407542109489441, |
| "learning_rate": 0.0005624160839160839, |
| "loss": 3.6369, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.17415108626012, |
| "grad_norm": 0.3183291256427765, |
| "learning_rate": 0.0005622412587412587, |
| "loss": 3.6324, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1887122138738424, |
| "grad_norm": 0.37839338183403015, |
| "learning_rate": 0.0005620664335664336, |
| "loss": 3.626, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2032733414875647, |
| "grad_norm": 0.3264527916908264, |
| "learning_rate": 0.0005618916083916083, |
| "loss": 3.627, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2032733414875647, |
| "eval_accuracy": 0.3516285137820659, |
| "eval_loss": 3.714599370956421, |
| "eval_runtime": 82.2561, |
| "eval_samples_per_second": 202.307, |
| "eval_steps_per_second": 12.656, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.217834469101287, |
| "grad_norm": 0.3339904546737671, |
| "learning_rate": 0.0005617167832167832, |
| "loss": 3.6287, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2323955967150098, |
| "grad_norm": 0.3266938626766205, |
| "learning_rate": 0.000561541958041958, |
| "loss": 3.637, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.246956724328732, |
| "grad_norm": 0.31706535816192627, |
| "learning_rate": 0.0005613671328671328, |
| "loss": 3.633, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.2615178519424544, |
| "grad_norm": 0.3285302221775055, |
| "learning_rate": 0.0005611923076923077, |
| "loss": 3.6445, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2760789795561767, |
| "grad_norm": 0.33090338110923767, |
| "learning_rate": 0.0005610174825174825, |
| "loss": 3.6549, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.2906401071698994, |
| "grad_norm": 0.3252682387828827, |
| "learning_rate": 0.0005608426573426573, |
| "loss": 3.6443, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.3052012347836217, |
| "grad_norm": 0.32692620158195496, |
| "learning_rate": 0.0005606678321678321, |
| "loss": 3.6474, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.319762362397344, |
| "grad_norm": 0.33046919107437134, |
| "learning_rate": 0.000560493006993007, |
| "loss": 3.6309, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3343234900110663, |
| "grad_norm": 0.3266463577747345, |
| "learning_rate": 0.0005603181818181818, |
| "loss": 3.6392, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.3488846176247886, |
| "grad_norm": 0.3450075685977936, |
| "learning_rate": 0.0005601433566433566, |
| "loss": 3.6473, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3634457452385114, |
| "grad_norm": 0.3198067545890808, |
| "learning_rate": 0.0005599685314685314, |
| "loss": 3.6422, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.3780068728522337, |
| "grad_norm": 0.33383065462112427, |
| "learning_rate": 0.0005597937062937063, |
| "loss": 3.6519, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.392568000465956, |
| "grad_norm": 0.3222882151603699, |
| "learning_rate": 0.0005596188811188811, |
| "loss": 3.6515, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4071291280796787, |
| "grad_norm": 0.3156638443470001, |
| "learning_rate": 0.0005594440559440559, |
| "loss": 3.65, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.421690255693401, |
| "grad_norm": 0.32944968342781067, |
| "learning_rate": 0.0005592692307692307, |
| "loss": 3.636, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.4362513833071233, |
| "grad_norm": 0.3216640055179596, |
| "learning_rate": 0.0005590944055944055, |
| "loss": 3.644, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4508125109208456, |
| "grad_norm": 0.33631327748298645, |
| "learning_rate": 0.0005589195804195803, |
| "loss": 3.6419, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.465373638534568, |
| "grad_norm": 0.31073594093322754, |
| "learning_rate": 0.0005587447552447552, |
| "loss": 3.6407, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4799347661482907, |
| "grad_norm": 0.31848210096359253, |
| "learning_rate": 0.00055856993006993, |
| "loss": 3.6375, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.494495893762013, |
| "grad_norm": 0.33654505014419556, |
| "learning_rate": 0.0005583951048951048, |
| "loss": 3.6338, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.494495893762013, |
| "eval_accuracy": 0.3533456787640834, |
| "eval_loss": 3.69667911529541, |
| "eval_runtime": 82.2157, |
| "eval_samples_per_second": 202.407, |
| "eval_steps_per_second": 12.662, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.5090570213757353, |
| "grad_norm": 0.3209030330181122, |
| "learning_rate": 0.0005582202797202797, |
| "loss": 3.6441, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.523618148989458, |
| "grad_norm": 0.32589489221572876, |
| "learning_rate": 0.0005580454545454545, |
| "loss": 3.6467, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.53817927660318, |
| "grad_norm": 0.3204765021800995, |
| "learning_rate": 0.0005578706293706293, |
| "loss": 3.6442, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.5527404042169026, |
| "grad_norm": 0.34807610511779785, |
| "learning_rate": 0.0005576958041958041, |
| "loss": 3.6419, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.567301531830625, |
| "grad_norm": 0.33871540427207947, |
| "learning_rate": 0.000557520979020979, |
| "loss": 3.6413, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5818626594443472, |
| "grad_norm": 0.3269228935241699, |
| "learning_rate": 0.0005573461538461538, |
| "loss": 3.6419, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.59642378705807, |
| "grad_norm": 0.31153029203414917, |
| "learning_rate": 0.0005571713286713286, |
| "loss": 3.6481, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6109849146717923, |
| "grad_norm": 0.3126721978187561, |
| "learning_rate": 0.0005569965034965034, |
| "loss": 3.6469, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.6255460422855146, |
| "grad_norm": 0.3358977437019348, |
| "learning_rate": 0.0005568216783216783, |
| "loss": 3.6568, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.640107169899237, |
| "grad_norm": 0.3345838189125061, |
| "learning_rate": 0.000556646853146853, |
| "loss": 3.6407, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.654668297512959, |
| "grad_norm": 0.35589200258255005, |
| "learning_rate": 0.0005564720279720279, |
| "loss": 3.6322, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.669229425126682, |
| "grad_norm": 0.3354792892932892, |
| "learning_rate": 0.0005562972027972027, |
| "loss": 3.6539, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.6837905527404042, |
| "grad_norm": 0.33764365315437317, |
| "learning_rate": 0.0005561223776223775, |
| "loss": 3.6387, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.6983516803541265, |
| "grad_norm": 0.33043789863586426, |
| "learning_rate": 0.0005559475524475524, |
| "loss": 3.6299, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7129128079678493, |
| "grad_norm": 0.3240761458873749, |
| "learning_rate": 0.0005557727272727272, |
| "loss": 3.6354, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7274739355815716, |
| "grad_norm": 0.31605395674705505, |
| "learning_rate": 0.000555597902097902, |
| "loss": 3.6449, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.742035063195294, |
| "grad_norm": 0.33991727232933044, |
| "learning_rate": 0.0005554230769230768, |
| "loss": 3.6499, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.756596190809016, |
| "grad_norm": 0.3339294493198395, |
| "learning_rate": 0.0005552482517482517, |
| "loss": 3.6464, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.7711573184227385, |
| "grad_norm": 0.3151257336139679, |
| "learning_rate": 0.0005550734265734265, |
| "loss": 3.6547, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7857184460364612, |
| "grad_norm": 0.32810837030410767, |
| "learning_rate": 0.0005548986013986013, |
| "loss": 3.6322, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7857184460364612, |
| "eval_accuracy": 0.354754266776315, |
| "eval_loss": 3.678460121154785, |
| "eval_runtime": 82.1181, |
| "eval_samples_per_second": 202.647, |
| "eval_steps_per_second": 12.677, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.8002795736501835, |
| "grad_norm": 0.3307671546936035, |
| "learning_rate": 0.0005547237762237761, |
| "loss": 3.6388, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.814840701263906, |
| "grad_norm": 0.32820889353752136, |
| "learning_rate": 0.000554548951048951, |
| "loss": 3.6484, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.829401828877628, |
| "grad_norm": 0.3053266704082489, |
| "learning_rate": 0.0005543741258741258, |
| "loss": 3.6349, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8439629564913504, |
| "grad_norm": 0.3349308371543884, |
| "learning_rate": 0.0005541993006993006, |
| "loss": 3.6343, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.858524084105073, |
| "grad_norm": 0.3266512155532837, |
| "learning_rate": 0.0005540244755244756, |
| "loss": 3.6431, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.8730852117187955, |
| "grad_norm": 0.30198171734809875, |
| "learning_rate": 0.0005538496503496502, |
| "loss": 3.6319, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.887646339332518, |
| "grad_norm": 0.3381938636302948, |
| "learning_rate": 0.0005536748251748252, |
| "loss": 3.6404, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9022074669462405, |
| "grad_norm": 0.30284807085990906, |
| "learning_rate": 0.0005535, |
| "loss": 3.6248, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.916768594559963, |
| "grad_norm": 0.3528636693954468, |
| "learning_rate": 0.0005533251748251748, |
| "loss": 3.6486, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.931329722173685, |
| "grad_norm": 0.31746986508369446, |
| "learning_rate": 0.0005531503496503496, |
| "loss": 3.6343, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.9458908497874075, |
| "grad_norm": 0.355306476354599, |
| "learning_rate": 0.0005529755244755245, |
| "loss": 3.6424, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.9604519774011298, |
| "grad_norm": 0.31772711873054504, |
| "learning_rate": 0.0005528006993006993, |
| "loss": 3.6432, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.9750131050148525, |
| "grad_norm": 0.3653605878353119, |
| "learning_rate": 0.0005526258741258741, |
| "loss": 3.6226, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.989574232628575, |
| "grad_norm": 0.3318699300289154, |
| "learning_rate": 0.0005524510489510489, |
| "loss": 3.6374, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.004077115731842, |
| "grad_norm": 0.3198544681072235, |
| "learning_rate": 0.0005522762237762238, |
| "loss": 3.6085, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.018638243345564, |
| "grad_norm": 0.3349785804748535, |
| "learning_rate": 0.0005521013986013986, |
| "loss": 3.5298, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.033199370959287, |
| "grad_norm": 0.31650543212890625, |
| "learning_rate": 0.0005519265734265734, |
| "loss": 3.508, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.04776049857301, |
| "grad_norm": 0.3458385169506073, |
| "learning_rate": 0.0005517517482517482, |
| "loss": 3.5203, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.062321626186732, |
| "grad_norm": 0.35154587030410767, |
| "learning_rate": 0.0005515769230769231, |
| "loss": 3.5392, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.076882753800454, |
| "grad_norm": 0.3365286588668823, |
| "learning_rate": 0.0005514020979020979, |
| "loss": 3.5546, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.076882753800454, |
| "eval_accuracy": 0.3559218966288319, |
| "eval_loss": 3.6718833446502686, |
| "eval_runtime": 82.1313, |
| "eval_samples_per_second": 202.615, |
| "eval_steps_per_second": 12.675, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.091443881414177, |
| "grad_norm": 0.33697375655174255, |
| "learning_rate": 0.0005512272727272727, |
| "loss": 3.5493, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.106005009027899, |
| "grad_norm": 0.31997400522232056, |
| "learning_rate": 0.0005510524475524475, |
| "loss": 3.5506, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.120566136641622, |
| "grad_norm": 0.3323104679584503, |
| "learning_rate": 0.0005508776223776223, |
| "loss": 3.5536, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.135127264255344, |
| "grad_norm": 0.3328968584537506, |
| "learning_rate": 0.0005507027972027972, |
| "loss": 3.542, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.149688391869066, |
| "grad_norm": 0.3348419666290283, |
| "learning_rate": 0.000550527972027972, |
| "loss": 3.5587, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.164249519482789, |
| "grad_norm": 0.32070958614349365, |
| "learning_rate": 0.0005503531468531468, |
| "loss": 3.5591, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.178810647096511, |
| "grad_norm": 0.3369140028953552, |
| "learning_rate": 0.0005501783216783216, |
| "loss": 3.554, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.193371774710234, |
| "grad_norm": 0.36318278312683105, |
| "learning_rate": 0.0005500034965034965, |
| "loss": 3.5557, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.207932902323956, |
| "grad_norm": 0.3334214389324188, |
| "learning_rate": 0.0005498286713286713, |
| "loss": 3.5603, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.222494029937678, |
| "grad_norm": 0.327919065952301, |
| "learning_rate": 0.0005496538461538461, |
| "loss": 3.5431, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.237055157551401, |
| "grad_norm": 0.3220844864845276, |
| "learning_rate": 0.0005494790209790209, |
| "loss": 3.562, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.251616285165123, |
| "grad_norm": 0.36172279715538025, |
| "learning_rate": 0.0005493041958041958, |
| "loss": 3.5743, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.266177412778846, |
| "grad_norm": 0.3367713689804077, |
| "learning_rate": 0.0005491293706293706, |
| "loss": 3.5628, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.280738540392568, |
| "grad_norm": 0.3379133641719818, |
| "learning_rate": 0.0005489545454545454, |
| "loss": 3.5702, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.29529966800629, |
| "grad_norm": 0.323989599943161, |
| "learning_rate": 0.0005487797202797203, |
| "loss": 3.5581, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.309860795620013, |
| "grad_norm": 0.31686824560165405, |
| "learning_rate": 0.000548604895104895, |
| "loss": 3.5601, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.324421923233735, |
| "grad_norm": 0.3622470498085022, |
| "learning_rate": 0.0005484300699300699, |
| "loss": 3.5723, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.338983050847458, |
| "grad_norm": 0.3448304533958435, |
| "learning_rate": 0.0005482552447552447, |
| "loss": 3.5531, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.35354417846118, |
| "grad_norm": 0.35602378845214844, |
| "learning_rate": 0.0005480804195804195, |
| "loss": 3.5845, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.368105306074902, |
| "grad_norm": 0.3290434181690216, |
| "learning_rate": 0.0005479055944055943, |
| "loss": 3.5751, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.368105306074902, |
| "eval_accuracy": 0.35690560331795507, |
| "eval_loss": 3.662770986557007, |
| "eval_runtime": 82.2621, |
| "eval_samples_per_second": 202.292, |
| "eval_steps_per_second": 12.655, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.382666433688625, |
| "grad_norm": 0.33140048384666443, |
| "learning_rate": 0.0005477307692307692, |
| "loss": 3.5631, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.397227561302348, |
| "grad_norm": 0.33117520809173584, |
| "learning_rate": 0.000547555944055944, |
| "loss": 3.5736, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.41178868891607, |
| "grad_norm": 0.34096455574035645, |
| "learning_rate": 0.0005473811188811188, |
| "loss": 3.5631, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.426349816529792, |
| "grad_norm": 0.33980458974838257, |
| "learning_rate": 0.0005472062937062936, |
| "loss": 3.5684, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.440910944143514, |
| "grad_norm": 0.31594201922416687, |
| "learning_rate": 0.0005470314685314685, |
| "loss": 3.569, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.455472071757237, |
| "grad_norm": 0.3251492381095886, |
| "learning_rate": 0.0005468566433566433, |
| "loss": 3.5653, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.47003319937096, |
| "grad_norm": 0.32946985960006714, |
| "learning_rate": 0.0005466818181818181, |
| "loss": 3.569, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.4845943269846815, |
| "grad_norm": 0.3279535174369812, |
| "learning_rate": 0.000546506993006993, |
| "loss": 3.5741, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.499155454598404, |
| "grad_norm": 0.3351619243621826, |
| "learning_rate": 0.0005463321678321678, |
| "loss": 3.5828, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.513716582212126, |
| "grad_norm": 0.31456589698791504, |
| "learning_rate": 0.0005461573426573426, |
| "loss": 3.5744, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.528277709825849, |
| "grad_norm": 0.3224773406982422, |
| "learning_rate": 0.0005459825174825174, |
| "loss": 3.5781, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.542838837439572, |
| "grad_norm": 0.3278660774230957, |
| "learning_rate": 0.0005458076923076922, |
| "loss": 3.5837, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.5573999650532935, |
| "grad_norm": 0.3417823612689972, |
| "learning_rate": 0.000545632867132867, |
| "loss": 3.5788, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.571961092667016, |
| "grad_norm": 0.3384145200252533, |
| "learning_rate": 0.0005454580419580419, |
| "loss": 3.576, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.586522220280738, |
| "grad_norm": 0.3389778435230255, |
| "learning_rate": 0.0005452832167832167, |
| "loss": 3.564, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.601083347894461, |
| "grad_norm": 0.3455224931240082, |
| "learning_rate": 0.0005451083916083915, |
| "loss": 3.5728, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.615644475508184, |
| "grad_norm": 0.31277620792388916, |
| "learning_rate": 0.0005449335664335663, |
| "loss": 3.5646, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.630205603121905, |
| "grad_norm": 0.3417457640171051, |
| "learning_rate": 0.0005447587412587412, |
| "loss": 3.5856, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.644766730735628, |
| "grad_norm": 0.34152984619140625, |
| "learning_rate": 0.000544583916083916, |
| "loss": 3.5831, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.659327858349351, |
| "grad_norm": 0.3237427771091461, |
| "learning_rate": 0.0005444090909090908, |
| "loss": 3.5864, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.659327858349351, |
| "eval_accuracy": 0.35829913879507513, |
| "eval_loss": 3.647592544555664, |
| "eval_runtime": 82.1333, |
| "eval_samples_per_second": 202.61, |
| "eval_steps_per_second": 12.675, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.673888985963073, |
| "grad_norm": 0.3205546736717224, |
| "learning_rate": 0.0005442342657342657, |
| "loss": 3.5749, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.6884501135767955, |
| "grad_norm": 0.32711103558540344, |
| "learning_rate": 0.0005440594405594405, |
| "loss": 3.5744, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.703011241190518, |
| "grad_norm": 0.338239461183548, |
| "learning_rate": 0.0005438846153846153, |
| "loss": 3.5699, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.71757236880424, |
| "grad_norm": 0.32012733817100525, |
| "learning_rate": 0.0005437097902097901, |
| "loss": 3.5682, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.732133496417963, |
| "grad_norm": 0.32086509466171265, |
| "learning_rate": 0.0005435349650349651, |
| "loss": 3.5795, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.746694624031685, |
| "grad_norm": 0.32381826639175415, |
| "learning_rate": 0.0005433601398601397, |
| "loss": 3.5793, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.7612557516454075, |
| "grad_norm": 0.3108307123184204, |
| "learning_rate": 0.0005431853146853147, |
| "loss": 3.584, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.77581687925913, |
| "grad_norm": 0.3144691288471222, |
| "learning_rate": 0.0005430104895104895, |
| "loss": 3.5766, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.790378006872852, |
| "grad_norm": 0.3357386887073517, |
| "learning_rate": 0.0005428356643356643, |
| "loss": 3.5776, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.804939134486575, |
| "grad_norm": 0.32278338074684143, |
| "learning_rate": 0.0005426608391608391, |
| "loss": 3.5865, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.819500262100297, |
| "grad_norm": 0.32999950647354126, |
| "learning_rate": 0.000542486013986014, |
| "loss": 3.5782, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.834061389714019, |
| "grad_norm": 0.33030134439468384, |
| "learning_rate": 0.0005423111888111888, |
| "loss": 3.5762, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.848622517327742, |
| "grad_norm": 0.30417221784591675, |
| "learning_rate": 0.0005421363636363636, |
| "loss": 3.5791, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.863183644941464, |
| "grad_norm": 0.3387216031551361, |
| "learning_rate": 0.0005419615384615385, |
| "loss": 3.5857, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.877744772555187, |
| "grad_norm": 0.3328517973423004, |
| "learning_rate": 0.0005417867132867133, |
| "loss": 3.563, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.892305900168909, |
| "grad_norm": 0.3243357539176941, |
| "learning_rate": 0.0005416118881118881, |
| "loss": 3.582, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.906867027782631, |
| "grad_norm": 0.3433445394039154, |
| "learning_rate": 0.0005414370629370629, |
| "loss": 3.5811, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.921428155396354, |
| "grad_norm": 0.3283991813659668, |
| "learning_rate": 0.0005412622377622378, |
| "loss": 3.5731, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.935989283010076, |
| "grad_norm": 0.31490233540534973, |
| "learning_rate": 0.0005410874125874126, |
| "loss": 3.5758, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.950550410623799, |
| "grad_norm": 0.3135548532009125, |
| "learning_rate": 0.0005409125874125874, |
| "loss": 3.5727, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.950550410623799, |
| "eval_accuracy": 0.35975264921678013, |
| "eval_loss": 3.6343069076538086, |
| "eval_runtime": 82.2287, |
| "eval_samples_per_second": 202.375, |
| "eval_steps_per_second": 12.66, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.9651115382375215, |
| "grad_norm": 0.3001994788646698, |
| "learning_rate": 0.0005407377622377622, |
| "loss": 3.5755, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.979672665851243, |
| "grad_norm": 0.33647260069847107, |
| "learning_rate": 0.000540562937062937, |
| "loss": 3.5727, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.994233793464966, |
| "grad_norm": 0.3387227952480316, |
| "learning_rate": 0.0005403881118881118, |
| "loss": 3.5701, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.008736676568233, |
| "grad_norm": 0.3355279564857483, |
| "learning_rate": 0.0005402132867132867, |
| "loss": 3.5209, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.023297804181956, |
| "grad_norm": 0.3206043243408203, |
| "learning_rate": 0.0005400384615384615, |
| "loss": 3.4666, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.037858931795678, |
| "grad_norm": 0.3168812394142151, |
| "learning_rate": 0.0005398636363636363, |
| "loss": 3.4718, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.052420059409401, |
| "grad_norm": 0.3269653022289276, |
| "learning_rate": 0.0005396888111888111, |
| "loss": 3.4704, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.066981187023123, |
| "grad_norm": 0.3134930729866028, |
| "learning_rate": 0.000539513986013986, |
| "loss": 3.4874, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.081542314636845, |
| "grad_norm": 0.31965067982673645, |
| "learning_rate": 0.0005393391608391608, |
| "loss": 3.4852, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.096103442250568, |
| "grad_norm": 0.3426007926464081, |
| "learning_rate": 0.0005391643356643356, |
| "loss": 3.4822, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.110664569864291, |
| "grad_norm": 0.31415146589279175, |
| "learning_rate": 0.0005389895104895105, |
| "loss": 3.4899, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.125225697478013, |
| "grad_norm": 0.3285400867462158, |
| "learning_rate": 0.0005388146853146853, |
| "loss": 3.4832, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.139786825091735, |
| "grad_norm": 0.3207187056541443, |
| "learning_rate": 0.0005386398601398601, |
| "loss": 3.4931, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.154347952705457, |
| "grad_norm": 0.3365948498249054, |
| "learning_rate": 0.0005384650349650349, |
| "loss": 3.4906, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.16890908031918, |
| "grad_norm": 0.35180333256721497, |
| "learning_rate": 0.0005382902097902098, |
| "loss": 3.4897, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.183470207932903, |
| "grad_norm": 0.32608678936958313, |
| "learning_rate": 0.0005381153846153845, |
| "loss": 3.5102, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.1980313355466246, |
| "grad_norm": 0.35791006684303284, |
| "learning_rate": 0.0005379405594405594, |
| "loss": 3.5052, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.212592463160347, |
| "grad_norm": 0.33370959758758545, |
| "learning_rate": 0.0005377657342657342, |
| "loss": 3.4954, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.227153590774069, |
| "grad_norm": 0.33202075958251953, |
| "learning_rate": 0.000537590909090909, |
| "loss": 3.4967, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.241714718387792, |
| "grad_norm": 0.3159421682357788, |
| "learning_rate": 0.0005374160839160838, |
| "loss": 3.5001, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.241714718387792, |
| "eval_accuracy": 0.35993810115327113, |
| "eval_loss": 3.6371030807495117, |
| "eval_runtime": 82.2306, |
| "eval_samples_per_second": 202.37, |
| "eval_steps_per_second": 12.66, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.256275846001515, |
| "grad_norm": 0.3204033672809601, |
| "learning_rate": 0.0005372412587412587, |
| "loss": 3.5113, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.2708369736152365, |
| "grad_norm": 0.3157881796360016, |
| "learning_rate": 0.0005370664335664335, |
| "loss": 3.5039, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.285398101228959, |
| "grad_norm": 0.3386020064353943, |
| "learning_rate": 0.0005368916083916083, |
| "loss": 3.5148, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.299959228842681, |
| "grad_norm": 0.3367830216884613, |
| "learning_rate": 0.0005367167832167832, |
| "loss": 3.5045, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.314520356456404, |
| "grad_norm": 0.329608678817749, |
| "learning_rate": 0.000536541958041958, |
| "loss": 3.5052, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.329081484070127, |
| "grad_norm": 0.3417602479457855, |
| "learning_rate": 0.0005363671328671328, |
| "loss": 3.502, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.3436426116838485, |
| "grad_norm": 0.32132115960121155, |
| "learning_rate": 0.0005361923076923076, |
| "loss": 3.5123, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.358203739297571, |
| "grad_norm": 0.35218629240989685, |
| "learning_rate": 0.0005360174825174825, |
| "loss": 3.52, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.372764866911294, |
| "grad_norm": 0.3243536055088043, |
| "learning_rate": 0.0005358426573426573, |
| "loss": 3.5062, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.387325994525016, |
| "grad_norm": 0.3235512673854828, |
| "learning_rate": 0.0005356678321678321, |
| "loss": 3.5243, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.401887122138739, |
| "grad_norm": 0.32949063181877136, |
| "learning_rate": 0.0005354930069930069, |
| "loss": 3.5143, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.41644824975246, |
| "grad_norm": 0.3332709074020386, |
| "learning_rate": 0.0005353181818181817, |
| "loss": 3.5149, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.431009377366183, |
| "grad_norm": 0.3282678723335266, |
| "learning_rate": 0.0005351433566433565, |
| "loss": 3.532, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.445570504979906, |
| "grad_norm": 0.3131265640258789, |
| "learning_rate": 0.0005349685314685314, |
| "loss": 3.513, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.460131632593628, |
| "grad_norm": 0.3347927927970886, |
| "learning_rate": 0.0005347937062937062, |
| "loss": 3.5088, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.4746927602073505, |
| "grad_norm": 0.32753896713256836, |
| "learning_rate": 0.000534618881118881, |
| "loss": 3.5197, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.489253887821073, |
| "grad_norm": 0.3317394554615021, |
| "learning_rate": 0.0005344440559440559, |
| "loss": 3.5182, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.503815015434795, |
| "grad_norm": 0.3506125509738922, |
| "learning_rate": 0.0005342692307692307, |
| "loss": 3.5075, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.518376143048518, |
| "grad_norm": 0.3561033010482788, |
| "learning_rate": 0.0005340944055944055, |
| "loss": 3.5151, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.53293727066224, |
| "grad_norm": 0.3183492422103882, |
| "learning_rate": 0.0005339195804195803, |
| "loss": 3.5292, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.53293727066224, |
| "eval_accuracy": 0.3608853524839211, |
| "eval_loss": 3.627208948135376, |
| "eval_runtime": 81.8958, |
| "eval_samples_per_second": 203.197, |
| "eval_steps_per_second": 12.711, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5474983982759625, |
| "grad_norm": 0.3402908444404602, |
| "learning_rate": 0.0005337447552447552, |
| "loss": 3.5217, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.562059525889685, |
| "grad_norm": 0.3298224210739136, |
| "learning_rate": 0.00053356993006993, |
| "loss": 3.5292, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.576620653503407, |
| "grad_norm": 0.3130464553833008, |
| "learning_rate": 0.0005333951048951048, |
| "loss": 3.5315, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.59118178111713, |
| "grad_norm": 0.31813472509384155, |
| "learning_rate": 0.0005332202797202796, |
| "loss": 3.5357, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.605742908730852, |
| "grad_norm": 0.3343231976032257, |
| "learning_rate": 0.0005330454545454546, |
| "loss": 3.5224, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.620304036344574, |
| "grad_norm": 0.358654260635376, |
| "learning_rate": 0.0005328706293706292, |
| "loss": 3.5102, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.634865163958297, |
| "grad_norm": 0.3118330240249634, |
| "learning_rate": 0.0005326958041958042, |
| "loss": 3.5199, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.649426291572019, |
| "grad_norm": 0.32270094752311707, |
| "learning_rate": 0.000532520979020979, |
| "loss": 3.509, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.663987419185742, |
| "grad_norm": 0.32599106431007385, |
| "learning_rate": 0.0005323461538461538, |
| "loss": 3.5214, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.6785485467994645, |
| "grad_norm": 0.35585257411003113, |
| "learning_rate": 0.0005321713286713287, |
| "loss": 3.5204, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.693109674413186, |
| "grad_norm": 0.302842378616333, |
| "learning_rate": 0.0005319965034965035, |
| "loss": 3.5284, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.707670802026909, |
| "grad_norm": 0.32723668217658997, |
| "learning_rate": 0.0005318216783216783, |
| "loss": 3.519, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.722231929640631, |
| "grad_norm": 0.3255930244922638, |
| "learning_rate": 0.0005316468531468531, |
| "loss": 3.5349, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.736793057254354, |
| "grad_norm": 0.32024484872817993, |
| "learning_rate": 0.000531472027972028, |
| "loss": 3.5368, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.7513541848680765, |
| "grad_norm": 0.3348290026187897, |
| "learning_rate": 0.0005312972027972028, |
| "loss": 3.5243, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.765915312481798, |
| "grad_norm": 0.33889368176460266, |
| "learning_rate": 0.0005311223776223776, |
| "loss": 3.5395, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.780476440095521, |
| "grad_norm": 0.3375495374202728, |
| "learning_rate": 0.0005309475524475524, |
| "loss": 3.5308, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.795037567709244, |
| "grad_norm": 0.3312002122402191, |
| "learning_rate": 0.0005307727272727273, |
| "loss": 3.5332, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.809598695322966, |
| "grad_norm": 0.33301153779029846, |
| "learning_rate": 0.0005305979020979021, |
| "loss": 3.5331, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.824159822936688, |
| "grad_norm": 0.32492154836654663, |
| "learning_rate": 0.0005304230769230769, |
| "loss": 3.5401, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.824159822936688, |
| "eval_accuracy": 0.3622630122404158, |
| "eval_loss": 3.6149392127990723, |
| "eval_runtime": 81.9732, |
| "eval_samples_per_second": 203.005, |
| "eval_steps_per_second": 12.699, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.83872095055041, |
| "grad_norm": 0.37921079993247986, |
| "learning_rate": 0.0005302482517482517, |
| "loss": 3.5361, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.853282078164133, |
| "grad_norm": 0.3262177109718323, |
| "learning_rate": 0.0005300734265734265, |
| "loss": 3.5317, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.867843205777856, |
| "grad_norm": 0.31618988513946533, |
| "learning_rate": 0.0005298986013986013, |
| "loss": 3.5363, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.882404333391578, |
| "grad_norm": 0.31392234563827515, |
| "learning_rate": 0.0005297237762237762, |
| "loss": 3.5295, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.8969654610053, |
| "grad_norm": 0.3153601586818695, |
| "learning_rate": 0.000529548951048951, |
| "loss": 3.5233, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.911526588619022, |
| "grad_norm": 0.3152620196342468, |
| "learning_rate": 0.0005293741258741258, |
| "loss": 3.5326, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.926087716232745, |
| "grad_norm": 0.35128146409988403, |
| "learning_rate": 0.0005291993006993007, |
| "loss": 3.5331, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.940648843846468, |
| "grad_norm": 0.3338232934474945, |
| "learning_rate": 0.0005290244755244755, |
| "loss": 3.5358, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.95520997146019, |
| "grad_norm": 0.3489064872264862, |
| "learning_rate": 0.0005288496503496503, |
| "loss": 3.5344, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.969771099073912, |
| "grad_norm": 0.3025525212287903, |
| "learning_rate": 0.0005286748251748251, |
| "loss": 3.5383, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.984332226687634, |
| "grad_norm": 0.3072284162044525, |
| "learning_rate": 0.0005285, |
| "loss": 3.5326, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.998893354301357, |
| "grad_norm": 0.32312294840812683, |
| "learning_rate": 0.0005283251748251748, |
| "loss": 3.5264, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.013396237404625, |
| "grad_norm": 0.3299575448036194, |
| "learning_rate": 0.0005281503496503496, |
| "loss": 3.4191, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.027957365018347, |
| "grad_norm": 0.34084299206733704, |
| "learning_rate": 0.0005279755244755244, |
| "loss": 3.415, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.04251849263207, |
| "grad_norm": 0.32552212476730347, |
| "learning_rate": 0.0005278006993006993, |
| "loss": 3.4189, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.0570796202457915, |
| "grad_norm": 0.33539167046546936, |
| "learning_rate": 0.000527625874125874, |
| "loss": 3.4253, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.071640747859514, |
| "grad_norm": 0.32660338282585144, |
| "learning_rate": 0.0005274510489510489, |
| "loss": 3.422, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.086201875473237, |
| "grad_norm": 0.37775811553001404, |
| "learning_rate": 0.0005272762237762238, |
| "loss": 3.441, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.100763003086959, |
| "grad_norm": 0.32690706849098206, |
| "learning_rate": 0.0005271013986013985, |
| "loss": 3.447, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.115324130700682, |
| "grad_norm": 0.3458637595176697, |
| "learning_rate": 0.0005269265734265734, |
| "loss": 3.4454, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.115324130700682, |
| "eval_accuracy": 0.3623287494835981, |
| "eval_loss": 3.6226019859313965, |
| "eval_runtime": 82.1462, |
| "eval_samples_per_second": 202.578, |
| "eval_steps_per_second": 12.673, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.1298852583144035, |
| "grad_norm": 0.3165479600429535, |
| "learning_rate": 0.0005267517482517482, |
| "loss": 3.4441, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.144446385928126, |
| "grad_norm": 0.3269355595111847, |
| "learning_rate": 0.000526576923076923, |
| "loss": 3.4504, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.159007513541849, |
| "grad_norm": 0.339388370513916, |
| "learning_rate": 0.0005264020979020978, |
| "loss": 3.4574, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.173568641155571, |
| "grad_norm": 0.31941908597946167, |
| "learning_rate": 0.0005262272727272727, |
| "loss": 3.4463, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.1881297687692935, |
| "grad_norm": 0.3259830176830292, |
| "learning_rate": 0.0005260524475524475, |
| "loss": 3.4519, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.202690896383016, |
| "grad_norm": 0.3307611644268036, |
| "learning_rate": 0.0005258776223776223, |
| "loss": 3.4586, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.217252023996738, |
| "grad_norm": 0.3359248638153076, |
| "learning_rate": 0.0005257027972027971, |
| "loss": 3.4629, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.231813151610461, |
| "grad_norm": 0.32742172479629517, |
| "learning_rate": 0.000525527972027972, |
| "loss": 3.4539, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.246374279224183, |
| "grad_norm": 0.3473532795906067, |
| "learning_rate": 0.0005253531468531468, |
| "loss": 3.4552, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.2609354068379055, |
| "grad_norm": 0.3250901699066162, |
| "learning_rate": 0.0005251783216783216, |
| "loss": 3.4814, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.275496534451628, |
| "grad_norm": 0.33140328526496887, |
| "learning_rate": 0.0005250034965034965, |
| "loss": 3.4572, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.29005766206535, |
| "grad_norm": 0.3557279109954834, |
| "learning_rate": 0.0005248286713286712, |
| "loss": 3.4626, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.304618789679073, |
| "grad_norm": 0.3310116231441498, |
| "learning_rate": 0.0005246538461538461, |
| "loss": 3.4784, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.319179917292795, |
| "grad_norm": 0.33806267380714417, |
| "learning_rate": 0.0005244790209790209, |
| "loss": 3.4681, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.3337410449065175, |
| "grad_norm": 0.32359644770622253, |
| "learning_rate": 0.0005243041958041957, |
| "loss": 3.4674, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.34830217252024, |
| "grad_norm": 0.3346199691295624, |
| "learning_rate": 0.0005241293706293705, |
| "loss": 3.4849, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.362863300133962, |
| "grad_norm": 0.3482149839401245, |
| "learning_rate": 0.0005239545454545454, |
| "loss": 3.4695, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.377424427747685, |
| "grad_norm": 0.3360421061515808, |
| "learning_rate": 0.0005237797202797202, |
| "loss": 3.468, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.391985555361408, |
| "grad_norm": 0.3140206038951874, |
| "learning_rate": 0.000523604895104895, |
| "loss": 3.4804, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.406546682975129, |
| "grad_norm": 0.3536447584629059, |
| "learning_rate": 0.0005234300699300698, |
| "loss": 3.4806, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.406546682975129, |
| "eval_accuracy": 0.36294637381489214, |
| "eval_loss": 3.611290454864502, |
| "eval_runtime": 82.0499, |
| "eval_samples_per_second": 202.815, |
| "eval_steps_per_second": 12.687, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.421107810588852, |
| "grad_norm": 0.3595673739910126, |
| "learning_rate": 0.0005232552447552447, |
| "loss": 3.4658, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.435668938202574, |
| "grad_norm": 0.3298998773097992, |
| "learning_rate": 0.0005230804195804195, |
| "loss": 3.4947, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.450230065816297, |
| "grad_norm": 0.31251832842826843, |
| "learning_rate": 0.0005229055944055943, |
| "loss": 3.4695, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.4647911934300195, |
| "grad_norm": 0.3445403277873993, |
| "learning_rate": 0.0005227307692307691, |
| "loss": 3.4782, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.479352321043741, |
| "grad_norm": 0.3203848898410797, |
| "learning_rate": 0.0005225559440559441, |
| "loss": 3.4703, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.493913448657464, |
| "grad_norm": 0.3400220572948456, |
| "learning_rate": 0.0005223811188811189, |
| "loss": 3.4886, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.508474576271187, |
| "grad_norm": 0.33314988017082214, |
| "learning_rate": 0.0005222062937062937, |
| "loss": 3.4735, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.523035703884909, |
| "grad_norm": 0.32038331031799316, |
| "learning_rate": 0.0005220314685314686, |
| "loss": 3.4823, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.5375968314986315, |
| "grad_norm": 0.34869644045829773, |
| "learning_rate": 0.0005218566433566433, |
| "loss": 3.4659, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.552157959112353, |
| "grad_norm": 0.33996596932411194, |
| "learning_rate": 0.0005216818181818182, |
| "loss": 3.4945, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.566719086726076, |
| "grad_norm": 0.32487791776657104, |
| "learning_rate": 0.000521506993006993, |
| "loss": 3.4714, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.581280214339799, |
| "grad_norm": 0.3136010766029358, |
| "learning_rate": 0.0005213321678321678, |
| "loss": 3.4963, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.595841341953521, |
| "grad_norm": 0.32845839858055115, |
| "learning_rate": 0.0005211573426573426, |
| "loss": 3.4899, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.610402469567243, |
| "grad_norm": 0.3246721029281616, |
| "learning_rate": 0.0005209825174825175, |
| "loss": 3.48, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.624963597180965, |
| "grad_norm": 0.3148110508918762, |
| "learning_rate": 0.0005208076923076923, |
| "loss": 3.4778, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.639524724794688, |
| "grad_norm": 0.33099523186683655, |
| "learning_rate": 0.0005206328671328671, |
| "loss": 3.4915, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.654085852408411, |
| "grad_norm": 0.34380704164505005, |
| "learning_rate": 0.0005204580419580419, |
| "loss": 3.4807, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.668646980022133, |
| "grad_norm": 0.3488045930862427, |
| "learning_rate": 0.0005202832167832168, |
| "loss": 3.4862, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.683208107635855, |
| "grad_norm": 0.3439943790435791, |
| "learning_rate": 0.0005201083916083916, |
| "loss": 3.4963, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.697769235249577, |
| "grad_norm": 0.3523930013179779, |
| "learning_rate": 0.0005199335664335664, |
| "loss": 3.4971, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.697769235249577, |
| "eval_accuracy": 0.3635006128616151, |
| "eval_loss": 3.604053020477295, |
| "eval_runtime": 81.988, |
| "eval_samples_per_second": 202.969, |
| "eval_steps_per_second": 12.697, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.7123303628633, |
| "grad_norm": 0.313567191362381, |
| "learning_rate": 0.0005197587412587413, |
| "loss": 3.4776, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.726891490477023, |
| "grad_norm": 0.35864803194999695, |
| "learning_rate": 0.0005195839160839161, |
| "loss": 3.4847, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.741452618090745, |
| "grad_norm": 0.35224711894989014, |
| "learning_rate": 0.0005194090909090909, |
| "loss": 3.4826, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.756013745704467, |
| "grad_norm": 0.33748653531074524, |
| "learning_rate": 0.0005192342657342657, |
| "loss": 3.4924, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.77057487331819, |
| "grad_norm": 0.3664532005786896, |
| "learning_rate": 0.0005190594405594405, |
| "loss": 3.4889, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.785136000931912, |
| "grad_norm": 0.3239114284515381, |
| "learning_rate": 0.0005188846153846153, |
| "loss": 3.4843, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.799697128545635, |
| "grad_norm": 0.3121638298034668, |
| "learning_rate": 0.0005187097902097902, |
| "loss": 3.4874, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.814258256159357, |
| "grad_norm": 0.3163391351699829, |
| "learning_rate": 0.000518534965034965, |
| "loss": 3.491, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.828819383773079, |
| "grad_norm": 0.3387483060359955, |
| "learning_rate": 0.0005183601398601398, |
| "loss": 3.4956, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.843380511386802, |
| "grad_norm": 0.33326035737991333, |
| "learning_rate": 0.0005181853146853146, |
| "loss": 3.4972, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.857941639000524, |
| "grad_norm": 0.3814000189304352, |
| "learning_rate": 0.0005180104895104895, |
| "loss": 3.4951, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.872502766614247, |
| "grad_norm": 0.32710936665534973, |
| "learning_rate": 0.0005178356643356643, |
| "loss": 3.4867, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.887063894227969, |
| "grad_norm": 0.31053856015205383, |
| "learning_rate": 0.0005176608391608391, |
| "loss": 3.5002, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.901625021841691, |
| "grad_norm": 0.3194434642791748, |
| "learning_rate": 0.000517486013986014, |
| "loss": 3.4829, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.916186149455414, |
| "grad_norm": 0.33319905400276184, |
| "learning_rate": 0.0005173111888111888, |
| "loss": 3.4942, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.930747277069136, |
| "grad_norm": 0.3246103525161743, |
| "learning_rate": 0.0005171363636363636, |
| "loss": 3.4926, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.945308404682859, |
| "grad_norm": 0.3280305862426758, |
| "learning_rate": 0.0005169615384615384, |
| "loss": 3.4881, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.959869532296581, |
| "grad_norm": 0.3515370190143585, |
| "learning_rate": 0.0005167867132867133, |
| "loss": 3.499, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.974430659910303, |
| "grad_norm": 0.34678447246551514, |
| "learning_rate": 0.000516611888111888, |
| "loss": 3.4892, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.988991787524026, |
| "grad_norm": 0.3294186294078827, |
| "learning_rate": 0.0005164370629370629, |
| "loss": 3.4885, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.988991787524026, |
| "eval_accuracy": 0.36452218608437814, |
| "eval_loss": 3.5930957794189453, |
| "eval_runtime": 82.2209, |
| "eval_samples_per_second": 202.394, |
| "eval_steps_per_second": 12.661, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.003494670627293, |
| "grad_norm": 0.35472336411476135, |
| "learning_rate": 0.0005162622377622377, |
| "loss": 3.4703, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.018055798241016, |
| "grad_norm": 0.3583216667175293, |
| "learning_rate": 0.0005160874125874125, |
| "loss": 3.3914, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.032616925854738, |
| "grad_norm": 0.34145647287368774, |
| "learning_rate": 0.0005159125874125873, |
| "loss": 3.3913, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.0471780534684605, |
| "grad_norm": 0.3234576880931854, |
| "learning_rate": 0.0005157377622377622, |
| "loss": 3.3954, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.061739181082183, |
| "grad_norm": 0.361056387424469, |
| "learning_rate": 0.000515562937062937, |
| "loss": 3.3916, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.076300308695905, |
| "grad_norm": 0.3266637325286865, |
| "learning_rate": 0.0005153881118881118, |
| "loss": 3.4026, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.090861436309628, |
| "grad_norm": 0.33646684885025024, |
| "learning_rate": 0.0005152132867132867, |
| "loss": 3.3902, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.105422563923351, |
| "grad_norm": 0.3473447859287262, |
| "learning_rate": 0.0005150384615384615, |
| "loss": 3.4096, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.1199836915370724, |
| "grad_norm": 0.31828585267066956, |
| "learning_rate": 0.0005148636363636363, |
| "loss": 3.4178, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.134544819150795, |
| "grad_norm": 0.33760425448417664, |
| "learning_rate": 0.0005146888111888111, |
| "loss": 3.3988, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.149105946764517, |
| "grad_norm": 0.33086958527565, |
| "learning_rate": 0.000514513986013986, |
| "loss": 3.4188, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.16366707437824, |
| "grad_norm": 0.342672735452652, |
| "learning_rate": 0.0005143391608391608, |
| "loss": 3.41, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.1782282019919625, |
| "grad_norm": 0.3187035620212555, |
| "learning_rate": 0.0005141643356643356, |
| "loss": 3.4183, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.192789329605684, |
| "grad_norm": 0.32852640748023987, |
| "learning_rate": 0.0005139895104895104, |
| "loss": 3.4124, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.207350457219407, |
| "grad_norm": 0.336311936378479, |
| "learning_rate": 0.0005138146853146852, |
| "loss": 3.4294, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.22191158483313, |
| "grad_norm": 0.3630037307739258, |
| "learning_rate": 0.00051363986013986, |
| "loss": 3.4259, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.236472712446852, |
| "grad_norm": 0.3462861180305481, |
| "learning_rate": 0.0005134650349650349, |
| "loss": 3.4251, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.2510338400605745, |
| "grad_norm": 0.3268493413925171, |
| "learning_rate": 0.0005132902097902097, |
| "loss": 3.4324, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.265594967674296, |
| "grad_norm": 0.33687540888786316, |
| "learning_rate": 0.0005131153846153845, |
| "loss": 3.4199, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.280156095288019, |
| "grad_norm": 0.33355942368507385, |
| "learning_rate": 0.0005129405594405594, |
| "loss": 3.4323, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.280156095288019, |
| "eval_accuracy": 0.36439800266970823, |
| "eval_loss": 3.6028881072998047, |
| "eval_runtime": 82.1263, |
| "eval_samples_per_second": 202.627, |
| "eval_steps_per_second": 12.676, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.294717222901742, |
| "grad_norm": 0.32576438784599304, |
| "learning_rate": 0.0005127657342657342, |
| "loss": 3.4229, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.309278350515464, |
| "grad_norm": 0.32318952679634094, |
| "learning_rate": 0.000512590909090909, |
| "loss": 3.434, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.3238394781291865, |
| "grad_norm": 0.33517026901245117, |
| "learning_rate": 0.0005124160839160838, |
| "loss": 3.4375, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.338400605742908, |
| "grad_norm": 0.34164315462112427, |
| "learning_rate": 0.0005122412587412588, |
| "loss": 3.4257, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.352961733356631, |
| "grad_norm": 0.33371347188949585, |
| "learning_rate": 0.0005120664335664336, |
| "loss": 3.4215, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.367522860970354, |
| "grad_norm": 0.3320077359676361, |
| "learning_rate": 0.0005118916083916084, |
| "loss": 3.4399, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.382083988584076, |
| "grad_norm": 0.316684365272522, |
| "learning_rate": 0.0005117167832167832, |
| "loss": 3.4237, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.396645116197798, |
| "grad_norm": 0.3490113317966461, |
| "learning_rate": 0.0005115419580419581, |
| "loss": 3.4438, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.411206243811521, |
| "grad_norm": 0.3502480983734131, |
| "learning_rate": 0.0005113671328671328, |
| "loss": 3.4402, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.425767371425243, |
| "grad_norm": 0.33350250124931335, |
| "learning_rate": 0.0005111923076923077, |
| "loss": 3.4408, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.440328499038966, |
| "grad_norm": 0.34780311584472656, |
| "learning_rate": 0.0005110174825174825, |
| "loss": 3.4488, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.454889626652688, |
| "grad_norm": 0.32847583293914795, |
| "learning_rate": 0.0005108426573426573, |
| "loss": 3.4344, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.46945075426641, |
| "grad_norm": 0.3202177584171295, |
| "learning_rate": 0.0005106678321678321, |
| "loss": 3.4401, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.484011881880133, |
| "grad_norm": 0.3321341276168823, |
| "learning_rate": 0.000510493006993007, |
| "loss": 3.444, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.498573009493855, |
| "grad_norm": 0.3298264443874359, |
| "learning_rate": 0.0005103181818181818, |
| "loss": 3.4397, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.513134137107578, |
| "grad_norm": 0.3475128412246704, |
| "learning_rate": 0.0005101433566433566, |
| "loss": 3.442, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.5276952647213005, |
| "grad_norm": 0.3355371952056885, |
| "learning_rate": 0.0005099685314685315, |
| "loss": 3.4446, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.542256392335022, |
| "grad_norm": 0.3441219627857208, |
| "learning_rate": 0.0005097937062937063, |
| "loss": 3.4464, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.556817519948745, |
| "grad_norm": 0.3435618579387665, |
| "learning_rate": 0.0005096188811188811, |
| "loss": 3.4633, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.571378647562467, |
| "grad_norm": 0.34061139822006226, |
| "learning_rate": 0.0005094440559440559, |
| "loss": 3.4491, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.571378647562467, |
| "eval_accuracy": 0.36492978051169445, |
| "eval_loss": 3.592580556869507, |
| "eval_runtime": 82.1493, |
| "eval_samples_per_second": 202.57, |
| "eval_steps_per_second": 12.672, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.58593977517619, |
| "grad_norm": 0.29835349321365356, |
| "learning_rate": 0.0005092692307692308, |
| "loss": 3.4528, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.600500902789912, |
| "grad_norm": 0.36271563172340393, |
| "learning_rate": 0.0005090944055944056, |
| "loss": 3.4624, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.615062030403634, |
| "grad_norm": 0.3818504214286804, |
| "learning_rate": 0.0005089195804195804, |
| "loss": 3.4474, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.629623158017357, |
| "grad_norm": 0.34247979521751404, |
| "learning_rate": 0.0005087447552447552, |
| "loss": 3.4562, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.644184285631079, |
| "grad_norm": 0.3293968439102173, |
| "learning_rate": 0.00050856993006993, |
| "loss": 3.4554, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.658745413244802, |
| "grad_norm": 0.349753737449646, |
| "learning_rate": 0.0005083951048951048, |
| "loss": 3.4679, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.673306540858524, |
| "grad_norm": 0.3234799802303314, |
| "learning_rate": 0.0005082202797202797, |
| "loss": 3.4443, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.687867668472246, |
| "grad_norm": 0.32560014724731445, |
| "learning_rate": 0.0005080454545454545, |
| "loss": 3.4588, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.702428796085969, |
| "grad_norm": 0.33277180790901184, |
| "learning_rate": 0.0005078706293706293, |
| "loss": 3.458, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.716989923699691, |
| "grad_norm": 0.3313036561012268, |
| "learning_rate": 0.0005076958041958042, |
| "loss": 3.4517, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.731551051313414, |
| "grad_norm": 0.32954031229019165, |
| "learning_rate": 0.000507520979020979, |
| "loss": 3.4483, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.746112178927136, |
| "grad_norm": 0.32677844166755676, |
| "learning_rate": 0.0005073461538461538, |
| "loss": 3.4536, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.760673306540858, |
| "grad_norm": 0.3216758072376251, |
| "learning_rate": 0.0005071713286713286, |
| "loss": 3.4623, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.775234434154581, |
| "grad_norm": 0.3317262530326843, |
| "learning_rate": 0.0005069965034965035, |
| "loss": 3.4574, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.789795561768304, |
| "grad_norm": 0.3077804148197174, |
| "learning_rate": 0.0005068216783216783, |
| "loss": 3.4581, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.8043566893820255, |
| "grad_norm": 0.322814404964447, |
| "learning_rate": 0.0005066468531468531, |
| "loss": 3.4569, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.818917816995748, |
| "grad_norm": 0.35393473505973816, |
| "learning_rate": 0.0005064720279720279, |
| "loss": 3.4541, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.833478944609471, |
| "grad_norm": 0.3259778618812561, |
| "learning_rate": 0.0005062972027972028, |
| "loss": 3.4527, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.848040072223193, |
| "grad_norm": 0.34595271944999695, |
| "learning_rate": 0.0005061223776223775, |
| "loss": 3.4554, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.862601199836916, |
| "grad_norm": 0.3397703170776367, |
| "learning_rate": 0.0005059475524475524, |
| "loss": 3.4585, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.862601199836916, |
| "eval_accuracy": 0.36590549054153965, |
| "eval_loss": 3.58744215965271, |
| "eval_runtime": 82.0773, |
| "eval_samples_per_second": 202.748, |
| "eval_steps_per_second": 12.683, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.8771623274506375, |
| "grad_norm": 0.36333784461021423, |
| "learning_rate": 0.0005057727272727272, |
| "loss": 3.4637, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.89172345506436, |
| "grad_norm": 0.33508622646331787, |
| "learning_rate": 0.000505597902097902, |
| "loss": 3.4621, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.906284582678083, |
| "grad_norm": 0.34249427914619446, |
| "learning_rate": 0.0005054230769230769, |
| "loss": 3.4622, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.920845710291805, |
| "grad_norm": 0.36897435784339905, |
| "learning_rate": 0.0005052482517482517, |
| "loss": 3.4594, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.935406837905528, |
| "grad_norm": 0.3256712555885315, |
| "learning_rate": 0.0005050734265734265, |
| "loss": 3.4537, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.9499679655192494, |
| "grad_norm": 0.34272387623786926, |
| "learning_rate": 0.0005048986013986013, |
| "loss": 3.4522, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.964529093132972, |
| "grad_norm": 0.3302561640739441, |
| "learning_rate": 0.0005047237762237762, |
| "loss": 3.458, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.979090220746695, |
| "grad_norm": 0.37136200070381165, |
| "learning_rate": 0.000504548951048951, |
| "loss": 3.4632, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.993651348360417, |
| "grad_norm": 0.36610859632492065, |
| "learning_rate": 0.0005043741258741258, |
| "loss": 3.4699, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.008154231463685, |
| "grad_norm": 0.37154725193977356, |
| "learning_rate": 0.0005041993006993006, |
| "loss": 3.3855, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.022715359077408, |
| "grad_norm": 0.3297024071216583, |
| "learning_rate": 0.0005040244755244755, |
| "loss": 3.3477, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.037276486691129, |
| "grad_norm": 0.35138505697250366, |
| "learning_rate": 0.0005038496503496503, |
| "loss": 3.3524, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.051837614304851, |
| "grad_norm": 0.3634728491306305, |
| "learning_rate": 0.0005036748251748251, |
| "loss": 3.3477, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.066398741918574, |
| "grad_norm": 0.35875585675239563, |
| "learning_rate": 0.0005034999999999999, |
| "loss": 3.3595, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.080959869532297, |
| "grad_norm": 0.3288542926311493, |
| "learning_rate": 0.0005033251748251747, |
| "loss": 3.3589, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.09552099714602, |
| "grad_norm": 0.35329458117485046, |
| "learning_rate": 0.0005031503496503496, |
| "loss": 3.3658, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.11008212475974, |
| "grad_norm": 0.327090322971344, |
| "learning_rate": 0.0005029755244755244, |
| "loss": 3.3897, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.124643252373463, |
| "grad_norm": 0.36477968096733093, |
| "learning_rate": 0.0005028006993006992, |
| "loss": 3.3798, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.139204379987186, |
| "grad_norm": 0.3294038772583008, |
| "learning_rate": 0.000502625874125874, |
| "loss": 3.3885, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.153765507600909, |
| "grad_norm": 0.3175249993801117, |
| "learning_rate": 0.000502451048951049, |
| "loss": 3.3888, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.153765507600909, |
| "eval_accuracy": 0.36540193620288747, |
| "eval_loss": 3.5956852436065674, |
| "eval_runtime": 81.9746, |
| "eval_samples_per_second": 203.002, |
| "eval_steps_per_second": 12.699, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.168326635214632, |
| "grad_norm": 0.35772353410720825, |
| "learning_rate": 0.0005022762237762237, |
| "loss": 3.3963, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.182887762828354, |
| "grad_norm": 0.3367580473423004, |
| "learning_rate": 0.0005021013986013985, |
| "loss": 3.3912, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.197448890442075, |
| "grad_norm": 0.34762635827064514, |
| "learning_rate": 0.0005019265734265733, |
| "loss": 3.381, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.212010018055798, |
| "grad_norm": 0.33486559987068176, |
| "learning_rate": 0.0005017517482517483, |
| "loss": 3.3857, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.22657114566952, |
| "grad_norm": 0.31943923234939575, |
| "learning_rate": 0.0005015769230769231, |
| "loss": 3.3948, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.241132273283243, |
| "grad_norm": 0.35075676441192627, |
| "learning_rate": 0.0005014020979020979, |
| "loss": 3.39, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.255693400896966, |
| "grad_norm": 0.36571091413497925, |
| "learning_rate": 0.0005012272727272727, |
| "loss": 3.4008, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.270254528510687, |
| "grad_norm": 0.33476728200912476, |
| "learning_rate": 0.0005010524475524476, |
| "loss": 3.4003, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.28481565612441, |
| "grad_norm": 0.3515719175338745, |
| "learning_rate": 0.0005008776223776223, |
| "loss": 3.3984, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.299376783738133, |
| "grad_norm": 0.3276821970939636, |
| "learning_rate": 0.0005007027972027972, |
| "loss": 3.3894, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.313937911351855, |
| "grad_norm": 0.3458368480205536, |
| "learning_rate": 0.000500527972027972, |
| "loss": 3.3873, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.328499038965578, |
| "grad_norm": 0.3482520878314972, |
| "learning_rate": 0.0005003531468531468, |
| "loss": 3.4023, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.3430601665793, |
| "grad_norm": 0.3726944625377655, |
| "learning_rate": 0.0005001783216783217, |
| "loss": 3.4094, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.357621294193022, |
| "grad_norm": 0.341086745262146, |
| "learning_rate": 0.0005000034965034965, |
| "loss": 3.3969, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.372182421806745, |
| "grad_norm": 0.33654260635375977, |
| "learning_rate": 0.0004998286713286713, |
| "loss": 3.4079, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.386743549420467, |
| "grad_norm": 0.3618704378604889, |
| "learning_rate": 0.0004996538461538461, |
| "loss": 3.4096, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.40130467703419, |
| "grad_norm": 0.32404372096061707, |
| "learning_rate": 0.000499479020979021, |
| "loss": 3.4089, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.415865804647911, |
| "grad_norm": 0.3437615633010864, |
| "learning_rate": 0.0004993041958041958, |
| "loss": 3.4112, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.430426932261634, |
| "grad_norm": 0.3371738791465759, |
| "learning_rate": 0.0004991293706293706, |
| "loss": 3.4037, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.444988059875357, |
| "grad_norm": 0.33119940757751465, |
| "learning_rate": 0.0004989545454545454, |
| "loss": 3.431, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.444988059875357, |
| "eval_accuracy": 0.36627945196071615, |
| "eval_loss": 3.5857601165771484, |
| "eval_runtime": 82.1728, |
| "eval_samples_per_second": 202.512, |
| "eval_steps_per_second": 12.668, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.45954918748908, |
| "grad_norm": 0.32963335514068604, |
| "learning_rate": 0.0004987797202797203, |
| "loss": 3.4087, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.474110315102802, |
| "grad_norm": 0.3364778161048889, |
| "learning_rate": 0.0004986048951048951, |
| "loss": 3.4143, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.488671442716523, |
| "grad_norm": 0.3649277985095978, |
| "learning_rate": 0.0004984300699300699, |
| "loss": 3.4117, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.503232570330246, |
| "grad_norm": 0.3436116874217987, |
| "learning_rate": 0.0004982552447552448, |
| "loss": 3.4234, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.517793697943969, |
| "grad_norm": 0.34127357602119446, |
| "learning_rate": 0.0004980804195804195, |
| "loss": 3.4111, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.532354825557691, |
| "grad_norm": 0.34620535373687744, |
| "learning_rate": 0.0004979055944055944, |
| "loss": 3.4207, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.546915953171414, |
| "grad_norm": 0.33740928769111633, |
| "learning_rate": 0.0004977307692307692, |
| "loss": 3.4167, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.561477080785137, |
| "grad_norm": 0.3372558057308197, |
| "learning_rate": 0.000497555944055944, |
| "loss": 3.4167, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.576038208398858, |
| "grad_norm": 0.3505604863166809, |
| "learning_rate": 0.0004973811188811188, |
| "loss": 3.4237, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.59059933601258, |
| "grad_norm": 0.3436596989631653, |
| "learning_rate": 0.0004972062937062937, |
| "loss": 3.4208, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.605160463626303, |
| "grad_norm": 0.33551499247550964, |
| "learning_rate": 0.0004970314685314685, |
| "loss": 3.4224, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.619721591240026, |
| "grad_norm": 0.35356608033180237, |
| "learning_rate": 0.0004968566433566433, |
| "loss": 3.4311, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.634282718853749, |
| "grad_norm": 0.3498377799987793, |
| "learning_rate": 0.0004966818181818181, |
| "loss": 3.4253, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.64884384646747, |
| "grad_norm": 0.3388371467590332, |
| "learning_rate": 0.000496506993006993, |
| "loss": 3.4207, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.663404974081192, |
| "grad_norm": 0.3412773907184601, |
| "learning_rate": 0.0004963321678321678, |
| "loss": 3.4186, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.677966101694915, |
| "grad_norm": 0.3211311101913452, |
| "learning_rate": 0.0004961573426573426, |
| "loss": 3.423, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.692527229308638, |
| "grad_norm": 0.34617725014686584, |
| "learning_rate": 0.0004959825174825175, |
| "loss": 3.4226, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.70708835692236, |
| "grad_norm": 0.35791119933128357, |
| "learning_rate": 0.0004958076923076923, |
| "loss": 3.4334, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.721649484536082, |
| "grad_norm": 0.32724177837371826, |
| "learning_rate": 0.0004956328671328671, |
| "loss": 3.4257, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.736210612149804, |
| "grad_norm": 0.31460657715797424, |
| "learning_rate": 0.0004954580419580419, |
| "loss": 3.4303, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.736210612149804, |
| "eval_accuracy": 0.3667746568462987, |
| "eval_loss": 3.5769011974334717, |
| "eval_runtime": 82.1724, |
| "eval_samples_per_second": 202.513, |
| "eval_steps_per_second": 12.668, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.750771739763527, |
| "grad_norm": 0.34847262501716614, |
| "learning_rate": 0.0004952832167832167, |
| "loss": 3.4165, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.76533286737725, |
| "grad_norm": 0.34068724513053894, |
| "learning_rate": 0.0004951083916083915, |
| "loss": 3.4344, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.779893994990973, |
| "grad_norm": 0.3295113146305084, |
| "learning_rate": 0.0004949335664335664, |
| "loss": 3.423, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.794455122604695, |
| "grad_norm": 0.3246609568595886, |
| "learning_rate": 0.0004947587412587412, |
| "loss": 3.4382, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.809016250218416, |
| "grad_norm": 0.34611716866493225, |
| "learning_rate": 0.000494583916083916, |
| "loss": 3.4296, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.82357737783214, |
| "grad_norm": 0.3563117980957031, |
| "learning_rate": 0.0004944090909090908, |
| "loss": 3.4268, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.838138505445862, |
| "grad_norm": 0.3378249704837799, |
| "learning_rate": 0.0004942342657342657, |
| "loss": 3.4351, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.852699633059585, |
| "grad_norm": 0.3394589424133301, |
| "learning_rate": 0.0004940594405594405, |
| "loss": 3.4324, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.867260760673307, |
| "grad_norm": 0.31331267952919006, |
| "learning_rate": 0.0004938846153846153, |
| "loss": 3.4308, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.881821888287028, |
| "grad_norm": 0.33133113384246826, |
| "learning_rate": 0.0004937097902097901, |
| "loss": 3.4319, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.896383015900751, |
| "grad_norm": 0.3186222016811371, |
| "learning_rate": 0.000493534965034965, |
| "loss": 3.4392, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.910944143514474, |
| "grad_norm": 0.335579514503479, |
| "learning_rate": 0.0004933601398601398, |
| "loss": 3.4355, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.925505271128197, |
| "grad_norm": 0.3626861572265625, |
| "learning_rate": 0.0004931853146853146, |
| "loss": 3.4276, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.94006639874192, |
| "grad_norm": 0.304193913936615, |
| "learning_rate": 0.0004930104895104895, |
| "loss": 3.4353, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.95462752635564, |
| "grad_norm": 0.3138200640678406, |
| "learning_rate": 0.0004928356643356642, |
| "loss": 3.434, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.969188653969363, |
| "grad_norm": 0.3355734944343567, |
| "learning_rate": 0.0004926608391608391, |
| "loss": 3.4422, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.983749781583086, |
| "grad_norm": 0.3736382722854614, |
| "learning_rate": 0.0004924860139860139, |
| "loss": 3.4473, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.998310909196809, |
| "grad_norm": 0.3251797556877136, |
| "learning_rate": 0.0004923111888111887, |
| "loss": 3.4439, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.012813792300076, |
| "grad_norm": 0.37890997529029846, |
| "learning_rate": 0.0004921363636363635, |
| "loss": 3.3313, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.027374919913798, |
| "grad_norm": 0.34193721413612366, |
| "learning_rate": 0.0004919615384615384, |
| "loss": 3.3287, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.027374919913798, |
| "eval_accuracy": 0.36702666921148586, |
| "eval_loss": 3.5820813179016113, |
| "eval_runtime": 82.0452, |
| "eval_samples_per_second": 202.827, |
| "eval_steps_per_second": 12.688, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.041936047527521, |
| "grad_norm": 0.35010021924972534, |
| "learning_rate": 0.0004917867132867132, |
| "loss": 3.3277, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.056497175141242, |
| "grad_norm": 0.34834861755371094, |
| "learning_rate": 0.000491611888111888, |
| "loss": 3.3346, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.071058302754965, |
| "grad_norm": 0.3463055491447449, |
| "learning_rate": 0.0004914370629370628, |
| "loss": 3.3381, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.085619430368688, |
| "grad_norm": 0.33116334676742554, |
| "learning_rate": 0.0004912622377622378, |
| "loss": 3.3421, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.10018055798241, |
| "grad_norm": 0.35653725266456604, |
| "learning_rate": 0.0004910874125874126, |
| "loss": 3.3375, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.114741685596133, |
| "grad_norm": 0.33677345514297485, |
| "learning_rate": 0.0004909125874125874, |
| "loss": 3.3527, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.129302813209854, |
| "grad_norm": 0.3330564498901367, |
| "learning_rate": 0.0004907377622377623, |
| "loss": 3.345, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.143863940823577, |
| "grad_norm": 0.3343384861946106, |
| "learning_rate": 0.0004905629370629371, |
| "loss": 3.3639, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.1584250684373, |
| "grad_norm": 0.3333738148212433, |
| "learning_rate": 0.0004903881118881119, |
| "loss": 3.3582, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.172986196051022, |
| "grad_norm": 0.342664897441864, |
| "learning_rate": 0.0004902132867132867, |
| "loss": 3.3577, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.187547323664745, |
| "grad_norm": 0.32999032735824585, |
| "learning_rate": 0.0004900384615384615, |
| "loss": 3.3654, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.202108451278466, |
| "grad_norm": 0.33270004391670227, |
| "learning_rate": 0.0004898636363636363, |
| "loss": 3.3638, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.216669578892189, |
| "grad_norm": 0.3335644006729126, |
| "learning_rate": 0.0004896888111888112, |
| "loss": 3.3577, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.231230706505912, |
| "grad_norm": 0.3499751091003418, |
| "learning_rate": 0.000489513986013986, |
| "loss": 3.37, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.245791834119634, |
| "grad_norm": 0.3536551296710968, |
| "learning_rate": 0.0004893391608391608, |
| "loss": 3.3572, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.260352961733357, |
| "grad_norm": 0.3320184350013733, |
| "learning_rate": 0.0004891643356643356, |
| "loss": 3.3567, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.27491408934708, |
| "grad_norm": 0.3454374670982361, |
| "learning_rate": 0.0004889895104895105, |
| "loss": 3.3768, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.2894752169608, |
| "grad_norm": 0.33931684494018555, |
| "learning_rate": 0.0004888146853146853, |
| "loss": 3.3554, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.304036344574524, |
| "grad_norm": 0.34095895290374756, |
| "learning_rate": 0.0004886398601398601, |
| "loss": 3.3841, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.318597472188246, |
| "grad_norm": 0.34105435013771057, |
| "learning_rate": 0.000488465034965035, |
| "loss": 3.3702, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.318597472188246, |
| "eval_accuracy": 0.36734959312879994, |
| "eval_loss": 3.5813770294189453, |
| "eval_runtime": 82.201, |
| "eval_samples_per_second": 202.443, |
| "eval_steps_per_second": 12.664, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.333158599801969, |
| "grad_norm": 0.3383043706417084, |
| "learning_rate": 0.0004882902097902098, |
| "loss": 3.3817, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.347719727415692, |
| "grad_norm": 0.3726060688495636, |
| "learning_rate": 0.0004881153846153846, |
| "loss": 3.3896, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.362280855029413, |
| "grad_norm": 0.38369378447532654, |
| "learning_rate": 0.0004879405594405594, |
| "loss": 3.3853, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.376841982643136, |
| "grad_norm": 0.34436163306236267, |
| "learning_rate": 0.00048776573426573424, |
| "loss": 3.3809, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.391403110256858, |
| "grad_norm": 0.3672736883163452, |
| "learning_rate": 0.00048759090909090904, |
| "loss": 3.38, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.405964237870581, |
| "grad_norm": 0.3847672641277313, |
| "learning_rate": 0.0004874160839160839, |
| "loss": 3.3859, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.420525365484304, |
| "grad_norm": 0.3339109718799591, |
| "learning_rate": 0.0004872412587412587, |
| "loss": 3.3808, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.435086493098025, |
| "grad_norm": 0.3328935503959656, |
| "learning_rate": 0.00048706643356643354, |
| "loss": 3.3862, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.449647620711747, |
| "grad_norm": 0.3171926736831665, |
| "learning_rate": 0.00048689160839160834, |
| "loss": 3.3978, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.46420874832547, |
| "grad_norm": 0.3459148705005646, |
| "learning_rate": 0.0004867167832167832, |
| "loss": 3.3813, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.478769875939193, |
| "grad_norm": 0.3541918694972992, |
| "learning_rate": 0.00048654195804195794, |
| "loss": 3.3944, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.493331003552916, |
| "grad_norm": 0.33506298065185547, |
| "learning_rate": 0.00048636713286713285, |
| "loss": 3.3814, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.507892131166638, |
| "grad_norm": 0.32979851961135864, |
| "learning_rate": 0.0004861923076923077, |
| "loss": 3.3811, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.52245325878036, |
| "grad_norm": 0.3828373849391937, |
| "learning_rate": 0.00048601748251748245, |
| "loss": 3.3841, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.537014386394082, |
| "grad_norm": 0.34116461873054504, |
| "learning_rate": 0.0004858426573426573, |
| "loss": 3.3965, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.551575514007805, |
| "grad_norm": 0.3470398187637329, |
| "learning_rate": 0.0004856678321678321, |
| "loss": 3.3912, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.566136641621528, |
| "grad_norm": 0.3707630932331085, |
| "learning_rate": 0.00048549300699300696, |
| "loss": 3.3963, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.58069776923525, |
| "grad_norm": 0.3699617087841034, |
| "learning_rate": 0.00048531818181818176, |
| "loss": 3.4016, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.595258896848971, |
| "grad_norm": 0.3348841071128845, |
| "learning_rate": 0.0004851433566433566, |
| "loss": 3.3964, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.609820024462694, |
| "grad_norm": 0.32681694626808167, |
| "learning_rate": 0.0004849685314685314, |
| "loss": 3.3958, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.609820024462694, |
| "eval_accuracy": 0.36752434365361014, |
| "eval_loss": 3.57114577293396, |
| "eval_runtime": 82.286, |
| "eval_samples_per_second": 202.234, |
| "eval_steps_per_second": 12.651, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.624381152076417, |
| "grad_norm": 0.3367058038711548, |
| "learning_rate": 0.00048479370629370627, |
| "loss": 3.3883, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.63894227969014, |
| "grad_norm": 0.35300782322883606, |
| "learning_rate": 0.00048461888111888106, |
| "loss": 3.3921, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.653503407303862, |
| "grad_norm": 0.3464009165763855, |
| "learning_rate": 0.0004844440559440559, |
| "loss": 3.3924, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.668064534917583, |
| "grad_norm": 0.3604465126991272, |
| "learning_rate": 0.0004842692307692307, |
| "loss": 3.3948, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.682625662531306, |
| "grad_norm": 0.35210102796554565, |
| "learning_rate": 0.00048409440559440557, |
| "loss": 3.3997, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.697186790145029, |
| "grad_norm": 0.36348727345466614, |
| "learning_rate": 0.0004839195804195803, |
| "loss": 3.4057, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.711747917758752, |
| "grad_norm": 0.3434184491634369, |
| "learning_rate": 0.0004837447552447552, |
| "loss": 3.3928, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.726309045372474, |
| "grad_norm": 0.35491108894348145, |
| "learning_rate": 0.0004835699300699301, |
| "loss": 3.4002, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.740870172986195, |
| "grad_norm": 0.3505319654941559, |
| "learning_rate": 0.0004833951048951048, |
| "loss": 3.4101, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.755431300599918, |
| "grad_norm": 0.3373035192489624, |
| "learning_rate": 0.0004832202797202797, |
| "loss": 3.406, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.76999242821364, |
| "grad_norm": 0.3695663511753082, |
| "learning_rate": 0.0004830454545454545, |
| "loss": 3.4008, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.784553555827364, |
| "grad_norm": 0.34008243680000305, |
| "learning_rate": 0.00048287062937062933, |
| "loss": 3.4126, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.799114683441086, |
| "grad_norm": 0.3522728383541107, |
| "learning_rate": 0.00048269580419580413, |
| "loss": 3.4102, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.813675811054807, |
| "grad_norm": 0.33306053280830383, |
| "learning_rate": 0.000482520979020979, |
| "loss": 3.4045, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.82823693866853, |
| "grad_norm": 0.3559751510620117, |
| "learning_rate": 0.0004823461538461538, |
| "loss": 3.4183, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.842798066282253, |
| "grad_norm": 0.36464354395866394, |
| "learning_rate": 0.00048217132867132864, |
| "loss": 3.4063, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.857359193895975, |
| "grad_norm": 0.33706969022750854, |
| "learning_rate": 0.00048199650349650344, |
| "loss": 3.4034, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.871920321509698, |
| "grad_norm": 0.34669429063796997, |
| "learning_rate": 0.0004818216783216783, |
| "loss": 3.4114, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.88648144912342, |
| "grad_norm": 0.3512224555015564, |
| "learning_rate": 0.0004816468531468531, |
| "loss": 3.4176, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.901042576737142, |
| "grad_norm": 0.34088560938835144, |
| "learning_rate": 0.00048147202797202795, |
| "loss": 3.4182, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.901042576737142, |
| "eval_accuracy": 0.36833859172479827, |
| "eval_loss": 3.5638182163238525, |
| "eval_runtime": 82.1878, |
| "eval_samples_per_second": 202.475, |
| "eval_steps_per_second": 12.666, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.915603704350865, |
| "grad_norm": 0.3342849016189575, |
| "learning_rate": 0.0004812972027972028, |
| "loss": 3.4353, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.930164831964587, |
| "grad_norm": 0.32753968238830566, |
| "learning_rate": 0.0004811223776223776, |
| "loss": 3.4116, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.94472595957831, |
| "grad_norm": 0.35529664158821106, |
| "learning_rate": 0.00048094755244755245, |
| "loss": 3.4155, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.959287087192033, |
| "grad_norm": 0.3305436074733734, |
| "learning_rate": 0.0004807727272727272, |
| "loss": 3.4092, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.973848214805754, |
| "grad_norm": 0.335018515586853, |
| "learning_rate": 0.00048059790209790205, |
| "loss": 3.4258, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.988409342419477, |
| "grad_norm": 0.36587873101234436, |
| "learning_rate": 0.00048042307692307685, |
| "loss": 3.4104, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.002912225522744, |
| "grad_norm": 0.35869884490966797, |
| "learning_rate": 0.0004802482517482517, |
| "loss": 3.3835, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.017473353136467, |
| "grad_norm": 0.3370542526245117, |
| "learning_rate": 0.0004800734265734265, |
| "loss": 3.293, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.03203448075019, |
| "grad_norm": 0.3546755313873291, |
| "learning_rate": 0.00047989860139860136, |
| "loss": 3.3049, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.046595608363912, |
| "grad_norm": 0.3513050675392151, |
| "learning_rate": 0.00047972377622377616, |
| "loss": 3.3017, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.061156735977635, |
| "grad_norm": 0.3362545967102051, |
| "learning_rate": 0.000479548951048951, |
| "loss": 3.3224, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.075717863591356, |
| "grad_norm": 0.33519917726516724, |
| "learning_rate": 0.0004793741258741258, |
| "loss": 3.3147, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.090278991205079, |
| "grad_norm": 0.3624829649925232, |
| "learning_rate": 0.00047919930069930067, |
| "loss": 3.3218, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.104840118818801, |
| "grad_norm": 0.3625950217247009, |
| "learning_rate": 0.0004790244755244755, |
| "loss": 3.3132, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.119401246432524, |
| "grad_norm": 0.3731849491596222, |
| "learning_rate": 0.0004788496503496503, |
| "loss": 3.3178, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.133962374046247, |
| "grad_norm": 0.32456716895103455, |
| "learning_rate": 0.0004786748251748252, |
| "loss": 3.323, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.148523501659968, |
| "grad_norm": 0.3482270836830139, |
| "learning_rate": 0.0004785, |
| "loss": 3.3271, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.16308462927369, |
| "grad_norm": 0.34670212864875793, |
| "learning_rate": 0.00047832517482517483, |
| "loss": 3.3345, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.177645756887413, |
| "grad_norm": 0.33922117948532104, |
| "learning_rate": 0.0004781503496503496, |
| "loss": 3.3318, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.192206884501136, |
| "grad_norm": 0.3692997694015503, |
| "learning_rate": 0.00047797552447552443, |
| "loss": 3.3301, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.192206884501136, |
| "eval_accuracy": 0.36811927158430635, |
| "eval_loss": 3.5755672454833984, |
| "eval_runtime": 82.1758, |
| "eval_samples_per_second": 202.505, |
| "eval_steps_per_second": 12.668, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.206768012114859, |
| "grad_norm": 0.37641751766204834, |
| "learning_rate": 0.00047780069930069923, |
| "loss": 3.3302, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.221329139728581, |
| "grad_norm": 0.3289963901042938, |
| "learning_rate": 0.0004776258741258741, |
| "loss": 3.3413, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.235890267342302, |
| "grad_norm": 0.3262127637863159, |
| "learning_rate": 0.0004774510489510489, |
| "loss": 3.3465, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.250451394956025, |
| "grad_norm": 0.32249385118484497, |
| "learning_rate": 0.00047727622377622374, |
| "loss": 3.3438, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.265012522569748, |
| "grad_norm": 0.35172364115715027, |
| "learning_rate": 0.00047710139860139854, |
| "loss": 3.3424, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.27957365018347, |
| "grad_norm": 0.33251386880874634, |
| "learning_rate": 0.0004769265734265734, |
| "loss": 3.3565, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.294134777797193, |
| "grad_norm": 0.32211682200431824, |
| "learning_rate": 0.0004767517482517482, |
| "loss": 3.3575, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.308695905410914, |
| "grad_norm": 0.36176618933677673, |
| "learning_rate": 0.00047657692307692304, |
| "loss": 3.35, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.323257033024637, |
| "grad_norm": 0.34484919905662537, |
| "learning_rate": 0.0004764020979020979, |
| "loss": 3.3645, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.33781816063836, |
| "grad_norm": 0.33707818388938904, |
| "learning_rate": 0.0004762272727272727, |
| "loss": 3.3526, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.352379288252083, |
| "grad_norm": 0.33765700459480286, |
| "learning_rate": 0.00047605244755244755, |
| "loss": 3.3734, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.366940415865805, |
| "grad_norm": 0.33717748522758484, |
| "learning_rate": 0.00047587762237762235, |
| "loss": 3.3486, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.381501543479526, |
| "grad_norm": 0.33442923426628113, |
| "learning_rate": 0.0004757027972027972, |
| "loss": 3.3702, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.396062671093249, |
| "grad_norm": 0.3614840507507324, |
| "learning_rate": 0.00047552797202797195, |
| "loss": 3.3654, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.410623798706972, |
| "grad_norm": 0.34301745891571045, |
| "learning_rate": 0.0004753531468531468, |
| "loss": 3.365, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.425184926320695, |
| "grad_norm": 0.32938823103904724, |
| "learning_rate": 0.0004751783216783216, |
| "loss": 3.3555, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.439746053934417, |
| "grad_norm": 0.3688816428184509, |
| "learning_rate": 0.00047500349650349646, |
| "loss": 3.3746, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.454307181548138, |
| "grad_norm": 0.3300078511238098, |
| "learning_rate": 0.00047482867132867126, |
| "loss": 3.353, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.468868309161861, |
| "grad_norm": 0.38252222537994385, |
| "learning_rate": 0.0004746538461538461, |
| "loss": 3.3572, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.483429436775584, |
| "grad_norm": 0.3652520775794983, |
| "learning_rate": 0.0004744790209790209, |
| "loss": 3.3746, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.483429436775584, |
| "eval_accuracy": 0.36854109536122026, |
| "eval_loss": 3.5670838356018066, |
| "eval_runtime": 82.2094, |
| "eval_samples_per_second": 202.422, |
| "eval_steps_per_second": 12.663, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.497990564389307, |
| "grad_norm": 0.35863104462623596, |
| "learning_rate": 0.00047430419580419576, |
| "loss": 3.3606, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.51255169200303, |
| "grad_norm": 0.3476513624191284, |
| "learning_rate": 0.0004741293706293706, |
| "loss": 3.3615, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.52711281961675, |
| "grad_norm": 0.36102762818336487, |
| "learning_rate": 0.0004739545454545454, |
| "loss": 3.37, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.541673947230473, |
| "grad_norm": 0.3664513826370239, |
| "learning_rate": 0.00047377972027972027, |
| "loss": 3.3725, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.556235074844196, |
| "grad_norm": 0.3517782390117645, |
| "learning_rate": 0.00047360489510489507, |
| "loss": 3.3762, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.570796202457919, |
| "grad_norm": 0.3351428210735321, |
| "learning_rate": 0.0004734300699300699, |
| "loss": 3.387, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.585357330071641, |
| "grad_norm": 0.35888513922691345, |
| "learning_rate": 0.0004732552447552447, |
| "loss": 3.3698, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.599918457685362, |
| "grad_norm": 0.36743810772895813, |
| "learning_rate": 0.0004730804195804196, |
| "loss": 3.3778, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.614479585299085, |
| "grad_norm": 0.3578709065914154, |
| "learning_rate": 0.0004729055944055943, |
| "loss": 3.3766, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.629040712912808, |
| "grad_norm": 0.3719770312309265, |
| "learning_rate": 0.0004727307692307692, |
| "loss": 3.3647, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.64360184052653, |
| "grad_norm": 0.5826433300971985, |
| "learning_rate": 0.000472555944055944, |
| "loss": 3.3641, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.658162968140253, |
| "grad_norm": 0.4075304865837097, |
| "learning_rate": 0.00047238111888111883, |
| "loss": 3.3851, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.672724095753976, |
| "grad_norm": 0.33860844373703003, |
| "learning_rate": 0.00047220629370629363, |
| "loss": 3.3893, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.687285223367697, |
| "grad_norm": 0.3330341577529907, |
| "learning_rate": 0.0004720314685314685, |
| "loss": 3.3804, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.70184635098142, |
| "grad_norm": 0.3583664000034332, |
| "learning_rate": 0.0004718566433566433, |
| "loss": 3.3746, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.716407478595142, |
| "grad_norm": 0.3603028953075409, |
| "learning_rate": 0.00047168181818181814, |
| "loss": 3.3915, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.730968606208865, |
| "grad_norm": 0.33417341113090515, |
| "learning_rate": 0.000471506993006993, |
| "loss": 3.3789, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.745529733822588, |
| "grad_norm": 0.34149593114852905, |
| "learning_rate": 0.0004713321678321678, |
| "loss": 3.3791, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.760090861436309, |
| "grad_norm": 0.3596676290035248, |
| "learning_rate": 0.00047115734265734265, |
| "loss": 3.3825, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.774651989050032, |
| "grad_norm": 0.3537720739841461, |
| "learning_rate": 0.00047098251748251745, |
| "loss": 3.3964, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.774651989050032, |
| "eval_accuracy": 0.368824153580075, |
| "eval_loss": 3.5620808601379395, |
| "eval_runtime": 82.1393, |
| "eval_samples_per_second": 202.595, |
| "eval_steps_per_second": 12.674, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.789213116663754, |
| "grad_norm": 0.3514542877674103, |
| "learning_rate": 0.0004708076923076923, |
| "loss": 3.3921, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.803774244277477, |
| "grad_norm": 0.33731502294540405, |
| "learning_rate": 0.0004706328671328671, |
| "loss": 3.3906, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.8183353718912, |
| "grad_norm": 0.35446056723594666, |
| "learning_rate": 0.00047045804195804195, |
| "loss": 3.3874, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.83289649950492, |
| "grad_norm": 0.3617531955242157, |
| "learning_rate": 0.0004702832167832167, |
| "loss": 3.3855, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.847457627118644, |
| "grad_norm": 0.32348453998565674, |
| "learning_rate": 0.00047010839160839155, |
| "loss": 3.3787, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.862018754732366, |
| "grad_norm": 0.3541351854801178, |
| "learning_rate": 0.00046993356643356635, |
| "loss": 3.3854, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.876579882346089, |
| "grad_norm": 0.332714319229126, |
| "learning_rate": 0.0004697587412587412, |
| "loss": 3.3814, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.891141009959812, |
| "grad_norm": 0.3369390666484833, |
| "learning_rate": 0.000469583916083916, |
| "loss": 3.3856, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.905702137573535, |
| "grad_norm": 0.32856276631355286, |
| "learning_rate": 0.00046940909090909086, |
| "loss": 3.4018, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.920263265187256, |
| "grad_norm": 0.3409399390220642, |
| "learning_rate": 0.0004692342657342657, |
| "loss": 3.3937, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.934824392800978, |
| "grad_norm": 0.32305291295051575, |
| "learning_rate": 0.0004690594405594405, |
| "loss": 3.3949, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.949385520414701, |
| "grad_norm": 0.3399060368537903, |
| "learning_rate": 0.00046888461538461537, |
| "loss": 3.399, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.963946648028424, |
| "grad_norm": 0.35931140184402466, |
| "learning_rate": 0.00046870979020979017, |
| "loss": 3.3951, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.978507775642147, |
| "grad_norm": 0.3609871566295624, |
| "learning_rate": 0.000468534965034965, |
| "loss": 3.3833, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.993068903255867, |
| "grad_norm": 0.37121859192848206, |
| "learning_rate": 0.0004683601398601398, |
| "loss": 3.3902, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.007571786359136, |
| "grad_norm": 0.3667745292186737, |
| "learning_rate": 0.0004681853146853147, |
| "loss": 3.3219, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.022132913972857, |
| "grad_norm": 0.35213324427604675, |
| "learning_rate": 0.0004680104895104895, |
| "loss": 3.2761, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.03669404158658, |
| "grad_norm": 0.3473345637321472, |
| "learning_rate": 0.00046783566433566433, |
| "loss": 3.2923, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.051255169200303, |
| "grad_norm": 0.35238996148109436, |
| "learning_rate": 0.0004676608391608391, |
| "loss": 3.2854, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.065816296814026, |
| "grad_norm": 0.3831373453140259, |
| "learning_rate": 0.00046748601398601393, |
| "loss": 3.2846, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.065816296814026, |
| "eval_accuracy": 0.36904982400881703, |
| "eval_loss": 3.5707571506500244, |
| "eval_runtime": 82.1036, |
| "eval_samples_per_second": 202.683, |
| "eval_steps_per_second": 12.679, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.080377424427748, |
| "grad_norm": 0.33279159665107727, |
| "learning_rate": 0.00046731118881118873, |
| "loss": 3.2893, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.09493855204147, |
| "grad_norm": 0.3536868989467621, |
| "learning_rate": 0.0004671363636363636, |
| "loss": 3.2955, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.109499679655192, |
| "grad_norm": 0.3576470911502838, |
| "learning_rate": 0.00046696153846153844, |
| "loss": 3.2988, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.124060807268915, |
| "grad_norm": 0.3538837730884552, |
| "learning_rate": 0.00046678671328671324, |
| "loss": 3.2997, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.138621934882638, |
| "grad_norm": 0.34002363681793213, |
| "learning_rate": 0.0004666118881118881, |
| "loss": 3.3319, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.15318306249636, |
| "grad_norm": 0.3394576907157898, |
| "learning_rate": 0.0004664370629370629, |
| "loss": 3.3139, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.167744190110081, |
| "grad_norm": 0.3327895700931549, |
| "learning_rate": 0.00046626223776223774, |
| "loss": 3.3072, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.182305317723804, |
| "grad_norm": 0.3352562189102173, |
| "learning_rate": 0.00046608741258741254, |
| "loss": 3.3288, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.196866445337527, |
| "grad_norm": 0.356283038854599, |
| "learning_rate": 0.0004659125874125874, |
| "loss": 3.3187, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.21142757295125, |
| "grad_norm": 0.36333218216896057, |
| "learning_rate": 0.0004657377622377622, |
| "loss": 3.3223, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.225988700564972, |
| "grad_norm": 0.34671902656555176, |
| "learning_rate": 0.00046556293706293705, |
| "loss": 3.3343, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.240549828178693, |
| "grad_norm": 0.3378485143184662, |
| "learning_rate": 0.00046538811188811185, |
| "loss": 3.3268, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.255110955792416, |
| "grad_norm": 0.342264860868454, |
| "learning_rate": 0.0004652132867132867, |
| "loss": 3.3217, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.269672083406139, |
| "grad_norm": 0.34399548172950745, |
| "learning_rate": 0.00046503846153846145, |
| "loss": 3.317, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.284233211019862, |
| "grad_norm": 0.3392704427242279, |
| "learning_rate": 0.0004648636363636363, |
| "loss": 3.33, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.298794338633584, |
| "grad_norm": 0.33312466740608215, |
| "learning_rate": 0.0004646888111888111, |
| "loss": 3.3319, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.313355466247307, |
| "grad_norm": 0.3910272717475891, |
| "learning_rate": 0.00046451398601398596, |
| "loss": 3.3362, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.327916593861028, |
| "grad_norm": 0.3460436463356018, |
| "learning_rate": 0.0004643391608391608, |
| "loss": 3.325, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.34247772147475, |
| "grad_norm": 0.3455114960670471, |
| "learning_rate": 0.0004641643356643356, |
| "loss": 3.3395, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.357038849088473, |
| "grad_norm": 0.3465653657913208, |
| "learning_rate": 0.00046398951048951046, |
| "loss": 3.3397, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.357038849088473, |
| "eval_accuracy": 0.36902983236062203, |
| "eval_loss": 3.5662317276000977, |
| "eval_runtime": 82.1226, |
| "eval_samples_per_second": 202.636, |
| "eval_steps_per_second": 12.676, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.371599976702196, |
| "grad_norm": 0.37152397632598877, |
| "learning_rate": 0.00046381468531468526, |
| "loss": 3.3484, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.386161104315919, |
| "grad_norm": 0.3424335718154907, |
| "learning_rate": 0.0004636398601398601, |
| "loss": 3.3493, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.40072223192964, |
| "grad_norm": 0.3614085912704468, |
| "learning_rate": 0.0004634650349650349, |
| "loss": 3.3478, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.415283359543363, |
| "grad_norm": 0.346829891204834, |
| "learning_rate": 0.00046329020979020977, |
| "loss": 3.3354, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.429844487157085, |
| "grad_norm": 0.33924925327301025, |
| "learning_rate": 0.00046311538461538457, |
| "loss": 3.3445, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.444405614770808, |
| "grad_norm": 0.33950796723365784, |
| "learning_rate": 0.0004629405594405594, |
| "loss": 3.3386, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.458966742384531, |
| "grad_norm": 0.33234700560569763, |
| "learning_rate": 0.0004627657342657342, |
| "loss": 3.3392, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.473527869998252, |
| "grad_norm": 0.3629755973815918, |
| "learning_rate": 0.0004625909090909091, |
| "loss": 3.3559, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.488088997611975, |
| "grad_norm": 0.35883602499961853, |
| "learning_rate": 0.0004624160839160838, |
| "loss": 3.3512, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.502650125225697, |
| "grad_norm": 0.3551258146762848, |
| "learning_rate": 0.0004622412587412587, |
| "loss": 3.3436, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.51721125283942, |
| "grad_norm": 0.3312642276287079, |
| "learning_rate": 0.00046206643356643353, |
| "loss": 3.3598, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.531772380453143, |
| "grad_norm": 0.34731200337409973, |
| "learning_rate": 0.00046189160839160833, |
| "loss": 3.3448, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.546333508066864, |
| "grad_norm": 0.3447478711605072, |
| "learning_rate": 0.0004617167832167832, |
| "loss": 3.3527, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.560894635680587, |
| "grad_norm": 0.3528204560279846, |
| "learning_rate": 0.000461541958041958, |
| "loss": 3.3561, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.57545576329431, |
| "grad_norm": 0.34261322021484375, |
| "learning_rate": 0.00046136713286713284, |
| "loss": 3.3535, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.590016890908032, |
| "grad_norm": 0.3756110370159149, |
| "learning_rate": 0.00046119230769230764, |
| "loss": 3.3577, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.604578018521755, |
| "grad_norm": 0.36323457956314087, |
| "learning_rate": 0.0004610174825174825, |
| "loss": 3.3631, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.619139146135478, |
| "grad_norm": 0.3396397531032562, |
| "learning_rate": 0.0004608426573426573, |
| "loss": 3.3392, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.633700273749199, |
| "grad_norm": 0.3573121428489685, |
| "learning_rate": 0.00046066783216783215, |
| "loss": 3.3624, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.648261401362921, |
| "grad_norm": 0.3364925682544708, |
| "learning_rate": 0.00046049300699300695, |
| "loss": 3.3675, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.648261401362921, |
| "eval_accuracy": 0.3696610980518609, |
| "eval_loss": 3.560828924179077, |
| "eval_runtime": 82.1553, |
| "eval_samples_per_second": 202.556, |
| "eval_steps_per_second": 12.671, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.662822528976644, |
| "grad_norm": 0.32618987560272217, |
| "learning_rate": 0.0004603181818181818, |
| "loss": 3.3627, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.677383656590367, |
| "grad_norm": 0.38889262080192566, |
| "learning_rate": 0.0004601433566433566, |
| "loss": 3.3546, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.69194478420409, |
| "grad_norm": 0.3467303514480591, |
| "learning_rate": 0.00045996853146853145, |
| "loss": 3.3661, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.70650591181781, |
| "grad_norm": 0.3630310893058777, |
| "learning_rate": 0.0004597937062937062, |
| "loss": 3.3691, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.721067039431533, |
| "grad_norm": 0.3508758544921875, |
| "learning_rate": 0.00045961888111888105, |
| "loss": 3.3516, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.735628167045256, |
| "grad_norm": 0.36394113302230835, |
| "learning_rate": 0.0004594440559440559, |
| "loss": 3.3491, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.750189294658979, |
| "grad_norm": 0.3571886122226715, |
| "learning_rate": 0.0004592692307692307, |
| "loss": 3.3624, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.764750422272702, |
| "grad_norm": 0.3501681089401245, |
| "learning_rate": 0.00045909440559440556, |
| "loss": 3.3656, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.779311549886422, |
| "grad_norm": 0.37331053614616394, |
| "learning_rate": 0.00045891958041958036, |
| "loss": 3.3648, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.793872677500145, |
| "grad_norm": 0.3355902135372162, |
| "learning_rate": 0.0004587447552447552, |
| "loss": 3.3576, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.808433805113868, |
| "grad_norm": 0.33591264486312866, |
| "learning_rate": 0.00045856993006993, |
| "loss": 3.3701, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.82299493272759, |
| "grad_norm": 0.3608211576938629, |
| "learning_rate": 0.00045839510489510487, |
| "loss": 3.3628, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.837556060341313, |
| "grad_norm": 0.36434468626976013, |
| "learning_rate": 0.00045822027972027967, |
| "loss": 3.3615, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.852117187955034, |
| "grad_norm": 0.36887839436531067, |
| "learning_rate": 0.0004580454545454545, |
| "loss": 3.369, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.866678315568757, |
| "grad_norm": 0.35393133759498596, |
| "learning_rate": 0.0004578706293706293, |
| "loss": 3.3641, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.88123944318248, |
| "grad_norm": 0.3569134771823883, |
| "learning_rate": 0.0004576958041958042, |
| "loss": 3.3789, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.895800570796203, |
| "grad_norm": 0.34833458065986633, |
| "learning_rate": 0.000457520979020979, |
| "loss": 3.3606, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.910361698409925, |
| "grad_norm": 0.34039419889450073, |
| "learning_rate": 0.00045734615384615383, |
| "loss": 3.3806, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.924922826023646, |
| "grad_norm": 0.3344103693962097, |
| "learning_rate": 0.0004571713286713287, |
| "loss": 3.3617, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93948395363737, |
| "grad_norm": 0.3435809910297394, |
| "learning_rate": 0.00045699650349650343, |
| "loss": 3.3618, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93948395363737, |
| "eval_accuracy": 0.3700118926787174, |
| "eval_loss": 3.5535316467285156, |
| "eval_runtime": 82.1773, |
| "eval_samples_per_second": 202.501, |
| "eval_steps_per_second": 12.668, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.954045081251092, |
| "grad_norm": 0.35758745670318604, |
| "learning_rate": 0.0004568216783216783, |
| "loss": 3.3682, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.968606208864815, |
| "grad_norm": 0.36225852370262146, |
| "learning_rate": 0.0004566468531468531, |
| "loss": 3.3727, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.983167336478537, |
| "grad_norm": 0.3377271592617035, |
| "learning_rate": 0.00045647202797202794, |
| "loss": 3.3893, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.99772846409226, |
| "grad_norm": 0.342596173286438, |
| "learning_rate": 0.00045629720279720274, |
| "loss": 3.3619, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.012231347195527, |
| "grad_norm": 0.3525770306587219, |
| "learning_rate": 0.0004561223776223776, |
| "loss": 3.2768, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.02679247480925, |
| "grad_norm": 0.3470669090747833, |
| "learning_rate": 0.0004559475524475524, |
| "loss": 3.2607, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.041353602422971, |
| "grad_norm": 0.34680554270744324, |
| "learning_rate": 0.00045577272727272724, |
| "loss": 3.2615, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.055914730036694, |
| "grad_norm": 0.3507510721683502, |
| "learning_rate": 0.00045559790209790204, |
| "loss": 3.2701, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.070475857650417, |
| "grad_norm": 0.3711635172367096, |
| "learning_rate": 0.0004554230769230769, |
| "loss": 3.2863, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.08503698526414, |
| "grad_norm": 0.34734708070755005, |
| "learning_rate": 0.0004552482517482517, |
| "loss": 3.2725, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.099598112877862, |
| "grad_norm": 0.34102872014045715, |
| "learning_rate": 0.00045507342657342655, |
| "loss": 3.2825, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.114159240491583, |
| "grad_norm": 0.36111149191856384, |
| "learning_rate": 0.00045489860139860135, |
| "loss": 3.283, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.128720368105306, |
| "grad_norm": 0.3672299385070801, |
| "learning_rate": 0.0004547237762237762, |
| "loss": 3.2871, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.143281495719028, |
| "grad_norm": 0.34666985273361206, |
| "learning_rate": 0.00045454895104895106, |
| "loss": 3.3032, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.157842623332751, |
| "grad_norm": 0.35126757621765137, |
| "learning_rate": 0.0004543741258741258, |
| "loss": 3.2892, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.172403750946474, |
| "grad_norm": 0.3565657436847687, |
| "learning_rate": 0.00045419930069930066, |
| "loss": 3.289, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.186964878560195, |
| "grad_norm": 0.3579462170600891, |
| "learning_rate": 0.00045402447552447546, |
| "loss": 3.2888, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.201526006173918, |
| "grad_norm": 0.3309963345527649, |
| "learning_rate": 0.0004538496503496503, |
| "loss": 3.2927, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.21608713378764, |
| "grad_norm": 0.3587437868118286, |
| "learning_rate": 0.0004536748251748251, |
| "loss": 3.3158, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.230648261401363, |
| "grad_norm": 0.3527557849884033, |
| "learning_rate": 0.00045349999999999996, |
| "loss": 3.2947, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.230648261401363, |
| "eval_accuracy": 0.3694092032846043, |
| "eval_loss": 3.566235065460205, |
| "eval_runtime": 82.0077, |
| "eval_samples_per_second": 202.92, |
| "eval_steps_per_second": 12.694, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.245209389015086, |
| "grad_norm": 0.3493395149707794, |
| "learning_rate": 0.00045332517482517476, |
| "loss": 3.2966, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.259770516628807, |
| "grad_norm": 0.3381343483924866, |
| "learning_rate": 0.0004531503496503496, |
| "loss": 3.3236, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.27433164424253, |
| "grad_norm": 0.3581599295139313, |
| "learning_rate": 0.0004529755244755244, |
| "loss": 3.3052, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.288892771856252, |
| "grad_norm": 0.3661227822303772, |
| "learning_rate": 0.00045280069930069927, |
| "loss": 3.2965, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.303453899469975, |
| "grad_norm": 0.38722071051597595, |
| "learning_rate": 0.00045262587412587407, |
| "loss": 3.3123, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.318015027083698, |
| "grad_norm": 0.3642864525318146, |
| "learning_rate": 0.0004524510489510489, |
| "loss": 3.309, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.33257615469742, |
| "grad_norm": 0.4007324278354645, |
| "learning_rate": 0.0004522762237762238, |
| "loss": 3.3021, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.347137282311142, |
| "grad_norm": 0.3553014099597931, |
| "learning_rate": 0.0004521013986013986, |
| "loss": 3.3203, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.361698409924864, |
| "grad_norm": 0.4004995822906494, |
| "learning_rate": 0.00045192657342657343, |
| "loss": 3.3145, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.376259537538587, |
| "grad_norm": 0.38768574595451355, |
| "learning_rate": 0.0004517517482517482, |
| "loss": 3.3219, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.39082066515231, |
| "grad_norm": 0.3459119200706482, |
| "learning_rate": 0.00045157692307692303, |
| "loss": 3.3267, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.405381792766033, |
| "grad_norm": 0.36608731746673584, |
| "learning_rate": 0.00045140209790209783, |
| "loss": 3.3238, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.419942920379754, |
| "grad_norm": 0.33612096309661865, |
| "learning_rate": 0.0004512272727272727, |
| "loss": 3.3289, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.434504047993476, |
| "grad_norm": 0.3239712417125702, |
| "learning_rate": 0.0004510524475524475, |
| "loss": 3.3296, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.449065175607199, |
| "grad_norm": 0.3535769581794739, |
| "learning_rate": 0.00045087762237762234, |
| "loss": 3.3308, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.463626303220922, |
| "grad_norm": 0.3301953375339508, |
| "learning_rate": 0.00045070279720279714, |
| "loss": 3.3338, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.478187430834645, |
| "grad_norm": 0.365500807762146, |
| "learning_rate": 0.000450527972027972, |
| "loss": 3.3318, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.492748558448366, |
| "grad_norm": 0.39921191334724426, |
| "learning_rate": 0.0004503531468531468, |
| "loss": 3.329, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.507309686062088, |
| "grad_norm": 0.37658029794692993, |
| "learning_rate": 0.00045017832167832165, |
| "loss": 3.3142, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.521870813675811, |
| "grad_norm": 0.38668063282966614, |
| "learning_rate": 0.0004500034965034965, |
| "loss": 3.3348, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.521870813675811, |
| "eval_accuracy": 0.36999813372084206, |
| "eval_loss": 3.559868335723877, |
| "eval_runtime": 82.4911, |
| "eval_samples_per_second": 201.731, |
| "eval_steps_per_second": 12.62, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.536431941289534, |
| "grad_norm": 0.34607669711112976, |
| "learning_rate": 0.0004498286713286713, |
| "loss": 3.333, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.550993068903256, |
| "grad_norm": 0.3572712540626526, |
| "learning_rate": 0.00044965384615384615, |
| "loss": 3.3428, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.565554196516977, |
| "grad_norm": 0.3587905168533325, |
| "learning_rate": 0.00044947902097902095, |
| "loss": 3.3331, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.5801153241307, |
| "grad_norm": 0.37342527508735657, |
| "learning_rate": 0.0004493041958041958, |
| "loss": 3.3368, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.594676451744423, |
| "grad_norm": 0.3531188666820526, |
| "learning_rate": 0.00044912937062937055, |
| "loss": 3.334, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.609237579358146, |
| "grad_norm": 0.3537793755531311, |
| "learning_rate": 0.0004489545454545454, |
| "loss": 3.3318, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.623798706971868, |
| "grad_norm": 0.355112224817276, |
| "learning_rate": 0.0004487797202797202, |
| "loss": 3.3385, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.63835983458559, |
| "grad_norm": 0.35365793108940125, |
| "learning_rate": 0.00044860489510489506, |
| "loss": 3.3351, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.652920962199312, |
| "grad_norm": 0.3463797867298126, |
| "learning_rate": 0.00044843006993006986, |
| "loss": 3.3535, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.667482089813035, |
| "grad_norm": 0.3554716110229492, |
| "learning_rate": 0.0004482552447552447, |
| "loss": 3.3419, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.682043217426758, |
| "grad_norm": 0.33308327198028564, |
| "learning_rate": 0.0004480804195804195, |
| "loss": 3.3417, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.69660434504048, |
| "grad_norm": 0.35232919454574585, |
| "learning_rate": 0.00044790559440559437, |
| "loss": 3.334, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.711165472654203, |
| "grad_norm": 0.3530453145503998, |
| "learning_rate": 0.00044773076923076917, |
| "loss": 3.3351, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.725726600267924, |
| "grad_norm": 0.35968685150146484, |
| "learning_rate": 0.000447555944055944, |
| "loss": 3.3481, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.740287727881647, |
| "grad_norm": 0.35502877831459045, |
| "learning_rate": 0.0004473811188811189, |
| "loss": 3.3544, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.75484885549537, |
| "grad_norm": 0.35576528310775757, |
| "learning_rate": 0.0004472062937062937, |
| "loss": 3.3385, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.769409983109092, |
| "grad_norm": 0.37485212087631226, |
| "learning_rate": 0.00044703146853146853, |
| "loss": 3.3578, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.783971110722815, |
| "grad_norm": 0.39017146825790405, |
| "learning_rate": 0.00044685664335664333, |
| "loss": 3.3434, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.798532238336536, |
| "grad_norm": 0.35243257880210876, |
| "learning_rate": 0.0004466818181818182, |
| "loss": 3.348, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.813093365950259, |
| "grad_norm": 0.33278748393058777, |
| "learning_rate": 0.00044650699300699293, |
| "loss": 3.3386, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.813093365950259, |
| "eval_accuracy": 0.3704748757313268, |
| "eval_loss": 3.5524535179138184, |
| "eval_runtime": 82.1492, |
| "eval_samples_per_second": 202.57, |
| "eval_steps_per_second": 12.672, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.827654493563982, |
| "grad_norm": 0.3648782968521118, |
| "learning_rate": 0.0004463321678321678, |
| "loss": 3.3565, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.842215621177704, |
| "grad_norm": 0.35999104380607605, |
| "learning_rate": 0.0004461573426573426, |
| "loss": 3.3539, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.856776748791427, |
| "grad_norm": 0.3632679879665375, |
| "learning_rate": 0.00044598251748251744, |
| "loss": 3.3564, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.871337876405148, |
| "grad_norm": 0.35622382164001465, |
| "learning_rate": 0.00044580769230769224, |
| "loss": 3.3564, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.88589900401887, |
| "grad_norm": 0.3516140878200531, |
| "learning_rate": 0.0004456328671328671, |
| "loss": 3.3568, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.900460131632594, |
| "grad_norm": 0.3556675314903259, |
| "learning_rate": 0.0004454580419580419, |
| "loss": 3.3552, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.915021259246316, |
| "grad_norm": 0.3647211194038391, |
| "learning_rate": 0.00044528321678321674, |
| "loss": 3.3479, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.929582386860039, |
| "grad_norm": 0.37669581174850464, |
| "learning_rate": 0.0004451083916083916, |
| "loss": 3.3614, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.944143514473762, |
| "grad_norm": 0.3309657871723175, |
| "learning_rate": 0.0004449335664335664, |
| "loss": 3.3695, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.958704642087483, |
| "grad_norm": 0.34548866748809814, |
| "learning_rate": 0.00044475874125874125, |
| "loss": 3.3559, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.973265769701205, |
| "grad_norm": 0.3327321708202362, |
| "learning_rate": 0.00044458391608391605, |
| "loss": 3.3549, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.987826897314928, |
| "grad_norm": 0.3529459238052368, |
| "learning_rate": 0.0004444090909090909, |
| "loss": 3.3533, |
| "step": 44600 |
| }, |
| { |
| "epoch": 13.002329780418195, |
| "grad_norm": 0.3597906529903412, |
| "learning_rate": 0.0004442342657342657, |
| "loss": 3.3432, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.016890908031918, |
| "grad_norm": 0.3633843660354614, |
| "learning_rate": 0.00044405944055944056, |
| "loss": 3.2473, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.031452035645641, |
| "grad_norm": 0.36908018589019775, |
| "learning_rate": 0.0004438846153846153, |
| "loss": 3.2444, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.046013163259364, |
| "grad_norm": 0.3617497384548187, |
| "learning_rate": 0.00044370979020979016, |
| "loss": 3.2461, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.060574290873085, |
| "grad_norm": 0.3689548671245575, |
| "learning_rate": 0.00044353496503496496, |
| "loss": 3.2562, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.075135418486807, |
| "grad_norm": 0.364460289478302, |
| "learning_rate": 0.0004433601398601398, |
| "loss": 3.2561, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.08969654610053, |
| "grad_norm": 0.38561636209487915, |
| "learning_rate": 0.0004431853146853146, |
| "loss": 3.2601, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.104257673714253, |
| "grad_norm": 0.3805942237377167, |
| "learning_rate": 0.00044301048951048946, |
| "loss": 3.2812, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.104257673714253, |
| "eval_accuracy": 0.37006528213919104, |
| "eval_loss": 3.5637786388397217, |
| "eval_runtime": 82.0677, |
| "eval_samples_per_second": 202.771, |
| "eval_steps_per_second": 12.685, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.118818801327976, |
| "grad_norm": 0.39642012119293213, |
| "learning_rate": 0.00044283566433566426, |
| "loss": 3.2632, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.133379928941697, |
| "grad_norm": 0.3674771785736084, |
| "learning_rate": 0.0004426608391608391, |
| "loss": 3.2745, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.14794105655542, |
| "grad_norm": 0.37106794118881226, |
| "learning_rate": 0.00044248601398601397, |
| "loss": 3.2698, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.162502184169142, |
| "grad_norm": 0.33802124857902527, |
| "learning_rate": 0.00044231118881118877, |
| "loss": 3.2785, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.177063311782865, |
| "grad_norm": 0.35392314195632935, |
| "learning_rate": 0.0004421363636363636, |
| "loss": 3.272, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.191624439396588, |
| "grad_norm": 0.4176012873649597, |
| "learning_rate": 0.0004419615384615384, |
| "loss": 3.2729, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.206185567010309, |
| "grad_norm": 0.36650487780570984, |
| "learning_rate": 0.0004417867132867133, |
| "loss": 3.2839, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.220746694624031, |
| "grad_norm": 0.3893243968486786, |
| "learning_rate": 0.0004416118881118881, |
| "loss": 3.2874, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.235307822237754, |
| "grad_norm": 0.34698382019996643, |
| "learning_rate": 0.00044143706293706293, |
| "loss": 3.277, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.249868949851477, |
| "grad_norm": 0.3904397189617157, |
| "learning_rate": 0.0004412622377622377, |
| "loss": 3.2942, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.2644300774652, |
| "grad_norm": 0.3720010221004486, |
| "learning_rate": 0.00044108741258741253, |
| "loss": 3.2851, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.27899120507892, |
| "grad_norm": 0.39584729075431824, |
| "learning_rate": 0.00044091258741258733, |
| "loss": 3.2916, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.293552332692643, |
| "grad_norm": 0.3631509840488434, |
| "learning_rate": 0.0004407377622377622, |
| "loss": 3.2881, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.308113460306366, |
| "grad_norm": 0.3456946909427643, |
| "learning_rate": 0.000440562937062937, |
| "loss": 3.2984, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.322674587920089, |
| "grad_norm": 0.34069541096687317, |
| "learning_rate": 0.00044038811188811184, |
| "loss": 3.3027, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.337235715533811, |
| "grad_norm": 0.36634930968284607, |
| "learning_rate": 0.0004402132867132867, |
| "loss": 3.3108, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.351796843147532, |
| "grad_norm": 0.38517844676971436, |
| "learning_rate": 0.0004400384615384615, |
| "loss": 3.2981, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.366357970761255, |
| "grad_norm": 0.36287397146224976, |
| "learning_rate": 0.00043986363636363635, |
| "loss": 3.3019, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.380919098374978, |
| "grad_norm": 0.3538667857646942, |
| "learning_rate": 0.00043968881118881115, |
| "loss": 3.3108, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.3954802259887, |
| "grad_norm": 0.3757765293121338, |
| "learning_rate": 0.000439513986013986, |
| "loss": 3.3061, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.3954802259887, |
| "eval_accuracy": 0.3699185199218538, |
| "eval_loss": 3.560844659805298, |
| "eval_runtime": 82.3569, |
| "eval_samples_per_second": 202.06, |
| "eval_steps_per_second": 12.64, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.410041353602423, |
| "grad_norm": 0.34489282965660095, |
| "learning_rate": 0.0004393391608391608, |
| "loss": 3.2967, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.424602481216146, |
| "grad_norm": 0.348224401473999, |
| "learning_rate": 0.00043916433566433565, |
| "loss": 3.3077, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.439163608829867, |
| "grad_norm": 0.36835676431655884, |
| "learning_rate": 0.00043898951048951045, |
| "loss": 3.3027, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.45372473644359, |
| "grad_norm": 0.3495423197746277, |
| "learning_rate": 0.0004388146853146853, |
| "loss": 3.3117, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.468285864057313, |
| "grad_norm": 0.3777487576007843, |
| "learning_rate": 0.00043863986013986005, |
| "loss": 3.3183, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.482846991671035, |
| "grad_norm": 0.3374326527118683, |
| "learning_rate": 0.0004384650349650349, |
| "loss": 3.3187, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.497408119284758, |
| "grad_norm": 0.35795071721076965, |
| "learning_rate": 0.0004382902097902097, |
| "loss": 3.3187, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.51196924689848, |
| "grad_norm": 0.36752477288246155, |
| "learning_rate": 0.00043811538461538456, |
| "loss": 3.3277, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.526530374512202, |
| "grad_norm": 0.3484336733818054, |
| "learning_rate": 0.0004379405594405594, |
| "loss": 3.3004, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.541091502125925, |
| "grad_norm": 0.3466947376728058, |
| "learning_rate": 0.0004377657342657342, |
| "loss": 3.3154, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.555652629739647, |
| "grad_norm": 0.35439735651016235, |
| "learning_rate": 0.00043759090909090907, |
| "loss": 3.3152, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.57021375735337, |
| "grad_norm": 0.35467153787612915, |
| "learning_rate": 0.00043741608391608387, |
| "loss": 3.3353, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.584774884967091, |
| "grad_norm": 0.4003247916698456, |
| "learning_rate": 0.0004372412587412587, |
| "loss": 3.3132, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.599336012580814, |
| "grad_norm": 0.3468843102455139, |
| "learning_rate": 0.0004370664335664335, |
| "loss": 3.3281, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.613897140194537, |
| "grad_norm": 0.3521159887313843, |
| "learning_rate": 0.0004368916083916084, |
| "loss": 3.3195, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.62845826780826, |
| "grad_norm": 0.3621974587440491, |
| "learning_rate": 0.0004367167832167832, |
| "loss": 3.3211, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.643019395421982, |
| "grad_norm": 0.340221643447876, |
| "learning_rate": 0.00043654195804195803, |
| "loss": 3.3267, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.657580523035705, |
| "grad_norm": 0.3822995126247406, |
| "learning_rate": 0.00043636713286713283, |
| "loss": 3.3226, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.672141650649426, |
| "grad_norm": 0.369263619184494, |
| "learning_rate": 0.0004361923076923077, |
| "loss": 3.3335, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.686702778263149, |
| "grad_norm": 0.34928369522094727, |
| "learning_rate": 0.00043601748251748243, |
| "loss": 3.3265, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.686702778263149, |
| "eval_accuracy": 0.3709275101660471, |
| "eval_loss": 3.551309108734131, |
| "eval_runtime": 81.9048, |
| "eval_samples_per_second": 203.175, |
| "eval_steps_per_second": 12.71, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.701263905876871, |
| "grad_norm": 0.34369638562202454, |
| "learning_rate": 0.00043584265734265734, |
| "loss": 3.3385, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.715825033490594, |
| "grad_norm": 0.3323599696159363, |
| "learning_rate": 0.0004356678321678321, |
| "loss": 3.3341, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.730386161104317, |
| "grad_norm": 0.37531906366348267, |
| "learning_rate": 0.00043549300699300694, |
| "loss": 3.3248, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.744947288718038, |
| "grad_norm": 0.40877652168273926, |
| "learning_rate": 0.0004353181818181818, |
| "loss": 3.3348, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.75950841633176, |
| "grad_norm": 0.39260047674179077, |
| "learning_rate": 0.0004351433566433566, |
| "loss": 3.3275, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.774069543945483, |
| "grad_norm": 0.3621160387992859, |
| "learning_rate": 0.00043496853146853144, |
| "loss": 3.3428, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.788630671559206, |
| "grad_norm": 0.38188454508781433, |
| "learning_rate": 0.00043479370629370624, |
| "loss": 3.3253, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.803191799172929, |
| "grad_norm": 0.38131508231163025, |
| "learning_rate": 0.0004346188811188811, |
| "loss": 3.3448, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.81775292678665, |
| "grad_norm": 0.34486502408981323, |
| "learning_rate": 0.0004344440559440559, |
| "loss": 3.3266, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.832314054400372, |
| "grad_norm": 0.3711560070514679, |
| "learning_rate": 0.00043426923076923075, |
| "loss": 3.3223, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.846875182014095, |
| "grad_norm": 0.37756112217903137, |
| "learning_rate": 0.00043409440559440555, |
| "loss": 3.3401, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.861436309627818, |
| "grad_norm": 0.3453884422779083, |
| "learning_rate": 0.0004339195804195804, |
| "loss": 3.3377, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.87599743724154, |
| "grad_norm": 0.3538273572921753, |
| "learning_rate": 0.0004337447552447552, |
| "loss": 3.3428, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.890558564855262, |
| "grad_norm": 0.3426096439361572, |
| "learning_rate": 0.00043356993006993006, |
| "loss": 3.3192, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.905119692468984, |
| "grad_norm": 0.34052231907844543, |
| "learning_rate": 0.0004333951048951048, |
| "loss": 3.3314, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.919680820082707, |
| "grad_norm": 0.36880505084991455, |
| "learning_rate": 0.0004332202797202797, |
| "loss": 3.3236, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.93424194769643, |
| "grad_norm": 0.3990458548069, |
| "learning_rate": 0.00043304545454545456, |
| "loss": 3.3378, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.948803075310153, |
| "grad_norm": 0.36244332790374756, |
| "learning_rate": 0.0004328706293706293, |
| "loss": 3.3383, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.963364202923874, |
| "grad_norm": 0.33829572796821594, |
| "learning_rate": 0.00043269580419580416, |
| "loss": 3.3433, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.977925330537596, |
| "grad_norm": 0.3507706820964813, |
| "learning_rate": 0.00043252097902097896, |
| "loss": 3.3405, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.977925330537596, |
| "eval_accuracy": 0.37116282362509495, |
| "eval_loss": 3.545074462890625, |
| "eval_runtime": 82.1377, |
| "eval_samples_per_second": 202.599, |
| "eval_steps_per_second": 12.674, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.992486458151319, |
| "grad_norm": 0.3454228639602661, |
| "learning_rate": 0.0004323461538461538, |
| "loss": 3.3452, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.006989341254586, |
| "grad_norm": 0.3690054416656494, |
| "learning_rate": 0.0004321713286713286, |
| "loss": 3.2894, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.021550468868309, |
| "grad_norm": 0.38319116830825806, |
| "learning_rate": 0.00043199650349650347, |
| "loss": 3.2352, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.036111596482032, |
| "grad_norm": 0.37697914242744446, |
| "learning_rate": 0.00043182167832167827, |
| "loss": 3.2293, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.050672724095755, |
| "grad_norm": 0.36340099573135376, |
| "learning_rate": 0.0004316468531468531, |
| "loss": 3.2402, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.065233851709475, |
| "grad_norm": 0.35308969020843506, |
| "learning_rate": 0.0004314720279720279, |
| "loss": 3.2388, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.079794979323198, |
| "grad_norm": 0.3750530481338501, |
| "learning_rate": 0.0004312972027972028, |
| "loss": 3.2408, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.094356106936921, |
| "grad_norm": 0.3973260223865509, |
| "learning_rate": 0.0004311223776223776, |
| "loss": 3.2493, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.108917234550644, |
| "grad_norm": 0.36961326003074646, |
| "learning_rate": 0.00043094755244755243, |
| "loss": 3.2477, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.123478362164366, |
| "grad_norm": 0.40170127153396606, |
| "learning_rate": 0.0004307727272727272, |
| "loss": 3.2651, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.13803948977809, |
| "grad_norm": 0.3788222074508667, |
| "learning_rate": 0.0004305979020979021, |
| "loss": 3.2638, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.15260061739181, |
| "grad_norm": 0.3767904043197632, |
| "learning_rate": 0.00043042307692307694, |
| "loss": 3.2609, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.167161745005533, |
| "grad_norm": 0.36420321464538574, |
| "learning_rate": 0.0004302482517482517, |
| "loss": 3.2609, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.181722872619256, |
| "grad_norm": 0.36340752243995667, |
| "learning_rate": 0.00043007342657342654, |
| "loss": 3.2576, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.196284000232978, |
| "grad_norm": 0.35927969217300415, |
| "learning_rate": 0.00042989860139860134, |
| "loss": 3.2685, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.210845127846701, |
| "grad_norm": 0.37248143553733826, |
| "learning_rate": 0.0004297237762237762, |
| "loss": 3.2712, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.225406255460422, |
| "grad_norm": 0.3582911193370819, |
| "learning_rate": 0.000429548951048951, |
| "loss": 3.2696, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.239967383074145, |
| "grad_norm": 0.36641067266464233, |
| "learning_rate": 0.00042937412587412585, |
| "loss": 3.2778, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.254528510687868, |
| "grad_norm": 0.3534380793571472, |
| "learning_rate": 0.00042919930069930065, |
| "loss": 3.2738, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.26908963830159, |
| "grad_norm": 0.3574196398258209, |
| "learning_rate": 0.0004290244755244755, |
| "loss": 3.2761, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.26908963830159, |
| "eval_accuracy": 0.370831079862989, |
| "eval_loss": 3.5578083992004395, |
| "eval_runtime": 82.1242, |
| "eval_samples_per_second": 202.632, |
| "eval_steps_per_second": 12.676, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.283650765915313, |
| "grad_norm": 0.3565859794616699, |
| "learning_rate": 0.0004288496503496503, |
| "loss": 3.2857, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.298211893529034, |
| "grad_norm": 0.3482775390148163, |
| "learning_rate": 0.00042867482517482515, |
| "loss": 3.2925, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.312773021142757, |
| "grad_norm": 0.38286519050598145, |
| "learning_rate": 0.00042849999999999995, |
| "loss": 3.2793, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.32733414875648, |
| "grad_norm": 0.3537954092025757, |
| "learning_rate": 0.0004283251748251748, |
| "loss": 3.2747, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.341895276370202, |
| "grad_norm": 0.3822333812713623, |
| "learning_rate": 0.00042815034965034966, |
| "loss": 3.2914, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.356456403983925, |
| "grad_norm": 0.36340805888175964, |
| "learning_rate": 0.00042797552447552446, |
| "loss": 3.2851, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.371017531597648, |
| "grad_norm": 0.36440396308898926, |
| "learning_rate": 0.0004278006993006993, |
| "loss": 3.2775, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.385578659211369, |
| "grad_norm": 0.38242268562316895, |
| "learning_rate": 0.00042762587412587406, |
| "loss": 3.2893, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.400139786825092, |
| "grad_norm": 0.3417001962661743, |
| "learning_rate": 0.0004274510489510489, |
| "loss": 3.2944, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.414700914438814, |
| "grad_norm": 0.37533003091812134, |
| "learning_rate": 0.0004272762237762237, |
| "loss": 3.2919, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.429262042052537, |
| "grad_norm": 0.3594585359096527, |
| "learning_rate": 0.00042710139860139857, |
| "loss": 3.2878, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.44382316966626, |
| "grad_norm": 0.36212876439094543, |
| "learning_rate": 0.00042692657342657337, |
| "loss": 3.2843, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.45838429727998, |
| "grad_norm": 0.34770578145980835, |
| "learning_rate": 0.0004267517482517482, |
| "loss": 3.3028, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.472945424893704, |
| "grad_norm": 0.3649824559688568, |
| "learning_rate": 0.000426576923076923, |
| "loss": 3.2857, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.487506552507426, |
| "grad_norm": 0.38572993874549866, |
| "learning_rate": 0.0004264020979020979, |
| "loss": 3.307, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.502067680121149, |
| "grad_norm": 0.36206668615341187, |
| "learning_rate": 0.0004262272727272727, |
| "loss": 3.2805, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.516628807734872, |
| "grad_norm": 0.3737871050834656, |
| "learning_rate": 0.00042605244755244753, |
| "loss": 3.3019, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.531189935348593, |
| "grad_norm": 0.3483608365058899, |
| "learning_rate": 0.00042587762237762233, |
| "loss": 3.3, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.545751062962315, |
| "grad_norm": 0.37755563855171204, |
| "learning_rate": 0.0004257027972027972, |
| "loss": 3.2959, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.560312190576038, |
| "grad_norm": 0.3582163453102112, |
| "learning_rate": 0.00042552797202797204, |
| "loss": 3.3001, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.560312190576038, |
| "eval_accuracy": 0.3712794807722092, |
| "eval_loss": 3.552330732345581, |
| "eval_runtime": 82.1998, |
| "eval_samples_per_second": 202.446, |
| "eval_steps_per_second": 12.664, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.574873318189761, |
| "grad_norm": 0.3447462320327759, |
| "learning_rate": 0.00042535314685314684, |
| "loss": 3.3161, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.589434445803484, |
| "grad_norm": 0.3644867539405823, |
| "learning_rate": 0.0004251783216783217, |
| "loss": 3.3099, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.603995573417205, |
| "grad_norm": 0.3638802468776703, |
| "learning_rate": 0.00042500349650349643, |
| "loss": 3.3068, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.618556701030927, |
| "grad_norm": 0.36938712000846863, |
| "learning_rate": 0.0004248286713286713, |
| "loss": 3.2991, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.63311782864465, |
| "grad_norm": 0.39543572068214417, |
| "learning_rate": 0.0004246538461538461, |
| "loss": 3.3069, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.647678956258373, |
| "grad_norm": 0.38200637698173523, |
| "learning_rate": 0.00042447902097902094, |
| "loss": 3.3123, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.662240083872096, |
| "grad_norm": 0.3697560727596283, |
| "learning_rate": 0.00042430419580419574, |
| "loss": 3.3053, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.676801211485817, |
| "grad_norm": 0.37939342856407166, |
| "learning_rate": 0.0004241293706293706, |
| "loss": 3.3053, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.69136233909954, |
| "grad_norm": 0.3589920103549957, |
| "learning_rate": 0.0004239545454545454, |
| "loss": 3.3015, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.705923466713262, |
| "grad_norm": 0.37587329745292664, |
| "learning_rate": 0.00042377972027972025, |
| "loss": 3.3187, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.720484594326985, |
| "grad_norm": 0.3526425361633301, |
| "learning_rate": 0.00042360489510489505, |
| "loss": 3.311, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.735045721940708, |
| "grad_norm": 0.3987361788749695, |
| "learning_rate": 0.0004234300699300699, |
| "loss": 3.327, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.749606849554429, |
| "grad_norm": 0.38629478216171265, |
| "learning_rate": 0.00042325524475524476, |
| "loss": 3.3143, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.764167977168151, |
| "grad_norm": 0.34681326150894165, |
| "learning_rate": 0.00042308041958041956, |
| "loss": 3.3089, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.778729104781874, |
| "grad_norm": 0.37358734011650085, |
| "learning_rate": 0.0004229055944055944, |
| "loss": 3.3309, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.793290232395597, |
| "grad_norm": 0.38487982749938965, |
| "learning_rate": 0.0004227307692307692, |
| "loss": 3.3146, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.80785136000932, |
| "grad_norm": 0.3684466779232025, |
| "learning_rate": 0.00042255594405594406, |
| "loss": 3.3102, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.822412487623042, |
| "grad_norm": 0.38600221276283264, |
| "learning_rate": 0.0004223811188811188, |
| "loss": 3.3167, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.836973615236763, |
| "grad_norm": 0.34845396876335144, |
| "learning_rate": 0.00042220629370629366, |
| "loss": 3.324, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.851534742850486, |
| "grad_norm": 0.35497453808784485, |
| "learning_rate": 0.00042203146853146846, |
| "loss": 3.3161, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.851534742850486, |
| "eval_accuracy": 0.37179644127494504, |
| "eval_loss": 3.544046401977539, |
| "eval_runtime": 81.9583, |
| "eval_samples_per_second": 203.042, |
| "eval_steps_per_second": 12.702, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.866095870464209, |
| "grad_norm": 0.3616420030593872, |
| "learning_rate": 0.0004218566433566433, |
| "loss": 3.3233, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.880656998077932, |
| "grad_norm": 0.35815533995628357, |
| "learning_rate": 0.0004216818181818181, |
| "loss": 3.3253, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.895218125691654, |
| "grad_norm": 0.3734304904937744, |
| "learning_rate": 0.00042150699300699297, |
| "loss": 3.3276, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.909779253305375, |
| "grad_norm": 0.38237103819847107, |
| "learning_rate": 0.00042133216783216777, |
| "loss": 3.3356, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.924340380919098, |
| "grad_norm": 0.3630594313144684, |
| "learning_rate": 0.0004211573426573426, |
| "loss": 3.3168, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.93890150853282, |
| "grad_norm": 0.37732306122779846, |
| "learning_rate": 0.0004209825174825175, |
| "loss": 3.311, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.953462636146543, |
| "grad_norm": 0.36326178908348083, |
| "learning_rate": 0.0004208076923076923, |
| "loss": 3.3203, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.968023763760266, |
| "grad_norm": 0.39929264783859253, |
| "learning_rate": 0.00042063286713286713, |
| "loss": 3.3329, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.982584891373987, |
| "grad_norm": 0.39931896328926086, |
| "learning_rate": 0.00042045804195804193, |
| "loss": 3.3378, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.99714601898771, |
| "grad_norm": 0.36345505714416504, |
| "learning_rate": 0.0004202832167832168, |
| "loss": 3.3238, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.011648902090977, |
| "grad_norm": 0.3538660407066345, |
| "learning_rate": 0.0004201083916083916, |
| "loss": 3.2263, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.0262100297047, |
| "grad_norm": 0.3989592492580414, |
| "learning_rate": 0.00041993356643356644, |
| "loss": 3.2213, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.040771157318423, |
| "grad_norm": 0.39716923236846924, |
| "learning_rate": 0.0004197587412587412, |
| "loss": 3.2164, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.055332284932145, |
| "grad_norm": 0.3658950924873352, |
| "learning_rate": 0.00041958391608391604, |
| "loss": 3.2189, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.069893412545868, |
| "grad_norm": 0.3497481346130371, |
| "learning_rate": 0.00041940909090909084, |
| "loss": 3.2407, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.084454540159589, |
| "grad_norm": 0.3735629618167877, |
| "learning_rate": 0.0004192342657342657, |
| "loss": 3.2415, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.099015667773312, |
| "grad_norm": 0.37101686000823975, |
| "learning_rate": 0.0004190594405594405, |
| "loss": 3.2323, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.113576795387035, |
| "grad_norm": 0.34967759251594543, |
| "learning_rate": 0.00041888461538461535, |
| "loss": 3.237, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.128137923000757, |
| "grad_norm": 0.3733450770378113, |
| "learning_rate": 0.00041870979020979015, |
| "loss": 3.2362, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.14269905061448, |
| "grad_norm": 0.3671697676181793, |
| "learning_rate": 0.000418534965034965, |
| "loss": 3.2441, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.14269905061448, |
| "eval_accuracy": 0.370729828044778, |
| "eval_loss": 3.559016704559326, |
| "eval_runtime": 82.0837, |
| "eval_samples_per_second": 202.732, |
| "eval_steps_per_second": 12.682, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.157260178228203, |
| "grad_norm": 0.3946242332458496, |
| "learning_rate": 0.00041836013986013985, |
| "loss": 3.2496, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.171821305841924, |
| "grad_norm": 0.4355080723762512, |
| "learning_rate": 0.00041818531468531465, |
| "loss": 3.2431, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.186382433455647, |
| "grad_norm": 0.35990259051322937, |
| "learning_rate": 0.0004180104895104895, |
| "loss": 3.2354, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.20094356106937, |
| "grad_norm": 0.372814416885376, |
| "learning_rate": 0.0004178356643356643, |
| "loss": 3.2487, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.215504688683092, |
| "grad_norm": 0.37970396876335144, |
| "learning_rate": 0.00041766083916083916, |
| "loss": 3.2365, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.230065816296815, |
| "grad_norm": 0.36887216567993164, |
| "learning_rate": 0.00041748601398601396, |
| "loss": 3.2574, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.244626943910536, |
| "grad_norm": 0.39955610036849976, |
| "learning_rate": 0.0004173111888111888, |
| "loss": 3.2559, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.259188071524258, |
| "grad_norm": 0.4258992075920105, |
| "learning_rate": 0.00041713636363636356, |
| "loss": 3.2657, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.273749199137981, |
| "grad_norm": 0.36497777700424194, |
| "learning_rate": 0.0004169615384615384, |
| "loss": 3.2653, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.288310326751704, |
| "grad_norm": 0.3772423565387726, |
| "learning_rate": 0.0004167867132867132, |
| "loss": 3.2642, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.302871454365427, |
| "grad_norm": 0.3884272873401642, |
| "learning_rate": 0.00041661188811188807, |
| "loss": 3.2572, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.317432581979148, |
| "grad_norm": 0.38749808073043823, |
| "learning_rate": 0.00041643706293706287, |
| "loss": 3.2778, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.33199370959287, |
| "grad_norm": 0.3408059775829315, |
| "learning_rate": 0.0004162622377622377, |
| "loss": 3.27, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.346554837206593, |
| "grad_norm": 0.3697678744792938, |
| "learning_rate": 0.0004160874125874126, |
| "loss": 3.2733, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.361115964820316, |
| "grad_norm": 0.37584030628204346, |
| "learning_rate": 0.0004159125874125874, |
| "loss": 3.2659, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.375677092434039, |
| "grad_norm": 0.4046936631202698, |
| "learning_rate": 0.00041573776223776223, |
| "loss": 3.2736, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.39023822004776, |
| "grad_norm": 0.370634526014328, |
| "learning_rate": 0.00041556293706293703, |
| "loss": 3.2716, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.404799347661482, |
| "grad_norm": 0.3945183753967285, |
| "learning_rate": 0.0004153881118881119, |
| "loss": 3.279, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.419360475275205, |
| "grad_norm": 0.4186868965625763, |
| "learning_rate": 0.0004152132867132867, |
| "loss": 3.2747, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.433921602888928, |
| "grad_norm": 0.39357316493988037, |
| "learning_rate": 0.00041503846153846154, |
| "loss": 3.2836, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.433921602888928, |
| "eval_accuracy": 0.37158958651509233, |
| "eval_loss": 3.5512025356292725, |
| "eval_runtime": 82.2768, |
| "eval_samples_per_second": 202.256, |
| "eval_steps_per_second": 12.652, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.44848273050265, |
| "grad_norm": 0.3643952012062073, |
| "learning_rate": 0.00041486363636363634, |
| "loss": 3.2939, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.463043858116373, |
| "grad_norm": 0.37709832191467285, |
| "learning_rate": 0.0004146888111888112, |
| "loss": 3.2696, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.477604985730094, |
| "grad_norm": 0.36169663071632385, |
| "learning_rate": 0.00041451398601398593, |
| "loss": 3.2997, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.492166113343817, |
| "grad_norm": 0.37795886397361755, |
| "learning_rate": 0.0004143391608391608, |
| "loss": 3.2934, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.50672724095754, |
| "grad_norm": 0.3691202402114868, |
| "learning_rate": 0.0004141643356643356, |
| "loss": 3.2848, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.521288368571263, |
| "grad_norm": 0.38011452555656433, |
| "learning_rate": 0.00041398951048951044, |
| "loss": 3.2948, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.535849496184985, |
| "grad_norm": 0.38378387689590454, |
| "learning_rate": 0.00041381468531468524, |
| "loss": 3.2915, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.550410623798706, |
| "grad_norm": 0.4115823805332184, |
| "learning_rate": 0.0004136398601398601, |
| "loss": 3.2869, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.564971751412429, |
| "grad_norm": 0.37096720933914185, |
| "learning_rate": 0.00041346503496503495, |
| "loss": 3.2908, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.579532879026152, |
| "grad_norm": 0.4138024151325226, |
| "learning_rate": 0.00041329020979020975, |
| "loss": 3.2937, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.594094006639875, |
| "grad_norm": 0.3554668128490448, |
| "learning_rate": 0.0004131153846153846, |
| "loss": 3.2849, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.608655134253597, |
| "grad_norm": 0.3484932780265808, |
| "learning_rate": 0.0004129405594405594, |
| "loss": 3.2877, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.623216261867318, |
| "grad_norm": 0.3929693102836609, |
| "learning_rate": 0.00041276573426573426, |
| "loss": 3.297, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.637777389481041, |
| "grad_norm": 0.3712870180606842, |
| "learning_rate": 0.00041259090909090906, |
| "loss": 3.3016, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.652338517094764, |
| "grad_norm": 0.3405837118625641, |
| "learning_rate": 0.0004124160839160839, |
| "loss": 3.3009, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.666899644708487, |
| "grad_norm": 0.3703879714012146, |
| "learning_rate": 0.0004122412587412587, |
| "loss": 3.2986, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.68146077232221, |
| "grad_norm": 0.3651980459690094, |
| "learning_rate": 0.00041206643356643356, |
| "loss": 3.2863, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.69602189993593, |
| "grad_norm": 0.3742126226425171, |
| "learning_rate": 0.0004118916083916083, |
| "loss": 3.2947, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.710583027549653, |
| "grad_norm": 0.3682291507720947, |
| "learning_rate": 0.00041171678321678316, |
| "loss": 3.3101, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.725144155163376, |
| "grad_norm": 0.391160249710083, |
| "learning_rate": 0.00041154195804195796, |
| "loss": 3.3039, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.725144155163376, |
| "eval_accuracy": 0.3720695036697022, |
| "eval_loss": 3.542681932449341, |
| "eval_runtime": 82.093, |
| "eval_samples_per_second": 202.709, |
| "eval_steps_per_second": 12.681, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.739705282777098, |
| "grad_norm": 0.3886108696460724, |
| "learning_rate": 0.0004113671328671328, |
| "loss": 3.2945, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.754266410390821, |
| "grad_norm": 0.3552185297012329, |
| "learning_rate": 0.00041119230769230767, |
| "loss": 3.2925, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.768827538004544, |
| "grad_norm": 0.3652881979942322, |
| "learning_rate": 0.00041101748251748247, |
| "loss": 3.3035, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.783388665618265, |
| "grad_norm": 0.3634861707687378, |
| "learning_rate": 0.0004108426573426573, |
| "loss": 3.303, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.797949793231988, |
| "grad_norm": 0.362498939037323, |
| "learning_rate": 0.0004106678321678321, |
| "loss": 3.3041, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.81251092084571, |
| "grad_norm": 0.3781660497188568, |
| "learning_rate": 0.000410493006993007, |
| "loss": 3.3, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.827072048459433, |
| "grad_norm": 0.36942511796951294, |
| "learning_rate": 0.0004103181818181818, |
| "loss": 3.3095, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.841633176073156, |
| "grad_norm": 0.3828555941581726, |
| "learning_rate": 0.00041014335664335663, |
| "loss": 3.2971, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.856194303686877, |
| "grad_norm": 0.36651521921157837, |
| "learning_rate": 0.00040996853146853143, |
| "loss": 3.3086, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.8707554313006, |
| "grad_norm": 0.34819480776786804, |
| "learning_rate": 0.0004097937062937063, |
| "loss": 3.3052, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.885316558914322, |
| "grad_norm": 0.36927011609077454, |
| "learning_rate": 0.0004096188811188811, |
| "loss": 3.3052, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.899877686528045, |
| "grad_norm": 0.3449710011482239, |
| "learning_rate": 0.00040944405594405594, |
| "loss": 3.3137, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.914438814141768, |
| "grad_norm": 0.3505341112613678, |
| "learning_rate": 0.0004092692307692307, |
| "loss": 3.3103, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.928999941755489, |
| "grad_norm": 0.3776344954967499, |
| "learning_rate": 0.00040909440559440554, |
| "loss": 3.3051, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.943561069369212, |
| "grad_norm": 0.37436896562576294, |
| "learning_rate": 0.00040891958041958034, |
| "loss": 3.3303, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.958122196982934, |
| "grad_norm": 0.37014177441596985, |
| "learning_rate": 0.0004087447552447552, |
| "loss": 3.3041, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.972683324596657, |
| "grad_norm": 0.3878595232963562, |
| "learning_rate": 0.00040856993006993005, |
| "loss": 3.3204, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.98724445221038, |
| "grad_norm": 0.4234336316585541, |
| "learning_rate": 0.00040839510489510485, |
| "loss": 3.3135, |
| "step": 54900 |
| }, |
| { |
| "epoch": 16.001747335313645, |
| "grad_norm": 0.40520554780960083, |
| "learning_rate": 0.0004082202797202797, |
| "loss": 3.2956, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.01630846292737, |
| "grad_norm": 0.36294302344322205, |
| "learning_rate": 0.0004080454545454545, |
| "loss": 3.1908, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.01630846292737, |
| "eval_accuracy": 0.3716135764929263, |
| "eval_loss": 3.553448438644409, |
| "eval_runtime": 82.1793, |
| "eval_samples_per_second": 202.496, |
| "eval_steps_per_second": 12.667, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.03086959054109, |
| "grad_norm": 0.3586152493953705, |
| "learning_rate": 0.00040787062937062935, |
| "loss": 3.2095, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.045430718154815, |
| "grad_norm": 0.3570796251296997, |
| "learning_rate": 0.00040769580419580415, |
| "loss": 3.2067, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.059991845768536, |
| "grad_norm": 0.36861592531204224, |
| "learning_rate": 0.000407520979020979, |
| "loss": 3.2165, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.074552973382257, |
| "grad_norm": 0.3648891746997833, |
| "learning_rate": 0.0004073461538461538, |
| "loss": 3.2031, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.08911410099598, |
| "grad_norm": 0.368233859539032, |
| "learning_rate": 0.00040717132867132866, |
| "loss": 3.2215, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.103675228609703, |
| "grad_norm": 0.3576681911945343, |
| "learning_rate": 0.00040699650349650346, |
| "loss": 3.2127, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.118236356223427, |
| "grad_norm": 0.3752358555793762, |
| "learning_rate": 0.0004068216783216783, |
| "loss": 3.2318, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.132797483837148, |
| "grad_norm": 0.37468796968460083, |
| "learning_rate": 0.00040664685314685306, |
| "loss": 3.2249, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.14735861145087, |
| "grad_norm": 0.40062421560287476, |
| "learning_rate": 0.0004064720279720279, |
| "loss": 3.2248, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.161919739064594, |
| "grad_norm": 0.357378751039505, |
| "learning_rate": 0.00040629720279720277, |
| "loss": 3.2358, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.176480866678315, |
| "grad_norm": 0.38645318150520325, |
| "learning_rate": 0.00040612237762237757, |
| "loss": 3.2301, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.19104199429204, |
| "grad_norm": 0.38812294602394104, |
| "learning_rate": 0.0004059475524475524, |
| "loss": 3.2504, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.20560312190576, |
| "grad_norm": 0.41834986209869385, |
| "learning_rate": 0.0004057727272727272, |
| "loss": 3.232, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.22016424951948, |
| "grad_norm": 0.3931691348552704, |
| "learning_rate": 0.0004055979020979021, |
| "loss": 3.2408, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.234725377133206, |
| "grad_norm": 0.35789263248443604, |
| "learning_rate": 0.0004054230769230769, |
| "loss": 3.2335, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.249286504746927, |
| "grad_norm": 0.3876798748970032, |
| "learning_rate": 0.00040524825174825173, |
| "loss": 3.2497, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.26384763236065, |
| "grad_norm": 0.3514769971370697, |
| "learning_rate": 0.00040507342657342653, |
| "loss": 3.2556, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.278408759974372, |
| "grad_norm": 0.3832399845123291, |
| "learning_rate": 0.0004048986013986014, |
| "loss": 3.256, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.292969887588093, |
| "grad_norm": 0.4378623366355896, |
| "learning_rate": 0.0004047237762237762, |
| "loss": 3.2511, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.307531015201818, |
| "grad_norm": 0.4016898572444916, |
| "learning_rate": 0.00040454895104895104, |
| "loss": 3.2603, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.307531015201818, |
| "eval_accuracy": 0.3716584989023997, |
| "eval_loss": 3.553546905517578, |
| "eval_runtime": 81.9045, |
| "eval_samples_per_second": 203.176, |
| "eval_steps_per_second": 12.71, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.32209214281554, |
| "grad_norm": 0.3478069007396698, |
| "learning_rate": 0.00040437412587412583, |
| "loss": 3.2549, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.336653270429263, |
| "grad_norm": 0.3654768168926239, |
| "learning_rate": 0.0004041993006993007, |
| "loss": 3.2586, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.351214398042984, |
| "grad_norm": 0.39851120114326477, |
| "learning_rate": 0.00040402447552447554, |
| "loss": 3.2633, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.36577552565671, |
| "grad_norm": 0.3615724444389343, |
| "learning_rate": 0.0004038496503496503, |
| "loss": 3.2638, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.38033665327043, |
| "grad_norm": 0.3678106665611267, |
| "learning_rate": 0.00040367482517482514, |
| "loss": 3.2649, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.39489778088415, |
| "grad_norm": 0.38318178057670593, |
| "learning_rate": 0.00040349999999999994, |
| "loss": 3.2598, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.409458908497875, |
| "grad_norm": 0.386793315410614, |
| "learning_rate": 0.0004033251748251748, |
| "loss": 3.2656, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.424020036111596, |
| "grad_norm": 0.3788229823112488, |
| "learning_rate": 0.0004031503496503496, |
| "loss": 3.2816, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.43858116372532, |
| "grad_norm": 0.3787629306316376, |
| "learning_rate": 0.00040297552447552445, |
| "loss": 3.2731, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.45314229133904, |
| "grad_norm": 0.386405348777771, |
| "learning_rate": 0.00040280069930069925, |
| "loss": 3.2659, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.467703418952762, |
| "grad_norm": 0.35036298632621765, |
| "learning_rate": 0.0004026258741258741, |
| "loss": 3.2642, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.482264546566487, |
| "grad_norm": 0.3653537631034851, |
| "learning_rate": 0.0004024510489510489, |
| "loss": 3.268, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.496825674180208, |
| "grad_norm": 0.3731655180454254, |
| "learning_rate": 0.00040227622377622376, |
| "loss": 3.2557, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.511386801793932, |
| "grad_norm": 0.3894757330417633, |
| "learning_rate": 0.00040210139860139856, |
| "loss": 3.2713, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.525947929407653, |
| "grad_norm": 0.3940126299858093, |
| "learning_rate": 0.0004019265734265734, |
| "loss": 3.2799, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.540509057021374, |
| "grad_norm": 0.36855053901672363, |
| "learning_rate": 0.0004017517482517482, |
| "loss": 3.2792, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.5550701846351, |
| "grad_norm": 0.3890272378921509, |
| "learning_rate": 0.00040157692307692306, |
| "loss": 3.2718, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.56963131224882, |
| "grad_norm": 0.37038499116897583, |
| "learning_rate": 0.0004014020979020979, |
| "loss": 3.2813, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.584192439862544, |
| "grad_norm": 0.3991513252258301, |
| "learning_rate": 0.00040122727272727266, |
| "loss": 3.2658, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.598753567476265, |
| "grad_norm": 0.36486366391181946, |
| "learning_rate": 0.0004010524475524475, |
| "loss": 3.2797, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.598753567476265, |
| "eval_accuracy": 0.37219039434231654, |
| "eval_loss": 3.546515941619873, |
| "eval_runtime": 81.9818, |
| "eval_samples_per_second": 202.984, |
| "eval_steps_per_second": 12.698, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.613314695089986, |
| "grad_norm": 0.37861695885658264, |
| "learning_rate": 0.0004008776223776223, |
| "loss": 3.2849, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.62787582270371, |
| "grad_norm": 0.3594093918800354, |
| "learning_rate": 0.00040070279720279717, |
| "loss": 3.2713, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.642436950317432, |
| "grad_norm": 0.39628636837005615, |
| "learning_rate": 0.00040052797202797197, |
| "loss": 3.2835, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.656998077931156, |
| "grad_norm": 0.3658110201358795, |
| "learning_rate": 0.0004003531468531468, |
| "loss": 3.2836, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.671559205544877, |
| "grad_norm": 0.37391743063926697, |
| "learning_rate": 0.0004001783216783216, |
| "loss": 3.2912, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.6861203331586, |
| "grad_norm": 0.3740245997905731, |
| "learning_rate": 0.0004000034965034965, |
| "loss": 3.2767, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.700681460772323, |
| "grad_norm": 0.3966841399669647, |
| "learning_rate": 0.0003998286713286713, |
| "loss": 3.2959, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.715242588386044, |
| "grad_norm": 0.3855196237564087, |
| "learning_rate": 0.00039965384615384613, |
| "loss": 3.2852, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.72980371599977, |
| "grad_norm": 0.3950461447238922, |
| "learning_rate": 0.00039947902097902093, |
| "loss": 3.2924, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.74436484361349, |
| "grad_norm": 0.37199780344963074, |
| "learning_rate": 0.0003993041958041958, |
| "loss": 3.2766, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.75892597122721, |
| "grad_norm": 0.36921125650405884, |
| "learning_rate": 0.00039912937062937064, |
| "loss": 3.2957, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.773487098840935, |
| "grad_norm": 0.35316649079322815, |
| "learning_rate": 0.00039895454545454544, |
| "loss": 3.285, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.788048226454656, |
| "grad_norm": 0.3791947364807129, |
| "learning_rate": 0.0003987797202797203, |
| "loss": 3.2869, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.80260935406838, |
| "grad_norm": 0.37145963311195374, |
| "learning_rate": 0.00039860489510489504, |
| "loss": 3.2977, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.8171704816821, |
| "grad_norm": 0.37916064262390137, |
| "learning_rate": 0.0003984300699300699, |
| "loss": 3.288, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.831731609295822, |
| "grad_norm": 0.3936197757720947, |
| "learning_rate": 0.0003982552447552447, |
| "loss": 3.2952, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.846292736909547, |
| "grad_norm": 0.37863749265670776, |
| "learning_rate": 0.00039808041958041955, |
| "loss": 3.2882, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.860853864523268, |
| "grad_norm": 0.3748323321342468, |
| "learning_rate": 0.00039790559440559435, |
| "loss": 3.2962, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.875414992136992, |
| "grad_norm": 0.37044841051101685, |
| "learning_rate": 0.0003977307692307692, |
| "loss": 3.2824, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.889976119750713, |
| "grad_norm": 0.3849230408668518, |
| "learning_rate": 0.000397555944055944, |
| "loss": 3.3015, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.889976119750713, |
| "eval_accuracy": 0.37276403704758165, |
| "eval_loss": 3.5398917198181152, |
| "eval_runtime": 82.1159, |
| "eval_samples_per_second": 202.652, |
| "eval_steps_per_second": 12.677, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.904537247364434, |
| "grad_norm": 0.3697381615638733, |
| "learning_rate": 0.00039738111888111885, |
| "loss": 3.2983, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.91909837497816, |
| "grad_norm": 0.3599628508090973, |
| "learning_rate": 0.00039720629370629365, |
| "loss": 3.2954, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.93365950259188, |
| "grad_norm": 0.36210155487060547, |
| "learning_rate": 0.0003970314685314685, |
| "loss": 3.3027, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.948220630205604, |
| "grad_norm": 0.3860971927642822, |
| "learning_rate": 0.0003968566433566433, |
| "loss": 3.3103, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.962781757819325, |
| "grad_norm": 0.3646826446056366, |
| "learning_rate": 0.00039668181818181816, |
| "loss": 3.2892, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.977342885433046, |
| "grad_norm": 0.36923229694366455, |
| "learning_rate": 0.000396506993006993, |
| "loss": 3.3038, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.99190401304677, |
| "grad_norm": 0.36637458205223083, |
| "learning_rate": 0.0003963321678321678, |
| "loss": 3.2982, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.006406896150036, |
| "grad_norm": 0.4128393530845642, |
| "learning_rate": 0.00039615734265734267, |
| "loss": 3.2421, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.02096802376376, |
| "grad_norm": 0.3423386514186859, |
| "learning_rate": 0.0003959825174825174, |
| "loss": 3.1918, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.03552915137748, |
| "grad_norm": 0.3761656880378723, |
| "learning_rate": 0.00039580769230769227, |
| "loss": 3.1872, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.050090278991206, |
| "grad_norm": 0.37738358974456787, |
| "learning_rate": 0.00039563286713286707, |
| "loss": 3.2038, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.064651406604927, |
| "grad_norm": 0.3634432554244995, |
| "learning_rate": 0.0003954580419580419, |
| "loss": 3.2074, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.07921253421865, |
| "grad_norm": 0.3546642065048218, |
| "learning_rate": 0.0003952832167832167, |
| "loss": 3.2159, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.093773661832373, |
| "grad_norm": 0.3801240026950836, |
| "learning_rate": 0.0003951083916083916, |
| "loss": 3.2093, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.108334789446094, |
| "grad_norm": 0.3783891499042511, |
| "learning_rate": 0.0003949335664335664, |
| "loss": 3.2092, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.122895917059818, |
| "grad_norm": 0.37025707960128784, |
| "learning_rate": 0.00039475874125874123, |
| "loss": 3.2178, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.13745704467354, |
| "grad_norm": 0.40171951055526733, |
| "learning_rate": 0.00039458391608391603, |
| "loss": 3.2049, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.152018172287264, |
| "grad_norm": 0.38419309258461, |
| "learning_rate": 0.0003944090909090909, |
| "loss": 3.2096, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.166579299900985, |
| "grad_norm": 0.39173948764801025, |
| "learning_rate": 0.00039423426573426573, |
| "loss": 3.23, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.181140427514705, |
| "grad_norm": 0.3768489956855774, |
| "learning_rate": 0.00039405944055944053, |
| "loss": 3.214, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.181140427514705, |
| "eval_accuracy": 0.37176563061713863, |
| "eval_loss": 3.557431697845459, |
| "eval_runtime": 82.13, |
| "eval_samples_per_second": 202.618, |
| "eval_steps_per_second": 12.675, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.19570155512843, |
| "grad_norm": 0.3854677677154541, |
| "learning_rate": 0.0003938846153846154, |
| "loss": 3.2259, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.21026268274215, |
| "grad_norm": 0.3884560763835907, |
| "learning_rate": 0.0003937097902097902, |
| "loss": 3.2248, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.224823810355876, |
| "grad_norm": 0.36343252658843994, |
| "learning_rate": 0.00039353496503496504, |
| "loss": 3.2208, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.239384937969596, |
| "grad_norm": 0.4002389907836914, |
| "learning_rate": 0.0003933601398601398, |
| "loss": 3.2356, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.253946065583317, |
| "grad_norm": 0.38611212372779846, |
| "learning_rate": 0.00039318531468531464, |
| "loss": 3.2239, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.268507193197042, |
| "grad_norm": 0.36958229541778564, |
| "learning_rate": 0.00039301048951048944, |
| "loss": 3.2299, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.283068320810763, |
| "grad_norm": 0.37455496191978455, |
| "learning_rate": 0.0003928356643356643, |
| "loss": 3.2312, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.297629448424487, |
| "grad_norm": 0.39618027210235596, |
| "learning_rate": 0.0003926608391608391, |
| "loss": 3.2361, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.31219057603821, |
| "grad_norm": 0.384206086397171, |
| "learning_rate": 0.00039248601398601395, |
| "loss": 3.2419, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.32675170365193, |
| "grad_norm": 0.4317259192466736, |
| "learning_rate": 0.00039231118881118875, |
| "loss": 3.2366, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.341312831265654, |
| "grad_norm": 0.38545721769332886, |
| "learning_rate": 0.0003921363636363636, |
| "loss": 3.2508, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.355873958879375, |
| "grad_norm": 0.39435115456581116, |
| "learning_rate": 0.00039196153846153846, |
| "loss": 3.2668, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.3704350864931, |
| "grad_norm": 0.3884045481681824, |
| "learning_rate": 0.00039178671328671326, |
| "loss": 3.2531, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.38499621410682, |
| "grad_norm": 0.38310301303863525, |
| "learning_rate": 0.0003916118881118881, |
| "loss": 3.2465, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.39955734172054, |
| "grad_norm": 0.3826778829097748, |
| "learning_rate": 0.0003914370629370629, |
| "loss": 3.2506, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.414118469334266, |
| "grad_norm": 0.3634958863258362, |
| "learning_rate": 0.00039126223776223776, |
| "loss": 3.2571, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.428679596947987, |
| "grad_norm": 0.40879565477371216, |
| "learning_rate": 0.00039108741258741256, |
| "loss": 3.2514, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.44324072456171, |
| "grad_norm": 0.3600159287452698, |
| "learning_rate": 0.0003909125874125874, |
| "loss": 3.2683, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.457801852175432, |
| "grad_norm": 0.38907891511917114, |
| "learning_rate": 0.00039073776223776216, |
| "loss": 3.2612, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.472362979789153, |
| "grad_norm": 0.3830939531326294, |
| "learning_rate": 0.000390562937062937, |
| "loss": 3.2536, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.472362979789153, |
| "eval_accuracy": 0.3723162241280143, |
| "eval_loss": 3.5466294288635254, |
| "eval_runtime": 82.0532, |
| "eval_samples_per_second": 202.807, |
| "eval_steps_per_second": 12.687, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.486924107402878, |
| "grad_norm": 0.36161839962005615, |
| "learning_rate": 0.0003903881118881118, |
| "loss": 3.2642, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.5014852350166, |
| "grad_norm": 0.3771781027317047, |
| "learning_rate": 0.00039021328671328667, |
| "loss": 3.2734, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.516046362630323, |
| "grad_norm": 0.3700454831123352, |
| "learning_rate": 0.00039003846153846147, |
| "loss": 3.2534, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.530607490244044, |
| "grad_norm": 0.38430526852607727, |
| "learning_rate": 0.0003898636363636363, |
| "loss": 3.2665, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.545168617857765, |
| "grad_norm": 0.38378286361694336, |
| "learning_rate": 0.0003896888111888111, |
| "loss": 3.2486, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.55972974547149, |
| "grad_norm": 0.37801867723464966, |
| "learning_rate": 0.000389513986013986, |
| "loss": 3.2626, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.57429087308521, |
| "grad_norm": 0.4010647237300873, |
| "learning_rate": 0.00038933916083916083, |
| "loss": 3.2615, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.588852000698935, |
| "grad_norm": 0.3832377791404724, |
| "learning_rate": 0.00038916433566433563, |
| "loss": 3.2744, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.603413128312656, |
| "grad_norm": 0.37959593534469604, |
| "learning_rate": 0.0003889895104895105, |
| "loss": 3.2601, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.617974255926377, |
| "grad_norm": 0.38724425435066223, |
| "learning_rate": 0.0003888146853146853, |
| "loss": 3.2527, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.6325353835401, |
| "grad_norm": 0.3717820942401886, |
| "learning_rate": 0.00038863986013986014, |
| "loss": 3.2607, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.647096511153823, |
| "grad_norm": 0.37195321917533875, |
| "learning_rate": 0.00038846503496503494, |
| "loss": 3.2763, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.661657638767547, |
| "grad_norm": 0.37872201204299927, |
| "learning_rate": 0.0003882902097902098, |
| "loss": 3.2789, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.676218766381268, |
| "grad_norm": 0.3744581937789917, |
| "learning_rate": 0.00038811538461538454, |
| "loss": 3.2796, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.690779893994993, |
| "grad_norm": 0.35313180088996887, |
| "learning_rate": 0.0003879405594405594, |
| "loss": 3.2604, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.705341021608714, |
| "grad_norm": 0.3678748905658722, |
| "learning_rate": 0.0003877657342657342, |
| "loss": 3.2732, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.719902149222435, |
| "grad_norm": 0.4213277995586395, |
| "learning_rate": 0.00038759090909090905, |
| "loss": 3.2759, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.73446327683616, |
| "grad_norm": 0.3885853886604309, |
| "learning_rate": 0.00038741608391608384, |
| "loss": 3.2748, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.74902440444988, |
| "grad_norm": 0.3901517987251282, |
| "learning_rate": 0.0003872412587412587, |
| "loss": 3.2815, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.763585532063605, |
| "grad_norm": 0.36514630913734436, |
| "learning_rate": 0.00038706643356643355, |
| "loss": 3.2649, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.763585532063605, |
| "eval_accuracy": 0.37262691786055024, |
| "eval_loss": 3.5422403812408447, |
| "eval_runtime": 81.9201, |
| "eval_samples_per_second": 203.137, |
| "eval_steps_per_second": 12.708, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.778146659677326, |
| "grad_norm": 0.3570280969142914, |
| "learning_rate": 0.00038689160839160835, |
| "loss": 3.279, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.792707787291047, |
| "grad_norm": 0.3739185631275177, |
| "learning_rate": 0.0003867167832167832, |
| "loss": 3.2988, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.80726891490477, |
| "grad_norm": 0.3826848566532135, |
| "learning_rate": 0.000386541958041958, |
| "loss": 3.2691, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.821830042518492, |
| "grad_norm": 0.3982725739479065, |
| "learning_rate": 0.00038636713286713286, |
| "loss": 3.2796, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.836391170132217, |
| "grad_norm": 0.3579130470752716, |
| "learning_rate": 0.00038619230769230766, |
| "loss": 3.2799, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.850952297745938, |
| "grad_norm": 0.3596711754798889, |
| "learning_rate": 0.0003860174825174825, |
| "loss": 3.278, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.86551342535966, |
| "grad_norm": 0.3691098093986511, |
| "learning_rate": 0.0003858426573426573, |
| "loss": 3.2899, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.880074552973383, |
| "grad_norm": 0.40516984462738037, |
| "learning_rate": 0.00038566783216783217, |
| "loss": 3.2849, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.894635680587104, |
| "grad_norm": 0.36803507804870605, |
| "learning_rate": 0.0003854930069930069, |
| "loss": 3.2799, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.90919680820083, |
| "grad_norm": 0.35957422852516174, |
| "learning_rate": 0.00038531818181818177, |
| "loss": 3.2936, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.92375793581455, |
| "grad_norm": 0.38461193442344666, |
| "learning_rate": 0.00038514335664335657, |
| "loss": 3.2826, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.93831906342827, |
| "grad_norm": 0.4032984673976898, |
| "learning_rate": 0.0003849685314685314, |
| "loss": 3.295, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.952880191041995, |
| "grad_norm": 0.41971707344055176, |
| "learning_rate": 0.0003847937062937062, |
| "loss": 3.2784, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.967441318655716, |
| "grad_norm": 0.37677469849586487, |
| "learning_rate": 0.0003846188811188811, |
| "loss": 3.2803, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.98200244626944, |
| "grad_norm": 0.3791235089302063, |
| "learning_rate": 0.00038444405594405593, |
| "loss": 3.2823, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.99656357388316, |
| "grad_norm": 0.37751075625419617, |
| "learning_rate": 0.00038426923076923073, |
| "loss": 3.2816, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.01106645698643, |
| "grad_norm": 0.3732227385044098, |
| "learning_rate": 0.0003840944055944056, |
| "loss": 3.2002, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.02562758460015, |
| "grad_norm": 0.37563759088516235, |
| "learning_rate": 0.0003839195804195804, |
| "loss": 3.18, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.040188712213872, |
| "grad_norm": 0.3778409957885742, |
| "learning_rate": 0.00038374475524475523, |
| "loss": 3.1928, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.054749839827597, |
| "grad_norm": 0.3905414938926697, |
| "learning_rate": 0.00038356993006993003, |
| "loss": 3.1855, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.054749839827597, |
| "eval_accuracy": 0.3721660515706909, |
| "eval_loss": 3.5522522926330566, |
| "eval_runtime": 82.1887, |
| "eval_samples_per_second": 202.473, |
| "eval_steps_per_second": 12.666, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.069310967441318, |
| "grad_norm": 0.36873266100883484, |
| "learning_rate": 0.0003833951048951049, |
| "loss": 3.2067, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.083872095055042, |
| "grad_norm": 0.4099154770374298, |
| "learning_rate": 0.0003832202797202797, |
| "loss": 3.1974, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.098433222668763, |
| "grad_norm": 0.3663903772830963, |
| "learning_rate": 0.00038304545454545454, |
| "loss": 3.1992, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.112994350282484, |
| "grad_norm": 0.36760830879211426, |
| "learning_rate": 0.0003828706293706293, |
| "loss": 3.2036, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.12755547789621, |
| "grad_norm": 0.38908717036247253, |
| "learning_rate": 0.00038269580419580414, |
| "loss": 3.1956, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.14211660550993, |
| "grad_norm": 0.39743584394454956, |
| "learning_rate": 0.00038252097902097894, |
| "loss": 3.214, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.156677733123654, |
| "grad_norm": 0.3820717930793762, |
| "learning_rate": 0.0003823461538461538, |
| "loss": 3.2131, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.171238860737375, |
| "grad_norm": 0.4144818186759949, |
| "learning_rate": 0.00038217132867132865, |
| "loss": 3.2005, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.185799988351096, |
| "grad_norm": 0.38267946243286133, |
| "learning_rate": 0.00038199650349650345, |
| "loss": 3.2128, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.20036111596482, |
| "grad_norm": 0.35174643993377686, |
| "learning_rate": 0.0003818216783216783, |
| "loss": 3.2255, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.214922243578542, |
| "grad_norm": 0.3899083137512207, |
| "learning_rate": 0.0003816468531468531, |
| "loss": 3.2175, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.229483371192266, |
| "grad_norm": 0.40650811791419983, |
| "learning_rate": 0.00038147202797202796, |
| "loss": 3.2096, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.244044498805987, |
| "grad_norm": 0.4108577072620392, |
| "learning_rate": 0.00038129720279720276, |
| "loss": 3.2321, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.25860562641971, |
| "grad_norm": 0.3795531392097473, |
| "learning_rate": 0.0003811223776223776, |
| "loss": 3.2267, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.273166754033433, |
| "grad_norm": 0.3712756335735321, |
| "learning_rate": 0.0003809475524475524, |
| "loss": 3.2202, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.287727881647154, |
| "grad_norm": 0.3941921889781952, |
| "learning_rate": 0.00038077272727272726, |
| "loss": 3.2264, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.30228900926088, |
| "grad_norm": 0.39660295844078064, |
| "learning_rate": 0.00038059790209790206, |
| "loss": 3.2276, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.3168501368746, |
| "grad_norm": 0.38824042677879333, |
| "learning_rate": 0.0003804230769230769, |
| "loss": 3.2339, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.33141126448832, |
| "grad_norm": 0.38477823138237, |
| "learning_rate": 0.00038024825174825166, |
| "loss": 3.2352, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.345972392102045, |
| "grad_norm": 0.38246315717697144, |
| "learning_rate": 0.0003800734265734265, |
| "loss": 3.234, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.345972392102045, |
| "eval_accuracy": 0.3723918395973635, |
| "eval_loss": 3.550502300262451, |
| "eval_runtime": 82.1614, |
| "eval_samples_per_second": 202.54, |
| "eval_steps_per_second": 12.67, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.360533519715766, |
| "grad_norm": 0.398716539144516, |
| "learning_rate": 0.0003798986013986013, |
| "loss": 3.2288, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.37509464732949, |
| "grad_norm": 0.3816038966178894, |
| "learning_rate": 0.00037972377622377617, |
| "loss": 3.2397, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.38965577494321, |
| "grad_norm": 0.3807135820388794, |
| "learning_rate": 0.000379548951048951, |
| "loss": 3.2359, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.404216902556932, |
| "grad_norm": 0.42097538709640503, |
| "learning_rate": 0.0003793741258741258, |
| "loss": 3.2442, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.418778030170657, |
| "grad_norm": 0.36351248621940613, |
| "learning_rate": 0.0003791993006993007, |
| "loss": 3.2386, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.433339157784378, |
| "grad_norm": 0.39638498425483704, |
| "learning_rate": 0.0003790244755244755, |
| "loss": 3.2304, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.447900285398102, |
| "grad_norm": 0.40697968006134033, |
| "learning_rate": 0.00037884965034965033, |
| "loss": 3.2419, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.462461413011823, |
| "grad_norm": 0.37197327613830566, |
| "learning_rate": 0.00037867482517482513, |
| "loss": 3.2494, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.477022540625548, |
| "grad_norm": 0.3793451488018036, |
| "learning_rate": 0.0003785, |
| "loss": 3.2435, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.49158366823927, |
| "grad_norm": 0.3888586461544037, |
| "learning_rate": 0.0003783251748251748, |
| "loss": 3.2408, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.50614479585299, |
| "grad_norm": 0.3695577085018158, |
| "learning_rate": 0.00037815034965034964, |
| "loss": 3.2423, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.520705923466714, |
| "grad_norm": 0.3714613914489746, |
| "learning_rate": 0.00037797552447552444, |
| "loss": 3.2457, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.535267051080435, |
| "grad_norm": 0.40346500277519226, |
| "learning_rate": 0.0003778006993006993, |
| "loss": 3.2493, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.54982817869416, |
| "grad_norm": 0.3811418116092682, |
| "learning_rate": 0.00037762587412587404, |
| "loss": 3.2519, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.56438930630788, |
| "grad_norm": 0.3739699721336365, |
| "learning_rate": 0.0003774510489510489, |
| "loss": 3.2594, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.5789504339216, |
| "grad_norm": 0.3953077793121338, |
| "learning_rate": 0.0003772762237762238, |
| "loss": 3.2622, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.593511561535326, |
| "grad_norm": 0.4199087917804718, |
| "learning_rate": 0.00037710139860139854, |
| "loss": 3.2465, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.608072689149047, |
| "grad_norm": 0.3744155168533325, |
| "learning_rate": 0.0003769265734265734, |
| "loss": 3.2549, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.62263381676277, |
| "grad_norm": 0.3768845200538635, |
| "learning_rate": 0.0003767517482517482, |
| "loss": 3.2581, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.637194944376493, |
| "grad_norm": 0.38198384642601013, |
| "learning_rate": 0.00037657692307692305, |
| "loss": 3.2567, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.637194944376493, |
| "eval_accuracy": 0.3728074306839578, |
| "eval_loss": 3.543090581893921, |
| "eval_runtime": 82.1012, |
| "eval_samples_per_second": 202.689, |
| "eval_steps_per_second": 12.679, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.651756071990214, |
| "grad_norm": 0.41056832671165466, |
| "learning_rate": 0.00037640209790209785, |
| "loss": 3.2563, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.666317199603938, |
| "grad_norm": 0.3860456943511963, |
| "learning_rate": 0.0003762272727272727, |
| "loss": 3.261, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.68087832721766, |
| "grad_norm": 0.3985314667224884, |
| "learning_rate": 0.0003760524475524475, |
| "loss": 3.2564, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.695439454831384, |
| "grad_norm": 0.36221933364868164, |
| "learning_rate": 0.00037587762237762236, |
| "loss": 3.2605, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.710000582445105, |
| "grad_norm": 0.3958105742931366, |
| "learning_rate": 0.00037570279720279716, |
| "loss": 3.2581, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.724561710058826, |
| "grad_norm": 0.42070651054382324, |
| "learning_rate": 0.000375527972027972, |
| "loss": 3.2537, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.73912283767255, |
| "grad_norm": 0.37500694394111633, |
| "learning_rate": 0.0003753531468531468, |
| "loss": 3.2631, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.75368396528627, |
| "grad_norm": 0.37421441078186035, |
| "learning_rate": 0.00037517832167832167, |
| "loss": 3.2611, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.768245092899996, |
| "grad_norm": 0.3694532513618469, |
| "learning_rate": 0.0003750034965034965, |
| "loss": 3.2722, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.782806220513717, |
| "grad_norm": 0.35953474044799805, |
| "learning_rate": 0.00037482867132867127, |
| "loss": 3.2664, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.797367348127437, |
| "grad_norm": 0.37228283286094666, |
| "learning_rate": 0.0003746538461538462, |
| "loss": 3.2691, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.811928475741162, |
| "grad_norm": 0.37105363607406616, |
| "learning_rate": 0.0003744790209790209, |
| "loss": 3.2592, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.826489603354883, |
| "grad_norm": 0.3755386471748352, |
| "learning_rate": 0.0003743041958041958, |
| "loss": 3.2603, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.841050730968607, |
| "grad_norm": 0.4071175158023834, |
| "learning_rate": 0.0003741293706293706, |
| "loss": 3.2752, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.85561185858233, |
| "grad_norm": 0.3656890094280243, |
| "learning_rate": 0.0003739545454545454, |
| "loss": 3.2604, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.87017298619605, |
| "grad_norm": 0.3718837797641754, |
| "learning_rate": 0.0003737797202797202, |
| "loss": 3.2736, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.884734113809774, |
| "grad_norm": 0.3790704607963562, |
| "learning_rate": 0.0003736048951048951, |
| "loss": 3.2799, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.899295241423495, |
| "grad_norm": 0.39467573165893555, |
| "learning_rate": 0.0003734300699300699, |
| "loss": 3.2611, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.91385636903722, |
| "grad_norm": 0.3760416507720947, |
| "learning_rate": 0.00037325524475524473, |
| "loss": 3.2776, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.92841749665094, |
| "grad_norm": 0.37934595346450806, |
| "learning_rate": 0.00037308041958041953, |
| "loss": 3.2589, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.92841749665094, |
| "eval_accuracy": 0.3735114894942125, |
| "eval_loss": 3.5340983867645264, |
| "eval_runtime": 81.9584, |
| "eval_samples_per_second": 203.042, |
| "eval_steps_per_second": 12.702, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.94297862426466, |
| "grad_norm": 0.3777463436126709, |
| "learning_rate": 0.0003729055944055944, |
| "loss": 3.2646, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.957539751878386, |
| "grad_norm": 0.36563143134117126, |
| "learning_rate": 0.0003727307692307692, |
| "loss": 3.2794, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.972100879492107, |
| "grad_norm": 0.36497148871421814, |
| "learning_rate": 0.00037255594405594404, |
| "loss": 3.2786, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.98666200710583, |
| "grad_norm": 0.3956014811992645, |
| "learning_rate": 0.0003723811188811189, |
| "loss": 3.2765, |
| "step": 65200 |
| }, |
| { |
| "epoch": 19.001164890209097, |
| "grad_norm": 0.3765634000301361, |
| "learning_rate": 0.00037220629370629364, |
| "loss": 3.2695, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.01572601782282, |
| "grad_norm": 0.3949611485004425, |
| "learning_rate": 0.00037203146853146855, |
| "loss": 3.1778, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.030287145436542, |
| "grad_norm": 0.38870665431022644, |
| "learning_rate": 0.0003718566433566433, |
| "loss": 3.169, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.044848273050263, |
| "grad_norm": 0.38133251667022705, |
| "learning_rate": 0.00037168181818181815, |
| "loss": 3.1779, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.059409400663988, |
| "grad_norm": 0.4323780834674835, |
| "learning_rate": 0.00037150699300699295, |
| "loss": 3.1673, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.07397052827771, |
| "grad_norm": 0.42456677556037903, |
| "learning_rate": 0.0003713321678321678, |
| "loss": 3.1778, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.088531655891433, |
| "grad_norm": 0.3735314905643463, |
| "learning_rate": 0.0003711573426573426, |
| "loss": 3.1858, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.103092783505154, |
| "grad_norm": 0.3831532895565033, |
| "learning_rate": 0.00037098251748251746, |
| "loss": 3.1934, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.11765391111888, |
| "grad_norm": 0.40036728978157043, |
| "learning_rate": 0.00037080769230769226, |
| "loss": 3.1794, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.1322150387326, |
| "grad_norm": 0.4043117165565491, |
| "learning_rate": 0.0003706328671328671, |
| "loss": 3.1998, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.14677616634632, |
| "grad_norm": 0.38884180784225464, |
| "learning_rate": 0.0003704580419580419, |
| "loss": 3.2065, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.161337293960045, |
| "grad_norm": 0.4088717997074127, |
| "learning_rate": 0.00037028321678321676, |
| "loss": 3.1955, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.175898421573766, |
| "grad_norm": 0.37341269850730896, |
| "learning_rate": 0.0003701083916083916, |
| "loss": 3.2076, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.19045954918749, |
| "grad_norm": 0.38941335678100586, |
| "learning_rate": 0.0003699335664335664, |
| "loss": 3.2054, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.20502067680121, |
| "grad_norm": 0.3908570408821106, |
| "learning_rate": 0.00036975874125874127, |
| "loss": 3.2164, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.219581804414933, |
| "grad_norm": 0.3603554368019104, |
| "learning_rate": 0.00036958391608391607, |
| "loss": 3.2112, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.219581804414933, |
| "eval_accuracy": 0.3726382072618839, |
| "eval_loss": 3.5502724647521973, |
| "eval_runtime": 81.8907, |
| "eval_samples_per_second": 203.21, |
| "eval_steps_per_second": 12.712, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.234142932028657, |
| "grad_norm": 0.3899898827075958, |
| "learning_rate": 0.0003694090909090909, |
| "loss": 3.2144, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.248704059642378, |
| "grad_norm": 0.3943706154823303, |
| "learning_rate": 0.00036923426573426567, |
| "loss": 3.2055, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.263265187256103, |
| "grad_norm": 0.3804349899291992, |
| "learning_rate": 0.0003690594405594405, |
| "loss": 3.2178, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.277826314869824, |
| "grad_norm": 0.3607994318008423, |
| "learning_rate": 0.0003688846153846153, |
| "loss": 3.2206, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.292387442483545, |
| "grad_norm": 0.40597769618034363, |
| "learning_rate": 0.0003687097902097902, |
| "loss": 3.2199, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.30694857009727, |
| "grad_norm": 0.4105750024318695, |
| "learning_rate": 0.000368534965034965, |
| "loss": 3.2246, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.32150969771099, |
| "grad_norm": 0.38372939825057983, |
| "learning_rate": 0.00036836013986013983, |
| "loss": 3.2155, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.336070825324715, |
| "grad_norm": 0.38922742009162903, |
| "learning_rate": 0.00036818531468531463, |
| "loss": 3.2059, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.350631952938436, |
| "grad_norm": 0.42378517985343933, |
| "learning_rate": 0.0003680104895104895, |
| "loss": 3.2266, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.365193080552157, |
| "grad_norm": 0.38724464178085327, |
| "learning_rate": 0.0003678356643356643, |
| "loss": 3.2188, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.37975420816588, |
| "grad_norm": 0.37156447768211365, |
| "learning_rate": 0.00036766083916083914, |
| "loss": 3.2341, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.394315335779602, |
| "grad_norm": 0.4274117052555084, |
| "learning_rate": 0.000367486013986014, |
| "loss": 3.2195, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.408876463393327, |
| "grad_norm": 0.40793415904045105, |
| "learning_rate": 0.0003673111888111888, |
| "loss": 3.2233, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.423437591007048, |
| "grad_norm": 0.38283291459083557, |
| "learning_rate": 0.00036713636363636365, |
| "loss": 3.2166, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.43799871862077, |
| "grad_norm": 0.40094292163848877, |
| "learning_rate": 0.00036696153846153844, |
| "loss": 3.223, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.452559846234493, |
| "grad_norm": 0.40695202350616455, |
| "learning_rate": 0.0003667867132867133, |
| "loss": 3.2246, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.467120973848214, |
| "grad_norm": 0.3906048536300659, |
| "learning_rate": 0.00036661188811188804, |
| "loss": 3.2403, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.48168210146194, |
| "grad_norm": 0.3866629898548126, |
| "learning_rate": 0.0003664370629370629, |
| "loss": 3.2231, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.49624322907566, |
| "grad_norm": 0.38951748609542847, |
| "learning_rate": 0.0003662622377622377, |
| "loss": 3.2448, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.51080435668938, |
| "grad_norm": 0.4151657521724701, |
| "learning_rate": 0.00036608741258741255, |
| "loss": 3.2412, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.51080435668938, |
| "eval_accuracy": 0.37298100522946237, |
| "eval_loss": 3.546610116958618, |
| "eval_runtime": 82.2055, |
| "eval_samples_per_second": 202.432, |
| "eval_steps_per_second": 12.663, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.525365484303105, |
| "grad_norm": 0.38904881477355957, |
| "learning_rate": 0.00036591258741258735, |
| "loss": 3.2419, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.539926611916826, |
| "grad_norm": 0.42368486523628235, |
| "learning_rate": 0.0003657377622377622, |
| "loss": 3.2344, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.55448773953055, |
| "grad_norm": 0.3994165062904358, |
| "learning_rate": 0.000365562937062937, |
| "loss": 3.2385, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.56904886714427, |
| "grad_norm": 0.38828790187835693, |
| "learning_rate": 0.00036538811188811186, |
| "loss": 3.2367, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.583609994757992, |
| "grad_norm": 0.4017794728279114, |
| "learning_rate": 0.0003652132867132867, |
| "loss": 3.2342, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.598171122371717, |
| "grad_norm": 0.3893563449382782, |
| "learning_rate": 0.0003650384615384615, |
| "loss": 3.2517, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.612732249985438, |
| "grad_norm": 0.3365978002548218, |
| "learning_rate": 0.00036486363636363637, |
| "loss": 3.245, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.627293377599162, |
| "grad_norm": 0.4130851924419403, |
| "learning_rate": 0.00036468881118881117, |
| "loss": 3.2463, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.641854505212883, |
| "grad_norm": 0.38315796852111816, |
| "learning_rate": 0.000364513986013986, |
| "loss": 3.2418, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.656415632826604, |
| "grad_norm": 0.4083530604839325, |
| "learning_rate": 0.0003643391608391608, |
| "loss": 3.2553, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.67097676044033, |
| "grad_norm": 0.3799106180667877, |
| "learning_rate": 0.0003641643356643357, |
| "loss": 3.2419, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.68553788805405, |
| "grad_norm": 0.39325082302093506, |
| "learning_rate": 0.0003639895104895104, |
| "loss": 3.2552, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.700099015667774, |
| "grad_norm": 0.37322697043418884, |
| "learning_rate": 0.0003638146853146853, |
| "loss": 3.2428, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.714660143281495, |
| "grad_norm": 0.3775913417339325, |
| "learning_rate": 0.00036363986013986007, |
| "loss": 3.2405, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.729221270895216, |
| "grad_norm": 0.38930535316467285, |
| "learning_rate": 0.0003634650349650349, |
| "loss": 3.2561, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.74378239850894, |
| "grad_norm": 0.38972270488739014, |
| "learning_rate": 0.0003632902097902097, |
| "loss": 3.2477, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.758343526122662, |
| "grad_norm": 0.3653428554534912, |
| "learning_rate": 0.0003631153846153846, |
| "loss": 3.2572, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.772904653736386, |
| "grad_norm": 0.39451688528060913, |
| "learning_rate": 0.00036294055944055943, |
| "loss": 3.2492, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.787465781350107, |
| "grad_norm": 0.38252776861190796, |
| "learning_rate": 0.00036276573426573423, |
| "loss": 3.2582, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.802026908963832, |
| "grad_norm": 0.39216548204421997, |
| "learning_rate": 0.0003625909090909091, |
| "loss": 3.248, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.802026908963832, |
| "eval_accuracy": 0.3735597046457415, |
| "eval_loss": 3.537071704864502, |
| "eval_runtime": 82.1823, |
| "eval_samples_per_second": 202.489, |
| "eval_steps_per_second": 12.667, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.816588036577553, |
| "grad_norm": 0.37432602047920227, |
| "learning_rate": 0.0003624160839160839, |
| "loss": 3.2559, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.831149164191274, |
| "grad_norm": 0.42247503995895386, |
| "learning_rate": 0.00036224125874125874, |
| "loss": 3.2516, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.845710291805, |
| "grad_norm": 0.4186394214630127, |
| "learning_rate": 0.00036206643356643354, |
| "loss": 3.2691, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.86027141941872, |
| "grad_norm": 0.3878350853919983, |
| "learning_rate": 0.0003618916083916084, |
| "loss": 3.255, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.874832547032444, |
| "grad_norm": 0.39463165402412415, |
| "learning_rate": 0.0003617167832167832, |
| "loss": 3.2587, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.889393674646165, |
| "grad_norm": 0.394517183303833, |
| "learning_rate": 0.00036154195804195805, |
| "loss": 3.2574, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.903954802259886, |
| "grad_norm": 0.3798947036266327, |
| "learning_rate": 0.0003613671328671328, |
| "loss": 3.2634, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.91851592987361, |
| "grad_norm": 0.4175240993499756, |
| "learning_rate": 0.00036119230769230765, |
| "loss": 3.2617, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.93307705748733, |
| "grad_norm": 0.3988494277000427, |
| "learning_rate": 0.00036101748251748245, |
| "loss": 3.2605, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.947638185101056, |
| "grad_norm": 0.37991979718208313, |
| "learning_rate": 0.0003608426573426573, |
| "loss": 3.2638, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.962199312714777, |
| "grad_norm": 0.39357224106788635, |
| "learning_rate": 0.0003606678321678321, |
| "loss": 3.2602, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.976760440328498, |
| "grad_norm": 0.38228750228881836, |
| "learning_rate": 0.00036049300699300696, |
| "loss": 3.2587, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.991321567942222, |
| "grad_norm": 0.36185112595558167, |
| "learning_rate": 0.0003603181818181818, |
| "loss": 3.2704, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.005824451045488, |
| "grad_norm": 0.4131271541118622, |
| "learning_rate": 0.0003601433566433566, |
| "loss": 3.2211, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.020385578659212, |
| "grad_norm": 0.4005662202835083, |
| "learning_rate": 0.00035996853146853146, |
| "loss": 3.1531, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.034946706272933, |
| "grad_norm": 0.38565582036972046, |
| "learning_rate": 0.00035979370629370626, |
| "loss": 3.1623, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.049507833886658, |
| "grad_norm": 0.40073373913764954, |
| "learning_rate": 0.0003596188811188811, |
| "loss": 3.1666, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.06406896150038, |
| "grad_norm": 0.38596728444099426, |
| "learning_rate": 0.0003594440559440559, |
| "loss": 3.1636, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.0786300891141, |
| "grad_norm": 0.3958479166030884, |
| "learning_rate": 0.00035926923076923077, |
| "loss": 3.1705, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.093191216727824, |
| "grad_norm": 0.394986093044281, |
| "learning_rate": 0.00035909440559440557, |
| "loss": 3.1563, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.093191216727824, |
| "eval_accuracy": 0.37273898868837263, |
| "eval_loss": 3.555129289627075, |
| "eval_runtime": 82.1235, |
| "eval_samples_per_second": 202.634, |
| "eval_steps_per_second": 12.676, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.107752344341545, |
| "grad_norm": 0.38096439838409424, |
| "learning_rate": 0.0003589195804195804, |
| "loss": 3.1657, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.12231347195527, |
| "grad_norm": 0.3882683515548706, |
| "learning_rate": 0.00035874475524475517, |
| "loss": 3.1865, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.13687459956899, |
| "grad_norm": 0.39912328124046326, |
| "learning_rate": 0.00035856993006993, |
| "loss": 3.1869, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.15143572718271, |
| "grad_norm": 0.38343095779418945, |
| "learning_rate": 0.0003583951048951048, |
| "loss": 3.187, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.165996854796436, |
| "grad_norm": 0.3823148012161255, |
| "learning_rate": 0.0003582202797202797, |
| "loss": 3.1909, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.180557982410157, |
| "grad_norm": 0.42335066199302673, |
| "learning_rate": 0.00035804545454545453, |
| "loss": 3.1919, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.19511911002388, |
| "grad_norm": 0.3876207172870636, |
| "learning_rate": 0.00035787062937062933, |
| "loss": 3.1904, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.209680237637603, |
| "grad_norm": 0.43065130710601807, |
| "learning_rate": 0.0003576958041958042, |
| "loss": 3.1921, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.224241365251324, |
| "grad_norm": 0.4058898389339447, |
| "learning_rate": 0.000357520979020979, |
| "loss": 3.2036, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.238802492865048, |
| "grad_norm": 0.3874876797199249, |
| "learning_rate": 0.00035734615384615384, |
| "loss": 3.2017, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.25336362047877, |
| "grad_norm": 0.4190688729286194, |
| "learning_rate": 0.00035717132867132864, |
| "loss": 3.2045, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.267924748092494, |
| "grad_norm": 0.42875048518180847, |
| "learning_rate": 0.0003569965034965035, |
| "loss": 3.2117, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.282485875706215, |
| "grad_norm": 0.3930942118167877, |
| "learning_rate": 0.0003568216783216783, |
| "loss": 3.2069, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.297047003319935, |
| "grad_norm": 0.3895373046398163, |
| "learning_rate": 0.00035664685314685314, |
| "loss": 3.2059, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.31160813093366, |
| "grad_norm": 0.40520724654197693, |
| "learning_rate": 0.00035647202797202794, |
| "loss": 3.2213, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.32616925854738, |
| "grad_norm": 0.3832961618900299, |
| "learning_rate": 0.0003562972027972028, |
| "loss": 3.2112, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.340730386161106, |
| "grad_norm": 0.392443984746933, |
| "learning_rate": 0.00035612237762237754, |
| "loss": 3.2145, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.355291513774826, |
| "grad_norm": 0.4083760380744934, |
| "learning_rate": 0.0003559475524475524, |
| "loss": 3.2075, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.369852641388547, |
| "grad_norm": 0.42120611667633057, |
| "learning_rate": 0.0003557727272727272, |
| "loss": 3.205, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.384413769002272, |
| "grad_norm": 0.42366138100624084, |
| "learning_rate": 0.00035559790209790205, |
| "loss": 3.217, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.384413769002272, |
| "eval_accuracy": 0.37319291670032906, |
| "eval_loss": 3.5472211837768555, |
| "eval_runtime": 81.9766, |
| "eval_samples_per_second": 202.997, |
| "eval_steps_per_second": 12.699, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.398974896615993, |
| "grad_norm": 0.4008452594280243, |
| "learning_rate": 0.0003554230769230769, |
| "loss": 3.2203, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.413536024229717, |
| "grad_norm": 0.3865274488925934, |
| "learning_rate": 0.0003552482517482517, |
| "loss": 3.2259, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.42809715184344, |
| "grad_norm": 0.38302117586135864, |
| "learning_rate": 0.00035507342657342656, |
| "loss": 3.2301, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.442658279457163, |
| "grad_norm": 0.35692399740219116, |
| "learning_rate": 0.00035489860139860136, |
| "loss": 3.2153, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.457219407070884, |
| "grad_norm": 0.41295337677001953, |
| "learning_rate": 0.0003547237762237762, |
| "loss": 3.2252, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.471780534684605, |
| "grad_norm": 0.3778761327266693, |
| "learning_rate": 0.000354548951048951, |
| "loss": 3.2271, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.48634166229833, |
| "grad_norm": 0.41152307391166687, |
| "learning_rate": 0.00035437412587412587, |
| "loss": 3.2124, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.50090278991205, |
| "grad_norm": 0.3975955545902252, |
| "learning_rate": 0.00035419930069930067, |
| "loss": 3.2289, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.51546391752577, |
| "grad_norm": 0.39379915595054626, |
| "learning_rate": 0.0003540244755244755, |
| "loss": 3.2188, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.530025045139496, |
| "grad_norm": 0.37985914945602417, |
| "learning_rate": 0.0003538496503496503, |
| "loss": 3.2352, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.544586172753217, |
| "grad_norm": 0.4027496874332428, |
| "learning_rate": 0.0003536748251748252, |
| "loss": 3.2292, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.55914730036694, |
| "grad_norm": 0.3890494108200073, |
| "learning_rate": 0.0003534999999999999, |
| "loss": 3.2242, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.573708427980662, |
| "grad_norm": 0.384518027305603, |
| "learning_rate": 0.00035332517482517477, |
| "loss": 3.2323, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.588269555594387, |
| "grad_norm": 0.42106884717941284, |
| "learning_rate": 0.0003531503496503496, |
| "loss": 3.2398, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.602830683208108, |
| "grad_norm": 0.39280247688293457, |
| "learning_rate": 0.0003529755244755244, |
| "loss": 3.227, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.61739181082183, |
| "grad_norm": 0.3934417963027954, |
| "learning_rate": 0.0003528006993006993, |
| "loss": 3.239, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.631952938435553, |
| "grad_norm": 0.3855319619178772, |
| "learning_rate": 0.0003526258741258741, |
| "loss": 3.2263, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.646514066049274, |
| "grad_norm": 0.38433825969696045, |
| "learning_rate": 0.00035245104895104893, |
| "loss": 3.2336, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.661075193663, |
| "grad_norm": 0.4200402498245239, |
| "learning_rate": 0.00035227622377622373, |
| "loss": 3.2335, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.67563632127672, |
| "grad_norm": 0.41327399015426636, |
| "learning_rate": 0.0003521013986013986, |
| "loss": 3.2238, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.67563632127672, |
| "eval_accuracy": 0.37333438701079114, |
| "eval_loss": 3.5425360202789307, |
| "eval_runtime": 81.9492, |
| "eval_samples_per_second": 203.065, |
| "eval_steps_per_second": 12.703, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.69019744889044, |
| "grad_norm": 0.40937313437461853, |
| "learning_rate": 0.0003519265734265734, |
| "loss": 3.2354, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.704758576504165, |
| "grad_norm": 0.3639281988143921, |
| "learning_rate": 0.00035175174825174824, |
| "loss": 3.2467, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.719319704117886, |
| "grad_norm": 0.42520174384117126, |
| "learning_rate": 0.00035157692307692304, |
| "loss": 3.2378, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.73388083173161, |
| "grad_norm": 0.3757728040218353, |
| "learning_rate": 0.0003514020979020979, |
| "loss": 3.235, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.74844195934533, |
| "grad_norm": 0.4070497751235962, |
| "learning_rate": 0.0003512272727272727, |
| "loss": 3.2478, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.763003086959053, |
| "grad_norm": 0.4231252074241638, |
| "learning_rate": 0.00035105244755244755, |
| "loss": 3.2431, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.777564214572777, |
| "grad_norm": 0.3944244980812073, |
| "learning_rate": 0.0003508776223776223, |
| "loss": 3.2403, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.792125342186498, |
| "grad_norm": 0.43236541748046875, |
| "learning_rate": 0.00035070279720279715, |
| "loss": 3.2442, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.806686469800223, |
| "grad_norm": 0.3983427882194519, |
| "learning_rate": 0.000350527972027972, |
| "loss": 3.2505, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.821247597413944, |
| "grad_norm": 0.3704656660556793, |
| "learning_rate": 0.0003503531468531468, |
| "loss": 3.2505, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.835808725027665, |
| "grad_norm": 0.37858283519744873, |
| "learning_rate": 0.00035017832167832166, |
| "loss": 3.2495, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.85036985264139, |
| "grad_norm": 0.40734684467315674, |
| "learning_rate": 0.00035000349650349645, |
| "loss": 3.2405, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.86493098025511, |
| "grad_norm": 0.41431188583374023, |
| "learning_rate": 0.0003498286713286713, |
| "loss": 3.2388, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.879492107868835, |
| "grad_norm": 0.4164574146270752, |
| "learning_rate": 0.0003496538461538461, |
| "loss": 3.247, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.894053235482556, |
| "grad_norm": 0.38683512806892395, |
| "learning_rate": 0.00034947902097902096, |
| "loss": 3.2532, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.908614363096277, |
| "grad_norm": 0.39211592078208923, |
| "learning_rate": 0.00034930419580419576, |
| "loss": 3.2524, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.92317549071, |
| "grad_norm": 0.3883165717124939, |
| "learning_rate": 0.0003491293706293706, |
| "loss": 3.2447, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.937736618323722, |
| "grad_norm": 0.3908028304576874, |
| "learning_rate": 0.0003489545454545454, |
| "loss": 3.2677, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.952297745937447, |
| "grad_norm": 0.36733561754226685, |
| "learning_rate": 0.00034877972027972027, |
| "loss": 3.265, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.966858873551168, |
| "grad_norm": 0.3770594894886017, |
| "learning_rate": 0.00034860489510489507, |
| "loss": 3.2515, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.966858873551168, |
| "eval_accuracy": 0.37392943253941796, |
| "eval_loss": 3.5328328609466553, |
| "eval_runtime": 82.054, |
| "eval_samples_per_second": 202.805, |
| "eval_steps_per_second": 12.687, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.98142000116489, |
| "grad_norm": 0.37822750210762024, |
| "learning_rate": 0.0003484300699300699, |
| "loss": 3.235, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.995981128778613, |
| "grad_norm": 0.38507455587387085, |
| "learning_rate": 0.0003482552447552448, |
| "loss": 3.2536, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.01048401188188, |
| "grad_norm": 0.4130054712295532, |
| "learning_rate": 0.0003480804195804195, |
| "loss": 3.1723, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.025045139495603, |
| "grad_norm": 0.39601749181747437, |
| "learning_rate": 0.0003479055944055944, |
| "loss": 3.146, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.039606267109324, |
| "grad_norm": 0.40298110246658325, |
| "learning_rate": 0.0003477307692307692, |
| "loss": 3.1515, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.05416739472305, |
| "grad_norm": 0.400453120470047, |
| "learning_rate": 0.00034755594405594403, |
| "loss": 3.1532, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.06872852233677, |
| "grad_norm": 0.3756105601787567, |
| "learning_rate": 0.00034738111888111883, |
| "loss": 3.1586, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.08328964995049, |
| "grad_norm": 0.3838154673576355, |
| "learning_rate": 0.0003472062937062937, |
| "loss": 3.1711, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.097850777564215, |
| "grad_norm": 0.40945592522621155, |
| "learning_rate": 0.0003470314685314685, |
| "loss": 3.1568, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.112411905177936, |
| "grad_norm": 0.39956948161125183, |
| "learning_rate": 0.00034685664335664334, |
| "loss": 3.1728, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.12697303279166, |
| "grad_norm": 0.3876982629299164, |
| "learning_rate": 0.00034668181818181814, |
| "loss": 3.1706, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.14153416040538, |
| "grad_norm": 0.42792001366615295, |
| "learning_rate": 0.000346506993006993, |
| "loss": 3.1699, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.156095288019102, |
| "grad_norm": 0.40332284569740295, |
| "learning_rate": 0.0003463321678321678, |
| "loss": 3.1749, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.170656415632827, |
| "grad_norm": 0.4165891408920288, |
| "learning_rate": 0.00034615734265734264, |
| "loss": 3.1884, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.185217543246548, |
| "grad_norm": 0.39421212673187256, |
| "learning_rate": 0.0003459825174825175, |
| "loss": 3.1877, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.199778670860272, |
| "grad_norm": 0.43057945370674133, |
| "learning_rate": 0.0003458076923076923, |
| "loss": 3.1814, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.214339798473993, |
| "grad_norm": 0.4369525611400604, |
| "learning_rate": 0.00034563286713286715, |
| "loss": 3.1821, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.228900926087718, |
| "grad_norm": 0.4119269847869873, |
| "learning_rate": 0.0003454580419580419, |
| "loss": 3.1993, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.24346205370144, |
| "grad_norm": 0.40221941471099854, |
| "learning_rate": 0.00034528321678321675, |
| "loss": 3.1933, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.25802318131516, |
| "grad_norm": 0.41838139295578003, |
| "learning_rate": 0.00034510839160839155, |
| "loss": 3.195, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.25802318131516, |
| "eval_accuracy": 0.3728074306839578, |
| "eval_loss": 3.5512492656707764, |
| "eval_runtime": 81.8981, |
| "eval_samples_per_second": 203.192, |
| "eval_steps_per_second": 12.711, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.272584308928884, |
| "grad_norm": 0.37707242369651794, |
| "learning_rate": 0.0003449335664335664, |
| "loss": 3.201, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.287145436542605, |
| "grad_norm": 0.3755451738834381, |
| "learning_rate": 0.0003447587412587412, |
| "loss": 3.203, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.30170656415633, |
| "grad_norm": 0.40628618001937866, |
| "learning_rate": 0.00034458391608391606, |
| "loss": 3.1881, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.31626769177005, |
| "grad_norm": 0.39285871386528015, |
| "learning_rate": 0.00034440909090909086, |
| "loss": 3.1981, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.330828819383772, |
| "grad_norm": 0.4157426357269287, |
| "learning_rate": 0.0003442342657342657, |
| "loss": 3.2024, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.345389946997496, |
| "grad_norm": 0.37565159797668457, |
| "learning_rate": 0.0003440594405594405, |
| "loss": 3.1987, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.359951074611217, |
| "grad_norm": 0.3829394280910492, |
| "learning_rate": 0.00034388461538461537, |
| "loss": 3.1961, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.374512202224942, |
| "grad_norm": 0.41310203075408936, |
| "learning_rate": 0.00034370979020979017, |
| "loss": 3.2053, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.389073329838663, |
| "grad_norm": 0.40526509284973145, |
| "learning_rate": 0.000343534965034965, |
| "loss": 3.2053, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.403634457452384, |
| "grad_norm": 0.41370317339897156, |
| "learning_rate": 0.0003433601398601399, |
| "loss": 3.2133, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.41819558506611, |
| "grad_norm": 0.4183395504951477, |
| "learning_rate": 0.0003431853146853147, |
| "loss": 3.2084, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.43275671267983, |
| "grad_norm": 0.398794561624527, |
| "learning_rate": 0.0003430104895104895, |
| "loss": 3.1943, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.447317840293554, |
| "grad_norm": 0.39731237292289734, |
| "learning_rate": 0.00034283566433566427, |
| "loss": 3.2119, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.461878967907275, |
| "grad_norm": 0.37555816769599915, |
| "learning_rate": 0.0003426608391608391, |
| "loss": 3.2172, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.476440095520996, |
| "grad_norm": 0.4385387897491455, |
| "learning_rate": 0.0003424860139860139, |
| "loss": 3.1985, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.49100122313472, |
| "grad_norm": 0.39294108748435974, |
| "learning_rate": 0.0003423111888111888, |
| "loss": 3.2277, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.50556235074844, |
| "grad_norm": 0.3976404368877411, |
| "learning_rate": 0.0003421363636363636, |
| "loss": 3.2161, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.520123478362166, |
| "grad_norm": 0.42642197012901306, |
| "learning_rate": 0.00034196153846153843, |
| "loss": 3.2137, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.534684605975887, |
| "grad_norm": 0.39419153332710266, |
| "learning_rate": 0.00034178671328671323, |
| "loss": 3.2102, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.549245733589608, |
| "grad_norm": 0.41290178894996643, |
| "learning_rate": 0.0003416118881118881, |
| "loss": 3.2048, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.549245733589608, |
| "eval_accuracy": 0.37326947295312274, |
| "eval_loss": 3.5448601245880127, |
| "eval_runtime": 81.9523, |
| "eval_samples_per_second": 203.057, |
| "eval_steps_per_second": 12.703, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.563806861203332, |
| "grad_norm": 0.3989127576351166, |
| "learning_rate": 0.0003414370629370629, |
| "loss": 3.2246, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.578367988817053, |
| "grad_norm": 0.43993303179740906, |
| "learning_rate": 0.00034126223776223774, |
| "loss": 3.2238, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.592929116430778, |
| "grad_norm": 0.39450588822364807, |
| "learning_rate": 0.0003410874125874126, |
| "loss": 3.2338, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.6074902440445, |
| "grad_norm": 0.39482542872428894, |
| "learning_rate": 0.0003409125874125874, |
| "loss": 3.2235, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.62205137165822, |
| "grad_norm": 0.41980329155921936, |
| "learning_rate": 0.00034073776223776225, |
| "loss": 3.2243, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.636612499271944, |
| "grad_norm": 0.3849197030067444, |
| "learning_rate": 0.00034056293706293705, |
| "loss": 3.2337, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.651173626885665, |
| "grad_norm": 0.4282234013080597, |
| "learning_rate": 0.0003403881118881119, |
| "loss": 3.2219, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.66573475449939, |
| "grad_norm": 0.4085162878036499, |
| "learning_rate": 0.00034021328671328665, |
| "loss": 3.2302, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.68029588211311, |
| "grad_norm": 0.41765034198760986, |
| "learning_rate": 0.0003400384615384615, |
| "loss": 3.2177, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.69485700972683, |
| "grad_norm": 0.3892502188682556, |
| "learning_rate": 0.0003398636363636363, |
| "loss": 3.2193, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.709418137340556, |
| "grad_norm": 0.43037962913513184, |
| "learning_rate": 0.00033968881118881115, |
| "loss": 3.2256, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.723979264954277, |
| "grad_norm": 0.40652963519096375, |
| "learning_rate": 0.00033951398601398595, |
| "loss": 3.2315, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.738540392568, |
| "grad_norm": 0.4284264147281647, |
| "learning_rate": 0.0003393391608391608, |
| "loss": 3.229, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.753101520181723, |
| "grad_norm": 0.3994840383529663, |
| "learning_rate": 0.0003391643356643356, |
| "loss": 3.2268, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.767662647795444, |
| "grad_norm": 0.39457011222839355, |
| "learning_rate": 0.00033898951048951046, |
| "loss": 3.2286, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.782223775409168, |
| "grad_norm": 0.4335285723209381, |
| "learning_rate": 0.00033881468531468526, |
| "loss": 3.2321, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.79678490302289, |
| "grad_norm": 0.40969493985176086, |
| "learning_rate": 0.0003386398601398601, |
| "loss": 3.2372, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.811346030636614, |
| "grad_norm": 0.4298085868358612, |
| "learning_rate": 0.00033846503496503497, |
| "loss": 3.2518, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.825907158250335, |
| "grad_norm": 0.3903745114803314, |
| "learning_rate": 0.00033829020979020977, |
| "loss": 3.2443, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.840468285864056, |
| "grad_norm": 0.41702258586883545, |
| "learning_rate": 0.0003381153846153846, |
| "loss": 3.2347, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.840468285864056, |
| "eval_accuracy": 0.3741654515860492, |
| "eval_loss": 3.534738063812256, |
| "eval_runtime": 82.0602, |
| "eval_samples_per_second": 202.79, |
| "eval_steps_per_second": 12.686, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.85502941347778, |
| "grad_norm": 0.4226226806640625, |
| "learning_rate": 0.0003379405594405594, |
| "loss": 3.2417, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.8695905410915, |
| "grad_norm": 0.4108355641365051, |
| "learning_rate": 0.0003377657342657343, |
| "loss": 3.2364, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.884151668705226, |
| "grad_norm": 0.40037399530410767, |
| "learning_rate": 0.000337590909090909, |
| "loss": 3.2308, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.898712796318947, |
| "grad_norm": 0.4118042290210724, |
| "learning_rate": 0.0003374160839160839, |
| "loss": 3.2279, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.91327392393267, |
| "grad_norm": 0.40740594267845154, |
| "learning_rate": 0.0003372412587412587, |
| "loss": 3.2364, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.927835051546392, |
| "grad_norm": 0.4201134443283081, |
| "learning_rate": 0.00033706643356643353, |
| "loss": 3.2436, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.942396179160113, |
| "grad_norm": 0.3596144914627075, |
| "learning_rate": 0.00033689160839160833, |
| "loss": 3.2335, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.956957306773838, |
| "grad_norm": 0.4211823344230652, |
| "learning_rate": 0.0003367167832167832, |
| "loss": 3.2349, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.97151843438756, |
| "grad_norm": 0.4012598693370819, |
| "learning_rate": 0.000336541958041958, |
| "loss": 3.2405, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.986079562001283, |
| "grad_norm": 0.38613030314445496, |
| "learning_rate": 0.00033636713286713284, |
| "loss": 3.2385, |
| "step": 75500 |
| }, |
| { |
| "epoch": 22.00058244510455, |
| "grad_norm": 0.41727739572525024, |
| "learning_rate": 0.0003361923076923077, |
| "loss": 3.2546, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.015143572718273, |
| "grad_norm": 0.3983735144138336, |
| "learning_rate": 0.0003360174825174825, |
| "loss": 3.1467, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.029704700331994, |
| "grad_norm": 0.4194571077823639, |
| "learning_rate": 0.00033584265734265734, |
| "loss": 3.144, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.044265827945715, |
| "grad_norm": 0.3799212574958801, |
| "learning_rate": 0.00033566783216783214, |
| "loss": 3.1377, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.05882695555944, |
| "grad_norm": 0.3902527689933777, |
| "learning_rate": 0.000335493006993007, |
| "loss": 3.1468, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.07338808317316, |
| "grad_norm": 0.40930628776550293, |
| "learning_rate": 0.0003353181818181818, |
| "loss": 3.1543, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.087949210786885, |
| "grad_norm": 0.42795684933662415, |
| "learning_rate": 0.00033514335664335665, |
| "loss": 3.1572, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.102510338400606, |
| "grad_norm": 0.41338643431663513, |
| "learning_rate": 0.0003349685314685314, |
| "loss": 3.1563, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.117071466014327, |
| "grad_norm": 0.38442519307136536, |
| "learning_rate": 0.00033479370629370625, |
| "loss": 3.162, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.13163259362805, |
| "grad_norm": 0.3954649865627289, |
| "learning_rate": 0.00033461888111888105, |
| "loss": 3.1672, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.13163259362805, |
| "eval_accuracy": 0.37297430214742056, |
| "eval_loss": 3.5518293380737305, |
| "eval_runtime": 82.1642, |
| "eval_samples_per_second": 202.533, |
| "eval_steps_per_second": 12.67, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.146193721241772, |
| "grad_norm": 0.40157610177993774, |
| "learning_rate": 0.0003344440559440559, |
| "loss": 3.1606, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.160754848855497, |
| "grad_norm": 0.4080871641635895, |
| "learning_rate": 0.0003342692307692307, |
| "loss": 3.1719, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.175315976469218, |
| "grad_norm": 0.41374674439430237, |
| "learning_rate": 0.00033409440559440556, |
| "loss": 3.1747, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.18987710408294, |
| "grad_norm": 0.4247576594352722, |
| "learning_rate": 0.00033391958041958036, |
| "loss": 3.1719, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.204438231696663, |
| "grad_norm": 0.40080681443214417, |
| "learning_rate": 0.0003337447552447552, |
| "loss": 3.174, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.218999359310384, |
| "grad_norm": 0.3779149353504181, |
| "learning_rate": 0.00033356993006993007, |
| "loss": 3.1824, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.23356048692411, |
| "grad_norm": 0.4396825134754181, |
| "learning_rate": 0.00033339510489510487, |
| "loss": 3.1711, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.24812161453783, |
| "grad_norm": 0.3883700966835022, |
| "learning_rate": 0.0003332202797202797, |
| "loss": 3.1869, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.26268274215155, |
| "grad_norm": 0.4062824249267578, |
| "learning_rate": 0.0003330454545454545, |
| "loss": 3.1961, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.277243869765275, |
| "grad_norm": 0.40531423687934875, |
| "learning_rate": 0.0003328706293706294, |
| "loss": 3.1708, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.291804997378996, |
| "grad_norm": 0.4267043173313141, |
| "learning_rate": 0.00033269580419580417, |
| "loss": 3.1883, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.30636612499272, |
| "grad_norm": 0.4165393114089966, |
| "learning_rate": 0.000332520979020979, |
| "loss": 3.1922, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.32092725260644, |
| "grad_norm": 0.4269765615463257, |
| "learning_rate": 0.00033234615384615377, |
| "loss": 3.1893, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.335488380220163, |
| "grad_norm": 0.6468771696090698, |
| "learning_rate": 0.0003321713286713286, |
| "loss": 3.2057, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.350049507833887, |
| "grad_norm": 0.4627307951450348, |
| "learning_rate": 0.0003319965034965034, |
| "loss": 3.1751, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.364610635447608, |
| "grad_norm": 0.4320053458213806, |
| "learning_rate": 0.0003318216783216783, |
| "loss": 3.1786, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.379171763061333, |
| "grad_norm": 0.40447214245796204, |
| "learning_rate": 0.0003316468531468531, |
| "loss": 3.2027, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.393732890675054, |
| "grad_norm": 0.421953409910202, |
| "learning_rate": 0.00033147202797202793, |
| "loss": 3.1886, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.408294018288775, |
| "grad_norm": 0.3994221091270447, |
| "learning_rate": 0.0003312972027972028, |
| "loss": 3.196, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.4228551459025, |
| "grad_norm": 0.4102366268634796, |
| "learning_rate": 0.0003311223776223776, |
| "loss": 3.1982, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.4228551459025, |
| "eval_accuracy": 0.37345551287926654, |
| "eval_loss": 3.547607898712158, |
| "eval_runtime": 81.9012, |
| "eval_samples_per_second": 203.184, |
| "eval_steps_per_second": 12.71, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.43741627351622, |
| "grad_norm": 0.40204089879989624, |
| "learning_rate": 0.00033094755244755244, |
| "loss": 3.1945, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.451977401129945, |
| "grad_norm": 0.4352427124977112, |
| "learning_rate": 0.00033077272727272724, |
| "loss": 3.2039, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.466538528743666, |
| "grad_norm": 0.42460089921951294, |
| "learning_rate": 0.0003305979020979021, |
| "loss": 3.206, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.481099656357387, |
| "grad_norm": 0.41234180331230164, |
| "learning_rate": 0.0003304230769230769, |
| "loss": 3.2102, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.49566078397111, |
| "grad_norm": 0.40233999490737915, |
| "learning_rate": 0.00033024825174825175, |
| "loss": 3.1914, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.510221911584832, |
| "grad_norm": 0.4162119925022125, |
| "learning_rate": 0.00033007342657342655, |
| "loss": 3.2146, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.524783039198557, |
| "grad_norm": 0.41718634963035583, |
| "learning_rate": 0.0003298986013986014, |
| "loss": 3.1961, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.539344166812278, |
| "grad_norm": 0.40040820837020874, |
| "learning_rate": 0.00032972377622377615, |
| "loss": 3.205, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.553905294426002, |
| "grad_norm": 0.41082045435905457, |
| "learning_rate": 0.000329548951048951, |
| "loss": 3.205, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.568466422039723, |
| "grad_norm": 0.40234094858169556, |
| "learning_rate": 0.0003293741258741258, |
| "loss": 3.2129, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.583027549653444, |
| "grad_norm": 0.4170769453048706, |
| "learning_rate": 0.00032919930069930065, |
| "loss": 3.2078, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.59758867726717, |
| "grad_norm": 0.391119122505188, |
| "learning_rate": 0.0003290244755244755, |
| "loss": 3.2127, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.61214980488089, |
| "grad_norm": 0.4319838583469391, |
| "learning_rate": 0.0003288496503496503, |
| "loss": 3.2134, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.626710932494614, |
| "grad_norm": 0.41355255246162415, |
| "learning_rate": 0.00032867482517482516, |
| "loss": 3.2173, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.641272060108335, |
| "grad_norm": 0.4015265107154846, |
| "learning_rate": 0.00032849999999999996, |
| "loss": 3.2159, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.655833187722056, |
| "grad_norm": 0.41282346844673157, |
| "learning_rate": 0.0003283251748251748, |
| "loss": 3.2163, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.67039431533578, |
| "grad_norm": 0.39489439129829407, |
| "learning_rate": 0.0003281503496503496, |
| "loss": 3.21, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.6849554429495, |
| "grad_norm": 0.39321643114089966, |
| "learning_rate": 0.00032797552447552447, |
| "loss": 3.2076, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.699516570563226, |
| "grad_norm": 0.4069315195083618, |
| "learning_rate": 0.00032780069930069927, |
| "loss": 3.217, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.714077698176947, |
| "grad_norm": 0.40287089347839355, |
| "learning_rate": 0.0003276258741258741, |
| "loss": 3.2172, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.714077698176947, |
| "eval_accuracy": 0.37371646268717623, |
| "eval_loss": 3.53983998298645, |
| "eval_runtime": 82.3321, |
| "eval_samples_per_second": 202.12, |
| "eval_steps_per_second": 12.644, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.728638825790668, |
| "grad_norm": 0.3979836702346802, |
| "learning_rate": 0.0003274510489510489, |
| "loss": 3.2283, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.743199953404392, |
| "grad_norm": 0.39330774545669556, |
| "learning_rate": 0.0003272762237762238, |
| "loss": 3.2199, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.757761081018113, |
| "grad_norm": 0.39571234583854675, |
| "learning_rate": 0.0003271013986013985, |
| "loss": 3.2234, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.772322208631838, |
| "grad_norm": 0.40434396266937256, |
| "learning_rate": 0.0003269265734265734, |
| "loss": 3.215, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.78688333624556, |
| "grad_norm": 0.408203661441803, |
| "learning_rate": 0.0003267517482517482, |
| "loss": 3.2252, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.80144446385928, |
| "grad_norm": 0.39552298188209534, |
| "learning_rate": 0.00032657692307692303, |
| "loss": 3.2325, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.816005591473004, |
| "grad_norm": 0.4124816656112671, |
| "learning_rate": 0.0003264020979020979, |
| "loss": 3.2139, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.830566719086725, |
| "grad_norm": 0.40322208404541016, |
| "learning_rate": 0.0003262272727272727, |
| "loss": 3.2314, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.84512784670045, |
| "grad_norm": 0.43797048926353455, |
| "learning_rate": 0.00032605244755244754, |
| "loss": 3.2317, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.85968897431417, |
| "grad_norm": 0.41053059697151184, |
| "learning_rate": 0.00032587762237762234, |
| "loss": 3.2263, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.874250101927892, |
| "grad_norm": 0.3760128319263458, |
| "learning_rate": 0.0003257027972027972, |
| "loss": 3.2239, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.888811229541616, |
| "grad_norm": 0.39207974076271057, |
| "learning_rate": 0.000325527972027972, |
| "loss": 3.2183, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.903372357155337, |
| "grad_norm": 0.38080188632011414, |
| "learning_rate": 0.00032535314685314684, |
| "loss": 3.2293, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.917933484769062, |
| "grad_norm": 0.4008138179779053, |
| "learning_rate": 0.00032517832167832164, |
| "loss": 3.2351, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.932494612382783, |
| "grad_norm": 0.45200642943382263, |
| "learning_rate": 0.0003250034965034965, |
| "loss": 3.2299, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.947055739996504, |
| "grad_norm": 0.37784847617149353, |
| "learning_rate": 0.0003248286713286713, |
| "loss": 3.2221, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.96161686761023, |
| "grad_norm": 0.4110269546508789, |
| "learning_rate": 0.00032465384615384615, |
| "loss": 3.2337, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.97617799522395, |
| "grad_norm": 0.38130491971969604, |
| "learning_rate": 0.0003244790209790209, |
| "loss": 3.24, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.990739122837674, |
| "grad_norm": 0.3994849920272827, |
| "learning_rate": 0.00032430419580419575, |
| "loss": 3.2398, |
| "step": 78950 |
| }, |
| { |
| "epoch": 23.00524200594094, |
| "grad_norm": 0.41424617171287537, |
| "learning_rate": 0.00032412937062937066, |
| "loss": 3.1973, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.00524200594094, |
| "eval_accuracy": 0.3734397547565717, |
| "eval_loss": 3.5512161254882812, |
| "eval_runtime": 82.0355, |
| "eval_samples_per_second": 202.851, |
| "eval_steps_per_second": 12.69, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.019803133554664, |
| "grad_norm": 0.41139036417007446, |
| "learning_rate": 0.0003239545454545454, |
| "loss": 3.1288, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.034364261168385, |
| "grad_norm": 0.44748491048812866, |
| "learning_rate": 0.00032377972027972026, |
| "loss": 3.1356, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.048925388782106, |
| "grad_norm": 0.417758047580719, |
| "learning_rate": 0.00032360489510489506, |
| "loss": 3.1353, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.06348651639583, |
| "grad_norm": 0.39194077253341675, |
| "learning_rate": 0.0003234300699300699, |
| "loss": 3.1406, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.07804764400955, |
| "grad_norm": 0.4011492133140564, |
| "learning_rate": 0.0003232552447552447, |
| "loss": 3.1477, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.092608771623276, |
| "grad_norm": 0.4172074496746063, |
| "learning_rate": 0.00032308041958041957, |
| "loss": 3.14, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.107169899236997, |
| "grad_norm": 0.42427608370780945, |
| "learning_rate": 0.00032290559440559437, |
| "loss": 3.1406, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.121731026850718, |
| "grad_norm": 0.40667930245399475, |
| "learning_rate": 0.0003227307692307692, |
| "loss": 3.1446, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.136292154464442, |
| "grad_norm": 0.43607693910598755, |
| "learning_rate": 0.000322555944055944, |
| "loss": 3.167, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.150853282078163, |
| "grad_norm": 0.42833682894706726, |
| "learning_rate": 0.00032238111888111887, |
| "loss": 3.1478, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.165414409691888, |
| "grad_norm": 0.4050724506378174, |
| "learning_rate": 0.00032220629370629367, |
| "loss": 3.1762, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.17997553730561, |
| "grad_norm": 0.4192575514316559, |
| "learning_rate": 0.0003220314685314685, |
| "loss": 3.1598, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.19453666491933, |
| "grad_norm": 0.40911537408828735, |
| "learning_rate": 0.00032185664335664327, |
| "loss": 3.1599, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.209097792533054, |
| "grad_norm": 0.4374655783176422, |
| "learning_rate": 0.0003216818181818181, |
| "loss": 3.1419, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.223658920146775, |
| "grad_norm": 0.4258494973182678, |
| "learning_rate": 0.00032150699300699303, |
| "loss": 3.1662, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.2382200477605, |
| "grad_norm": 0.4280802011489868, |
| "learning_rate": 0.0003213321678321678, |
| "loss": 3.1607, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.25278117537422, |
| "grad_norm": 0.42071959376335144, |
| "learning_rate": 0.00032115734265734263, |
| "loss": 3.1705, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.26734230298794, |
| "grad_norm": 0.48741045594215393, |
| "learning_rate": 0.00032098251748251743, |
| "loss": 3.1708, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.281903430601666, |
| "grad_norm": 0.4214498996734619, |
| "learning_rate": 0.0003208076923076923, |
| "loss": 3.1678, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.296464558215387, |
| "grad_norm": 0.40716812014579773, |
| "learning_rate": 0.0003206328671328671, |
| "loss": 3.167, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.296464558215387, |
| "eval_accuracy": 0.3736282642392572, |
| "eval_loss": 3.5487704277038574, |
| "eval_runtime": 82.1056, |
| "eval_samples_per_second": 202.678, |
| "eval_steps_per_second": 12.679, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.31102568582911, |
| "grad_norm": 0.4156472086906433, |
| "learning_rate": 0.00032045804195804194, |
| "loss": 3.1742, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.325586813442833, |
| "grad_norm": 0.40892067551612854, |
| "learning_rate": 0.00032028321678321674, |
| "loss": 3.1818, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.340147941056557, |
| "grad_norm": 0.4137515425682068, |
| "learning_rate": 0.0003201083916083916, |
| "loss": 3.1801, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.354709068670278, |
| "grad_norm": 0.41917335987091064, |
| "learning_rate": 0.0003199335664335664, |
| "loss": 3.1917, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.369270196284, |
| "grad_norm": 0.4533822238445282, |
| "learning_rate": 0.00031975874125874125, |
| "loss": 3.1838, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.383831323897724, |
| "grad_norm": 0.40184345841407776, |
| "learning_rate": 0.00031958391608391605, |
| "loss": 3.1931, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.398392451511445, |
| "grad_norm": 0.4526698589324951, |
| "learning_rate": 0.0003194090909090909, |
| "loss": 3.1802, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.41295357912517, |
| "grad_norm": 0.4204019010066986, |
| "learning_rate": 0.00031923426573426576, |
| "loss": 3.194, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.42751470673889, |
| "grad_norm": 0.4197448790073395, |
| "learning_rate": 0.0003190594405594405, |
| "loss": 3.1937, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.44207583435261, |
| "grad_norm": 0.42730778455734253, |
| "learning_rate": 0.0003188846153846154, |
| "loss": 3.189, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.456636961966336, |
| "grad_norm": 0.44028347730636597, |
| "learning_rate": 0.00031870979020979015, |
| "loss": 3.1971, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.471198089580056, |
| "grad_norm": 0.4363892078399658, |
| "learning_rate": 0.000318534965034965, |
| "loss": 3.1879, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.48575921719378, |
| "grad_norm": 0.4060303270816803, |
| "learning_rate": 0.0003183601398601398, |
| "loss": 3.1894, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.500320344807502, |
| "grad_norm": 0.4261462688446045, |
| "learning_rate": 0.00031818531468531466, |
| "loss": 3.1965, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.514881472421223, |
| "grad_norm": 0.44526487588882446, |
| "learning_rate": 0.00031801048951048946, |
| "loss": 3.2138, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.529442600034947, |
| "grad_norm": 0.40133729577064514, |
| "learning_rate": 0.0003178356643356643, |
| "loss": 3.1879, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.54400372764867, |
| "grad_norm": 0.411836713552475, |
| "learning_rate": 0.0003176608391608391, |
| "loss": 3.2041, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.558564855262393, |
| "grad_norm": 0.42505648732185364, |
| "learning_rate": 0.00031748601398601397, |
| "loss": 3.1868, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.573125982876114, |
| "grad_norm": 0.4069623351097107, |
| "learning_rate": 0.00031731118881118877, |
| "loss": 3.2088, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.587687110489835, |
| "grad_norm": 0.43776145577430725, |
| "learning_rate": 0.0003171363636363636, |
| "loss": 3.2075, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.587687110489835, |
| "eval_accuracy": 0.37413228896963163, |
| "eval_loss": 3.539616107940674, |
| "eval_runtime": 82.1632, |
| "eval_samples_per_second": 202.536, |
| "eval_steps_per_second": 12.67, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.60224823810356, |
| "grad_norm": 0.43945014476776123, |
| "learning_rate": 0.0003169615384615385, |
| "loss": 3.2139, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.61680936571728, |
| "grad_norm": 0.41378363966941833, |
| "learning_rate": 0.0003167867132867133, |
| "loss": 3.2109, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.631370493331005, |
| "grad_norm": 0.39417126774787903, |
| "learning_rate": 0.00031661188811188813, |
| "loss": 3.2083, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.645931620944726, |
| "grad_norm": 0.3895316421985626, |
| "learning_rate": 0.0003164370629370629, |
| "loss": 3.2082, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.660492748558447, |
| "grad_norm": 0.40411219000816345, |
| "learning_rate": 0.0003162622377622378, |
| "loss": 3.2074, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.67505387617217, |
| "grad_norm": 0.46739012002944946, |
| "learning_rate": 0.00031608741258741253, |
| "loss": 3.2032, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.689615003785892, |
| "grad_norm": 0.4223136901855469, |
| "learning_rate": 0.0003159125874125874, |
| "loss": 3.2123, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.704176131399617, |
| "grad_norm": 0.4023708999156952, |
| "learning_rate": 0.0003157377622377622, |
| "loss": 3.2165, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.718737259013338, |
| "grad_norm": 0.4357004761695862, |
| "learning_rate": 0.00031556293706293704, |
| "loss": 3.2151, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.73329838662706, |
| "grad_norm": 0.4341804087162018, |
| "learning_rate": 0.00031538811188811184, |
| "loss": 3.2119, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.747859514240783, |
| "grad_norm": 0.41073915362358093, |
| "learning_rate": 0.0003152132867132867, |
| "loss": 3.2214, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.762420641854504, |
| "grad_norm": 0.39255452156066895, |
| "learning_rate": 0.0003150384615384615, |
| "loss": 3.2099, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.77698176946823, |
| "grad_norm": 0.39017966389656067, |
| "learning_rate": 0.00031486363636363634, |
| "loss": 3.2142, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.79154289708195, |
| "grad_norm": 0.4111308157444, |
| "learning_rate": 0.00031468881118881114, |
| "loss": 3.205, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.80610402469567, |
| "grad_norm": 0.43630319833755493, |
| "learning_rate": 0.000314513986013986, |
| "loss": 3.2116, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.820665152309395, |
| "grad_norm": 0.41596662998199463, |
| "learning_rate": 0.00031433916083916085, |
| "loss": 3.216, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.835226279923116, |
| "grad_norm": 0.42351922392845154, |
| "learning_rate": 0.00031416433566433565, |
| "loss": 3.2102, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.84978740753684, |
| "grad_norm": 0.3973967134952545, |
| "learning_rate": 0.0003139895104895105, |
| "loss": 3.2018, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.86434853515056, |
| "grad_norm": 0.41471436619758606, |
| "learning_rate": 0.00031381468531468525, |
| "loss": 3.2132, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.878909662764286, |
| "grad_norm": 0.4077412188053131, |
| "learning_rate": 0.00031363986013986016, |
| "loss": 3.2149, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.878909662764286, |
| "eval_accuracy": 0.3744067625395555, |
| "eval_loss": 3.534972667694092, |
| "eval_runtime": 81.9839, |
| "eval_samples_per_second": 202.979, |
| "eval_steps_per_second": 12.698, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.893470790378007, |
| "grad_norm": 0.4210464358329773, |
| "learning_rate": 0.0003134650349650349, |
| "loss": 3.2182, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.908031917991728, |
| "grad_norm": 0.4310647249221802, |
| "learning_rate": 0.00031329020979020976, |
| "loss": 3.224, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.922593045605453, |
| "grad_norm": 0.4387366771697998, |
| "learning_rate": 0.00031311538461538456, |
| "loss": 3.2167, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.937154173219174, |
| "grad_norm": 0.41095641255378723, |
| "learning_rate": 0.0003129405594405594, |
| "loss": 3.2223, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.951715300832895, |
| "grad_norm": 0.42513301968574524, |
| "learning_rate": 0.0003127657342657342, |
| "loss": 3.2156, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.96627642844662, |
| "grad_norm": 0.4029749631881714, |
| "learning_rate": 0.00031259090909090907, |
| "loss": 3.2266, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.98083755606034, |
| "grad_norm": 0.39269566535949707, |
| "learning_rate": 0.00031241608391608386, |
| "loss": 3.2273, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.995398683674065, |
| "grad_norm": 0.42530369758605957, |
| "learning_rate": 0.0003122412587412587, |
| "loss": 3.2252, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.00990156677733, |
| "grad_norm": 0.4148097634315491, |
| "learning_rate": 0.00031206643356643357, |
| "loss": 3.1428, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.024462694391055, |
| "grad_norm": 0.41627079248428345, |
| "learning_rate": 0.00031189160839160837, |
| "loss": 3.1225, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.039023822004776, |
| "grad_norm": 0.4008599519729614, |
| "learning_rate": 0.0003117167832167832, |
| "loss": 3.1159, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.0535849496185, |
| "grad_norm": 0.40943092107772827, |
| "learning_rate": 0.000311541958041958, |
| "loss": 3.1195, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.06814607723222, |
| "grad_norm": 0.41067296266555786, |
| "learning_rate": 0.0003113671328671329, |
| "loss": 3.1333, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.082707204845942, |
| "grad_norm": 0.4443258047103882, |
| "learning_rate": 0.0003111923076923076, |
| "loss": 3.1374, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.097268332459667, |
| "grad_norm": 0.4577665328979492, |
| "learning_rate": 0.00031101748251748253, |
| "loss": 3.1346, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.111829460073388, |
| "grad_norm": 0.39557501673698425, |
| "learning_rate": 0.0003108426573426573, |
| "loss": 3.147, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.126390587687112, |
| "grad_norm": 0.4025188088417053, |
| "learning_rate": 0.00031066783216783213, |
| "loss": 3.1375, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.140951715300833, |
| "grad_norm": 0.4058598279953003, |
| "learning_rate": 0.00031049300699300693, |
| "loss": 3.1494, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.155512842914554, |
| "grad_norm": 0.39869424700737, |
| "learning_rate": 0.0003103181818181818, |
| "loss": 3.1565, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.17007397052828, |
| "grad_norm": 0.40525320172309875, |
| "learning_rate": 0.0003101433566433566, |
| "loss": 3.1563, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.17007397052828, |
| "eval_accuracy": 0.37344387068414125, |
| "eval_loss": 3.552462339401245, |
| "eval_runtime": 82.016, |
| "eval_samples_per_second": 202.899, |
| "eval_steps_per_second": 12.693, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.184635098142, |
| "grad_norm": 0.43325430154800415, |
| "learning_rate": 0.00030996853146853144, |
| "loss": 3.163, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.199196225755724, |
| "grad_norm": 0.42324233055114746, |
| "learning_rate": 0.00030979370629370624, |
| "loss": 3.1407, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.213757353369445, |
| "grad_norm": 0.4321128726005554, |
| "learning_rate": 0.0003096188811188811, |
| "loss": 3.154, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.228318480983166, |
| "grad_norm": 0.41666126251220703, |
| "learning_rate": 0.00030944405594405595, |
| "loss": 3.1588, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.24287960859689, |
| "grad_norm": 0.4264470934867859, |
| "learning_rate": 0.00030926923076923075, |
| "loss": 3.1706, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.25744073621061, |
| "grad_norm": 0.40624287724494934, |
| "learning_rate": 0.0003090944055944056, |
| "loss": 3.1608, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.272001863824336, |
| "grad_norm": 0.42733705043792725, |
| "learning_rate": 0.0003089195804195804, |
| "loss": 3.1519, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.286562991438057, |
| "grad_norm": 0.40160825848579407, |
| "learning_rate": 0.00030874475524475525, |
| "loss": 3.1668, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.301124119051778, |
| "grad_norm": 0.40998250246047974, |
| "learning_rate": 0.00030856993006993, |
| "loss": 3.1696, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.315685246665502, |
| "grad_norm": 0.4238924980163574, |
| "learning_rate": 0.0003083951048951049, |
| "loss": 3.1746, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.330246374279223, |
| "grad_norm": 0.47616925835609436, |
| "learning_rate": 0.00030822027972027965, |
| "loss": 3.1693, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.344807501892948, |
| "grad_norm": 0.44316303730010986, |
| "learning_rate": 0.0003080454545454545, |
| "loss": 3.1638, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.35936862950667, |
| "grad_norm": 0.41838720440864563, |
| "learning_rate": 0.0003078706293706293, |
| "loss": 3.165, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.37392975712039, |
| "grad_norm": 0.4260813891887665, |
| "learning_rate": 0.00030769580419580416, |
| "loss": 3.1821, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.388490884734114, |
| "grad_norm": 0.40399202704429626, |
| "learning_rate": 0.00030752097902097896, |
| "loss": 3.1697, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.403052012347835, |
| "grad_norm": 0.4294564425945282, |
| "learning_rate": 0.0003073461538461538, |
| "loss": 3.1915, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.41761313996156, |
| "grad_norm": 0.4286939203739166, |
| "learning_rate": 0.00030717132867132867, |
| "loss": 3.1821, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.43217426757528, |
| "grad_norm": 0.42318156361579895, |
| "learning_rate": 0.00030699650349650347, |
| "loss": 3.1815, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.446735395189002, |
| "grad_norm": 0.42563241720199585, |
| "learning_rate": 0.0003068216783216783, |
| "loss": 3.1925, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.461296522802726, |
| "grad_norm": 0.4222131073474884, |
| "learning_rate": 0.0003066468531468531, |
| "loss": 3.1832, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.461296522802726, |
| "eval_accuracy": 0.37402186451283703, |
| "eval_loss": 3.5431149005889893, |
| "eval_runtime": 82.2041, |
| "eval_samples_per_second": 202.435, |
| "eval_steps_per_second": 12.664, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.475857650416447, |
| "grad_norm": 0.4192737936973572, |
| "learning_rate": 0.000306472027972028, |
| "loss": 3.1754, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.490418778030172, |
| "grad_norm": 0.39972081780433655, |
| "learning_rate": 0.0003062972027972028, |
| "loss": 3.1809, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.504979905643893, |
| "grad_norm": 0.3968181908130646, |
| "learning_rate": 0.00030612237762237763, |
| "loss": 3.1861, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.519541033257614, |
| "grad_norm": 0.4444066882133484, |
| "learning_rate": 0.0003059475524475524, |
| "loss": 3.1777, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.53410216087134, |
| "grad_norm": 0.3841025233268738, |
| "learning_rate": 0.0003057727272727273, |
| "loss": 3.183, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.54866328848506, |
| "grad_norm": 0.3849414587020874, |
| "learning_rate": 0.00030559790209790203, |
| "loss": 3.1925, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.563224416098784, |
| "grad_norm": 0.4161996841430664, |
| "learning_rate": 0.0003054230769230769, |
| "loss": 3.1917, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.577785543712505, |
| "grad_norm": 0.41718488931655884, |
| "learning_rate": 0.0003052482517482517, |
| "loss": 3.1852, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.592346671326226, |
| "grad_norm": 0.4140050709247589, |
| "learning_rate": 0.00030507342657342654, |
| "loss": 3.1934, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.60690779893995, |
| "grad_norm": 0.42590102553367615, |
| "learning_rate": 0.00030489860139860134, |
| "loss": 3.2002, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.62146892655367, |
| "grad_norm": 0.4079754650592804, |
| "learning_rate": 0.0003047237762237762, |
| "loss": 3.2066, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.636030054167396, |
| "grad_norm": 0.4108400344848633, |
| "learning_rate": 0.00030454895104895104, |
| "loss": 3.1999, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.650591181781117, |
| "grad_norm": 0.43076011538505554, |
| "learning_rate": 0.00030437412587412584, |
| "loss": 3.1896, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.66515230939484, |
| "grad_norm": 0.4303816556930542, |
| "learning_rate": 0.0003041993006993007, |
| "loss": 3.1951, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.679713437008562, |
| "grad_norm": 0.39632555842399597, |
| "learning_rate": 0.0003040244755244755, |
| "loss": 3.2015, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.694274564622283, |
| "grad_norm": 0.4123823344707489, |
| "learning_rate": 0.00030384965034965035, |
| "loss": 3.2054, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.708835692236008, |
| "grad_norm": 0.44901368021965027, |
| "learning_rate": 0.00030367482517482515, |
| "loss": 3.1985, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.72339681984973, |
| "grad_norm": 0.41807523369789124, |
| "learning_rate": 0.0003035, |
| "loss": 3.2054, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.737957947463453, |
| "grad_norm": 0.45738163590431213, |
| "learning_rate": 0.00030332517482517475, |
| "loss": 3.2092, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.752519075077174, |
| "grad_norm": 0.403689444065094, |
| "learning_rate": 0.00030315034965034966, |
| "loss": 3.1941, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.752519075077174, |
| "eval_accuracy": 0.3745391778093646, |
| "eval_loss": 3.5367817878723145, |
| "eval_runtime": 82.0758, |
| "eval_samples_per_second": 202.752, |
| "eval_steps_per_second": 12.683, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.767080202690895, |
| "grad_norm": 0.4533366560935974, |
| "learning_rate": 0.0003029755244755244, |
| "loss": 3.1991, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.78164133030462, |
| "grad_norm": 0.40302085876464844, |
| "learning_rate": 0.00030280069930069926, |
| "loss": 3.2064, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.79620245791834, |
| "grad_norm": 0.4038363993167877, |
| "learning_rate": 0.00030262587412587406, |
| "loss": 3.201, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.810763585532065, |
| "grad_norm": 0.4185418486595154, |
| "learning_rate": 0.0003024510489510489, |
| "loss": 3.2146, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.825324713145786, |
| "grad_norm": 0.4759293496608734, |
| "learning_rate": 0.00030227622377622377, |
| "loss": 3.2071, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.839885840759507, |
| "grad_norm": 0.3995543420314789, |
| "learning_rate": 0.00030210139860139856, |
| "loss": 3.2121, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.85444696837323, |
| "grad_norm": 0.4144052565097809, |
| "learning_rate": 0.0003019265734265734, |
| "loss": 3.2163, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.869008095986953, |
| "grad_norm": 0.4440772831439972, |
| "learning_rate": 0.0003017517482517482, |
| "loss": 3.2146, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.883569223600677, |
| "grad_norm": 0.4497220814228058, |
| "learning_rate": 0.00030157692307692307, |
| "loss": 3.2002, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.898130351214398, |
| "grad_norm": 0.43450841307640076, |
| "learning_rate": 0.00030140209790209787, |
| "loss": 3.2157, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.91269147882812, |
| "grad_norm": 0.42822250723838806, |
| "learning_rate": 0.0003012272727272727, |
| "loss": 3.2006, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.927252606441844, |
| "grad_norm": 0.38749244809150696, |
| "learning_rate": 0.0003010524475524475, |
| "loss": 3.2085, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.941813734055565, |
| "grad_norm": 0.45322224497795105, |
| "learning_rate": 0.0003008776223776224, |
| "loss": 3.207, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.95637486166929, |
| "grad_norm": 0.4396408498287201, |
| "learning_rate": 0.0003007027972027972, |
| "loss": 3.1979, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.97093598928301, |
| "grad_norm": 0.40385472774505615, |
| "learning_rate": 0.00030052797202797203, |
| "loss": 3.2156, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.98549711689673, |
| "grad_norm": 0.42568182945251465, |
| "learning_rate": 0.0003003531468531468, |
| "loss": 3.2152, |
| "step": 85800 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.48255637288093567, |
| "learning_rate": 0.00030017832167832163, |
| "loss": 3.223, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.01456112761372, |
| "grad_norm": 0.42533791065216064, |
| "learning_rate": 0.0003000034965034965, |
| "loss": 3.1128, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.029122255227445, |
| "grad_norm": 0.4242820143699646, |
| "learning_rate": 0.0002998286713286713, |
| "loss": 3.1117, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.043683382841166, |
| "grad_norm": 0.41748517751693726, |
| "learning_rate": 0.00029965384615384614, |
| "loss": 3.1068, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.043683382841166, |
| "eval_accuracy": 0.37381324578402597, |
| "eval_loss": 3.5506114959716797, |
| "eval_runtime": 82.011, |
| "eval_samples_per_second": 202.912, |
| "eval_steps_per_second": 12.693, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.05824451045489, |
| "grad_norm": 0.43169549107551575, |
| "learning_rate": 0.00029947902097902094, |
| "loss": 3.1258, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.072805638068612, |
| "grad_norm": 0.43729960918426514, |
| "learning_rate": 0.0002993041958041958, |
| "loss": 3.1296, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.087366765682333, |
| "grad_norm": 0.4284965693950653, |
| "learning_rate": 0.0002991293706293706, |
| "loss": 3.1427, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.101927893296057, |
| "grad_norm": 0.43102532625198364, |
| "learning_rate": 0.0002989545454545454, |
| "loss": 3.1178, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.11648902090978, |
| "grad_norm": 0.3853287398815155, |
| "learning_rate": 0.00029877972027972025, |
| "loss": 3.1371, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.131050148523503, |
| "grad_norm": 0.46812674403190613, |
| "learning_rate": 0.0002986048951048951, |
| "loss": 3.1391, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.145611276137224, |
| "grad_norm": 0.4182865023612976, |
| "learning_rate": 0.0002984300699300699, |
| "loss": 3.1281, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.160172403750945, |
| "grad_norm": 0.40853768587112427, |
| "learning_rate": 0.00029825524475524475, |
| "loss": 3.1357, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.17473353136467, |
| "grad_norm": 0.42215555906295776, |
| "learning_rate": 0.00029808041958041955, |
| "loss": 3.1427, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.18929465897839, |
| "grad_norm": 0.3856315612792969, |
| "learning_rate": 0.0002979055944055944, |
| "loss": 3.1452, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.203855786592115, |
| "grad_norm": 0.41229718923568726, |
| "learning_rate": 0.0002977307692307692, |
| "loss": 3.1463, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.218416914205836, |
| "grad_norm": 0.42555561661720276, |
| "learning_rate": 0.000297555944055944, |
| "loss": 3.1494, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.232978041819557, |
| "grad_norm": 0.43103721737861633, |
| "learning_rate": 0.00029738111888111886, |
| "loss": 3.1553, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.24753916943328, |
| "grad_norm": 0.45156463980674744, |
| "learning_rate": 0.00029720629370629366, |
| "loss": 3.1498, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.262100297047002, |
| "grad_norm": 0.4618069529533386, |
| "learning_rate": 0.0002970314685314685, |
| "loss": 3.1527, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.276661424660727, |
| "grad_norm": 0.4524608254432678, |
| "learning_rate": 0.0002968566433566433, |
| "loss": 3.1531, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.291222552274448, |
| "grad_norm": 0.42194461822509766, |
| "learning_rate": 0.00029668181818181817, |
| "loss": 3.158, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.305783679888172, |
| "grad_norm": 0.4134163558483124, |
| "learning_rate": 0.00029650699300699297, |
| "loss": 3.165, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.320344807501893, |
| "grad_norm": 0.44418227672576904, |
| "learning_rate": 0.0002963321678321678, |
| "loss": 3.1694, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.334905935115614, |
| "grad_norm": 0.43439069390296936, |
| "learning_rate": 0.0002961573426573426, |
| "loss": 3.1546, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.334905935115614, |
| "eval_accuracy": 0.3740572614899352, |
| "eval_loss": 3.547642469406128, |
| "eval_runtime": 82.0794, |
| "eval_samples_per_second": 202.743, |
| "eval_steps_per_second": 12.683, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.34946706272934, |
| "grad_norm": 0.42910781502723694, |
| "learning_rate": 0.0002959825174825175, |
| "loss": 3.1555, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.36402819034306, |
| "grad_norm": 0.40855082869529724, |
| "learning_rate": 0.0002958076923076923, |
| "loss": 3.1701, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.378589317956784, |
| "grad_norm": 0.4396383464336395, |
| "learning_rate": 0.00029563286713286713, |
| "loss": 3.1537, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.393150445570505, |
| "grad_norm": 0.43299999833106995, |
| "learning_rate": 0.00029545804195804193, |
| "loss": 3.1706, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.407711573184226, |
| "grad_norm": 0.41637924313545227, |
| "learning_rate": 0.0002952832167832168, |
| "loss": 3.1811, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.42227270079795, |
| "grad_norm": 0.4246363043785095, |
| "learning_rate": 0.0002951083916083916, |
| "loss": 3.172, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.43683382841167, |
| "grad_norm": 0.4735258221626282, |
| "learning_rate": 0.0002949335664335664, |
| "loss": 3.1635, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.451394956025396, |
| "grad_norm": 0.4207344055175781, |
| "learning_rate": 0.00029475874125874124, |
| "loss": 3.1658, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.465956083639117, |
| "grad_norm": 0.43009617924690247, |
| "learning_rate": 0.00029458391608391604, |
| "loss": 3.18, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.480517211252838, |
| "grad_norm": 0.42038339376449585, |
| "learning_rate": 0.0002944090909090909, |
| "loss": 3.17, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.495078338866563, |
| "grad_norm": 0.4331233501434326, |
| "learning_rate": 0.0002942342657342657, |
| "loss": 3.179, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.509639466480284, |
| "grad_norm": 0.42891553044319153, |
| "learning_rate": 0.00029405944055944054, |
| "loss": 3.1767, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.524200594094008, |
| "grad_norm": 0.42849770188331604, |
| "learning_rate": 0.0002938846153846154, |
| "loss": 3.1799, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.53876172170773, |
| "grad_norm": 0.4017314016819, |
| "learning_rate": 0.0002937097902097902, |
| "loss": 3.1609, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.55332284932145, |
| "grad_norm": 0.41529572010040283, |
| "learning_rate": 0.000293534965034965, |
| "loss": 3.1817, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.567883976935175, |
| "grad_norm": 0.4347538948059082, |
| "learning_rate": 0.00029336013986013985, |
| "loss": 3.1672, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.582445104548896, |
| "grad_norm": 0.42473986744880676, |
| "learning_rate": 0.00029318531468531465, |
| "loss": 3.1906, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.59700623216262, |
| "grad_norm": 0.4047037661075592, |
| "learning_rate": 0.0002930104895104895, |
| "loss": 3.1753, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.61156735977634, |
| "grad_norm": 0.4402819573879242, |
| "learning_rate": 0.0002928356643356643, |
| "loss": 3.1798, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.626128487390062, |
| "grad_norm": 0.4249858856201172, |
| "learning_rate": 0.00029266083916083916, |
| "loss": 3.1857, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.626128487390062, |
| "eval_accuracy": 0.3743038643503167, |
| "eval_loss": 3.541656732559204, |
| "eval_runtime": 82.3652, |
| "eval_samples_per_second": 202.039, |
| "eval_steps_per_second": 12.639, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.640689615003787, |
| "grad_norm": 0.4145389497280121, |
| "learning_rate": 0.00029248601398601396, |
| "loss": 3.1818, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.655250742617508, |
| "grad_norm": 0.41636502742767334, |
| "learning_rate": 0.00029231118881118876, |
| "loss": 3.1934, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.669811870231232, |
| "grad_norm": 0.42255234718322754, |
| "learning_rate": 0.0002921363636363636, |
| "loss": 3.1879, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.684372997844953, |
| "grad_norm": 0.459856778383255, |
| "learning_rate": 0.0002919615384615384, |
| "loss": 3.1944, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.698934125458674, |
| "grad_norm": 0.4657002389431, |
| "learning_rate": 0.00029178671328671326, |
| "loss": 3.1909, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.7134952530724, |
| "grad_norm": 0.4016781151294708, |
| "learning_rate": 0.00029161188811188806, |
| "loss": 3.1911, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.72805638068612, |
| "grad_norm": 0.3989131450653076, |
| "learning_rate": 0.0002914370629370629, |
| "loss": 3.1984, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.742617508299844, |
| "grad_norm": 0.4224452078342438, |
| "learning_rate": 0.00029126223776223777, |
| "loss": 3.1769, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.757178635913565, |
| "grad_norm": 0.4237838685512543, |
| "learning_rate": 0.00029108741258741257, |
| "loss": 3.2024, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.771739763527286, |
| "grad_norm": 0.42534908652305603, |
| "learning_rate": 0.00029091258741258737, |
| "loss": 3.1919, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.78630089114101, |
| "grad_norm": 0.3884274661540985, |
| "learning_rate": 0.0002907377622377622, |
| "loss": 3.2004, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.80086201875473, |
| "grad_norm": 0.4012846052646637, |
| "learning_rate": 0.000290562937062937, |
| "loss": 3.194, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.815423146368456, |
| "grad_norm": 0.44112586975097656, |
| "learning_rate": 0.0002903881118881119, |
| "loss": 3.2073, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.829984273982177, |
| "grad_norm": 0.437884658575058, |
| "learning_rate": 0.0002902132867132867, |
| "loss": 3.1988, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.844545401595898, |
| "grad_norm": 0.45394134521484375, |
| "learning_rate": 0.00029003846153846153, |
| "loss": 3.2035, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.859106529209622, |
| "grad_norm": 0.4056381285190582, |
| "learning_rate": 0.00028986363636363633, |
| "loss": 3.1984, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.873667656823343, |
| "grad_norm": 0.4186859130859375, |
| "learning_rate": 0.00028968881118881113, |
| "loss": 3.1993, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.888228784437068, |
| "grad_norm": 0.4594738185405731, |
| "learning_rate": 0.000289513986013986, |
| "loss": 3.2086, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.90278991205079, |
| "grad_norm": 0.42213621735572815, |
| "learning_rate": 0.0002893391608391608, |
| "loss": 3.2026, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.91735103966451, |
| "grad_norm": 0.40868252515792847, |
| "learning_rate": 0.00028916433566433564, |
| "loss": 3.21, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.91735103966451, |
| "eval_accuracy": 0.375065310950684, |
| "eval_loss": 3.532235860824585, |
| "eval_runtime": 82.0011, |
| "eval_samples_per_second": 202.936, |
| "eval_steps_per_second": 12.695, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.931912167278234, |
| "grad_norm": 0.4510246813297272, |
| "learning_rate": 0.0002889895104895105, |
| "loss": 3.204, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.946473294891955, |
| "grad_norm": 0.43960273265838623, |
| "learning_rate": 0.0002888146853146853, |
| "loss": 3.2013, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.96103442250568, |
| "grad_norm": 0.462924987077713, |
| "learning_rate": 0.00028863986013986015, |
| "loss": 3.2053, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.9755955501194, |
| "grad_norm": 0.40917086601257324, |
| "learning_rate": 0.00028846503496503495, |
| "loss": 3.2014, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.990156677733125, |
| "grad_norm": 0.43744927644729614, |
| "learning_rate": 0.00028829020979020975, |
| "loss": 3.2103, |
| "step": 89250 |
| }, |
| { |
| "epoch": 26.00465956083639, |
| "grad_norm": 0.42926573753356934, |
| "learning_rate": 0.0002881153846153846, |
| "loss": 3.1655, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.019220688450112, |
| "grad_norm": 0.3990574777126312, |
| "learning_rate": 0.0002879405594405594, |
| "loss": 3.105, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.033781816063836, |
| "grad_norm": 0.4124816060066223, |
| "learning_rate": 0.00028776573426573425, |
| "loss": 3.1061, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.048342943677557, |
| "grad_norm": 0.39809614419937134, |
| "learning_rate": 0.00028759090909090905, |
| "loss": 3.1165, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.062904071291282, |
| "grad_norm": 0.41795167326927185, |
| "learning_rate": 0.0002874160839160839, |
| "loss": 3.1191, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.077465198905003, |
| "grad_norm": 0.42836716771125793, |
| "learning_rate": 0.0002872412587412587, |
| "loss": 3.1168, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.092026326518727, |
| "grad_norm": 0.4195990562438965, |
| "learning_rate": 0.0002870664335664335, |
| "loss": 3.1216, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.10658745413245, |
| "grad_norm": 0.4439239203929901, |
| "learning_rate": 0.00028689160839160836, |
| "loss": 3.1169, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.12114858174617, |
| "grad_norm": 0.4708871841430664, |
| "learning_rate": 0.0002867167832167832, |
| "loss": 3.1261, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.135709709359894, |
| "grad_norm": 0.44045379757881165, |
| "learning_rate": 0.000286541958041958, |
| "loss": 3.1187, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.150270836973615, |
| "grad_norm": 0.4310579001903534, |
| "learning_rate": 0.00028636713286713287, |
| "loss": 3.1369, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.16483196458734, |
| "grad_norm": 0.4071486294269562, |
| "learning_rate": 0.00028619230769230767, |
| "loss": 3.1346, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.17939309220106, |
| "grad_norm": 0.4394294023513794, |
| "learning_rate": 0.0002860174825174825, |
| "loss": 3.1316, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.19395421981478, |
| "grad_norm": 0.43135660886764526, |
| "learning_rate": 0.0002858426573426573, |
| "loss": 3.1395, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.208515347428506, |
| "grad_norm": 0.44521310925483704, |
| "learning_rate": 0.0002856678321678321, |
| "loss": 3.1305, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.208515347428506, |
| "eval_accuracy": 0.37415039905093767, |
| "eval_loss": 3.550454616546631, |
| "eval_runtime": 82.1172, |
| "eval_samples_per_second": 202.649, |
| "eval_steps_per_second": 12.677, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.223076475042227, |
| "grad_norm": 0.4306325316429138, |
| "learning_rate": 0.000285493006993007, |
| "loss": 3.1461, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.23763760265595, |
| "grad_norm": 0.44331514835357666, |
| "learning_rate": 0.0002853181818181818, |
| "loss": 3.1406, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.252198730269672, |
| "grad_norm": 0.42998775839805603, |
| "learning_rate": 0.00028514335664335663, |
| "loss": 3.1523, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.266759857883393, |
| "grad_norm": 0.42007073760032654, |
| "learning_rate": 0.00028496853146853143, |
| "loss": 3.1414, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.281320985497118, |
| "grad_norm": 0.4389496445655823, |
| "learning_rate": 0.0002847937062937063, |
| "loss": 3.1423, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.29588211311084, |
| "grad_norm": 0.44720974564552307, |
| "learning_rate": 0.0002846188811188811, |
| "loss": 3.1571, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.310443240724563, |
| "grad_norm": 0.42087510228157043, |
| "learning_rate": 0.0002844440559440559, |
| "loss": 3.1465, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.325004368338284, |
| "grad_norm": 0.408970445394516, |
| "learning_rate": 0.00028426923076923074, |
| "loss": 3.1437, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.339565495952005, |
| "grad_norm": 0.39762216806411743, |
| "learning_rate": 0.0002840944055944056, |
| "loss": 3.1507, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.35412662356573, |
| "grad_norm": 0.41580408811569214, |
| "learning_rate": 0.0002839195804195804, |
| "loss": 3.1491, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.36868775117945, |
| "grad_norm": 0.4389164447784424, |
| "learning_rate": 0.00028374475524475524, |
| "loss": 3.1522, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.383248878793175, |
| "grad_norm": 0.4354810416698456, |
| "learning_rate": 0.00028356993006993004, |
| "loss": 3.1604, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.397810006406896, |
| "grad_norm": 0.44377344846725464, |
| "learning_rate": 0.0002833951048951049, |
| "loss": 3.1545, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.412371134020617, |
| "grad_norm": 0.4414256811141968, |
| "learning_rate": 0.0002832202797202797, |
| "loss": 3.1666, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.42693226163434, |
| "grad_norm": 0.4358312785625458, |
| "learning_rate": 0.0002830454545454545, |
| "loss": 3.1652, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.441493389248063, |
| "grad_norm": 0.4352630376815796, |
| "learning_rate": 0.00028287062937062935, |
| "loss": 3.1519, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.456054516861787, |
| "grad_norm": 0.4301542341709137, |
| "learning_rate": 0.00028269580419580415, |
| "loss": 3.1656, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.470615644475508, |
| "grad_norm": 0.4386056959629059, |
| "learning_rate": 0.000282520979020979, |
| "loss": 3.1671, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.48517677208923, |
| "grad_norm": 0.415607750415802, |
| "learning_rate": 0.0002823461538461538, |
| "loss": 3.1609, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.499737899702954, |
| "grad_norm": 0.43342646956443787, |
| "learning_rate": 0.00028217132867132866, |
| "loss": 3.1673, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.499737899702954, |
| "eval_accuracy": 0.37397976445369707, |
| "eval_loss": 3.5473692417144775, |
| "eval_runtime": 82.1727, |
| "eval_samples_per_second": 202.512, |
| "eval_steps_per_second": 12.668, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.514299027316675, |
| "grad_norm": 0.40647515654563904, |
| "learning_rate": 0.00028199650349650346, |
| "loss": 3.1668, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.5288601549304, |
| "grad_norm": 0.42316776514053345, |
| "learning_rate": 0.0002818216783216783, |
| "loss": 3.1721, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.54342128254412, |
| "grad_norm": 0.4468265473842621, |
| "learning_rate": 0.0002816468531468531, |
| "loss": 3.1698, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.55798241015784, |
| "grad_norm": 0.4379112422466278, |
| "learning_rate": 0.00028147202797202796, |
| "loss": 3.1787, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.572543537771566, |
| "grad_norm": 0.44611984491348267, |
| "learning_rate": 0.00028129720279720276, |
| "loss": 3.1899, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.587104665385286, |
| "grad_norm": 0.45425495505332947, |
| "learning_rate": 0.0002811223776223776, |
| "loss": 3.1747, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.60166579299901, |
| "grad_norm": 0.44823259115219116, |
| "learning_rate": 0.0002809475524475524, |
| "loss": 3.1674, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.616226920612732, |
| "grad_norm": 0.4419941008090973, |
| "learning_rate": 0.00028077272727272727, |
| "loss": 3.1855, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.630788048226453, |
| "grad_norm": 0.4752655625343323, |
| "learning_rate": 0.00028059790209790207, |
| "loss": 3.1719, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.645349175840177, |
| "grad_norm": 0.4482170343399048, |
| "learning_rate": 0.00028042307692307687, |
| "loss": 3.1874, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.6599103034539, |
| "grad_norm": 0.44821420311927795, |
| "learning_rate": 0.0002802482517482517, |
| "loss": 3.1777, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.674471431067623, |
| "grad_norm": 0.4214096665382385, |
| "learning_rate": 0.0002800734265734265, |
| "loss": 3.1828, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.689032558681344, |
| "grad_norm": 0.44212210178375244, |
| "learning_rate": 0.0002798986013986014, |
| "loss": 3.1811, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.703593686295065, |
| "grad_norm": 0.44208231568336487, |
| "learning_rate": 0.0002797237762237762, |
| "loss": 3.1806, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.71815481390879, |
| "grad_norm": 0.4540371000766754, |
| "learning_rate": 0.00027954895104895103, |
| "loss": 3.1804, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.73271594152251, |
| "grad_norm": 0.4299270212650299, |
| "learning_rate": 0.0002793741258741259, |
| "loss": 3.1913, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.747277069136235, |
| "grad_norm": 0.4339700937271118, |
| "learning_rate": 0.0002791993006993007, |
| "loss": 3.1839, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.761838196749956, |
| "grad_norm": 0.4123922288417816, |
| "learning_rate": 0.0002790244755244755, |
| "loss": 3.1786, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.77639932436368, |
| "grad_norm": 0.4458857476711273, |
| "learning_rate": 0.00027884965034965034, |
| "loss": 3.1893, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.7909604519774, |
| "grad_norm": 0.42856302857398987, |
| "learning_rate": 0.00027867482517482514, |
| "loss": 3.1799, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.7909604519774, |
| "eval_accuracy": 0.374520950130128, |
| "eval_loss": 3.5412089824676514, |
| "eval_runtime": 82.0537, |
| "eval_samples_per_second": 202.806, |
| "eval_steps_per_second": 12.687, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.805521579591122, |
| "grad_norm": 0.4275612235069275, |
| "learning_rate": 0.0002785, |
| "loss": 3.1747, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.820082707204847, |
| "grad_norm": 0.42083340883255005, |
| "learning_rate": 0.0002783251748251748, |
| "loss": 3.1933, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.834643834818568, |
| "grad_norm": 0.4058454632759094, |
| "learning_rate": 0.00027815034965034965, |
| "loss": 3.1905, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.849204962432292, |
| "grad_norm": 0.42851758003234863, |
| "learning_rate": 0.00027797552447552445, |
| "loss": 3.1796, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.863766090046013, |
| "grad_norm": 0.46810829639434814, |
| "learning_rate": 0.00027780069930069925, |
| "loss": 3.1853, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.878327217659734, |
| "grad_norm": 0.4216388761997223, |
| "learning_rate": 0.0002776258741258741, |
| "loss": 3.1979, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.89288834527346, |
| "grad_norm": 0.39875245094299316, |
| "learning_rate": 0.0002774510489510489, |
| "loss": 3.1958, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.90744947288718, |
| "grad_norm": 0.46872153878211975, |
| "learning_rate": 0.00027727622377622375, |
| "loss": 3.1948, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.922010600500904, |
| "grad_norm": 0.4436827600002289, |
| "learning_rate": 0.00027710139860139855, |
| "loss": 3.1905, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.936571728114625, |
| "grad_norm": 0.4602677524089813, |
| "learning_rate": 0.0002769265734265734, |
| "loss": 3.1913, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.951132855728346, |
| "grad_norm": 0.4108976423740387, |
| "learning_rate": 0.00027675174825174826, |
| "loss": 3.1878, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.96569398334207, |
| "grad_norm": 0.48152244091033936, |
| "learning_rate": 0.00027657692307692306, |
| "loss": 3.1961, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.98025511095579, |
| "grad_norm": 0.42791837453842163, |
| "learning_rate": 0.00027640209790209786, |
| "loss": 3.2055, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.994816238569516, |
| "grad_norm": 0.43961113691329956, |
| "learning_rate": 0.0002762272727272727, |
| "loss": 3.1991, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.00931912167278, |
| "grad_norm": 0.45831483602523804, |
| "learning_rate": 0.0002760524475524475, |
| "loss": 3.1371, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.023880249286506, |
| "grad_norm": 0.42818915843963623, |
| "learning_rate": 0.00027587762237762237, |
| "loss": 3.0908, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.038441376900227, |
| "grad_norm": 0.4936304986476898, |
| "learning_rate": 0.00027570279720279717, |
| "loss": 3.1, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.053002504513948, |
| "grad_norm": 0.44421711564064026, |
| "learning_rate": 0.000275527972027972, |
| "loss": 3.1015, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.067563632127673, |
| "grad_norm": 0.4191001057624817, |
| "learning_rate": 0.0002753531468531468, |
| "loss": 3.1068, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.082124759741394, |
| "grad_norm": 0.4463858902454376, |
| "learning_rate": 0.0002751783216783216, |
| "loss": 3.1123, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.082124759741394, |
| "eval_accuracy": 0.37414957586542374, |
| "eval_loss": 3.553143262863159, |
| "eval_runtime": 81.9347, |
| "eval_samples_per_second": 203.101, |
| "eval_steps_per_second": 12.705, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.096685887355118, |
| "grad_norm": 0.45107048749923706, |
| "learning_rate": 0.0002750034965034965, |
| "loss": 3.1236, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.11124701496884, |
| "grad_norm": 0.4446358382701874, |
| "learning_rate": 0.0002748286713286713, |
| "loss": 3.1225, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.12580814258256, |
| "grad_norm": 0.44536080956459045, |
| "learning_rate": 0.00027465384615384613, |
| "loss": 3.1133, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.140369270196285, |
| "grad_norm": 0.4294614791870117, |
| "learning_rate": 0.000274479020979021, |
| "loss": 3.1182, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.154930397810006, |
| "grad_norm": 0.4464033842086792, |
| "learning_rate": 0.0002743041958041958, |
| "loss": 3.1167, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.16949152542373, |
| "grad_norm": 0.4504905343055725, |
| "learning_rate": 0.00027412937062937064, |
| "loss": 3.1306, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.18405265303745, |
| "grad_norm": 0.40164899826049805, |
| "learning_rate": 0.00027395454545454544, |
| "loss": 3.1322, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.198613780651172, |
| "grad_norm": 0.4864751994609833, |
| "learning_rate": 0.00027377972027972024, |
| "loss": 3.1283, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.213174908264897, |
| "grad_norm": 0.49909770488739014, |
| "learning_rate": 0.0002736048951048951, |
| "loss": 3.1323, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.227736035878618, |
| "grad_norm": 0.45618757605552673, |
| "learning_rate": 0.0002734300699300699, |
| "loss": 3.1309, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.242297163492342, |
| "grad_norm": 0.4547065198421478, |
| "learning_rate": 0.00027325524475524474, |
| "loss": 3.1244, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.256858291106063, |
| "grad_norm": 0.43155187368392944, |
| "learning_rate": 0.00027308041958041954, |
| "loss": 3.1401, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.271419418719784, |
| "grad_norm": 0.411630243062973, |
| "learning_rate": 0.0002729055944055944, |
| "loss": 3.1436, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.28598054633351, |
| "grad_norm": 0.4542841911315918, |
| "learning_rate": 0.0002727307692307692, |
| "loss": 3.1358, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.30054167394723, |
| "grad_norm": 0.4188205301761627, |
| "learning_rate": 0.000272555944055944, |
| "loss": 3.1465, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.315102801560954, |
| "grad_norm": 0.42731666564941406, |
| "learning_rate": 0.00027238111888111885, |
| "loss": 3.1473, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.329663929174675, |
| "grad_norm": 0.43808305263519287, |
| "learning_rate": 0.0002722062937062937, |
| "loss": 3.1355, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.344225056788396, |
| "grad_norm": 0.4501499533653259, |
| "learning_rate": 0.0002720314685314685, |
| "loss": 3.1418, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.35878618440212, |
| "grad_norm": 0.44502314925193787, |
| "learning_rate": 0.00027185664335664336, |
| "loss": 3.1504, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.37334731201584, |
| "grad_norm": 0.42435598373413086, |
| "learning_rate": 0.00027168181818181816, |
| "loss": 3.1516, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.37334731201584, |
| "eval_accuracy": 0.37442663658981995, |
| "eval_loss": 3.548163652420044, |
| "eval_runtime": 82.0172, |
| "eval_samples_per_second": 202.897, |
| "eval_steps_per_second": 12.692, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.387908439629566, |
| "grad_norm": 0.45668622851371765, |
| "learning_rate": 0.000271506993006993, |
| "loss": 3.1569, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.402469567243287, |
| "grad_norm": 0.41939079761505127, |
| "learning_rate": 0.0002713321678321678, |
| "loss": 3.1509, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.41703069485701, |
| "grad_norm": 0.44690999388694763, |
| "learning_rate": 0.0002711573426573426, |
| "loss": 3.155, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.431591822470732, |
| "grad_norm": 0.4177742600440979, |
| "learning_rate": 0.00027098251748251746, |
| "loss": 3.159, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.446152950084453, |
| "grad_norm": 0.4320014417171478, |
| "learning_rate": 0.00027080769230769226, |
| "loss": 3.1492, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.460714077698178, |
| "grad_norm": 0.4479568600654602, |
| "learning_rate": 0.0002706328671328671, |
| "loss": 3.1699, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.4752752053119, |
| "grad_norm": 0.4187909662723541, |
| "learning_rate": 0.0002704580419580419, |
| "loss": 3.1598, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.489836332925623, |
| "grad_norm": 0.5112937092781067, |
| "learning_rate": 0.00027028321678321677, |
| "loss": 3.1581, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.504397460539344, |
| "grad_norm": 0.432607501745224, |
| "learning_rate": 0.00027010839160839157, |
| "loss": 3.164, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.518958588153065, |
| "grad_norm": 0.42288729548454285, |
| "learning_rate": 0.00026993356643356637, |
| "loss": 3.1597, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.53351971576679, |
| "grad_norm": 0.43434974551200867, |
| "learning_rate": 0.0002697587412587412, |
| "loss": 3.1577, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.54808084338051, |
| "grad_norm": 0.41360169649124146, |
| "learning_rate": 0.0002695839160839161, |
| "loss": 3.1703, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.562641970994235, |
| "grad_norm": 0.43720167875289917, |
| "learning_rate": 0.0002694090909090909, |
| "loss": 3.1587, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.577203098607956, |
| "grad_norm": 0.43156540393829346, |
| "learning_rate": 0.00026923426573426573, |
| "loss": 3.1582, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.591764226221677, |
| "grad_norm": 0.4230353832244873, |
| "learning_rate": 0.00026905944055944053, |
| "loss": 3.1608, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.606325353835402, |
| "grad_norm": 0.44392943382263184, |
| "learning_rate": 0.0002688846153846154, |
| "loss": 3.1641, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.620886481449123, |
| "grad_norm": 0.4539208710193634, |
| "learning_rate": 0.0002687097902097902, |
| "loss": 3.1525, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.635447609062847, |
| "grad_norm": 0.4404858648777008, |
| "learning_rate": 0.000268534965034965, |
| "loss": 3.156, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.65000873667657, |
| "grad_norm": 0.4305916130542755, |
| "learning_rate": 0.00026836013986013984, |
| "loss": 3.1683, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.66456986429029, |
| "grad_norm": 0.4608471691608429, |
| "learning_rate": 0.00026818531468531464, |
| "loss": 3.1699, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.66456986429029, |
| "eval_accuracy": 0.3743760694796797, |
| "eval_loss": 3.5420138835906982, |
| "eval_runtime": 82.003, |
| "eval_samples_per_second": 202.932, |
| "eval_steps_per_second": 12.695, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.679130991904014, |
| "grad_norm": 0.4489138126373291, |
| "learning_rate": 0.0002680104895104895, |
| "loss": 3.1737, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.693692119517735, |
| "grad_norm": 0.4277009963989258, |
| "learning_rate": 0.0002678356643356643, |
| "loss": 3.1812, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.70825324713146, |
| "grad_norm": 0.46089035272598267, |
| "learning_rate": 0.00026766083916083915, |
| "loss": 3.1616, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.72281437474518, |
| "grad_norm": 0.4575275182723999, |
| "learning_rate": 0.00026748601398601395, |
| "loss": 3.1901, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.7373755023589, |
| "grad_norm": 0.4245191812515259, |
| "learning_rate": 0.0002673111888111888, |
| "loss": 3.1738, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.751936629972626, |
| "grad_norm": 0.4395964741706848, |
| "learning_rate": 0.0002671363636363636, |
| "loss": 3.1771, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.766497757586347, |
| "grad_norm": 0.4557894468307495, |
| "learning_rate": 0.00026696153846153845, |
| "loss": 3.1711, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.78105888520007, |
| "grad_norm": 0.467388778924942, |
| "learning_rate": 0.00026678671328671325, |
| "loss": 3.1755, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.795620012813792, |
| "grad_norm": 0.42502060532569885, |
| "learning_rate": 0.0002666118881118881, |
| "loss": 3.1802, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.810181140427513, |
| "grad_norm": 0.4643760621547699, |
| "learning_rate": 0.0002664370629370629, |
| "loss": 3.1759, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.824742268041238, |
| "grad_norm": 0.4207506775856018, |
| "learning_rate": 0.00026626223776223776, |
| "loss": 3.1851, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.83930339565496, |
| "grad_norm": 0.42335277795791626, |
| "learning_rate": 0.00026608741258741256, |
| "loss": 3.184, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.853864523268683, |
| "grad_norm": 0.4319060146808624, |
| "learning_rate": 0.00026591258741258736, |
| "loss": 3.1799, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.868425650882404, |
| "grad_norm": 0.4218584895133972, |
| "learning_rate": 0.0002657377622377622, |
| "loss": 3.1751, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.882986778496125, |
| "grad_norm": 0.4247802197933197, |
| "learning_rate": 0.000265562937062937, |
| "loss": 3.1794, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.89754790610985, |
| "grad_norm": 0.4139273762702942, |
| "learning_rate": 0.00026538811188811187, |
| "loss": 3.1784, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.91210903372357, |
| "grad_norm": 0.4383907914161682, |
| "learning_rate": 0.00026521328671328667, |
| "loss": 3.1811, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.926670161337295, |
| "grad_norm": 0.44586578011512756, |
| "learning_rate": 0.0002650384615384615, |
| "loss": 3.1943, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.941231288951016, |
| "grad_norm": 0.43650010228157043, |
| "learning_rate": 0.0002648636363636364, |
| "loss": 3.1754, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.955792416564737, |
| "grad_norm": 0.4336267411708832, |
| "learning_rate": 0.0002646888111888112, |
| "loss": 3.193, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.955792416564737, |
| "eval_accuracy": 0.37506695732171186, |
| "eval_loss": 3.533367872238159, |
| "eval_runtime": 82.0868, |
| "eval_samples_per_second": 202.724, |
| "eval_steps_per_second": 12.682, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.97035354417846, |
| "grad_norm": 0.43773847818374634, |
| "learning_rate": 0.000264513986013986, |
| "loss": 3.1739, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.984914671792183, |
| "grad_norm": 0.4969949424266815, |
| "learning_rate": 0.00026433916083916083, |
| "loss": 3.1868, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.999475799405907, |
| "grad_norm": 0.4523860812187195, |
| "learning_rate": 0.00026416433566433563, |
| "loss": 3.1843, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.013978682509173, |
| "grad_norm": 0.4267706871032715, |
| "learning_rate": 0.0002639895104895105, |
| "loss": 3.1004, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.028539810122897, |
| "grad_norm": 0.43044593930244446, |
| "learning_rate": 0.0002638146853146853, |
| "loss": 3.0891, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.043100937736618, |
| "grad_norm": 0.42589476704597473, |
| "learning_rate": 0.00026363986013986014, |
| "loss": 3.0925, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.05766206535034, |
| "grad_norm": 0.4761544167995453, |
| "learning_rate": 0.00026346503496503494, |
| "loss": 3.0975, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.072223192964064, |
| "grad_norm": 0.4301607012748718, |
| "learning_rate": 0.00026329020979020974, |
| "loss": 3.1009, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.086784320577785, |
| "grad_norm": 0.44543901085853577, |
| "learning_rate": 0.0002631153846153846, |
| "loss": 3.1022, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.10134544819151, |
| "grad_norm": 0.43196022510528564, |
| "learning_rate": 0.0002629405594405594, |
| "loss": 3.0985, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.11590657580523, |
| "grad_norm": 0.4325665533542633, |
| "learning_rate": 0.00026276573426573424, |
| "loss": 3.0997, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.13046770341895, |
| "grad_norm": 0.42045143246650696, |
| "learning_rate": 0.00026259090909090904, |
| "loss": 3.1123, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.145028831032675, |
| "grad_norm": 0.45770788192749023, |
| "learning_rate": 0.0002624160839160839, |
| "loss": 3.1092, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.159589958646396, |
| "grad_norm": 0.4542134702205658, |
| "learning_rate": 0.00026224125874125875, |
| "loss": 3.0943, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.17415108626012, |
| "grad_norm": 0.43821507692337036, |
| "learning_rate": 0.00026206643356643355, |
| "loss": 3.1152, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.188712213873842, |
| "grad_norm": 0.44620075821876526, |
| "learning_rate": 0.00026189160839160835, |
| "loss": 3.1204, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.203273341487566, |
| "grad_norm": 0.43391305208206177, |
| "learning_rate": 0.0002617167832167832, |
| "loss": 3.1192, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.217834469101287, |
| "grad_norm": 0.4529394805431366, |
| "learning_rate": 0.000261541958041958, |
| "loss": 3.1237, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.23239559671501, |
| "grad_norm": 0.4503569006919861, |
| "learning_rate": 0.00026136713286713286, |
| "loss": 3.1131, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.246956724328733, |
| "grad_norm": 0.4577544629573822, |
| "learning_rate": 0.00026119230769230766, |
| "loss": 3.1241, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.246956724328733, |
| "eval_accuracy": 0.3742659978166768, |
| "eval_loss": 3.5512301921844482, |
| "eval_runtime": 81.8589, |
| "eval_samples_per_second": 203.289, |
| "eval_steps_per_second": 12.717, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.261517851942454, |
| "grad_norm": 0.4486400783061981, |
| "learning_rate": 0.0002610174825174825, |
| "loss": 3.1329, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.27607897955618, |
| "grad_norm": 0.4469950795173645, |
| "learning_rate": 0.0002608426573426573, |
| "loss": 3.1178, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.2906401071699, |
| "grad_norm": 0.4762290120124817, |
| "learning_rate": 0.0002606678321678321, |
| "loss": 3.1369, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.30520123478362, |
| "grad_norm": 0.47702765464782715, |
| "learning_rate": 0.00026049300699300696, |
| "loss": 3.1342, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.319762362397345, |
| "grad_norm": 0.43102771043777466, |
| "learning_rate": 0.00026031818181818176, |
| "loss": 3.1289, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.334323490011066, |
| "grad_norm": 0.4476456642150879, |
| "learning_rate": 0.0002601433566433566, |
| "loss": 3.1426, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.34888461762479, |
| "grad_norm": 0.47269853949546814, |
| "learning_rate": 0.00025996853146853147, |
| "loss": 3.1424, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.36344574523851, |
| "grad_norm": 0.4365980327129364, |
| "learning_rate": 0.00025979370629370627, |
| "loss": 3.1443, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.378006872852232, |
| "grad_norm": 0.43918779492378235, |
| "learning_rate": 0.0002596188811188811, |
| "loss": 3.1387, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.392568000465957, |
| "grad_norm": 0.4355844557285309, |
| "learning_rate": 0.0002594440559440559, |
| "loss": 3.1365, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.407129128079678, |
| "grad_norm": 0.4529942572116852, |
| "learning_rate": 0.0002592692307692307, |
| "loss": 3.1489, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.421690255693402, |
| "grad_norm": 0.4477832615375519, |
| "learning_rate": 0.0002590944055944056, |
| "loss": 3.1397, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.436251383307123, |
| "grad_norm": 0.43925923109054565, |
| "learning_rate": 0.0002589195804195804, |
| "loss": 3.1434, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.450812510920844, |
| "grad_norm": 0.4570046365261078, |
| "learning_rate": 0.00025874475524475523, |
| "loss": 3.149, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.46537363853457, |
| "grad_norm": 0.4124143719673157, |
| "learning_rate": 0.00025856993006993003, |
| "loss": 3.1443, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.47993476614829, |
| "grad_norm": 0.4491616189479828, |
| "learning_rate": 0.0002583951048951049, |
| "loss": 3.1415, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.494495893762014, |
| "grad_norm": 0.4381904900074005, |
| "learning_rate": 0.0002582202797202797, |
| "loss": 3.1525, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.509057021375735, |
| "grad_norm": 0.449377179145813, |
| "learning_rate": 0.0002580454545454545, |
| "loss": 3.1583, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.523618148989456, |
| "grad_norm": 0.4469902515411377, |
| "learning_rate": 0.00025787062937062934, |
| "loss": 3.1613, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.53817927660318, |
| "grad_norm": 0.42082348465919495, |
| "learning_rate": 0.0002576958041958042, |
| "loss": 3.1584, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.53817927660318, |
| "eval_accuracy": 0.37466700675988185, |
| "eval_loss": 3.5444014072418213, |
| "eval_runtime": 82.1105, |
| "eval_samples_per_second": 202.666, |
| "eval_steps_per_second": 12.678, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.5527404042169, |
| "grad_norm": 0.4826982617378235, |
| "learning_rate": 0.000257520979020979, |
| "loss": 3.1546, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.567301531830626, |
| "grad_norm": 0.4464641213417053, |
| "learning_rate": 0.00025734615384615385, |
| "loss": 3.1479, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.581862659444347, |
| "grad_norm": 0.4337342083454132, |
| "learning_rate": 0.00025717132867132865, |
| "loss": 3.1538, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.596423787058068, |
| "grad_norm": 0.4158535897731781, |
| "learning_rate": 0.0002569965034965035, |
| "loss": 3.1502, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.610984914671793, |
| "grad_norm": 0.43536701798439026, |
| "learning_rate": 0.0002568216783216783, |
| "loss": 3.1553, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.625546042285514, |
| "grad_norm": 0.43609172105789185, |
| "learning_rate": 0.0002566468531468531, |
| "loss": 3.1541, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.640107169899238, |
| "grad_norm": 0.48345381021499634, |
| "learning_rate": 0.00025647202797202795, |
| "loss": 3.1607, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.65466829751296, |
| "grad_norm": 0.4729558825492859, |
| "learning_rate": 0.00025629720279720275, |
| "loss": 3.1533, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.66922942512668, |
| "grad_norm": 0.4618447422981262, |
| "learning_rate": 0.0002561223776223776, |
| "loss": 3.1574, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.683790552740405, |
| "grad_norm": 0.4279381334781647, |
| "learning_rate": 0.0002559475524475524, |
| "loss": 3.1648, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.698351680354126, |
| "grad_norm": 0.5665946006774902, |
| "learning_rate": 0.00025577272727272726, |
| "loss": 3.1659, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.71291280796785, |
| "grad_norm": 0.45663198828697205, |
| "learning_rate": 0.00025559790209790206, |
| "loss": 3.1713, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.72747393558157, |
| "grad_norm": 0.44645336270332336, |
| "learning_rate": 0.00025542307692307686, |
| "loss": 3.1619, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.742035063195296, |
| "grad_norm": 0.44865837693214417, |
| "learning_rate": 0.00025524825174825177, |
| "loss": 3.1683, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.756596190809017, |
| "grad_norm": 0.4584784209728241, |
| "learning_rate": 0.00025507342657342657, |
| "loss": 3.172, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.771157318422738, |
| "grad_norm": 0.439564973115921, |
| "learning_rate": 0.00025489860139860137, |
| "loss": 3.172, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.785718446036462, |
| "grad_norm": 0.43353208899497986, |
| "learning_rate": 0.0002547237762237762, |
| "loss": 3.1709, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.800279573650183, |
| "grad_norm": 0.4293374717235565, |
| "learning_rate": 0.000254548951048951, |
| "loss": 3.1782, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.814840701263904, |
| "grad_norm": 0.41944700479507446, |
| "learning_rate": 0.0002543741258741259, |
| "loss": 3.1707, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.82940182887763, |
| "grad_norm": 0.45922040939331055, |
| "learning_rate": 0.0002541993006993007, |
| "loss": 3.1838, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.82940182887763, |
| "eval_accuracy": 0.37545832323461104, |
| "eval_loss": 3.535820245742798, |
| "eval_runtime": 82.0224, |
| "eval_samples_per_second": 202.883, |
| "eval_steps_per_second": 12.692, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.84396295649135, |
| "grad_norm": 0.4806719422340393, |
| "learning_rate": 0.0002540244755244755, |
| "loss": 3.1617, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.858524084105074, |
| "grad_norm": 0.4589778780937195, |
| "learning_rate": 0.00025384965034965033, |
| "loss": 3.1754, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.873085211718795, |
| "grad_norm": 0.4465028941631317, |
| "learning_rate": 0.00025367482517482513, |
| "loss": 3.1745, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.88764633933252, |
| "grad_norm": 0.4480758607387543, |
| "learning_rate": 0.0002535, |
| "loss": 3.178, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.90220746694624, |
| "grad_norm": 0.4510329067707062, |
| "learning_rate": 0.0002533251748251748, |
| "loss": 3.1698, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.91676859455996, |
| "grad_norm": 0.4257822036743164, |
| "learning_rate": 0.00025315034965034964, |
| "loss": 3.1813, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.931329722173686, |
| "grad_norm": 0.45149117708206177, |
| "learning_rate": 0.00025297552447552444, |
| "loss": 3.18, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.945890849787407, |
| "grad_norm": 0.4418821334838867, |
| "learning_rate": 0.0002528006993006993, |
| "loss": 3.1789, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.96045197740113, |
| "grad_norm": 0.46663710474967957, |
| "learning_rate": 0.00025262587412587414, |
| "loss": 3.1857, |
| "step": 99450 |
| }, |
| { |
| "epoch": 28.975013105014852, |
| "grad_norm": 0.4695162773132324, |
| "learning_rate": 0.00025245104895104894, |
| "loss": 3.1828, |
| "step": 99500 |
| }, |
| { |
| "epoch": 28.989574232628573, |
| "grad_norm": 0.44682714343070984, |
| "learning_rate": 0.00025227622377622374, |
| "loss": 3.1801, |
| "step": 99550 |
| }, |
| { |
| "epoch": 29.004077115731842, |
| "grad_norm": 0.439547061920166, |
| "learning_rate": 0.0002521013986013986, |
| "loss": 3.1512, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.018638243345563, |
| "grad_norm": 0.45149293541908264, |
| "learning_rate": 0.0002519265734265734, |
| "loss": 3.0796, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.033199370959288, |
| "grad_norm": 0.4620671272277832, |
| "learning_rate": 0.00025175174825174825, |
| "loss": 3.0841, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.04776049857301, |
| "grad_norm": 0.4556296169757843, |
| "learning_rate": 0.00025157692307692305, |
| "loss": 3.0937, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.062321626186733, |
| "grad_norm": 0.4277898371219635, |
| "learning_rate": 0.0002514020979020979, |
| "loss": 3.1107, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.076882753800454, |
| "grad_norm": 0.44129517674446106, |
| "learning_rate": 0.0002512272727272727, |
| "loss": 3.0905, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.091443881414175, |
| "grad_norm": 0.44161632657051086, |
| "learning_rate": 0.0002510524475524475, |
| "loss": 3.1015, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.1060050090279, |
| "grad_norm": 0.442378431558609, |
| "learning_rate": 0.00025087762237762236, |
| "loss": 3.1034, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.12056613664162, |
| "grad_norm": 0.4560869336128235, |
| "learning_rate": 0.00025070279720279716, |
| "loss": 3.1031, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.12056613664162, |
| "eval_accuracy": 0.37452106772805854, |
| "eval_loss": 3.5504512786865234, |
| "eval_runtime": 82.0851, |
| "eval_samples_per_second": 202.729, |
| "eval_steps_per_second": 12.682, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.135127264255345, |
| "grad_norm": 0.4479752779006958, |
| "learning_rate": 0.000250527972027972, |
| "loss": 3.1142, |
| "step": 100050 |
| }, |
| { |
| "epoch": 29.149688391869066, |
| "grad_norm": 0.4781578481197357, |
| "learning_rate": 0.00025035314685314686, |
| "loss": 3.105, |
| "step": 100100 |
| }, |
| { |
| "epoch": 29.164249519482787, |
| "grad_norm": 0.4513346254825592, |
| "learning_rate": 0.00025017832167832166, |
| "loss": 3.107, |
| "step": 100150 |
| }, |
| { |
| "epoch": 29.178810647096512, |
| "grad_norm": 0.44745007157325745, |
| "learning_rate": 0.0002500034965034965, |
| "loss": 3.1073, |
| "step": 100200 |
| }, |
| { |
| "epoch": 29.193371774710233, |
| "grad_norm": 0.4455200731754303, |
| "learning_rate": 0.0002498286713286713, |
| "loss": 3.1115, |
| "step": 100250 |
| }, |
| { |
| "epoch": 29.207932902323957, |
| "grad_norm": 0.4602887034416199, |
| "learning_rate": 0.0002496538461538461, |
| "loss": 3.1081, |
| "step": 100300 |
| }, |
| { |
| "epoch": 29.22249402993768, |
| "grad_norm": 0.46344348788261414, |
| "learning_rate": 0.00024947902097902097, |
| "loss": 3.1172, |
| "step": 100350 |
| }, |
| { |
| "epoch": 29.2370551575514, |
| "grad_norm": 0.44583073258399963, |
| "learning_rate": 0.00024930419580419577, |
| "loss": 3.1216, |
| "step": 100400 |
| }, |
| { |
| "epoch": 29.251616285165124, |
| "grad_norm": 0.4341423213481903, |
| "learning_rate": 0.0002491293706293706, |
| "loss": 3.1092, |
| "step": 100450 |
| }, |
| { |
| "epoch": 29.266177412778845, |
| "grad_norm": 0.4741925001144409, |
| "learning_rate": 0.0002489545454545454, |
| "loss": 3.1293, |
| "step": 100500 |
| }, |
| { |
| "epoch": 29.28073854039257, |
| "grad_norm": 0.47563058137893677, |
| "learning_rate": 0.0002487797202797203, |
| "loss": 3.1252, |
| "step": 100550 |
| }, |
| { |
| "epoch": 29.29529966800629, |
| "grad_norm": 0.4274418354034424, |
| "learning_rate": 0.0002486048951048951, |
| "loss": 3.1159, |
| "step": 100600 |
| }, |
| { |
| "epoch": 29.30986079562001, |
| "grad_norm": 0.43985074758529663, |
| "learning_rate": 0.0002484300699300699, |
| "loss": 3.124, |
| "step": 100650 |
| }, |
| { |
| "epoch": 29.324421923233736, |
| "grad_norm": 0.4203740954399109, |
| "learning_rate": 0.00024825524475524473, |
| "loss": 3.1353, |
| "step": 100700 |
| }, |
| { |
| "epoch": 29.338983050847457, |
| "grad_norm": 0.434476375579834, |
| "learning_rate": 0.00024808041958041953, |
| "loss": 3.1267, |
| "step": 100750 |
| }, |
| { |
| "epoch": 29.35354417846118, |
| "grad_norm": 0.4747839570045471, |
| "learning_rate": 0.0002479055944055944, |
| "loss": 3.1268, |
| "step": 100800 |
| }, |
| { |
| "epoch": 29.368105306074902, |
| "grad_norm": 0.4452389180660248, |
| "learning_rate": 0.00024773076923076924, |
| "loss": 3.139, |
| "step": 100850 |
| }, |
| { |
| "epoch": 29.382666433688623, |
| "grad_norm": 0.443792462348938, |
| "learning_rate": 0.00024755594405594404, |
| "loss": 3.1146, |
| "step": 100900 |
| }, |
| { |
| "epoch": 29.397227561302348, |
| "grad_norm": 0.4169985353946686, |
| "learning_rate": 0.0002473811188811189, |
| "loss": 3.1308, |
| "step": 100950 |
| }, |
| { |
| "epoch": 29.41178868891607, |
| "grad_norm": 0.44406652450561523, |
| "learning_rate": 0.0002472062937062937, |
| "loss": 3.1339, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.41178868891607, |
| "eval_accuracy": 0.3747840167007877, |
| "eval_loss": 3.5469107627868652, |
| "eval_runtime": 82.1998, |
| "eval_samples_per_second": 202.446, |
| "eval_steps_per_second": 12.664, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.426349816529793, |
| "grad_norm": 0.4526227116584778, |
| "learning_rate": 0.0002470314685314685, |
| "loss": 3.1385, |
| "step": 101050 |
| }, |
| { |
| "epoch": 29.440910944143514, |
| "grad_norm": 0.45975977182388306, |
| "learning_rate": 0.00024685664335664335, |
| "loss": 3.1394, |
| "step": 101100 |
| }, |
| { |
| "epoch": 29.455472071757235, |
| "grad_norm": 0.46566811203956604, |
| "learning_rate": 0.00024668181818181815, |
| "loss": 3.1425, |
| "step": 101150 |
| }, |
| { |
| "epoch": 29.47003319937096, |
| "grad_norm": 0.43443745374679565, |
| "learning_rate": 0.000246506993006993, |
| "loss": 3.1317, |
| "step": 101200 |
| }, |
| { |
| "epoch": 29.48459432698468, |
| "grad_norm": 0.45862895250320435, |
| "learning_rate": 0.0002463321678321678, |
| "loss": 3.1459, |
| "step": 101250 |
| }, |
| { |
| "epoch": 29.499155454598405, |
| "grad_norm": 0.46274325251579285, |
| "learning_rate": 0.00024615734265734265, |
| "loss": 3.1362, |
| "step": 101300 |
| }, |
| { |
| "epoch": 29.513716582212126, |
| "grad_norm": 0.46081170439720154, |
| "learning_rate": 0.00024598251748251745, |
| "loss": 3.1528, |
| "step": 101350 |
| }, |
| { |
| "epoch": 29.52827770982585, |
| "grad_norm": 0.47659215331077576, |
| "learning_rate": 0.00024580769230769225, |
| "loss": 3.1347, |
| "step": 101400 |
| }, |
| { |
| "epoch": 29.54283883743957, |
| "grad_norm": 0.4489154815673828, |
| "learning_rate": 0.0002456328671328671, |
| "loss": 3.1419, |
| "step": 101450 |
| }, |
| { |
| "epoch": 29.557399965053293, |
| "grad_norm": 0.4594299793243408, |
| "learning_rate": 0.00024545804195804196, |
| "loss": 3.1437, |
| "step": 101500 |
| }, |
| { |
| "epoch": 29.571961092667017, |
| "grad_norm": 0.44176366925239563, |
| "learning_rate": 0.00024528321678321676, |
| "loss": 3.1444, |
| "step": 101550 |
| }, |
| { |
| "epoch": 29.586522220280738, |
| "grad_norm": 0.460981547832489, |
| "learning_rate": 0.0002451083916083916, |
| "loss": 3.1445, |
| "step": 101600 |
| }, |
| { |
| "epoch": 29.601083347894463, |
| "grad_norm": 0.4526672661304474, |
| "learning_rate": 0.0002449335664335664, |
| "loss": 3.1518, |
| "step": 101650 |
| }, |
| { |
| "epoch": 29.615644475508184, |
| "grad_norm": 0.43304094672203064, |
| "learning_rate": 0.00024475874125874127, |
| "loss": 3.1573, |
| "step": 101700 |
| }, |
| { |
| "epoch": 29.630205603121905, |
| "grad_norm": 0.4359411895275116, |
| "learning_rate": 0.00024458391608391607, |
| "loss": 3.1515, |
| "step": 101750 |
| }, |
| { |
| "epoch": 29.64476673073563, |
| "grad_norm": 0.45941245555877686, |
| "learning_rate": 0.00024440909090909087, |
| "loss": 3.1404, |
| "step": 101800 |
| }, |
| { |
| "epoch": 29.65932785834935, |
| "grad_norm": 0.4487653076648712, |
| "learning_rate": 0.0002442342657342657, |
| "loss": 3.1554, |
| "step": 101850 |
| }, |
| { |
| "epoch": 29.673888985963075, |
| "grad_norm": 0.46325019001960754, |
| "learning_rate": 0.00024405944055944052, |
| "loss": 3.1522, |
| "step": 101900 |
| }, |
| { |
| "epoch": 29.688450113576796, |
| "grad_norm": 0.45048460364341736, |
| "learning_rate": 0.00024388461538461535, |
| "loss": 3.1597, |
| "step": 101950 |
| }, |
| { |
| "epoch": 29.703011241190517, |
| "grad_norm": 0.5076062083244324, |
| "learning_rate": 0.00024370979020979017, |
| "loss": 3.1536, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.703011241190517, |
| "eval_accuracy": 0.3749000858582491, |
| "eval_loss": 3.541860580444336, |
| "eval_runtime": 82.1497, |
| "eval_samples_per_second": 202.569, |
| "eval_steps_per_second": 12.672, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.71757236880424, |
| "grad_norm": 0.4789530038833618, |
| "learning_rate": 0.000243534965034965, |
| "loss": 3.1505, |
| "step": 102050 |
| }, |
| { |
| "epoch": 29.732133496417962, |
| "grad_norm": 0.46807464957237244, |
| "learning_rate": 0.00024336013986013983, |
| "loss": 3.1538, |
| "step": 102100 |
| }, |
| { |
| "epoch": 29.746694624031687, |
| "grad_norm": 0.4563269019126892, |
| "learning_rate": 0.00024318531468531468, |
| "loss": 3.151, |
| "step": 102150 |
| }, |
| { |
| "epoch": 29.761255751645407, |
| "grad_norm": 0.44792503118515015, |
| "learning_rate": 0.0002430104895104895, |
| "loss": 3.1667, |
| "step": 102200 |
| }, |
| { |
| "epoch": 29.77581687925913, |
| "grad_norm": 0.4363691210746765, |
| "learning_rate": 0.00024283566433566434, |
| "loss": 3.1473, |
| "step": 102250 |
| }, |
| { |
| "epoch": 29.790378006872853, |
| "grad_norm": 0.4299928545951843, |
| "learning_rate": 0.00024266083916083916, |
| "loss": 3.1665, |
| "step": 102300 |
| }, |
| { |
| "epoch": 29.804939134486574, |
| "grad_norm": 0.4807325303554535, |
| "learning_rate": 0.00024248601398601396, |
| "loss": 3.1599, |
| "step": 102350 |
| }, |
| { |
| "epoch": 29.8195002621003, |
| "grad_norm": 0.4705067276954651, |
| "learning_rate": 0.0002423111888111888, |
| "loss": 3.1679, |
| "step": 102400 |
| }, |
| { |
| "epoch": 29.83406138971402, |
| "grad_norm": 0.4763050377368927, |
| "learning_rate": 0.00024213636363636362, |
| "loss": 3.1655, |
| "step": 102450 |
| }, |
| { |
| "epoch": 29.84862251732774, |
| "grad_norm": 0.46041691303253174, |
| "learning_rate": 0.00024196153846153844, |
| "loss": 3.1579, |
| "step": 102500 |
| }, |
| { |
| "epoch": 29.863183644941465, |
| "grad_norm": 0.488145112991333, |
| "learning_rate": 0.00024178671328671327, |
| "loss": 3.1611, |
| "step": 102550 |
| }, |
| { |
| "epoch": 29.877744772555186, |
| "grad_norm": 0.45449942350387573, |
| "learning_rate": 0.0002416118881118881, |
| "loss": 3.1598, |
| "step": 102600 |
| }, |
| { |
| "epoch": 29.89230590016891, |
| "grad_norm": 0.4500633478164673, |
| "learning_rate": 0.0002414370629370629, |
| "loss": 3.1658, |
| "step": 102650 |
| }, |
| { |
| "epoch": 29.90686702778263, |
| "grad_norm": 0.44506168365478516, |
| "learning_rate": 0.00024126223776223772, |
| "loss": 3.1627, |
| "step": 102700 |
| }, |
| { |
| "epoch": 29.921428155396352, |
| "grad_norm": 0.4824460744857788, |
| "learning_rate": 0.00024108741258741255, |
| "loss": 3.1651, |
| "step": 102750 |
| }, |
| { |
| "epoch": 29.935989283010077, |
| "grad_norm": 0.44130200147628784, |
| "learning_rate": 0.00024091258741258738, |
| "loss": 3.1635, |
| "step": 102800 |
| }, |
| { |
| "epoch": 29.950550410623798, |
| "grad_norm": 0.4428160786628723, |
| "learning_rate": 0.00024073776223776223, |
| "loss": 3.1705, |
| "step": 102850 |
| }, |
| { |
| "epoch": 29.965111538237522, |
| "grad_norm": 0.45782288908958435, |
| "learning_rate": 0.00024056293706293706, |
| "loss": 3.1774, |
| "step": 102900 |
| }, |
| { |
| "epoch": 29.979672665851243, |
| "grad_norm": 0.44905662536621094, |
| "learning_rate": 0.00024038811188811188, |
| "loss": 3.1688, |
| "step": 102950 |
| }, |
| { |
| "epoch": 29.994233793464964, |
| "grad_norm": 0.4480430781841278, |
| "learning_rate": 0.0002402132867132867, |
| "loss": 3.1805, |
| "step": 103000 |
| }, |
| { |
| "epoch": 29.994233793464964, |
| "eval_accuracy": 0.3754883107069035, |
| "eval_loss": 3.5314321517944336, |
| "eval_runtime": 82.8692, |
| "eval_samples_per_second": 200.81, |
| "eval_steps_per_second": 12.562, |
| "step": 103000 |
| }, |
| { |
| "epoch": 30.008736676568233, |
| "grad_norm": 0.44055482745170593, |
| "learning_rate": 0.00024003846153846154, |
| "loss": 3.1037, |
| "step": 103050 |
| }, |
| { |
| "epoch": 30.023297804181954, |
| "grad_norm": 0.4426362216472626, |
| "learning_rate": 0.00023986363636363634, |
| "loss": 3.0711, |
| "step": 103100 |
| }, |
| { |
| "epoch": 30.03785893179568, |
| "grad_norm": 0.46040791273117065, |
| "learning_rate": 0.00023968881118881116, |
| "loss": 3.0674, |
| "step": 103150 |
| }, |
| { |
| "epoch": 30.0524200594094, |
| "grad_norm": 0.47156092524528503, |
| "learning_rate": 0.000239513986013986, |
| "loss": 3.0889, |
| "step": 103200 |
| }, |
| { |
| "epoch": 30.066981187023124, |
| "grad_norm": 0.42856401205062866, |
| "learning_rate": 0.00023933916083916082, |
| "loss": 3.0831, |
| "step": 103250 |
| }, |
| { |
| "epoch": 30.081542314636845, |
| "grad_norm": 0.4443930387496948, |
| "learning_rate": 0.00023916433566433564, |
| "loss": 3.085, |
| "step": 103300 |
| }, |
| { |
| "epoch": 30.096103442250566, |
| "grad_norm": 0.44311341643333435, |
| "learning_rate": 0.00023898951048951047, |
| "loss": 3.0807, |
| "step": 103350 |
| }, |
| { |
| "epoch": 30.11066456986429, |
| "grad_norm": 0.4659915566444397, |
| "learning_rate": 0.00023881468531468527, |
| "loss": 3.0909, |
| "step": 103400 |
| }, |
| { |
| "epoch": 30.12522569747801, |
| "grad_norm": 0.46115854382514954, |
| "learning_rate": 0.0002386398601398601, |
| "loss": 3.096, |
| "step": 103450 |
| }, |
| { |
| "epoch": 30.139786825091736, |
| "grad_norm": 0.4553181231021881, |
| "learning_rate": 0.00023846503496503492, |
| "loss": 3.1071, |
| "step": 103500 |
| }, |
| { |
| "epoch": 30.154347952705457, |
| "grad_norm": 0.4349801242351532, |
| "learning_rate": 0.00023829020979020978, |
| "loss": 3.1125, |
| "step": 103550 |
| }, |
| { |
| "epoch": 30.168909080319178, |
| "grad_norm": 0.43821319937705994, |
| "learning_rate": 0.0002381153846153846, |
| "loss": 3.1104, |
| "step": 103600 |
| }, |
| { |
| "epoch": 30.183470207932903, |
| "grad_norm": 0.4530899226665497, |
| "learning_rate": 0.00023794055944055943, |
| "loss": 3.0901, |
| "step": 103650 |
| }, |
| { |
| "epoch": 30.198031335546624, |
| "grad_norm": 0.43999993801116943, |
| "learning_rate": 0.00023776573426573426, |
| "loss": 3.1013, |
| "step": 103700 |
| }, |
| { |
| "epoch": 30.212592463160348, |
| "grad_norm": 0.46763479709625244, |
| "learning_rate": 0.00023759090909090909, |
| "loss": 3.1088, |
| "step": 103750 |
| }, |
| { |
| "epoch": 30.22715359077407, |
| "grad_norm": 0.41881898045539856, |
| "learning_rate": 0.0002374160839160839, |
| "loss": 3.1041, |
| "step": 103800 |
| }, |
| { |
| "epoch": 30.241714718387794, |
| "grad_norm": 0.4678491950035095, |
| "learning_rate": 0.0002372412587412587, |
| "loss": 3.1105, |
| "step": 103850 |
| }, |
| { |
| "epoch": 30.256275846001515, |
| "grad_norm": 0.4604734182357788, |
| "learning_rate": 0.00023706643356643354, |
| "loss": 3.1057, |
| "step": 103900 |
| }, |
| { |
| "epoch": 30.270836973615236, |
| "grad_norm": 0.4329134225845337, |
| "learning_rate": 0.00023689160839160837, |
| "loss": 3.1133, |
| "step": 103950 |
| }, |
| { |
| "epoch": 30.28539810122896, |
| "grad_norm": 0.43885889649391174, |
| "learning_rate": 0.0002367167832167832, |
| "loss": 3.1183, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.28539810122896, |
| "eval_accuracy": 0.37448131962752973, |
| "eval_loss": 3.5509355068206787, |
| "eval_runtime": 82.2111, |
| "eval_samples_per_second": 202.418, |
| "eval_steps_per_second": 12.663, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.29995922884268, |
| "grad_norm": 0.45464998483657837, |
| "learning_rate": 0.00023654195804195802, |
| "loss": 3.1144, |
| "step": 104050 |
| }, |
| { |
| "epoch": 30.314520356456406, |
| "grad_norm": 0.5067242383956909, |
| "learning_rate": 0.00023636713286713285, |
| "loss": 3.1207, |
| "step": 104100 |
| }, |
| { |
| "epoch": 30.329081484070127, |
| "grad_norm": 0.43562057614326477, |
| "learning_rate": 0.00023619230769230765, |
| "loss": 3.1239, |
| "step": 104150 |
| }, |
| { |
| "epoch": 30.343642611683848, |
| "grad_norm": 0.5031965374946594, |
| "learning_rate": 0.00023601748251748247, |
| "loss": 3.1235, |
| "step": 104200 |
| }, |
| { |
| "epoch": 30.358203739297572, |
| "grad_norm": 0.4372304677963257, |
| "learning_rate": 0.00023584265734265733, |
| "loss": 3.1279, |
| "step": 104250 |
| }, |
| { |
| "epoch": 30.372764866911293, |
| "grad_norm": 0.4631631374359131, |
| "learning_rate": 0.00023566783216783215, |
| "loss": 3.1245, |
| "step": 104300 |
| }, |
| { |
| "epoch": 30.387325994525018, |
| "grad_norm": 0.4882197678089142, |
| "learning_rate": 0.00023549300699300698, |
| "loss": 3.1235, |
| "step": 104350 |
| }, |
| { |
| "epoch": 30.40188712213874, |
| "grad_norm": 0.4441990256309509, |
| "learning_rate": 0.0002353181818181818, |
| "loss": 3.1234, |
| "step": 104400 |
| }, |
| { |
| "epoch": 30.41644824975246, |
| "grad_norm": 0.46458905935287476, |
| "learning_rate": 0.00023514335664335663, |
| "loss": 3.1337, |
| "step": 104450 |
| }, |
| { |
| "epoch": 30.431009377366184, |
| "grad_norm": 0.4909729063510895, |
| "learning_rate": 0.00023496853146853146, |
| "loss": 3.1329, |
| "step": 104500 |
| }, |
| { |
| "epoch": 30.445570504979905, |
| "grad_norm": 0.4699871838092804, |
| "learning_rate": 0.0002347937062937063, |
| "loss": 3.1361, |
| "step": 104550 |
| }, |
| { |
| "epoch": 30.46013163259363, |
| "grad_norm": 0.4524746537208557, |
| "learning_rate": 0.0002346188811188811, |
| "loss": 3.1122, |
| "step": 104600 |
| }, |
| { |
| "epoch": 30.47469276020735, |
| "grad_norm": 0.4407884180545807, |
| "learning_rate": 0.0002344440559440559, |
| "loss": 3.1268, |
| "step": 104650 |
| }, |
| { |
| "epoch": 30.48925388782107, |
| "grad_norm": 0.4524759352207184, |
| "learning_rate": 0.00023426923076923074, |
| "loss": 3.1355, |
| "step": 104700 |
| }, |
| { |
| "epoch": 30.503815015434796, |
| "grad_norm": 0.5185858011245728, |
| "learning_rate": 0.00023409440559440557, |
| "loss": 3.1376, |
| "step": 104750 |
| }, |
| { |
| "epoch": 30.518376143048517, |
| "grad_norm": 0.4437595307826996, |
| "learning_rate": 0.0002339195804195804, |
| "loss": 3.1282, |
| "step": 104800 |
| }, |
| { |
| "epoch": 30.53293727066224, |
| "grad_norm": 0.4705304503440857, |
| "learning_rate": 0.00023374475524475522, |
| "loss": 3.1457, |
| "step": 104850 |
| }, |
| { |
| "epoch": 30.547498398275962, |
| "grad_norm": 0.47511255741119385, |
| "learning_rate": 0.00023356993006993002, |
| "loss": 3.1493, |
| "step": 104900 |
| }, |
| { |
| "epoch": 30.562059525889683, |
| "grad_norm": 0.4592675566673279, |
| "learning_rate": 0.0002333951048951049, |
| "loss": 3.1413, |
| "step": 104950 |
| }, |
| { |
| "epoch": 30.576620653503408, |
| "grad_norm": 0.4349755346775055, |
| "learning_rate": 0.0002332202797202797, |
| "loss": 3.129, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.576620653503408, |
| "eval_accuracy": 0.37513292976075524, |
| "eval_loss": 3.5427258014678955, |
| "eval_runtime": 82.7575, |
| "eval_samples_per_second": 201.081, |
| "eval_steps_per_second": 12.579, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.59118178111713, |
| "grad_norm": 0.46668490767478943, |
| "learning_rate": 0.00023304545454545453, |
| "loss": 3.1447, |
| "step": 105050 |
| }, |
| { |
| "epoch": 30.605742908730853, |
| "grad_norm": 0.44386574625968933, |
| "learning_rate": 0.00023287062937062935, |
| "loss": 3.1389, |
| "step": 105100 |
| }, |
| { |
| "epoch": 30.620304036344574, |
| "grad_norm": 0.4497250020503998, |
| "learning_rate": 0.00023269580419580418, |
| "loss": 3.1394, |
| "step": 105150 |
| }, |
| { |
| "epoch": 30.634865163958295, |
| "grad_norm": 0.46050894260406494, |
| "learning_rate": 0.000232520979020979, |
| "loss": 3.1456, |
| "step": 105200 |
| }, |
| { |
| "epoch": 30.64942629157202, |
| "grad_norm": 0.5074132680892944, |
| "learning_rate": 0.00023234615384615384, |
| "loss": 3.1434, |
| "step": 105250 |
| }, |
| { |
| "epoch": 30.66398741918574, |
| "grad_norm": 0.45272794365882874, |
| "learning_rate": 0.00023217132867132866, |
| "loss": 3.1501, |
| "step": 105300 |
| }, |
| { |
| "epoch": 30.678548546799465, |
| "grad_norm": 0.4442860186100006, |
| "learning_rate": 0.00023199650349650346, |
| "loss": 3.1416, |
| "step": 105350 |
| }, |
| { |
| "epoch": 30.693109674413186, |
| "grad_norm": 0.45951324701309204, |
| "learning_rate": 0.0002318216783216783, |
| "loss": 3.1537, |
| "step": 105400 |
| }, |
| { |
| "epoch": 30.707670802026907, |
| "grad_norm": 0.4305625855922699, |
| "learning_rate": 0.00023164685314685312, |
| "loss": 3.151, |
| "step": 105450 |
| }, |
| { |
| "epoch": 30.722231929640632, |
| "grad_norm": 0.4825020134449005, |
| "learning_rate": 0.00023147202797202794, |
| "loss": 3.1647, |
| "step": 105500 |
| }, |
| { |
| "epoch": 30.736793057254353, |
| "grad_norm": 0.4628813564777374, |
| "learning_rate": 0.00023129720279720277, |
| "loss": 3.137, |
| "step": 105550 |
| }, |
| { |
| "epoch": 30.751354184868077, |
| "grad_norm": 0.53852778673172, |
| "learning_rate": 0.0002311223776223776, |
| "loss": 3.1455, |
| "step": 105600 |
| }, |
| { |
| "epoch": 30.7659153124818, |
| "grad_norm": 0.4441567063331604, |
| "learning_rate": 0.00023094755244755245, |
| "loss": 3.1471, |
| "step": 105650 |
| }, |
| { |
| "epoch": 30.78047644009552, |
| "grad_norm": 0.5117921233177185, |
| "learning_rate": 0.00023077272727272728, |
| "loss": 3.1475, |
| "step": 105700 |
| }, |
| { |
| "epoch": 30.795037567709244, |
| "grad_norm": 0.4449724555015564, |
| "learning_rate": 0.00023059790209790208, |
| "loss": 3.1492, |
| "step": 105750 |
| }, |
| { |
| "epoch": 30.809598695322965, |
| "grad_norm": 0.43966683745384216, |
| "learning_rate": 0.0002304230769230769, |
| "loss": 3.1465, |
| "step": 105800 |
| }, |
| { |
| "epoch": 30.82415982293669, |
| "grad_norm": 0.4602974057197571, |
| "learning_rate": 0.00023024825174825173, |
| "loss": 3.1613, |
| "step": 105850 |
| }, |
| { |
| "epoch": 30.83872095055041, |
| "grad_norm": 0.46699258685112, |
| "learning_rate": 0.00023007342657342656, |
| "loss": 3.1442, |
| "step": 105900 |
| }, |
| { |
| "epoch": 30.853282078164135, |
| "grad_norm": 0.4726366698741913, |
| "learning_rate": 0.00022989860139860138, |
| "loss": 3.1533, |
| "step": 105950 |
| }, |
| { |
| "epoch": 30.867843205777856, |
| "grad_norm": 0.44906020164489746, |
| "learning_rate": 0.0002297237762237762, |
| "loss": 3.146, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.867843205777856, |
| "eval_accuracy": 0.375208662828035, |
| "eval_loss": 3.5381979942321777, |
| "eval_runtime": 82.6236, |
| "eval_samples_per_second": 201.407, |
| "eval_steps_per_second": 12.599, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.882404333391577, |
| "grad_norm": 0.45382431149482727, |
| "learning_rate": 0.00022954895104895104, |
| "loss": 3.1654, |
| "step": 106050 |
| }, |
| { |
| "epoch": 30.8969654610053, |
| "grad_norm": 0.47706761956214905, |
| "learning_rate": 0.00022937412587412584, |
| "loss": 3.1575, |
| "step": 106100 |
| }, |
| { |
| "epoch": 30.911526588619022, |
| "grad_norm": 0.4413605332374573, |
| "learning_rate": 0.00022919930069930066, |
| "loss": 3.149, |
| "step": 106150 |
| }, |
| { |
| "epoch": 30.926087716232747, |
| "grad_norm": 0.45387986302375793, |
| "learning_rate": 0.0002290244755244755, |
| "loss": 3.1521, |
| "step": 106200 |
| }, |
| { |
| "epoch": 30.940648843846468, |
| "grad_norm": 0.465212881565094, |
| "learning_rate": 0.00022884965034965032, |
| "loss": 3.1623, |
| "step": 106250 |
| }, |
| { |
| "epoch": 30.95520997146019, |
| "grad_norm": 0.488710880279541, |
| "learning_rate": 0.00022867482517482517, |
| "loss": 3.1592, |
| "step": 106300 |
| }, |
| { |
| "epoch": 30.969771099073913, |
| "grad_norm": 0.4711342453956604, |
| "learning_rate": 0.0002285, |
| "loss": 3.1597, |
| "step": 106350 |
| }, |
| { |
| "epoch": 30.984332226687634, |
| "grad_norm": 0.457851767539978, |
| "learning_rate": 0.00022832517482517482, |
| "loss": 3.1591, |
| "step": 106400 |
| }, |
| { |
| "epoch": 30.99889335430136, |
| "grad_norm": 0.496644526720047, |
| "learning_rate": 0.00022815034965034965, |
| "loss": 3.1612, |
| "step": 106450 |
| }, |
| { |
| "epoch": 31.013396237404624, |
| "grad_norm": 0.4517747759819031, |
| "learning_rate": 0.00022797552447552445, |
| "loss": 3.0659, |
| "step": 106500 |
| }, |
| { |
| "epoch": 31.02795736501835, |
| "grad_norm": 0.5124735832214355, |
| "learning_rate": 0.00022780069930069928, |
| "loss": 3.0748, |
| "step": 106550 |
| }, |
| { |
| "epoch": 31.04251849263207, |
| "grad_norm": 0.44049033522605896, |
| "learning_rate": 0.0002276258741258741, |
| "loss": 3.0735, |
| "step": 106600 |
| }, |
| { |
| "epoch": 31.05707962024579, |
| "grad_norm": 0.45591986179351807, |
| "learning_rate": 0.00022745104895104893, |
| "loss": 3.0649, |
| "step": 106650 |
| }, |
| { |
| "epoch": 31.071640747859515, |
| "grad_norm": 0.4508044421672821, |
| "learning_rate": 0.00022727622377622376, |
| "loss": 3.08, |
| "step": 106700 |
| }, |
| { |
| "epoch": 31.086201875473236, |
| "grad_norm": 0.46148911118507385, |
| "learning_rate": 0.00022710139860139858, |
| "loss": 3.0801, |
| "step": 106750 |
| }, |
| { |
| "epoch": 31.10076300308696, |
| "grad_norm": 0.4811590313911438, |
| "learning_rate": 0.0002269265734265734, |
| "loss": 3.0837, |
| "step": 106800 |
| }, |
| { |
| "epoch": 31.11532413070068, |
| "grad_norm": 0.45535677671432495, |
| "learning_rate": 0.0002267517482517482, |
| "loss": 3.0781, |
| "step": 106850 |
| }, |
| { |
| "epoch": 31.129885258314403, |
| "grad_norm": 0.455352783203125, |
| "learning_rate": 0.00022657692307692304, |
| "loss": 3.0838, |
| "step": 106900 |
| }, |
| { |
| "epoch": 31.144446385928127, |
| "grad_norm": 0.4488782584667206, |
| "learning_rate": 0.00022640209790209787, |
| "loss": 3.0875, |
| "step": 106950 |
| }, |
| { |
| "epoch": 31.159007513541848, |
| "grad_norm": 0.43766698241233826, |
| "learning_rate": 0.00022622727272727272, |
| "loss": 3.1008, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.159007513541848, |
| "eval_accuracy": 0.3746367840917283, |
| "eval_loss": 3.5534043312072754, |
| "eval_runtime": 82.6994, |
| "eval_samples_per_second": 201.223, |
| "eval_steps_per_second": 12.588, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.173568641155573, |
| "grad_norm": 0.44717875123023987, |
| "learning_rate": 0.00022605244755244755, |
| "loss": 3.0961, |
| "step": 107050 |
| }, |
| { |
| "epoch": 31.188129768769294, |
| "grad_norm": 0.4676537811756134, |
| "learning_rate": 0.00022587762237762237, |
| "loss": 3.0959, |
| "step": 107100 |
| }, |
| { |
| "epoch": 31.202690896383015, |
| "grad_norm": 0.4594411253929138, |
| "learning_rate": 0.0002257027972027972, |
| "loss": 3.0948, |
| "step": 107150 |
| }, |
| { |
| "epoch": 31.21725202399674, |
| "grad_norm": 0.49264439940452576, |
| "learning_rate": 0.00022552797202797203, |
| "loss": 3.0953, |
| "step": 107200 |
| }, |
| { |
| "epoch": 31.23181315161046, |
| "grad_norm": 0.47982412576675415, |
| "learning_rate": 0.00022535314685314683, |
| "loss": 3.095, |
| "step": 107250 |
| }, |
| { |
| "epoch": 31.246374279224185, |
| "grad_norm": 0.4431138336658478, |
| "learning_rate": 0.00022517832167832165, |
| "loss": 3.103, |
| "step": 107300 |
| }, |
| { |
| "epoch": 31.260935406837906, |
| "grad_norm": 0.48370277881622314, |
| "learning_rate": 0.00022500349650349648, |
| "loss": 3.106, |
| "step": 107350 |
| }, |
| { |
| "epoch": 31.275496534451626, |
| "grad_norm": 0.4662681818008423, |
| "learning_rate": 0.0002248286713286713, |
| "loss": 3.0989, |
| "step": 107400 |
| }, |
| { |
| "epoch": 31.29005766206535, |
| "grad_norm": 0.44836848974227905, |
| "learning_rate": 0.00022465384615384613, |
| "loss": 3.1091, |
| "step": 107450 |
| }, |
| { |
| "epoch": 31.304618789679072, |
| "grad_norm": 0.4993094205856323, |
| "learning_rate": 0.00022447902097902096, |
| "loss": 3.1165, |
| "step": 107500 |
| }, |
| { |
| "epoch": 31.319179917292796, |
| "grad_norm": 0.4505784809589386, |
| "learning_rate": 0.0002243041958041958, |
| "loss": 3.1162, |
| "step": 107550 |
| }, |
| { |
| "epoch": 31.333741044906517, |
| "grad_norm": 0.449940949678421, |
| "learning_rate": 0.00022412937062937059, |
| "loss": 3.1115, |
| "step": 107600 |
| }, |
| { |
| "epoch": 31.34830217252024, |
| "grad_norm": 0.47981855273246765, |
| "learning_rate": 0.0002239545454545454, |
| "loss": 3.1185, |
| "step": 107650 |
| }, |
| { |
| "epoch": 31.362863300133963, |
| "grad_norm": 0.47897806763648987, |
| "learning_rate": 0.00022377972027972027, |
| "loss": 3.1215, |
| "step": 107700 |
| }, |
| { |
| "epoch": 31.377424427747684, |
| "grad_norm": 0.4578169286251068, |
| "learning_rate": 0.0002236048951048951, |
| "loss": 3.1061, |
| "step": 107750 |
| }, |
| { |
| "epoch": 31.39198555536141, |
| "grad_norm": 0.46493199467658997, |
| "learning_rate": 0.00022343006993006992, |
| "loss": 3.1153, |
| "step": 107800 |
| }, |
| { |
| "epoch": 31.40654668297513, |
| "grad_norm": 0.44090625643730164, |
| "learning_rate": 0.00022325524475524475, |
| "loss": 3.1199, |
| "step": 107850 |
| }, |
| { |
| "epoch": 31.42110781058885, |
| "grad_norm": 0.4627723693847656, |
| "learning_rate": 0.00022308041958041957, |
| "loss": 3.1368, |
| "step": 107900 |
| }, |
| { |
| "epoch": 31.435668938202575, |
| "grad_norm": 0.47543758153915405, |
| "learning_rate": 0.0002229055944055944, |
| "loss": 3.123, |
| "step": 107950 |
| }, |
| { |
| "epoch": 31.450230065816296, |
| "grad_norm": 0.4394117593765259, |
| "learning_rate": 0.0002227307692307692, |
| "loss": 3.1194, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.450230065816296, |
| "eval_accuracy": 0.37487691906592907, |
| "eval_loss": 3.5460078716278076, |
| "eval_runtime": 82.695, |
| "eval_samples_per_second": 201.233, |
| "eval_steps_per_second": 12.588, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.46479119343002, |
| "grad_norm": 0.4584934413433075, |
| "learning_rate": 0.00022255594405594403, |
| "loss": 3.118, |
| "step": 108050 |
| }, |
| { |
| "epoch": 31.47935232104374, |
| "grad_norm": 0.4759714603424072, |
| "learning_rate": 0.00022238111888111885, |
| "loss": 3.1156, |
| "step": 108100 |
| }, |
| { |
| "epoch": 31.493913448657462, |
| "grad_norm": 0.45908963680267334, |
| "learning_rate": 0.00022220629370629368, |
| "loss": 3.1261, |
| "step": 108150 |
| }, |
| { |
| "epoch": 31.508474576271187, |
| "grad_norm": 0.449799507856369, |
| "learning_rate": 0.0002220314685314685, |
| "loss": 3.113, |
| "step": 108200 |
| }, |
| { |
| "epoch": 31.523035703884908, |
| "grad_norm": 0.46549323201179504, |
| "learning_rate": 0.00022185664335664333, |
| "loss": 3.1184, |
| "step": 108250 |
| }, |
| { |
| "epoch": 31.537596831498632, |
| "grad_norm": 0.45758989453315735, |
| "learning_rate": 0.00022168181818181816, |
| "loss": 3.1254, |
| "step": 108300 |
| }, |
| { |
| "epoch": 31.552157959112353, |
| "grad_norm": 0.46553415060043335, |
| "learning_rate": 0.00022150699300699296, |
| "loss": 3.1358, |
| "step": 108350 |
| }, |
| { |
| "epoch": 31.566719086726074, |
| "grad_norm": 0.4532431662082672, |
| "learning_rate": 0.00022133216783216782, |
| "loss": 3.132, |
| "step": 108400 |
| }, |
| { |
| "epoch": 31.5812802143398, |
| "grad_norm": 0.4570625126361847, |
| "learning_rate": 0.00022115734265734264, |
| "loss": 3.1313, |
| "step": 108450 |
| }, |
| { |
| "epoch": 31.59584134195352, |
| "grad_norm": 0.4676782488822937, |
| "learning_rate": 0.00022098251748251747, |
| "loss": 3.1295, |
| "step": 108500 |
| }, |
| { |
| "epoch": 31.610402469567244, |
| "grad_norm": 0.44572845101356506, |
| "learning_rate": 0.0002208076923076923, |
| "loss": 3.1209, |
| "step": 108550 |
| }, |
| { |
| "epoch": 31.624963597180965, |
| "grad_norm": 0.5037442445755005, |
| "learning_rate": 0.00022063286713286712, |
| "loss": 3.1443, |
| "step": 108600 |
| }, |
| { |
| "epoch": 31.63952472479469, |
| "grad_norm": 0.46466493606567383, |
| "learning_rate": 0.00022045804195804195, |
| "loss": 3.139, |
| "step": 108650 |
| }, |
| { |
| "epoch": 31.65408585240841, |
| "grad_norm": 0.456209659576416, |
| "learning_rate": 0.00022028321678321678, |
| "loss": 3.1322, |
| "step": 108700 |
| }, |
| { |
| "epoch": 31.66864698002213, |
| "grad_norm": 0.4677102863788605, |
| "learning_rate": 0.00022010839160839158, |
| "loss": 3.1447, |
| "step": 108750 |
| }, |
| { |
| "epoch": 31.683208107635856, |
| "grad_norm": 0.4477277994155884, |
| "learning_rate": 0.0002199335664335664, |
| "loss": 3.1378, |
| "step": 108800 |
| }, |
| { |
| "epoch": 31.697769235249577, |
| "grad_norm": 0.43700361251831055, |
| "learning_rate": 0.00021975874125874123, |
| "loss": 3.1514, |
| "step": 108850 |
| }, |
| { |
| "epoch": 31.7123303628633, |
| "grad_norm": 0.465725302696228, |
| "learning_rate": 0.00021958391608391606, |
| "loss": 3.1482, |
| "step": 108900 |
| }, |
| { |
| "epoch": 31.726891490477023, |
| "grad_norm": 0.4785679280757904, |
| "learning_rate": 0.00021940909090909088, |
| "loss": 3.1342, |
| "step": 108950 |
| }, |
| { |
| "epoch": 31.741452618090744, |
| "grad_norm": 0.4512113630771637, |
| "learning_rate": 0.0002192342657342657, |
| "loss": 3.1194, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.741452618090744, |
| "eval_accuracy": 0.37523394638310514, |
| "eval_loss": 3.5423052310943604, |
| "eval_runtime": 82.505, |
| "eval_samples_per_second": 201.697, |
| "eval_steps_per_second": 12.617, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.756013745704468, |
| "grad_norm": 0.45781755447387695, |
| "learning_rate": 0.00021905944055944054, |
| "loss": 3.1453, |
| "step": 109050 |
| }, |
| { |
| "epoch": 31.77057487331819, |
| "grad_norm": 0.46818092465400696, |
| "learning_rate": 0.0002188846153846154, |
| "loss": 3.1497, |
| "step": 109100 |
| }, |
| { |
| "epoch": 31.785136000931914, |
| "grad_norm": 0.4777269959449768, |
| "learning_rate": 0.0002187097902097902, |
| "loss": 3.1435, |
| "step": 109150 |
| }, |
| { |
| "epoch": 31.799697128545635, |
| "grad_norm": 0.4731384217739105, |
| "learning_rate": 0.00021853496503496502, |
| "loss": 3.1411, |
| "step": 109200 |
| }, |
| { |
| "epoch": 31.814258256159356, |
| "grad_norm": 0.45721766352653503, |
| "learning_rate": 0.00021836013986013984, |
| "loss": 3.1575, |
| "step": 109250 |
| }, |
| { |
| "epoch": 31.82881938377308, |
| "grad_norm": 0.47995349764823914, |
| "learning_rate": 0.00021818531468531467, |
| "loss": 3.1431, |
| "step": 109300 |
| }, |
| { |
| "epoch": 31.8433805113868, |
| "grad_norm": 0.4390334188938141, |
| "learning_rate": 0.0002180104895104895, |
| "loss": 3.1442, |
| "step": 109350 |
| }, |
| { |
| "epoch": 31.857941639000526, |
| "grad_norm": 0.48997631669044495, |
| "learning_rate": 0.00021783566433566432, |
| "loss": 3.1466, |
| "step": 109400 |
| }, |
| { |
| "epoch": 31.872502766614247, |
| "grad_norm": 0.46940040588378906, |
| "learning_rate": 0.00021766083916083915, |
| "loss": 3.1423, |
| "step": 109450 |
| }, |
| { |
| "epoch": 31.887063894227968, |
| "grad_norm": 0.4647296071052551, |
| "learning_rate": 0.00021748601398601395, |
| "loss": 3.1491, |
| "step": 109500 |
| }, |
| { |
| "epoch": 31.901625021841692, |
| "grad_norm": 0.48121047019958496, |
| "learning_rate": 0.00021731118881118878, |
| "loss": 3.1492, |
| "step": 109550 |
| }, |
| { |
| "epoch": 31.916186149455413, |
| "grad_norm": 0.5126810669898987, |
| "learning_rate": 0.0002171363636363636, |
| "loss": 3.146, |
| "step": 109600 |
| }, |
| { |
| "epoch": 31.930747277069138, |
| "grad_norm": 0.46646544337272644, |
| "learning_rate": 0.00021696153846153843, |
| "loss": 3.1567, |
| "step": 109650 |
| }, |
| { |
| "epoch": 31.94530840468286, |
| "grad_norm": 0.45344728231430054, |
| "learning_rate": 0.00021678671328671326, |
| "loss": 3.1554, |
| "step": 109700 |
| }, |
| { |
| "epoch": 31.95986953229658, |
| "grad_norm": 0.4628826677799225, |
| "learning_rate": 0.00021661188811188808, |
| "loss": 3.1401, |
| "step": 109750 |
| }, |
| { |
| "epoch": 31.974430659910304, |
| "grad_norm": 0.4626869261264801, |
| "learning_rate": 0.00021643706293706294, |
| "loss": 3.1537, |
| "step": 109800 |
| }, |
| { |
| "epoch": 31.988991787524025, |
| "grad_norm": 0.47640642523765564, |
| "learning_rate": 0.00021626223776223777, |
| "loss": 3.1515, |
| "step": 109850 |
| }, |
| { |
| "epoch": 32.00349467062729, |
| "grad_norm": 0.4777139127254486, |
| "learning_rate": 0.00021608741258741256, |
| "loss": 3.1211, |
| "step": 109900 |
| }, |
| { |
| "epoch": 32.018055798241015, |
| "grad_norm": 0.46064773201942444, |
| "learning_rate": 0.0002159125874125874, |
| "loss": 3.0574, |
| "step": 109950 |
| }, |
| { |
| "epoch": 32.03261692585474, |
| "grad_norm": 0.4667201340198517, |
| "learning_rate": 0.00021573776223776222, |
| "loss": 3.0636, |
| "step": 110000 |
| }, |
| { |
| "epoch": 32.03261692585474, |
| "eval_accuracy": 0.37468158890327113, |
| "eval_loss": 3.5524070262908936, |
| "eval_runtime": 81.8936, |
| "eval_samples_per_second": 203.203, |
| "eval_steps_per_second": 12.712, |
| "step": 110000 |
| }, |
| { |
| "epoch": 32.047178053468464, |
| "grad_norm": 0.46497970819473267, |
| "learning_rate": 0.00021556293706293705, |
| "loss": 3.0754, |
| "step": 110050 |
| }, |
| { |
| "epoch": 32.06173918108218, |
| "grad_norm": 0.49732282757759094, |
| "learning_rate": 0.00021538811188811187, |
| "loss": 3.0544, |
| "step": 110100 |
| }, |
| { |
| "epoch": 32.076300308695906, |
| "grad_norm": 0.4592716097831726, |
| "learning_rate": 0.0002152132867132867, |
| "loss": 3.0722, |
| "step": 110150 |
| }, |
| { |
| "epoch": 32.09086143630963, |
| "grad_norm": 0.44930028915405273, |
| "learning_rate": 0.00021503846153846153, |
| "loss": 3.0762, |
| "step": 110200 |
| }, |
| { |
| "epoch": 32.10542256392335, |
| "grad_norm": 0.45672476291656494, |
| "learning_rate": 0.00021486363636363633, |
| "loss": 3.0767, |
| "step": 110250 |
| }, |
| { |
| "epoch": 32.11998369153707, |
| "grad_norm": 0.47720056772232056, |
| "learning_rate": 0.00021468881118881115, |
| "loss": 3.0761, |
| "step": 110300 |
| }, |
| { |
| "epoch": 32.1345448191508, |
| "grad_norm": 0.46338218450546265, |
| "learning_rate": 0.00021451398601398598, |
| "loss": 3.0786, |
| "step": 110350 |
| }, |
| { |
| "epoch": 32.149105946764514, |
| "grad_norm": 0.4831918776035309, |
| "learning_rate": 0.0002143391608391608, |
| "loss": 3.0787, |
| "step": 110400 |
| }, |
| { |
| "epoch": 32.16366707437824, |
| "grad_norm": 0.49108681082725525, |
| "learning_rate": 0.00021416433566433563, |
| "loss": 3.0835, |
| "step": 110450 |
| }, |
| { |
| "epoch": 32.17822820199196, |
| "grad_norm": 0.4968203604221344, |
| "learning_rate": 0.0002139895104895105, |
| "loss": 3.088, |
| "step": 110500 |
| }, |
| { |
| "epoch": 32.19278932960569, |
| "grad_norm": 0.49390709400177, |
| "learning_rate": 0.0002138146853146853, |
| "loss": 3.099, |
| "step": 110550 |
| }, |
| { |
| "epoch": 32.207350457219405, |
| "grad_norm": 0.4764431118965149, |
| "learning_rate": 0.00021363986013986014, |
| "loss": 3.0993, |
| "step": 110600 |
| }, |
| { |
| "epoch": 32.22191158483313, |
| "grad_norm": 0.4683314263820648, |
| "learning_rate": 0.00021346503496503494, |
| "loss": 3.0956, |
| "step": 110650 |
| }, |
| { |
| "epoch": 32.236472712446854, |
| "grad_norm": 0.5034164786338806, |
| "learning_rate": 0.00021329020979020977, |
| "loss": 3.1003, |
| "step": 110700 |
| }, |
| { |
| "epoch": 32.25103384006057, |
| "grad_norm": 0.4939976632595062, |
| "learning_rate": 0.0002131153846153846, |
| "loss": 3.0976, |
| "step": 110750 |
| }, |
| { |
| "epoch": 32.265594967674296, |
| "grad_norm": 0.47088491916656494, |
| "learning_rate": 0.00021294055944055942, |
| "loss": 3.0915, |
| "step": 110800 |
| }, |
| { |
| "epoch": 32.28015609528802, |
| "grad_norm": 0.4773483872413635, |
| "learning_rate": 0.00021276573426573425, |
| "loss": 3.1055, |
| "step": 110850 |
| }, |
| { |
| "epoch": 32.29471722290174, |
| "grad_norm": 0.4978649914264679, |
| "learning_rate": 0.00021259090909090907, |
| "loss": 3.1121, |
| "step": 110900 |
| }, |
| { |
| "epoch": 32.30927835051546, |
| "grad_norm": 0.4833299517631531, |
| "learning_rate": 0.0002124160839160839, |
| "loss": 3.0952, |
| "step": 110950 |
| }, |
| { |
| "epoch": 32.32383947812919, |
| "grad_norm": 0.47267720103263855, |
| "learning_rate": 0.0002122412587412587, |
| "loss": 3.1026, |
| "step": 111000 |
| }, |
| { |
| "epoch": 32.32383947812919, |
| "eval_accuracy": 0.37488221097280416, |
| "eval_loss": 3.552276372909546, |
| "eval_runtime": 82.0399, |
| "eval_samples_per_second": 202.84, |
| "eval_steps_per_second": 12.689, |
| "step": 111000 |
| }, |
| { |
| "epoch": 32.33840060574291, |
| "grad_norm": 0.4781993329524994, |
| "learning_rate": 0.00021206643356643353, |
| "loss": 3.1095, |
| "step": 111050 |
| }, |
| { |
| "epoch": 32.35296173335663, |
| "grad_norm": 0.46916815638542175, |
| "learning_rate": 0.00021189160839160835, |
| "loss": 3.0965, |
| "step": 111100 |
| }, |
| { |
| "epoch": 32.367522860970354, |
| "grad_norm": 0.4273686707019806, |
| "learning_rate": 0.0002117167832167832, |
| "loss": 3.0992, |
| "step": 111150 |
| }, |
| { |
| "epoch": 32.38208398858408, |
| "grad_norm": 0.44655710458755493, |
| "learning_rate": 0.00021154195804195803, |
| "loss": 3.1002, |
| "step": 111200 |
| }, |
| { |
| "epoch": 32.396645116197796, |
| "grad_norm": 0.46945303678512573, |
| "learning_rate": 0.00021136713286713286, |
| "loss": 3.1134, |
| "step": 111250 |
| }, |
| { |
| "epoch": 32.41120624381152, |
| "grad_norm": 0.4778572916984558, |
| "learning_rate": 0.0002111923076923077, |
| "loss": 3.1074, |
| "step": 111300 |
| }, |
| { |
| "epoch": 32.425767371425245, |
| "grad_norm": 0.46566638350486755, |
| "learning_rate": 0.00021101748251748252, |
| "loss": 3.1082, |
| "step": 111350 |
| }, |
| { |
| "epoch": 32.44032849903896, |
| "grad_norm": 0.4643358588218689, |
| "learning_rate": 0.00021084265734265734, |
| "loss": 3.1143, |
| "step": 111400 |
| }, |
| { |
| "epoch": 32.45488962665269, |
| "grad_norm": 0.4506838917732239, |
| "learning_rate": 0.00021066783216783214, |
| "loss": 3.1171, |
| "step": 111450 |
| }, |
| { |
| "epoch": 32.46945075426641, |
| "grad_norm": 0.4812449812889099, |
| "learning_rate": 0.00021049300699300697, |
| "loss": 3.1258, |
| "step": 111500 |
| }, |
| { |
| "epoch": 32.484011881880136, |
| "grad_norm": 0.4793895483016968, |
| "learning_rate": 0.0002103181818181818, |
| "loss": 3.1182, |
| "step": 111550 |
| }, |
| { |
| "epoch": 32.49857300949385, |
| "grad_norm": 0.4465021789073944, |
| "learning_rate": 0.00021014335664335662, |
| "loss": 3.1217, |
| "step": 111600 |
| }, |
| { |
| "epoch": 32.51313413710758, |
| "grad_norm": 0.4548931419849396, |
| "learning_rate": 0.00020996853146853145, |
| "loss": 3.1272, |
| "step": 111650 |
| }, |
| { |
| "epoch": 32.5276952647213, |
| "grad_norm": 0.46215078234672546, |
| "learning_rate": 0.00020979370629370628, |
| "loss": 3.1081, |
| "step": 111700 |
| }, |
| { |
| "epoch": 32.54225639233502, |
| "grad_norm": 0.5043772459030151, |
| "learning_rate": 0.00020961888111888108, |
| "loss": 3.1157, |
| "step": 111750 |
| }, |
| { |
| "epoch": 32.556817519948744, |
| "grad_norm": 0.494863897562027, |
| "learning_rate": 0.0002094440559440559, |
| "loss": 3.1183, |
| "step": 111800 |
| }, |
| { |
| "epoch": 32.57137864756247, |
| "grad_norm": 0.4757680296897888, |
| "learning_rate": 0.00020926923076923076, |
| "loss": 3.1292, |
| "step": 111850 |
| }, |
| { |
| "epoch": 32.585939775176186, |
| "grad_norm": 0.44847366213798523, |
| "learning_rate": 0.00020909440559440558, |
| "loss": 3.1249, |
| "step": 111900 |
| }, |
| { |
| "epoch": 32.60050090278991, |
| "grad_norm": 0.4508185088634491, |
| "learning_rate": 0.0002089195804195804, |
| "loss": 3.1156, |
| "step": 111950 |
| }, |
| { |
| "epoch": 32.615062030403635, |
| "grad_norm": 0.4706362187862396, |
| "learning_rate": 0.00020874475524475524, |
| "loss": 3.1189, |
| "step": 112000 |
| }, |
| { |
| "epoch": 32.615062030403635, |
| "eval_accuracy": 0.37553311551844637, |
| "eval_loss": 3.544769763946533, |
| "eval_runtime": 82.1099, |
| "eval_samples_per_second": 202.667, |
| "eval_steps_per_second": 12.678, |
| "step": 112000 |
| }, |
| { |
| "epoch": 32.62962315801736, |
| "grad_norm": 0.4620210826396942, |
| "learning_rate": 0.00020856993006993006, |
| "loss": 3.132, |
| "step": 112050 |
| }, |
| { |
| "epoch": 32.64418428563108, |
| "grad_norm": 0.47282111644744873, |
| "learning_rate": 0.0002083951048951049, |
| "loss": 3.1262, |
| "step": 112100 |
| }, |
| { |
| "epoch": 32.6587454132448, |
| "grad_norm": 0.45098698139190674, |
| "learning_rate": 0.00020822027972027972, |
| "loss": 3.1216, |
| "step": 112150 |
| }, |
| { |
| "epoch": 32.673306540858526, |
| "grad_norm": 0.4220297932624817, |
| "learning_rate": 0.00020804545454545452, |
| "loss": 3.1279, |
| "step": 112200 |
| }, |
| { |
| "epoch": 32.687867668472244, |
| "grad_norm": 0.49172690510749817, |
| "learning_rate": 0.00020787062937062934, |
| "loss": 3.1314, |
| "step": 112250 |
| }, |
| { |
| "epoch": 32.70242879608597, |
| "grad_norm": 0.4788571298122406, |
| "learning_rate": 0.00020769580419580417, |
| "loss": 3.1306, |
| "step": 112300 |
| }, |
| { |
| "epoch": 32.71698992369969, |
| "grad_norm": 0.47536975145339966, |
| "learning_rate": 0.000207520979020979, |
| "loss": 3.1278, |
| "step": 112350 |
| }, |
| { |
| "epoch": 32.73155105131342, |
| "grad_norm": 0.4684229791164398, |
| "learning_rate": 0.00020734615384615382, |
| "loss": 3.1402, |
| "step": 112400 |
| }, |
| { |
| "epoch": 32.746112178927135, |
| "grad_norm": 0.4739868938922882, |
| "learning_rate": 0.00020717132867132865, |
| "loss": 3.1384, |
| "step": 112450 |
| }, |
| { |
| "epoch": 32.76067330654086, |
| "grad_norm": 0.5022250413894653, |
| "learning_rate": 0.00020699650349650345, |
| "loss": 3.1339, |
| "step": 112500 |
| }, |
| { |
| "epoch": 32.775234434154584, |
| "grad_norm": 0.5051437020301819, |
| "learning_rate": 0.00020682167832167833, |
| "loss": 3.1286, |
| "step": 112550 |
| }, |
| { |
| "epoch": 32.7897955617683, |
| "grad_norm": 0.4468280076980591, |
| "learning_rate": 0.00020664685314685313, |
| "loss": 3.1434, |
| "step": 112600 |
| }, |
| { |
| "epoch": 32.804356689382026, |
| "grad_norm": 0.5315858125686646, |
| "learning_rate": 0.00020647202797202796, |
| "loss": 3.1315, |
| "step": 112650 |
| }, |
| { |
| "epoch": 32.81891781699575, |
| "grad_norm": 0.47643396258354187, |
| "learning_rate": 0.00020629720279720278, |
| "loss": 3.1412, |
| "step": 112700 |
| }, |
| { |
| "epoch": 32.83347894460947, |
| "grad_norm": 0.47981470823287964, |
| "learning_rate": 0.0002061223776223776, |
| "loss": 3.1482, |
| "step": 112750 |
| }, |
| { |
| "epoch": 32.84804007222319, |
| "grad_norm": 0.4537067115306854, |
| "learning_rate": 0.00020594755244755244, |
| "loss": 3.1273, |
| "step": 112800 |
| }, |
| { |
| "epoch": 32.86260119983692, |
| "grad_norm": 0.4843168556690216, |
| "learning_rate": 0.00020577272727272726, |
| "loss": 3.1317, |
| "step": 112850 |
| }, |
| { |
| "epoch": 32.87716232745064, |
| "grad_norm": 0.4961332678794861, |
| "learning_rate": 0.0002055979020979021, |
| "loss": 3.1494, |
| "step": 112900 |
| }, |
| { |
| "epoch": 32.89172345506436, |
| "grad_norm": 0.46178942918777466, |
| "learning_rate": 0.0002054230769230769, |
| "loss": 3.1392, |
| "step": 112950 |
| }, |
| { |
| "epoch": 32.90628458267808, |
| "grad_norm": 0.4781860113143921, |
| "learning_rate": 0.00020524825174825172, |
| "loss": 3.1413, |
| "step": 113000 |
| }, |
| { |
| "epoch": 32.90628458267808, |
| "eval_accuracy": 0.3756620028503386, |
| "eval_loss": 3.535982847213745, |
| "eval_runtime": 82.041, |
| "eval_samples_per_second": 202.838, |
| "eval_steps_per_second": 12.689, |
| "step": 113000 |
| }, |
| { |
| "epoch": 32.92084571029181, |
| "grad_norm": 0.4798736274242401, |
| "learning_rate": 0.00020507342657342655, |
| "loss": 3.1342, |
| "step": 113050 |
| }, |
| { |
| "epoch": 32.935406837905525, |
| "grad_norm": 0.46386483311653137, |
| "learning_rate": 0.00020489860139860137, |
| "loss": 3.1379, |
| "step": 113100 |
| }, |
| { |
| "epoch": 32.94996796551925, |
| "grad_norm": 0.5032864212989807, |
| "learning_rate": 0.0002047237762237762, |
| "loss": 3.1313, |
| "step": 113150 |
| }, |
| { |
| "epoch": 32.964529093132974, |
| "grad_norm": 0.48214972019195557, |
| "learning_rate": 0.00020454895104895103, |
| "loss": 3.1359, |
| "step": 113200 |
| }, |
| { |
| "epoch": 32.97909022074669, |
| "grad_norm": 0.4960155487060547, |
| "learning_rate": 0.00020437412587412588, |
| "loss": 3.1328, |
| "step": 113250 |
| }, |
| { |
| "epoch": 32.993651348360416, |
| "grad_norm": 0.46663808822631836, |
| "learning_rate": 0.0002041993006993007, |
| "loss": 3.1385, |
| "step": 113300 |
| }, |
| { |
| "epoch": 33.008154231463685, |
| "grad_norm": 0.46300604939460754, |
| "learning_rate": 0.0002040244755244755, |
| "loss": 3.09, |
| "step": 113350 |
| }, |
| { |
| "epoch": 33.02271535907741, |
| "grad_norm": 0.48827603459358215, |
| "learning_rate": 0.00020384965034965033, |
| "loss": 3.0522, |
| "step": 113400 |
| }, |
| { |
| "epoch": 33.03727648669113, |
| "grad_norm": 0.49160000681877136, |
| "learning_rate": 0.00020367482517482516, |
| "loss": 3.0605, |
| "step": 113450 |
| }, |
| { |
| "epoch": 33.05183761430485, |
| "grad_norm": 0.4797455370426178, |
| "learning_rate": 0.00020349999999999999, |
| "loss": 3.0587, |
| "step": 113500 |
| }, |
| { |
| "epoch": 33.066398741918576, |
| "grad_norm": 0.48172780871391296, |
| "learning_rate": 0.0002033251748251748, |
| "loss": 3.0662, |
| "step": 113550 |
| }, |
| { |
| "epoch": 33.08095986953229, |
| "grad_norm": 0.48839229345321655, |
| "learning_rate": 0.00020315034965034964, |
| "loss": 3.0706, |
| "step": 113600 |
| }, |
| { |
| "epoch": 33.09552099714602, |
| "grad_norm": 0.46855488419532776, |
| "learning_rate": 0.00020297552447552447, |
| "loss": 3.0711, |
| "step": 113650 |
| }, |
| { |
| "epoch": 33.11008212475974, |
| "grad_norm": 0.49039342999458313, |
| "learning_rate": 0.00020280069930069927, |
| "loss": 3.0657, |
| "step": 113700 |
| }, |
| { |
| "epoch": 33.12464325237347, |
| "grad_norm": 0.4812520444393158, |
| "learning_rate": 0.0002026258741258741, |
| "loss": 3.0816, |
| "step": 113750 |
| }, |
| { |
| "epoch": 33.139204379987184, |
| "grad_norm": 0.471444308757782, |
| "learning_rate": 0.00020245104895104892, |
| "loss": 3.0761, |
| "step": 113800 |
| }, |
| { |
| "epoch": 33.15376550760091, |
| "grad_norm": 0.46630987524986267, |
| "learning_rate": 0.00020227622377622375, |
| "loss": 3.0683, |
| "step": 113850 |
| }, |
| { |
| "epoch": 33.16832663521463, |
| "grad_norm": 0.4628123342990875, |
| "learning_rate": 0.00020210139860139857, |
| "loss": 3.0725, |
| "step": 113900 |
| }, |
| { |
| "epoch": 33.18288776282835, |
| "grad_norm": 0.4704195261001587, |
| "learning_rate": 0.00020192657342657343, |
| "loss": 3.0857, |
| "step": 113950 |
| }, |
| { |
| "epoch": 33.197448890442075, |
| "grad_norm": 0.4793620705604553, |
| "learning_rate": 0.00020175174825174825, |
| "loss": 3.0919, |
| "step": 114000 |
| }, |
| { |
| "epoch": 33.197448890442075, |
| "eval_accuracy": 0.3749032610023742, |
| "eval_loss": 3.554394483566284, |
| "eval_runtime": 81.9221, |
| "eval_samples_per_second": 203.132, |
| "eval_steps_per_second": 12.707, |
| "step": 114000 |
| }, |
| { |
| "epoch": 33.2120100180558, |
| "grad_norm": 0.49349385499954224, |
| "learning_rate": 0.00020157692307692308, |
| "loss": 3.0829, |
| "step": 114050 |
| }, |
| { |
| "epoch": 33.22657114566952, |
| "grad_norm": 0.4812479317188263, |
| "learning_rate": 0.00020140209790209788, |
| "loss": 3.0888, |
| "step": 114100 |
| }, |
| { |
| "epoch": 33.24113227328324, |
| "grad_norm": 0.48198625445365906, |
| "learning_rate": 0.0002012272727272727, |
| "loss": 3.0825, |
| "step": 114150 |
| }, |
| { |
| "epoch": 33.255693400896966, |
| "grad_norm": 0.45920151472091675, |
| "learning_rate": 0.00020105244755244753, |
| "loss": 3.088, |
| "step": 114200 |
| }, |
| { |
| "epoch": 33.27025452851069, |
| "grad_norm": 0.49331265687942505, |
| "learning_rate": 0.00020087762237762236, |
| "loss": 3.0968, |
| "step": 114250 |
| }, |
| { |
| "epoch": 33.28481565612441, |
| "grad_norm": 0.4696246385574341, |
| "learning_rate": 0.0002007027972027972, |
| "loss": 3.0868, |
| "step": 114300 |
| }, |
| { |
| "epoch": 33.29937678373813, |
| "grad_norm": 0.5038855075836182, |
| "learning_rate": 0.00020052797202797201, |
| "loss": 3.0902, |
| "step": 114350 |
| }, |
| { |
| "epoch": 33.31393791135186, |
| "grad_norm": 0.48368892073631287, |
| "learning_rate": 0.00020035314685314684, |
| "loss": 3.086, |
| "step": 114400 |
| }, |
| { |
| "epoch": 33.328499038965575, |
| "grad_norm": 0.4968721568584442, |
| "learning_rate": 0.00020017832167832164, |
| "loss": 3.0891, |
| "step": 114450 |
| }, |
| { |
| "epoch": 33.3430601665793, |
| "grad_norm": 0.480759859085083, |
| "learning_rate": 0.00020000349650349647, |
| "loss": 3.0882, |
| "step": 114500 |
| }, |
| { |
| "epoch": 33.357621294193024, |
| "grad_norm": 0.4783109724521637, |
| "learning_rate": 0.0001998286713286713, |
| "loss": 3.0992, |
| "step": 114550 |
| }, |
| { |
| "epoch": 33.37218242180675, |
| "grad_norm": 0.4845009744167328, |
| "learning_rate": 0.00019965384615384612, |
| "loss": 3.1017, |
| "step": 114600 |
| }, |
| { |
| "epoch": 33.386743549420466, |
| "grad_norm": 0.5239719152450562, |
| "learning_rate": 0.00019947902097902098, |
| "loss": 3.0969, |
| "step": 114650 |
| }, |
| { |
| "epoch": 33.40130467703419, |
| "grad_norm": 0.4676482081413269, |
| "learning_rate": 0.0001993041958041958, |
| "loss": 3.1102, |
| "step": 114700 |
| }, |
| { |
| "epoch": 33.415865804647915, |
| "grad_norm": 0.475463330745697, |
| "learning_rate": 0.00019912937062937063, |
| "loss": 3.1028, |
| "step": 114750 |
| }, |
| { |
| "epoch": 33.43042693226163, |
| "grad_norm": 0.4781501889228821, |
| "learning_rate": 0.00019895454545454546, |
| "loss": 3.1087, |
| "step": 114800 |
| }, |
| { |
| "epoch": 33.44498805987536, |
| "grad_norm": 0.4624377489089966, |
| "learning_rate": 0.00019877972027972026, |
| "loss": 3.112, |
| "step": 114850 |
| }, |
| { |
| "epoch": 33.45954918748908, |
| "grad_norm": 0.48185843229293823, |
| "learning_rate": 0.00019860489510489508, |
| "loss": 3.1035, |
| "step": 114900 |
| }, |
| { |
| "epoch": 33.4741103151028, |
| "grad_norm": 0.47139590978622437, |
| "learning_rate": 0.0001984300699300699, |
| "loss": 3.1053, |
| "step": 114950 |
| }, |
| { |
| "epoch": 33.48867144271652, |
| "grad_norm": 0.4681786298751831, |
| "learning_rate": 0.00019825524475524474, |
| "loss": 3.0985, |
| "step": 115000 |
| }, |
| { |
| "epoch": 33.48867144271652, |
| "eval_accuracy": 0.3752184234562714, |
| "eval_loss": 3.5457003116607666, |
| "eval_runtime": 82.3635, |
| "eval_samples_per_second": 202.043, |
| "eval_steps_per_second": 12.639, |
| "step": 115000 |
| }, |
| { |
| "epoch": 33.50323257033025, |
| "grad_norm": 0.4925929009914398, |
| "learning_rate": 0.00019808041958041956, |
| "loss": 3.1113, |
| "step": 115050 |
| }, |
| { |
| "epoch": 33.51779369794397, |
| "grad_norm": 0.49513986706733704, |
| "learning_rate": 0.0001979055944055944, |
| "loss": 3.1084, |
| "step": 115100 |
| }, |
| { |
| "epoch": 33.53235482555769, |
| "grad_norm": 0.4670472741127014, |
| "learning_rate": 0.00019773076923076922, |
| "loss": 3.11, |
| "step": 115150 |
| }, |
| { |
| "epoch": 33.546915953171414, |
| "grad_norm": 0.44348880648612976, |
| "learning_rate": 0.00019755594405594402, |
| "loss": 3.1139, |
| "step": 115200 |
| }, |
| { |
| "epoch": 33.56147708078514, |
| "grad_norm": 0.4951223134994507, |
| "learning_rate": 0.00019738111888111884, |
| "loss": 3.1175, |
| "step": 115250 |
| }, |
| { |
| "epoch": 33.576038208398856, |
| "grad_norm": 0.47863635420799255, |
| "learning_rate": 0.0001972062937062937, |
| "loss": 3.1246, |
| "step": 115300 |
| }, |
| { |
| "epoch": 33.59059933601258, |
| "grad_norm": 0.47658371925354004, |
| "learning_rate": 0.00019703146853146852, |
| "loss": 3.1215, |
| "step": 115350 |
| }, |
| { |
| "epoch": 33.605160463626305, |
| "grad_norm": 0.479282408952713, |
| "learning_rate": 0.00019685664335664335, |
| "loss": 3.1058, |
| "step": 115400 |
| }, |
| { |
| "epoch": 33.61972159124002, |
| "grad_norm": 0.46034684777259827, |
| "learning_rate": 0.00019668181818181818, |
| "loss": 3.1254, |
| "step": 115450 |
| }, |
| { |
| "epoch": 33.63428271885375, |
| "grad_norm": 0.47371000051498413, |
| "learning_rate": 0.000196506993006993, |
| "loss": 3.1129, |
| "step": 115500 |
| }, |
| { |
| "epoch": 33.64884384646747, |
| "grad_norm": 0.4657459259033203, |
| "learning_rate": 0.00019633216783216783, |
| "loss": 3.1081, |
| "step": 115550 |
| }, |
| { |
| "epoch": 33.663404974081196, |
| "grad_norm": 0.4571855366230011, |
| "learning_rate": 0.00019615734265734263, |
| "loss": 3.1203, |
| "step": 115600 |
| }, |
| { |
| "epoch": 33.67796610169491, |
| "grad_norm": 0.4966122806072235, |
| "learning_rate": 0.00019598251748251746, |
| "loss": 3.1224, |
| "step": 115650 |
| }, |
| { |
| "epoch": 33.69252722930864, |
| "grad_norm": 0.45418232679367065, |
| "learning_rate": 0.00019580769230769228, |
| "loss": 3.1248, |
| "step": 115700 |
| }, |
| { |
| "epoch": 33.70708835692236, |
| "grad_norm": 0.49161285161972046, |
| "learning_rate": 0.0001956328671328671, |
| "loss": 3.1106, |
| "step": 115750 |
| }, |
| { |
| "epoch": 33.72164948453608, |
| "grad_norm": 0.48708125948905945, |
| "learning_rate": 0.00019545804195804194, |
| "loss": 3.1185, |
| "step": 115800 |
| }, |
| { |
| "epoch": 33.736210612149804, |
| "grad_norm": 0.4417417049407959, |
| "learning_rate": 0.00019528321678321676, |
| "loss": 3.1241, |
| "step": 115850 |
| }, |
| { |
| "epoch": 33.75077173976353, |
| "grad_norm": 0.47574418783187866, |
| "learning_rate": 0.0001951083916083916, |
| "loss": 3.1222, |
| "step": 115900 |
| }, |
| { |
| "epoch": 33.765332867377246, |
| "grad_norm": 0.4897342920303345, |
| "learning_rate": 0.0001949335664335664, |
| "loss": 3.1263, |
| "step": 115950 |
| }, |
| { |
| "epoch": 33.77989399499097, |
| "grad_norm": 0.4595036506652832, |
| "learning_rate": 0.00019475874125874124, |
| "loss": 3.1181, |
| "step": 116000 |
| }, |
| { |
| "epoch": 33.77989399499097, |
| "eval_accuracy": 0.37571562750667337, |
| "eval_loss": 3.539067268371582, |
| "eval_runtime": 82.0565, |
| "eval_samples_per_second": 202.799, |
| "eval_steps_per_second": 12.686, |
| "step": 116000 |
| }, |
| { |
| "epoch": 33.794455122604695, |
| "grad_norm": 0.49908018112182617, |
| "learning_rate": 0.00019458391608391607, |
| "loss": 3.1136, |
| "step": 116050 |
| }, |
| { |
| "epoch": 33.80901625021842, |
| "grad_norm": 0.467189759016037, |
| "learning_rate": 0.0001944090909090909, |
| "loss": 3.1207, |
| "step": 116100 |
| }, |
| { |
| "epoch": 33.82357737783214, |
| "grad_norm": 0.48708781599998474, |
| "learning_rate": 0.00019423426573426573, |
| "loss": 3.1262, |
| "step": 116150 |
| }, |
| { |
| "epoch": 33.83813850544586, |
| "grad_norm": 0.4589572548866272, |
| "learning_rate": 0.00019405944055944055, |
| "loss": 3.1188, |
| "step": 116200 |
| }, |
| { |
| "epoch": 33.852699633059586, |
| "grad_norm": 0.4717656672000885, |
| "learning_rate": 0.00019388461538461538, |
| "loss": 3.1333, |
| "step": 116250 |
| }, |
| { |
| "epoch": 33.867260760673304, |
| "grad_norm": 0.4619591534137726, |
| "learning_rate": 0.0001937097902097902, |
| "loss": 3.1307, |
| "step": 116300 |
| }, |
| { |
| "epoch": 33.88182188828703, |
| "grad_norm": 0.47739994525909424, |
| "learning_rate": 0.000193534965034965, |
| "loss": 3.1276, |
| "step": 116350 |
| }, |
| { |
| "epoch": 33.89638301590075, |
| "grad_norm": 0.47352170944213867, |
| "learning_rate": 0.00019336013986013983, |
| "loss": 3.1263, |
| "step": 116400 |
| }, |
| { |
| "epoch": 33.91094414351447, |
| "grad_norm": 0.46669456362724304, |
| "learning_rate": 0.00019318531468531466, |
| "loss": 3.1517, |
| "step": 116450 |
| }, |
| { |
| "epoch": 33.925505271128195, |
| "grad_norm": 0.4683612883090973, |
| "learning_rate": 0.00019301048951048949, |
| "loss": 3.1409, |
| "step": 116500 |
| }, |
| { |
| "epoch": 33.94006639874192, |
| "grad_norm": 0.4722287356853485, |
| "learning_rate": 0.0001928356643356643, |
| "loss": 3.1373, |
| "step": 116550 |
| }, |
| { |
| "epoch": 33.954627526355644, |
| "grad_norm": 0.4885099232196808, |
| "learning_rate": 0.00019266083916083914, |
| "loss": 3.1347, |
| "step": 116600 |
| }, |
| { |
| "epoch": 33.96918865396936, |
| "grad_norm": 0.4621324837207794, |
| "learning_rate": 0.00019248601398601397, |
| "loss": 3.1271, |
| "step": 116650 |
| }, |
| { |
| "epoch": 33.983749781583086, |
| "grad_norm": 0.47383829951286316, |
| "learning_rate": 0.00019231118881118882, |
| "loss": 3.1439, |
| "step": 116700 |
| }, |
| { |
| "epoch": 33.99831090919681, |
| "grad_norm": 0.47833409905433655, |
| "learning_rate": 0.00019213636363636362, |
| "loss": 3.1246, |
| "step": 116750 |
| }, |
| { |
| "epoch": 34.01281379230007, |
| "grad_norm": 0.4776931405067444, |
| "learning_rate": 0.00019196153846153845, |
| "loss": 3.0536, |
| "step": 116800 |
| }, |
| { |
| "epoch": 34.0273749199138, |
| "grad_norm": 0.49985092878341675, |
| "learning_rate": 0.00019178671328671327, |
| "loss": 3.0531, |
| "step": 116850 |
| }, |
| { |
| "epoch": 34.04193604752752, |
| "grad_norm": 0.5092440247535706, |
| "learning_rate": 0.0001916118881118881, |
| "loss": 3.054, |
| "step": 116900 |
| }, |
| { |
| "epoch": 34.056497175141246, |
| "grad_norm": 0.4726572334766388, |
| "learning_rate": 0.00019143706293706293, |
| "loss": 3.0574, |
| "step": 116950 |
| }, |
| { |
| "epoch": 34.07105830275496, |
| "grad_norm": 0.45419245958328247, |
| "learning_rate": 0.00019126223776223775, |
| "loss": 3.0525, |
| "step": 117000 |
| }, |
| { |
| "epoch": 34.07105830275496, |
| "eval_accuracy": 0.37497240858554265, |
| "eval_loss": 3.552489995956421, |
| "eval_runtime": 82.0135, |
| "eval_samples_per_second": 202.905, |
| "eval_steps_per_second": 12.693, |
| "step": 117000 |
| }, |
| { |
| "epoch": 34.08561943036869, |
| "grad_norm": 0.48084136843681335, |
| "learning_rate": 0.00019108741258741258, |
| "loss": 3.0684, |
| "step": 117050 |
| }, |
| { |
| "epoch": 34.10018055798241, |
| "grad_norm": 0.48191818594932556, |
| "learning_rate": 0.00019091258741258738, |
| "loss": 3.0553, |
| "step": 117100 |
| }, |
| { |
| "epoch": 34.11474168559613, |
| "grad_norm": 0.46842437982559204, |
| "learning_rate": 0.0001907377622377622, |
| "loss": 3.0641, |
| "step": 117150 |
| }, |
| { |
| "epoch": 34.129302813209854, |
| "grad_norm": 0.4803396165370941, |
| "learning_rate": 0.00019056293706293703, |
| "loss": 3.0601, |
| "step": 117200 |
| }, |
| { |
| "epoch": 34.14386394082358, |
| "grad_norm": 0.49113401770591736, |
| "learning_rate": 0.00019038811188811186, |
| "loss": 3.0648, |
| "step": 117250 |
| }, |
| { |
| "epoch": 34.1584250684373, |
| "grad_norm": 0.4804093837738037, |
| "learning_rate": 0.0001902132867132867, |
| "loss": 3.0732, |
| "step": 117300 |
| }, |
| { |
| "epoch": 34.17298619605102, |
| "grad_norm": 0.477255642414093, |
| "learning_rate": 0.00019003846153846151, |
| "loss": 3.0673, |
| "step": 117350 |
| }, |
| { |
| "epoch": 34.187547323664745, |
| "grad_norm": 0.486612468957901, |
| "learning_rate": 0.00018986363636363637, |
| "loss": 3.08, |
| "step": 117400 |
| }, |
| { |
| "epoch": 34.20210845127847, |
| "grad_norm": 0.4927060008049011, |
| "learning_rate": 0.0001896888111888112, |
| "loss": 3.079, |
| "step": 117450 |
| }, |
| { |
| "epoch": 34.21666957889219, |
| "grad_norm": 0.47757139801979065, |
| "learning_rate": 0.000189513986013986, |
| "loss": 3.0703, |
| "step": 117500 |
| }, |
| { |
| "epoch": 34.23123070650591, |
| "grad_norm": 0.49288758635520935, |
| "learning_rate": 0.00018933916083916082, |
| "loss": 3.0753, |
| "step": 117550 |
| }, |
| { |
| "epoch": 34.245791834119636, |
| "grad_norm": 0.4877185821533203, |
| "learning_rate": 0.00018916433566433565, |
| "loss": 3.0877, |
| "step": 117600 |
| }, |
| { |
| "epoch": 34.26035296173335, |
| "grad_norm": 0.5010328888893127, |
| "learning_rate": 0.00018898951048951048, |
| "loss": 3.0706, |
| "step": 117650 |
| }, |
| { |
| "epoch": 34.27491408934708, |
| "grad_norm": 0.5213559865951538, |
| "learning_rate": 0.0001888146853146853, |
| "loss": 3.0727, |
| "step": 117700 |
| }, |
| { |
| "epoch": 34.2894752169608, |
| "grad_norm": 0.5111045837402344, |
| "learning_rate": 0.00018863986013986013, |
| "loss": 3.0819, |
| "step": 117750 |
| }, |
| { |
| "epoch": 34.30403634457453, |
| "grad_norm": 0.5128031969070435, |
| "learning_rate": 0.00018846503496503496, |
| "loss": 3.0835, |
| "step": 117800 |
| }, |
| { |
| "epoch": 34.318597472188245, |
| "grad_norm": 0.49285659193992615, |
| "learning_rate": 0.00018829020979020976, |
| "loss": 3.0768, |
| "step": 117850 |
| }, |
| { |
| "epoch": 34.33315859980197, |
| "grad_norm": 0.5066632628440857, |
| "learning_rate": 0.00018811538461538458, |
| "loss": 3.092, |
| "step": 117900 |
| }, |
| { |
| "epoch": 34.347719727415694, |
| "grad_norm": 0.5188387036323547, |
| "learning_rate": 0.0001879405594405594, |
| "loss": 3.0718, |
| "step": 117950 |
| }, |
| { |
| "epoch": 34.36228085502941, |
| "grad_norm": 0.5478219985961914, |
| "learning_rate": 0.00018776573426573424, |
| "loss": 3.0902, |
| "step": 118000 |
| }, |
| { |
| "epoch": 34.36228085502941, |
| "eval_accuracy": 0.3748842101376237, |
| "eval_loss": 3.555130958557129, |
| "eval_runtime": 82.1427, |
| "eval_samples_per_second": 202.586, |
| "eval_steps_per_second": 12.673, |
| "step": 118000 |
| }, |
| { |
| "epoch": 34.376841982643136, |
| "grad_norm": 0.4986362159252167, |
| "learning_rate": 0.00018759090909090906, |
| "loss": 3.0867, |
| "step": 118050 |
| }, |
| { |
| "epoch": 34.39140311025686, |
| "grad_norm": 0.46725940704345703, |
| "learning_rate": 0.00018741608391608392, |
| "loss": 3.0952, |
| "step": 118100 |
| }, |
| { |
| "epoch": 34.40596423787058, |
| "grad_norm": 0.48414498567581177, |
| "learning_rate": 0.00018724125874125874, |
| "loss": 3.0864, |
| "step": 118150 |
| }, |
| { |
| "epoch": 34.4205253654843, |
| "grad_norm": 0.47394856810569763, |
| "learning_rate": 0.00018706643356643357, |
| "loss": 3.0867, |
| "step": 118200 |
| }, |
| { |
| "epoch": 34.43508649309803, |
| "grad_norm": 0.4806564152240753, |
| "learning_rate": 0.00018689160839160837, |
| "loss": 3.1011, |
| "step": 118250 |
| }, |
| { |
| "epoch": 34.44964762071175, |
| "grad_norm": 0.5065816640853882, |
| "learning_rate": 0.0001867167832167832, |
| "loss": 3.1005, |
| "step": 118300 |
| }, |
| { |
| "epoch": 34.46420874832547, |
| "grad_norm": 0.5356646776199341, |
| "learning_rate": 0.00018654195804195802, |
| "loss": 3.0966, |
| "step": 118350 |
| }, |
| { |
| "epoch": 34.47876987593919, |
| "grad_norm": 0.5453819036483765, |
| "learning_rate": 0.00018636713286713285, |
| "loss": 3.1138, |
| "step": 118400 |
| }, |
| { |
| "epoch": 34.49333100355292, |
| "grad_norm": 0.48635029792785645, |
| "learning_rate": 0.00018619230769230768, |
| "loss": 3.0934, |
| "step": 118450 |
| }, |
| { |
| "epoch": 34.507892131166635, |
| "grad_norm": 0.4848397970199585, |
| "learning_rate": 0.0001860174825174825, |
| "loss": 3.1004, |
| "step": 118500 |
| }, |
| { |
| "epoch": 34.52245325878036, |
| "grad_norm": 0.49055853486061096, |
| "learning_rate": 0.00018584265734265733, |
| "loss": 3.1014, |
| "step": 118550 |
| }, |
| { |
| "epoch": 34.537014386394084, |
| "grad_norm": 0.46897390484809875, |
| "learning_rate": 0.00018566783216783213, |
| "loss": 3.0912, |
| "step": 118600 |
| }, |
| { |
| "epoch": 34.5515755140078, |
| "grad_norm": 0.48043298721313477, |
| "learning_rate": 0.00018549300699300696, |
| "loss": 3.1118, |
| "step": 118650 |
| }, |
| { |
| "epoch": 34.566136641621526, |
| "grad_norm": 0.4745662808418274, |
| "learning_rate": 0.00018531818181818178, |
| "loss": 3.1093, |
| "step": 118700 |
| }, |
| { |
| "epoch": 34.58069776923525, |
| "grad_norm": 0.5236697793006897, |
| "learning_rate": 0.0001851433566433566, |
| "loss": 3.1051, |
| "step": 118750 |
| }, |
| { |
| "epoch": 34.595258896848975, |
| "grad_norm": 0.5072975754737854, |
| "learning_rate": 0.00018496853146853146, |
| "loss": 3.1151, |
| "step": 118800 |
| }, |
| { |
| "epoch": 34.60982002446269, |
| "grad_norm": 0.523888885974884, |
| "learning_rate": 0.0001847937062937063, |
| "loss": 3.0986, |
| "step": 118850 |
| }, |
| { |
| "epoch": 34.62438115207642, |
| "grad_norm": 0.4943288266658783, |
| "learning_rate": 0.00018461888111888112, |
| "loss": 3.1056, |
| "step": 118900 |
| }, |
| { |
| "epoch": 34.63894227969014, |
| "grad_norm": 0.47327354550361633, |
| "learning_rate": 0.00018444405594405594, |
| "loss": 3.1172, |
| "step": 118950 |
| }, |
| { |
| "epoch": 34.65350340730386, |
| "grad_norm": 0.48451825976371765, |
| "learning_rate": 0.00018426923076923074, |
| "loss": 3.1092, |
| "step": 119000 |
| }, |
| { |
| "epoch": 34.65350340730386, |
| "eval_accuracy": 0.37545314892566645, |
| "eval_loss": 3.547590494155884, |
| "eval_runtime": 82.0545, |
| "eval_samples_per_second": 202.804, |
| "eval_steps_per_second": 12.687, |
| "step": 119000 |
| }, |
| { |
| "epoch": 34.66806453491758, |
| "grad_norm": 0.4731082618236542, |
| "learning_rate": 0.00018409440559440557, |
| "loss": 3.109, |
| "step": 119050 |
| }, |
| { |
| "epoch": 34.68262566253131, |
| "grad_norm": 0.48922571539878845, |
| "learning_rate": 0.0001839195804195804, |
| "loss": 3.1054, |
| "step": 119100 |
| }, |
| { |
| "epoch": 34.697186790145025, |
| "grad_norm": 0.4867154061794281, |
| "learning_rate": 0.00018374475524475523, |
| "loss": 3.1189, |
| "step": 119150 |
| }, |
| { |
| "epoch": 34.71174791775875, |
| "grad_norm": 0.4935893416404724, |
| "learning_rate": 0.00018356993006993005, |
| "loss": 3.1059, |
| "step": 119200 |
| }, |
| { |
| "epoch": 34.726309045372474, |
| "grad_norm": 0.4788350760936737, |
| "learning_rate": 0.00018339510489510488, |
| "loss": 3.1146, |
| "step": 119250 |
| }, |
| { |
| "epoch": 34.7408701729862, |
| "grad_norm": 0.4787976145744324, |
| "learning_rate": 0.0001832202797202797, |
| "loss": 3.1168, |
| "step": 119300 |
| }, |
| { |
| "epoch": 34.755431300599916, |
| "grad_norm": 0.49792295694351196, |
| "learning_rate": 0.0001830454545454545, |
| "loss": 3.1044, |
| "step": 119350 |
| }, |
| { |
| "epoch": 34.76999242821364, |
| "grad_norm": 0.47856009006500244, |
| "learning_rate": 0.00018287062937062933, |
| "loss": 3.1247, |
| "step": 119400 |
| }, |
| { |
| "epoch": 34.784553555827365, |
| "grad_norm": 0.5033456087112427, |
| "learning_rate": 0.00018269580419580419, |
| "loss": 3.1165, |
| "step": 119450 |
| }, |
| { |
| "epoch": 34.79911468344108, |
| "grad_norm": 0.48007580637931824, |
| "learning_rate": 0.000182520979020979, |
| "loss": 3.1226, |
| "step": 119500 |
| }, |
| { |
| "epoch": 34.81367581105481, |
| "grad_norm": 0.4864267110824585, |
| "learning_rate": 0.00018234615384615384, |
| "loss": 3.1217, |
| "step": 119550 |
| }, |
| { |
| "epoch": 34.82823693866853, |
| "grad_norm": 0.5009084939956665, |
| "learning_rate": 0.00018217132867132867, |
| "loss": 3.1256, |
| "step": 119600 |
| }, |
| { |
| "epoch": 34.842798066282256, |
| "grad_norm": 0.473192423582077, |
| "learning_rate": 0.0001819965034965035, |
| "loss": 3.1286, |
| "step": 119650 |
| }, |
| { |
| "epoch": 34.857359193895974, |
| "grad_norm": 0.5062655210494995, |
| "learning_rate": 0.00018182167832167832, |
| "loss": 3.1213, |
| "step": 119700 |
| }, |
| { |
| "epoch": 34.8719203215097, |
| "grad_norm": 0.479464054107666, |
| "learning_rate": 0.00018164685314685312, |
| "loss": 3.1263, |
| "step": 119750 |
| }, |
| { |
| "epoch": 34.88648144912342, |
| "grad_norm": 0.502382218837738, |
| "learning_rate": 0.00018147202797202795, |
| "loss": 3.1265, |
| "step": 119800 |
| }, |
| { |
| "epoch": 34.90104257673714, |
| "grad_norm": 0.46601811051368713, |
| "learning_rate": 0.00018129720279720277, |
| "loss": 3.1314, |
| "step": 119850 |
| }, |
| { |
| "epoch": 34.915603704350865, |
| "grad_norm": 0.5134609937667847, |
| "learning_rate": 0.0001811223776223776, |
| "loss": 3.1261, |
| "step": 119900 |
| }, |
| { |
| "epoch": 34.93016483196459, |
| "grad_norm": 0.47554293274879456, |
| "learning_rate": 0.00018094755244755243, |
| "loss": 3.1209, |
| "step": 119950 |
| }, |
| { |
| "epoch": 34.94472595957831, |
| "grad_norm": 0.4887586534023285, |
| "learning_rate": 0.00018077272727272725, |
| "loss": 3.1216, |
| "step": 120000 |
| }, |
| { |
| "epoch": 34.94472595957831, |
| "eval_accuracy": 0.37557239322725294, |
| "eval_loss": 3.542292356491089, |
| "eval_runtime": 82.0006, |
| "eval_samples_per_second": 202.937, |
| "eval_steps_per_second": 12.695, |
| "step": 120000 |
| }, |
| { |
| "epoch": 34.95928708719203, |
| "grad_norm": 0.48519790172576904, |
| "learning_rate": 0.00018059790209790208, |
| "loss": 3.1235, |
| "step": 120050 |
| }, |
| { |
| "epoch": 34.973848214805756, |
| "grad_norm": 0.49591171741485596, |
| "learning_rate": 0.00018042307692307688, |
| "loss": 3.1318, |
| "step": 120100 |
| }, |
| { |
| "epoch": 34.98840934241948, |
| "grad_norm": 0.497721403837204, |
| "learning_rate": 0.00018024825174825176, |
| "loss": 3.1261, |
| "step": 120150 |
| }, |
| { |
| "epoch": 35.00291222552274, |
| "grad_norm": 0.5038352012634277, |
| "learning_rate": 0.00018007342657342656, |
| "loss": 3.1049, |
| "step": 120200 |
| }, |
| { |
| "epoch": 35.01747335313647, |
| "grad_norm": 0.493927925825119, |
| "learning_rate": 0.0001798986013986014, |
| "loss": 3.0353, |
| "step": 120250 |
| }, |
| { |
| "epoch": 35.03203448075019, |
| "grad_norm": 0.4847087860107422, |
| "learning_rate": 0.00017972377622377621, |
| "loss": 3.0336, |
| "step": 120300 |
| }, |
| { |
| "epoch": 35.04659560836391, |
| "grad_norm": 0.49554508924484253, |
| "learning_rate": 0.00017954895104895104, |
| "loss": 3.0357, |
| "step": 120350 |
| }, |
| { |
| "epoch": 35.06115673597763, |
| "grad_norm": 0.4976358413696289, |
| "learning_rate": 0.00017937412587412587, |
| "loss": 3.0533, |
| "step": 120400 |
| }, |
| { |
| "epoch": 35.07571786359136, |
| "grad_norm": 0.5121445655822754, |
| "learning_rate": 0.0001791993006993007, |
| "loss": 3.0374, |
| "step": 120450 |
| }, |
| { |
| "epoch": 35.09027899120508, |
| "grad_norm": 0.4897436797618866, |
| "learning_rate": 0.0001790244755244755, |
| "loss": 3.0558, |
| "step": 120500 |
| }, |
| { |
| "epoch": 35.1048401188188, |
| "grad_norm": 0.46646207571029663, |
| "learning_rate": 0.00017884965034965032, |
| "loss": 3.065, |
| "step": 120550 |
| }, |
| { |
| "epoch": 35.119401246432524, |
| "grad_norm": 0.48043006658554077, |
| "learning_rate": 0.00017867482517482515, |
| "loss": 3.0624, |
| "step": 120600 |
| }, |
| { |
| "epoch": 35.13396237404625, |
| "grad_norm": 0.4928785264492035, |
| "learning_rate": 0.00017849999999999997, |
| "loss": 3.0575, |
| "step": 120650 |
| }, |
| { |
| "epoch": 35.148523501659966, |
| "grad_norm": 0.49768656492233276, |
| "learning_rate": 0.0001783251748251748, |
| "loss": 3.0631, |
| "step": 120700 |
| }, |
| { |
| "epoch": 35.16308462927369, |
| "grad_norm": 0.4867604076862335, |
| "learning_rate": 0.00017815034965034963, |
| "loss": 3.067, |
| "step": 120750 |
| }, |
| { |
| "epoch": 35.177645756887415, |
| "grad_norm": 0.5165238380432129, |
| "learning_rate": 0.00017797552447552446, |
| "loss": 3.0754, |
| "step": 120800 |
| }, |
| { |
| "epoch": 35.19220688450113, |
| "grad_norm": 0.476970911026001, |
| "learning_rate": 0.0001778006993006993, |
| "loss": 3.0705, |
| "step": 120850 |
| }, |
| { |
| "epoch": 35.20676801211486, |
| "grad_norm": 0.48118260502815247, |
| "learning_rate": 0.00017762587412587414, |
| "loss": 3.075, |
| "step": 120900 |
| }, |
| { |
| "epoch": 35.22132913972858, |
| "grad_norm": 0.5257270336151123, |
| "learning_rate": 0.00017745104895104894, |
| "loss": 3.08, |
| "step": 120950 |
| }, |
| { |
| "epoch": 35.235890267342306, |
| "grad_norm": 0.49592307209968567, |
| "learning_rate": 0.00017727622377622376, |
| "loss": 3.0606, |
| "step": 121000 |
| }, |
| { |
| "epoch": 35.235890267342306, |
| "eval_accuracy": 0.37511105654567134, |
| "eval_loss": 3.5528719425201416, |
| "eval_runtime": 81.9973, |
| "eval_samples_per_second": 202.946, |
| "eval_steps_per_second": 12.696, |
| "step": 121000 |
| }, |
| { |
| "epoch": 35.25045139495602, |
| "grad_norm": 0.4846612811088562, |
| "learning_rate": 0.0001771013986013986, |
| "loss": 3.0619, |
| "step": 121050 |
| }, |
| { |
| "epoch": 35.26501252256975, |
| "grad_norm": 0.482929527759552, |
| "learning_rate": 0.00017692657342657342, |
| "loss": 3.087, |
| "step": 121100 |
| }, |
| { |
| "epoch": 35.27957365018347, |
| "grad_norm": 0.4884685277938843, |
| "learning_rate": 0.00017675174825174824, |
| "loss": 3.074, |
| "step": 121150 |
| }, |
| { |
| "epoch": 35.29413477779719, |
| "grad_norm": 0.500960111618042, |
| "learning_rate": 0.00017657692307692307, |
| "loss": 3.0729, |
| "step": 121200 |
| }, |
| { |
| "epoch": 35.308695905410914, |
| "grad_norm": 0.5140550136566162, |
| "learning_rate": 0.0001764020979020979, |
| "loss": 3.0799, |
| "step": 121250 |
| }, |
| { |
| "epoch": 35.32325703302464, |
| "grad_norm": 0.47481632232666016, |
| "learning_rate": 0.0001762272727272727, |
| "loss": 3.0766, |
| "step": 121300 |
| }, |
| { |
| "epoch": 35.337818160638356, |
| "grad_norm": 0.4896376430988312, |
| "learning_rate": 0.00017605244755244752, |
| "loss": 3.0806, |
| "step": 121350 |
| }, |
| { |
| "epoch": 35.35237928825208, |
| "grad_norm": 0.49121615290641785, |
| "learning_rate": 0.00017587762237762235, |
| "loss": 3.0722, |
| "step": 121400 |
| }, |
| { |
| "epoch": 35.366940415865805, |
| "grad_norm": 0.47697511315345764, |
| "learning_rate": 0.00017570279720279718, |
| "loss": 3.0929, |
| "step": 121450 |
| }, |
| { |
| "epoch": 35.38150154347953, |
| "grad_norm": 0.500864565372467, |
| "learning_rate": 0.000175527972027972, |
| "loss": 3.0876, |
| "step": 121500 |
| }, |
| { |
| "epoch": 35.39606267109325, |
| "grad_norm": 0.48108971118927, |
| "learning_rate": 0.00017535314685314686, |
| "loss": 3.0797, |
| "step": 121550 |
| }, |
| { |
| "epoch": 35.41062379870697, |
| "grad_norm": 0.506229817867279, |
| "learning_rate": 0.00017517832167832168, |
| "loss": 3.0859, |
| "step": 121600 |
| }, |
| { |
| "epoch": 35.425184926320696, |
| "grad_norm": 0.5014274716377258, |
| "learning_rate": 0.0001750034965034965, |
| "loss": 3.0791, |
| "step": 121650 |
| }, |
| { |
| "epoch": 35.439746053934414, |
| "grad_norm": 0.5198182463645935, |
| "learning_rate": 0.0001748286713286713, |
| "loss": 3.0919, |
| "step": 121700 |
| }, |
| { |
| "epoch": 35.45430718154814, |
| "grad_norm": 0.502521276473999, |
| "learning_rate": 0.00017465384615384614, |
| "loss": 3.0876, |
| "step": 121750 |
| }, |
| { |
| "epoch": 35.46886830916186, |
| "grad_norm": 0.5124614238739014, |
| "learning_rate": 0.00017447902097902096, |
| "loss": 3.0809, |
| "step": 121800 |
| }, |
| { |
| "epoch": 35.48342943677559, |
| "grad_norm": 0.5127264261245728, |
| "learning_rate": 0.0001743041958041958, |
| "loss": 3.0982, |
| "step": 121850 |
| }, |
| { |
| "epoch": 35.497990564389305, |
| "grad_norm": 0.4943045973777771, |
| "learning_rate": 0.00017412937062937062, |
| "loss": 3.0835, |
| "step": 121900 |
| }, |
| { |
| "epoch": 35.51255169200303, |
| "grad_norm": 0.512199878692627, |
| "learning_rate": 0.00017395454545454544, |
| "loss": 3.1013, |
| "step": 121950 |
| }, |
| { |
| "epoch": 35.527112819616754, |
| "grad_norm": 0.4812929332256317, |
| "learning_rate": 0.00017377972027972027, |
| "loss": 3.0835, |
| "step": 122000 |
| }, |
| { |
| "epoch": 35.527112819616754, |
| "eval_accuracy": 0.3755722756293224, |
| "eval_loss": 3.5473082065582275, |
| "eval_runtime": 82.0854, |
| "eval_samples_per_second": 202.728, |
| "eval_steps_per_second": 12.682, |
| "step": 122000 |
| }, |
| { |
| "epoch": 35.54167394723047, |
| "grad_norm": 0.4876304566860199, |
| "learning_rate": 0.00017360489510489507, |
| "loss": 3.087, |
| "step": 122050 |
| }, |
| { |
| "epoch": 35.556235074844196, |
| "grad_norm": 0.47825881838798523, |
| "learning_rate": 0.0001734300699300699, |
| "loss": 3.1018, |
| "step": 122100 |
| }, |
| { |
| "epoch": 35.57079620245792, |
| "grad_norm": 0.47563305497169495, |
| "learning_rate": 0.00017325524475524472, |
| "loss": 3.0955, |
| "step": 122150 |
| }, |
| { |
| "epoch": 35.58535733007164, |
| "grad_norm": 0.47859513759613037, |
| "learning_rate": 0.00017308041958041955, |
| "loss": 3.0818, |
| "step": 122200 |
| }, |
| { |
| "epoch": 35.59991845768536, |
| "grad_norm": 0.5417383909225464, |
| "learning_rate": 0.0001729055944055944, |
| "loss": 3.1043, |
| "step": 122250 |
| }, |
| { |
| "epoch": 35.61447958529909, |
| "grad_norm": 0.48571497201919556, |
| "learning_rate": 0.00017273076923076923, |
| "loss": 3.0955, |
| "step": 122300 |
| }, |
| { |
| "epoch": 35.62904071291281, |
| "grad_norm": 0.46994882822036743, |
| "learning_rate": 0.00017255594405594406, |
| "loss": 3.1058, |
| "step": 122350 |
| }, |
| { |
| "epoch": 35.64360184052653, |
| "grad_norm": 0.5115216374397278, |
| "learning_rate": 0.00017238111888111889, |
| "loss": 3.1144, |
| "step": 122400 |
| }, |
| { |
| "epoch": 35.65816296814025, |
| "grad_norm": 0.5126329660415649, |
| "learning_rate": 0.00017220629370629369, |
| "loss": 3.1074, |
| "step": 122450 |
| }, |
| { |
| "epoch": 35.67272409575398, |
| "grad_norm": 0.483478844165802, |
| "learning_rate": 0.0001720314685314685, |
| "loss": 3.1093, |
| "step": 122500 |
| }, |
| { |
| "epoch": 35.687285223367695, |
| "grad_norm": 0.5179030299186707, |
| "learning_rate": 0.00017185664335664334, |
| "loss": 3.0987, |
| "step": 122550 |
| }, |
| { |
| "epoch": 35.70184635098142, |
| "grad_norm": 0.4972935616970062, |
| "learning_rate": 0.00017168181818181817, |
| "loss": 3.1164, |
| "step": 122600 |
| }, |
| { |
| "epoch": 35.716407478595144, |
| "grad_norm": 0.4637240767478943, |
| "learning_rate": 0.000171506993006993, |
| "loss": 3.1103, |
| "step": 122650 |
| }, |
| { |
| "epoch": 35.73096860620886, |
| "grad_norm": 0.4902418553829193, |
| "learning_rate": 0.00017133216783216782, |
| "loss": 3.1156, |
| "step": 122700 |
| }, |
| { |
| "epoch": 35.745529733822586, |
| "grad_norm": 0.4804700016975403, |
| "learning_rate": 0.00017115734265734265, |
| "loss": 3.1112, |
| "step": 122750 |
| }, |
| { |
| "epoch": 35.76009086143631, |
| "grad_norm": 0.5173152089118958, |
| "learning_rate": 0.00017098251748251745, |
| "loss": 3.0996, |
| "step": 122800 |
| }, |
| { |
| "epoch": 35.774651989050035, |
| "grad_norm": 0.5122510194778442, |
| "learning_rate": 0.00017080769230769227, |
| "loss": 3.0999, |
| "step": 122850 |
| }, |
| { |
| "epoch": 35.78921311666375, |
| "grad_norm": 0.4938056468963623, |
| "learning_rate": 0.0001706328671328671, |
| "loss": 3.117, |
| "step": 122900 |
| }, |
| { |
| "epoch": 35.80377424427748, |
| "grad_norm": 0.493918776512146, |
| "learning_rate": 0.00017045804195804195, |
| "loss": 3.1049, |
| "step": 122950 |
| }, |
| { |
| "epoch": 35.8183353718912, |
| "grad_norm": 0.49973034858703613, |
| "learning_rate": 0.00017028321678321678, |
| "loss": 3.1028, |
| "step": 123000 |
| }, |
| { |
| "epoch": 35.8183353718912, |
| "eval_accuracy": 0.375704808497062, |
| "eval_loss": 3.5453245639801025, |
| "eval_runtime": 82.1224, |
| "eval_samples_per_second": 202.637, |
| "eval_steps_per_second": 12.676, |
| "step": 123000 |
| }, |
| { |
| "epoch": 35.8183353718912, |
| "step": 123000, |
| "total_flos": 2.57086667390976e+18, |
| "train_loss": 3.3376626631573934, |
| "train_runtime": 91204.18, |
| "train_samples_per_second": 150.592, |
| "train_steps_per_second": 1.883 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171700, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 17 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.57086667390976e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|