{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992228935207619,
  "eval_steps": 5000,
  "global_step": 415000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012038830042418818,
      "grad_norm": 8.390454292297363,
      "learning_rate": 5.910962367274216e-07,
      "loss": 4.9044,
      "step": 500
    },
    {
      "epoch": 0.0024077660084837636,
      "grad_norm": 12.908564567565918,
      "learning_rate": 1.1930272313581972e-06,
      "loss": 4.8297,
      "step": 1000
    },
    {
      "epoch": 0.0036116490127256454,
      "grad_norm": 16.473039627075195,
      "learning_rate": 1.7949582259889727e-06,
      "loss": 4.7877,
      "step": 1500
    },
    {
      "epoch": 0.004815532016967527,
      "grad_norm": 20.620445251464844,
      "learning_rate": 2.3968892206197483e-06,
      "loss": 4.6598,
      "step": 2000
    },
    {
      "epoch": 0.0060194150212094085,
      "grad_norm": 17.269424438476562,
      "learning_rate": 2.996412491272001e-06,
      "loss": 4.3792,
      "step": 2500
    },
    {
      "epoch": 0.007223298025451291,
      "grad_norm": 20.924165725708008,
      "learning_rate": 3.598343485902776e-06,
      "loss": 4.1505,
      "step": 3000
    },
    {
      "epoch": 0.008427181029693172,
      "grad_norm": 22.36298179626465,
      "learning_rate": 4.200274480533552e-06,
      "loss": 3.8286,
      "step": 3500
    },
    {
      "epoch": 0.009631064033935054,
      "grad_norm": 17.17203712463379,
      "learning_rate": 4.802205475164327e-06,
      "loss": 3.6786,
      "step": 4000
    },
    {
      "epoch": 0.010834947038176937,
      "grad_norm": 21.19932746887207,
      "learning_rate": 5.404136469795103e-06,
      "loss": 3.5698,
      "step": 4500
    },
    {
      "epoch": 0.012038830042418817,
      "grad_norm": 26.214515686035156,
      "learning_rate": 6.0060674644258785e-06,
      "loss": 3.5362,
      "step": 5000
    },
    {
      "epoch": 0.012038830042418817,
      "eval_runtime": 6118.3437,
      "eval_samples_per_second": 135.763,
      "eval_steps_per_second": 33.941,
      "step": 5000
    },
    {
      "epoch": 0.0132427130466607,
      "grad_norm": 20.36381721496582,
      "learning_rate": 6.6079984590566535e-06,
      "loss": 3.4738,
      "step": 5500
    },
    {
      "epoch": 0.014446596050902582,
      "grad_norm": 27.98621940612793,
      "learning_rate": 7.209929453687429e-06,
      "loss": 3.4793,
      "step": 6000
    },
    {
      "epoch": 0.015650479055144464,
      "grad_norm": 36.69669723510742,
      "learning_rate": 7.810656586328943e-06,
      "loss": 3.4226,
      "step": 6500
    },
    {
      "epoch": 0.016854362059386344,
      "grad_norm": 19.70973014831543,
      "learning_rate": 8.412587580959718e-06,
      "loss": 3.3237,
      "step": 7000
    },
    {
      "epoch": 0.018058245063628225,
      "grad_norm": 22.874879837036133,
      "learning_rate": 9.014518575590495e-06,
      "loss": 3.3729,
      "step": 7500
    },
    {
      "epoch": 0.01926212806787011,
      "grad_norm": 27.921424865722656,
      "learning_rate": 9.61644957022127e-06,
      "loss": 3.3806,
      "step": 8000
    },
    {
      "epoch": 0.02046601107211199,
      "grad_norm": 20.24401092529297,
      "learning_rate": 1.0218380564852045e-05,
      "loss": 3.3732,
      "step": 8500
    },
    {
      "epoch": 0.021669894076353873,
      "grad_norm": 24.8074893951416,
      "learning_rate": 1.0820311559482821e-05,
      "loss": 3.3689,
      "step": 9000
    },
    {
      "epoch": 0.022873777080595754,
      "grad_norm": 20.271554946899414,
      "learning_rate": 1.1422242554113596e-05,
      "loss": 3.3533,
      "step": 9500
    },
    {
      "epoch": 0.024077660084837634,
      "grad_norm": 25.907699584960938,
      "learning_rate": 1.2024173548744371e-05,
      "loss": 3.3256,
      "step": 10000
    },
    {
      "epoch": 0.024077660084837634,
      "eval_runtime": 5885.2151,
      "eval_samples_per_second": 141.141,
      "eval_steps_per_second": 35.285,
      "step": 10000
    },
    {
      "epoch": 0.025281543089079518,
      "grad_norm": 27.258106231689453,
      "learning_rate": 1.2624900681385886e-05,
      "loss": 3.3775,
      "step": 10500
    },
    {
      "epoch": 0.0264854260933214,
      "grad_norm": 34.32582473754883,
      "learning_rate": 1.3226831676016663e-05,
      "loss": 3.3426,
      "step": 11000
    },
    {
      "epoch": 0.02768930909756328,
      "grad_norm": 32.09712600708008,
      "learning_rate": 1.3828762670647438e-05,
      "loss": 3.3584,
      "step": 11500
    },
    {
      "epoch": 0.028893192101805163,
      "grad_norm": 21.120548248291016,
      "learning_rate": 1.4430693665278213e-05,
      "loss": 3.3117,
      "step": 12000
    },
    {
      "epoch": 0.030097075106047044,
      "grad_norm": 21.870174407958984,
      "learning_rate": 1.503262465990899e-05,
      "loss": 3.3063,
      "step": 12500
    },
    {
      "epoch": 0.03130095811028893,
      "grad_norm": 20.636926651000977,
      "learning_rate": 1.56333517925505e-05,
      "loss": 3.3181,
      "step": 13000
    },
    {
      "epoch": 0.03250484111453081,
      "grad_norm": 17.365741729736328,
      "learning_rate": 1.623528278718128e-05,
      "loss": 3.2712,
      "step": 13500
    },
    {
      "epoch": 0.03370872411877269,
      "grad_norm": 29.897491455078125,
      "learning_rate": 1.683721378181205e-05,
      "loss": 3.3179,
      "step": 14000
    },
    {
      "epoch": 0.03491260712301457,
      "grad_norm": 21.453102111816406,
      "learning_rate": 1.7439144776442828e-05,
      "loss": 3.3506,
      "step": 14500
    },
    {
      "epoch": 0.03611649012725645,
      "grad_norm": 27.832408905029297,
      "learning_rate": 1.8041075771073605e-05,
      "loss": 3.3097,
      "step": 15000
    },
    {
      "epoch": 0.03611649012725645,
      "eval_runtime": 6111.9913,
      "eval_samples_per_second": 135.904,
      "eval_steps_per_second": 33.976,
      "step": 15000
    },
    {
      "epoch": 0.03732037313149834,
      "grad_norm": 27.701480865478516,
      "learning_rate": 1.864180290371512e-05,
      "loss": 3.3058,
      "step": 15500
    },
    {
      "epoch": 0.03852425613574022,
      "grad_norm": 28.425107955932617,
      "learning_rate": 1.9243733898345895e-05,
      "loss": 3.3129,
      "step": 16000
    },
    {
      "epoch": 0.0397281391399821,
      "grad_norm": 20.096038818359375,
      "learning_rate": 1.9845664892976672e-05,
      "loss": 3.3126,
      "step": 16500
    },
    {
      "epoch": 0.04093202214422398,
      "grad_norm": 32.19493103027344,
      "learning_rate": 2.0447595887607445e-05,
      "loss": 3.2431,
      "step": 17000
    },
    {
      "epoch": 0.04213590514846586,
      "grad_norm": 27.442581176757812,
      "learning_rate": 2.1049526882238222e-05,
      "loss": 3.3009,
      "step": 17500
    },
    {
      "epoch": 0.043339788152707746,
      "grad_norm": 23.221601486206055,
      "learning_rate": 2.1650254014879736e-05,
      "loss": 3.3023,
      "step": 18000
    },
    {
      "epoch": 0.04454367115694963,
      "grad_norm": 20.713502883911133,
      "learning_rate": 2.225218500951051e-05,
      "loss": 3.2834,
      "step": 18500
    },
    {
      "epoch": 0.04574755416119151,
      "grad_norm": 26.963842391967773,
      "learning_rate": 2.2854116004141285e-05,
      "loss": 3.2512,
      "step": 19000
    },
    {
      "epoch": 0.04695143716543339,
      "grad_norm": 26.029918670654297,
      "learning_rate": 2.3456046998772062e-05,
      "loss": 3.2678,
      "step": 19500
    },
    {
      "epoch": 0.04815532016967527,
      "grad_norm": 24.989070892333984,
      "learning_rate": 2.405797799340284e-05,
      "loss": 3.2962,
      "step": 20000
    },
    {
      "epoch": 0.04815532016967527,
      "eval_runtime": 6189.8097,
      "eval_samples_per_second": 134.196,
      "eval_steps_per_second": 33.549,
      "step": 20000
    },
    {
      "epoch": 0.049359203173917156,
      "grad_norm": 31.154277801513672,
      "learning_rate": 2.4659908988033615e-05,
      "loss": 3.2434,
      "step": 20500
    },
    {
      "epoch": 0.050563086178159036,
      "grad_norm": 25.946931838989258,
      "learning_rate": 2.526183998266439e-05,
      "loss": 3.2261,
      "step": 21000
    },
    {
      "epoch": 0.05176696918240092,
      "grad_norm": 18.157636642456055,
      "learning_rate": 2.5863770977295165e-05,
      "loss": 3.2844,
      "step": 21500
    },
    {
      "epoch": 0.0529708521866428,
      "grad_norm": 21.17293930053711,
      "learning_rate": 2.6464498109936682e-05,
      "loss": 3.2154,
      "step": 22000
    },
    {
      "epoch": 0.05417473519088468,
      "grad_norm": 26.187950134277344,
      "learning_rate": 2.7066429104567452e-05,
      "loss": 3.2815,
      "step": 22500
    },
    {
      "epoch": 0.05537861819512656,
      "grad_norm": 27.726953506469727,
      "learning_rate": 2.766715623720897e-05,
      "loss": 3.2538,
      "step": 23000
    },
    {
      "epoch": 0.056582501199368446,
      "grad_norm": 23.557273864746094,
      "learning_rate": 2.8269087231839743e-05,
      "loss": 3.2708,
      "step": 23500
    },
    {
      "epoch": 0.057786384203610326,
      "grad_norm": 22.728757858276367,
      "learning_rate": 2.8869814364481256e-05,
      "loss": 3.2052,
      "step": 24000
    },
    {
      "epoch": 0.05899026720785221,
      "grad_norm": 22.837238311767578,
      "learning_rate": 2.9471745359112036e-05,
      "loss": 3.2341,
      "step": 24500
    },
    {
      "epoch": 0.06019415021209409,
      "grad_norm": 24.8745059967041,
      "learning_rate": 3.0073676353742806e-05,
      "loss": 3.2309,
      "step": 25000
    },
    {
      "epoch": 0.06019415021209409,
      "eval_runtime": 6182.4703,
      "eval_samples_per_second": 134.355,
      "eval_steps_per_second": 33.589,
      "step": 25000
    },
    {
      "epoch": 0.06139803321633597,
      "grad_norm": 26.50299835205078,
      "learning_rate": 3.0675607348373586e-05,
      "loss": 3.2289,
      "step": 25500
    },
    {
      "epoch": 0.06260191622057786,
      "grad_norm": 26.083864212036133,
      "learning_rate": 3.1277538343004356e-05,
      "loss": 3.2321,
      "step": 26000
    },
    {
      "epoch": 0.06380579922481973,
      "grad_norm": 34.2237434387207,
      "learning_rate": 3.187946933763514e-05,
      "loss": 3.2297,
      "step": 26500
    },
    {
      "epoch": 0.06500968222906162,
      "grad_norm": 18.643739700317383,
      "learning_rate": 3.248140033226591e-05,
      "loss": 3.2801,
      "step": 27000
    },
    {
      "epoch": 0.0662135652333035,
      "grad_norm": 16.629045486450195,
      "learning_rate": 3.3083331326896686e-05,
      "loss": 3.2924,
      "step": 27500
    },
    {
      "epoch": 0.06741744823754538,
      "grad_norm": 24.145009994506836,
      "learning_rate": 3.368526232152746e-05,
      "loss": 3.208,
      "step": 28000
    },
    {
      "epoch": 0.06862133124178726,
      "grad_norm": 27.239717483520508,
      "learning_rate": 3.428719331615824e-05,
      "loss": 3.2432,
      "step": 28500
    },
    {
      "epoch": 0.06982521424602914,
      "grad_norm": 19.675600051879883,
      "learning_rate": 3.488912431078901e-05,
      "loss": 3.2389,
      "step": 29000
    },
    {
      "epoch": 0.07102909725027103,
      "grad_norm": 26.076309204101562,
      "learning_rate": 3.549105530541979e-05,
      "loss": 3.2849,
      "step": 29500
    },
    {
      "epoch": 0.0722329802545129,
      "grad_norm": 26.58043670654297,
      "learning_rate": 3.60917824380613e-05,
      "loss": 3.2597,
      "step": 30000
    },
    {
      "epoch": 0.0722329802545129,
      "eval_runtime": 6142.9637,
      "eval_samples_per_second": 135.219,
      "eval_steps_per_second": 33.805,
      "step": 30000
    },
    {
      "epoch": 0.07343686325875479,
      "grad_norm": 28.792818069458008,
      "learning_rate": 3.6693713432692076e-05,
      "loss": 3.2583,
      "step": 30500
    },
    {
      "epoch": 0.07464074626299667,
      "grad_norm": 15.474958419799805,
      "learning_rate": 3.72944405653336e-05,
      "loss": 3.2516,
      "step": 31000
    },
    {
      "epoch": 0.07584462926723855,
      "grad_norm": 22.783601760864258,
      "learning_rate": 3.7896371559964367e-05,
      "loss": 3.252,
      "step": 31500
    },
    {
      "epoch": 0.07704851227148043,
      "grad_norm": 30.097503662109375,
      "learning_rate": 3.849830255459514e-05,
      "loss": 3.2607,
      "step": 32000
    },
    {
      "epoch": 0.07825239527572231,
      "grad_norm": 16.432775497436523,
      "learning_rate": 3.910023354922592e-05,
      "loss": 3.2281,
      "step": 32500
    },
    {
      "epoch": 0.0794562782799642,
      "grad_norm": 22.113264083862305,
      "learning_rate": 3.9702164543856697e-05,
      "loss": 3.1994,
      "step": 33000
    },
    {
      "epoch": 0.08066016128420608,
      "grad_norm": 23.033098220825195,
      "learning_rate": 4.0304095538487466e-05,
      "loss": 3.2641,
      "step": 33500
    },
    {
      "epoch": 0.08186404428844796,
      "grad_norm": 22.20917510986328,
      "learning_rate": 4.090602653311825e-05,
      "loss": 3.2382,
      "step": 34000
    },
    {
      "epoch": 0.08306792729268984,
      "grad_norm": 29.848487854003906,
      "learning_rate": 4.150795752774902e-05,
      "loss": 3.2067,
      "step": 34500
    },
    {
      "epoch": 0.08427181029693172,
      "grad_norm": 25.557289123535156,
      "learning_rate": 4.2108684660390533e-05,
      "loss": 3.1953,
      "step": 35000
    },
    {
      "epoch": 0.08427181029693172,
      "eval_runtime": 6095.2446,
      "eval_samples_per_second": 136.278,
      "eval_steps_per_second": 34.07,
      "step": 35000
    },
    {
      "epoch": 0.0854756933011736,
      "grad_norm": 20.05970001220703,
      "learning_rate": 4.271061565502131e-05,
      "loss": 3.2184,
      "step": 35500
    },
    {
      "epoch": 0.08667957630541549,
      "grad_norm": 20.745208740234375,
      "learning_rate": 4.3311342787662824e-05,
      "loss": 3.2016,
      "step": 36000
    },
    {
      "epoch": 0.08788345930965737,
      "grad_norm": 24.379274368286133,
      "learning_rate": 4.39132737822936e-05,
      "loss": 3.1851,
      "step": 36500
    },
    {
      "epoch": 0.08908734231389925,
      "grad_norm": 14.577661514282227,
      "learning_rate": 4.451520477692438e-05,
      "loss": 3.2191,
      "step": 37000
    },
    {
      "epoch": 0.09029122531814113,
      "grad_norm": 19.822919845581055,
      "learning_rate": 4.5117135771555154e-05,
      "loss": 3.1865,
      "step": 37500
    },
    {
      "epoch": 0.09149510832238301,
      "grad_norm": 20.753366470336914,
      "learning_rate": 4.5719066766185924e-05,
      "loss": 3.238,
      "step": 38000
    },
    {
      "epoch": 0.0926989913266249,
      "grad_norm": 16.455045700073242,
      "learning_rate": 4.63209977608167e-05,
      "loss": 3.2118,
      "step": 38500
    },
    {
      "epoch": 0.09390287433086678,
      "grad_norm": 22.421308517456055,
      "learning_rate": 4.692052103146896e-05,
      "loss": 3.2192,
      "step": 39000
    },
    {
      "epoch": 0.09510675733510866,
      "grad_norm": 18.18105125427246,
      "learning_rate": 4.752245202609973e-05,
      "loss": 3.2,
      "step": 39500
    },
    {
      "epoch": 0.09631064033935054,
      "grad_norm": 14.840981483459473,
      "learning_rate": 4.8124383020730504e-05,
      "loss": 3.1897,
      "step": 40000
    },
    {
      "epoch": 0.09631064033935054,
      "eval_runtime": 6142.3167,
      "eval_samples_per_second": 135.233,
      "eval_steps_per_second": 33.808,
      "step": 40000
    },
    {
      "epoch": 0.09751452334359242,
      "grad_norm": 16.446571350097656,
      "learning_rate": 4.872631401536128e-05,
      "loss": 3.2723,
      "step": 40500
    },
    {
      "epoch": 0.09871840634783431,
      "grad_norm": 32.157161712646484,
      "learning_rate": 4.932824500999206e-05,
      "loss": 3.1675,
      "step": 41000
    },
    {
      "epoch": 0.09992228935207619,
      "grad_norm": 18.431787490844727,
      "learning_rate": 4.993017600462283e-05,
      "loss": 3.2205,
      "step": 41500
    },
    {
      "epoch": 0.10112617235631807,
      "grad_norm": 21.929658889770508,
      "learning_rate": 4.994087573470594e-05,
      "loss": 3.1701,
      "step": 42000
    },
    {
      "epoch": 0.10233005536055995,
      "grad_norm": 20.648868560791016,
      "learning_rate": 4.987399308165837e-05,
      "loss": 3.1764,
      "step": 42500
    },
    {
      "epoch": 0.10353393836480183,
      "grad_norm": 20.37611198425293,
      "learning_rate": 4.98071104286108e-05,
      "loss": 3.1684,
      "step": 43000
    },
    {
      "epoch": 0.10473782136904371,
      "grad_norm": 15.72383975982666,
      "learning_rate": 4.974036154086932e-05,
      "loss": 3.1698,
      "step": 43500
    },
    {
      "epoch": 0.1059417043732856,
      "grad_norm": 17.074922561645508,
      "learning_rate": 4.967347888782174e-05,
      "loss": 3.1913,
      "step": 44000
    },
    {
      "epoch": 0.10714558737752748,
      "grad_norm": 16.70379066467285,
      "learning_rate": 4.9606596234774164e-05,
      "loss": 3.2184,
      "step": 44500
    },
    {
      "epoch": 0.10834947038176936,
      "grad_norm": 12.970040321350098,
      "learning_rate": 4.9539713581726594e-05,
      "loss": 3.1826,
      "step": 45000
    },
    {
      "epoch": 0.10834947038176936,
      "eval_runtime": 6114.7803,
      "eval_samples_per_second": 135.842,
      "eval_steps_per_second": 33.961,
      "step": 45000
    },
    {
      "epoch": 0.10955335338601124,
      "grad_norm": 17.909025192260742,
      "learning_rate": 4.9472830928679016e-05,
      "loss": 3.175,
      "step": 45500
    },
    {
      "epoch": 0.11075723639025312,
      "grad_norm": 17.254976272583008,
      "learning_rate": 4.9405948275631445e-05,
      "loss": 3.1959,
      "step": 46000
    },
    {
      "epoch": 0.111961119394495,
      "grad_norm": 17.655014038085938,
      "learning_rate": 4.9339199387889964e-05,
      "loss": 3.194,
      "step": 46500
    },
    {
      "epoch": 0.11316500239873689,
      "grad_norm": 18.48583984375,
      "learning_rate": 4.9272316734842386e-05,
      "loss": 3.19,
      "step": 47000
    },
    {
      "epoch": 0.11436888540297876,
      "grad_norm": 13.14960765838623,
      "learning_rate": 4.920556784710091e-05,
      "loss": 3.159,
      "step": 47500
    },
    {
      "epoch": 0.11557276840722065,
      "grad_norm": 13.100486755371094,
      "learning_rate": 4.913868519405333e-05,
      "loss": 3.197,
      "step": 48000
    },
    {
      "epoch": 0.11677665141146253,
      "grad_norm": 20.155548095703125,
      "learning_rate": 4.9071802541005756e-05,
      "loss": 3.192,
      "step": 48500
    },
    {
      "epoch": 0.11798053441570441,
      "grad_norm": 16.038169860839844,
      "learning_rate": 4.9004919887958185e-05,
      "loss": 3.1575,
      "step": 49000
    },
    {
      "epoch": 0.1191844174199463,
      "grad_norm": 22.394641876220703,
      "learning_rate": 4.893803723491061e-05,
      "loss": 3.1787,
      "step": 49500
    },
    {
      "epoch": 0.12038830042418817,
      "grad_norm": 18.06816864013672,
      "learning_rate": 4.887115458186303e-05,
      "loss": 3.138,
      "step": 50000
    },
    {
      "epoch": 0.12038830042418817,
      "eval_runtime": 6098.0137,
      "eval_samples_per_second": 136.216,
      "eval_steps_per_second": 34.054,
      "step": 50000
    },
    {
      "epoch": 0.12159218342843006,
      "grad_norm": 25.612211227416992,
      "learning_rate": 4.880427192881546e-05,
      "loss": 3.1291,
      "step": 50500
    },
    {
      "epoch": 0.12279606643267194,
      "grad_norm": 23.026065826416016,
      "learning_rate": 4.873738927576788e-05,
      "loss": 3.1429,
      "step": 51000
    },
    {
      "epoch": 0.12399994943691382,
      "grad_norm": 17.348302841186523,
      "learning_rate": 4.86706403880264e-05,
      "loss": 3.208,
      "step": 51500
    },
    {
      "epoch": 0.1252038324411557,
      "grad_norm": 25.699726104736328,
      "learning_rate": 4.860375773497883e-05,
      "loss": 3.1744,
      "step": 52000
    },
    {
      "epoch": 0.12640771544539758,
      "grad_norm": 20.069185256958008,
      "learning_rate": 4.853687508193125e-05,
      "loss": 3.1429,
      "step": 52500
    },
    {
      "epoch": 0.12761159844963946,
      "grad_norm": 15.095413208007812,
      "learning_rate": 4.8469992428883674e-05,
      "loss": 3.1767,
      "step": 53000
    },
    {
      "epoch": 0.12881548145388136,
      "grad_norm": 12.854238510131836,
      "learning_rate": 4.84031097758361e-05,
      "loss": 3.1726,
      "step": 53500
    },
    {
      "epoch": 0.13001936445812323,
      "grad_norm": 18.715251922607422,
      "learning_rate": 4.833622712278853e-05,
      "loss": 3.1745,
      "step": 54000
    },
    {
      "epoch": 0.1312232474623651,
      "grad_norm": 16.41513442993164,
      "learning_rate": 4.8269344469740955e-05,
      "loss": 3.163,
      "step": 54500
    },
    {
      "epoch": 0.132427130466607,
      "grad_norm": 16.754322052001953,
      "learning_rate": 4.820246181669338e-05,
      "loss": 3.2186,
      "step": 55000
    },
    {
      "epoch": 0.132427130466607,
      "eval_runtime": 6080.0844,
      "eval_samples_per_second": 136.617,
      "eval_steps_per_second": 34.154,
      "step": 55000
    },
    {
      "epoch": 0.13363101347084888,
      "grad_norm": 21.774137496948242,
      "learning_rate": 4.8135712928951896e-05,
      "loss": 3.1601,
      "step": 55500
    },
    {
      "epoch": 0.13483489647509075,
      "grad_norm": 15.781363487243652,
      "learning_rate": 4.8068830275904325e-05,
      "loss": 3.1762,
      "step": 56000
    },
    {
      "epoch": 0.13603877947933263,
      "grad_norm": 15.000273704528809,
      "learning_rate": 4.800194762285675e-05,
      "loss": 3.1576,
      "step": 56500
    },
    {
      "epoch": 0.13724266248357453,
      "grad_norm": 16.367721557617188,
      "learning_rate": 4.7935064969809176e-05,
      "loss": 3.1373,
      "step": 57000
    },
    {
      "epoch": 0.1384465454878164,
      "grad_norm": 14.089083671569824,
      "learning_rate": 4.7868316082067695e-05,
      "loss": 3.1886,
      "step": 57500
    },
    {
      "epoch": 0.13965042849205828,
      "grad_norm": 15.01375675201416,
      "learning_rate": 4.780143342902012e-05,
      "loss": 3.1041,
      "step": 58000
    },
    {
      "epoch": 0.14085431149630018,
      "grad_norm": 15.857519149780273,
      "learning_rate": 4.773455077597254e-05,
      "loss": 3.157,
      "step": 58500
    },
    {
      "epoch": 0.14205819450054205,
      "grad_norm": 15.649994850158691,
      "learning_rate": 4.7667801888231065e-05,
      "loss": 3.1307,
      "step": 59000
    },
    {
      "epoch": 0.14326207750478392,
      "grad_norm": 17.55912208557129,
      "learning_rate": 4.760091923518349e-05,
      "loss": 3.1491,
      "step": 59500
    },
    {
      "epoch": 0.1444659605090258,
      "grad_norm": 39.259796142578125,
      "learning_rate": 4.753403658213592e-05,
      "loss": 3.156,
      "step": 60000
    },
    {
      "epoch": 0.1444659605090258,
      "eval_runtime": 6118.114,
      "eval_samples_per_second": 135.768,
      "eval_steps_per_second": 33.942,
      "step": 60000
    },
    {
      "epoch": 0.1456698435132677,
      "grad_norm": 14.720120429992676,
      "learning_rate": 4.746715392908834e-05,
      "loss": 3.1288,
      "step": 60500
    },
    {
      "epoch": 0.14687372651750957,
      "grad_norm": 17.65592384338379,
      "learning_rate": 4.740027127604076e-05,
      "loss": 3.1324,
      "step": 61000
    },
    {
      "epoch": 0.14807760952175145,
      "grad_norm": 22.232887268066406,
      "learning_rate": 4.7333388622993184e-05,
      "loss": 3.1047,
      "step": 61500
    },
    {
      "epoch": 0.14928149252599335,
      "grad_norm": 13.700913429260254,
      "learning_rate": 4.726650596994561e-05,
      "loss": 3.1472,
      "step": 62000
    },
    {
      "epoch": 0.15048537553023522,
      "grad_norm": 12.120200157165527,
      "learning_rate": 4.719962331689804e-05,
      "loss": 3.1314,
      "step": 62500
    },
    {
      "epoch": 0.1516892585344771,
      "grad_norm": 18.1831111907959,
      "learning_rate": 4.713287442915656e-05,
      "loss": 3.1473,
      "step": 63000
    },
    {
      "epoch": 0.152893141538719,
      "grad_norm": 16.345584869384766,
      "learning_rate": 4.706599177610898e-05,
      "loss": 3.0957,
      "step": 63500
    },
    {
      "epoch": 0.15409702454296087,
      "grad_norm": 12.382160186767578,
      "learning_rate": 4.6999109123061405e-05,
      "loss": 3.1352,
      "step": 64000
    },
    {
      "epoch": 0.15530090754720274,
      "grad_norm": 11.769241333007812,
      "learning_rate": 4.6932226470013834e-05,
      "loss": 3.1241,
      "step": 64500
    },
    {
      "epoch": 0.15650479055144462,
      "grad_norm": 16.600685119628906,
      "learning_rate": 4.686547758227235e-05,
      "loss": 3.1001,
      "step": 65000
    },
    {
      "epoch": 0.15650479055144462,
      "eval_runtime": 6099.2,
      "eval_samples_per_second": 136.189,
      "eval_steps_per_second": 34.047,
      "step": 65000
    },
    {
      "epoch": 0.15770867355568652,
      "grad_norm": 27.690532684326172,
      "learning_rate": 4.6798594929224775e-05,
      "loss": 3.1427,
      "step": 65500
    },
    {
      "epoch": 0.1589125565599284,
      "grad_norm": 16.452138900756836,
      "learning_rate": 4.6731712276177205e-05,
      "loss": 3.1489,
      "step": 66000
    },
    {
      "epoch": 0.16011643956417027,
      "grad_norm": 22.97121238708496,
      "learning_rate": 4.666482962312963e-05,
      "loss": 3.1554,
      "step": 66500
    },
    {
      "epoch": 0.16132032256841217,
      "grad_norm": 17.040058135986328,
      "learning_rate": 4.659808073538815e-05,
      "loss": 3.1619,
      "step": 67000
    },
    {
      "epoch": 0.16252420557265404,
      "grad_norm": 15.484249114990234,
      "learning_rate": 4.653119808234057e-05,
      "loss": 3.1277,
      "step": 67500
    },
    {
      "epoch": 0.1637280885768959,
      "grad_norm": 14.11099910736084,
      "learning_rate": 4.6464315429293e-05,
      "loss": 3.1736,
      "step": 68000
    },
    {
      "epoch": 0.16493197158113782,
      "grad_norm": 13.42932415008545,
      "learning_rate": 4.6397432776245426e-05,
      "loss": 3.1114,
      "step": 68500
    },
    {
      "epoch": 0.1661358545853797,
      "grad_norm": 14.439802169799805,
      "learning_rate": 4.633055012319785e-05,
      "loss": 3.1526,
      "step": 69000
    },
    {
      "epoch": 0.16733973758962156,
      "grad_norm": 19.759899139404297,
      "learning_rate": 4.626366747015027e-05,
      "loss": 3.13,
      "step": 69500
    },
    {
      "epoch": 0.16854362059386344,
      "grad_norm": 14.265944480895996,
      "learning_rate": 4.61967848171027e-05,
      "loss": 3.1197,
      "step": 70000
    },
    {
      "epoch": 0.16854362059386344,
      "eval_runtime": 6159.4194,
      "eval_samples_per_second": 134.858,
      "eval_steps_per_second": 33.715,
      "step": 70000
    },
    {
      "epoch": 0.16974750359810534,
      "grad_norm": 14.379167556762695,
      "learning_rate": 4.612990216405512e-05,
      "loss": 3.1587,
      "step": 70500
    },
    {
      "epoch": 0.1709513866023472,
      "grad_norm": 16.7622013092041,
      "learning_rate": 4.606315327631364e-05,
      "loss": 3.1265,
      "step": 71000
    },
    {
      "epoch": 0.17215526960658908,
      "grad_norm": 16.946565628051758,
      "learning_rate": 4.599627062326607e-05,
      "loss": 3.1666,
      "step": 71500
    },
    {
      "epoch": 0.17335915261083099,
      "grad_norm": 15.118630409240723,
      "learning_rate": 4.592952173552459e-05,
      "loss": 3.1067,
      "step": 72000
    },
    {
      "epoch": 0.17456303561507286,
      "grad_norm": 13.015983581542969,
      "learning_rate": 4.586263908247702e-05,
      "loss": 3.0988,
      "step": 72500
    },
    {
      "epoch": 0.17576691861931473,
      "grad_norm": 16.574451446533203,
      "learning_rate": 4.579575642942944e-05,
      "loss": 3.1373,
      "step": 73000
    },
    {
      "epoch": 0.1769708016235566,
      "grad_norm": 13.634519577026367,
      "learning_rate": 4.572887377638186e-05,
      "loss": 3.1362,
      "step": 73500
    },
    {
      "epoch": 0.1781746846277985,
      "grad_norm": 19.195165634155273,
      "learning_rate": 4.566199112333429e-05,
      "loss": 3.0999,
      "step": 74000
    },
    {
      "epoch": 0.17937856763204038,
      "grad_norm": 16.080059051513672,
      "learning_rate": 4.5595108470286714e-05,
      "loss": 3.1615,
      "step": 74500
    },
    {
      "epoch": 0.18058245063628225,
      "grad_norm": 17.560720443725586,
      "learning_rate": 4.5528225817239137e-05,
      "loss": 3.1018,
      "step": 75000
    },
    {
      "epoch": 0.18058245063628225,
      "eval_runtime": 6167.2803,
      "eval_samples_per_second": 134.686,
      "eval_steps_per_second": 33.672,
      "step": 75000
    },
    {
      "epoch": 0.18178633364052416,
      "grad_norm": 14.580060005187988,
      "learning_rate": 4.5461343164191566e-05,
      "loss": 3.1417,
      "step": 75500
    },
    {
      "epoch": 0.18299021664476603,
      "grad_norm": 20.411680221557617,
      "learning_rate": 4.5394594276450084e-05,
      "loss": 3.1915,
      "step": 76000
    },
    {
      "epoch": 0.1841940996490079,
      "grad_norm": 24.153568267822266,
      "learning_rate": 4.53278453887086e-05,
      "loss": 3.129,
      "step": 76500
    },
    {
      "epoch": 0.1853979826532498,
      "grad_norm": 20.700895309448242,
      "learning_rate": 4.5260962735661025e-05,
      "loss": 3.1237,
      "step": 77000
    },
    {
      "epoch": 0.18660186565749168,
      "grad_norm": 19.473398208618164,
      "learning_rate": 4.5194080082613455e-05,
      "loss": 3.1593,
      "step": 77500
    },
    {
      "epoch": 0.18780574866173355,
      "grad_norm": 16.656599044799805,
      "learning_rate": 4.5127197429565884e-05,
      "loss": 3.1273,
      "step": 78000
    },
    {
      "epoch": 0.18900963166597542,
      "grad_norm": 17.83378028869629,
      "learning_rate": 4.5060314776518306e-05,
      "loss": 3.1454,
      "step": 78500
    },
    {
      "epoch": 0.19021351467021733,
      "grad_norm": 12.069873809814453,
      "learning_rate": 4.499343212347073e-05,
      "loss": 3.1567,
      "step": 79000
    },
    {
      "epoch": 0.1914173976744592,
      "grad_norm": 21.635231018066406,
      "learning_rate": 4.492654947042316e-05,
      "loss": 3.1063,
      "step": 79500
    },
    {
      "epoch": 0.19262128067870107,
      "grad_norm": 12.80612564086914,
      "learning_rate": 4.4859800582681676e-05,
      "loss": 3.1169,
      "step": 80000
    },
    {
      "epoch": 0.19262128067870107,
      "eval_runtime": 6089.7621,
      "eval_samples_per_second": 136.4,
      "eval_steps_per_second": 34.1,
      "step": 80000
    },
    {
      "epoch": 0.19382516368294297,
      "grad_norm": 15.964377403259277,
      "learning_rate": 4.47929179296341e-05,
      "loss": 3.1736,
      "step": 80500
    },
    {
      "epoch": 0.19502904668718485,
      "grad_norm": 33.44254684448242,
      "learning_rate": 4.472603527658653e-05,
      "loss": 3.1525,
      "step": 81000
    },
    {
      "epoch": 0.19623292969142672,
      "grad_norm": 13.991809844970703,
      "learning_rate": 4.465915262353895e-05,
      "loss": 3.1492,
      "step": 81500
    },
    {
      "epoch": 0.19743681269566862,
      "grad_norm": 12.851255416870117,
      "learning_rate": 4.4592403735797475e-05,
      "loss": 3.1153,
      "step": 82000
    },
    {
      "epoch": 0.1986406956999105,
      "grad_norm": 17.928274154663086,
      "learning_rate": 4.452552108274989e-05,
      "loss": 3.1518,
      "step": 82500
    },
    {
      "epoch": 0.19984457870415237,
      "grad_norm": 12.124229431152344,
      "learning_rate": 4.445863842970232e-05,
      "loss": 3.1087,
      "step": 83000
    },
    {
      "epoch": 0.20104846170839424,
      "grad_norm": 15.766402244567871,
      "learning_rate": 4.439175577665475e-05,
      "loss": 3.1327,
      "step": 83500
    },
    {
      "epoch": 0.20225234471263615,
      "grad_norm": 16.555757522583008,
      "learning_rate": 4.432487312360717e-05,
      "loss": 3.0765,
      "step": 84000
    },
    {
      "epoch": 0.20345622771687802,
      "grad_norm": 19.65941619873047,
      "learning_rate": 4.4257990470559594e-05,
      "loss": 3.1393,
      "step": 84500
    },
    {
      "epoch": 0.2046601107211199,
      "grad_norm": 16.987285614013672,
      "learning_rate": 4.419110781751202e-05,
      "loss": 3.1145,
      "step": 85000
    },
    {
      "epoch": 0.2046601107211199,
      "eval_runtime": 6166.7962,
      "eval_samples_per_second": 134.696,
      "eval_steps_per_second": 33.674,
      "step": 85000
    },
    {
      "epoch": 0.2058639937253618,
      "grad_norm": 14.441193580627441,
      "learning_rate": 4.4124225164464445e-05,
      "loss": 3.1662,
      "step": 85500
    },
    {
      "epoch": 0.20706787672960367,
      "grad_norm": 18.12236976623535,
      "learning_rate": 4.4057476276722964e-05,
      "loss": 3.0565,
      "step": 86000
    },
    {
      "epoch": 0.20827175973384554,
      "grad_norm": 12.500991821289062,
      "learning_rate": 4.399059362367539e-05,
      "loss": 3.1047,
      "step": 86500
    },
    {
      "epoch": 0.20947564273808741,
      "grad_norm": 16.244428634643555,
      "learning_rate": 4.3923710970627816e-05,
      "loss": 3.0893,
      "step": 87000
    },
    {
      "epoch": 0.21067952574232932,
      "grad_norm": 21.911731719970703,
      "learning_rate": 4.385682831758024e-05,
      "loss": 3.0743,
      "step": 87500
    },
    {
      "epoch": 0.2118834087465712,
      "grad_norm": 16.75537109375,
      "learning_rate": 4.378994566453267e-05,
      "loss": 3.1221,
      "step": 88000
    },
    {
      "epoch": 0.21308729175081306,
      "grad_norm": 20.14570426940918,
      "learning_rate": 4.3723063011485096e-05,
      "loss": 3.1413,
      "step": 88500
    },
    {
      "epoch": 0.21429117475505496,
      "grad_norm": 14.766070365905762,
      "learning_rate": 4.365618035843751e-05,
      "loss": 3.0955,
      "step": 89000
    },
    {
      "epoch": 0.21549505775929684,
      "grad_norm": 17.830801010131836,
      "learning_rate": 4.358929770538994e-05,
      "loss": 3.1517,
      "step": 89500
    },
    {
      "epoch": 0.2166989407635387,
      "grad_norm": 10.205118179321289,
      "learning_rate": 4.352254881764846e-05,
      "loss": 3.1332,
      "step": 90000
    },
    {
      "epoch": 0.2166989407635387,
      "eval_runtime": 6149.5749,
      "eval_samples_per_second": 135.074,
      "eval_steps_per_second": 33.769,
      "step": 90000
    },
    {
      "epoch": 0.2179028237677806,
      "grad_norm": 16.20384979248047,
      "learning_rate": 4.345566616460089e-05,
      "loss": 3.1003,
      "step": 90500
    },
    {
      "epoch": 0.21910670677202249,
      "grad_norm": 17.35607147216797,
      "learning_rate": 4.338878351155331e-05,
      "loss": 3.1193,
      "step": 91000
    },
    {
      "epoch": 0.22031058977626436,
      "grad_norm": 17.914997100830078,
      "learning_rate": 4.332190085850574e-05,
      "loss": 3.0944,
      "step": 91500
    },
    {
      "epoch": 0.22151447278050623,
      "grad_norm": 23.45078468322754,
      "learning_rate": 4.325515197076426e-05,
      "loss": 3.1518,
      "step": 92000
    },
    {
      "epoch": 0.22271835578474813,
      "grad_norm": 19.160053253173828,
      "learning_rate": 4.318826931771668e-05,
      "loss": 3.1144,
      "step": 92500
    },
    {
      "epoch": 0.22392223878899,
      "grad_norm": 16.796180725097656,
      "learning_rate": 4.312152042997521e-05,
      "loss": 3.1354,
      "step": 93000
    },
    {
      "epoch": 0.22512612179323188,
      "grad_norm": 13.598986625671387,
      "learning_rate": 4.305463777692762e-05,
      "loss": 3.0675,
      "step": 93500
    },
    {
      "epoch": 0.22633000479747378,
      "grad_norm": 16.168975830078125,
      "learning_rate": 4.298775512388005e-05,
      "loss": 3.1065,
      "step": 94000
    },
    {
      "epoch": 0.22753388780171566,
      "grad_norm": 22.480331420898438,
      "learning_rate": 4.292087247083248e-05,
      "loss": 3.1201,
      "step": 94500
    },
    {
      "epoch": 0.22873777080595753,
      "grad_norm": 16.593976974487305,
      "learning_rate": 4.28539898177849e-05,
      "loss": 3.1264,
      "step": 95000
    },
    {
      "epoch": 0.22873777080595753,
      "eval_runtime": 6100.46,
      "eval_samples_per_second": 136.161,
      "eval_steps_per_second": 34.04,
      "step": 95000
    },
    {
      "epoch": 0.2299416538101994,
      "grad_norm": 14.308032989501953,
      "learning_rate": 4.2787107164737325e-05,
      "loss": 3.1278,
      "step": 95500
    },
    {
      "epoch": 0.2311455368144413,
      "grad_norm": 13.68152141571045,
      "learning_rate": 4.272035827699585e-05,
      "loss": 3.1079,
      "step": 96000
    },
    {
      "epoch": 0.23234941981868318,
      "grad_norm": 15.30040454864502,
      "learning_rate": 4.265347562394827e-05,
      "loss": 3.0765,
      "step": 96500
    },
    {
      "epoch": 0.23355330282292505,
      "grad_norm": 17.36260223388672,
      "learning_rate": 4.2586592970900695e-05,
      "loss": 3.0966,
      "step": 97000
    },
    {
      "epoch": 0.23475718582716695,
      "grad_norm": 16.50679588317871,
      "learning_rate": 4.2519710317853125e-05,
      "loss": 3.1462,
      "step": 97500
    },
    {
      "epoch": 0.23596106883140883,
      "grad_norm": 15.678003311157227,
      "learning_rate": 4.245282766480555e-05,
      "loss": 3.1434,
      "step": 98000
    },
    {
      "epoch": 0.2371649518356507,
      "grad_norm": 14.652356147766113,
      "learning_rate": 4.238594501175797e-05,
      "loss": 3.1461,
      "step": 98500
    },
    {
      "epoch": 0.2383688348398926,
      "grad_norm": 13.707479476928711,
      "learning_rate": 4.23190623587104e-05,
      "loss": 3.0894,
      "step": 99000
    },
    {
      "epoch": 0.23957271784413448,
      "grad_norm": 22.13295555114746,
      "learning_rate": 4.225217970566282e-05,
      "loss": 3.1317,
      "step": 99500
    },
    {
      "epoch": 0.24077660084837635,
      "grad_norm": 14.54344367980957,
      "learning_rate": 4.2185430817921346e-05,
      "loss": 3.1209,
      "step": 100000
    },
    {
      "epoch": 0.24077660084837635,
      "eval_runtime": 6233.0245,
      "eval_samples_per_second": 133.265,
      "eval_steps_per_second": 33.316,
      "step": 100000
    },
    {
      "epoch": 0.24198048385261822,
      "grad_norm": 16.891630172729492,
      "learning_rate": 4.211854816487377e-05,
      "loss": 3.1261,
      "step": 100500
    },
    {
      "epoch": 0.24318436685686012,
      "grad_norm": 17.46337127685547,
      "learning_rate": 4.205166551182619e-05,
      "loss": 3.1625,
      "step": 101000
    },
    {
      "epoch": 0.244388249861102,
      "grad_norm": 14.349138259887695,
      "learning_rate": 4.1984916624084716e-05,
      "loss": 3.0834,
      "step": 101500
    },
    {
      "epoch": 0.24559213286534387,
      "grad_norm": 18.939817428588867,
      "learning_rate": 4.191803397103714e-05,
      "loss": 3.1521,
      "step": 102000
    },
    {
      "epoch": 0.24679601586958577,
      "grad_norm": 16.54868507385254,
      "learning_rate": 4.185115131798956e-05,
      "loss": 3.0694,
      "step": 102500
    },
    {
      "epoch": 0.24799989887382765,
      "grad_norm": 14.203706741333008,
      "learning_rate": 4.178426866494199e-05,
      "loss": 3.1201,
      "step": 103000
    },
    {
      "epoch": 0.24920378187806952,
      "grad_norm": 14.797431945800781,
      "learning_rate": 4.171738601189441e-05,
      "loss": 3.1252,
      "step": 103500
    },
    {
      "epoch": 0.2504076648823114,
      "grad_norm": 14.449517250061035,
      "learning_rate": 4.165063712415294e-05,
      "loss": 3.0932,
      "step": 104000
    },
    {
      "epoch": 0.25161154788655327,
      "grad_norm": 17.101430892944336,
      "learning_rate": 4.158375447110536e-05,
      "loss": 3.1127,
      "step": 104500
    },
    {
      "epoch": 0.25281543089079517,
      "grad_norm": 20.582412719726562,
      "learning_rate": 4.151700558336388e-05,
      "loss": 3.0675,
      "step": 105000
    },
    {
      "epoch": 0.25281543089079517,
      "eval_runtime": 6156.9182,
      "eval_samples_per_second": 134.912,
      "eval_steps_per_second": 33.728,
      "step": 105000
    },
    {
      "epoch": 0.25401931389503707,
      "grad_norm": 14.351494789123535,
      "learning_rate": 4.14501229303163e-05,
      "loss": 3.0845,
      "step": 105500
    },
    {
      "epoch": 0.2552231968992789,
      "grad_norm": 11.951766967773438,
      "learning_rate": 4.138324027726873e-05,
      "loss": 3.0907,
      "step": 106000
    },
    {
      "epoch": 0.2564270799035208,
      "grad_norm": 13.831068992614746,
      "learning_rate": 4.131635762422115e-05,
      "loss": 3.1139,
      "step": 106500
    },
    {
      "epoch": 0.2576309629077627,
      "grad_norm": 16.089948654174805,
      "learning_rate": 4.124947497117358e-05,
      "loss": 3.085,
      "step": 107000
    },
    {
      "epoch": 0.25883484591200456,
      "grad_norm": 16.427217483520508,
      "learning_rate": 4.1182592318126004e-05,
      "loss": 3.1444,
      "step": 107500
    },
    {
      "epoch": 0.26003872891624646,
      "grad_norm": 16.443748474121094,
      "learning_rate": 4.111570966507843e-05,
      "loss": 3.1197,
      "step": 108000
    },
    {
      "epoch": 0.26124261192048837,
      "grad_norm": 12.318251609802246,
      "learning_rate": 4.1048827012030856e-05,
      "loss": 3.0734,
      "step": 108500
    },
    {
      "epoch": 0.2624464949247302,
      "grad_norm": 13.695268630981445,
      "learning_rate": 4.098194435898328e-05,
      "loss": 3.1275,
      "step": 109000
    },
    {
      "epoch": 0.2636503779289721,
      "grad_norm": 15.07443904876709,
      "learning_rate": 4.09151954712418e-05,
      "loss": 3.097,
      "step": 109500
    },
    {
      "epoch": 0.264854260933214,
      "grad_norm": 15.240448951721191,
      "learning_rate": 4.0848312818194226e-05,
      "loss": 3.088,
      "step": 110000
    },
    {
      "epoch": 0.264854260933214,
      "eval_runtime": 6153.3584,
      "eval_samples_per_second": 134.991,
      "eval_steps_per_second": 33.748,
      "step": 110000
    },
    {
      "epoch": 0.26605814393745586,
      "grad_norm": 13.12667179107666,
      "learning_rate": 4.078250028759541e-05,
      "loss": 3.0962,
      "step": 110500
    },
    {
      "epoch": 0.26726202694169776,
      "grad_norm": 17.520675659179688,
      "learning_rate": 4.0715617634547834e-05,
      "loss": 3.0786,
      "step": 111000
    },
    {
      "epoch": 0.2684659099459396,
      "grad_norm": 27.284038543701172,
      "learning_rate": 4.064886874680636e-05,
      "loss": 3.1162,
      "step": 111500
    },
    {
      "epoch": 0.2696697929501815,
      "grad_norm": 12.623812675476074,
      "learning_rate": 4.0581986093758775e-05,
      "loss": 3.0993,
      "step": 112000
    },
    {
      "epoch": 0.2708736759544234,
      "grad_norm": 14.702446937561035,
      "learning_rate": 4.0515103440711204e-05,
      "loss": 3.0733,
      "step": 112500
    },
    {
      "epoch": 0.27207755895866526,
      "grad_norm": 16.056833267211914,
      "learning_rate": 4.0448220787663634e-05,
      "loss": 3.0788,
      "step": 113000
    },
    {
      "epoch": 0.27328144196290716,
      "grad_norm": 12.753098487854004,
      "learning_rate": 4.038147189992215e-05,
      "loss": 3.0991,
      "step": 113500
    },
    {
      "epoch": 0.27448532496714906,
      "grad_norm": 13.137269020080566,
      "learning_rate": 4.0314589246874575e-05,
      "loss": 3.0871,
      "step": 114000
    },
    {
      "epoch": 0.2756892079713909,
      "grad_norm": 15.072389602661133,
      "learning_rate": 4.0247706593827004e-05,
      "loss": 3.115,
      "step": 114500
    },
    {
      "epoch": 0.2768930909756328,
      "grad_norm": 15.979447364807129,
      "learning_rate": 4.0180823940779426e-05,
      "loss": 3.1002,
      "step": 115000
    },
    {
      "epoch": 0.2768930909756328,
      "eval_runtime": 6179.6049,
      "eval_samples_per_second": 134.417,
      "eval_steps_per_second": 33.604,
      "step": 115000
    },
    {
      "epoch": 0.2780969739798747,
      "grad_norm": 13.973761558532715,
      "learning_rate": 4.011394128773185e-05,
      "loss": 3.0706,
      "step": 115500
    },
    {
      "epoch": 0.27930085698411655,
      "grad_norm": 16.156885147094727,
      "learning_rate": 4.004705863468428e-05,
      "loss": 3.0595,
      "step": 116000
    },
    {
      "epoch": 0.28050473998835845,
      "grad_norm": 14.320749282836914,
      "learning_rate": 3.99801759816367e-05,
      "loss": 3.1083,
      "step": 116500
    },
    {
      "epoch": 0.28170862299260035,
      "grad_norm": 13.002079010009766,
      "learning_rate": 3.991329332858912e-05,
      "loss": 3.0554,
      "step": 117000
    },
    {
      "epoch": 0.2829125059968422,
      "grad_norm": 19.574172973632812,
      "learning_rate": 3.984654444084764e-05,
      "loss": 3.1074,
      "step": 117500
    },
    {
      "epoch": 0.2841163890010841,
      "grad_norm": 12.356159210205078,
      "learning_rate": 3.977966178780007e-05,
      "loss": 3.1215,
      "step": 118000
    },
    {
      "epoch": 0.285320272005326,
      "grad_norm": 17.327226638793945,
      "learning_rate": 3.97127791347525e-05,
      "loss": 3.047,
      "step": 118500
    },
    {
      "epoch": 0.28652415500956785,
      "grad_norm": 16.561124801635742,
      "learning_rate": 3.964589648170492e-05,
      "loss": 3.1006,
      "step": 119000
    },
    {
      "epoch": 0.28772803801380975,
      "grad_norm": 14.118390083312988,
      "learning_rate": 3.9579013828657344e-05,
      "loss": 3.08,
      "step": 119500
    },
    {
      "epoch": 0.2889319210180516,
      "grad_norm": 15.130383491516113,
      "learning_rate": 3.951213117560977e-05,
      "loss": 3.0229,
      "step": 120000
    },
    {
      "epoch": 0.2889319210180516,
      "eval_runtime": 6265.7809,
      "eval_samples_per_second": 132.568,
      "eval_steps_per_second": 33.142,
      "step": 120000
    },
    {
      "epoch": 0.2901358040222935,
      "grad_norm": 20.27661895751953,
      "learning_rate": 3.944538228786829e-05,
      "loss": 3.0565,
      "step": 120500
    },
    {
      "epoch": 0.2913396870265354,
      "grad_norm": 15.461856842041016,
      "learning_rate": 3.9378499634820714e-05,
      "loss": 3.0717,
      "step": 121000
    },
    {
      "epoch": 0.29254357003077724,
      "grad_norm": 17.019287109375,
      "learning_rate": 3.931161698177314e-05,
      "loss": 3.1387,
      "step": 121500
    },
    {
      "epoch": 0.29374745303501915,
      "grad_norm": 18.06890106201172,
      "learning_rate": 3.9244734328725566e-05,
      "loss": 3.1166,
      "step": 122000
    },
    {
      "epoch": 0.29495133603926105,
      "grad_norm": 31.920703887939453,
      "learning_rate": 3.917798544098409e-05,
      "loss": 3.095,
      "step": 122500
    },
    {
      "epoch": 0.2961552190435029,
      "grad_norm": 15.199366569519043,
      "learning_rate": 3.9111102787936507e-05,
      "loss": 3.0706,
      "step": 123000
    },
    {
      "epoch": 0.2973591020477448,
      "grad_norm": 15.413779258728027,
      "learning_rate": 3.9044220134888936e-05,
      "loss": 3.121,
      "step": 123500
    },
    {
      "epoch": 0.2985629850519867,
      "grad_norm": 14.4086275100708,
      "learning_rate": 3.8977337481841365e-05,
      "loss": 3.087,
      "step": 124000
    },
    {
      "epoch": 0.29976686805622854,
      "grad_norm": 12.95889663696289,
      "learning_rate": 3.891045482879379e-05,
      "loss": 3.0934,
      "step": 124500
    },
    {
      "epoch": 0.30097075106047044,
      "grad_norm": 19.025604248046875,
      "learning_rate": 3.884357217574621e-05,
      "loss": 3.1332,
      "step": 125000
    },
    {
      "epoch": 0.30097075106047044,
      "eval_runtime": 6218.4719,
      "eval_samples_per_second": 133.577,
      "eval_steps_per_second": 33.394,
      "step": 125000
    },
    {
      "epoch": 0.30217463406471234,
      "grad_norm": 14.700455665588379,
      "learning_rate": 3.877668952269864e-05,
      "loss": 3.0799,
      "step": 125500
    },
    {
      "epoch": 0.3033785170689542,
      "grad_norm": 15.362942695617676,
      "learning_rate": 3.870994063495716e-05,
      "loss": 3.0551,
      "step": 126000
    },
    {
      "epoch": 0.3045824000731961,
      "grad_norm": 18.218399047851562,
      "learning_rate": 3.864305798190958e-05,
      "loss": 3.0529,
      "step": 126500
    },
    {
      "epoch": 0.305786283077438,
      "grad_norm": 18.461824417114258,
      "learning_rate": 3.857617532886201e-05,
      "loss": 3.1065,
      "step": 127000
    },
    {
      "epoch": 0.30699016608167984,
      "grad_norm": 12.244810104370117,
      "learning_rate": 3.850929267581443e-05,
      "loss": 3.0844,
      "step": 127500
    },
    {
      "epoch": 0.30819404908592174,
      "grad_norm": 20.86441993713379,
      "learning_rate": 3.8442410022766854e-05,
      "loss": 3.0551,
      "step": 128000
    },
    {
      "epoch": 0.30939793209016364,
      "grad_norm": 16.215953826904297,
      "learning_rate": 3.837552736971928e-05,
      "loss": 3.0748,
      "step": 128500
    },
    {
      "epoch": 0.3106018150944055,
      "grad_norm": 17.1651554107666,
      "learning_rate": 3.8308644716671705e-05,
      "loss": 3.144,
      "step": 129000
    },
    {
      "epoch": 0.3118056980986474,
      "grad_norm": 22.377321243286133,
      "learning_rate": 3.8241762063624134e-05,
      "loss": 3.1162,
      "step": 129500
    },
    {
      "epoch": 0.31300958110288923,
      "grad_norm": 21.55461883544922,
      "learning_rate": 3.817501317588265e-05,
      "loss": 3.1048,
      "step": 130000
    },
    {
      "epoch": 0.31300958110288923,
      "eval_runtime": 6198.7963,
      "eval_samples_per_second": 134.001,
      "eval_steps_per_second": 33.5,
      "step": 130000
    },
    {
      "epoch": 0.31421346410713114,
      "grad_norm": 17.96697425842285,
      "learning_rate": 3.8108130522835075e-05,
      "loss": 3.0576,
      "step": 130500
    },
    {
      "epoch": 0.31541734711137304,
      "grad_norm": 15.112616539001465,
      "learning_rate": 3.80412478697875e-05,
      "loss": 3.1265,
      "step": 131000
    },
    {
      "epoch": 0.3166212301156149,
      "grad_norm": 15.317338943481445,
      "learning_rate": 3.797449898204602e-05,
      "loss": 3.0716,
      "step": 131500
    },
    {
      "epoch": 0.3178251131198568,
      "grad_norm": 14.246545791625977,
      "learning_rate": 3.7907616328998445e-05,
      "loss": 3.1111,
      "step": 132000
    },
    {
      "epoch": 0.3190289961240987,
      "grad_norm": 14.737203598022461,
      "learning_rate": 3.7840733675950874e-05,
      "loss": 3.1051,
      "step": 132500
    },
    {
      "epoch": 0.32023287912834053,
      "grad_norm": 16.053455352783203,
      "learning_rate": 3.77738510229033e-05,
      "loss": 3.0498,
      "step": 133000
    },
    {
      "epoch": 0.32143676213258243,
      "grad_norm": 15.171459197998047,
      "learning_rate": 3.770696836985572e-05,
      "loss": 3.0535,
      "step": 133500
    },
    {
      "epoch": 0.32264064513682433,
      "grad_norm": 23.735517501831055,
      "learning_rate": 3.7640219482114245e-05,
      "loss": 3.0349,
      "step": 134000
    },
    {
      "epoch": 0.3238445281410662,
      "grad_norm": 13.836942672729492,
      "learning_rate": 3.757333682906667e-05,
      "loss": 3.0985,
      "step": 134500
    },
    {
      "epoch": 0.3250484111453081,
      "grad_norm": 15.954339027404785,
      "learning_rate": 3.750645417601909e-05,
      "loss": 3.0927,
      "step": 135000
    },
    {
      "epoch": 0.3250484111453081,
      "eval_runtime": 6258.0775,
      "eval_samples_per_second": 132.732,
      "eval_steps_per_second": 33.183,
      "step": 135000
    },
    {
      "epoch": 0.32625229414955,
      "grad_norm": 23.13224983215332,
      "learning_rate": 3.7439705288277615e-05,
      "loss": 3.0961,
      "step": 135500
    },
    {
      "epoch": 0.3274561771537918,
      "grad_norm": 11.840916633605957,
      "learning_rate": 3.737282263523004e-05,
      "loss": 3.0769,
      "step": 136000
    },
    {
      "epoch": 0.32866006015803373,
      "grad_norm": 11.10158634185791,
      "learning_rate": 3.7305939982182466e-05,
      "loss": 3.0942,
      "step": 136500
    },
    {
      "epoch": 0.32986394316227563,
      "grad_norm": 14.162835121154785,
      "learning_rate": 3.723905732913489e-05,
      "loss": 3.1289,
      "step": 137000
    },
    {
      "epoch": 0.3310678261665175,
      "grad_norm": 23.765029907226562,
      "learning_rate": 3.717217467608731e-05,
      "loss": 3.0774,
      "step": 137500
    },
    {
      "epoch": 0.3322717091707594,
      "grad_norm": 22.40215492248535,
      "learning_rate": 3.710542578834583e-05,
      "loss": 3.0886,
      "step": 138000
    },
    {
      "epoch": 0.3334755921750012,
      "grad_norm": 16.616819381713867,
      "learning_rate": 3.703854313529826e-05,
      "loss": 3.102,
      "step": 138500
    },
    {
      "epoch": 0.3346794751792431,
      "grad_norm": 19.094507217407227,
      "learning_rate": 3.697166048225068e-05,
      "loss": 3.1027,
      "step": 139000
    },
    {
      "epoch": 0.335883358183485,
      "grad_norm": 20.761945724487305,
      "learning_rate": 3.690477782920311e-05,
      "loss": 3.0609,
      "step": 139500
    },
    {
      "epoch": 0.33708724118772687,
      "grad_norm": 11.371627807617188,
      "learning_rate": 3.683789517615553e-05,
      "loss": 3.0916,
      "step": 140000
    },
    {
      "epoch": 0.33708724118772687,
      "eval_runtime": 6174.667,
      "eval_samples_per_second": 134.525,
      "eval_steps_per_second": 33.631,
      "step": 140000
    },
    {
      "epoch": 0.3382911241919688,
      "grad_norm": 15.36569881439209,
      "learning_rate": 3.6771012523107955e-05,
      "loss": 3.0964,
      "step": 140500
    },
    {
      "epoch": 0.3394950071962107,
      "grad_norm": 19.703203201293945,
      "learning_rate": 3.6704129870060384e-05,
      "loss": 3.0631,
      "step": 141000
    },
    {
      "epoch": 0.3406988902004525,
      "grad_norm": 23.92881965637207,
      "learning_rate": 3.663724721701281e-05,
      "loss": 3.0702,
      "step": 141500
    },
    {
      "epoch": 0.3419027732046944,
      "grad_norm": 18.54579734802246,
      "learning_rate": 3.657036456396523e-05,
      "loss": 3.0732,
      "step": 142000
    },
    {
      "epoch": 0.3431066562089363,
      "grad_norm": 13.281709671020508,
      "learning_rate": 3.650348191091766e-05,
      "loss": 3.0937,
      "step": 142500
    },
    {
      "epoch": 0.34431053921317817,
      "grad_norm": 17.042314529418945,
      "learning_rate": 3.6436733023176177e-05,
      "loss": 3.0914,
      "step": 143000
    },
    {
      "epoch": 0.34551442221742007,
      "grad_norm": 16.268789291381836,
      "learning_rate": 3.6369850370128606e-05,
      "loss": 3.0899,
      "step": 143500
    },
    {
      "epoch": 0.34671830522166197,
      "grad_norm": 26.38330841064453,
      "learning_rate": 3.630296771708103e-05,
      "loss": 3.0666,
      "step": 144000
    },
    {
      "epoch": 0.3479221882259038,
      "grad_norm": 14.961106300354004,
      "learning_rate": 3.623608506403345e-05,
      "loss": 3.069,
      "step": 144500
    },
    {
      "epoch": 0.3491260712301457,
      "grad_norm": 12.415295600891113,
      "learning_rate": 3.616920241098588e-05,
      "loss": 3.0293,
      "step": 145000
    },
    {
      "epoch": 0.3491260712301457,
      "eval_runtime": 6109.5629,
      "eval_samples_per_second": 135.958,
      "eval_steps_per_second": 33.99,
      "step": 145000
    },
    {
      "epoch": 0.3503299542343876,
      "grad_norm": 16.554115295410156,
      "learning_rate": 3.61024535232444e-05,
      "loss": 3.0739,
      "step": 145500
    },
    {
      "epoch": 0.35153383723862947,
      "grad_norm": 20.627267837524414,
      "learning_rate": 3.603557087019682e-05,
      "loss": 3.0799,
      "step": 146000
    },
    {
      "epoch": 0.35273772024287137,
      "grad_norm": 15.106368064880371,
      "learning_rate": 3.596868821714925e-05,
      "loss": 3.0417,
      "step": 146500
    },
    {
      "epoch": 0.3539416032471132,
      "grad_norm": 17.705570220947266,
      "learning_rate": 3.590180556410168e-05,
      "loss": 3.0896,
      "step": 147000
    },
    {
      "epoch": 0.3551454862513551,
      "grad_norm": 16.01241683959961,
      "learning_rate": 3.5834922911054094e-05,
      "loss": 3.0729,
      "step": 147500
    },
    {
      "epoch": 0.356349369255597,
      "grad_norm": 17.986221313476562,
      "learning_rate": 3.576817402331262e-05,
      "loss": 3.11,
      "step": 148000
    },
    {
      "epoch": 0.35755325225983886,
      "grad_norm": 17.471803665161133,
      "learning_rate": 3.570129137026504e-05,
      "loss": 3.0968,
      "step": 148500
    },
    {
      "epoch": 0.35875713526408076,
      "grad_norm": 16.683828353881836,
      "learning_rate": 3.563440871721747e-05,
      "loss": 3.0491,
      "step": 149000
    },
    {
      "epoch": 0.35996101826832266,
      "grad_norm": 18.689273834228516,
      "learning_rate": 3.5567526064169894e-05,
      "loss": 3.0183,
      "step": 149500
    },
    {
      "epoch": 0.3611649012725645,
      "grad_norm": 14.659083366394043,
      "learning_rate": 3.550064341112232e-05,
      "loss": 3.0965,
      "step": 150000
    },
    {
      "epoch": 0.3611649012725645,
      "eval_runtime": 6228.4893,
      "eval_samples_per_second": 133.362,
      "eval_steps_per_second": 33.341,
      "step": 150000
    },
    {
      "epoch": 0.3623687842768064,
      "grad_norm": 16.2710018157959,
      "learning_rate": 3.5433760758074745e-05,
      "loss": 3.1006,
      "step": 150500
    },
    {
      "epoch": 0.3635726672810483,
      "grad_norm": 16.394590377807617,
      "learning_rate": 3.5367011870333264e-05,
      "loss": 3.0602,
      "step": 151000
    },
    {
      "epoch": 0.36477655028529016,
      "grad_norm": 15.235190391540527,
      "learning_rate": 3.5300129217285686e-05,
      "loss": 3.0777,
      "step": 151500
    },
    {
      "epoch": 0.36598043328953206,
      "grad_norm": 15.201708793640137,
| "learning_rate": 3.5233246564238115e-05, | |
| "loss": 3.0595, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 0.36718431629377396, | |
| "grad_norm": 22.309728622436523, | |
| "learning_rate": 3.5166363911190544e-05, | |
| "loss": 3.0446, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 0.3683881992980158, | |
| "grad_norm": 13.854850769042969, | |
| "learning_rate": 3.509961502344906e-05, | |
| "loss": 3.0665, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 0.3695920823022577, | |
| "grad_norm": 14.474712371826172, | |
| "learning_rate": 3.5032732370401485e-05, | |
| "loss": 3.1098, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 0.3707959653064996, | |
| "grad_norm": 13.207783699035645, | |
| "learning_rate": 3.496584971735391e-05, | |
| "loss": 3.1007, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 0.37199984831074145, | |
| "grad_norm": 13.456844329833984, | |
| "learning_rate": 3.489896706430634e-05, | |
| "loss": 3.0957, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 0.37320373131498336, | |
| "grad_norm": 17.590436935424805, | |
| "learning_rate": 3.483208441125876e-05, | |
| "loss": 3.0295, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 0.37320373131498336, | |
| "eval_runtime": 6177.0488, | |
| "eval_samples_per_second": 134.473, | |
| "eval_steps_per_second": 33.618, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 0.3744076143192252, | |
| "grad_norm": 12.911888122558594, | |
| "learning_rate": 3.476520175821119e-05, | |
| "loss": 3.0661, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 0.3756114973234671, | |
| "grad_norm": 14.606691360473633, | |
| "learning_rate": 3.469831910516361e-05, | |
| "loss": 3.0804, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 0.376815380327709, | |
| "grad_norm": 18.043087005615234, | |
| "learning_rate": 3.463143645211603e-05, | |
| "loss": 3.1359, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 0.37801926333195085, | |
| "grad_norm": 15.033346176147461, | |
| "learning_rate": 3.456468756437455e-05, | |
| "loss": 2.9907, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 0.37922314633619275, | |
| "grad_norm": 17.020784378051758, | |
| "learning_rate": 3.449780491132698e-05, | |
| "loss": 3.0606, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 0.38042702934043465, | |
| "grad_norm": 22.74751091003418, | |
| "learning_rate": 3.44310560235855e-05, | |
| "loss": 3.1285, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 0.3816309123446765, | |
| "grad_norm": 14.052987098693848, | |
| "learning_rate": 3.436417337053793e-05, | |
| "loss": 3.0703, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 0.3828347953489184, | |
| "grad_norm": 22.046268463134766, | |
| "learning_rate": 3.429729071749035e-05, | |
| "loss": 3.0916, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 0.3840386783531603, | |
| "grad_norm": 23.049739837646484, | |
| "learning_rate": 3.4230408064442773e-05, | |
| "loss": 3.0909, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 0.38524256135740215, | |
| "grad_norm": 15.563003540039062, | |
| "learning_rate": 3.41635254113952e-05, | |
| "loss": 3.051, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 0.38524256135740215, | |
| "eval_runtime": 6216.4944, | |
| "eval_samples_per_second": 133.62, | |
| "eval_steps_per_second": 33.405, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 0.38644644436164405, | |
| "grad_norm": 11.055919647216797, | |
| "learning_rate": 3.4096642758347625e-05, | |
| "loss": 3.0614, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 0.38765032736588595, | |
| "grad_norm": 18.309402465820312, | |
| "learning_rate": 3.4029760105300054e-05, | |
| "loss": 3.0618, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 0.3888542103701278, | |
| "grad_norm": 15.657028198242188, | |
| "learning_rate": 3.396287745225247e-05, | |
| "loss": 3.068, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 0.3900580933743697, | |
| "grad_norm": 15.660598754882812, | |
| "learning_rate": 3.3896128564510995e-05, | |
| "loss": 3.0956, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 0.3912619763786116, | |
| "grad_norm": 17.219053268432617, | |
| "learning_rate": 3.382924591146342e-05, | |
| "loss": 3.0762, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 0.39246585938285344, | |
| "grad_norm": 15.2114896774292, | |
| "learning_rate": 3.376249702372194e-05, | |
| "loss": 3.0193, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 0.39366974238709534, | |
| "grad_norm": 15.437503814697266, | |
| "learning_rate": 3.3695614370674365e-05, | |
| "loss": 3.0757, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 0.39487362539133725, | |
| "grad_norm": 17.652286529541016, | |
| "learning_rate": 3.3628731717626794e-05, | |
| "loss": 3.0871, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 0.3960775083955791, | |
| "grad_norm": 14.703353881835938, | |
| "learning_rate": 3.356184906457922e-05, | |
| "loss": 3.0025, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 0.397281391399821, | |
| "grad_norm": 15.438825607299805, | |
| "learning_rate": 3.349496641153164e-05, | |
| "loss": 3.049, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 0.397281391399821, | |
| "eval_runtime": 6142.6208, | |
| "eval_samples_per_second": 135.226, | |
| "eval_steps_per_second": 33.807, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 0.39848527440406284, | |
| "grad_norm": 21.73479461669922, | |
| "learning_rate": 3.3428217523790165e-05, | |
| "loss": 3.0724, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 0.39968915740830474, | |
| "grad_norm": 13.589031219482422, | |
| "learning_rate": 3.336133487074259e-05, | |
| "loss": 3.0599, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 0.40089304041254664, | |
| "grad_norm": 12.588455200195312, | |
| "learning_rate": 3.329445221769501e-05, | |
| "loss": 3.0674, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 0.4020969234167885, | |
| "grad_norm": 16.856395721435547, | |
| "learning_rate": 3.322756956464744e-05, | |
| "loss": 3.0598, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 0.4033008064210304, | |
| "grad_norm": 14.325052261352539, | |
| "learning_rate": 3.316068691159986e-05, | |
| "loss": 3.1033, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 0.4045046894252723, | |
| "grad_norm": 20.509449005126953, | |
| "learning_rate": 3.3093938023858386e-05, | |
| "loss": 3.0843, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 0.40570857242951414, | |
| "grad_norm": 17.73023796081543, | |
| "learning_rate": 3.302705537081081e-05, | |
| "loss": 3.0367, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 0.40691245543375604, | |
| "grad_norm": 24.057329177856445, | |
| "learning_rate": 3.296017271776323e-05, | |
| "loss": 3.0771, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 0.40811633843799794, | |
| "grad_norm": 19.776145935058594, | |
| "learning_rate": 3.289329006471566e-05, | |
| "loss": 3.0784, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 0.4093202214422398, | |
| "grad_norm": 23.74951934814453, | |
| "learning_rate": 3.282654117697418e-05, | |
| "loss": 3.0786, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 0.4093202214422398, | |
| "eval_runtime": 6188.4105, | |
| "eval_samples_per_second": 134.226, | |
| "eval_steps_per_second": 33.557, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 0.4105241044464817, | |
| "grad_norm": 17.745681762695312, | |
| "learning_rate": 3.27596585239266e-05, | |
| "loss": 3.0666, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 0.4117279874507236, | |
| "grad_norm": 20.147336959838867, | |
| "learning_rate": 3.269277587087903e-05, | |
| "loss": 3.1238, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 0.41293187045496543, | |
| "grad_norm": 16.938888549804688, | |
| "learning_rate": 3.262589321783145e-05, | |
| "loss": 3.0414, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 0.41413575345920733, | |
| "grad_norm": 15.663901329040527, | |
| "learning_rate": 3.2559010564783875e-05, | |
| "loss": 3.0892, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 0.41533963646344924, | |
| "grad_norm": 16.39117431640625, | |
| "learning_rate": 3.2492127911736304e-05, | |
| "loss": 3.0685, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 0.4165435194676911, | |
| "grad_norm": 14.299029350280762, | |
| "learning_rate": 3.242537902399482e-05, | |
| "loss": 3.0725, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 0.417747402471933, | |
| "grad_norm": 11.168866157531738, | |
| "learning_rate": 3.235849637094725e-05, | |
| "loss": 3.0502, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 0.41895128547617483, | |
| "grad_norm": 13.38841724395752, | |
| "learning_rate": 3.2291613717899674e-05, | |
| "loss": 3.062, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 0.42015516848041673, | |
| "grad_norm": 14.151941299438477, | |
| "learning_rate": 3.2224731064852097e-05, | |
| "loss": 3.0666, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 0.42135905148465863, | |
| "grad_norm": 17.730104446411133, | |
| "learning_rate": 3.215784841180452e-05, | |
| "loss": 3.0709, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 0.42135905148465863, | |
| "eval_runtime": 6186.2143, | |
| "eval_samples_per_second": 134.274, | |
| "eval_steps_per_second": 33.569, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 0.4225629344889005, | |
| "grad_norm": 16.822513580322266, | |
| "learning_rate": 3.209096575875695e-05, | |
| "loss": 3.065, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 0.4237668174931424, | |
| "grad_norm": 15.454965591430664, | |
| "learning_rate": 3.202408310570938e-05, | |
| "loss": 3.0476, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 0.4249707004973843, | |
| "grad_norm": 21.14031410217285, | |
| "learning_rate": 3.195720045266179e-05, | |
| "loss": 3.0339, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 0.4261745835016261, | |
| "grad_norm": 19.002689361572266, | |
| "learning_rate": 3.189045156492032e-05, | |
| "loss": 3.0598, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 0.427378466505868, | |
| "grad_norm": 11.582403182983398, | |
| "learning_rate": 3.182356891187274e-05, | |
| "loss": 3.0454, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 0.42858234951010993, | |
| "grad_norm": 14.35600757598877, | |
| "learning_rate": 3.175668625882517e-05, | |
| "loss": 3.0677, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 0.4297862325143518, | |
| "grad_norm": 18.5367374420166, | |
| "learning_rate": 3.168980360577759e-05, | |
| "loss": 3.1098, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 0.4309901155185937, | |
| "grad_norm": 17.769344329833984, | |
| "learning_rate": 3.162305471803611e-05, | |
| "loss": 3.052, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 0.4321939985228356, | |
| "grad_norm": 17.472938537597656, | |
| "learning_rate": 3.155617206498854e-05, | |
| "loss": 3.0699, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 0.4333978815270774, | |
| "grad_norm": 14.995344161987305, | |
| "learning_rate": 3.148928941194096e-05, | |
| "loss": 3.0682, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 0.4333978815270774, | |
| "eval_runtime": 6302.7174, | |
| "eval_samples_per_second": 131.792, | |
| "eval_steps_per_second": 32.948, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 0.4346017645313193, | |
| "grad_norm": 17.150964736938477, | |
| "learning_rate": 3.1422406758893384e-05, | |
| "loss": 3.0906, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 0.4358056475355612, | |
| "grad_norm": 14.804174423217773, | |
| "learning_rate": 3.1355524105845814e-05, | |
| "loss": 3.0493, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 0.43700953053980307, | |
| "grad_norm": 17.898832321166992, | |
| "learning_rate": 3.128864145279824e-05, | |
| "loss": 3.089, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 0.43821341354404497, | |
| "grad_norm": 16.601884841918945, | |
| "learning_rate": 3.122189256505676e-05, | |
| "loss": 3.0688, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 0.4394172965482868, | |
| "grad_norm": 14.000849723815918, | |
| "learning_rate": 3.1155009912009184e-05, | |
| "loss": 3.0295, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 0.4406211795525287, | |
| "grad_norm": 17.828115463256836, | |
| "learning_rate": 3.1088127258961606e-05, | |
| "loss": 3.0588, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 0.4418250625567706, | |
| "grad_norm": 20.30364418029785, | |
| "learning_rate": 3.1021244605914035e-05, | |
| "loss": 3.0203, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 0.44302894556101247, | |
| "grad_norm": 17.606700897216797, | |
| "learning_rate": 3.095436195286646e-05, | |
| "loss": 3.0568, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 0.44423282856525437, | |
| "grad_norm": 17.633464813232422, | |
| "learning_rate": 3.0887613065124976e-05, | |
| "loss": 3.0702, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 0.44543671156949627, | |
| "grad_norm": 14.55715274810791, | |
| "learning_rate": 3.0820730412077405e-05, | |
| "loss": 3.0746, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 0.44543671156949627, | |
| "eval_runtime": 6306.1272, | |
| "eval_samples_per_second": 131.72, | |
| "eval_steps_per_second": 32.93, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 0.4466405945737381, | |
| "grad_norm": 16.668909072875977, | |
| "learning_rate": 3.075384775902983e-05, | |
| "loss": 3.0566, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 0.44784447757798, | |
| "grad_norm": 14.347661018371582, | |
| "learning_rate": 3.068696510598225e-05, | |
| "loss": 3.0616, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 0.4490483605822219, | |
| "grad_norm": 17.429546356201172, | |
| "learning_rate": 3.062021621824077e-05, | |
| "loss": 3.0875, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 0.45025224358646376, | |
| "grad_norm": 19.362503051757812, | |
| "learning_rate": 3.0553467330499294e-05, | |
| "loss": 3.057, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 0.45145612659070566, | |
| "grad_norm": 14.057225227355957, | |
| "learning_rate": 3.048658467745172e-05, | |
| "loss": 3.0644, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 0.45266000959494757, | |
| "grad_norm": 21.090145111083984, | |
| "learning_rate": 3.0419702024404146e-05, | |
| "loss": 3.0886, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 0.4538638925991894, | |
| "grad_norm": 13.602699279785156, | |
| "learning_rate": 3.0352819371356568e-05, | |
| "loss": 3.0649, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 0.4550677756034313, | |
| "grad_norm": 14.61277961730957, | |
| "learning_rate": 3.0285936718308994e-05, | |
| "loss": 3.0502, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 0.4562716586076732, | |
| "grad_norm": 14.571629524230957, | |
| "learning_rate": 3.021905406526142e-05, | |
| "loss": 3.0512, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 0.45747554161191506, | |
| "grad_norm": 16.995033264160156, | |
| "learning_rate": 3.0152171412213842e-05, | |
| "loss": 3.0619, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 0.45747554161191506, | |
| "eval_runtime": 6119.7371, | |
| "eval_samples_per_second": 135.732, | |
| "eval_steps_per_second": 33.933, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 0.45867942461615696, | |
| "grad_norm": 14.749920845031738, | |
| "learning_rate": 3.0085288759166268e-05, | |
| "loss": 3.0377, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 0.4598833076203988, | |
| "grad_norm": 18.717721939086914, | |
| "learning_rate": 3.0018406106118697e-05, | |
| "loss": 3.028, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 0.4610871906246407, | |
| "grad_norm": 13.981959342956543, | |
| "learning_rate": 2.995152345307112e-05, | |
| "loss": 3.0643, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 0.4622910736288826, | |
| "grad_norm": 13.590766906738281, | |
| "learning_rate": 2.9884640800023545e-05, | |
| "loss": 3.0734, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 0.46349495663312446, | |
| "grad_norm": 14.754199028015137, | |
| "learning_rate": 2.981775814697597e-05, | |
| "loss": 3.0575, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 0.46469883963736636, | |
| "grad_norm": 15.374496459960938, | |
| "learning_rate": 2.9751009259234493e-05, | |
| "loss": 3.0545, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 0.46590272264160826, | |
| "grad_norm": 17.713016510009766, | |
| "learning_rate": 2.968412660618691e-05, | |
| "loss": 3.022, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 0.4671066056458501, | |
| "grad_norm": 13.752087593078613, | |
| "learning_rate": 2.961724395313934e-05, | |
| "loss": 3.0129, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 0.468310488650092, | |
| "grad_norm": 11.1192626953125, | |
| "learning_rate": 2.9550361300091767e-05, | |
| "loss": 3.0285, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 0.4695143716543339, | |
| "grad_norm": 17.55103874206543, | |
| "learning_rate": 2.9483746177656378e-05, | |
| "loss": 3.045, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 0.4695143716543339, | |
| "eval_runtime": 6132.8059, | |
| "eval_samples_per_second": 135.443, | |
| "eval_steps_per_second": 33.861, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 0.47071825465857575, | |
| "grad_norm": 27.24392318725586, | |
| "learning_rate": 2.9416863524608807e-05, | |
| "loss": 3.0499, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 0.47192213766281765, | |
| "grad_norm": 14.595544815063477, | |
| "learning_rate": 2.9349980871561226e-05, | |
| "loss": 3.0375, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 0.47312602066705955, | |
| "grad_norm": 13.058863639831543, | |
| "learning_rate": 2.9283098218513655e-05, | |
| "loss": 3.1024, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 0.4743299036713014, | |
| "grad_norm": 15.837779998779297, | |
| "learning_rate": 2.921621556546608e-05, | |
| "loss": 3.082, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 0.4755337866755433, | |
| "grad_norm": 14.441446304321289, | |
| "learning_rate": 2.9149466677724603e-05, | |
| "loss": 3.0608, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 0.4767376696797852, | |
| "grad_norm": 16.908939361572266, | |
| "learning_rate": 2.9082584024677022e-05, | |
| "loss": 3.0524, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 0.47794155268402705, | |
| "grad_norm": 15.620512962341309, | |
| "learning_rate": 2.901570137162945e-05, | |
| "loss": 3.0614, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 0.47914543568826895, | |
| "grad_norm": 17.97640609741211, | |
| "learning_rate": 2.8948818718581877e-05, | |
| "loss": 3.0483, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 0.48034931869251085, | |
| "grad_norm": 19.494766235351562, | |
| "learning_rate": 2.88819360655343e-05, | |
| "loss": 3.0629, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 0.4815532016967527, | |
| "grad_norm": 18.747150421142578, | |
| "learning_rate": 2.8815053412486725e-05, | |
| "loss": 3.0774, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 0.4815532016967527, | |
| "eval_runtime": 6171.1886, | |
| "eval_samples_per_second": 134.6, | |
| "eval_steps_per_second": 33.65, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 0.4827570847009946, | |
| "grad_norm": 15.972591400146484, | |
| "learning_rate": 2.874817075943915e-05, | |
| "loss": 3.0938, | |
| "step": 200500 | |
| }, | |
| { | |
| "epoch": 0.48396096770523644, | |
| "grad_norm": 16.991474151611328, | |
| "learning_rate": 2.8681421871697673e-05, | |
| "loss": 3.0431, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 0.48516485070947835, | |
| "grad_norm": 16.47597312927246, | |
| "learning_rate": 2.8614539218650095e-05, | |
| "loss": 3.0886, | |
| "step": 201500 | |
| }, | |
| { | |
| "epoch": 0.48636873371372025, | |
| "grad_norm": 20.3975830078125, | |
| "learning_rate": 2.854765656560252e-05, | |
| "loss": 3.0562, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 0.4875726167179621, | |
| "grad_norm": 17.682926177978516, | |
| "learning_rate": 2.8480773912554947e-05, | |
| "loss": 3.1002, | |
| "step": 202500 | |
| }, | |
| { | |
| "epoch": 0.488776499722204, | |
| "grad_norm": 18.027238845825195, | |
| "learning_rate": 2.841389125950737e-05, | |
| "loss": 3.0798, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 0.4899803827264459, | |
| "grad_norm": 20.950571060180664, | |
| "learning_rate": 2.8347142371765888e-05, | |
| "loss": 3.0573, | |
| "step": 203500 | |
| }, | |
| { | |
| "epoch": 0.49118426573068774, | |
| "grad_norm": 17.63266372680664, | |
| "learning_rate": 2.8280259718718317e-05, | |
| "loss": 3.048, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 0.49238814873492964, | |
| "grad_norm": 17.037296295166016, | |
| "learning_rate": 2.8213377065670743e-05, | |
| "loss": 3.016, | |
| "step": 204500 | |
| }, | |
| { | |
| "epoch": 0.49359203173917154, | |
| "grad_norm": 21.214052200317383, | |
| "learning_rate": 2.8146494412623165e-05, | |
| "loss": 3.0676, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 0.49359203173917154, | |
| "eval_runtime": 6343.1785, | |
| "eval_samples_per_second": 130.951, | |
| "eval_steps_per_second": 32.738, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 0.4947959147434134, | |
| "grad_norm": 17.722492218017578, | |
| "learning_rate": 2.807961175957559e-05, | |
| "loss": 3.076, | |
| "step": 205500 | |
| }, | |
| { | |
| "epoch": 0.4959997977476553, | |
| "grad_norm": 17.147768020629883, | |
| "learning_rate": 2.801272910652802e-05, | |
| "loss": 3.0684, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 0.4972036807518972, | |
| "grad_norm": 15.113913536071777, | |
| "learning_rate": 2.794584645348044e-05, | |
| "loss": 3.0133, | |
| "step": 206500 | |
| }, | |
| { | |
| "epoch": 0.49840756375613904, | |
| "grad_norm": 15.339323043823242, | |
| "learning_rate": 2.7878963800432868e-05, | |
| "loss": 3.06, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 0.49961144676038094, | |
| "grad_norm": 14.279352188110352, | |
| "learning_rate": 2.7812214912691387e-05, | |
| "loss": 3.0718, | |
| "step": 207500 | |
| }, | |
| { | |
| "epoch": 0.5008153297646228, | |
| "grad_norm": 15.7473726272583, | |
| "learning_rate": 2.7745466024949905e-05, | |
| "loss": 3.0382, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 0.5020192127688647, | |
| "grad_norm": 16.69623374938965, | |
| "learning_rate": 2.7678583371902334e-05, | |
| "loss": 3.0469, | |
| "step": 208500 | |
| }, | |
| { | |
| "epoch": 0.5032230957731065, | |
| "grad_norm": 12.795482635498047, | |
| "learning_rate": 2.7611700718854753e-05, | |
| "loss": 3.0691, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 0.5044269787773484, | |
| "grad_norm": 15.719594955444336, | |
| "learning_rate": 2.7544818065807182e-05, | |
| "loss": 3.0843, | |
| "step": 209500 | |
| }, | |
| { | |
| "epoch": 0.5056308617815903, | |
| "grad_norm": 16.107906341552734, | |
| "learning_rate": 2.74780691780657e-05, | |
| "loss": 3.0939, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 0.5056308617815903, | |
| "eval_runtime": 6288.8164, | |
| "eval_samples_per_second": 132.083, | |
| "eval_steps_per_second": 33.021, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 0.5068347447858322, | |
| "grad_norm": 22.665922164916992, | |
| "learning_rate": 2.741118652501813e-05, | |
| "loss": 3.0311, | |
| "step": 210500 | |
| }, | |
| { | |
| "epoch": 0.5080386277900741, | |
| "grad_norm": 12.993492126464844, | |
| "learning_rate": 2.734430387197055e-05, | |
| "loss": 3.0409, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 0.509242510794316, | |
| "grad_norm": 13.392237663269043, | |
| "learning_rate": 2.727742121892298e-05, | |
| "loss": 3.0185, | |
| "step": 211500 | |
| }, | |
| { | |
| "epoch": 0.5104463937985578, | |
| "grad_norm": 18.179622650146484, | |
| "learning_rate": 2.7210538565875404e-05, | |
| "loss": 3.1036, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 0.5116502768027997, | |
| "grad_norm": 16.70694923400879, | |
| "learning_rate": 2.7143655912827826e-05, | |
| "loss": 3.063, | |
| "step": 212500 | |
| }, | |
| { | |
| "epoch": 0.5128541598070416, | |
| "grad_norm": 23.674760818481445, | |
| "learning_rate": 2.7076773259780252e-05, | |
| "loss": 3.0342, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 0.5140580428112835, | |
| "grad_norm": 19.409990310668945, | |
| "learning_rate": 2.701002437203877e-05, | |
| "loss": 3.0462, | |
| "step": 213500 | |
| }, | |
| { | |
| "epoch": 0.5152619258155254, | |
| "grad_norm": 15.574653625488281, | |
| "learning_rate": 2.69431417189912e-05, | |
| "loss": 3.0292, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 0.5164658088197672, | |
| "grad_norm": 17.644498825073242, | |
| "learning_rate": 2.6876259065943622e-05, | |
| "loss": 3.0152, | |
| "step": 214500 | |
| }, | |
| { | |
| "epoch": 0.5176696918240091, | |
| "grad_norm": 14.58530330657959, | |
| "learning_rate": 2.6809376412896048e-05, | |
| "loss": 3.1034, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 0.5176696918240091, | |
| "eval_runtime": 6223.0215, | |
| "eval_samples_per_second": 133.479, | |
| "eval_steps_per_second": 33.37, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 0.518873574828251, | |
| "grad_norm": 19.024547576904297, | |
| "learning_rate": 2.674249375984847e-05, | |
| "loss": 3.0733, | |
| "step": 215500 | |
| }, | |
| { | |
| "epoch": 0.5200774578324929, | |
| "grad_norm": 17.260374069213867, | |
| "learning_rate": 2.6675611106800896e-05, | |
| "loss": 3.0252, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 0.5212813408367348, | |
| "grad_norm": 18.4815673828125, | |
| "learning_rate": 2.6608862219059415e-05, | |
| "loss": 3.069, | |
| "step": 216500 | |
| }, | |
| { | |
| "epoch": 0.5224852238409767, | |
| "grad_norm": 15.065186500549316, | |
| "learning_rate": 2.6541979566011844e-05, | |
| "loss": 3.0697, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 0.5236891068452185, | |
| "grad_norm": 16.79564666748047, | |
| "learning_rate": 2.6475096912964263e-05, | |
| "loss": 3.0433, | |
| "step": 217500 | |
| }, | |
| { | |
| "epoch": 0.5248929898494604, | |
| "grad_norm": 18.250133514404297, | |
| "learning_rate": 2.6408214259916692e-05, | |
| "loss": 3.0243, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 0.5260968728537023, | |
| "grad_norm": 15.040393829345703, | |
| "learning_rate": 2.6341331606869118e-05, | |
| "loss": 3.0501, | |
| "step": 218500 | |
| }, | |
| { | |
| "epoch": 0.5273007558579442, | |
| "grad_norm": 18.00982093811035, | |
| "learning_rate": 2.627444895382154e-05, | |
| "loss": 3.0481, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 0.5285046388621861, | |
| "grad_norm": 14.428119659423828, | |
| "learning_rate": 2.6207566300773966e-05, | |
| "loss": 3.0788, | |
| "step": 219500 | |
| }, | |
| { | |
| "epoch": 0.529708521866428, | |
| "grad_norm": 19.191162109375, | |
| "learning_rate": 2.6140683647726395e-05, | |
| "loss": 3.0549, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 0.529708521866428, | |
| "eval_runtime": 6262.92, | |
| "eval_samples_per_second": 132.629, | |
| "eval_steps_per_second": 33.157, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 0.5309124048706698, | |
| "grad_norm": 18.9827938079834, | |
| "learning_rate": 2.6073934759984914e-05, | |
| "loss": 3.032, | |
| "step": 220500 | |
| }, | |
| { | |
| "epoch": 0.5321162878749117, | |
| "grad_norm": 16.249061584472656, | |
| "learning_rate": 2.6007052106937336e-05, | |
| "loss": 3.0587, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 0.5333201708791536, | |
| "grad_norm": 27.886228561401367, | |
| "learning_rate": 2.5940303219195855e-05, | |
| "loss": 3.0959, | |
| "step": 221500 | |
| }, | |
| { | |
| "epoch": 0.5345240538833955, | |
| "grad_norm": 28.477378845214844, | |
| "learning_rate": 2.587342056614828e-05, | |
| "loss": 3.0545, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 0.5357279368876374, | |
| "grad_norm": 54.090702056884766, | |
| "learning_rate": 2.580653791310071e-05, | |
| "loss": 3.0052, | |
| "step": 222500 | |
| }, | |
| { | |
| "epoch": 0.5369318198918792, | |
| "grad_norm": 20.456764221191406, | |
| "learning_rate": 2.5739655260053132e-05, | |
| "loss": 3.0362, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 0.5381357028961211, | |
| "grad_norm": 18.759544372558594, | |
| "learning_rate": 2.5672772607005558e-05, | |
| "loss": 3.0841, | |
| "step": 223500 | |
| }, | |
| { | |
| "epoch": 0.539339585900363, | |
| "grad_norm": 24.140661239624023, | |
| "learning_rate": 2.5605889953957983e-05, | |
| "loss": 3.0545, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 0.5405434689046049, | |
| "grad_norm": 15.08611011505127, | |
| "learning_rate": 2.5539007300910406e-05, | |
| "loss": 3.0784, | |
| "step": 224500 | |
| }, | |
| { | |
| "epoch": 0.5417473519088468, | |
| "grad_norm": 20.986557006835938, | |
| "learning_rate": 2.547212464786283e-05, | |
| "loss": 3.0682, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 0.5417473519088468, | |
| "eval_runtime": 6240.4652, | |
| "eval_samples_per_second": 133.106, | |
| "eval_steps_per_second": 33.277, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 0.5429512349130887, | |
| "grad_norm": 11.451869010925293, | |
| "learning_rate": 2.5405375760121354e-05, | |
| "loss": 3.0627, | |
| "step": 225500 | |
| }, | |
| { | |
| "epoch": 0.5441551179173305, | |
| "grad_norm": 17.614988327026367, | |
| "learning_rate": 2.5338626872379872e-05, | |
| "loss": 3.0518, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 0.5453590009215724, | |
| "grad_norm": 14.993136405944824, | |
| "learning_rate": 2.5271744219332298e-05, | |
| "loss": 3.0515, | |
| "step": 226500 | |
| }, | |
| { | |
| "epoch": 0.5465628839258143, | |
| "grad_norm": 21.78707504272461, | |
| "learning_rate": 2.520486156628472e-05, | |
| "loss": 3.0632, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 0.5477667669300562, | |
| "grad_norm": 16.39373207092285, | |
| "learning_rate": 2.513797891323715e-05, | |
| "loss": 3.0524, | |
| "step": 227500 | |
| }, | |
| { | |
| "epoch": 0.5489706499342981, | |
| "grad_norm": 13.787343978881836, | |
| "learning_rate": 2.5071230025495668e-05, | |
| "loss": 3.0449, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 0.55017453293854, | |
| "grad_norm": 19.658519744873047, | |
| "learning_rate": 2.5004347372448094e-05, | |
| "loss": 3.0304, | |
| "step": 228500 | |
| }, | |
| { | |
| "epoch": 0.5513784159427818, | |
| "grad_norm": 16.18865203857422, | |
| "learning_rate": 2.493746471940052e-05, | |
| "loss": 3.0746, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 0.5525822989470237, | |
| "grad_norm": 17.702472686767578, | |
| "learning_rate": 2.4870582066352942e-05, | |
| "loss": 3.07, | |
| "step": 229500 | |
| }, | |
| { | |
| "epoch": 0.5537861819512656, | |
| "grad_norm": 18.08761215209961, | |
| "learning_rate": 2.4803699413305368e-05, | |
| "loss": 3.0417, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 0.5537861819512656, | |
| "eval_runtime": 6192.5264, | |
| "eval_samples_per_second": 134.137, | |
| "eval_steps_per_second": 33.534, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 0.5549900649555075, | |
| "grad_norm": 12.940227508544922, | |
| "learning_rate": 2.473695052556389e-05, | |
| "loss": 3.0623, | |
| "step": 230500 | |
| }, | |
| { | |
| "epoch": 0.5561939479597494, | |
| "grad_norm": 14.184712409973145, | |
| "learning_rate": 2.4670067872516316e-05, | |
| "loss": 3.0565, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 0.5573978309639912, | |
| "grad_norm": 16.096614837646484, | |
| "learning_rate": 2.4603185219468738e-05, | |
| "loss": 2.9976, | |
| "step": 231500 | |
| }, | |
| { | |
| "epoch": 0.5586017139682331, | |
| "grad_norm": 15.835817337036133, | |
| "learning_rate": 2.4536302566421164e-05, | |
| "loss": 2.9842, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 0.559805596972475, | |
| "grad_norm": 22.432340621948242, | |
| "learning_rate": 2.446941991337359e-05, | |
| "loss": 3.0831, | |
| "step": 232500 | |
| }, | |
| { | |
| "epoch": 0.5610094799767169, | |
| "grad_norm": 19.895309448242188, | |
| "learning_rate": 2.4402537260326015e-05, | |
| "loss": 3.0444, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 0.5622133629809588, | |
| "grad_norm": 14.998634338378906, | |
| "learning_rate": 2.4335788372584534e-05, | |
| "loss": 3.0233, | |
| "step": 233500 | |
| }, | |
| { | |
| "epoch": 0.5634172459852007, | |
| "grad_norm": 12.780035972595215, | |
| "learning_rate": 2.426890571953696e-05, | |
| "loss": 3.0215, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 0.5646211289894425, | |
| "grad_norm": 18.854740142822266, | |
| "learning_rate": 2.4202023066489385e-05, | |
| "loss": 3.0684, | |
| "step": 234500 | |
| }, | |
| { | |
| "epoch": 0.5658250119936844, | |
| "grad_norm": 17.486467361450195, | |
| "learning_rate": 2.4135140413441808e-05, | |
| "loss": 3.053, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 0.5658250119936844, | |
| "eval_runtime": 6288.8357, | |
| "eval_samples_per_second": 132.082, | |
| "eval_steps_per_second": 33.021, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 0.5670288949979263, | |
| "grad_norm": 14.92556095123291, | |
| "learning_rate": 2.4068257760394233e-05, | |
| "loss": 3.0641, | |
| "step": 235500 | |
| }, | |
| { | |
| "epoch": 0.5682327780021682, | |
| "grad_norm": 13.280654907226562, | |
| "learning_rate": 2.400137510734666e-05, | |
| "loss": 3.0217, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 0.5694366610064101, | |
| "grad_norm": 16.9669246673584, | |
| "learning_rate": 2.393462621960518e-05, | |
| "loss": 3.0162, | |
| "step": 236500 | |
| }, | |
| { | |
| "epoch": 0.570640544010652, | |
| "grad_norm": 14.215867042541504, | |
| "learning_rate": 2.3867743566557604e-05, | |
| "loss": 3.0158, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 0.5718444270148938, | |
| "grad_norm": 19.857236862182617, | |
| "learning_rate": 2.380086091351003e-05, | |
| "loss": 3.0011, | |
| "step": 237500 | |
| }, | |
| { | |
| "epoch": 0.5730483100191357, | |
| "grad_norm": 14.70789909362793, | |
| "learning_rate": 2.3733978260462455e-05, | |
| "loss": 3.0155, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 0.5742521930233776, | |
| "grad_norm": 16.156538009643555, | |
| "learning_rate": 2.3667229372720977e-05, | |
| "loss": 3.0281, | |
| "step": 238500 | |
| }, | |
| { | |
| "epoch": 0.5754560760276195, | |
| "grad_norm": 29.431739807128906, | |
| "learning_rate": 2.36003467196734e-05, | |
| "loss": 3.0404, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 0.5766599590318614, | |
| "grad_norm": 14.224696159362793, | |
| "learning_rate": 2.3533464066625825e-05, | |
| "loss": 3.0172, | |
| "step": 239500 | |
| }, | |
| { | |
| "epoch": 0.5778638420361032, | |
| "grad_norm": 19.29595184326172, | |
| "learning_rate": 2.346658141357825e-05, | |
| "loss": 3.0622, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 0.5778638420361032, | |
| "eval_runtime": 6227.4429, | |
| "eval_samples_per_second": 133.385, | |
| "eval_steps_per_second": 33.346, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 0.5790677250403451, | |
| "grad_norm": 24.003347396850586, | |
| "learning_rate": 2.3399698760530677e-05, | |
| "loss": 2.9962, | |
| "step": 240500 | |
| }, | |
| { | |
| "epoch": 0.580271608044587, | |
| "grad_norm": 16.034706115722656, | |
| "learning_rate": 2.33328161074831e-05, | |
| "loss": 3.0286, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 0.5814754910488289, | |
| "grad_norm": 16.609622955322266, | |
| "learning_rate": 2.3265933454435525e-05, | |
| "loss": 3.031, | |
| "step": 241500 | |
| }, | |
| { | |
| "epoch": 0.5826793740530708, | |
| "grad_norm": 30.813108444213867, | |
| "learning_rate": 2.319905080138795e-05, | |
| "loss": 3.0143, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 0.5838832570573127, | |
| "grad_norm": 15.091474533081055, | |
| "learning_rate": 2.313230191364647e-05, | |
| "loss": 3.0475, | |
| "step": 242500 | |
| }, | |
| { | |
| "epoch": 0.5850871400615545, | |
| "grad_norm": 19.889976501464844, | |
| "learning_rate": 2.3065419260598895e-05, | |
| "loss": 3.0551, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 0.5862910230657964, | |
| "grad_norm": 16.42539405822754, | |
| "learning_rate": 2.299853660755132e-05, | |
| "loss": 2.9885, | |
| "step": 243500 | |
| }, | |
| { | |
| "epoch": 0.5874949060700383, | |
| "grad_norm": 18.250354766845703, | |
| "learning_rate": 2.2931653954503746e-05, | |
| "loss": 3.0267, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 0.5886987890742802, | |
| "grad_norm": 11.44227409362793, | |
| "learning_rate": 2.286477130145617e-05, | |
| "loss": 2.9568, | |
| "step": 244500 | |
| }, | |
| { | |
| "epoch": 0.5899026720785221, | |
| "grad_norm": 21.37769889831543, | |
| "learning_rate": 2.279802241371469e-05, | |
| "loss": 3.0259, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 0.5899026720785221, | |
| "eval_runtime": 6297.4297, | |
| "eval_samples_per_second": 131.902, | |
| "eval_steps_per_second": 32.976, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 0.591106555082764, | |
| "grad_norm": 15.137754440307617, | |
| "learning_rate": 2.273127352597321e-05, | |
| "loss": 3.0087, | |
| "step": 245500 | |
| }, | |
| { | |
| "epoch": 0.5923104380870058, | |
| "grad_norm": 15.59156608581543, | |
| "learning_rate": 2.2664390872925635e-05, | |
| "loss": 3.0397, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 0.5935143210912477, | |
| "grad_norm": 14.741199493408203, | |
| "learning_rate": 2.259750821987806e-05, | |
| "loss": 3.0505, | |
| "step": 246500 | |
| }, | |
| { | |
| "epoch": 0.5947182040954896, | |
| "grad_norm": 37.30345153808594, | |
| "learning_rate": 2.2530625566830483e-05, | |
| "loss": 3.0312, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 0.5959220870997315, | |
| "grad_norm": 16.39379119873047, | |
| "learning_rate": 2.2463742913782912e-05, | |
| "loss": 3.0068, | |
| "step": 247500 | |
| }, | |
| { | |
| "epoch": 0.5971259701039734, | |
| "grad_norm": 16.724523544311523, | |
| "learning_rate": 2.2396860260735335e-05, | |
| "loss": 3.0172, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 0.5983298531082153, | |
| "grad_norm": 13.491678237915039, | |
| "learning_rate": 2.2330111372993857e-05, | |
| "loss": 3.0396, | |
| "step": 248500 | |
| }, | |
| { | |
| "epoch": 0.5995337361124571, | |
| "grad_norm": 17.01793670654297, | |
| "learning_rate": 2.226322871994628e-05, | |
| "loss": 3.0092, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 0.600737619116699, | |
| "grad_norm": 16.2504825592041, | |
| "learning_rate": 2.219634606689871e-05, | |
| "loss": 3.0564, | |
| "step": 249500 | |
| }, | |
| { | |
| "epoch": 0.6019415021209409, | |
| "grad_norm": 19.381729125976562, | |
| "learning_rate": 2.212946341385113e-05, | |
| "loss": 2.9991, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 0.6019415021209409, | |
| "eval_runtime": 6343.8627, | |
| "eval_samples_per_second": 130.937, | |
| "eval_steps_per_second": 32.734, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 0.6031453851251828, | |
| "grad_norm": 15.789433479309082, | |
| "learning_rate": 2.2062714526109653e-05, | |
| "loss": 3.0164, | |
| "step": 250500 | |
| }, | |
| { | |
| "epoch": 0.6043492681294247, | |
| "grad_norm": 15.380681037902832, | |
| "learning_rate": 2.1995831873062075e-05, | |
| "loss": 3.006, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 0.6055531511336665, | |
| "grad_norm": 12.976866722106934, | |
| "learning_rate": 2.19289492200145e-05, | |
| "loss": 3.099, | |
| "step": 251500 | |
| }, | |
| { | |
| "epoch": 0.6067570341379084, | |
| "grad_norm": 17.682626724243164, | |
| "learning_rate": 2.1862066566966927e-05, | |
| "loss": 3.0381, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 0.6079609171421503, | |
| "grad_norm": 15.32071304321289, | |
| "learning_rate": 2.1795183913919352e-05, | |
| "loss": 3.0404, | |
| "step": 252500 | |
| }, | |
| { | |
| "epoch": 0.6091648001463922, | |
| "grad_norm": 21.887651443481445, | |
| "learning_rate": 2.1728301260871775e-05, | |
| "loss": 3.0282, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 0.6103686831506341, | |
| "grad_norm": 16.731210708618164, | |
| "learning_rate": 2.1661552373130297e-05, | |
| "loss": 3.0219, | |
| "step": 253500 | |
| }, | |
| { | |
| "epoch": 0.611572566154876, | |
| "grad_norm": 22.759746551513672, | |
| "learning_rate": 2.1594669720082722e-05, | |
| "loss": 3.0442, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 0.6127764491591178, | |
| "grad_norm": 18.68710708618164, | |
| "learning_rate": 2.1527787067035145e-05, | |
| "loss": 3.0091, | |
| "step": 254500 | |
| }, | |
| { | |
| "epoch": 0.6139803321633597, | |
| "grad_norm": 23.144712448120117, | |
| "learning_rate": 2.146090441398757e-05, | |
| "loss": 3.0501, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 0.6139803321633597, | |
| "eval_runtime": 6230.1182, | |
| "eval_samples_per_second": 133.327, | |
| "eval_steps_per_second": 33.332, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 0.6151842151676016, | |
| "grad_norm": 18.833757400512695, | |
| "learning_rate": 2.1394021760939996e-05, | |
| "loss": 3.1018, | |
| "step": 255500 | |
| }, | |
| { | |
| "epoch": 0.6163880981718435, | |
| "grad_norm": 21.688997268676758, | |
| "learning_rate": 2.132727287319852e-05, | |
| "loss": 3.0579, | |
| "step": 256000 | |
| }, | |
| { | |
| "epoch": 0.6175919811760854, | |
| "grad_norm": 17.346538543701172, | |
| "learning_rate": 2.126039022015094e-05, | |
| "loss": 3.0306, | |
| "step": 256500 | |
| }, | |
| { | |
| "epoch": 0.6187958641803273, | |
| "grad_norm": 18.86598014831543, | |
| "learning_rate": 2.1193507567103366e-05, | |
| "loss": 3.0237, | |
| "step": 257000 | |
| }, | |
| { | |
| "epoch": 0.6199997471845691, | |
| "grad_norm": 13.735309600830078, | |
| "learning_rate": 2.1126624914055792e-05, | |
| "loss": 3.0416, | |
| "step": 257500 | |
| }, | |
| { | |
| "epoch": 0.621203630188811, | |
| "grad_norm": 21.433256149291992, | |
| "learning_rate": 2.1059742261008218e-05, | |
| "loss": 3.0162, | |
| "step": 258000 | |
| }, | |
| { | |
| "epoch": 0.6224075131930529, | |
| "grad_norm": 18.01786231994629, | |
| "learning_rate": 2.099285960796064e-05, | |
| "loss": 3.0192, | |
| "step": 258500 | |
| }, | |
| { | |
| "epoch": 0.6236113961972948, | |
| "grad_norm": 17.93750762939453, | |
| "learning_rate": 2.092597695491307e-05, | |
| "loss": 3.0162, | |
| "step": 259000 | |
| }, | |
| { | |
| "epoch": 0.6248152792015367, | |
| "grad_norm": 19.375873565673828, | |
| "learning_rate": 2.0859094301865492e-05, | |
| "loss": 2.9953, | |
| "step": 259500 | |
| }, | |
| { | |
| "epoch": 0.6260191622057785, | |
| "grad_norm": 16.76817512512207, | |
| "learning_rate": 2.0792479179430107e-05, | |
| "loss": 2.9848, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 0.6260191622057785, | |
| "eval_runtime": 6319.8113, | |
| "eval_samples_per_second": 131.435, | |
| "eval_steps_per_second": 32.859, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 0.6272230452100204, | |
| "grad_norm": 19.69635009765625, | |
| "learning_rate": 2.0725596526382532e-05, | |
| "loss": 3.0555, | |
| "step": 260500 | |
| }, | |
| { | |
| "epoch": 0.6284269282142623, | |
| "grad_norm": 16.243324279785156, | |
| "learning_rate": 2.0658713873334955e-05, | |
| "loss": 3.0212, | |
| "step": 261000 | |
| }, | |
| { | |
| "epoch": 0.6296308112185042, | |
| "grad_norm": 17.867599487304688, | |
| "learning_rate": 2.0591831220287384e-05, | |
| "loss": 3.0451, | |
| "step": 261500 | |
| }, | |
| { | |
| "epoch": 0.6308346942227461, | |
| "grad_norm": 17.559730529785156, | |
| "learning_rate": 2.0525082332545903e-05, | |
| "loss": 3.012, | |
| "step": 262000 | |
| }, | |
| { | |
| "epoch": 0.632038577226988, | |
| "grad_norm": 14.618083953857422, | |
| "learning_rate": 2.045833344480442e-05, | |
| "loss": 3.034, | |
| "step": 262500 | |
| }, | |
| { | |
| "epoch": 0.6332424602312298, | |
| "grad_norm": 16.521699905395508, | |
| "learning_rate": 2.0391450791756847e-05, | |
| "loss": 3.0197, | |
| "step": 263000 | |
| }, | |
| { | |
| "epoch": 0.6344463432354717, | |
| "grad_norm": 16.326717376708984, | |
| "learning_rate": 2.0324568138709273e-05, | |
| "loss": 3.0566, | |
| "step": 263500 | |
| }, | |
| { | |
| "epoch": 0.6356502262397136, | |
| "grad_norm": 22.72909164428711, | |
| "learning_rate": 2.02576854856617e-05, | |
| "loss": 3.0413, | |
| "step": 264000 | |
| }, | |
| { | |
| "epoch": 0.6368541092439555, | |
| "grad_norm": 21.150442123413086, | |
| "learning_rate": 2.019080283261412e-05, | |
| "loss": 3.0337, | |
| "step": 264500 | |
| }, | |
| { | |
| "epoch": 0.6380579922481974, | |
| "grad_norm": 18.094627380371094, | |
| "learning_rate": 2.0123920179566547e-05, | |
| "loss": 3.0103, | |
| "step": 265000 | |
| }, | |
| { | |
| "epoch": 0.6380579922481974, | |
| "eval_runtime": 6283.8462, | |
| "eval_samples_per_second": 132.187, | |
| "eval_steps_per_second": 33.047, | |
| "step": 265000 | |
| }, | |
| { | |
| "epoch": 0.6392618752524393, | |
| "grad_norm": 16.778398513793945, | |
| "learning_rate": 2.0057037526518972e-05, | |
| "loss": 3.0193, | |
| "step": 265500 | |
| }, | |
| { | |
| "epoch": 0.6404657582566811, | |
| "grad_norm": 16.389066696166992, | |
| "learning_rate": 1.9990154873471398e-05, | |
| "loss": 3.0297, | |
| "step": 266000 | |
| }, | |
| { | |
| "epoch": 0.641669641260923, | |
| "grad_norm": 15.284423828125, | |
| "learning_rate": 1.9923405985729917e-05, | |
| "loss": 3.0253, | |
| "step": 266500 | |
| }, | |
| { | |
| "epoch": 0.6428735242651649, | |
| "grad_norm": 21.423006057739258, | |
| "learning_rate": 1.9856523332682343e-05, | |
| "loss": 3.0313, | |
| "step": 267000 | |
| }, | |
| { | |
| "epoch": 0.6440774072694068, | |
| "grad_norm": 17.86176109313965, | |
| "learning_rate": 1.9789640679634768e-05, | |
| "loss": 3.0644, | |
| "step": 267500 | |
| }, | |
| { | |
| "epoch": 0.6452812902736487, | |
| "grad_norm": 19.17348861694336, | |
| "learning_rate": 1.9722758026587194e-05, | |
| "loss": 3.0494, | |
| "step": 268000 | |
| }, | |
| { | |
| "epoch": 0.6464851732778905, | |
| "grad_norm": 19.088390350341797, | |
| "learning_rate": 1.9655875373539616e-05, | |
| "loss": 3.0172, | |
| "step": 268500 | |
| }, | |
| { | |
| "epoch": 0.6476890562821324, | |
| "grad_norm": 17.714704513549805, | |
| "learning_rate": 1.9588992720492046e-05, | |
| "loss": 3.0296, | |
| "step": 269000 | |
| }, | |
| { | |
| "epoch": 0.6488929392863743, | |
| "grad_norm": 16.175125122070312, | |
| "learning_rate": 1.9522110067444468e-05, | |
| "loss": 3.033, | |
| "step": 269500 | |
| }, | |
| { | |
| "epoch": 0.6500968222906162, | |
| "grad_norm": 13.180002212524414, | |
| "learning_rate": 1.9455227414396894e-05, | |
| "loss": 3.042, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 0.6500968222906162, | |
| "eval_runtime": 6276.2648, | |
| "eval_samples_per_second": 132.347, | |
| "eval_steps_per_second": 33.087, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 0.6513007052948581, | |
| "grad_norm": 19.098552703857422, | |
| "learning_rate": 1.9388478526655412e-05, | |
| "loss": 3.0693, | |
| "step": 270500 | |
| }, | |
| { | |
| "epoch": 0.6525045882991, | |
| "grad_norm": 17.581096649169922, | |
| "learning_rate": 1.9321595873607838e-05, | |
| "loss": 3.0159, | |
| "step": 271000 | |
| }, | |
| { | |
| "epoch": 0.6537084713033418, | |
| "grad_norm": 16.60484504699707, | |
| "learning_rate": 1.9254713220560264e-05, | |
| "loss": 3.0212, | |
| "step": 271500 | |
| }, | |
| { | |
| "epoch": 0.6549123543075837, | |
| "grad_norm": 16.275178909301758, | |
| "learning_rate": 1.918783056751269e-05, | |
| "loss": 3.0536, | |
| "step": 272000 | |
| }, | |
| { | |
| "epoch": 0.6561162373118256, | |
| "grad_norm": 18.09239959716797, | |
| "learning_rate": 1.9121081679771208e-05, | |
| "loss": 3.0576, | |
| "step": 272500 | |
| }, | |
| { | |
| "epoch": 0.6573201203160675, | |
| "grad_norm": 17.817174911499023, | |
| "learning_rate": 1.9054199026723634e-05, | |
| "loss": 3.006, | |
| "step": 273000 | |
| }, | |
| { | |
| "epoch": 0.6585240033203094, | |
| "grad_norm": 20.33548355102539, | |
| "learning_rate": 1.8987450138982156e-05, | |
| "loss": 3.0236, | |
| "step": 273500 | |
| }, | |
| { | |
| "epoch": 0.6597278863245513, | |
| "grad_norm": 16.80567169189453, | |
| "learning_rate": 1.892056748593458e-05, | |
| "loss": 3.0272, | |
| "step": 274000 | |
| }, | |
| { | |
| "epoch": 0.660931769328793, | |
| "grad_norm": 14.377747535705566, | |
| "learning_rate": 1.8853684832887004e-05, | |
| "loss": 3.0447, | |
| "step": 274500 | |
| }, | |
| { | |
| "epoch": 0.662135652333035, | |
| "grad_norm": 20.724485397338867, | |
| "learning_rate": 1.878680217983943e-05, | |
| "loss": 3.0422, | |
| "step": 275000 | |
| }, | |
| { | |
| "epoch": 0.662135652333035, | |
| "eval_runtime": 6186.2818, | |
| "eval_samples_per_second": 134.272, | |
| "eval_steps_per_second": 33.568, | |
| "step": 275000 | |
| }, | |
| { | |
| "epoch": 0.6633395353372769, | |
| "grad_norm": 18.72093963623047, | |
| "learning_rate": 1.8719919526791856e-05, | |
| "loss": 3.0455, | |
| "step": 275500 | |
| }, | |
| { | |
| "epoch": 0.6645434183415188, | |
| "grad_norm": 20.733427047729492, | |
| "learning_rate": 1.8653170639050374e-05, | |
| "loss": 3.0217, | |
| "step": 276000 | |
| }, | |
| { | |
| "epoch": 0.6657473013457607, | |
| "grad_norm": 20.21004295349121, | |
| "learning_rate": 1.85862879860028e-05, | |
| "loss": 3.0201, | |
| "step": 276500 | |
| }, | |
| { | |
| "epoch": 0.6669511843500024, | |
| "grad_norm": 16.68962860107422, | |
| "learning_rate": 1.8519405332955226e-05, | |
| "loss": 3.0333, | |
| "step": 277000 | |
| }, | |
| { | |
| "epoch": 0.6681550673542443, | |
| "grad_norm": 16.575241088867188, | |
| "learning_rate": 1.8452522679907648e-05, | |
| "loss": 3.018, | |
| "step": 277500 | |
| }, | |
| { | |
| "epoch": 0.6693589503584862, | |
| "grad_norm": 19.38899803161621, | |
| "learning_rate": 1.8385640026860074e-05, | |
| "loss": 3.0496, | |
| "step": 278000 | |
| }, | |
| { | |
| "epoch": 0.6705628333627282, | |
| "grad_norm": 14.967867851257324, | |
| "learning_rate": 1.831902490442469e-05, | |
| "loss": 2.999, | |
| "step": 278500 | |
| }, | |
| { | |
| "epoch": 0.67176671636697, | |
| "grad_norm": 22.434553146362305, | |
| "learning_rate": 1.8252142251377114e-05, | |
| "loss": 3.0349, | |
| "step": 279000 | |
| }, | |
| { | |
| "epoch": 0.672970599371212, | |
| "grad_norm": 16.710906982421875, | |
| "learning_rate": 1.818525959832954e-05, | |
| "loss": 3.0342, | |
| "step": 279500 | |
| }, | |
| { | |
| "epoch": 0.6741744823754537, | |
| "grad_norm": 15.848820686340332, | |
| "learning_rate": 1.8118376945281966e-05, | |
| "loss": 3.0272, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 0.6741744823754537, | |
| "eval_runtime": 6353.5428, | |
| "eval_samples_per_second": 130.737, | |
| "eval_steps_per_second": 32.684, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 0.6753783653796956, | |
| "grad_norm": 15.844106674194336, | |
| "learning_rate": 1.805149429223439e-05, | |
| "loss": 3.0116, | |
| "step": 280500 | |
| }, | |
| { | |
| "epoch": 0.6765822483839375, | |
| "grad_norm": 19.46364402770996, | |
| "learning_rate": 1.7984611639186817e-05, | |
| "loss": 3.0428, | |
| "step": 281000 | |
| }, | |
| { | |
| "epoch": 0.6777861313881794, | |
| "grad_norm": 16.986345291137695, | |
| "learning_rate": 1.791772898613924e-05, | |
| "loss": 3.0407, | |
| "step": 281500 | |
| }, | |
| { | |
| "epoch": 0.6789900143924213, | |
| "grad_norm": 19.00211524963379, | |
| "learning_rate": 1.7850846333091666e-05, | |
| "loss": 3.0754, | |
| "step": 282000 | |
| }, | |
| { | |
| "epoch": 0.6801938973966632, | |
| "grad_norm": 16.347320556640625, | |
| "learning_rate": 1.778396368004409e-05, | |
| "loss": 3.0583, | |
| "step": 282500 | |
| }, | |
| { | |
| "epoch": 0.681397780400905, | |
| "grad_norm": 17.984121322631836, | |
| "learning_rate": 1.7717081026996517e-05, | |
| "loss": 3.0078, | |
| "step": 283000 | |
| }, | |
| { | |
| "epoch": 0.6826016634051469, | |
| "grad_norm": 13.47775936126709, | |
| "learning_rate": 1.765019837394894e-05, | |
| "loss": 3.0313, | |
| "step": 283500 | |
| }, | |
| { | |
| "epoch": 0.6838055464093888, | |
| "grad_norm": 19.955591201782227, | |
| "learning_rate": 1.7583449486207458e-05, | |
| "loss": 3.0128, | |
| "step": 284000 | |
| }, | |
| { | |
| "epoch": 0.6850094294136307, | |
| "grad_norm": 15.306801795959473, | |
| "learning_rate": 1.7516566833159887e-05, | |
| "loss": 3.0537, | |
| "step": 284500 | |
| }, | |
| { | |
| "epoch": 0.6862133124178726, | |
| "grad_norm": 18.41864013671875, | |
| "learning_rate": 1.744968418011231e-05, | |
| "loss": 2.9884, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 0.6862133124178726, | |
| "eval_runtime": 6358.5857, | |
| "eval_samples_per_second": 130.634, | |
| "eval_steps_per_second": 32.659, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 0.6874171954221144, | |
| "grad_norm": 23.076107025146484, | |
| "learning_rate": 1.7382801527064735e-05, | |
| "loss": 3.0266, | |
| "step": 285500 | |
| }, | |
| { | |
| "epoch": 0.6886210784263563, | |
| "grad_norm": 13.705315589904785, | |
| "learning_rate": 1.7315918874017158e-05, | |
| "loss": 3.0475, | |
| "step": 286000 | |
| }, | |
| { | |
| "epoch": 0.6898249614305982, | |
| "grad_norm": 16.31940460205078, | |
| "learning_rate": 1.7249036220969587e-05, | |
| "loss": 2.9996, | |
| "step": 286500 | |
| }, | |
| { | |
| "epoch": 0.6910288444348401, | |
| "grad_norm": 18.389102935791016, | |
| "learning_rate": 1.718215356792201e-05, | |
| "loss": 3.0546, | |
| "step": 287000 | |
| }, | |
| { | |
| "epoch": 0.692232727439082, | |
| "grad_norm": 13.655202865600586, | |
| "learning_rate": 1.711540468018053e-05, | |
| "loss": 3.0324, | |
| "step": 287500 | |
| }, | |
| { | |
| "epoch": 0.6934366104433239, | |
| "grad_norm": 16.57909393310547, | |
| "learning_rate": 1.7048522027132954e-05, | |
| "loss": 3.0293, | |
| "step": 288000 | |
| }, | |
| { | |
| "epoch": 0.6946404934475657, | |
| "grad_norm": 20.497554779052734, | |
| "learning_rate": 1.6981639374085383e-05, | |
| "loss": 3.0236, | |
| "step": 288500 | |
| }, | |
| { | |
| "epoch": 0.6958443764518076, | |
| "grad_norm": 18.09133529663086, | |
| "learning_rate": 1.6914756721037805e-05, | |
| "loss": 3.0379, | |
| "step": 289000 | |
| }, | |
| { | |
| "epoch": 0.6970482594560495, | |
| "grad_norm": 26.225669860839844, | |
| "learning_rate": 1.684787406799023e-05, | |
| "loss": 3.0053, | |
| "step": 289500 | |
| }, | |
| { | |
| "epoch": 0.6982521424602914, | |
| "grad_norm": 17.222896575927734, | |
| "learning_rate": 1.6780991414942657e-05, | |
| "loss": 2.9939, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 0.6982521424602914, | |
| "eval_runtime": 6305.6016, | |
| "eval_samples_per_second": 131.731, | |
| "eval_steps_per_second": 32.933, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 0.6994560254645333, | |
| "grad_norm": 13.189409255981445, | |
| "learning_rate": 1.6714108761895082e-05, | |
| "loss": 3.0342, | |
| "step": 290500 | |
| }, | |
| { | |
| "epoch": 0.7006599084687752, | |
| "grad_norm": 16.97842025756836, | |
| "learning_rate": 1.6647226108847505e-05, | |
| "loss": 3.046, | |
| "step": 291000 | |
| }, | |
| { | |
| "epoch": 0.701863791473017, | |
| "grad_norm": 22.634611129760742, | |
| "learning_rate": 1.658061098641212e-05, | |
| "loss": 3.0375, | |
| "step": 291500 | |
| }, | |
| { | |
| "epoch": 0.7030676744772589, | |
| "grad_norm": 18.193796157836914, | |
| "learning_rate": 1.6513728333364545e-05, | |
| "loss": 3.0379, | |
| "step": 292000 | |
| }, | |
| { | |
| "epoch": 0.7042715574815008, | |
| "grad_norm": 18.391408920288086, | |
| "learning_rate": 1.644684568031697e-05, | |
| "loss": 2.9838, | |
| "step": 292500 | |
| }, | |
| { | |
| "epoch": 0.7054754404857427, | |
| "grad_norm": 20.497100830078125, | |
| "learning_rate": 1.6380096792575493e-05, | |
| "loss": 3.0761, | |
| "step": 293000 | |
| }, | |
| { | |
| "epoch": 0.7066793234899846, | |
| "grad_norm": 18.94228744506836, | |
| "learning_rate": 1.6313214139527915e-05, | |
| "loss": 3.0614, | |
| "step": 293500 | |
| }, | |
| { | |
| "epoch": 0.7078832064942264, | |
| "grad_norm": 15.402490615844727, | |
| "learning_rate": 1.624633148648034e-05, | |
| "loss": 3.0053, | |
| "step": 294000 | |
| }, | |
| { | |
| "epoch": 0.7090870894984683, | |
| "grad_norm": 26.502038955688477, | |
| "learning_rate": 1.6179448833432767e-05, | |
| "loss": 3.0216, | |
| "step": 294500 | |
| }, | |
| { | |
| "epoch": 0.7102909725027102, | |
| "grad_norm": 20.452205657958984, | |
| "learning_rate": 1.6112566180385193e-05, | |
| "loss": 2.9757, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 0.7102909725027102, | |
| "eval_runtime": 6348.6102, | |
| "eval_samples_per_second": 130.839, | |
| "eval_steps_per_second": 32.71, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 0.7114948555069521, | |
| "grad_norm": 104.5809097290039, | |
| "learning_rate": 1.6045683527337615e-05, | |
| "loss": 3.0088, | |
| "step": 295500 | |
| }, | |
| { | |
| "epoch": 0.712698738511194, | |
| "grad_norm": 15.921069145202637, | |
| "learning_rate": 1.597880087429004e-05, | |
| "loss": 3.0624, | |
| "step": 296000 | |
| }, | |
| { | |
| "epoch": 0.7139026215154359, | |
| "grad_norm": 11.739727020263672, | |
| "learning_rate": 1.5911918221242467e-05, | |
| "loss": 3.0515, | |
| "step": 296500 | |
| }, | |
| { | |
| "epoch": 0.7151065045196777, | |
| "grad_norm": 15.340862274169922, | |
| "learning_rate": 1.5845169333500985e-05, | |
| "loss": 3.0208, | |
| "step": 297000 | |
| }, | |
| { | |
| "epoch": 0.7163103875239196, | |
| "grad_norm": 16.77552604675293, | |
| "learning_rate": 1.577828668045341e-05, | |
| "loss": 3.0112, | |
| "step": 297500 | |
| }, | |
| { | |
| "epoch": 0.7175142705281615, | |
| "grad_norm": 19.09606170654297, | |
| "learning_rate": 1.5711404027405837e-05, | |
| "loss": 3.0038, | |
| "step": 298000 | |
| }, | |
| { | |
| "epoch": 0.7187181535324034, | |
| "grad_norm": 12.892488479614258, | |
| "learning_rate": 1.5644521374358262e-05, | |
| "loss": 3.0353, | |
| "step": 298500 | |
| }, | |
| { | |
| "epoch": 0.7199220365366453, | |
| "grad_norm": 15.720181465148926, | |
| "learning_rate": 1.5577638721310685e-05, | |
| "loss": 3.003, | |
| "step": 299000 | |
| }, | |
| { | |
| "epoch": 0.7211259195408872, | |
| "grad_norm": 16.5432186126709, | |
| "learning_rate": 1.5510756068263114e-05, | |
| "loss": 3.0594, | |
| "step": 299500 | |
| }, | |
| { | |
| "epoch": 0.722329802545129, | |
| "grad_norm": 24.2777042388916, | |
| "learning_rate": 1.5443873415215536e-05, | |
| "loss": 3.0239, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 0.722329802545129, | |
| "eval_runtime": 6203.3821, | |
| "eval_samples_per_second": 133.902, | |
| "eval_steps_per_second": 33.476, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 0.7235336855493709, | |
| "grad_norm": 14.297070503234863, | |
| "learning_rate": 1.5376990762167962e-05, | |
| "loss": 3.0123, | |
| "step": 300500 | |
| }, | |
| { | |
| "epoch": 0.7247375685536128, | |
| "grad_norm": 18.216154098510742, | |
| "learning_rate": 1.5310108109120384e-05, | |
| "loss": 2.9833, | |
| "step": 301000 | |
| }, | |
| { | |
| "epoch": 0.7259414515578547, | |
| "grad_norm": 15.619494438171387, | |
| "learning_rate": 1.5243359221378908e-05, | |
| "loss": 3.0715, | |
| "step": 301500 | |
| }, | |
| { | |
| "epoch": 0.7271453345620966, | |
| "grad_norm": 22.748498916625977, | |
| "learning_rate": 1.5176476568331332e-05, | |
| "loss": 3.0101, | |
| "step": 302000 | |
| }, | |
| { | |
| "epoch": 0.7283492175663384, | |
| "grad_norm": 16.824371337890625, | |
| "learning_rate": 1.5109593915283756e-05, | |
| "loss": 3.0347, | |
| "step": 302500 | |
| }, | |
| { | |
| "epoch": 0.7295531005705803, | |
| "grad_norm": 15.611109733581543, | |
| "learning_rate": 1.504271126223618e-05, | |
| "loss": 3.0386, | |
| "step": 303000 | |
| }, | |
| { | |
| "epoch": 0.7307569835748222, | |
| "grad_norm": 17.015262603759766, | |
| "learning_rate": 1.4975962374494704e-05, | |
| "loss": 3.0148, | |
| "step": 303500 | |
| }, | |
| { | |
| "epoch": 0.7319608665790641, | |
| "grad_norm": 18.96904945373535, | |
| "learning_rate": 1.4909079721447128e-05, | |
| "loss": 3.1005, | |
| "step": 304000 | |
| }, | |
| { | |
| "epoch": 0.733164749583306, | |
| "grad_norm": 21.718101501464844, | |
| "learning_rate": 1.4842197068399552e-05, | |
| "loss": 3.0489, | |
| "step": 304500 | |
| }, | |
| { | |
| "epoch": 0.7343686325875479, | |
| "grad_norm": 14.246601104736328, | |
| "learning_rate": 1.4775314415351976e-05, | |
| "loss": 3.0439, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 0.7343686325875479, | |
| "eval_runtime": 6028.9355, | |
| "eval_samples_per_second": 137.776, | |
| "eval_steps_per_second": 34.444, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 0.7355725155917897, | |
| "grad_norm": 16.374101638793945, | |
| "learning_rate": 1.4708431762304404e-05, | |
| "loss": 2.9875, | |
| "step": 305500 | |
| }, | |
| { | |
| "epoch": 0.7367763985960316, | |
| "grad_norm": 21.80797004699707, | |
| "learning_rate": 1.4641549109256828e-05, | |
| "loss": 3.0288, | |
| "step": 306000 | |
| }, | |
| { | |
| "epoch": 0.7379802816002735, | |
| "grad_norm": 14.981256484985352, | |
| "learning_rate": 1.4574666456209252e-05, | |
| "loss": 3.0079, | |
| "step": 306500 | |
| }, | |
| { | |
| "epoch": 0.7391841646045154, | |
| "grad_norm": 15.336825370788574, | |
| "learning_rate": 1.4507783803161679e-05, | |
| "loss": 3.0317, | |
| "step": 307000 | |
| }, | |
| { | |
| "epoch": 0.7403880476087573, | |
| "grad_norm": 16.014474868774414, | |
| "learning_rate": 1.4440901150114103e-05, | |
| "loss": 3.0247, | |
| "step": 307500 | |
| }, | |
| { | |
| "epoch": 0.7415919306129992, | |
| "grad_norm": 14.997090339660645, | |
| "learning_rate": 1.4374152262372622e-05, | |
| "loss": 3.0177, | |
| "step": 308000 | |
| }, | |
| { | |
| "epoch": 0.742795813617241, | |
| "grad_norm": 17.185972213745117, | |
| "learning_rate": 1.4307269609325048e-05, | |
| "loss": 3.021, | |
| "step": 308500 | |
| }, | |
| { | |
| "epoch": 0.7439996966214829, | |
| "grad_norm": 14.902591705322266, | |
| "learning_rate": 1.4240386956277473e-05, | |
| "loss": 3.0291, | |
| "step": 309000 | |
| }, | |
| { | |
| "epoch": 0.7452035796257248, | |
| "grad_norm": 17.680278778076172, | |
| "learning_rate": 1.4173504303229897e-05, | |
| "loss": 3.0205, | |
| "step": 309500 | |
| }, | |
| { | |
| "epoch": 0.7464074626299667, | |
| "grad_norm": 18.492225646972656, | |
| "learning_rate": 1.4106621650182321e-05, | |
| "loss": 3.012, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 0.7464074626299667, | |
| "eval_runtime": 6370.9929, | |
| "eval_samples_per_second": 130.379, | |
| "eval_steps_per_second": 32.595, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 0.7476113456342086, | |
| "grad_norm": 18.544729232788086, | |
| "learning_rate": 1.4039872762440842e-05, | |
| "loss": 2.9965, | |
| "step": 310500 | |
| }, | |
| { | |
| "epoch": 0.7488152286384504, | |
| "grad_norm": 19.649858474731445, | |
| "learning_rate": 1.397299010939327e-05, | |
| "loss": 3.0335, | |
| "step": 311000 | |
| }, | |
| { | |
| "epoch": 0.7500191116426923, | |
| "grad_norm": 19.35677146911621, | |
| "learning_rate": 1.3906107456345693e-05, | |
| "loss": 3.0426, | |
| "step": 311500 | |
| }, | |
| { | |
| "epoch": 0.7512229946469342, | |
| "grad_norm": 19.635725021362305, | |
| "learning_rate": 1.3839224803298117e-05, | |
| "loss": 3.0506, | |
| "step": 312000 | |
| }, | |
| { | |
| "epoch": 0.7524268776511761, | |
| "grad_norm": 16.11264991760254, | |
| "learning_rate": 1.3772342150250541e-05, | |
| "loss": 3.0185, | |
| "step": 312500 | |
| }, | |
| { | |
| "epoch": 0.753630760655418, | |
| "grad_norm": 16.436038970947266, | |
| "learning_rate": 1.3705593262509065e-05, | |
| "loss": 2.9902, | |
| "step": 313000 | |
| }, | |
| { | |
| "epoch": 0.7548346436596599, | |
| "grad_norm": 15.412540435791016, | |
| "learning_rate": 1.363871060946149e-05, | |
| "loss": 2.987, | |
| "step": 313500 | |
| }, | |
| { | |
| "epoch": 0.7560385266639017, | |
| "grad_norm": 15.1536283493042, | |
| "learning_rate": 1.3571827956413913e-05, | |
| "loss": 2.9802, | |
| "step": 314000 | |
| }, | |
| { | |
| "epoch": 0.7572424096681436, | |
| "grad_norm": 12.424234390258789, | |
| "learning_rate": 1.3504945303366337e-05, | |
| "loss": 3.0389, | |
| "step": 314500 | |
| }, | |
| { | |
| "epoch": 0.7584462926723855, | |
| "grad_norm": 18.4250431060791, | |
| "learning_rate": 1.3438062650318765e-05, | |
| "loss": 3.0125, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 0.7584462926723855, | |
| "eval_runtime": 6375.7867, | |
| "eval_samples_per_second": 130.281, | |
| "eval_steps_per_second": 32.57, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 0.7596501756766274, | |
| "grad_norm": 16.10649299621582, | |
| "learning_rate": 1.3371313762577283e-05, | |
| "loss": 2.9806, | |
| "step": 315500 | |
| }, | |
| { | |
| "epoch": 0.7608540586808693, | |
| "grad_norm": 20.46068572998047, | |
| "learning_rate": 1.3304431109529707e-05, | |
| "loss": 3.0044, | |
| "step": 316000 | |
| }, | |
| { | |
| "epoch": 0.7620579416851112, | |
| "grad_norm": 13.980119705200195, | |
| "learning_rate": 1.3237548456482131e-05, | |
| "loss": 3.0349, | |
| "step": 316500 | |
| }, | |
| { | |
| "epoch": 0.763261824689353, | |
| "grad_norm": 14.805524826049805, | |
| "learning_rate": 1.3170665803434559e-05, | |
| "loss": 3.0352, | |
| "step": 317000 | |
| }, | |
| { | |
| "epoch": 0.7644657076935949, | |
| "grad_norm": 17.586395263671875, | |
| "learning_rate": 1.3103783150386983e-05, | |
| "loss": 3.0501, | |
| "step": 317500 | |
| }, | |
| { | |
| "epoch": 0.7656695906978368, | |
| "grad_norm": 17.75722312927246, | |
| "learning_rate": 1.3036900497339407e-05, | |
| "loss": 3.0598, | |
| "step": 318000 | |
| }, | |
| { | |
| "epoch": 0.7668734737020787, | |
| "grad_norm": 22.714632034301758, | |
| "learning_rate": 1.2970017844291834e-05, | |
| "loss": 3.0555, | |
| "step": 318500 | |
| }, | |
| { | |
| "epoch": 0.7680773567063206, | |
| "grad_norm": 13.692117691040039, | |
| "learning_rate": 1.2903268956550355e-05, | |
| "loss": 3.0004, | |
| "step": 319000 | |
| }, | |
| { | |
| "epoch": 0.7692812397105625, | |
| "grad_norm": 15.780096054077148, | |
| "learning_rate": 1.2836386303502779e-05, | |
| "loss": 3.0278, | |
| "step": 319500 | |
| }, | |
| { | |
| "epoch": 0.7704851227148043, | |
| "grad_norm": 22.532176971435547, | |
| "learning_rate": 1.2769503650455203e-05, | |
| "loss": 3.045, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 0.7704851227148043, | |
| "eval_runtime": 6361.4309, | |
| "eval_samples_per_second": 130.575, | |
| "eval_steps_per_second": 32.644, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 0.7716890057190462, | |
| "grad_norm": 16.199644088745117, | |
| "learning_rate": 1.270262099740763e-05, | |
| "loss": 3.03, | |
| "step": 320500 | |
| }, | |
| { | |
| "epoch": 0.7728928887232881, | |
| "grad_norm": 23.411863327026367, | |
| "learning_rate": 1.2635738344360054e-05, | |
| "loss": 3.0227, | |
| "step": 321000 | |
| }, | |
| { | |
| "epoch": 0.77409677172753, | |
| "grad_norm": 14.578089714050293, | |
| "learning_rate": 1.2568989456618575e-05, | |
| "loss": 3.0099, | |
| "step": 321500 | |
| }, | |
| { | |
| "epoch": 0.7753006547317719, | |
| "grad_norm": 22.472322463989258, | |
| "learning_rate": 1.2502106803570999e-05, | |
| "loss": 3.0347, | |
| "step": 322000 | |
| }, | |
| { | |
| "epoch": 0.7765045377360137, | |
| "grad_norm": 12.440498352050781, | |
| "learning_rate": 1.2435224150523425e-05, | |
| "loss": 2.9987, | |
| "step": 322500 | |
| }, | |
| { | |
| "epoch": 0.7777084207402556, | |
| "grad_norm": 20.633949279785156, | |
| "learning_rate": 1.2368341497475849e-05, | |
| "loss": 3.0421, | |
| "step": 323000 | |
| }, | |
| { | |
| "epoch": 0.7789123037444975, | |
| "grad_norm": 17.52497673034668, | |
| "learning_rate": 1.2301458844428274e-05, | |
| "loss": 3.0747, | |
| "step": 323500 | |
| }, | |
| { | |
| "epoch": 0.7801161867487394, | |
| "grad_norm": 19.617210388183594, | |
| "learning_rate": 1.2234576191380698e-05, | |
| "loss": 2.9955, | |
| "step": 324000 | |
| }, | |
| { | |
| "epoch": 0.7813200697529813, | |
| "grad_norm": 16.269994735717773, | |
| "learning_rate": 1.2167827303639219e-05, | |
| "loss": 2.94, | |
| "step": 324500 | |
| }, | |
| { | |
| "epoch": 0.7825239527572232, | |
| "grad_norm": 13.604962348937988, | |
| "learning_rate": 1.2100944650591644e-05, | |
| "loss": 3.0736, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 0.7825239527572232, | |
| "eval_runtime": 6376.4381, | |
| "eval_samples_per_second": 130.268, | |
| "eval_steps_per_second": 32.567, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 0.783727835761465, | |
| "grad_norm": 20.704360961914062, | |
| "learning_rate": 1.203406199754407e-05, | |
| "loss": 3.0536, | |
| "step": 325500 | |
| }, | |
| { | |
| "epoch": 0.7849317187657069, | |
| "grad_norm": 14.824162483215332, | |
| "learning_rate": 1.1967179344496494e-05, | |
| "loss": 3.0263, | |
| "step": 326000 | |
| }, | |
| { | |
| "epoch": 0.7861356017699488, | |
| "grad_norm": 16.627286911010742, | |
| "learning_rate": 1.190029669144892e-05, | |
| "loss": 3.0037, | |
| "step": 326500 | |
| }, | |
| { | |
| "epoch": 0.7873394847741907, | |
| "grad_norm": 13.925793647766113, | |
| "learning_rate": 1.183354780370744e-05, | |
| "loss": 3.0127, | |
| "step": 327000 | |
| }, | |
| { | |
| "epoch": 0.7885433677784326, | |
| "grad_norm": 19.544754028320312, | |
| "learning_rate": 1.1766665150659866e-05, | |
| "loss": 3.0307, | |
| "step": 327500 | |
| }, | |
| { | |
| "epoch": 0.7897472507826745, | |
| "grad_norm": 13.963886260986328, | |
| "learning_rate": 1.169978249761229e-05, | |
| "loss": 3.034, | |
| "step": 328000 | |
| }, | |
| { | |
| "epoch": 0.7909511337869163, | |
| "grad_norm": 17.435409545898438, | |
| "learning_rate": 1.1632899844564716e-05, | |
| "loss": 3.0295, | |
| "step": 328500 | |
| }, | |
| { | |
| "epoch": 0.7921550167911582, | |
| "grad_norm": 17.950336456298828, | |
| "learning_rate": 1.156601719151714e-05, | |
| "loss": 3.0332, | |
| "step": 329000 | |
| }, | |
| { | |
| "epoch": 0.7933588997954001, | |
| "grad_norm": 18.523168563842773, | |
| "learning_rate": 1.1499134538469566e-05, | |
| "loss": 3.0235, | |
| "step": 329500 | |
| }, | |
| { | |
| "epoch": 0.794562782799642, | |
| "grad_norm": 14.469148635864258, | |
| "learning_rate": 1.143225188542199e-05, | |
| "loss": 3.0022, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 0.794562782799642, | |
| "eval_runtime": 6383.675, | |
| "eval_samples_per_second": 130.12, | |
| "eval_steps_per_second": 32.53, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 0.7957666658038839, | |
| "grad_norm": 17.111066818237305, | |
| "learning_rate": 1.136550299768051e-05, | |
| "loss": 3.0552, | |
| "step": 330500 | |
| }, | |
| { | |
| "epoch": 0.7969705488081257, | |
| "grad_norm": 15.104440689086914, | |
| "learning_rate": 1.1298620344632934e-05, | |
| "loss": 3.0274, | |
| "step": 331000 | |
| }, | |
| { | |
| "epoch": 0.7981744318123676, | |
| "grad_norm": 16.809152603149414, | |
| "learning_rate": 1.123173769158536e-05, | |
| "loss": 3.0156, | |
| "step": 331500 | |
| }, | |
| { | |
| "epoch": 0.7993783148166095, | |
| "grad_norm": 16.31627655029297, | |
| "learning_rate": 1.1164855038537784e-05, | |
| "loss": 3.0302, | |
| "step": 332000 | |
| }, | |
| { | |
| "epoch": 0.8005821978208514, | |
| "grad_norm": 14.074172019958496, | |
| "learning_rate": 1.109797238549021e-05, | |
| "loss": 3.0415, | |
| "step": 332500 | |
| }, | |
| { | |
| "epoch": 0.8017860808250933, | |
| "grad_norm": 26.245460510253906, | |
| "learning_rate": 1.1031089732442635e-05, | |
| "loss": 3.0031, | |
| "step": 333000 | |
| }, | |
| { | |
| "epoch": 0.8029899638293352, | |
| "grad_norm": 30.44843864440918, | |
| "learning_rate": 1.0964340844701156e-05, | |
| "loss": 3.0017, | |
| "step": 333500 | |
| }, | |
| { | |
| "epoch": 0.804193846833577, | |
| "grad_norm": 17.4643611907959, | |
| "learning_rate": 1.0897458191653582e-05, | |
| "loss": 3.0633, | |
| "step": 334000 | |
| }, | |
| { | |
| "epoch": 0.8053977298378189, | |
| "grad_norm": 31.82565689086914, | |
| "learning_rate": 1.0830575538606006e-05, | |
| "loss": 3.043, | |
| "step": 334500 | |
| }, | |
| { | |
| "epoch": 0.8066016128420608, | |
| "grad_norm": 17.253402709960938, | |
| "learning_rate": 1.0763692885558431e-05, | |
| "loss": 3.0325, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 0.8066016128420608, | |
| "eval_runtime": 6315.4758, | |
| "eval_samples_per_second": 131.525, | |
| "eval_steps_per_second": 32.881, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 0.8078054958463027, | |
| "grad_norm": 22.236631393432617, | |
| "learning_rate": 1.0696810232510855e-05, | |
| "loss": 3.0358, | |
| "step": 335500 | |
| }, | |
| { | |
| "epoch": 0.8090093788505446, | |
| "grad_norm": 14.467453956604004, | |
| "learning_rate": 1.0629927579463281e-05, | |
| "loss": 2.9967, | |
| "step": 336000 | |
| }, | |
| { | |
| "epoch": 0.8102132618547865, | |
| "grad_norm": 23.571836471557617, | |
| "learning_rate": 1.0563044926415705e-05, | |
| "loss": 3.0579, | |
| "step": 336500 | |
| }, | |
| { | |
| "epoch": 0.8114171448590283, | |
| "grad_norm": 19.492727279663086, | |
| "learning_rate": 1.0496162273368131e-05, | |
| "loss": 3.0471, | |
| "step": 337000 | |
| }, | |
| { | |
| "epoch": 0.8126210278632702, | |
| "grad_norm": 14.599898338317871, | |
| "learning_rate": 1.0429413385626651e-05, | |
| "loss": 3.0066, | |
| "step": 337500 | |
| }, | |
| { | |
| "epoch": 0.8138249108675121, | |
| "grad_norm": 17.604732513427734, | |
| "learning_rate": 1.0362530732579075e-05, | |
| "loss": 3.0106, | |
| "step": 338000 | |
| }, | |
| { | |
| "epoch": 0.815028793871754, | |
| "grad_norm": 15.079025268554688, | |
| "learning_rate": 1.0295781844837596e-05, | |
| "loss": 3.006, | |
| "step": 338500 | |
| }, | |
| { | |
| "epoch": 0.8162326768759959, | |
| "grad_norm": 17.019149780273438, | |
| "learning_rate": 1.0228899191790021e-05, | |
| "loss": 3.0254, | |
| "step": 339000 | |
| }, | |
| { | |
| "epoch": 0.8174365598802377, | |
| "grad_norm": 15.817625045776367, | |
| "learning_rate": 1.0162016538742445e-05, | |
| "loss": 3.002, | |
| "step": 339500 | |
| }, | |
| { | |
| "epoch": 0.8186404428844796, | |
| "grad_norm": 13.755847930908203, | |
| "learning_rate": 1.0095133885694871e-05, | |
| "loss": 3.0058, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 0.8186404428844796, | |
| "eval_runtime": 6241.8468, | |
| "eval_samples_per_second": 133.077, | |
| "eval_steps_per_second": 33.269, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 0.8198443258887215, | |
| "grad_norm": 16.21925926208496, | |
| "learning_rate": 1.0028251232647295e-05, | |
| "loss": 3.0572, | |
| "step": 340500 | |
| }, | |
| { | |
| "epoch": 0.8210482088929634, | |
| "grad_norm": 17.245609283447266, | |
| "learning_rate": 9.961502344905817e-06, | |
| "loss": 3.0659, | |
| "step": 341000 | |
| }, | |
| { | |
| "epoch": 0.8222520918972053, | |
| "grad_norm": 17.12338638305664, | |
| "learning_rate": 9.894619691858241e-06, | |
| "loss": 3.0002, | |
| "step": 341500 | |
| }, | |
| { | |
| "epoch": 0.8234559749014472, | |
| "grad_norm": 13.26212215423584, | |
| "learning_rate": 9.827737038810667e-06, | |
| "loss": 2.9828, | |
| "step": 342000 | |
| }, | |
| { | |
| "epoch": 0.824659857905689, | |
| "grad_norm": 20.169322967529297, | |
| "learning_rate": 9.760854385763091e-06, | |
| "loss": 2.9912, | |
| "step": 342500 | |
| }, | |
| { | |
| "epoch": 0.8258637409099309, | |
| "grad_norm": 18.99537467956543, | |
| "learning_rate": 9.693971732715517e-06, | |
| "loss": 3.0485, | |
| "step": 343000 | |
| }, | |
| { | |
| "epoch": 0.8270676239141728, | |
| "grad_norm": 27.021839141845703, | |
| "learning_rate": 9.627089079667943e-06, | |
| "loss": 3.029, | |
| "step": 343500 | |
| }, | |
| { | |
| "epoch": 0.8282715069184147, | |
| "grad_norm": 21.197938919067383, | |
| "learning_rate": 9.560206426620367e-06, | |
| "loss": 3.058, | |
| "step": 344000 | |
| }, | |
| { | |
| "epoch": 0.8294753899226566, | |
| "grad_norm": 15.80473518371582, | |
| "learning_rate": 9.493457538878885e-06, | |
| "loss": 3.0378, | |
| "step": 344500 | |
| }, | |
| { | |
| "epoch": 0.8306792729268985, | |
| "grad_norm": 20.992782592773438, | |
| "learning_rate": 9.426574885831311e-06, | |
| "loss": 3.042, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 0.8306792729268985, | |
| "eval_runtime": 6237.488, | |
| "eval_samples_per_second": 133.17, | |
| "eval_steps_per_second": 33.293, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 0.8318831559311403, | |
| "grad_norm": 15.700128555297852, | |
| "learning_rate": 9.359692232783737e-06, | |
| "loss": 3.007, | |
| "step": 345500 | |
| }, | |
| { | |
| "epoch": 0.8330870389353822, | |
| "grad_norm": 15.391378402709961, | |
| "learning_rate": 9.292809579736161e-06, | |
| "loss": 3.0211, | |
| "step": 346000 | |
| }, | |
| { | |
| "epoch": 0.8342909219396241, | |
| "grad_norm": 17.32360076904297, | |
| "learning_rate": 9.225926926688587e-06, | |
| "loss": 3.0727, | |
| "step": 346500 | |
| }, | |
| { | |
| "epoch": 0.835494804943866, | |
| "grad_norm": 15.85698127746582, | |
| "learning_rate": 9.159178038947107e-06, | |
| "loss": 3.0066, | |
| "step": 347000 | |
| }, | |
| { | |
| "epoch": 0.8366986879481079, | |
| "grad_norm": 15.092347145080566, | |
| "learning_rate": 9.092295385899533e-06, | |
| "loss": 3.0106, | |
| "step": 347500 | |
| }, | |
| { | |
| "epoch": 0.8379025709523497, | |
| "grad_norm": 14.47977352142334, | |
| "learning_rate": 9.025412732851957e-06, | |
| "loss": 3.0139, | |
| "step": 348000 | |
| }, | |
| { | |
| "epoch": 0.8391064539565916, | |
| "grad_norm": 12.257486343383789, | |
| "learning_rate": 8.958530079804383e-06, | |
| "loss": 3.0264, | |
| "step": 348500 | |
| }, | |
| { | |
| "epoch": 0.8403103369608335, | |
| "grad_norm": 17.00981330871582, | |
| "learning_rate": 8.891781192062903e-06, | |
| "loss": 3.0321, | |
| "step": 349000 | |
| }, | |
| { | |
| "epoch": 0.8415142199650754, | |
| "grad_norm": 17.08600616455078, | |
| "learning_rate": 8.824898539015327e-06, | |
| "loss": 3.0046, | |
| "step": 349500 | |
| }, | |
| { | |
| "epoch": 0.8427181029693173, | |
| "grad_norm": 14.907938003540039, | |
| "learning_rate": 8.758015885967753e-06, | |
| "loss": 3.0485, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 0.8427181029693173, | |
| "eval_runtime": 6388.9903, | |
| "eval_samples_per_second": 130.012, | |
| "eval_steps_per_second": 32.503, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 0.8439219859735592, | |
| "grad_norm": 14.369677543640137, | |
| "learning_rate": 8.691133232920177e-06, | |
| "loss": 3.0205, | |
| "step": 350500 | |
| }, | |
| { | |
| "epoch": 0.845125868977801, | |
| "grad_norm": 19.901779174804688, | |
| "learning_rate": 8.624250579872602e-06, | |
| "loss": 3.0481, | |
| "step": 351000 | |
| }, | |
| { | |
| "epoch": 0.8463297519820429, | |
| "grad_norm": 14.823498725891113, | |
| "learning_rate": 8.557367926825027e-06, | |
| "loss": 2.9577, | |
| "step": 351500 | |
| }, | |
| { | |
| "epoch": 0.8475336349862848, | |
| "grad_norm": 19.70775032043457, | |
| "learning_rate": 8.490485273777452e-06, | |
| "loss": 3.0341, | |
| "step": 352000 | |
| }, | |
| { | |
| "epoch": 0.8487375179905267, | |
| "grad_norm": 17.01579475402832, | |
| "learning_rate": 8.423736386035973e-06, | |
| "loss": 2.9874, | |
| "step": 352500 | |
| }, | |
| { | |
| "epoch": 0.8499414009947686, | |
| "grad_norm": 16.942848205566406, | |
| "learning_rate": 8.356853732988397e-06, | |
| "loss": 3.0226, | |
| "step": 353000 | |
| }, | |
| { | |
| "epoch": 0.8511452839990105, | |
| "grad_norm": 16.905664443969727, | |
| "learning_rate": 8.289971079940822e-06, | |
| "loss": 2.9484, | |
| "step": 353500 | |
| }, | |
| { | |
| "epoch": 0.8523491670032523, | |
| "grad_norm": 15.149470329284668, | |
| "learning_rate": 8.223088426893248e-06, | |
| "loss": 2.9945, | |
| "step": 354000 | |
| }, | |
| { | |
| "epoch": 0.8535530500074942, | |
| "grad_norm": 21.70083236694336, | |
| "learning_rate": 8.156205773845672e-06, | |
| "loss": 3.0103, | |
| "step": 354500 | |
| }, | |
| { | |
| "epoch": 0.854756933011736, | |
| "grad_norm": 12.760059356689453, | |
| "learning_rate": 8.089323120798098e-06, | |
| "loss": 3.0178, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 0.854756933011736, | |
| "eval_runtime": 6293.9927, | |
| "eval_samples_per_second": 131.974, | |
| "eval_steps_per_second": 32.994, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 0.855960816015978, | |
| "grad_norm": 28.85261344909668, | |
| "learning_rate": 8.022440467750522e-06, | |
| "loss": 2.9688, | |
| "step": 355500 | |
| }, | |
| { | |
| "epoch": 0.8571646990202199, | |
| "grad_norm": 13.942831039428711, | |
| "learning_rate": 7.955557814702948e-06, | |
| "loss": 2.9447, | |
| "step": 356000 | |
| }, | |
| { | |
| "epoch": 0.8583685820244616, | |
| "grad_norm": 14.091262817382812, | |
| "learning_rate": 7.888808926961468e-06, | |
| "loss": 2.9865, | |
| "step": 356500 | |
| }, | |
| { | |
| "epoch": 0.8595724650287035, | |
| "grad_norm": 19.63146209716797, | |
| "learning_rate": 7.821926273913894e-06, | |
| "loss": 3.0031, | |
| "step": 357000 | |
| }, | |
| { | |
| "epoch": 0.8607763480329454, | |
| "grad_norm": 12.868454933166504, | |
| "learning_rate": 7.755043620866318e-06, | |
| "loss": 2.9701, | |
| "step": 357500 | |
| }, | |
| { | |
| "epoch": 0.8619802310371873, | |
| "grad_norm": 18.4489803314209, | |
| "learning_rate": 7.688160967818744e-06, | |
| "loss": 2.9628, | |
| "step": 358000 | |
| }, | |
| { | |
| "epoch": 0.8631841140414293, | |
| "grad_norm": 14.441180229187012, | |
| "learning_rate": 7.6212783147711685e-06, | |
| "loss": 3.0001, | |
| "step": 358500 | |
| }, | |
| { | |
| "epoch": 0.8643879970456712, | |
| "grad_norm": 14.59991455078125, | |
| "learning_rate": 7.554529427029688e-06, | |
| "loss": 3.0118, | |
| "step": 359000 | |
| }, | |
| { | |
| "epoch": 0.8655918800499129, | |
| "grad_norm": 24.200435638427734, | |
| "learning_rate": 7.487646773982113e-06, | |
| "loss": 3.0567, | |
| "step": 359500 | |
| }, | |
| { | |
| "epoch": 0.8667957630541548, | |
| "grad_norm": 17.150327682495117, | |
| "learning_rate": 7.420764120934539e-06, | |
| "loss": 3.0472, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 0.8667957630541548, | |
| "eval_runtime": 6262.3633, | |
| "eval_samples_per_second": 132.641, | |
| "eval_steps_per_second": 33.16, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 0.8679996460583967, | |
| "grad_norm": 20.363269805908203, | |
| "learning_rate": 7.353881467886964e-06, | |
| "loss": 2.9548, | |
| "step": 360500 | |
| }, | |
| { | |
| "epoch": 0.8692035290626386, | |
| "grad_norm": 16.118206024169922, | |
| "learning_rate": 7.2869988148393885e-06, | |
| "loss": 3.0507, | |
| "step": 361000 | |
| }, | |
| { | |
| "epoch": 0.8704074120668805, | |
| "grad_norm": 16.389257431030273, | |
| "learning_rate": 7.220116161791813e-06, | |
| "loss": 3.0169, | |
| "step": 361500 | |
| }, | |
| { | |
| "epoch": 0.8716112950711224, | |
| "grad_norm": 15.485569953918457, | |
| "learning_rate": 7.153233508744238e-06, | |
| "loss": 3.016, | |
| "step": 362000 | |
| }, | |
| { | |
| "epoch": 0.8728151780753642, | |
| "grad_norm": 18.530200958251953, | |
| "learning_rate": 7.086350855696663e-06, | |
| "loss": 3.0083, | |
| "step": 362500 | |
| }, | |
| { | |
| "epoch": 0.8740190610796061, | |
| "grad_norm": 14.700156211853027, | |
| "learning_rate": 7.0196019679551835e-06, | |
| "loss": 2.9861, | |
| "step": 363000 | |
| }, | |
| { | |
| "epoch": 0.875222944083848, | |
| "grad_norm": 19.87506675720215, | |
| "learning_rate": 6.952719314907609e-06, | |
| "loss": 3.0287, | |
| "step": 363500 | |
| }, | |
| { | |
| "epoch": 0.8764268270880899, | |
| "grad_norm": 25.59213638305664, | |
| "learning_rate": 6.885836661860033e-06, | |
| "loss": 3.0149, | |
| "step": 364000 | |
| }, | |
| { | |
| "epoch": 0.8776307100923318, | |
| "grad_norm": 16.81450653076172, | |
| "learning_rate": 6.818954008812459e-06, | |
| "loss": 3.0167, | |
| "step": 364500 | |
| }, | |
| { | |
| "epoch": 0.8788345930965736, | |
| "grad_norm": 20.761167526245117, | |
| "learning_rate": 6.7522051210709786e-06, | |
| "loss": 3.037, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 0.8788345930965736, | |
| "eval_runtime": 6325.2523, | |
| "eval_samples_per_second": 131.322, | |
| "eval_steps_per_second": 32.831, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 0.8800384761008155, | |
| "grad_norm": 18.997737884521484, | |
| "learning_rate": 6.685322468023404e-06, | |
| "loss": 3.0299, | |
| "step": 365500 | |
| }, | |
| { | |
| "epoch": 0.8812423591050574, | |
| "grad_norm": 18.71440315246582, | |
| "learning_rate": 6.618439814975828e-06, | |
| "loss": 3.018, | |
| "step": 366000 | |
| }, | |
| { | |
| "epoch": 0.8824462421092993, | |
| "grad_norm": 17.6945858001709, | |
| "learning_rate": 6.551557161928254e-06, | |
| "loss": 3.0215, | |
| "step": 366500 | |
| }, | |
| { | |
| "epoch": 0.8836501251135412, | |
| "grad_norm": 17.693279266357422, | |
| "learning_rate": 6.48467450888068e-06, | |
| "loss": 3.0327, | |
| "step": 367000 | |
| }, | |
| { | |
| "epoch": 0.8848540081177831, | |
| "grad_norm": 12.849013328552246, | |
| "learning_rate": 6.417791855833104e-06, | |
| "loss": 3.0219, | |
| "step": 367500 | |
| }, | |
| { | |
| "epoch": 0.8860578911220249, | |
| "grad_norm": 15.688241958618164, | |
| "learning_rate": 6.351042968091623e-06, | |
| "loss": 3.0481, | |
| "step": 368000 | |
| }, | |
| { | |
| "epoch": 0.8872617741262668, | |
| "grad_norm": 16.61380958557129, | |
| "learning_rate": 6.284160315044049e-06, | |
| "loss": 2.9957, | |
| "step": 368500 | |
| }, | |
| { | |
| "epoch": 0.8884656571305087, | |
| "grad_norm": 14.891318321228027, | |
| "learning_rate": 6.217277661996474e-06, | |
| "loss": 3.0474, | |
| "step": 369000 | |
| }, | |
| { | |
| "epoch": 0.8896695401347506, | |
| "grad_norm": 20.029443740844727, | |
| "learning_rate": 6.150395008948899e-06, | |
| "loss": 3.0241, | |
| "step": 369500 | |
| }, | |
| { | |
| "epoch": 0.8908734231389925, | |
| "grad_norm": 13.43873119354248, | |
| "learning_rate": 6.083512355901325e-06, | |
| "loss": 3.0318, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 0.8908734231389925, | |
| "eval_runtime": 6403.6333, | |
| "eval_samples_per_second": 129.715, | |
| "eval_steps_per_second": 32.429, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 0.8920773061432344, | |
| "grad_norm": 16.173236846923828, | |
| "learning_rate": 6.016763468159844e-06, | |
| "loss": 3.0562, | |
| "step": 370500 | |
| }, | |
| { | |
| "epoch": 0.8932811891474762, | |
| "grad_norm": 21.55840301513672, | |
| "learning_rate": 5.949880815112269e-06, | |
| "loss": 2.9967, | |
| "step": 371000 | |
| }, | |
| { | |
| "epoch": 0.8944850721517181, | |
| "grad_norm": 15.276843070983887, | |
| "learning_rate": 5.882998162064694e-06, | |
| "loss": 3.0263, | |
| "step": 371500 | |
| }, | |
| { | |
| "epoch": 0.89568895515596, | |
| "grad_norm": 15.087631225585938, | |
| "learning_rate": 5.81611550901712e-06, | |
| "loss": 2.9793, | |
| "step": 372000 | |
| }, | |
| { | |
| "epoch": 0.8968928381602019, | |
| "grad_norm": 12.954302787780762, | |
| "learning_rate": 5.749232855969545e-06, | |
| "loss": 3.0192, | |
| "step": 372500 | |
| }, | |
| { | |
| "epoch": 0.8980967211644438, | |
| "grad_norm": 20.37034797668457, | |
| "learning_rate": 5.6823502029219695e-06, | |
| "loss": 3.0274, | |
| "step": 373000 | |
| }, | |
| { | |
| "epoch": 0.8993006041686856, | |
| "grad_norm": 16.947673797607422, | |
| "learning_rate": 5.61560131518049e-06, | |
| "loss": 3.0792, | |
| "step": 373500 | |
| }, | |
| { | |
| "epoch": 0.9005044871729275, | |
| "grad_norm": 14.517135620117188, | |
| "learning_rate": 5.548718662132915e-06, | |
| "loss": 2.9878, | |
| "step": 374000 | |
| }, | |
| { | |
| "epoch": 0.9017083701771694, | |
| "grad_norm": 15.187361717224121, | |
| "learning_rate": 5.48183600908534e-06, | |
| "loss": 3.0541, | |
| "step": 374500 | |
| }, | |
| { | |
| "epoch": 0.9029122531814113, | |
| "grad_norm": 15.383942604064941, | |
| "learning_rate": 5.4149533560377646e-06, | |
| "loss": 3.021, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 0.9029122531814113, | |
| "eval_runtime": 6390.605, | |
| "eval_samples_per_second": 129.979, | |
| "eval_steps_per_second": 32.495, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 0.9041161361856532, | |
| "grad_norm": 17.510334014892578, | |
| "learning_rate": 5.3480707029901895e-06, | |
| "loss": 3.0943, | |
| "step": 375500 | |
| }, | |
| { | |
| "epoch": 0.9053200191898951, | |
| "grad_norm": 16.601346969604492, | |
| "learning_rate": 5.281321815248711e-06, | |
| "loss": 3.0723, | |
| "step": 376000 | |
| }, | |
| { | |
| "epoch": 0.9065239021941369, | |
| "grad_norm": 22.802818298339844, | |
| "learning_rate": 5.2144391622011356e-06, | |
| "loss": 3.0491, | |
| "step": 376500 | |
| }, | |
| { | |
| "epoch": 0.9077277851983788, | |
| "grad_norm": 17.018939971923828, | |
| "learning_rate": 5.1475565091535605e-06, | |
| "loss": 3.0556, | |
| "step": 377000 | |
| }, | |
| { | |
| "epoch": 0.9089316682026207, | |
| "grad_norm": 19.08505630493164, | |
| "learning_rate": 5.080673856105985e-06, | |
| "loss": 3.046, | |
| "step": 377500 | |
| }, | |
| { | |
| "epoch": 0.9101355512068626, | |
| "grad_norm": 16.25370216369629, | |
| "learning_rate": 5.01379120305841e-06, | |
| "loss": 3.0191, | |
| "step": 378000 | |
| }, | |
| { | |
| "epoch": 0.9113394342111045, | |
| "grad_norm": 16.954275131225586, | |
| "learning_rate": 4.946908550010835e-06, | |
| "loss": 3.0025, | |
| "step": 378500 | |
| }, | |
| { | |
| "epoch": 0.9125433172153464, | |
| "grad_norm": 26.870176315307617, | |
| "learning_rate": 4.8801596622693555e-06, | |
| "loss": 3.0288, | |
| "step": 379000 | |
| }, | |
| { | |
| "epoch": 0.9137472002195882, | |
| "grad_norm": 14.162908554077148, | |
| "learning_rate": 4.81327700922178e-06, | |
| "loss": 3.0278, | |
| "step": 379500 | |
| }, | |
| { | |
| "epoch": 0.9149510832238301, | |
| "grad_norm": 16.129444122314453, | |
| "learning_rate": 4.746394356174205e-06, | |
| "loss": 3.0409, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 0.9149510832238301, | |
| "eval_runtime": 6329.5669, | |
| "eval_samples_per_second": 131.233, | |
| "eval_steps_per_second": 32.808, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 0.916154966228072, | |
| "grad_norm": 19.689468383789062, | |
| "learning_rate": 4.67951170312663e-06, | |
| "loss": 3.0399, | |
| "step": 380500 | |
| }, | |
| { | |
| "epoch": 0.9173588492323139, | |
| "grad_norm": 17.123493194580078, | |
| "learning_rate": 4.6127628153851506e-06, | |
| "loss": 3.0142, | |
| "step": 381000 | |
| }, | |
| { | |
| "epoch": 0.9185627322365558, | |
| "grad_norm": 15.44541072845459, | |
| "learning_rate": 4.5458801623375755e-06, | |
| "loss": 2.9937, | |
| "step": 381500 | |
| }, | |
| { | |
| "epoch": 0.9197666152407976, | |
| "grad_norm": 20.037689208984375, | |
| "learning_rate": 4.47899750929e-06, | |
| "loss": 3.0889, | |
| "step": 382000 | |
| }, | |
| { | |
| "epoch": 0.9209704982450395, | |
| "grad_norm": 17.4291934967041, | |
| "learning_rate": 4.412114856242425e-06, | |
| "loss": 2.9653, | |
| "step": 382500 | |
| }, | |
| { | |
| "epoch": 0.9221743812492814, | |
| "grad_norm": 18.911190032958984, | |
| "learning_rate": 4.345232203194851e-06, | |
| "loss": 3.0299, | |
| "step": 383000 | |
| }, | |
| { | |
| "epoch": 0.9233782642535233, | |
| "grad_norm": 18.403993606567383, | |
| "learning_rate": 4.278349550147276e-06, | |
| "loss": 3.0437, | |
| "step": 383500 | |
| }, | |
| { | |
| "epoch": 0.9245821472577652, | |
| "grad_norm": 17.68988800048828, | |
| "learning_rate": 4.211600662405795e-06, | |
| "loss": 2.9655, | |
| "step": 384000 | |
| }, | |
| { | |
| "epoch": 0.9257860302620071, | |
| "grad_norm": 15.752707481384277, | |
| "learning_rate": 4.144718009358221e-06, | |
| "loss": 3.0118, | |
| "step": 384500 | |
| }, | |
| { | |
| "epoch": 0.9269899132662489, | |
| "grad_norm": 15.633676528930664, | |
| "learning_rate": 4.077835356310646e-06, | |
| "loss": 2.98, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 0.9269899132662489, | |
| "eval_runtime": 6416.4378, | |
| "eval_samples_per_second": 129.456, | |
| "eval_steps_per_second": 32.364, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 0.9281937962704908, | |
| "grad_norm": 22.764881134033203, | |
| "learning_rate": 4.010952703263071e-06, | |
| "loss": 3.0464, | |
| "step": 385500 | |
| }, | |
| { | |
| "epoch": 0.9293976792747327, | |
| "grad_norm": 16.236614227294922, | |
| "learning_rate": 3.944070050215496e-06, | |
| "loss": 3.0362, | |
| "step": 386000 | |
| }, | |
| { | |
| "epoch": 0.9306015622789746, | |
| "grad_norm": 14.50631332397461, | |
| "learning_rate": 3.877321162474017e-06, | |
| "loss": 3.071, | |
| "step": 386500 | |
| }, | |
| { | |
| "epoch": 0.9318054452832165, | |
| "grad_norm": 13.831846237182617, | |
| "learning_rate": 3.8104385094264415e-06, | |
| "loss": 3.0001, | |
| "step": 387000 | |
| }, | |
| { | |
| "epoch": 0.9330093282874584, | |
| "grad_norm": 12.26697826385498, | |
| "learning_rate": 3.7435558563788664e-06, | |
| "loss": 3.0437, | |
| "step": 387500 | |
| }, | |
| { | |
| "epoch": 0.9342132112917002, | |
| "grad_norm": 20.174835205078125, | |
| "learning_rate": 3.6766732033312913e-06, | |
| "loss": 3.0136, | |
| "step": 388000 | |
| }, | |
| { | |
| "epoch": 0.9354170942959421, | |
| "grad_norm": 19.26807975769043, | |
| "learning_rate": 3.609790550283716e-06, | |
| "loss": 3.0054, | |
| "step": 388500 | |
| }, | |
| { | |
| "epoch": 0.936620977300184, | |
| "grad_norm": 13.987044334411621, | |
| "learning_rate": 3.542907897236141e-06, | |
| "loss": 3.004, | |
| "step": 389000 | |
| }, | |
| { | |
| "epoch": 0.9378248603044259, | |
| "grad_norm": 19.408586502075195, | |
| "learning_rate": 3.476025244188567e-06, | |
| "loss": 3.0438, | |
| "step": 389500 | |
| }, | |
| { | |
| "epoch": 0.9390287433086678, | |
| "grad_norm": 20.116239547729492, | |
| "learning_rate": 3.4092763564470868e-06, | |
| "loss": 3.0043, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 0.9390287433086678, | |
| "eval_runtime": 6418.8738, | |
| "eval_samples_per_second": 129.407, | |
| "eval_steps_per_second": 32.352, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 0.9402326263129097, | |
| "grad_norm": 15.818509101867676, | |
| "learning_rate": 3.3423937033995117e-06, | |
| "loss": 2.9467, | |
| "step": 390500 | |
| }, | |
| { | |
| "epoch": 0.9414365093171515, | |
| "grad_norm": 17.208309173583984, | |
| "learning_rate": 3.2755110503519366e-06, | |
| "loss": 3.0507, | |
| "step": 391000 | |
| }, | |
| { | |
| "epoch": 0.9426403923213934, | |
| "grad_norm": 14.738162994384766, | |
| "learning_rate": 3.208628397304362e-06, | |
| "loss": 3.0281, | |
| "step": 391500 | |
| }, | |
| { | |
| "epoch": 0.9438442753256353, | |
| "grad_norm": 15.624344825744629, | |
| "learning_rate": 3.141879509562882e-06, | |
| "loss": 3.0248, | |
| "step": 392000 | |
| }, | |
| { | |
| "epoch": 0.9450481583298772, | |
| "grad_norm": 17.159011840820312, | |
| "learning_rate": 3.074996856515307e-06, | |
| "loss": 2.9597, | |
| "step": 392500 | |
| }, | |
| { | |
| "epoch": 0.9462520413341191, | |
| "grad_norm": 13.915901184082031, | |
| "learning_rate": 3.008114203467732e-06, | |
| "loss": 2.9937, | |
| "step": 393000 | |
| }, | |
| { | |
| "epoch": 0.9474559243383609, | |
| "grad_norm": 20.13627052307129, | |
| "learning_rate": 2.941231550420157e-06, | |
| "loss": 2.9966, | |
| "step": 393500 | |
| }, | |
| { | |
| "epoch": 0.9486598073426028, | |
| "grad_norm": 26.449026107788086, | |
| "learning_rate": 2.8743488973725822e-06, | |
| "loss": 2.9904, | |
| "step": 394000 | |
| }, | |
| { | |
| "epoch": 0.9498636903468447, | |
| "grad_norm": 18.189252853393555, | |
| "learning_rate": 2.807600009631102e-06, | |
| "loss": 3.0078, | |
| "step": 394500 | |
| }, | |
| { | |
| "epoch": 0.9510675733510866, | |
| "grad_norm": 20.91954803466797, | |
| "learning_rate": 2.740717356583527e-06, | |
| "loss": 3.0439, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 0.9510675733510866, | |
| "eval_runtime": 6350.5821, | |
| "eval_samples_per_second": 130.798, | |
| "eval_steps_per_second": 32.7, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 0.9522714563553285, | |
| "grad_norm": 18.318206787109375, | |
| "learning_rate": 2.673834703535952e-06, | |
| "loss": 2.989, | |
| "step": 395500 | |
| }, | |
| { | |
| "epoch": 0.9534753393595704, | |
| "grad_norm": 16.19314193725586, | |
| "learning_rate": 2.6069520504883773e-06, | |
| "loss": 2.9842, | |
| "step": 396000 | |
| }, | |
| { | |
| "epoch": 0.9546792223638122, | |
| "grad_norm": 16.36551856994629, | |
| "learning_rate": 2.540069397440802e-06, | |
| "loss": 2.9938, | |
| "step": 396500 | |
| }, | |
| { | |
| "epoch": 0.9558831053680541, | |
| "grad_norm": 19.816038131713867, | |
| "learning_rate": 2.473186744393227e-06, | |
| "loss": 3.0204, | |
| "step": 397000 | |
| }, | |
| { | |
| "epoch": 0.957086988372296, | |
| "grad_norm": 14.318347930908203, | |
| "learning_rate": 2.4064378566517474e-06, | |
| "loss": 3.0851, | |
| "step": 397500 | |
| }, | |
| { | |
| "epoch": 0.9582908713765379, | |
| "grad_norm": 17.114421844482422, | |
| "learning_rate": 2.3395552036041728e-06, | |
| "loss": 3.0096, | |
| "step": 398000 | |
| }, | |
| { | |
| "epoch": 0.9594947543807798, | |
| "grad_norm": 18.27849578857422, | |
| "learning_rate": 2.2726725505565977e-06, | |
| "loss": 3.0374, | |
| "step": 398500 | |
| }, | |
| { | |
| "epoch": 0.9606986373850217, | |
| "grad_norm": 16.87068748474121, | |
| "learning_rate": 2.2057898975090225e-06, | |
| "loss": 3.0484, | |
| "step": 399000 | |
| }, | |
| { | |
| "epoch": 0.9619025203892635, | |
| "grad_norm": 22.162954330444336, | |
| "learning_rate": 2.138907244461448e-06, | |
| "loss": 3.04, | |
| "step": 399500 | |
| }, | |
| { | |
| "epoch": 0.9631064033935054, | |
| "grad_norm": 16.329286575317383, | |
| "learning_rate": 2.0720245914138728e-06, | |
| "loss": 2.9491, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 0.9631064033935054, | |
| "eval_runtime": 6424.6879, | |
| "eval_samples_per_second": 129.29, | |
| "eval_steps_per_second": 32.323, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 0.9643102863977473, | |
| "grad_norm": 16.189512252807617, | |
| "learning_rate": 2.0052757036723927e-06, | |
| "loss": 3.045, | |
| "step": 400500 | |
| }, | |
| { | |
| "epoch": 0.9655141694019892, | |
| "grad_norm": 18.53325080871582, | |
| "learning_rate": 1.9383930506248176e-06, | |
| "loss": 3.0405, | |
| "step": 401000 | |
| }, | |
| { | |
| "epoch": 0.9667180524062311, | |
| "grad_norm": 21.920936584472656, | |
| "learning_rate": 1.871510397577243e-06, | |
| "loss": 3.0347, | |
| "step": 401500 | |
| }, | |
| { | |
| "epoch": 0.9679219354104729, | |
| "grad_norm": 10.188512802124023, | |
| "learning_rate": 1.8046277445296678e-06, | |
| "loss": 2.9497, | |
| "step": 402000 | |
| }, | |
| { | |
| "epoch": 0.9691258184147148, | |
| "grad_norm": 23.691808700561523, | |
| "learning_rate": 1.7377450914820931e-06, | |
| "loss": 3.0046, | |
| "step": 402500 | |
| }, | |
| { | |
| "epoch": 0.9703297014189567, | |
| "grad_norm": 17.392013549804688, | |
| "learning_rate": 1.6709962037406129e-06, | |
| "loss": 2.996, | |
| "step": 403000 | |
| }, | |
| { | |
| "epoch": 0.9715335844231986, | |
| "grad_norm": 19.87090492248535, | |
| "learning_rate": 1.6041135506930382e-06, | |
| "loss": 3.042, | |
| "step": 403500 | |
| }, | |
| { | |
| "epoch": 0.9727374674274405, | |
| "grad_norm": 19.895801544189453, | |
| "learning_rate": 1.537230897645463e-06, | |
| "loss": 2.978, | |
| "step": 404000 | |
| }, | |
| { | |
| "epoch": 0.9739413504316824, | |
| "grad_norm": 16.795654296875, | |
| "learning_rate": 1.4703482445978882e-06, | |
| "loss": 3.0219, | |
| "step": 404500 | |
| }, | |
| { | |
| "epoch": 0.9751452334359242, | |
| "grad_norm": 13.37932014465332, | |
| "learning_rate": 1.4034655915503133e-06, | |
| "loss": 3.0323, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 0.9751452334359242, | |
| "eval_runtime": 6375.6502, | |
| "eval_samples_per_second": 130.284, | |
| "eval_steps_per_second": 32.571, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 0.9763491164401661, | |
| "grad_norm": 14.84689712524414, | |
| "learning_rate": 1.3367167038088334e-06, | |
| "loss": 2.9709, | |
| "step": 405500 | |
| }, | |
| { | |
| "epoch": 0.977552999444408, | |
| "grad_norm": 14.532979011535645, | |
| "learning_rate": 1.2698340507612583e-06, | |
| "loss": 3.0614, | |
| "step": 406000 | |
| }, | |
| { | |
| "epoch": 0.9787568824486499, | |
| "grad_norm": 15.914132118225098, | |
| "learning_rate": 1.2029513977136834e-06, | |
| "loss": 3.0498, | |
| "step": 406500 | |
| }, | |
| { | |
| "epoch": 0.9799607654528918, | |
| "grad_norm": 14.478850364685059, | |
| "learning_rate": 1.1360687446661085e-06, | |
| "loss": 2.9675, | |
| "step": 407000 | |
| }, | |
| { | |
| "epoch": 0.9811646484571337, | |
| "grad_norm": 22.8538818359375, | |
| "learning_rate": 1.0691860916185336e-06, | |
| "loss": 3.0232, | |
| "step": 407500 | |
| }, | |
| { | |
| "epoch": 0.9823685314613755, | |
| "grad_norm": 15.004932403564453, | |
| "learning_rate": 1.0023034385709585e-06, | |
| "loss": 2.9698, | |
| "step": 408000 | |
| }, | |
| { | |
| "epoch": 0.9835724144656174, | |
| "grad_norm": 15.036443710327148, | |
| "learning_rate": 9.354207855233835e-07, | |
| "loss": 2.9961, | |
| "step": 408500 | |
| }, | |
| { | |
| "epoch": 0.9847762974698593, | |
| "grad_norm": 19.975051879882812, | |
| "learning_rate": 8.685381324758087e-07, | |
| "loss": 3.0067, | |
| "step": 409000 | |
| }, | |
| { | |
| "epoch": 0.9859801804741012, | |
| "grad_norm": 17.99605369567871, | |
| "learning_rate": 8.017892447343288e-07, | |
| "loss": 3.0702, | |
| "step": 409500 | |
| }, | |
| { | |
| "epoch": 0.9871840634783431, | |
| "grad_norm": 15.935543060302734, | |
| "learning_rate": 7.349065916867538e-07, | |
| "loss": 3.0132, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 0.9871840634783431, | |
| "eval_runtime": 6347.5902, | |
| "eval_samples_per_second": 130.86, | |
| "eval_steps_per_second": 32.715, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 0.9883879464825849, | |
| "grad_norm": 12.5308256149292, | |
| "learning_rate": 6.680239386391788e-07, | |
| "loss": 2.977, | |
| "step": 410500 | |
| }, | |
| { | |
| "epoch": 0.9895918294868268, | |
| "grad_norm": 15.325048446655273, | |
| "learning_rate": 6.011412855916039e-07, | |
| "loss": 3.0383, | |
| "step": 411000 | |
| }, | |
| { | |
| "epoch": 0.9907957124910687, | |
| "grad_norm": 16.378740310668945, | |
| "learning_rate": 5.342586325440288e-07, | |
| "loss": 3.0278, | |
| "step": 411500 | |
| }, | |
| { | |
| "epoch": 0.9919995954953106, | |
| "grad_norm": 17.669631958007812, | |
| "learning_rate": 4.673759794964539e-07, | |
| "loss": 2.9929, | |
| "step": 412000 | |
| }, | |
| { | |
| "epoch": 0.9932034784995525, | |
| "grad_norm": 16.54693603515625, | |
| "learning_rate": 4.004933264488789e-07, | |
| "loss": 2.9768, | |
| "step": 412500 | |
| }, | |
| { | |
| "epoch": 0.9944073615037944, | |
| "grad_norm": 16.434072494506836, | |
| "learning_rate": 3.337444387073991e-07, | |
| "loss": 3.0664, | |
| "step": 413000 | |
| }, | |
| { | |
| "epoch": 0.9956112445080362, | |
| "grad_norm": 28.83799171447754, | |
| "learning_rate": 2.6686178565982417e-07, | |
| "loss": 3.0716, | |
| "step": 413500 | |
| }, | |
| { | |
| "epoch": 0.9968151275122781, | |
| "grad_norm": 15.776455879211426, | |
| "learning_rate": 1.9997913261224917e-07, | |
| "loss": 3.0396, | |
| "step": 414000 | |
| }, | |
| { | |
| "epoch": 0.99801901051652, | |
| "grad_norm": 18.937358856201172, | |
| "learning_rate": 1.330964795646742e-07, | |
| "loss": 3.0165, | |
| "step": 414500 | |
| }, | |
| { | |
| "epoch": 0.9992228935207619, | |
| "grad_norm": 20.05877685546875, | |
| "learning_rate": 6.621382651709922e-08, | |
| "loss": 3.0059, | |
| "step": 415000 | |
| }, | |
| { | |
| "epoch": 0.9992228935207619, | |
| "eval_runtime": 6436.495, | |
| "eval_samples_per_second": 129.052, | |
| "eval_steps_per_second": 32.263, | |
| "step": 415000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 415322, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1124779666016266e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
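
The file above is the state the Hugging Face `Trainer` checkpoints alongside model weights: `log_history` holds one entry per `logging_steps` (500) with `loss`, `learning_rate`, and `grad_norm`, plus one throughput-only entry per `eval_steps` (5000). A minimal sketch of how to consume it follows; the filename `trainer_state.json` is illustrative (adjust to your checkpoint directory), and the split between train and eval entries keys off the fields actually present in this file, since these eval entries carry runtime/throughput metrics but no `eval_loss`.

```python
import json

# Minimal sketch, assuming a standard Hugging Face Trainer
# "trainer_state.json" with the structure shown above.
# The path is illustrative; point it at your checkpoint directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; the evaluation entries in this file
# carry only runtime/throughput fields, so split on the keys present.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_runtime" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

print(f"{len(train_logs)} train points (every {state['logging_steps']} steps), "
      f"{len(eval_logs)} eval points (every {state['eval_steps']} steps)")
print(f"last logged step {steps[-1]} of {state['max_steps']}, "
      f"final loss {losses[-1]}, epoch {state['epoch']:.4f}")
```

The same `steps`/`losses` pair feeds directly into any plotting library to visualize the loss curve, and filtering `train_logs` on `grad_norm` is a quick way to locate spikes such as the one logged at step 295500.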