| { |
| "best_global_step": 79000, |
| "best_metric": 3.5263609886169434, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_1032/checkpoint-40000", |
| "epoch": 28.821151942228177, |
| "eval_steps": 1000, |
| "global_step": 99000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014559431599790344, |
| "grad_norm": 0.9666687846183777, |
| "learning_rate": 0.000294, |
| "loss": 8.4834, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029118863199580687, |
| "grad_norm": 0.6833519339561462, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7077, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.043678294799371034, |
| "grad_norm": 0.4742502272129059, |
| "learning_rate": 0.0005998287212350713, |
| "loss": 6.3505, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058237726399161374, |
| "grad_norm": 0.44300776720046997, |
| "learning_rate": 0.0005996539469851441, |
| "loss": 6.126, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07279715799895171, |
| "grad_norm": 0.6068394780158997, |
| "learning_rate": 0.000599479172735217, |
| "loss": 6.0028, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08735658959874207, |
| "grad_norm": 0.3999798893928528, |
| "learning_rate": 0.0005993043984852897, |
| "loss": 5.8771, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10191602119853241, |
| "grad_norm": 0.47448229789733887, |
| "learning_rate": 0.0005991296242353626, |
| "loss": 5.7297, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11647545279832275, |
| "grad_norm": 0.4407774806022644, |
| "learning_rate": 0.0005989548499854355, |
| "loss": 5.6118, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1310348843981131, |
| "grad_norm": 0.4476652145385742, |
| "learning_rate": 0.0005987800757355083, |
| "loss": 5.5129, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14559431599790343, |
| "grad_norm": 0.44474974274635315, |
| "learning_rate": 0.0005986053014855811, |
| "loss": 5.4224, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1601537475976938, |
| "grad_norm": 0.41966310143470764, |
| "learning_rate": 0.000598430527235654, |
| "loss": 5.3379, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17471317919748414, |
| "grad_norm": 0.4177263081073761, |
| "learning_rate": 0.0005982557529857267, |
| "loss": 5.2512, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.18927261079727448, |
| "grad_norm": 0.44888487458229065, |
| "learning_rate": 0.0005980809787357995, |
| "loss": 5.1803, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20383204239706482, |
| "grad_norm": 0.43577608466148376, |
| "learning_rate": 0.0005979062044858724, |
| "loss": 5.1268, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21839147399685516, |
| "grad_norm": 0.43620362877845764, |
| "learning_rate": 0.0005977314302359452, |
| "loss": 5.0768, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2329509055966455, |
| "grad_norm": 0.44033217430114746, |
| "learning_rate": 0.0005975566559860181, |
| "loss": 5.0148, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24751033719643586, |
| "grad_norm": 0.4907797574996948, |
| "learning_rate": 0.0005973818817360908, |
| "loss": 4.9808, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2620697687962262, |
| "grad_norm": 0.45115190744400024, |
| "learning_rate": 0.0005972071074861636, |
| "loss": 4.9143, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2766292003960165, |
| "grad_norm": 0.42089030146598816, |
| "learning_rate": 0.0005970323332362365, |
| "loss": 4.8585, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29118863199580686, |
| "grad_norm": 0.441190630197525, |
| "learning_rate": 0.0005968575589863093, |
| "loss": 4.8164, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.29118863199580686, |
| "eval_accuracy": 0.2562200359735874, |
| "eval_loss": 4.745169639587402, |
| "eval_runtime": 55.0786, |
| "eval_samples_per_second": 302.168, |
| "eval_steps_per_second": 18.9, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30574806359559725, |
| "grad_norm": 0.4603753387928009, |
| "learning_rate": 0.0005966827847363822, |
| "loss": 4.7669, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3203074951953876, |
| "grad_norm": 0.46023908257484436, |
| "learning_rate": 0.000596508010486455, |
| "loss": 4.7283, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.33486692679517793, |
| "grad_norm": 0.47654032707214355, |
| "learning_rate": 0.0005963332362365277, |
| "loss": 4.688, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3494263583949683, |
| "grad_norm": 0.5129820108413696, |
| "learning_rate": 0.0005961584619866006, |
| "loss": 4.6455, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3639857899947586, |
| "grad_norm": 0.4901754856109619, |
| "learning_rate": 0.0005959836877366734, |
| "loss": 4.6235, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.37854522159454895, |
| "grad_norm": 0.4569827914237976, |
| "learning_rate": 0.0005958089134867463, |
| "loss": 4.5877, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3931046531943393, |
| "grad_norm": 0.44484788179397583, |
| "learning_rate": 0.0005956341392368191, |
| "loss": 4.5622, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40766408479412963, |
| "grad_norm": 0.45020967721939087, |
| "learning_rate": 0.0005954593649868918, |
| "loss": 4.5348, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.42222351639392, |
| "grad_norm": 0.44121989607810974, |
| "learning_rate": 0.0005952845907369647, |
| "loss": 4.5162, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4367829479937103, |
| "grad_norm": 0.45869946479797363, |
| "learning_rate": 0.0005951098164870375, |
| "loss": 4.4997, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45134237959350065, |
| "grad_norm": 0.4464726150035858, |
| "learning_rate": 0.0005949350422371104, |
| "loss": 4.4767, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.465901811193291, |
| "grad_norm": 0.40169212222099304, |
| "learning_rate": 0.0005947602679871832, |
| "loss": 4.4618, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.48046124279308133, |
| "grad_norm": 0.40386444330215454, |
| "learning_rate": 0.000594585493737256, |
| "loss": 4.4517, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49502067439287173, |
| "grad_norm": 0.40597283840179443, |
| "learning_rate": 0.0005944107194873288, |
| "loss": 4.4269, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.509580105992662, |
| "grad_norm": 0.41445210576057434, |
| "learning_rate": 0.0005942359452374016, |
| "loss": 4.4003, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5241395375924524, |
| "grad_norm": 0.4420674741268158, |
| "learning_rate": 0.0005940611709874745, |
| "loss": 4.393, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5386989691922427, |
| "grad_norm": 0.4193710386753082, |
| "learning_rate": 0.0005938863967375473, |
| "loss": 4.3779, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.553258400792033, |
| "grad_norm": 0.4820527732372284, |
| "learning_rate": 0.0005937116224876201, |
| "loss": 4.3614, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5678178323918234, |
| "grad_norm": 0.4935331642627716, |
| "learning_rate": 0.000593536848237693, |
| "loss": 4.3505, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5823772639916137, |
| "grad_norm": 0.3989650011062622, |
| "learning_rate": 0.0005933620739877657, |
| "loss": 4.339, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5823772639916137, |
| "eval_accuracy": 0.3002242440625767, |
| "eval_loss": 4.28311824798584, |
| "eval_runtime": 54.2851, |
| "eval_samples_per_second": 306.585, |
| "eval_steps_per_second": 19.177, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5969366955914042, |
| "grad_norm": 0.38092532753944397, |
| "learning_rate": 0.0005931872997378385, |
| "loss": 4.3209, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6114961271911945, |
| "grad_norm": 0.3823917508125305, |
| "learning_rate": 0.0005930125254879114, |
| "loss": 4.3103, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6260555587909848, |
| "grad_norm": 0.4137718677520752, |
| "learning_rate": 0.0005928377512379842, |
| "loss": 4.301, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6406149903907752, |
| "grad_norm": 0.38565006852149963, |
| "learning_rate": 0.0005926629769880571, |
| "loss": 4.2898, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6551744219905655, |
| "grad_norm": 0.41142529249191284, |
| "learning_rate": 0.0005924882027381298, |
| "loss": 4.2786, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6697338535903559, |
| "grad_norm": 0.3580577075481415, |
| "learning_rate": 0.0005923134284882026, |
| "loss": 4.2655, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6842932851901462, |
| "grad_norm": 0.3694271147251129, |
| "learning_rate": 0.0005921386542382755, |
| "loss": 4.2603, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6988527167899365, |
| "grad_norm": 0.3978845477104187, |
| "learning_rate": 0.0005919638799883483, |
| "loss": 4.2569, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7134121483897269, |
| "grad_norm": 0.40943846106529236, |
| "learning_rate": 0.0005917891057384212, |
| "loss": 4.2379, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7279715799895172, |
| "grad_norm": 0.3747680187225342, |
| "learning_rate": 0.000591614331488494, |
| "loss": 4.2269, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7425310115893076, |
| "grad_norm": 0.36069026589393616, |
| "learning_rate": 0.0005914395572385667, |
| "loss": 4.2199, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7570904431890979, |
| "grad_norm": 0.3404024541378021, |
| "learning_rate": 0.0005912647829886396, |
| "loss": 4.208, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7716498747888882, |
| "grad_norm": 0.3656750023365021, |
| "learning_rate": 0.0005910900087387124, |
| "loss": 4.1972, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7862093063886786, |
| "grad_norm": 0.3920387327671051, |
| "learning_rate": 0.0005909152344887853, |
| "loss": 4.1947, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8007687379884689, |
| "grad_norm": 0.3375900089740753, |
| "learning_rate": 0.0005907404602388581, |
| "loss": 4.1886, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8153281695882593, |
| "grad_norm": 0.3581906855106354, |
| "learning_rate": 0.0005905656859889308, |
| "loss": 4.1761, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8298876011880496, |
| "grad_norm": 0.37578803300857544, |
| "learning_rate": 0.0005903909117390037, |
| "loss": 4.1768, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.84444703278784, |
| "grad_norm": 0.34982478618621826, |
| "learning_rate": 0.0005902161374890766, |
| "loss": 4.1671, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8590064643876303, |
| "grad_norm": 0.36295685172080994, |
| "learning_rate": 0.0005900413632391494, |
| "loss": 4.1643, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8735658959874206, |
| "grad_norm": 0.35229530930519104, |
| "learning_rate": 0.0005898665889892223, |
| "loss": 4.1293, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8735658959874206, |
| "eval_accuracy": 0.3151443347008721, |
| "eval_loss": 4.097636699676514, |
| "eval_runtime": 54.4875, |
| "eval_samples_per_second": 305.446, |
| "eval_steps_per_second": 19.105, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.888125327587211, |
| "grad_norm": 0.3671216368675232, |
| "learning_rate": 0.0005896918147392951, |
| "loss": 4.1345, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9026847591870013, |
| "grad_norm": 0.3501938581466675, |
| "learning_rate": 0.0005895170404893678, |
| "loss": 4.1387, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9172441907867916, |
| "grad_norm": 0.3753458261489868, |
| "learning_rate": 0.0005893422662394407, |
| "loss": 4.1391, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.931803622386582, |
| "grad_norm": 0.32294249534606934, |
| "learning_rate": 0.0005891674919895135, |
| "loss": 4.111, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9463630539863723, |
| "grad_norm": 0.3349330425262451, |
| "learning_rate": 0.0005889927177395864, |
| "loss": 4.1282, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9609224855861627, |
| "grad_norm": 0.3496231138706207, |
| "learning_rate": 0.0005888179434896592, |
| "loss": 4.1165, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.975481917185953, |
| "grad_norm": 0.3345687985420227, |
| "learning_rate": 0.000588643169239732, |
| "loss": 4.1009, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9900413487857435, |
| "grad_norm": 0.3242470920085907, |
| "learning_rate": 0.0005884683949898048, |
| "loss": 4.0959, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0043678294799372, |
| "grad_norm": 0.34464624524116516, |
| "learning_rate": 0.0005882936207398776, |
| "loss": 4.0755, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0189272610797275, |
| "grad_norm": 0.3738979399204254, |
| "learning_rate": 0.0005881188464899504, |
| "loss": 4.0271, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0334866926795179, |
| "grad_norm": 0.32653385400772095, |
| "learning_rate": 0.0005879440722400233, |
| "loss": 4.0212, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0480461242793082, |
| "grad_norm": 0.3483313024044037, |
| "learning_rate": 0.0005877692979900961, |
| "loss": 4.0193, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0626055558790986, |
| "grad_norm": 0.3408651351928711, |
| "learning_rate": 0.000587594523740169, |
| "loss": 4.0118, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.077164987478889, |
| "grad_norm": 0.33649924397468567, |
| "learning_rate": 0.0005874197494902417, |
| "loss": 4.0068, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0917244190786792, |
| "grad_norm": 0.3527706563472748, |
| "learning_rate": 0.0005872449752403145, |
| "loss": 4.0024, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1062838506784696, |
| "grad_norm": 0.35581859946250916, |
| "learning_rate": 0.0005870702009903874, |
| "loss": 4.0074, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.12084328227826, |
| "grad_norm": 0.3438279628753662, |
| "learning_rate": 0.0005868954267404602, |
| "loss": 4.0077, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1354027138780503, |
| "grad_norm": 0.3411901593208313, |
| "learning_rate": 0.0005867206524905331, |
| "loss": 3.995, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1499621454778406, |
| "grad_norm": 0.3471081852912903, |
| "learning_rate": 0.0005865458782406058, |
| "loss": 4.0004, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.164521577077631, |
| "grad_norm": 0.3431474566459656, |
| "learning_rate": 0.0005863711039906786, |
| "loss": 3.9912, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.164521577077631, |
| "eval_accuracy": 0.3247543409880778, |
| "eval_loss": 3.995392322540283, |
| "eval_runtime": 53.9545, |
| "eval_samples_per_second": 308.464, |
| "eval_steps_per_second": 19.294, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1790810086774213, |
| "grad_norm": 0.33514344692230225, |
| "learning_rate": 0.0005861963297407515, |
| "loss": 4.0057, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1936404402772116, |
| "grad_norm": 0.33262795209884644, |
| "learning_rate": 0.0005860215554908243, |
| "loss": 3.9851, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.208199871877002, |
| "grad_norm": 0.3248565196990967, |
| "learning_rate": 0.0005858467812408972, |
| "loss": 3.9799, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2227593034767923, |
| "grad_norm": 0.33513668179512024, |
| "learning_rate": 0.00058567200699097, |
| "loss": 3.9747, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2373187350765826, |
| "grad_norm": 0.3263963460922241, |
| "learning_rate": 0.0005854972327410427, |
| "loss": 3.9791, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.251878166676373, |
| "grad_norm": 0.34506794810295105, |
| "learning_rate": 0.0005853224584911156, |
| "loss": 3.9817, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2664375982761633, |
| "grad_norm": 0.319431871175766, |
| "learning_rate": 0.0005851476842411884, |
| "loss": 3.9671, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2809970298759537, |
| "grad_norm": 0.33263441920280457, |
| "learning_rate": 0.0005849729099912613, |
| "loss": 3.9745, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.295556461475744, |
| "grad_norm": 0.33536776900291443, |
| "learning_rate": 0.0005847981357413341, |
| "loss": 3.9744, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3101158930755343, |
| "grad_norm": 0.34813007712364197, |
| "learning_rate": 0.0005846233614914068, |
| "loss": 3.9551, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3246753246753247, |
| "grad_norm": 0.34729769825935364, |
| "learning_rate": 0.0005844485872414797, |
| "loss": 3.955, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.339234756275115, |
| "grad_norm": 0.34845030307769775, |
| "learning_rate": 0.0005842738129915525, |
| "loss": 3.9533, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.3537941878749054, |
| "grad_norm": 0.32806530594825745, |
| "learning_rate": 0.0005840990387416253, |
| "loss": 3.9656, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.3683536194746957, |
| "grad_norm": 0.3774058222770691, |
| "learning_rate": 0.0005839242644916982, |
| "loss": 3.9569, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.382913051074486, |
| "grad_norm": 0.3156019449234009, |
| "learning_rate": 0.000583749490241771, |
| "loss": 3.9416, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3974724826742764, |
| "grad_norm": 0.3090326488018036, |
| "learning_rate": 0.0005835747159918438, |
| "loss": 3.9483, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4120319142740667, |
| "grad_norm": 0.33362695574760437, |
| "learning_rate": 0.0005833999417419166, |
| "loss": 3.9398, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.426591345873857, |
| "grad_norm": 0.32493874430656433, |
| "learning_rate": 0.0005832251674919894, |
| "loss": 3.9381, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4411507774736474, |
| "grad_norm": 0.32908692955970764, |
| "learning_rate": 0.0005830503932420623, |
| "loss": 3.9488, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4557102090734377, |
| "grad_norm": 0.31451448798179626, |
| "learning_rate": 0.0005828756189921351, |
| "loss": 3.9401, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4557102090734377, |
| "eval_accuracy": 0.33137442644092774, |
| "eval_loss": 3.9215424060821533, |
| "eval_runtime": 54.124, |
| "eval_samples_per_second": 307.497, |
| "eval_steps_per_second": 19.234, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.470269640673228, |
| "grad_norm": 0.3171522319316864, |
| "learning_rate": 0.000582700844742208, |
| "loss": 3.9336, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4848290722730184, |
| "grad_norm": 0.3146921694278717, |
| "learning_rate": 0.0005825260704922807, |
| "loss": 3.9335, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4993885038728088, |
| "grad_norm": 0.354244589805603, |
| "learning_rate": 0.0005823512962423535, |
| "loss": 3.9259, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5139479354725993, |
| "grad_norm": 0.3321908116340637, |
| "learning_rate": 0.0005821765219924264, |
| "loss": 3.9218, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5285073670723897, |
| "grad_norm": 0.3170032203197479, |
| "learning_rate": 0.0005820017477424992, |
| "loss": 3.9297, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.54306679867218, |
| "grad_norm": 0.3232501149177551, |
| "learning_rate": 0.0005818269734925721, |
| "loss": 3.9106, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5576262302719703, |
| "grad_norm": 0.321432888507843, |
| "learning_rate": 0.0005816521992426448, |
| "loss": 3.924, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.5721856618717607, |
| "grad_norm": 0.3379034399986267, |
| "learning_rate": 0.0005814774249927176, |
| "loss": 3.9177, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.586745093471551, |
| "grad_norm": 0.31285661458969116, |
| "learning_rate": 0.0005813026507427905, |
| "loss": 3.9152, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6013045250713414, |
| "grad_norm": 0.3218041658401489, |
| "learning_rate": 0.0005811278764928634, |
| "loss": 3.9095, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6158639566711317, |
| "grad_norm": 0.29677674174308777, |
| "learning_rate": 0.0005809531022429362, |
| "loss": 3.907, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.630423388270922, |
| "grad_norm": 0.33540773391723633, |
| "learning_rate": 0.0005807783279930091, |
| "loss": 3.9074, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6449828198707124, |
| "grad_norm": 0.3314766585826874, |
| "learning_rate": 0.0005806035537430818, |
| "loss": 3.9054, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6595422514705027, |
| "grad_norm": 0.3158515989780426, |
| "learning_rate": 0.0005804287794931546, |
| "loss": 3.8869, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.674101683070293, |
| "grad_norm": 0.30713674426078796, |
| "learning_rate": 0.0005802540052432275, |
| "loss": 3.8962, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6886611146700834, |
| "grad_norm": 0.334526389837265, |
| "learning_rate": 0.0005800792309933003, |
| "loss": 3.8883, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7032205462698737, |
| "grad_norm": 0.31845974922180176, |
| "learning_rate": 0.0005799044567433732, |
| "loss": 3.9004, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.717779977869664, |
| "grad_norm": 0.31406348943710327, |
| "learning_rate": 0.000579729682493446, |
| "loss": 3.8783, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7323394094694544, |
| "grad_norm": 0.30887269973754883, |
| "learning_rate": 0.0005795549082435187, |
| "loss": 3.8818, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7468988410692448, |
| "grad_norm": 0.3162541687488556, |
| "learning_rate": 0.0005793801339935916, |
| "loss": 3.8958, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7468988410692448, |
| "eval_accuracy": 0.33682020249576317, |
| "eval_loss": 3.8626632690429688, |
| "eval_runtime": 54.4766, |
| "eval_samples_per_second": 305.507, |
| "eval_steps_per_second": 19.109, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.761458272669035, |
| "grad_norm": 0.32636117935180664, |
| "learning_rate": 0.0005792053597436644, |
| "loss": 3.8833, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7760177042688254, |
| "grad_norm": 0.3097558319568634, |
| "learning_rate": 0.0005790305854937372, |
| "loss": 3.8691, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7905771358686158, |
| "grad_norm": 0.34034818410873413, |
| "learning_rate": 0.0005788558112438101, |
| "loss": 3.8802, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8051365674684061, |
| "grad_norm": 0.32233792543411255, |
| "learning_rate": 0.0005786810369938828, |
| "loss": 3.8883, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.8196959990681965, |
| "grad_norm": 0.3292987048625946, |
| "learning_rate": 0.0005785062627439557, |
| "loss": 3.8728, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8342554306679868, |
| "grad_norm": 0.31761112809181213, |
| "learning_rate": 0.0005783314884940285, |
| "loss": 3.8788, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8488148622677771, |
| "grad_norm": 0.3174941837787628, |
| "learning_rate": 0.0005781567142441013, |
| "loss": 3.8706, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8633742938675675, |
| "grad_norm": 0.2970981299877167, |
| "learning_rate": 0.0005779819399941742, |
| "loss": 3.8679, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8779337254673578, |
| "grad_norm": 0.32310914993286133, |
| "learning_rate": 0.000577807165744247, |
| "loss": 3.8601, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8924931570671482, |
| "grad_norm": 0.3130098879337311, |
| "learning_rate": 0.0005776323914943198, |
| "loss": 3.864, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9070525886669385, |
| "grad_norm": 0.2963304817676544, |
| "learning_rate": 0.0005774576172443926, |
| "loss": 3.8588, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9216120202667288, |
| "grad_norm": 0.32023951411247253, |
| "learning_rate": 0.0005772828429944654, |
| "loss": 3.8634, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9361714518665192, |
| "grad_norm": 0.306027889251709, |
| "learning_rate": 0.0005771080687445383, |
| "loss": 3.8449, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9507308834663095, |
| "grad_norm": 0.3124430775642395, |
| "learning_rate": 0.0005769332944946111, |
| "loss": 3.8598, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.9652903150660999, |
| "grad_norm": 0.3299570083618164, |
| "learning_rate": 0.0005767585202446839, |
| "loss": 3.848, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9798497466658902, |
| "grad_norm": 0.31828752160072327, |
| "learning_rate": 0.0005765837459947567, |
| "loss": 3.8571, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.9944091782656805, |
| "grad_norm": 0.3087295591831207, |
| "learning_rate": 0.0005764089717448295, |
| "loss": 3.8415, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.0087356589598744, |
| "grad_norm": 0.3276985287666321, |
| "learning_rate": 0.0005762341974949024, |
| "loss": 3.7926, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0232950905596647, |
| "grad_norm": 0.3149563670158386, |
| "learning_rate": 0.0005760594232449752, |
| "loss": 3.7356, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.037854522159455, |
| "grad_norm": 0.2993158996105194, |
| "learning_rate": 0.0005758846489950481, |
| "loss": 3.7519, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.037854522159455, |
| "eval_accuracy": 0.340481644404722, |
| "eval_loss": 3.821302890777588, |
| "eval_runtime": 54.3871, |
| "eval_samples_per_second": 306.01, |
| "eval_steps_per_second": 19.141, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0524139537592454, |
| "grad_norm": 0.31537163257598877, |
| "learning_rate": 0.0005757098747451208, |
| "loss": 3.7487, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.0669733853590357, |
| "grad_norm": 0.30981120467185974, |
| "learning_rate": 0.0005755351004951936, |
| "loss": 3.7554, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.081532816958826, |
| "grad_norm": 0.3070833086967468, |
| "learning_rate": 0.0005753603262452665, |
| "loss": 3.7464, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.0960922485586164, |
| "grad_norm": 0.3225395977497101, |
| "learning_rate": 0.0005751855519953393, |
| "loss": 3.7526, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.1106516801584068, |
| "grad_norm": 0.3187791407108307, |
| "learning_rate": 0.0005750107777454121, |
| "loss": 3.7594, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.125211111758197, |
| "grad_norm": 0.32253599166870117, |
| "learning_rate": 0.0005748360034954849, |
| "loss": 3.759, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.1397705433579874, |
| "grad_norm": 0.3304608464241028, |
| "learning_rate": 0.0005746612292455577, |
| "loss": 3.755, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.154329974957778, |
| "grad_norm": 0.31175366044044495, |
| "learning_rate": 0.0005744864549956306, |
| "loss": 3.7636, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.168889406557568, |
| "grad_norm": 0.3209310472011566, |
| "learning_rate": 0.0005743116807457034, |
| "loss": 3.7474, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.1834488381573585, |
| "grad_norm": 0.32411980628967285, |
| "learning_rate": 0.0005741369064957762, |
| "loss": 3.7611, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.198008269757149, |
| "grad_norm": 0.31499746441841125, |
| "learning_rate": 0.0005739621322458491, |
| "loss": 3.7567, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.212567701356939, |
| "grad_norm": 0.3134678602218628, |
| "learning_rate": 0.0005737873579959218, |
| "loss": 3.7556, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2271271329567295, |
| "grad_norm": 0.31595176458358765, |
| "learning_rate": 0.0005736125837459947, |
| "loss": 3.7522, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.24168656455652, |
| "grad_norm": 0.33326753973960876, |
| "learning_rate": 0.0005734378094960675, |
| "loss": 3.7659, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.25624599615631, |
| "grad_norm": 0.3168000876903534, |
| "learning_rate": 0.0005732630352461403, |
| "loss": 3.7552, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2708054277561005, |
| "grad_norm": 0.3116115629673004, |
| "learning_rate": 0.0005730882609962132, |
| "loss": 3.7474, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.285364859355891, |
| "grad_norm": 0.30287280678749084, |
| "learning_rate": 0.0005729134867462859, |
| "loss": 3.7573, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.299924290955681, |
| "grad_norm": 0.3045010566711426, |
| "learning_rate": 0.0005727387124963588, |
| "loss": 3.7762, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.3144837225554715, |
| "grad_norm": 0.3148520290851593, |
| "learning_rate": 0.0005725639382464317, |
| "loss": 3.748, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.329043154155262, |
| "grad_norm": 0.31695887446403503, |
| "learning_rate": 0.0005723891639965045, |
| "loss": 3.756, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.329043154155262, |
| "eval_accuracy": 0.3437476519985189, |
| "eval_loss": 3.7891645431518555, |
| "eval_runtime": 53.9955, |
| "eval_samples_per_second": 308.229, |
| "eval_steps_per_second": 19.279, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.343602585755052, |
| "grad_norm": 0.3124977648258209, |
| "learning_rate": 0.0005722143897465773, |
| "loss": 3.7641, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.3581620173548425, |
| "grad_norm": 0.3077414631843567, |
| "learning_rate": 0.0005720396154966502, |
| "loss": 3.7611, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.372721448954633, |
| "grad_norm": 0.31669649481773376, |
| "learning_rate": 0.0005718648412467229, |
| "loss": 3.7623, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.3872808805544232, |
| "grad_norm": 0.3225698173046112, |
| "learning_rate": 0.0005716900669967958, |
| "loss": 3.7578, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4018403121542136, |
| "grad_norm": 0.31290560960769653, |
| "learning_rate": 0.0005715152927468686, |
| "loss": 3.7603, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.416399743754004, |
| "grad_norm": 0.3104917109012604, |
| "learning_rate": 0.0005713405184969414, |
| "loss": 3.7553, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.4309591753537942, |
| "grad_norm": 0.3150444030761719, |
| "learning_rate": 0.0005711657442470143, |
| "loss": 3.7486, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4455186069535846, |
| "grad_norm": 0.31220582127571106, |
| "learning_rate": 0.000570990969997087, |
| "loss": 3.7551, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.460078038553375, |
| "grad_norm": 0.29195117950439453, |
| "learning_rate": 0.0005708161957471599, |
| "loss": 3.7531, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4746374701531653, |
| "grad_norm": 0.31497254967689514, |
| "learning_rate": 0.0005706414214972327, |
| "loss": 3.7579, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4891969017529556, |
| "grad_norm": 0.31675657629966736, |
| "learning_rate": 0.0005704666472473055, |
| "loss": 3.7515, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.503756333352746, |
| "grad_norm": 0.30652034282684326, |
| "learning_rate": 0.0005702918729973784, |
| "loss": 3.7504, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5183157649525363, |
| "grad_norm": 0.30155831575393677, |
| "learning_rate": 0.0005701170987474512, |
| "loss": 3.755, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.5328751965523266, |
| "grad_norm": 0.2939681112766266, |
| "learning_rate": 0.0005699423244975239, |
| "loss": 3.7377, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.547434628152117, |
| "grad_norm": 0.31261202692985535, |
| "learning_rate": 0.0005697675502475968, |
| "loss": 3.7476, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.5619940597519073, |
| "grad_norm": 0.31194260716438293, |
| "learning_rate": 0.0005695927759976696, |
| "loss": 3.7342, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5765534913516976, |
| "grad_norm": 0.3196016550064087, |
| "learning_rate": 0.0005694180017477425, |
| "loss": 3.7295, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.591112922951488, |
| "grad_norm": 0.30109933018684387, |
| "learning_rate": 0.0005692432274978153, |
| "loss": 3.7498, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.6056723545512783, |
| "grad_norm": 0.33264702558517456, |
| "learning_rate": 0.000569068453247888, |
| "loss": 3.7422, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6202317861510687, |
| "grad_norm": 0.30177509784698486, |
| "learning_rate": 0.0005688936789979609, |
| "loss": 3.7507, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6202317861510687, |
| "eval_accuracy": 0.34679883399201816, |
| "eval_loss": 3.7572646141052246, |
| "eval_runtime": 54.2719, |
| "eval_samples_per_second": 306.66, |
| "eval_steps_per_second": 19.181, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.634791217750859, |
| "grad_norm": 0.31919026374816895, |
| "learning_rate": 0.0005687189047480337, |
| "loss": 3.7458, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6493506493506493, |
| "grad_norm": 0.3107492923736572, |
| "learning_rate": 0.0005685441304981066, |
| "loss": 3.742, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6639100809504397, |
| "grad_norm": 0.3181777596473694, |
| "learning_rate": 0.0005683693562481794, |
| "loss": 3.7316, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.67846951255023, |
| "grad_norm": 0.30232176184654236, |
| "learning_rate": 0.0005681945819982522, |
| "loss": 3.7382, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.6930289441500204, |
| "grad_norm": 0.3122684359550476, |
| "learning_rate": 0.000568019807748325, |
| "loss": 3.7305, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.7075883757498107, |
| "grad_norm": 0.31523433327674866, |
| "learning_rate": 0.0005678450334983978, |
| "loss": 3.7453, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.722147807349601, |
| "grad_norm": 0.2995758652687073, |
| "learning_rate": 0.0005676702592484707, |
| "loss": 3.7388, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.7367072389493914, |
| "grad_norm": 0.29348045587539673, |
| "learning_rate": 0.0005674954849985435, |
| "loss": 3.7336, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.7512666705491817, |
| "grad_norm": 0.3128356337547302, |
| "learning_rate": 0.0005673207107486163, |
| "loss": 3.7327, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.765826102148972, |
| "grad_norm": 0.3079585134983063, |
| "learning_rate": 0.0005671459364986892, |
| "loss": 3.7333, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.7803855337487624, |
| "grad_norm": 0.3393835127353668, |
| "learning_rate": 0.0005669711622487619, |
| "loss": 3.7363, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7949449653485527, |
| "grad_norm": 0.31067395210266113, |
| "learning_rate": 0.0005667963879988348, |
| "loss": 3.7345, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.809504396948343, |
| "grad_norm": 0.310585081577301, |
| "learning_rate": 0.0005666216137489076, |
| "loss": 3.7407, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8240638285481334, |
| "grad_norm": 0.30752629041671753, |
| "learning_rate": 0.0005664468394989804, |
| "loss": 3.7323, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8386232601479238, |
| "grad_norm": 0.3025956451892853, |
| "learning_rate": 0.0005662720652490533, |
| "loss": 3.74, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.853182691747714, |
| "grad_norm": 0.3014912009239197, |
| "learning_rate": 0.000566097290999126, |
| "loss": 3.7323, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.8677421233475044, |
| "grad_norm": 0.3069649934768677, |
| "learning_rate": 0.0005659225167491988, |
| "loss": 3.7426, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.882301554947295, |
| "grad_norm": 0.3200516104698181, |
| "learning_rate": 0.0005657477424992717, |
| "loss": 3.7251, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.896860986547085, |
| "grad_norm": 0.29957816004753113, |
| "learning_rate": 0.0005655729682493445, |
| "loss": 3.739, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.9114204181468755, |
| "grad_norm": 0.2948393225669861, |
| "learning_rate": 0.0005653981939994174, |
| "loss": 3.731, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9114204181468755, |
| "eval_accuracy": 0.3490493878998981, |
| "eval_loss": 3.7306628227233887, |
| "eval_runtime": 54.2596, |
| "eval_samples_per_second": 306.729, |
| "eval_steps_per_second": 19.186, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.925979849746666, |
| "grad_norm": 0.29376843571662903, |
| "learning_rate": 0.0005652234197494902, |
| "loss": 3.7304, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.940539281346456, |
| "grad_norm": 0.30596283078193665, |
| "learning_rate": 0.0005650486454995629, |
| "loss": 3.7211, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.9550987129462465, |
| "grad_norm": 0.29923945665359497, |
| "learning_rate": 0.0005648738712496358, |
| "loss": 3.7182, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.969658144546037, |
| "grad_norm": 0.2963240444660187, |
| "learning_rate": 0.0005646990969997086, |
| "loss": 3.7241, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.984217576145827, |
| "grad_norm": 0.31028681993484497, |
| "learning_rate": 0.0005645243227497815, |
| "loss": 3.7338, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9987770077456175, |
| "grad_norm": 0.33405447006225586, |
| "learning_rate": 0.0005643495484998543, |
| "loss": 3.7299, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0131034884398114, |
| "grad_norm": 0.2879531979560852, |
| "learning_rate": 0.000564174774249927, |
| "loss": 3.6315, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0276629200396017, |
| "grad_norm": 0.28263619542121887, |
| "learning_rate": 0.0005639999999999999, |
| "loss": 3.6257, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.042222351639392, |
| "grad_norm": 0.31342795491218567, |
| "learning_rate": 0.0005638252257500727, |
| "loss": 3.6218, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.0567817832391824, |
| "grad_norm": 0.2994714081287384, |
| "learning_rate": 0.0005636504515001456, |
| "loss": 3.6184, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.0713412148389727, |
| "grad_norm": 0.3202952444553375, |
| "learning_rate": 0.0005634756772502185, |
| "loss": 3.613, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.085900646438763, |
| "grad_norm": 0.3065634071826935, |
| "learning_rate": 0.0005633009030002913, |
| "loss": 3.6314, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.1004600780385534, |
| "grad_norm": 0.29666420817375183, |
| "learning_rate": 0.000563126128750364, |
| "loss": 3.629, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.1150195096383437, |
| "grad_norm": 0.3102670907974243, |
| "learning_rate": 0.0005629513545004369, |
| "loss": 3.6263, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.129578941238134, |
| "grad_norm": 0.2930033206939697, |
| "learning_rate": 0.0005627765802505097, |
| "loss": 3.6311, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1441383728379244, |
| "grad_norm": 0.3130742609500885, |
| "learning_rate": 0.0005626018060005826, |
| "loss": 3.6318, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1586978044377148, |
| "grad_norm": 0.311313271522522, |
| "learning_rate": 0.0005624270317506554, |
| "loss": 3.6366, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.173257236037505, |
| "grad_norm": 0.32204383611679077, |
| "learning_rate": 0.0005622522575007282, |
| "loss": 3.6384, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1878166676372954, |
| "grad_norm": 0.3110329508781433, |
| "learning_rate": 0.000562077483250801, |
| "loss": 3.6283, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2023760992370858, |
| "grad_norm": 0.3356837034225464, |
| "learning_rate": 0.0005619027090008738, |
| "loss": 3.6279, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2023760992370858, |
| "eval_accuracy": 0.3507460045319148, |
| "eval_loss": 3.720451593399048, |
| "eval_runtime": 54.2329, |
| "eval_samples_per_second": 306.88, |
| "eval_steps_per_second": 19.195, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.216935530836876, |
| "grad_norm": 0.3050292134284973, |
| "learning_rate": 0.0005617279347509467, |
| "loss": 3.6233, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2314949624366665, |
| "grad_norm": 0.30432161688804626, |
| "learning_rate": 0.0005615531605010195, |
| "loss": 3.6161, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.246054394036457, |
| "grad_norm": 0.30849170684814453, |
| "learning_rate": 0.0005613783862510923, |
| "loss": 3.6405, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.260613825636247, |
| "grad_norm": 0.2979888617992401, |
| "learning_rate": 0.0005612036120011652, |
| "loss": 3.6458, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2751732572360375, |
| "grad_norm": 0.3145684599876404, |
| "learning_rate": 0.0005610288377512379, |
| "loss": 3.653, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.289732688835828, |
| "grad_norm": 0.3270987868309021, |
| "learning_rate": 0.0005608540635013107, |
| "loss": 3.6535, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.304292120435618, |
| "grad_norm": 0.3080297112464905, |
| "learning_rate": 0.0005606792892513836, |
| "loss": 3.6421, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.3188515520354085, |
| "grad_norm": 0.3096049427986145, |
| "learning_rate": 0.0005605045150014564, |
| "loss": 3.6457, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.333410983635199, |
| "grad_norm": 0.3183659315109253, |
| "learning_rate": 0.0005603297407515293, |
| "loss": 3.6523, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.347970415234989, |
| "grad_norm": 0.29702067375183105, |
| "learning_rate": 0.000560154966501602, |
| "loss": 3.6438, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3625298468347795, |
| "grad_norm": 0.3221289813518524, |
| "learning_rate": 0.0005599801922516748, |
| "loss": 3.6431, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.37708927843457, |
| "grad_norm": 0.30078771710395813, |
| "learning_rate": 0.0005598054180017477, |
| "loss": 3.643, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.39164871003436, |
| "grad_norm": 0.2992677390575409, |
| "learning_rate": 0.0005596306437518205, |
| "loss": 3.6412, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4062081416341505, |
| "grad_norm": 0.2946924567222595, |
| "learning_rate": 0.0005594558695018934, |
| "loss": 3.6514, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.420767573233941, |
| "grad_norm": 0.3194354772567749, |
| "learning_rate": 0.0005592810952519662, |
| "loss": 3.6428, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.435327004833731, |
| "grad_norm": 0.30356186628341675, |
| "learning_rate": 0.0005591063210020389, |
| "loss": 3.644, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4498864364335216, |
| "grad_norm": 0.3009282350540161, |
| "learning_rate": 0.0005589315467521118, |
| "loss": 3.6435, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.464445868033312, |
| "grad_norm": 0.3127005994319916, |
| "learning_rate": 0.0005587567725021846, |
| "loss": 3.6493, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4790052996331022, |
| "grad_norm": 0.3001411557197571, |
| "learning_rate": 0.0005585819982522575, |
| "loss": 3.643, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.4935647312328926, |
| "grad_norm": 0.30470672249794006, |
| "learning_rate": 0.0005584072240023303, |
| "loss": 3.6457, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.4935647312328926, |
| "eval_accuracy": 0.35282688501821313, |
| "eval_loss": 3.7005693912506104, |
| "eval_runtime": 54.1621, |
| "eval_samples_per_second": 307.281, |
| "eval_steps_per_second": 19.22, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.508124162832683, |
| "grad_norm": 0.3177192211151123, |
| "learning_rate": 0.000558232449752403, |
| "loss": 3.649, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5226835944324733, |
| "grad_norm": 0.28869521617889404, |
| "learning_rate": 0.0005580576755024759, |
| "loss": 3.6462, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.5372430260322636, |
| "grad_norm": 0.2869119346141815, |
| "learning_rate": 0.0005578829012525487, |
| "loss": 3.6557, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.551802457632054, |
| "grad_norm": 0.2948986887931824, |
| "learning_rate": 0.0005577081270026216, |
| "loss": 3.6402, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5663618892318443, |
| "grad_norm": 0.3073137402534485, |
| "learning_rate": 0.0005575333527526944, |
| "loss": 3.6419, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5809213208316346, |
| "grad_norm": 0.2939525246620178, |
| "learning_rate": 0.0005573585785027672, |
| "loss": 3.6478, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.595480752431425, |
| "grad_norm": 0.30777987837791443, |
| "learning_rate": 0.00055718380425284, |
| "loss": 3.6392, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6100401840312153, |
| "grad_norm": 0.2951711118221283, |
| "learning_rate": 0.0005570090300029128, |
| "loss": 3.6404, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.6245996156310056, |
| "grad_norm": 0.3050254285335541, |
| "learning_rate": 0.0005568342557529856, |
| "loss": 3.6431, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.639159047230796, |
| "grad_norm": 0.3149774670600891, |
| "learning_rate": 0.0005566594815030585, |
| "loss": 3.6474, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6537184788305863, |
| "grad_norm": 0.29318898916244507, |
| "learning_rate": 0.0005564847072531313, |
| "loss": 3.6583, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6682779104303767, |
| "grad_norm": 0.3082406222820282, |
| "learning_rate": 0.0005563099330032042, |
| "loss": 3.648, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.682837342030167, |
| "grad_norm": 0.30790096521377563, |
| "learning_rate": 0.0005561351587532769, |
| "loss": 3.6514, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.6973967736299573, |
| "grad_norm": 0.2913956046104431, |
| "learning_rate": 0.0005559603845033497, |
| "loss": 3.6446, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7119562052297477, |
| "grad_norm": 0.2990446090698242, |
| "learning_rate": 0.0005557856102534226, |
| "loss": 3.648, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.726515636829538, |
| "grad_norm": 0.2935076057910919, |
| "learning_rate": 0.0005556108360034954, |
| "loss": 3.6476, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7410750684293284, |
| "grad_norm": 0.30372142791748047, |
| "learning_rate": 0.0005554360617535683, |
| "loss": 3.6439, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.755634500029119, |
| "grad_norm": 0.2952456772327423, |
| "learning_rate": 0.000555261287503641, |
| "loss": 3.6436, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.770193931628909, |
| "grad_norm": 0.30808019638061523, |
| "learning_rate": 0.0005550865132537138, |
| "loss": 3.6507, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7847533632287, |
| "grad_norm": 0.3047133684158325, |
| "learning_rate": 0.0005549117390037867, |
| "loss": 3.6444, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7847533632287, |
| "eval_accuracy": 0.3544879913430104, |
| "eval_loss": 3.6809916496276855, |
| "eval_runtime": 54.2962, |
| "eval_samples_per_second": 306.522, |
| "eval_steps_per_second": 19.173, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7993127948284897, |
| "grad_norm": 0.29825451970100403, |
| "learning_rate": 0.0005547369647538596, |
| "loss": 3.6342, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.8138722264282805, |
| "grad_norm": 0.303677499294281, |
| "learning_rate": 0.0005545621905039324, |
| "loss": 3.6411, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.8284316580280704, |
| "grad_norm": 0.29525500535964966, |
| "learning_rate": 0.0005543874162540053, |
| "loss": 3.6517, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.842991089627861, |
| "grad_norm": 0.28726667165756226, |
| "learning_rate": 0.000554212642004078, |
| "loss": 3.6351, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.857550521227651, |
| "grad_norm": 0.29034554958343506, |
| "learning_rate": 0.0005540378677541508, |
| "loss": 3.6442, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.872109952827442, |
| "grad_norm": 0.30698099732398987, |
| "learning_rate": 0.0005538630935042237, |
| "loss": 3.6443, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.8866693844272318, |
| "grad_norm": 0.31629928946495056, |
| "learning_rate": 0.0005536883192542965, |
| "loss": 3.6473, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9012288160270225, |
| "grad_norm": 0.2991327941417694, |
| "learning_rate": 0.0005535135450043694, |
| "loss": 3.6418, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.9157882476268124, |
| "grad_norm": 0.31006455421447754, |
| "learning_rate": 0.0005533387707544422, |
| "loss": 3.6389, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.930347679226603, |
| "grad_norm": 0.3237093985080719, |
| "learning_rate": 0.0005531639965045149, |
| "loss": 3.6389, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.944907110826393, |
| "grad_norm": 0.2965652346611023, |
| "learning_rate": 0.0005529892222545878, |
| "loss": 3.638, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.959466542426184, |
| "grad_norm": 0.311987042427063, |
| "learning_rate": 0.0005528144480046606, |
| "loss": 3.643, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.974025974025974, |
| "grad_norm": 0.3185499310493469, |
| "learning_rate": 0.0005526396737547335, |
| "loss": 3.6513, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9885854056257646, |
| "grad_norm": 0.30643561482429504, |
| "learning_rate": 0.0005524648995048063, |
| "loss": 3.6323, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.002911886319958, |
| "grad_norm": 0.3289678692817688, |
| "learning_rate": 0.000552290125254879, |
| "loss": 3.622, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.017471317919749, |
| "grad_norm": 0.2990996539592743, |
| "learning_rate": 0.0005521153510049519, |
| "loss": 3.5201, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.032030749519539, |
| "grad_norm": 0.3066459596157074, |
| "learning_rate": 0.0005519405767550247, |
| "loss": 3.5315, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.046590181119329, |
| "grad_norm": 0.30182674527168274, |
| "learning_rate": 0.0005517658025050975, |
| "loss": 3.539, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.061149612719119, |
| "grad_norm": 0.32139694690704346, |
| "learning_rate": 0.0005515910282551704, |
| "loss": 3.5314, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.07570904431891, |
| "grad_norm": 0.3230557441711426, |
| "learning_rate": 0.0005514162540052432, |
| "loss": 3.5475, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.07570904431891, |
| "eval_accuracy": 0.35576130629956376, |
| "eval_loss": 3.6757664680480957, |
| "eval_runtime": 54.2593, |
| "eval_samples_per_second": 306.731, |
| "eval_steps_per_second": 19.186, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.0902684759187, |
| "grad_norm": 0.33777207136154175, |
| "learning_rate": 0.000551241479755316, |
| "loss": 3.5486, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.104827907518491, |
| "grad_norm": 0.31867823004722595, |
| "learning_rate": 0.0005510667055053888, |
| "loss": 3.5385, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.119387339118281, |
| "grad_norm": 0.3000119924545288, |
| "learning_rate": 0.0005508919312554616, |
| "loss": 3.5491, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.1339467707180715, |
| "grad_norm": 0.3066224753856659, |
| "learning_rate": 0.0005507171570055345, |
| "loss": 3.5543, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.148506202317861, |
| "grad_norm": 0.31685900688171387, |
| "learning_rate": 0.0005505423827556073, |
| "loss": 3.5482, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.163065633917652, |
| "grad_norm": 0.34130823612213135, |
| "learning_rate": 0.0005503676085056802, |
| "loss": 3.5526, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.177625065517442, |
| "grad_norm": 0.3072553277015686, |
| "learning_rate": 0.0005501928342557529, |
| "loss": 3.5638, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.192184497117233, |
| "grad_norm": 0.3119163513183594, |
| "learning_rate": 0.0005500180600058257, |
| "loss": 3.5611, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.206743928717023, |
| "grad_norm": 0.3274585008621216, |
| "learning_rate": 0.0005498432857558986, |
| "loss": 3.5548, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.2213033603168135, |
| "grad_norm": 0.29814136028289795, |
| "learning_rate": 0.0005496685115059714, |
| "loss": 3.5676, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.235862791916603, |
| "grad_norm": 0.3296707570552826, |
| "learning_rate": 0.0005494937372560443, |
| "loss": 3.5638, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.250422223516394, |
| "grad_norm": 0.30175575613975525, |
| "learning_rate": 0.000549318963006117, |
| "loss": 3.5672, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.264981655116184, |
| "grad_norm": 0.3530194163322449, |
| "learning_rate": 0.0005491441887561898, |
| "loss": 3.5689, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.279541086715975, |
| "grad_norm": 0.31514260172843933, |
| "learning_rate": 0.0005489694145062627, |
| "loss": 3.5576, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.294100518315765, |
| "grad_norm": 0.31697317957878113, |
| "learning_rate": 0.0005487946402563355, |
| "loss": 3.5512, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.308659949915556, |
| "grad_norm": 0.3378666937351227, |
| "learning_rate": 0.0005486198660064084, |
| "loss": 3.5777, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.3232193815153455, |
| "grad_norm": 0.32078737020492554, |
| "learning_rate": 0.0005484450917564812, |
| "loss": 3.5714, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.337778813115136, |
| "grad_norm": 0.3216348886489868, |
| "learning_rate": 0.0005482703175065539, |
| "loss": 3.5704, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.352338244714926, |
| "grad_norm": 0.3143816590309143, |
| "learning_rate": 0.0005480955432566268, |
| "loss": 3.5698, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.366897676314717, |
| "grad_norm": 0.29025039076805115, |
| "learning_rate": 0.0005479207690066996, |
| "loss": 3.5642, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.366897676314717, |
| "eval_accuracy": 0.3568779996361957, |
| "eval_loss": 3.6625804901123047, |
| "eval_runtime": 54.3076, |
| "eval_samples_per_second": 306.458, |
| "eval_steps_per_second": 19.169, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.381457107914507, |
| "grad_norm": 0.30588728189468384, |
| "learning_rate": 0.0005477459947567725, |
| "loss": 3.5717, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.396016539514298, |
| "grad_norm": 0.31308284401893616, |
| "learning_rate": 0.0005475712205068453, |
| "loss": 3.581, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.4105759711140875, |
| "grad_norm": 0.3067929446697235, |
| "learning_rate": 0.000547396446256918, |
| "loss": 3.5683, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.425135402713878, |
| "grad_norm": 0.3118480443954468, |
| "learning_rate": 0.0005472216720069909, |
| "loss": 3.5769, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.439694834313668, |
| "grad_norm": 0.3015614151954651, |
| "learning_rate": 0.0005470468977570637, |
| "loss": 3.5702, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.454254265913459, |
| "grad_norm": 0.29570505023002625, |
| "learning_rate": 0.0005468721235071365, |
| "loss": 3.5623, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.468813697513249, |
| "grad_norm": 0.3057698905467987, |
| "learning_rate": 0.0005466973492572094, |
| "loss": 3.5715, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.48337312911304, |
| "grad_norm": 0.3069203197956085, |
| "learning_rate": 0.0005465225750072822, |
| "loss": 3.5627, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.4979325607128295, |
| "grad_norm": 0.30680912733078003, |
| "learning_rate": 0.000546347800757355, |
| "loss": 3.569, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.51249199231262, |
| "grad_norm": 0.3536013066768646, |
| "learning_rate": 0.0005461730265074279, |
| "loss": 3.5698, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.52705142391241, |
| "grad_norm": 0.2927176356315613, |
| "learning_rate": 0.0005459982522575007, |
| "loss": 3.5731, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.541610855512201, |
| "grad_norm": 0.2923072576522827, |
| "learning_rate": 0.0005458234780075735, |
| "loss": 3.5775, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.556170287111991, |
| "grad_norm": 0.3102036118507385, |
| "learning_rate": 0.0005456487037576464, |
| "loss": 3.5778, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.570729718711782, |
| "grad_norm": 0.31466609239578247, |
| "learning_rate": 0.0005454739295077192, |
| "loss": 3.5794, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.585289150311572, |
| "grad_norm": 0.3035258650779724, |
| "learning_rate": 0.000545299155257792, |
| "loss": 3.5683, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.599848581911362, |
| "grad_norm": 0.3166468143463135, |
| "learning_rate": 0.0005451243810078648, |
| "loss": 3.5833, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.614408013511152, |
| "grad_norm": 0.28802427649497986, |
| "learning_rate": 0.0005449496067579376, |
| "loss": 3.5812, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.628967445110943, |
| "grad_norm": 0.32211169600486755, |
| "learning_rate": 0.0005447748325080105, |
| "loss": 3.582, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.643526876710733, |
| "grad_norm": 0.32095086574554443, |
| "learning_rate": 0.0005446000582580833, |
| "loss": 3.5819, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.658086308310524, |
| "grad_norm": 0.2993008494377136, |
| "learning_rate": 0.0005444252840081562, |
| "loss": 3.5793, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.658086308310524, |
| "eval_accuracy": 0.35770002797318573, |
| "eval_loss": 3.65142560005188, |
| "eval_runtime": 54.1813, |
| "eval_samples_per_second": 307.172, |
| "eval_steps_per_second": 19.213, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.672645739910314, |
| "grad_norm": 0.2886819839477539, |
| "learning_rate": 0.0005442505097582289, |
| "loss": 3.5858, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.687205171510104, |
| "grad_norm": 0.30564039945602417, |
| "learning_rate": 0.0005440757355083017, |
| "loss": 3.5699, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.701764603109894, |
| "grad_norm": 0.3058094084262848, |
| "learning_rate": 0.0005439009612583746, |
| "loss": 3.5668, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.716324034709685, |
| "grad_norm": 0.33634814620018005, |
| "learning_rate": 0.0005437261870084474, |
| "loss": 3.587, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.730883466309475, |
| "grad_norm": 0.29354017972946167, |
| "learning_rate": 0.0005435514127585203, |
| "loss": 3.5741, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.745442897909266, |
| "grad_norm": 0.3387173116207123, |
| "learning_rate": 0.000543376638508593, |
| "loss": 3.5868, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.760002329509056, |
| "grad_norm": 0.3297916650772095, |
| "learning_rate": 0.0005432018642586658, |
| "loss": 3.5881, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.7745617611088464, |
| "grad_norm": 0.31599003076553345, |
| "learning_rate": 0.0005430270900087387, |
| "loss": 3.5791, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.789121192708636, |
| "grad_norm": 0.30269932746887207, |
| "learning_rate": 0.0005428523157588115, |
| "loss": 3.5765, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.803680624308427, |
| "grad_norm": 0.3012683093547821, |
| "learning_rate": 0.0005426775415088843, |
| "loss": 3.5923, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.818240055908217, |
| "grad_norm": 0.32081395387649536, |
| "learning_rate": 0.0005425027672589572, |
| "loss": 3.5764, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.832799487508008, |
| "grad_norm": 0.3058837056159973, |
| "learning_rate": 0.0005423279930090299, |
| "loss": 3.5786, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.847358919107798, |
| "grad_norm": 0.2925353944301605, |
| "learning_rate": 0.0005421532187591028, |
| "loss": 3.5914, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.8619183507075885, |
| "grad_norm": 0.3263484537601471, |
| "learning_rate": 0.0005419784445091756, |
| "loss": 3.5866, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.876477782307378, |
| "grad_norm": 0.28891900181770325, |
| "learning_rate": 0.0005418036702592484, |
| "loss": 3.5627, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.891037213907169, |
| "grad_norm": 0.2890537977218628, |
| "learning_rate": 0.0005416288960093213, |
| "loss": 3.583, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.905596645506959, |
| "grad_norm": 0.2902555465698242, |
| "learning_rate": 0.000541454121759394, |
| "loss": 3.5861, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.92015607710675, |
| "grad_norm": 0.30477380752563477, |
| "learning_rate": 0.0005412793475094669, |
| "loss": 3.5722, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.93471550870654, |
| "grad_norm": 0.2957979142665863, |
| "learning_rate": 0.0005411045732595397, |
| "loss": 3.5871, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.9492749403063305, |
| "grad_norm": 0.3154853880405426, |
| "learning_rate": 0.0005409297990096125, |
| "loss": 3.5751, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.9492749403063305, |
| "eval_accuracy": 0.35962369892056895, |
| "eval_loss": 3.6343870162963867, |
| "eval_runtime": 54.3437, |
| "eval_samples_per_second": 306.255, |
| "eval_steps_per_second": 19.156, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.96383437190612, |
| "grad_norm": 0.3399759531021118, |
| "learning_rate": 0.0005407550247596854, |
| "loss": 3.578, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.978393803505911, |
| "grad_norm": 0.30684736371040344, |
| "learning_rate": 0.0005405802505097582, |
| "loss": 3.5717, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.992953235105701, |
| "grad_norm": 0.30768290162086487, |
| "learning_rate": 0.000540405476259831, |
| "loss": 3.5805, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.007279715799895, |
| "grad_norm": 0.30980637669563293, |
| "learning_rate": 0.0005402307020099038, |
| "loss": 3.5216, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.021839147399685, |
| "grad_norm": 0.3266715705394745, |
| "learning_rate": 0.0005400559277599766, |
| "loss": 3.4649, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.036398578999476, |
| "grad_norm": 0.3132665455341339, |
| "learning_rate": 0.0005398811535100495, |
| "loss": 3.461, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.050958010599266, |
| "grad_norm": 0.3107033371925354, |
| "learning_rate": 0.0005397063792601223, |
| "loss": 3.4781, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.065517442199057, |
| "grad_norm": 0.29676178097724915, |
| "learning_rate": 0.0005395316050101951, |
| "loss": 3.475, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.080076873798847, |
| "grad_norm": 0.32961541414260864, |
| "learning_rate": 0.0005393568307602679, |
| "loss": 3.4675, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.094636305398637, |
| "grad_norm": 0.31969591975212097, |
| "learning_rate": 0.0005391820565103407, |
| "loss": 3.4834, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.109195736998427, |
| "grad_norm": 0.30894723534584045, |
| "learning_rate": 0.0005390072822604136, |
| "loss": 3.4935, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.123755168598218, |
| "grad_norm": 0.3231548070907593, |
| "learning_rate": 0.0005388325080104864, |
| "loss": 3.481, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.138314600198008, |
| "grad_norm": 0.30943936109542847, |
| "learning_rate": 0.0005386577337605593, |
| "loss": 3.4973, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.152874031797799, |
| "grad_norm": 0.30249306559562683, |
| "learning_rate": 0.000538482959510632, |
| "loss": 3.4798, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.167433463397589, |
| "grad_norm": 0.306157648563385, |
| "learning_rate": 0.0005383081852607048, |
| "loss": 3.4981, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.1819928949973795, |
| "grad_norm": 0.334652304649353, |
| "learning_rate": 0.0005381334110107777, |
| "loss": 3.4941, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.196552326597169, |
| "grad_norm": 0.3305426836013794, |
| "learning_rate": 0.0005379586367608505, |
| "loss": 3.505, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.21111175819696, |
| "grad_norm": 0.32884451746940613, |
| "learning_rate": 0.0005377838625109233, |
| "loss": 3.5129, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.22567118979675, |
| "grad_norm": 0.2979142665863037, |
| "learning_rate": 0.0005376090882609961, |
| "loss": 3.5081, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.240230621396541, |
| "grad_norm": 0.2956278920173645, |
| "learning_rate": 0.0005374343140110689, |
| "loss": 3.5111, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.240230621396541, |
| "eval_accuracy": 0.35995657865480135, |
| "eval_loss": 3.6383495330810547, |
| "eval_runtime": 54.1651, |
| "eval_samples_per_second": 307.264, |
| "eval_steps_per_second": 19.219, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.254790052996331, |
| "grad_norm": 0.32184895873069763, |
| "learning_rate": 0.0005372595397611418, |
| "loss": 3.5039, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.2693494845961215, |
| "grad_norm": 0.3286205232143402, |
| "learning_rate": 0.0005370847655112147, |
| "loss": 3.5081, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.283908916195911, |
| "grad_norm": 0.3119205832481384, |
| "learning_rate": 0.0005369099912612875, |
| "loss": 3.5086, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.298468347795702, |
| "grad_norm": 0.3186841905117035, |
| "learning_rate": 0.0005367352170113603, |
| "loss": 3.5062, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.313027779395492, |
| "grad_norm": 0.3225612938404083, |
| "learning_rate": 0.0005365604427614331, |
| "loss": 3.507, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.327587210995283, |
| "grad_norm": 0.31384894251823425, |
| "learning_rate": 0.0005363856685115059, |
| "loss": 3.5197, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.342146642595073, |
| "grad_norm": 0.3117847740650177, |
| "learning_rate": 0.0005362108942615788, |
| "loss": 3.5111, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.3567060741948636, |
| "grad_norm": 0.32050588726997375, |
| "learning_rate": 0.0005360361200116516, |
| "loss": 3.5116, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.3712655057946534, |
| "grad_norm": 0.31556278467178345, |
| "learning_rate": 0.0005358613457617244, |
| "loss": 3.5275, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.385824937394444, |
| "grad_norm": 0.29756829142570496, |
| "learning_rate": 0.0005356865715117973, |
| "loss": 3.5141, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.400384368994234, |
| "grad_norm": 0.344346821308136, |
| "learning_rate": 0.00053551179726187, |
| "loss": 3.5257, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.414943800594025, |
| "grad_norm": 0.3157712519168854, |
| "learning_rate": 0.0005353370230119429, |
| "loss": 3.5109, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.429503232193815, |
| "grad_norm": 0.3057422339916229, |
| "learning_rate": 0.0005351622487620157, |
| "loss": 3.5194, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.444062663793606, |
| "grad_norm": 0.3119611144065857, |
| "learning_rate": 0.0005349874745120885, |
| "loss": 3.5164, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.4586220953933955, |
| "grad_norm": 0.3102344274520874, |
| "learning_rate": 0.0005348127002621614, |
| "loss": 3.5166, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.473181526993186, |
| "grad_norm": 0.30929329991340637, |
| "learning_rate": 0.0005346379260122341, |
| "loss": 3.5222, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.487740958592976, |
| "grad_norm": 0.3128523528575897, |
| "learning_rate": 0.000534463151762307, |
| "loss": 3.5241, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.502300390192767, |
| "grad_norm": 0.3076431155204773, |
| "learning_rate": 0.0005342883775123798, |
| "loss": 3.5398, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.516859821792557, |
| "grad_norm": 0.3266940116882324, |
| "learning_rate": 0.0005341136032624526, |
| "loss": 3.5278, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.531419253392348, |
| "grad_norm": 0.3071880638599396, |
| "learning_rate": 0.0005339388290125255, |
| "loss": 3.5112, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.531419253392348, |
| "eval_accuracy": 0.3604157433888803, |
| "eval_loss": 3.6261556148529053, |
| "eval_runtime": 54.4593, |
| "eval_samples_per_second": 305.604, |
| "eval_steps_per_second": 19.115, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5459786849921375, |
| "grad_norm": 0.30749401450157166, |
| "learning_rate": 0.0005337640547625983, |
| "loss": 3.5273, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.560538116591928, |
| "grad_norm": 0.3211112916469574, |
| "learning_rate": 0.000533589280512671, |
| "loss": 3.5192, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.575097548191718, |
| "grad_norm": 0.31711339950561523, |
| "learning_rate": 0.0005334145062627439, |
| "loss": 3.5173, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.589656979791509, |
| "grad_norm": 0.321505606174469, |
| "learning_rate": 0.0005332397320128167, |
| "loss": 3.55, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.604216411391299, |
| "grad_norm": 0.31558939814567566, |
| "learning_rate": 0.0005330649577628896, |
| "loss": 3.5341, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.61877584299109, |
| "grad_norm": 0.31166985630989075, |
| "learning_rate": 0.0005328901835129624, |
| "loss": 3.5368, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.6333352745908805, |
| "grad_norm": 0.3260209560394287, |
| "learning_rate": 0.0005327154092630351, |
| "loss": 3.5323, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.64789470619067, |
| "grad_norm": 0.31407615542411804, |
| "learning_rate": 0.000532540635013108, |
| "loss": 3.5209, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.66245413779046, |
| "grad_norm": 0.32734420895576477, |
| "learning_rate": 0.0005323658607631808, |
| "loss": 3.5266, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.677013569390251, |
| "grad_norm": 0.3149868845939636, |
| "learning_rate": 0.0005321910865132537, |
| "loss": 3.5234, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.691573000990042, |
| "grad_norm": 0.3325765132904053, |
| "learning_rate": 0.0005320163122633265, |
| "loss": 3.5196, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.706132432589832, |
| "grad_norm": 0.2984762191772461, |
| "learning_rate": 0.0005318415380133993, |
| "loss": 3.5319, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.720691864189622, |
| "grad_norm": 0.3270318806171417, |
| "learning_rate": 0.0005316667637634721, |
| "loss": 3.5425, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.735251295789412, |
| "grad_norm": 0.32038673758506775, |
| "learning_rate": 0.0005314919895135449, |
| "loss": 3.533, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.749810727389203, |
| "grad_norm": 0.3111437261104584, |
| "learning_rate": 0.0005313172152636178, |
| "loss": 3.5343, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.764370158988993, |
| "grad_norm": 0.308755099773407, |
| "learning_rate": 0.0005311424410136906, |
| "loss": 3.5286, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.778929590588783, |
| "grad_norm": 0.3227141499519348, |
| "learning_rate": 0.0005309676667637634, |
| "loss": 3.527, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.793489022188574, |
| "grad_norm": 0.31226617097854614, |
| "learning_rate": 0.0005307928925138363, |
| "loss": 3.5389, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.8080484537883645, |
| "grad_norm": 0.2969604730606079, |
| "learning_rate": 0.000530618118263909, |
| "loss": 3.5176, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.822607885388154, |
| "grad_norm": 0.34588631987571716, |
| "learning_rate": 0.0005304433440139819, |
| "loss": 3.5203, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.822607885388154, |
| "eval_accuracy": 0.36184932506311607, |
| "eval_loss": 3.6176443099975586, |
| "eval_runtime": 54.4221, |
| "eval_samples_per_second": 305.814, |
| "eval_steps_per_second": 19.128, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.837167316987944, |
| "grad_norm": 0.31692346930503845, |
| "learning_rate": 0.0005302685697640547, |
| "loss": 3.5295, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.851726748587735, |
| "grad_norm": 0.3064531683921814, |
| "learning_rate": 0.0005300937955141275, |
| "loss": 3.5287, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.866286180187526, |
| "grad_norm": 0.31022435426712036, |
| "learning_rate": 0.0005299190212642004, |
| "loss": 3.5314, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.880845611787316, |
| "grad_norm": 0.3072813153266907, |
| "learning_rate": 0.0005297442470142731, |
| "loss": 3.5338, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.895405043387106, |
| "grad_norm": 0.2999200224876404, |
| "learning_rate": 0.000529569472764346, |
| "loss": 3.5218, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.9099644749868965, |
| "grad_norm": 0.31100237369537354, |
| "learning_rate": 0.0005293946985144188, |
| "loss": 3.5419, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.924523906586687, |
| "grad_norm": 0.2922367751598358, |
| "learning_rate": 0.0005292199242644916, |
| "loss": 3.5429, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.939083338186477, |
| "grad_norm": 0.3141598701477051, |
| "learning_rate": 0.0005290451500145645, |
| "loss": 3.5185, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.953642769786267, |
| "grad_norm": 0.33754071593284607, |
| "learning_rate": 0.0005288703757646373, |
| "loss": 3.5203, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.968202201386058, |
| "grad_norm": 0.3237624168395996, |
| "learning_rate": 0.00052869560151471, |
| "loss": 3.5362, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.982761632985849, |
| "grad_norm": 0.2930886447429657, |
| "learning_rate": 0.0005285208272647829, |
| "loss": 3.5243, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.9973210645856385, |
| "grad_norm": 0.3187922537326813, |
| "learning_rate": 0.0005283460530148558, |
| "loss": 3.5354, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.011647545279832, |
| "grad_norm": 0.3651193380355835, |
| "learning_rate": 0.0005281712787649286, |
| "loss": 3.4451, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.026206976879623, |
| "grad_norm": 0.3150573968887329, |
| "learning_rate": 0.0005279965045150015, |
| "loss": 3.4055, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.040766408479413, |
| "grad_norm": 0.32729947566986084, |
| "learning_rate": 0.0005278217302650743, |
| "loss": 3.423, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.055325840079203, |
| "grad_norm": 0.31897303462028503, |
| "learning_rate": 0.000527646956015147, |
| "loss": 3.4237, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.069885271678993, |
| "grad_norm": 0.3033381700515747, |
| "learning_rate": 0.0005274721817652199, |
| "loss": 3.4244, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.084444703278784, |
| "grad_norm": 0.33832624554634094, |
| "learning_rate": 0.0005272974075152927, |
| "loss": 3.4368, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.099004134878574, |
| "grad_norm": 0.3380010426044464, |
| "learning_rate": 0.0005271226332653656, |
| "loss": 3.4396, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.113563566478365, |
| "grad_norm": 0.32484617829322815, |
| "learning_rate": 0.0005269478590154384, |
| "loss": 3.4341, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.113563566478365, |
| "eval_accuracy": 0.36267782050903674, |
| "eval_loss": 3.6185672283172607, |
| "eval_runtime": 53.9496, |
| "eval_samples_per_second": 308.491, |
| "eval_steps_per_second": 19.296, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.128122998078155, |
| "grad_norm": 0.32085222005844116, |
| "learning_rate": 0.0005267730847655111, |
| "loss": 3.4444, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.142682429677945, |
| "grad_norm": 0.31938812136650085, |
| "learning_rate": 0.000526598310515584, |
| "loss": 3.4584, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.157241861277735, |
| "grad_norm": 0.3215405344963074, |
| "learning_rate": 0.0005264235362656568, |
| "loss": 3.4451, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.171801292877526, |
| "grad_norm": 0.30794188380241394, |
| "learning_rate": 0.0005262487620157297, |
| "loss": 3.4428, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.186360724477316, |
| "grad_norm": 0.3180452585220337, |
| "learning_rate": 0.0005260739877658025, |
| "loss": 3.4677, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.200920156077107, |
| "grad_norm": 0.31727978587150574, |
| "learning_rate": 0.0005258992135158753, |
| "loss": 3.4612, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.215479587676897, |
| "grad_norm": 0.3364965617656708, |
| "learning_rate": 0.0005257244392659481, |
| "loss": 3.4563, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.2300390192766875, |
| "grad_norm": 0.3123633861541748, |
| "learning_rate": 0.0005255496650160209, |
| "loss": 3.463, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.244598450876477, |
| "grad_norm": 0.3345796763896942, |
| "learning_rate": 0.0005253748907660938, |
| "loss": 3.4686, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.259157882476268, |
| "grad_norm": 0.32419443130493164, |
| "learning_rate": 0.0005252001165161666, |
| "loss": 3.4628, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.273717314076059, |
| "grad_norm": 0.34241312742233276, |
| "learning_rate": 0.0005250253422662394, |
| "loss": 3.4714, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.288276745675849, |
| "grad_norm": 0.3336371183395386, |
| "learning_rate": 0.0005248505680163123, |
| "loss": 3.4706, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.302836177275639, |
| "grad_norm": 0.3192159831523895, |
| "learning_rate": 0.000524675793766385, |
| "loss": 3.4659, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.3173956088754295, |
| "grad_norm": 0.3582025170326233, |
| "learning_rate": 0.0005245010195164579, |
| "loss": 3.4717, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.33195504047522, |
| "grad_norm": 0.31705984473228455, |
| "learning_rate": 0.0005243262452665307, |
| "loss": 3.4633, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.34651447207501, |
| "grad_norm": 0.32373446226119995, |
| "learning_rate": 0.0005241514710166035, |
| "loss": 3.4706, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.3610739036748, |
| "grad_norm": 0.3116013705730438, |
| "learning_rate": 0.0005239766967666764, |
| "loss": 3.4741, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.375633335274591, |
| "grad_norm": 0.3358467221260071, |
| "learning_rate": 0.0005238019225167491, |
| "loss": 3.4731, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.390192766874382, |
| "grad_norm": 0.30520445108413696, |
| "learning_rate": 0.0005236271482668219, |
| "loss": 3.4689, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.4047521984741715, |
| "grad_norm": 0.3098299503326416, |
| "learning_rate": 0.0005234523740168948, |
| "loss": 3.478, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.4047521984741715, |
| "eval_accuracy": 0.3628752437071209, |
| "eval_loss": 3.6096370220184326, |
| "eval_runtime": 54.1283, |
| "eval_samples_per_second": 307.473, |
| "eval_steps_per_second": 19.232, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.419311630073962, |
| "grad_norm": 0.299277126789093, |
| "learning_rate": 0.0005232775997669676, |
| "loss": 3.4772, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.433871061673752, |
| "grad_norm": 0.33022385835647583, |
| "learning_rate": 0.0005231028255170405, |
| "loss": 3.4784, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.448430493273543, |
| "grad_norm": 0.3448750376701355, |
| "learning_rate": 0.0005229280512671133, |
| "loss": 3.4862, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.462989924873333, |
| "grad_norm": 0.31978732347488403, |
| "learning_rate": 0.000522753277017186, |
| "loss": 3.4777, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.477549356473124, |
| "grad_norm": 0.3098117709159851, |
| "learning_rate": 0.0005225785027672589, |
| "loss": 3.4775, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.492108788072914, |
| "grad_norm": 0.3060181140899658, |
| "learning_rate": 0.0005224037285173317, |
| "loss": 3.4832, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.506668219672704, |
| "grad_norm": 0.3079688549041748, |
| "learning_rate": 0.0005222289542674046, |
| "loss": 3.4789, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.521227651272494, |
| "grad_norm": 0.3160729706287384, |
| "learning_rate": 0.0005220541800174774, |
| "loss": 3.4833, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.535787082872285, |
| "grad_norm": 0.32327190041542053, |
| "learning_rate": 0.0005218794057675501, |
| "loss": 3.4703, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.550346514472075, |
| "grad_norm": 0.3138171136379242, |
| "learning_rate": 0.000521704631517623, |
| "loss": 3.4716, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.564905946071866, |
| "grad_norm": 0.3231295049190521, |
| "learning_rate": 0.0005215298572676958, |
| "loss": 3.4791, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.579465377671656, |
| "grad_norm": 0.3182159662246704, |
| "learning_rate": 0.0005213550830177687, |
| "loss": 3.4915, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.594024809271446, |
| "grad_norm": 0.29927167296409607, |
| "learning_rate": 0.0005211803087678415, |
| "loss": 3.4777, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.608584240871236, |
| "grad_norm": 0.29772549867630005, |
| "learning_rate": 0.0005210055345179143, |
| "loss": 3.4757, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.623143672471027, |
| "grad_norm": 0.3051946759223938, |
| "learning_rate": 0.0005208307602679871, |
| "loss": 3.4851, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.637703104070817, |
| "grad_norm": 0.3218457102775574, |
| "learning_rate": 0.0005206559860180599, |
| "loss": 3.4805, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.652262535670608, |
| "grad_norm": 0.3196474611759186, |
| "learning_rate": 0.0005204812117681328, |
| "loss": 3.4912, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.666821967270398, |
| "grad_norm": 0.30887487530708313, |
| "learning_rate": 0.0005203064375182056, |
| "loss": 3.4878, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.6813813988701884, |
| "grad_norm": 0.34201371669769287, |
| "learning_rate": 0.0005201316632682784, |
| "loss": 3.4869, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.695940830469978, |
| "grad_norm": 0.33082863688468933, |
| "learning_rate": 0.0005199568890183513, |
| "loss": 3.4906, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.695940830469978, |
| "eval_accuracy": 0.3635063159549574, |
| "eval_loss": 3.602727174758911, |
| "eval_runtime": 54.3369, |
| "eval_samples_per_second": 306.293, |
| "eval_steps_per_second": 19.158, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.710500262069769, |
| "grad_norm": 0.31607675552368164, |
| "learning_rate": 0.000519782114768424, |
| "loss": 3.4755, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.725059693669559, |
| "grad_norm": 0.300519198179245, |
| "learning_rate": 0.0005196073405184969, |
| "loss": 3.4849, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.73961912526935, |
| "grad_norm": 0.3010658919811249, |
| "learning_rate": 0.0005194325662685697, |
| "loss": 3.4793, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.75417855686914, |
| "grad_norm": 0.3112882077693939, |
| "learning_rate": 0.0005192577920186426, |
| "loss": 3.4941, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.7687379884689305, |
| "grad_norm": 0.32871827483177185, |
| "learning_rate": 0.0005190830177687154, |
| "loss": 3.4962, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.78329742006872, |
| "grad_norm": 0.32987311482429504, |
| "learning_rate": 0.0005189082435187883, |
| "loss": 3.5037, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.797856851668511, |
| "grad_norm": 0.3238174021244049, |
| "learning_rate": 0.000518733469268861, |
| "loss": 3.4946, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.812416283268301, |
| "grad_norm": 0.33636802434921265, |
| "learning_rate": 0.0005185586950189338, |
| "loss": 3.4939, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.826975714868092, |
| "grad_norm": 0.312847763299942, |
| "learning_rate": 0.0005183839207690067, |
| "loss": 3.4955, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.841535146467882, |
| "grad_norm": 0.3202083706855774, |
| "learning_rate": 0.0005182091465190795, |
| "loss": 3.4939, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.8560945780676725, |
| "grad_norm": 0.31817376613616943, |
| "learning_rate": 0.0005180343722691524, |
| "loss": 3.4978, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.870654009667462, |
| "grad_norm": 0.3149799406528473, |
| "learning_rate": 0.0005178595980192251, |
| "loss": 3.4968, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.885213441267253, |
| "grad_norm": 0.2956794202327728, |
| "learning_rate": 0.0005176848237692979, |
| "loss": 3.4857, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.899772872867043, |
| "grad_norm": 0.32103171944618225, |
| "learning_rate": 0.0005175100495193708, |
| "loss": 3.4962, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.914332304466834, |
| "grad_norm": 0.2945249676704407, |
| "learning_rate": 0.0005173352752694436, |
| "loss": 3.4761, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.928891736066624, |
| "grad_norm": 0.2928471863269806, |
| "learning_rate": 0.0005171605010195165, |
| "loss": 3.4972, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.943451167666415, |
| "grad_norm": 0.35107314586639404, |
| "learning_rate": 0.0005169857267695893, |
| "loss": 3.5057, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.9580105992662045, |
| "grad_norm": 0.30272382497787476, |
| "learning_rate": 0.000516810952519662, |
| "loss": 3.4895, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.972570030865995, |
| "grad_norm": 0.29927268624305725, |
| "learning_rate": 0.0005166361782697349, |
| "loss": 3.5006, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.987129462465785, |
| "grad_norm": 0.31151506304740906, |
| "learning_rate": 0.0005164614040198077, |
| "loss": 3.4966, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.987129462465785, |
| "eval_accuracy": 0.364730716051235, |
| "eval_loss": 3.594974994659424, |
| "eval_runtime": 54.1369, |
| "eval_samples_per_second": 307.424, |
| "eval_steps_per_second": 19.229, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.001455943159979, |
| "grad_norm": 0.3276313841342926, |
| "learning_rate": 0.0005162866297698806, |
| "loss": 3.4778, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.016015374759769, |
| "grad_norm": 0.33290964365005493, |
| "learning_rate": 0.0005161118555199534, |
| "loss": 3.3818, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.03057480635956, |
| "grad_norm": 0.3170606195926666, |
| "learning_rate": 0.0005159370812700261, |
| "loss": 3.3891, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.04513423795935, |
| "grad_norm": 0.3252675533294678, |
| "learning_rate": 0.000515762307020099, |
| "loss": 3.3859, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.059693669559141, |
| "grad_norm": 0.3159274756908417, |
| "learning_rate": 0.0005155875327701718, |
| "loss": 3.3887, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.074253101158931, |
| "grad_norm": 0.34390512108802795, |
| "learning_rate": 0.0005154127585202447, |
| "loss": 3.3975, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.0888125327587215, |
| "grad_norm": 0.32971522212028503, |
| "learning_rate": 0.0005152379842703175, |
| "loss": 3.3996, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.103371964358511, |
| "grad_norm": 0.306550532579422, |
| "learning_rate": 0.0005150632100203903, |
| "loss": 3.3994, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.117931395958302, |
| "grad_norm": 0.3376745581626892, |
| "learning_rate": 0.0005148884357704631, |
| "loss": 3.4013, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.132490827558092, |
| "grad_norm": 0.3302764296531677, |
| "learning_rate": 0.0005147136615205359, |
| "loss": 3.4085, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.147050259157883, |
| "grad_norm": 0.3368372321128845, |
| "learning_rate": 0.0005145388872706087, |
| "loss": 3.4101, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.161609690757673, |
| "grad_norm": 0.33424296975135803, |
| "learning_rate": 0.0005143641130206816, |
| "loss": 3.428, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.1761691223574635, |
| "grad_norm": 0.3270719647407532, |
| "learning_rate": 0.0005141893387707544, |
| "loss": 3.4132, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.190728553957253, |
| "grad_norm": 0.33818596601486206, |
| "learning_rate": 0.0005140145645208272, |
| "loss": 3.4144, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.205287985557044, |
| "grad_norm": 0.32162582874298096, |
| "learning_rate": 0.0005138397902709, |
| "loss": 3.4265, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.219847417156834, |
| "grad_norm": 0.33046919107437134, |
| "learning_rate": 0.0005136650160209728, |
| "loss": 3.4209, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.234406848756625, |
| "grad_norm": 0.31164079904556274, |
| "learning_rate": 0.0005134902417710457, |
| "loss": 3.4252, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.248966280356415, |
| "grad_norm": 0.35857391357421875, |
| "learning_rate": 0.0005133154675211185, |
| "loss": 3.4089, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.2635257119562056, |
| "grad_norm": 0.32694748044013977, |
| "learning_rate": 0.0005131406932711914, |
| "loss": 3.4282, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.2780851435559955, |
| "grad_norm": 0.3398299813270569, |
| "learning_rate": 0.0005129659190212641, |
| "loss": 3.4272, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.2780851435559955, |
| "eval_accuracy": 0.3646033727971998, |
| "eval_loss": 3.5990543365478516, |
| "eval_runtime": 54.4254, |
| "eval_samples_per_second": 305.794, |
| "eval_steps_per_second": 19.127, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.292644575155786, |
| "grad_norm": 0.3356783092021942, |
| "learning_rate": 0.0005127911447713369, |
| "loss": 3.4373, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.307204006755576, |
| "grad_norm": 0.3248707354068756, |
| "learning_rate": 0.0005126163705214098, |
| "loss": 3.4371, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.321763438355367, |
| "grad_norm": 0.31292667984962463, |
| "learning_rate": 0.0005124415962714826, |
| "loss": 3.4362, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.336322869955157, |
| "grad_norm": 0.29806479811668396, |
| "learning_rate": 0.0005122668220215555, |
| "loss": 3.4325, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.350882301554948, |
| "grad_norm": 0.33509254455566406, |
| "learning_rate": 0.0005120920477716282, |
| "loss": 3.4387, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.3654417331547375, |
| "grad_norm": 0.33612021803855896, |
| "learning_rate": 0.000511917273521701, |
| "loss": 3.4294, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.380001164754528, |
| "grad_norm": 0.3314110040664673, |
| "learning_rate": 0.0005117424992717739, |
| "loss": 3.439, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.394560596354318, |
| "grad_norm": 0.31326502561569214, |
| "learning_rate": 0.0005115677250218467, |
| "loss": 3.4409, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.409120027954109, |
| "grad_norm": 0.32877790927886963, |
| "learning_rate": 0.0005113929507719196, |
| "loss": 3.4458, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.423679459553899, |
| "grad_norm": 0.3034365475177765, |
| "learning_rate": 0.0005112181765219924, |
| "loss": 3.4455, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.43823889115369, |
| "grad_norm": 0.30074968934059143, |
| "learning_rate": 0.0005110434022720651, |
| "loss": 3.4473, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.4527983227534795, |
| "grad_norm": 0.34182247519493103, |
| "learning_rate": 0.000510868628022138, |
| "loss": 3.4443, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.46735775435327, |
| "grad_norm": 0.324805349111557, |
| "learning_rate": 0.0005106938537722109, |
| "loss": 3.4394, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.48191718595306, |
| "grad_norm": 0.327799916267395, |
| "learning_rate": 0.0005105190795222837, |
| "loss": 3.4358, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.496476617552851, |
| "grad_norm": 0.32267752289772034, |
| "learning_rate": 0.0005103443052723565, |
| "loss": 3.4471, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.511036049152641, |
| "grad_norm": 0.3113254904747009, |
| "learning_rate": 0.0005101695310224294, |
| "loss": 3.4371, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.525595480752432, |
| "grad_norm": 0.3191179633140564, |
| "learning_rate": 0.0005099947567725021, |
| "loss": 3.4375, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.540154912352222, |
| "grad_norm": 0.3289468288421631, |
| "learning_rate": 0.000509819982522575, |
| "loss": 3.4396, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.554714343952012, |
| "grad_norm": 0.3294450044631958, |
| "learning_rate": 0.0005096452082726478, |
| "loss": 3.4431, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.569273775551802, |
| "grad_norm": 0.32150015234947205, |
| "learning_rate": 0.0005094704340227206, |
| "loss": 3.4526, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.569273775551802, |
| "eval_accuracy": 0.3649583582855953, |
| "eval_loss": 3.5938405990600586, |
| "eval_runtime": 54.4229, |
| "eval_samples_per_second": 305.809, |
| "eval_steps_per_second": 19.128, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.583833207151593, |
| "grad_norm": 0.36833828687667847, |
| "learning_rate": 0.0005092956597727935, |
| "loss": 3.4566, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.598392638751383, |
| "grad_norm": 0.3328242897987366, |
| "learning_rate": 0.0005091208855228662, |
| "loss": 3.4516, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.612952070351174, |
| "grad_norm": 0.3110596239566803, |
| "learning_rate": 0.0005089461112729391, |
| "loss": 3.4443, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.627511501950964, |
| "grad_norm": 0.3015148341655731, |
| "learning_rate": 0.0005087713370230119, |
| "loss": 3.4576, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.642070933550754, |
| "grad_norm": 0.31756895780563354, |
| "learning_rate": 0.0005085965627730847, |
| "loss": 3.4537, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.656630365150544, |
| "grad_norm": 0.3184044361114502, |
| "learning_rate": 0.0005084217885231576, |
| "loss": 3.4558, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.671189796750335, |
| "grad_norm": 0.3208577036857605, |
| "learning_rate": 0.0005082470142732304, |
| "loss": 3.4679, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.685749228350125, |
| "grad_norm": 0.31280356645584106, |
| "learning_rate": 0.0005080722400233032, |
| "loss": 3.4482, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.700308659949916, |
| "grad_norm": 0.31253930926322937, |
| "learning_rate": 0.000507897465773376, |
| "loss": 3.4452, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.714868091549706, |
| "grad_norm": 0.30488118529319763, |
| "learning_rate": 0.0005077226915234488, |
| "loss": 3.4542, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.729427523149496, |
| "grad_norm": 0.3201649785041809, |
| "learning_rate": 0.0005075479172735217, |
| "loss": 3.4561, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.743986954749286, |
| "grad_norm": 0.3104819655418396, |
| "learning_rate": 0.0005073731430235945, |
| "loss": 3.4467, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.758546386349077, |
| "grad_norm": 0.3295295238494873, |
| "learning_rate": 0.0005071983687736674, |
| "loss": 3.4631, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.773105817948867, |
| "grad_norm": 0.30710911750793457, |
| "learning_rate": 0.0005070235945237401, |
| "loss": 3.4518, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.787665249548658, |
| "grad_norm": 0.33581966161727905, |
| "learning_rate": 0.0005068488202738129, |
| "loss": 3.4568, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.802224681148448, |
| "grad_norm": 0.3220713436603546, |
| "learning_rate": 0.0005066740460238858, |
| "loss": 3.451, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.8167841127482385, |
| "grad_norm": 0.3231236934661865, |
| "learning_rate": 0.0005064992717739586, |
| "loss": 3.4682, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.831343544348028, |
| "grad_norm": 0.3177933692932129, |
| "learning_rate": 0.0005063244975240315, |
| "loss": 3.4558, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.845902975947819, |
| "grad_norm": 0.3490990102291107, |
| "learning_rate": 0.0005061497232741042, |
| "loss": 3.4578, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.860462407547609, |
| "grad_norm": 0.3442172110080719, |
| "learning_rate": 0.000505974949024177, |
| "loss": 3.4506, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.860462407547609, |
| "eval_accuracy": 0.3662008662868788, |
| "eval_loss": 3.581078052520752, |
| "eval_runtime": 54.1953, |
| "eval_samples_per_second": 307.093, |
| "eval_steps_per_second": 19.208, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.8750218391474, |
| "grad_norm": 0.343916654586792, |
| "learning_rate": 0.0005058001747742499, |
| "loss": 3.4742, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.88958127074719, |
| "grad_norm": 0.31028035283088684, |
| "learning_rate": 0.0005056254005243227, |
| "loss": 3.464, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.9041407023469805, |
| "grad_norm": 0.3196263313293457, |
| "learning_rate": 0.0005054506262743955, |
| "loss": 3.458, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.91870013394677, |
| "grad_norm": 0.31997817754745483, |
| "learning_rate": 0.0005052758520244684, |
| "loss": 3.467, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.933259565546561, |
| "grad_norm": 0.3503701388835907, |
| "learning_rate": 0.0005051010777745411, |
| "loss": 3.4566, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.947818997146351, |
| "grad_norm": 0.3098277151584625, |
| "learning_rate": 0.000504926303524614, |
| "loss": 3.4636, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.962378428746142, |
| "grad_norm": 0.3302386403083801, |
| "learning_rate": 0.0005047515292746868, |
| "loss": 3.4563, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.976937860345932, |
| "grad_norm": 0.3156256079673767, |
| "learning_rate": 0.0005045767550247596, |
| "loss": 3.465, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.991497291945723, |
| "grad_norm": 0.31740131974220276, |
| "learning_rate": 0.0005044019807748325, |
| "loss": 3.4715, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.005823772639916, |
| "grad_norm": 0.3159884810447693, |
| "learning_rate": 0.0005042272065249052, |
| "loss": 3.4171, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.020383204239707, |
| "grad_norm": 0.32894912362098694, |
| "learning_rate": 0.0005040524322749781, |
| "loss": 3.3421, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.034942635839498, |
| "grad_norm": 0.3333737552165985, |
| "learning_rate": 0.0005038776580250509, |
| "loss": 3.3601, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.049502067439287, |
| "grad_norm": 0.3066194951534271, |
| "learning_rate": 0.0005037028837751237, |
| "loss": 3.3551, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.064061499039077, |
| "grad_norm": 0.3339882493019104, |
| "learning_rate": 0.0005035281095251966, |
| "loss": 3.3661, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.078620930638868, |
| "grad_norm": 0.3425856828689575, |
| "learning_rate": 0.0005033533352752694, |
| "loss": 3.3698, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.093180362238659, |
| "grad_norm": 0.3282395005226135, |
| "learning_rate": 0.0005031785610253422, |
| "loss": 3.3741, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.107739793838448, |
| "grad_norm": 0.3201969563961029, |
| "learning_rate": 0.000503003786775415, |
| "loss": 3.3759, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.122299225438239, |
| "grad_norm": 0.3361366391181946, |
| "learning_rate": 0.0005028290125254878, |
| "loss": 3.3752, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.13685865703803, |
| "grad_norm": 0.3365829288959503, |
| "learning_rate": 0.0005026542382755607, |
| "loss": 3.3637, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.15141808863782, |
| "grad_norm": 0.3227217495441437, |
| "learning_rate": 0.0005024794640256335, |
| "loss": 3.3616, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.15141808863782, |
| "eval_accuracy": 0.3656379926423114, |
| "eval_loss": 3.5914840698242188, |
| "eval_runtime": 53.9744, |
| "eval_samples_per_second": 308.35, |
| "eval_steps_per_second": 19.287, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.16597752023761, |
| "grad_norm": 0.3198586702346802, |
| "learning_rate": 0.0005023046897757064, |
| "loss": 3.3881, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.1805369518374, |
| "grad_norm": 0.35139596462249756, |
| "learning_rate": 0.0005021299155257791, |
| "loss": 3.3827, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.19509638343719, |
| "grad_norm": 0.30760928988456726, |
| "learning_rate": 0.000501955141275852, |
| "loss": 3.3961, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.209655815036982, |
| "grad_norm": 0.31556805968284607, |
| "learning_rate": 0.0005017803670259248, |
| "loss": 3.3849, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.22421524663677, |
| "grad_norm": 0.34970980882644653, |
| "learning_rate": 0.0005016055927759977, |
| "loss": 3.3894, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.238774678236561, |
| "grad_norm": 0.3420124650001526, |
| "learning_rate": 0.0005014308185260705, |
| "loss": 3.4022, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.253334109836352, |
| "grad_norm": 0.3457973003387451, |
| "learning_rate": 0.0005012560442761432, |
| "loss": 3.3901, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.267893541436143, |
| "grad_norm": 0.32130882143974304, |
| "learning_rate": 0.0005010812700262161, |
| "loss": 3.3881, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.282452973035932, |
| "grad_norm": 0.31469210982322693, |
| "learning_rate": 0.0005009064957762889, |
| "loss": 3.4068, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.297012404635723, |
| "grad_norm": 0.3118443787097931, |
| "learning_rate": 0.0005007317215263618, |
| "loss": 3.4032, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.311571836235514, |
| "grad_norm": 0.3427928686141968, |
| "learning_rate": 0.0005005569472764346, |
| "loss": 3.4035, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.326131267835304, |
| "grad_norm": 0.3363751173019409, |
| "learning_rate": 0.0005003821730265074, |
| "loss": 3.4096, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.340690699435093, |
| "grad_norm": 0.30898264050483704, |
| "learning_rate": 0.0005002073987765802, |
| "loss": 3.4041, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.355250131034884, |
| "grad_norm": 0.3334648907184601, |
| "learning_rate": 0.000500032624526653, |
| "loss": 3.4122, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.369809562634675, |
| "grad_norm": 0.3632184565067291, |
| "learning_rate": 0.0004998578502767259, |
| "loss": 3.3975, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.384368994234466, |
| "grad_norm": 0.3695220649242401, |
| "learning_rate": 0.0004996830760267987, |
| "loss": 3.4077, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.398928425834255, |
| "grad_norm": 0.3448977470397949, |
| "learning_rate": 0.0004995083017768715, |
| "loss": 3.4034, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.413487857434045, |
| "grad_norm": 0.3269871771335602, |
| "learning_rate": 0.0004993335275269444, |
| "loss": 3.4123, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.428047289033836, |
| "grad_norm": 0.34690749645233154, |
| "learning_rate": 0.0004991587532770171, |
| "loss": 3.4035, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.442606720633627, |
| "grad_norm": 0.31924426555633545, |
| "learning_rate": 0.00049898397902709, |
| "loss": 3.4033, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.442606720633627, |
| "eval_accuracy": 0.36657313659368906, |
| "eval_loss": 3.5842514038085938, |
| "eval_runtime": 54.0313, |
| "eval_samples_per_second": 308.025, |
| "eval_steps_per_second": 19.267, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.457166152233416, |
| "grad_norm": 0.32659900188446045, |
| "learning_rate": 0.0004988092047771628, |
| "loss": 3.4083, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.471725583833207, |
| "grad_norm": 0.32884863018989563, |
| "learning_rate": 0.0004986344305272356, |
| "loss": 3.4058, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.486285015432998, |
| "grad_norm": 0.32945168018341064, |
| "learning_rate": 0.0004984596562773085, |
| "loss": 3.4181, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.500844447032788, |
| "grad_norm": 0.3260224163532257, |
| "learning_rate": 0.0004982848820273812, |
| "loss": 3.4164, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.515403878632577, |
| "grad_norm": 0.30971044301986694, |
| "learning_rate": 0.0004981101077774541, |
| "loss": 3.4222, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.529963310232368, |
| "grad_norm": 0.3370177447795868, |
| "learning_rate": 0.0004979353335275269, |
| "loss": 3.4189, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.544522741832159, |
| "grad_norm": 0.3051028847694397, |
| "learning_rate": 0.0004977605592775997, |
| "loss": 3.4381, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.55908217343195, |
| "grad_norm": 0.31183165311813354, |
| "learning_rate": 0.0004975857850276726, |
| "loss": 3.4182, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.573641605031739, |
| "grad_norm": 0.35490667819976807, |
| "learning_rate": 0.0004974110107777454, |
| "loss": 3.4251, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.58820103663153, |
| "grad_norm": 0.32052773237228394, |
| "learning_rate": 0.0004972362365278182, |
| "loss": 3.4291, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.60276046823132, |
| "grad_norm": 0.3103652596473694, |
| "learning_rate": 0.000497061462277891, |
| "loss": 3.4086, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.617319899831111, |
| "grad_norm": 0.31185802817344666, |
| "learning_rate": 0.0004968866880279638, |
| "loss": 3.4274, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.6318793314309, |
| "grad_norm": 0.33494362235069275, |
| "learning_rate": 0.0004967119137780367, |
| "loss": 3.4227, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.646438763030691, |
| "grad_norm": 0.3317851424217224, |
| "learning_rate": 0.0004965371395281095, |
| "loss": 3.414, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.660998194630482, |
| "grad_norm": 0.3123399317264557, |
| "learning_rate": 0.0004963623652781822, |
| "loss": 3.4224, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.675557626230272, |
| "grad_norm": 0.3217580020427704, |
| "learning_rate": 0.0004961875910282551, |
| "loss": 3.417, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.690117057830061, |
| "grad_norm": 0.320300430059433, |
| "learning_rate": 0.0004960128167783279, |
| "loss": 3.4167, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.704676489429852, |
| "grad_norm": 0.3256986141204834, |
| "learning_rate": 0.0004958380425284008, |
| "loss": 3.439, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.719235921029643, |
| "grad_norm": 0.3188437819480896, |
| "learning_rate": 0.0004956632682784736, |
| "loss": 3.4193, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.733795352629434, |
| "grad_norm": 0.33221110701560974, |
| "learning_rate": 0.0004954884940285464, |
| "loss": 3.4251, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.733795352629434, |
| "eval_accuracy": 0.36723513338059416, |
| "eval_loss": 3.573622941970825, |
| "eval_runtime": 54.2175, |
| "eval_samples_per_second": 306.967, |
| "eval_steps_per_second": 19.2, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.748354784229225, |
| "grad_norm": 0.34907636046409607, |
| "learning_rate": 0.0004953137197786192, |
| "loss": 3.4265, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.762914215829014, |
| "grad_norm": 0.31726714968681335, |
| "learning_rate": 0.000495138945528692, |
| "loss": 3.4318, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.777473647428804, |
| "grad_norm": 0.3318343758583069, |
| "learning_rate": 0.0004949641712787649, |
| "loss": 3.4329, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.792033079028595, |
| "grad_norm": 0.32355913519859314, |
| "learning_rate": 0.0004947893970288377, |
| "loss": 3.4301, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.806592510628384, |
| "grad_norm": 0.3216176927089691, |
| "learning_rate": 0.0004946146227789105, |
| "loss": 3.4259, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.821151942228175, |
| "grad_norm": 0.3203097879886627, |
| "learning_rate": 0.0004944398485289834, |
| "loss": 3.4274, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.835711373827966, |
| "grad_norm": 0.3079201579093933, |
| "learning_rate": 0.0004942650742790561, |
| "loss": 3.4354, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.850270805427757, |
| "grad_norm": 0.3074656128883362, |
| "learning_rate": 0.000494090300029129, |
| "loss": 3.4371, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.864830237027547, |
| "grad_norm": 0.34187525510787964, |
| "learning_rate": 0.0004939155257792018, |
| "loss": 3.442, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.879389668627336, |
| "grad_norm": 0.33804062008857727, |
| "learning_rate": 0.0004937407515292746, |
| "loss": 3.4379, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.893949100227127, |
| "grad_norm": 0.3283509314060211, |
| "learning_rate": 0.0004935659772793475, |
| "loss": 3.4292, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.908508531826918, |
| "grad_norm": 0.3360782563686371, |
| "learning_rate": 0.0004933912030294202, |
| "loss": 3.43, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.923067963426707, |
| "grad_norm": 0.32649967074394226, |
| "learning_rate": 0.0004932164287794931, |
| "loss": 3.4253, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.937627395026498, |
| "grad_norm": 0.31838634610176086, |
| "learning_rate": 0.000493041654529566, |
| "loss": 3.4488, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.952186826626289, |
| "grad_norm": 0.3074035346508026, |
| "learning_rate": 0.0004928668802796388, |
| "loss": 3.4389, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.96674625822608, |
| "grad_norm": 0.34510338306427, |
| "learning_rate": 0.0004926921060297116, |
| "loss": 3.4359, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.98130568982587, |
| "grad_norm": 0.32787054777145386, |
| "learning_rate": 0.0004925173317797845, |
| "loss": 3.438, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.995865121425659, |
| "grad_norm": 0.32717031240463257, |
| "learning_rate": 0.0004923425575298572, |
| "loss": 3.4371, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.010191602119853, |
| "grad_norm": 0.3534397780895233, |
| "learning_rate": 0.0004921677832799301, |
| "loss": 3.366, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.024751033719644, |
| "grad_norm": 0.32879695296287537, |
| "learning_rate": 0.0004919930090300029, |
| "loss": 3.3199, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.024751033719644, |
| "eval_accuracy": 0.3674407874445901, |
| "eval_loss": 3.5794942378997803, |
| "eval_runtime": 53.9638, |
| "eval_samples_per_second": 308.41, |
| "eval_steps_per_second": 19.291, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.039310465319433, |
| "grad_norm": 0.32832634449005127, |
| "learning_rate": 0.0004918182347800757, |
| "loss": 3.3192, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.053869896919224, |
| "grad_norm": 0.3287530839443207, |
| "learning_rate": 0.0004916434605301486, |
| "loss": 3.3388, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.068429328519015, |
| "grad_norm": 0.32364922761917114, |
| "learning_rate": 0.0004914686862802213, |
| "loss": 3.3306, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.082988760118806, |
| "grad_norm": 0.343106210231781, |
| "learning_rate": 0.0004912939120302941, |
| "loss": 3.33, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.097548191718595, |
| "grad_norm": 0.31469976902008057, |
| "learning_rate": 0.000491119137780367, |
| "loss": 3.3493, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.112107623318385, |
| "grad_norm": 0.36989399790763855, |
| "learning_rate": 0.0004909443635304398, |
| "loss": 3.3455, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.126667054918176, |
| "grad_norm": 0.3417125344276428, |
| "learning_rate": 0.0004907695892805127, |
| "loss": 3.3545, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.141226486517967, |
| "grad_norm": 0.3421690762042999, |
| "learning_rate": 0.0004905948150305855, |
| "loss": 3.3503, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.155785918117756, |
| "grad_norm": 0.34142985939979553, |
| "learning_rate": 0.0004904200407806582, |
| "loss": 3.3663, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.170345349717547, |
| "grad_norm": 0.3186779320240021, |
| "learning_rate": 0.0004902452665307311, |
| "loss": 3.342, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.184904781317337, |
| "grad_norm": 0.35970360040664673, |
| "learning_rate": 0.0004900704922808039, |
| "loss": 3.3642, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.199464212917128, |
| "grad_norm": 0.33403223752975464, |
| "learning_rate": 0.0004898957180308768, |
| "loss": 3.3581, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.214023644516917, |
| "grad_norm": 0.3198859691619873, |
| "learning_rate": 0.0004897209437809496, |
| "loss": 3.3645, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.228583076116708, |
| "grad_norm": 0.3827151656150818, |
| "learning_rate": 0.0004895461695310223, |
| "loss": 3.3694, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.243142507716499, |
| "grad_norm": 0.34455496072769165, |
| "learning_rate": 0.0004893713952810952, |
| "loss": 3.3653, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.25770193931629, |
| "grad_norm": 0.3326873183250427, |
| "learning_rate": 0.000489196621031168, |
| "loss": 3.3595, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.272261370916079, |
| "grad_norm": 0.32842305302619934, |
| "learning_rate": 0.0004890218467812409, |
| "loss": 3.3899, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.28682080251587, |
| "grad_norm": 0.32804909348487854, |
| "learning_rate": 0.0004888470725313137, |
| "loss": 3.3667, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.30138023411566, |
| "grad_norm": 0.3439745604991913, |
| "learning_rate": 0.0004886722982813865, |
| "loss": 3.3901, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.315939665715451, |
| "grad_norm": 0.3488461375236511, |
| "learning_rate": 0.0004884975240314593, |
| "loss": 3.3801, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.315939665715451, |
| "eval_accuracy": 0.3670910932271379, |
| "eval_loss": 3.579740047454834, |
| "eval_runtime": 53.9933, |
| "eval_samples_per_second": 308.242, |
| "eval_steps_per_second": 19.28, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.33049909731524, |
| "grad_norm": 0.3165011703968048, |
| "learning_rate": 0.0004883227497815321, |
| "loss": 3.3729, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.34505852891503, |
| "grad_norm": 0.32998713850975037, |
| "learning_rate": 0.00048814797553160496, |
| "loss": 3.3726, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.359617960514822, |
| "grad_norm": 0.3431642949581146, |
| "learning_rate": 0.0004879732012816778, |
| "loss": 3.3722, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.374177392114612, |
| "grad_norm": 0.3487151265144348, |
| "learning_rate": 0.0004877984270317506, |
| "loss": 3.3829, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.388736823714403, |
| "grad_norm": 0.3365775942802429, |
| "learning_rate": 0.0004876236527818234, |
| "loss": 3.3844, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.403296255314192, |
| "grad_norm": 0.32179173827171326, |
| "learning_rate": 0.00048744887853189624, |
| "loss": 3.3815, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.417855686913983, |
| "grad_norm": 0.3473745584487915, |
| "learning_rate": 0.00048727410428196907, |
| "loss": 3.3759, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.432415118513774, |
| "grad_norm": 0.31671932339668274, |
| "learning_rate": 0.0004870993300320419, |
| "loss": 3.386, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.446974550113563, |
| "grad_norm": 0.31829264760017395, |
| "learning_rate": 0.00048692455578211474, |
| "loss": 3.3817, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.461533981713353, |
| "grad_norm": 0.3237382769584656, |
| "learning_rate": 0.0004867497815321875, |
| "loss": 3.3909, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.476093413313144, |
| "grad_norm": 0.31413301825523376, |
| "learning_rate": 0.00048657500728226035, |
| "loss": 3.3873, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.490652844912935, |
| "grad_norm": 0.3735544681549072, |
| "learning_rate": 0.0004864002330323332, |
| "loss": 3.3888, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.505212276512726, |
| "grad_norm": 0.33301472663879395, |
| "learning_rate": 0.000486225458782406, |
| "loss": 3.388, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.519771708112515, |
| "grad_norm": 0.34808629751205444, |
| "learning_rate": 0.0004860506845324788, |
| "loss": 3.379, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.534331139712306, |
| "grad_norm": 0.3324873447418213, |
| "learning_rate": 0.0004858759102825516, |
| "loss": 3.3959, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.548890571312096, |
| "grad_norm": 0.34592559933662415, |
| "learning_rate": 0.00048570113603262446, |
| "loss": 3.3929, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.563450002911885, |
| "grad_norm": 0.35991552472114563, |
| "learning_rate": 0.0004855263617826973, |
| "loss": 3.3922, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.578009434511676, |
| "grad_norm": 0.3375621438026428, |
| "learning_rate": 0.0004853515875327701, |
| "loss": 3.3983, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.592568866111467, |
| "grad_norm": 0.34628742933273315, |
| "learning_rate": 0.0004851768132828429, |
| "loss": 3.391, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.607128297711258, |
| "grad_norm": 0.3223342001438141, |
| "learning_rate": 0.00048500203903291574, |
| "loss": 3.3891, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.607128297711258, |
| "eval_accuracy": 0.3680227096645534, |
| "eval_loss": 3.569427251815796, |
| "eval_runtime": 54.1344, |
| "eval_samples_per_second": 307.438, |
| "eval_steps_per_second": 19.23, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.621687729311049, |
| "grad_norm": 0.3431673049926758, |
| "learning_rate": 0.00048482726478298857, |
| "loss": 3.3903, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.636247160910838, |
| "grad_norm": 0.3508543372154236, |
| "learning_rate": 0.0004846524905330614, |
| "loss": 3.3924, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.650806592510628, |
| "grad_norm": 0.329703688621521, |
| "learning_rate": 0.00048447771628313424, |
| "loss": 3.3948, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.66536602411042, |
| "grad_norm": 0.33297428488731384, |
| "learning_rate": 0.0004843029420332071, |
| "loss": 3.3903, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.67992545571021, |
| "grad_norm": 0.32107412815093994, |
| "learning_rate": 0.0004841281677832799, |
| "loss": 3.4097, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.694484887309999, |
| "grad_norm": 0.35037678480148315, |
| "learning_rate": 0.00048395339353335273, |
| "loss": 3.3981, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.70904431890979, |
| "grad_norm": 0.3188696801662445, |
| "learning_rate": 0.00048377861928342557, |
| "loss": 3.4078, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.72360375050958, |
| "grad_norm": 0.3539755046367645, |
| "learning_rate": 0.0004836038450334984, |
| "loss": 3.4027, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.738163182109371, |
| "grad_norm": 0.35060805082321167, |
| "learning_rate": 0.0004834290707835712, |
| "loss": 3.3889, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.75272261370916, |
| "grad_norm": 0.3305850327014923, |
| "learning_rate": 0.000483254296533644, |
| "loss": 3.4183, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.767282045308951, |
| "grad_norm": 0.3297503590583801, |
| "learning_rate": 0.00048307952228371685, |
| "loss": 3.4122, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.781841476908742, |
| "grad_norm": 0.3465321362018585, |
| "learning_rate": 0.0004829047480337897, |
| "loss": 3.4138, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.796400908508533, |
| "grad_norm": 0.3402300775051117, |
| "learning_rate": 0.0004827299737838625, |
| "loss": 3.4017, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.810960340108322, |
| "grad_norm": 0.3542657196521759, |
| "learning_rate": 0.0004825551995339353, |
| "loss": 3.4158, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.825519771708112, |
| "grad_norm": 0.32605618238449097, |
| "learning_rate": 0.0004823804252840081, |
| "loss": 3.4154, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.840079203307903, |
| "grad_norm": 0.37203866243362427, |
| "learning_rate": 0.00048220565103408096, |
| "loss": 3.4068, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.854638634907694, |
| "grad_norm": 0.3198862373828888, |
| "learning_rate": 0.0004820308767841538, |
| "loss": 3.4161, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.869198066507483, |
| "grad_norm": 0.316527783870697, |
| "learning_rate": 0.0004818561025342266, |
| "loss": 3.4128, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.883757498107274, |
| "grad_norm": 0.32887765765190125, |
| "learning_rate": 0.0004816813282842994, |
| "loss": 3.4107, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.898316929707065, |
| "grad_norm": 0.3417325019836426, |
| "learning_rate": 0.00048150655403437223, |
| "loss": 3.4101, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.898316929707065, |
| "eval_accuracy": 0.36854583998514684, |
| "eval_loss": 3.5642430782318115, |
| "eval_runtime": 54.1838, |
| "eval_samples_per_second": 307.158, |
| "eval_steps_per_second": 19.212, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.912876361306855, |
| "grad_norm": 0.33323341608047485, |
| "learning_rate": 0.00048133177978444507, |
| "loss": 3.4087, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.927435792906644, |
| "grad_norm": 0.32190296053886414, |
| "learning_rate": 0.0004811570055345179, |
| "loss": 3.4097, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.941995224506435, |
| "grad_norm": 0.33463558554649353, |
| "learning_rate": 0.0004809822312845907, |
| "loss": 3.413, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.956554656106226, |
| "grad_norm": 0.35033613443374634, |
| "learning_rate": 0.0004808074570346635, |
| "loss": 3.4105, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.971114087706017, |
| "grad_norm": 0.3425426185131073, |
| "learning_rate": 0.00048063268278473634, |
| "loss": 3.4089, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.985673519305806, |
| "grad_norm": 0.33646196126937866, |
| "learning_rate": 0.0004804579085348092, |
| "loss": 3.4147, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.8161603808403015, |
| "learning_rate": 0.000480283134284882, |
| "loss": 3.428, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.01455943159979, |
| "grad_norm": 0.34390079975128174, |
| "learning_rate": 0.0004801083600349548, |
| "loss": 3.3054, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.029118863199582, |
| "grad_norm": 0.3613048493862152, |
| "learning_rate": 0.0004799335857850276, |
| "loss": 3.2991, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.04367829479937, |
| "grad_norm": 0.3359822928905487, |
| "learning_rate": 0.00047975881153510046, |
| "loss": 3.2928, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.058237726399161, |
| "grad_norm": 0.32760336995124817, |
| "learning_rate": 0.0004795840372851733, |
| "loss": 3.3078, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.072797157998952, |
| "grad_norm": 0.333717942237854, |
| "learning_rate": 0.00047940926303524607, |
| "loss": 3.3163, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.087356589598743, |
| "grad_norm": 0.33068737387657166, |
| "learning_rate": 0.0004792344887853189, |
| "loss": 3.3126, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.101916021198532, |
| "grad_norm": 0.3351806402206421, |
| "learning_rate": 0.00047905971453539173, |
| "loss": 3.3233, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.116475452798323, |
| "grad_norm": 0.378257691860199, |
| "learning_rate": 0.00047888494028546457, |
| "loss": 3.3302, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.131034884398114, |
| "grad_norm": 0.3484111428260803, |
| "learning_rate": 0.0004787101660355374, |
| "loss": 3.3303, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.145594315997904, |
| "grad_norm": 0.3338722884654999, |
| "learning_rate": 0.0004785353917856102, |
| "loss": 3.3367, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.160153747597693, |
| "grad_norm": 0.33523687720298767, |
| "learning_rate": 0.000478360617535683, |
| "loss": 3.3373, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.174713179197484, |
| "grad_norm": 0.3656017780303955, |
| "learning_rate": 0.00047818584328575584, |
| "loss": 3.3416, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.189272610797275, |
| "grad_norm": 0.3645973801612854, |
| "learning_rate": 0.0004780110690358287, |
| "loss": 3.3363, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.189272610797275, |
| "eval_accuracy": 0.36812065696890367, |
| "eval_loss": 3.574761152267456, |
| "eval_runtime": 53.8933, |
| "eval_samples_per_second": 308.814, |
| "eval_steps_per_second": 19.316, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.203832042397066, |
| "grad_norm": 0.33940547704696655, |
| "learning_rate": 0.0004778362947859015, |
| "loss": 3.3454, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.218391473996855, |
| "grad_norm": 0.33723345398902893, |
| "learning_rate": 0.0004776615205359743, |
| "loss": 3.3449, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.232950905596645, |
| "grad_norm": 0.3461247980594635, |
| "learning_rate": 0.0004774867462860471, |
| "loss": 3.3466, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.247510337196436, |
| "grad_norm": 0.3415777385234833, |
| "learning_rate": 0.00047731197203611995, |
| "loss": 3.3445, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.262069768796227, |
| "grad_norm": 0.3251374363899231, |
| "learning_rate": 0.0004771371977861928, |
| "loss": 3.338, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.276629200396016, |
| "grad_norm": 0.3444693684577942, |
| "learning_rate": 0.00047696242353626557, |
| "loss": 3.3484, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.291188631995807, |
| "grad_norm": 0.3499116897583008, |
| "learning_rate": 0.0004767876492863384, |
| "loss": 3.3543, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.305748063595598, |
| "grad_norm": 0.3512921631336212, |
| "learning_rate": 0.00047661287503641123, |
| "loss": 3.3542, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.320307495195388, |
| "grad_norm": 0.33651211857795715, |
| "learning_rate": 0.00047643810078648407, |
| "loss": 3.3529, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.334866926795177, |
| "grad_norm": 0.35819199681282043, |
| "learning_rate": 0.0004762633265365569, |
| "loss": 3.3563, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.349426358394968, |
| "grad_norm": 0.33055511116981506, |
| "learning_rate": 0.0004760885522866297, |
| "loss": 3.3583, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.363985789994759, |
| "grad_norm": 0.3145069181919098, |
| "learning_rate": 0.0004759137780367025, |
| "loss": 3.3484, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.37854522159455, |
| "grad_norm": 0.3309759497642517, |
| "learning_rate": 0.00047573900378677534, |
| "loss": 3.3466, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.393104653194339, |
| "grad_norm": 0.3606186509132385, |
| "learning_rate": 0.00047556422953684823, |
| "loss": 3.3622, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.40766408479413, |
| "grad_norm": 0.3602750897407532, |
| "learning_rate": 0.00047538945528692106, |
| "loss": 3.3511, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.42222351639392, |
| "grad_norm": 0.3379049003124237, |
| "learning_rate": 0.0004752146810369939, |
| "loss": 3.3676, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.436782947993711, |
| "grad_norm": 0.3288438320159912, |
| "learning_rate": 0.0004750399067870667, |
| "loss": 3.3672, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.4513423795935, |
| "grad_norm": 0.3229992985725403, |
| "learning_rate": 0.0004748651325371395, |
| "loss": 3.3697, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.46590181119329, |
| "grad_norm": 0.3085167407989502, |
| "learning_rate": 0.00047469035828721234, |
| "loss": 3.3625, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.480461242793082, |
| "grad_norm": 0.3422580361366272, |
| "learning_rate": 0.0004745155840372852, |
| "loss": 3.3778, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.480461242793082, |
| "eval_accuracy": 0.36864954889563534, |
| "eval_loss": 3.5684714317321777, |
| "eval_runtime": 53.892, |
| "eval_samples_per_second": 308.821, |
| "eval_steps_per_second": 19.316, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.495020674392872, |
| "grad_norm": 0.34788641333580017, |
| "learning_rate": 0.00047434080978735795, |
| "loss": 3.371, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.509580105992661, |
| "grad_norm": 0.3174489140510559, |
| "learning_rate": 0.0004741660355374308, |
| "loss": 3.3691, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.524139537592452, |
| "grad_norm": 0.3501240313053131, |
| "learning_rate": 0.0004739912612875036, |
| "loss": 3.367, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.538698969192243, |
| "grad_norm": 0.3440835773944855, |
| "learning_rate": 0.00047381648703757645, |
| "loss": 3.3787, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.553258400792034, |
| "grad_norm": 0.3479546308517456, |
| "learning_rate": 0.0004736417127876493, |
| "loss": 3.3656, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.567817832391823, |
| "grad_norm": 0.3386428654193878, |
| "learning_rate": 0.00047346693853772206, |
| "loss": 3.3647, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.582377263991614, |
| "grad_norm": 0.3308039903640747, |
| "learning_rate": 0.0004732921642877949, |
| "loss": 3.3657, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.596936695591404, |
| "grad_norm": 0.3481200635433197, |
| "learning_rate": 0.00047311739003786773, |
| "loss": 3.3783, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.611496127191195, |
| "grad_norm": 0.34154003858566284, |
| "learning_rate": 0.00047294261578794056, |
| "loss": 3.3755, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.626055558790984, |
| "grad_norm": 0.33981460332870483, |
| "learning_rate": 0.0004727678415380134, |
| "loss": 3.3738, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.640614990390775, |
| "grad_norm": 0.34777724742889404, |
| "learning_rate": 0.0004725930672880862, |
| "loss": 3.3713, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.655174421990566, |
| "grad_norm": 0.3389792740345001, |
| "learning_rate": 0.000472418293038159, |
| "loss": 3.3861, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.669733853590357, |
| "grad_norm": 0.32127055525779724, |
| "learning_rate": 0.00047224351878823184, |
| "loss": 3.3809, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.684293285190146, |
| "grad_norm": 0.3361356854438782, |
| "learning_rate": 0.0004720687445383047, |
| "loss": 3.3902, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.698852716789936, |
| "grad_norm": 0.3294849395751953, |
| "learning_rate": 0.00047189397028837745, |
| "loss": 3.3857, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.713412148389727, |
| "grad_norm": 0.3620792031288147, |
| "learning_rate": 0.0004717191960384503, |
| "loss": 3.3883, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.727971579989518, |
| "grad_norm": 0.31317734718322754, |
| "learning_rate": 0.0004715444217885231, |
| "loss": 3.3763, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.742531011589307, |
| "grad_norm": 0.33570465445518494, |
| "learning_rate": 0.00047136964753859595, |
| "loss": 3.3725, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.757090443189098, |
| "grad_norm": 0.32414549589157104, |
| "learning_rate": 0.0004711948732886688, |
| "loss": 3.3789, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.771649874788888, |
| "grad_norm": 0.3350679874420166, |
| "learning_rate": 0.00047102009903874156, |
| "loss": 3.3797, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.771649874788888, |
| "eval_accuracy": 0.36933459210709346, |
| "eval_loss": 3.5581462383270264, |
| "eval_runtime": 53.7954, |
| "eval_samples_per_second": 309.376, |
| "eval_steps_per_second": 19.351, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.78620930638868, |
| "grad_norm": 0.36362600326538086, |
| "learning_rate": 0.0004708453247888144, |
| "loss": 3.3874, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.800768737988468, |
| "grad_norm": 0.3308012783527374, |
| "learning_rate": 0.00047067055053888723, |
| "loss": 3.3835, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.815328169588259, |
| "grad_norm": 0.35319286584854126, |
| "learning_rate": 0.00047049577628896006, |
| "loss": 3.3845, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.82988760118805, |
| "grad_norm": 0.34049636125564575, |
| "learning_rate": 0.0004703210020390329, |
| "loss": 3.3913, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.84444703278784, |
| "grad_norm": 0.32271477580070496, |
| "learning_rate": 0.0004701462277891057, |
| "loss": 3.3867, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.85900646438763, |
| "grad_norm": 0.3542429804801941, |
| "learning_rate": 0.0004699714535391785, |
| "loss": 3.3767, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.87356589598742, |
| "grad_norm": 0.3518877327442169, |
| "learning_rate": 0.00046979667928925134, |
| "loss": 3.3977, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.888125327587211, |
| "grad_norm": 0.3210870921611786, |
| "learning_rate": 0.0004696219050393242, |
| "loss": 3.385, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.902684759187002, |
| "grad_norm": 0.36343225836753845, |
| "learning_rate": 0.00046944713078939695, |
| "loss": 3.3819, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.917244190786791, |
| "grad_norm": 0.34078526496887207, |
| "learning_rate": 0.0004692723565394698, |
| "loss": 3.3817, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.931803622386582, |
| "grad_norm": 0.32085925340652466, |
| "learning_rate": 0.0004690975822895426, |
| "loss": 3.3841, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.946363053986373, |
| "grad_norm": 0.3279073238372803, |
| "learning_rate": 0.00046892280803961545, |
| "loss": 3.3845, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.960922485586163, |
| "grad_norm": 0.35262706875801086, |
| "learning_rate": 0.0004687480337896883, |
| "loss": 3.3934, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.975481917185952, |
| "grad_norm": 0.36066585779190063, |
| "learning_rate": 0.00046857325953976106, |
| "loss": 3.3778, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.990041348785743, |
| "grad_norm": 0.35543933510780334, |
| "learning_rate": 0.0004683984852898339, |
| "loss": 3.3916, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.004367829479937, |
| "grad_norm": 0.33726900815963745, |
| "learning_rate": 0.00046822371103990673, |
| "loss": 3.365, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.018927261079728, |
| "grad_norm": 0.38896262645721436, |
| "learning_rate": 0.00046804893678997956, |
| "loss": 3.2857, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.033486692679517, |
| "grad_norm": 0.32145750522613525, |
| "learning_rate": 0.00046787416254005234, |
| "loss": 3.2777, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.048046124279308, |
| "grad_norm": 0.3301202058792114, |
| "learning_rate": 0.0004676993882901252, |
| "loss": 3.2926, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.062605555879099, |
| "grad_norm": 0.3512705862522125, |
| "learning_rate": 0.000467524614040198, |
| "loss": 3.2924, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.062605555879099, |
| "eval_accuracy": 0.369106126786142, |
| "eval_loss": 3.569038152694702, |
| "eval_runtime": 54.0656, |
| "eval_samples_per_second": 307.83, |
| "eval_steps_per_second": 19.254, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.07716498747889, |
| "grad_norm": 0.35407987236976624, |
| "learning_rate": 0.00046734983979027084, |
| "loss": 3.2867, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.091724419078679, |
| "grad_norm": 0.33891621232032776, |
| "learning_rate": 0.00046717506554034367, |
| "loss": 3.2913, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.10628385067847, |
| "grad_norm": 0.3495442569255829, |
| "learning_rate": 0.00046700029129041645, |
| "loss": 3.304, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.12084328227826, |
| "grad_norm": 0.33784544467926025, |
| "learning_rate": 0.0004668255170404893, |
| "loss": 3.314, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.135402713878051, |
| "grad_norm": 0.3771565556526184, |
| "learning_rate": 0.00046665074279056217, |
| "loss": 3.3101, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.14996214547784, |
| "grad_norm": 0.3389083743095398, |
| "learning_rate": 0.000466475968540635, |
| "loss": 3.3027, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.16452157707763, |
| "grad_norm": 0.34060126543045044, |
| "learning_rate": 0.00046630119429070784, |
| "loss": 3.3005, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.179081008677421, |
| "grad_norm": 0.33946189284324646, |
| "learning_rate": 0.00046612642004078067, |
| "loss": 3.3136, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.193640440277212, |
| "grad_norm": 0.33616408705711365, |
| "learning_rate": 0.00046595164579085345, |
| "loss": 3.3179, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.208199871877001, |
| "grad_norm": 0.35129034519195557, |
| "learning_rate": 0.0004657768715409263, |
| "loss": 3.3181, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.222759303476792, |
| "grad_norm": 0.37194448709487915, |
| "learning_rate": 0.0004656020972909991, |
| "loss": 3.3245, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.237318735076583, |
| "grad_norm": 0.3498443067073822, |
| "learning_rate": 0.00046542732304107195, |
| "loss": 3.3199, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.251878166676374, |
| "grad_norm": 0.33491912484169006, |
| "learning_rate": 0.0004652525487911447, |
| "loss": 3.3188, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.266437598276163, |
| "grad_norm": 0.4009822905063629, |
| "learning_rate": 0.00046507777454121756, |
| "loss": 3.3187, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.280997029875953, |
| "grad_norm": 0.3393429219722748, |
| "learning_rate": 0.0004649030002912904, |
| "loss": 3.3263, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.295556461475744, |
| "grad_norm": 0.345793217420578, |
| "learning_rate": 0.0004647282260413632, |
| "loss": 3.3232, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.310115893075535, |
| "grad_norm": 0.3599303066730499, |
| "learning_rate": 0.00046455345179143606, |
| "loss": 3.3334, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.324675324675324, |
| "grad_norm": 0.32492297887802124, |
| "learning_rate": 0.00046437867754150884, |
| "loss": 3.3376, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.339234756275115, |
| "grad_norm": 0.3709739148616791, |
| "learning_rate": 0.00046420390329158167, |
| "loss": 3.3403, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.353794187874906, |
| "grad_norm": 0.3648548424243927, |
| "learning_rate": 0.0004640291290416545, |
| "loss": 3.3359, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.353794187874906, |
| "eval_accuracy": 0.3693122511853329, |
| "eval_loss": 3.5671463012695312, |
| "eval_runtime": 54.06, |
| "eval_samples_per_second": 307.861, |
| "eval_steps_per_second": 19.256, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.368353619474696, |
| "grad_norm": 0.3588879108428955, |
| "learning_rate": 0.00046385435479172734, |
| "loss": 3.3408, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.382913051074485, |
| "grad_norm": 0.3257652223110199, |
| "learning_rate": 0.00046367958054180017, |
| "loss": 3.3512, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.397472482674276, |
| "grad_norm": 0.3484971225261688, |
| "learning_rate": 0.00046350480629187295, |
| "loss": 3.3376, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.412031914274067, |
| "grad_norm": 0.3337056636810303, |
| "learning_rate": 0.0004633300320419458, |
| "loss": 3.3432, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.426591345873858, |
| "grad_norm": 0.3409966230392456, |
| "learning_rate": 0.0004631552577920186, |
| "loss": 3.3473, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.441150777473647, |
| "grad_norm": 0.3414597511291504, |
| "learning_rate": 0.00046298048354209145, |
| "loss": 3.3393, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.455710209073438, |
| "grad_norm": 0.35023412108421326, |
| "learning_rate": 0.0004628057092921642, |
| "loss": 3.342, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.470269640673228, |
| "grad_norm": 0.3243994414806366, |
| "learning_rate": 0.00046263093504223706, |
| "loss": 3.3399, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.484829072273019, |
| "grad_norm": 0.32542338967323303, |
| "learning_rate": 0.0004624561607923099, |
| "loss": 3.3426, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.499388503872808, |
| "grad_norm": 0.38432076573371887, |
| "learning_rate": 0.0004622813865423827, |
| "loss": 3.34, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.513947935472599, |
| "grad_norm": 0.36201798915863037, |
| "learning_rate": 0.00046210661229245556, |
| "loss": 3.3318, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.52850736707239, |
| "grad_norm": 0.3721318542957306, |
| "learning_rate": 0.00046193183804252834, |
| "loss": 3.334, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.54306679867218, |
| "grad_norm": 0.33296698331832886, |
| "learning_rate": 0.00046175706379260117, |
| "loss": 3.3538, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.55762623027197, |
| "grad_norm": 0.3993701636791229, |
| "learning_rate": 0.000461582289542674, |
| "loss": 3.3529, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.57218566187176, |
| "grad_norm": 0.3540388345718384, |
| "learning_rate": 0.00046140751529274684, |
| "loss": 3.3533, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.586745093471551, |
| "grad_norm": 0.3348828852176666, |
| "learning_rate": 0.00046123274104281967, |
| "loss": 3.3525, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.601304525071342, |
| "grad_norm": 0.35919663310050964, |
| "learning_rate": 0.00046105796679289245, |
| "loss": 3.3464, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.61586395667113, |
| "grad_norm": 0.3327691853046417, |
| "learning_rate": 0.0004608831925429653, |
| "loss": 3.3645, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.630423388270922, |
| "grad_norm": 0.35042068362236023, |
| "learning_rate": 0.0004607084182930381, |
| "loss": 3.3599, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.644982819870712, |
| "grad_norm": 0.32780370116233826, |
| "learning_rate": 0.00046053364404311095, |
| "loss": 3.3503, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.644982819870712, |
| "eval_accuracy": 0.3698941734052962, |
| "eval_loss": 3.5564932823181152, |
| "eval_runtime": 53.915, |
| "eval_samples_per_second": 308.69, |
| "eval_steps_per_second": 19.308, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.659542251470503, |
| "grad_norm": 0.358482301235199, |
| "learning_rate": 0.0004603588697931837, |
| "loss": 3.3541, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.674101683070292, |
| "grad_norm": 0.33564674854278564, |
| "learning_rate": 0.00046018409554325656, |
| "loss": 3.3658, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.688661114670083, |
| "grad_norm": 0.3993256390094757, |
| "learning_rate": 0.0004600093212933294, |
| "loss": 3.3569, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.703220546269874, |
| "grad_norm": 0.34387677907943726, |
| "learning_rate": 0.0004598345470434022, |
| "loss": 3.3745, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.717779977869665, |
| "grad_norm": 0.3534398376941681, |
| "learning_rate": 0.00045965977279347506, |
| "loss": 3.357, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.732339409469454, |
| "grad_norm": 0.35269302129745483, |
| "learning_rate": 0.00045948499854354784, |
| "loss": 3.3703, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.746898841069244, |
| "grad_norm": 0.3300093114376068, |
| "learning_rate": 0.00045931022429362067, |
| "loss": 3.3671, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.761458272669035, |
| "grad_norm": 0.33715489506721497, |
| "learning_rate": 0.0004591354500436935, |
| "loss": 3.3716, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.776017704268826, |
| "grad_norm": 0.34225428104400635, |
| "learning_rate": 0.00045896067579376634, |
| "loss": 3.3641, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.790577135868615, |
| "grad_norm": 0.3427339494228363, |
| "learning_rate": 0.0004587859015438391, |
| "loss": 3.3754, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.805136567468406, |
| "grad_norm": 0.3600717782974243, |
| "learning_rate": 0.00045861112729391195, |
| "loss": 3.3607, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.819695999068196, |
| "grad_norm": 0.34151703119277954, |
| "learning_rate": 0.0004584363530439848, |
| "loss": 3.3648, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.834255430667987, |
| "grad_norm": 0.3521574139595032, |
| "learning_rate": 0.0004582615787940576, |
| "loss": 3.3645, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.848814862267776, |
| "grad_norm": 0.3284907937049866, |
| "learning_rate": 0.00045808680454413045, |
| "loss": 3.3637, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.863374293867567, |
| "grad_norm": 0.35242322087287903, |
| "learning_rate": 0.00045791203029420333, |
| "loss": 3.3755, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.877933725467358, |
| "grad_norm": 0.33842575550079346, |
| "learning_rate": 0.0004577372560442761, |
| "loss": 3.3726, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.892493157067149, |
| "grad_norm": 0.354082316160202, |
| "learning_rate": 0.00045756248179434894, |
| "loss": 3.3701, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.90705258866694, |
| "grad_norm": 0.32441848516464233, |
| "learning_rate": 0.0004573877075444218, |
| "loss": 3.3719, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.921612020266728, |
| "grad_norm": 0.3555139899253845, |
| "learning_rate": 0.0004572129332944946, |
| "loss": 3.3542, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93617145186652, |
| "grad_norm": 0.3518112897872925, |
| "learning_rate": 0.00045703815904456744, |
| "loss": 3.3774, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93617145186652, |
| "eval_accuracy": 0.3705694571614589, |
| "eval_loss": 3.5490357875823975, |
| "eval_runtime": 53.9812, |
| "eval_samples_per_second": 308.311, |
| "eval_steps_per_second": 19.284, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.95073088346631, |
| "grad_norm": 0.33228781819343567, |
| "learning_rate": 0.0004568633847946402, |
| "loss": 3.3794, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.965290315066099, |
| "grad_norm": 0.3327503204345703, |
| "learning_rate": 0.00045668861054471306, |
| "loss": 3.391, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.97984974666589, |
| "grad_norm": 0.32881519198417664, |
| "learning_rate": 0.0004565138362947859, |
| "loss": 3.3747, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.99440917826568, |
| "grad_norm": 0.3413378894329071, |
| "learning_rate": 0.0004563390620448587, |
| "loss": 3.3809, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.008735658959875, |
| "grad_norm": 0.35909968614578247, |
| "learning_rate": 0.0004561642877949315, |
| "loss": 3.3022, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.023295090559664, |
| "grad_norm": 0.3564230799674988, |
| "learning_rate": 0.00045598951354500433, |
| "loss": 3.267, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.037854522159455, |
| "grad_norm": 0.3933013677597046, |
| "learning_rate": 0.00045581473929507717, |
| "loss": 3.2561, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.052413953759245, |
| "grad_norm": 0.32905763387680054, |
| "learning_rate": 0.00045563996504515, |
| "loss": 3.2752, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.066973385359036, |
| "grad_norm": 0.37047168612480164, |
| "learning_rate": 0.00045546519079522283, |
| "loss": 3.2786, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.081532816958825, |
| "grad_norm": 0.34073546528816223, |
| "learning_rate": 0.0004552904165452956, |
| "loss": 3.2767, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.096092248558616, |
| "grad_norm": 0.3816109597682953, |
| "learning_rate": 0.00045511564229536844, |
| "loss": 3.2799, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.110651680158407, |
| "grad_norm": 0.3862399160861969, |
| "learning_rate": 0.0004549408680454413, |
| "loss": 3.2798, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.125211111758198, |
| "grad_norm": 0.352953165769577, |
| "learning_rate": 0.0004547660937955141, |
| "loss": 3.2707, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.139770543357987, |
| "grad_norm": 0.3749876320362091, |
| "learning_rate": 0.00045459131954558694, |
| "loss": 3.2888, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.154329974957777, |
| "grad_norm": 0.3750421106815338, |
| "learning_rate": 0.0004544165452956597, |
| "loss": 3.2932, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.168889406557568, |
| "grad_norm": 0.32412317395210266, |
| "learning_rate": 0.00045424177104573255, |
| "loss": 3.289, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.183448838157359, |
| "grad_norm": 0.3581444025039673, |
| "learning_rate": 0.0004540669967958054, |
| "loss": 3.2892, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.198008269757148, |
| "grad_norm": 0.3392166495323181, |
| "learning_rate": 0.0004538922225458782, |
| "loss": 3.2978, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.212567701356939, |
| "grad_norm": 0.36249926686286926, |
| "learning_rate": 0.000453717448295951, |
| "loss": 3.3048, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.22712713295673, |
| "grad_norm": 0.32953113317489624, |
| "learning_rate": 0.00045354267404602383, |
| "loss": 3.3027, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.22712713295673, |
| "eval_accuracy": 0.3696067986011761, |
| "eval_loss": 3.5610363483428955, |
| "eval_runtime": 53.9811, |
| "eval_samples_per_second": 308.312, |
| "eval_steps_per_second": 19.285, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.24168656455652, |
| "grad_norm": 0.3559216260910034, |
| "learning_rate": 0.00045336789979609667, |
| "loss": 3.3001, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.25624599615631, |
| "grad_norm": 0.3325844705104828, |
| "learning_rate": 0.0004531931255461695, |
| "loss": 3.2938, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.2708054277561, |
| "grad_norm": 0.3980172574520111, |
| "learning_rate": 0.00045301835129624233, |
| "loss": 3.3119, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.28536485935589, |
| "grad_norm": 0.32405316829681396, |
| "learning_rate": 0.0004528435770463151, |
| "loss": 3.3034, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.299924290955682, |
| "grad_norm": 0.34660008549690247, |
| "learning_rate": 0.00045266880279638794, |
| "loss": 3.3109, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.31448372255547, |
| "grad_norm": 0.3319573998451233, |
| "learning_rate": 0.0004524940285464608, |
| "loss": 3.3025, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.329043154155261, |
| "grad_norm": 0.38782092928886414, |
| "learning_rate": 0.0004523192542965336, |
| "loss": 3.3051, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.343602585755052, |
| "grad_norm": 0.3703951835632324, |
| "learning_rate": 0.00045214448004660644, |
| "loss": 3.3112, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.358162017354843, |
| "grad_norm": 0.3678281903266907, |
| "learning_rate": 0.0004519697057966792, |
| "loss": 3.3175, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.372721448954632, |
| "grad_norm": 0.34847941994667053, |
| "learning_rate": 0.00045179493154675205, |
| "loss": 3.3125, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.387280880554423, |
| "grad_norm": 0.39477667212486267, |
| "learning_rate": 0.0004516201572968249, |
| "loss": 3.323, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.401840312154214, |
| "grad_norm": 0.3333967328071594, |
| "learning_rate": 0.0004514453830468977, |
| "loss": 3.3301, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.416399743754004, |
| "grad_norm": 0.3731665015220642, |
| "learning_rate": 0.0004512706087969705, |
| "loss": 3.3191, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.430959175353793, |
| "grad_norm": 0.3655344247817993, |
| "learning_rate": 0.00045109583454704333, |
| "loss": 3.3232, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.445518606953584, |
| "grad_norm": 0.36312124133110046, |
| "learning_rate": 0.00045092106029711616, |
| "loss": 3.3276, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.460078038553375, |
| "grad_norm": 0.3322156071662903, |
| "learning_rate": 0.000450746286047189, |
| "loss": 3.3234, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.474637470153166, |
| "grad_norm": 0.35798975825309753, |
| "learning_rate": 0.00045057151179726183, |
| "loss": 3.3379, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.489196901752955, |
| "grad_norm": 0.34931838512420654, |
| "learning_rate": 0.0004503967375473346, |
| "loss": 3.3324, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.503756333352746, |
| "grad_norm": 0.3425159752368927, |
| "learning_rate": 0.00045022196329740744, |
| "loss": 3.3265, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.518315764952536, |
| "grad_norm": 0.3326928913593292, |
| "learning_rate": 0.0004500471890474803, |
| "loss": 3.3386, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.518315764952536, |
| "eval_accuracy": 0.37016050070944184, |
| "eval_loss": 3.5549118518829346, |
| "eval_runtime": 53.8689, |
| "eval_samples_per_second": 308.954, |
| "eval_steps_per_second": 19.325, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.532875196552327, |
| "grad_norm": 0.39922279119491577, |
| "learning_rate": 0.0004498724147975531, |
| "loss": 3.3232, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.547434628152118, |
| "grad_norm": 0.3702709376811981, |
| "learning_rate": 0.00044969764054762594, |
| "loss": 3.3329, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.561994059751907, |
| "grad_norm": 0.33652347326278687, |
| "learning_rate": 0.0004495228662976987, |
| "loss": 3.3552, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.576553491351698, |
| "grad_norm": 0.35900095105171204, |
| "learning_rate": 0.00044934809204777155, |
| "loss": 3.3367, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.591112922951488, |
| "grad_norm": 0.3671649694442749, |
| "learning_rate": 0.0004491733177978444, |
| "loss": 3.3357, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.605672354551277, |
| "grad_norm": 0.3420347273349762, |
| "learning_rate": 0.0004489985435479173, |
| "loss": 3.3338, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.620231786151068, |
| "grad_norm": 0.3353428244590759, |
| "learning_rate": 0.0004488237692979901, |
| "loss": 3.3421, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.634791217750859, |
| "grad_norm": 0.3602941036224365, |
| "learning_rate": 0.0004486489950480629, |
| "loss": 3.3351, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.64935064935065, |
| "grad_norm": 0.3476759195327759, |
| "learning_rate": 0.0004484742207981357, |
| "loss": 3.3393, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.66391008095044, |
| "grad_norm": 0.3405091464519501, |
| "learning_rate": 0.00044829944654820855, |
| "loss": 3.3518, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.67846951255023, |
| "grad_norm": 0.36805057525634766, |
| "learning_rate": 0.0004481246722982814, |
| "loss": 3.3508, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.69302894415002, |
| "grad_norm": 0.3548225164413452, |
| "learning_rate": 0.0004479498980483542, |
| "loss": 3.3468, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.707588375749811, |
| "grad_norm": 0.3556191325187683, |
| "learning_rate": 0.000447775123798427, |
| "loss": 3.342, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.7221478073496, |
| "grad_norm": 0.3459774851799011, |
| "learning_rate": 0.00044760034954849983, |
| "loss": 3.3501, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.736707238949391, |
| "grad_norm": 0.32603490352630615, |
| "learning_rate": 0.00044742557529857266, |
| "loss": 3.3453, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.751266670549182, |
| "grad_norm": 0.3487292230129242, |
| "learning_rate": 0.0004472508010486455, |
| "loss": 3.34, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.765826102148973, |
| "grad_norm": 0.34649938344955444, |
| "learning_rate": 0.0004470760267987183, |
| "loss": 3.3549, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.780385533748763, |
| "grad_norm": 0.33671310544013977, |
| "learning_rate": 0.0004469012525487911, |
| "loss": 3.3551, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.794944965348552, |
| "grad_norm": 0.3770430088043213, |
| "learning_rate": 0.00044672647829886394, |
| "loss": 3.3422, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.809504396948343, |
| "grad_norm": 0.3594439625740051, |
| "learning_rate": 0.00044655170404893677, |
| "loss": 3.3472, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.809504396948343, |
| "eval_accuracy": 0.37095325068054563, |
| "eval_loss": 3.5484530925750732, |
| "eval_runtime": 53.9621, |
| "eval_samples_per_second": 308.42, |
| "eval_steps_per_second": 19.291, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.824063828548134, |
| "grad_norm": 0.3437642455101013, |
| "learning_rate": 0.0004463769297990096, |
| "loss": 3.355, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.838623260147925, |
| "grad_norm": 0.35489070415496826, |
| "learning_rate": 0.0004462021555490824, |
| "loss": 3.3439, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.853182691747714, |
| "grad_norm": 0.36889269948005676, |
| "learning_rate": 0.0004460273812991552, |
| "loss": 3.3646, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.867742123347504, |
| "grad_norm": 0.3426932692527771, |
| "learning_rate": 0.00044585260704922805, |
| "loss": 3.3556, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.882301554947295, |
| "grad_norm": 0.32859864830970764, |
| "learning_rate": 0.0004456778327993009, |
| "loss": 3.3724, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.896860986547086, |
| "grad_norm": 0.3485097289085388, |
| "learning_rate": 0.0004455030585493737, |
| "loss": 3.3643, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.911420418146875, |
| "grad_norm": 0.3588804006576538, |
| "learning_rate": 0.0004453282842994465, |
| "loss": 3.3393, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.925979849746666, |
| "grad_norm": 0.33998623490333557, |
| "learning_rate": 0.00044515351004951933, |
| "loss": 3.3432, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.940539281346457, |
| "grad_norm": 0.35905739665031433, |
| "learning_rate": 0.00044497873579959216, |
| "loss": 3.3529, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.955098712946247, |
| "grad_norm": 0.3558342456817627, |
| "learning_rate": 0.000444803961549665, |
| "loss": 3.354, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.969658144546036, |
| "grad_norm": 0.3453124761581421, |
| "learning_rate": 0.0004446291872997378, |
| "loss": 3.3444, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.984217576145827, |
| "grad_norm": 0.3470025360584259, |
| "learning_rate": 0.0004444544130498106, |
| "loss": 3.3561, |
| "step": 44600 |
| }, |
| { |
| "epoch": 12.998777007745618, |
| "grad_norm": 0.3806982636451721, |
| "learning_rate": 0.00044427963879988344, |
| "loss": 3.3558, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.01310348843981, |
| "grad_norm": 0.40065309405326843, |
| "learning_rate": 0.00044410486454995627, |
| "loss": 3.2516, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.027662920039601, |
| "grad_norm": 0.3657712936401367, |
| "learning_rate": 0.0004439300903000291, |
| "loss": 3.2454, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.042222351639392, |
| "grad_norm": 0.3365810513496399, |
| "learning_rate": 0.0004437553160501019, |
| "loss": 3.2503, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.056781783239183, |
| "grad_norm": 0.36263352632522583, |
| "learning_rate": 0.0004435805418001747, |
| "loss": 3.2604, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.071341214838972, |
| "grad_norm": 0.349775105714798, |
| "learning_rate": 0.00044340576755024755, |
| "loss": 3.2649, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.085900646438763, |
| "grad_norm": 0.34163522720336914, |
| "learning_rate": 0.0004432309933003204, |
| "loss": 3.2584, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.100460078038553, |
| "grad_norm": 0.34596383571624756, |
| "learning_rate": 0.0004430562190503932, |
| "loss": 3.2543, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.100460078038553, |
| "eval_accuracy": 0.37036556685444405, |
| "eval_loss": 3.5629189014434814, |
| "eval_runtime": 53.9034, |
| "eval_samples_per_second": 308.756, |
| "eval_steps_per_second": 19.312, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.115019509638344, |
| "grad_norm": 0.34392884373664856, |
| "learning_rate": 0.000442881444800466, |
| "loss": 3.2718, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.129578941238133, |
| "grad_norm": 0.3442322015762329, |
| "learning_rate": 0.00044270667055053883, |
| "loss": 3.2637, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.144138372837924, |
| "grad_norm": 0.3357892632484436, |
| "learning_rate": 0.00044253189630061166, |
| "loss": 3.2682, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.158697804437715, |
| "grad_norm": 0.34123629331588745, |
| "learning_rate": 0.0004423571220506845, |
| "loss": 3.2682, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.173257236037506, |
| "grad_norm": 0.369365930557251, |
| "learning_rate": 0.00044218234780075727, |
| "loss": 3.2775, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.187816667637296, |
| "grad_norm": 0.35826003551483154, |
| "learning_rate": 0.0004420075735508301, |
| "loss": 3.2802, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.202376099237085, |
| "grad_norm": 0.3576175570487976, |
| "learning_rate": 0.00044183279930090294, |
| "loss": 3.2813, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.216935530836876, |
| "grad_norm": 0.36184608936309814, |
| "learning_rate": 0.00044165802505097577, |
| "loss": 3.2794, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.231494962436667, |
| "grad_norm": 0.34625130891799927, |
| "learning_rate": 0.0004414832508010486, |
| "loss": 3.2818, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.246054394036458, |
| "grad_norm": 0.35068416595458984, |
| "learning_rate": 0.0004413084765511214, |
| "loss": 3.2791, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.260613825636247, |
| "grad_norm": 0.37707656621932983, |
| "learning_rate": 0.0004411337023011942, |
| "loss": 3.2962, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.275173257236037, |
| "grad_norm": 0.3505012094974518, |
| "learning_rate": 0.00044095892805126705, |
| "loss": 3.2936, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.289732688835828, |
| "grad_norm": 0.38045358657836914, |
| "learning_rate": 0.0004407841538013399, |
| "loss": 3.2977, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.304292120435619, |
| "grad_norm": 0.3746790289878845, |
| "learning_rate": 0.0004406093795514127, |
| "loss": 3.3078, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.318851552035408, |
| "grad_norm": 0.3702433705329895, |
| "learning_rate": 0.0004404346053014855, |
| "loss": 3.2968, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.333410983635199, |
| "grad_norm": 0.36914801597595215, |
| "learning_rate": 0.0004402598310515584, |
| "loss": 3.2999, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.34797041523499, |
| "grad_norm": 0.38449791073799133, |
| "learning_rate": 0.0004400850568016312, |
| "loss": 3.306, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.36252984683478, |
| "grad_norm": 0.3450029790401459, |
| "learning_rate": 0.00043991028255170405, |
| "loss": 3.2906, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.37708927843457, |
| "grad_norm": 0.3418305516242981, |
| "learning_rate": 0.0004397355083017769, |
| "loss": 3.3096, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.39164871003436, |
| "grad_norm": 0.3726021945476532, |
| "learning_rate": 0.00043956073405184966, |
| "loss": 3.2975, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.39164871003436, |
| "eval_accuracy": 0.3704893825945171, |
| "eval_loss": 3.562157392501831, |
| "eval_runtime": 54.1441, |
| "eval_samples_per_second": 307.383, |
| "eval_steps_per_second": 19.226, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.406208141634151, |
| "grad_norm": 0.38728371262550354, |
| "learning_rate": 0.0004393859598019225, |
| "loss": 3.3054, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.420767573233942, |
| "grad_norm": 0.33851444721221924, |
| "learning_rate": 0.0004392111855519953, |
| "loss": 3.3035, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.43532700483373, |
| "grad_norm": 0.3990343511104584, |
| "learning_rate": 0.00043903641130206816, |
| "loss": 3.3157, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.449886436433522, |
| "grad_norm": 0.33781442046165466, |
| "learning_rate": 0.000438861637052141, |
| "loss": 3.3149, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.464445868033312, |
| "grad_norm": 0.3608773350715637, |
| "learning_rate": 0.00043868686280221377, |
| "loss": 3.3063, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.479005299633103, |
| "grad_norm": 0.35064592957496643, |
| "learning_rate": 0.0004385120885522866, |
| "loss": 3.3086, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.493564731232892, |
| "grad_norm": 0.36569035053253174, |
| "learning_rate": 0.00043833731430235944, |
| "loss": 3.3099, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.508124162832683, |
| "grad_norm": 0.3421012759208679, |
| "learning_rate": 0.00043816254005243227, |
| "loss": 3.3263, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.522683594432474, |
| "grad_norm": 0.3369988203048706, |
| "learning_rate": 0.0004379877658025051, |
| "loss": 3.3287, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.537243026032264, |
| "grad_norm": 0.36069077253341675, |
| "learning_rate": 0.0004378129915525779, |
| "loss": 3.3208, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.551802457632053, |
| "grad_norm": 0.3256238102912903, |
| "learning_rate": 0.0004376382173026507, |
| "loss": 3.3102, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.566361889231844, |
| "grad_norm": 0.3456880748271942, |
| "learning_rate": 0.00043746344305272355, |
| "loss": 3.3189, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.580921320831635, |
| "grad_norm": 0.3881891071796417, |
| "learning_rate": 0.0004372886688027964, |
| "loss": 3.3217, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.595480752431426, |
| "grad_norm": 0.34957295656204224, |
| "learning_rate": 0.00043711389455286916, |
| "loss": 3.3285, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.610040184031215, |
| "grad_norm": 0.34814003109931946, |
| "learning_rate": 0.000436939120302942, |
| "loss": 3.3207, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.624599615631006, |
| "grad_norm": 0.32534271478652954, |
| "learning_rate": 0.0004367643460530148, |
| "loss": 3.3224, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.639159047230796, |
| "grad_norm": 0.36359184980392456, |
| "learning_rate": 0.00043658957180308766, |
| "loss": 3.3316, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.653718478830587, |
| "grad_norm": 0.3385885953903198, |
| "learning_rate": 0.0004364147975531605, |
| "loss": 3.3246, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.668277910430376, |
| "grad_norm": 0.3716272711753845, |
| "learning_rate": 0.00043624002330323327, |
| "loss": 3.3287, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.682837342030167, |
| "grad_norm": 0.3404206931591034, |
| "learning_rate": 0.0004360652490533061, |
| "loss": 3.3194, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.682837342030167, |
| "eval_accuracy": 0.3711830094232832, |
| "eval_loss": 3.5483055114746094, |
| "eval_runtime": 53.9157, |
| "eval_samples_per_second": 308.685, |
| "eval_steps_per_second": 19.308, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.697396773629958, |
| "grad_norm": 0.32848721742630005, |
| "learning_rate": 0.00043589047480337893, |
| "loss": 3.3275, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.711956205229749, |
| "grad_norm": 0.37124887108802795, |
| "learning_rate": 0.00043571570055345177, |
| "loss": 3.3219, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.726515636829538, |
| "grad_norm": 0.35257938504219055, |
| "learning_rate": 0.00043554092630352455, |
| "loss": 3.3272, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.741075068429328, |
| "grad_norm": 0.34476980566978455, |
| "learning_rate": 0.0004353661520535974, |
| "loss": 3.3159, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.75563450002912, |
| "grad_norm": 0.3280114233493805, |
| "learning_rate": 0.0004351913778036702, |
| "loss": 3.3301, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.77019393162891, |
| "grad_norm": 0.34827882051467896, |
| "learning_rate": 0.00043501660355374305, |
| "loss": 3.3353, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.784753363228699, |
| "grad_norm": 0.34743741154670715, |
| "learning_rate": 0.0004348418293038159, |
| "loss": 3.3392, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.79931279482849, |
| "grad_norm": 0.3636046051979065, |
| "learning_rate": 0.00043466705505388866, |
| "loss": 3.3299, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.81387222642828, |
| "grad_norm": 0.3252928555011749, |
| "learning_rate": 0.0004344922808039615, |
| "loss": 3.3242, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.828431658028071, |
| "grad_norm": 0.3536205291748047, |
| "learning_rate": 0.0004343175065540343, |
| "loss": 3.3361, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.84299108962786, |
| "grad_norm": 0.3566448390483856, |
| "learning_rate": 0.00043414273230410716, |
| "loss": 3.3314, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.857550521227651, |
| "grad_norm": 0.35187003016471863, |
| "learning_rate": 0.00043396795805418, |
| "loss": 3.3405, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.872109952827442, |
| "grad_norm": 0.3437885642051697, |
| "learning_rate": 0.00043379318380425277, |
| "loss": 3.3419, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.886669384427233, |
| "grad_norm": 0.3523525595664978, |
| "learning_rate": 0.0004336184095543256, |
| "loss": 3.3432, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.901228816027022, |
| "grad_norm": 0.3286518454551697, |
| "learning_rate": 0.00043344363530439843, |
| "loss": 3.3352, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.915788247626812, |
| "grad_norm": 0.3584185540676117, |
| "learning_rate": 0.00043326886105447127, |
| "loss": 3.3352, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.930347679226603, |
| "grad_norm": 0.3556600511074066, |
| "learning_rate": 0.00043309408680454405, |
| "loss": 3.3239, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.944907110826394, |
| "grad_norm": 0.3555219769477844, |
| "learning_rate": 0.0004329193125546169, |
| "loss": 3.3366, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.959466542426183, |
| "grad_norm": 0.33693447709083557, |
| "learning_rate": 0.0004327445383046897, |
| "loss": 3.3353, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.974025974025974, |
| "grad_norm": 0.32497769594192505, |
| "learning_rate": 0.00043256976405476255, |
| "loss": 3.3425, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.974025974025974, |
| "eval_accuracy": 0.37171954429693294, |
| "eval_loss": 3.5404067039489746, |
| "eval_runtime": 53.9107, |
| "eval_samples_per_second": 308.714, |
| "eval_steps_per_second": 19.31, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.988585405625765, |
| "grad_norm": 0.35186105966567993, |
| "learning_rate": 0.0004323949898048354, |
| "loss": 3.3426, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.002911886319959, |
| "grad_norm": 0.36423787474632263, |
| "learning_rate": 0.00043222021555490816, |
| "loss": 3.3166, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.017471317919748, |
| "grad_norm": 0.32975277304649353, |
| "learning_rate": 0.000432045441304981, |
| "loss": 3.2352, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.032030749519539, |
| "grad_norm": 0.3414633572101593, |
| "learning_rate": 0.0004318706670550538, |
| "loss": 3.2437, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.04659018111933, |
| "grad_norm": 0.3397580087184906, |
| "learning_rate": 0.00043169589280512666, |
| "loss": 3.2371, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.06114961271912, |
| "grad_norm": 0.37154221534729004, |
| "learning_rate": 0.0004315211185551995, |
| "loss": 3.2452, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.07570904431891, |
| "grad_norm": 0.37078914046287537, |
| "learning_rate": 0.0004313463443052724, |
| "loss": 3.2346, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.0902684759187, |
| "grad_norm": 0.36830246448516846, |
| "learning_rate": 0.00043117157005534515, |
| "loss": 3.2361, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.10482790751849, |
| "grad_norm": 0.33482542634010315, |
| "learning_rate": 0.000430996795805418, |
| "loss": 3.2619, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.119387339118282, |
| "grad_norm": 0.36272552609443665, |
| "learning_rate": 0.0004308220215554908, |
| "loss": 3.2486, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.13394677071807, |
| "grad_norm": 0.3442781865596771, |
| "learning_rate": 0.00043064724730556365, |
| "loss": 3.2551, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.148506202317861, |
| "grad_norm": 0.37955281138420105, |
| "learning_rate": 0.00043047247305563643, |
| "loss": 3.2665, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.163065633917652, |
| "grad_norm": 0.3380308449268341, |
| "learning_rate": 0.00043029769880570927, |
| "loss": 3.255, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.177625065517443, |
| "grad_norm": 0.36957699060440063, |
| "learning_rate": 0.0004301229245557821, |
| "loss": 3.2574, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.192184497117232, |
| "grad_norm": 0.3417280614376068, |
| "learning_rate": 0.00042994815030585493, |
| "loss": 3.2674, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.206743928717023, |
| "grad_norm": 0.3839270770549774, |
| "learning_rate": 0.00042977337605592776, |
| "loss": 3.2759, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.221303360316814, |
| "grad_norm": 0.3585422933101654, |
| "learning_rate": 0.00042959860180600054, |
| "loss": 3.2727, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.235862791916604, |
| "grad_norm": 0.33868226408958435, |
| "learning_rate": 0.0004294238275560734, |
| "loss": 3.2727, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.250422223516393, |
| "grad_norm": 0.3586726188659668, |
| "learning_rate": 0.0004292490533061462, |
| "loss": 3.2764, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.264981655116184, |
| "grad_norm": 0.3632107973098755, |
| "learning_rate": 0.00042907427905621904, |
| "loss": 3.2744, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.264981655116184, |
| "eval_accuracy": 0.37127966330584733, |
| "eval_loss": 3.5580098628997803, |
| "eval_runtime": 54.0909, |
| "eval_samples_per_second": 307.686, |
| "eval_steps_per_second": 19.245, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.279541086715975, |
| "grad_norm": 0.36595603823661804, |
| "learning_rate": 0.0004288995048062919, |
| "loss": 3.2797, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.294100518315766, |
| "grad_norm": 0.3803919553756714, |
| "learning_rate": 0.00042872473055636465, |
| "loss": 3.274, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.308659949915555, |
| "grad_norm": 0.36094287037849426, |
| "learning_rate": 0.0004285499563064375, |
| "loss": 3.2691, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.323219381515345, |
| "grad_norm": 0.3780951201915741, |
| "learning_rate": 0.0004283751820565103, |
| "loss": 3.2754, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.337778813115136, |
| "grad_norm": 0.38594871759414673, |
| "learning_rate": 0.00042820040780658315, |
| "loss": 3.2913, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.352338244714927, |
| "grad_norm": 0.362032026052475, |
| "learning_rate": 0.00042802563355665593, |
| "loss": 3.2821, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.366897676314716, |
| "grad_norm": 0.3559892177581787, |
| "learning_rate": 0.00042785085930672876, |
| "loss": 3.2865, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.381457107914507, |
| "grad_norm": 0.3505326211452484, |
| "learning_rate": 0.0004276760850568016, |
| "loss": 3.2994, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.396016539514298, |
| "grad_norm": 0.3532223403453827, |
| "learning_rate": 0.00042750131080687443, |
| "loss": 3.279, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.410575971114088, |
| "grad_norm": 0.3655863404273987, |
| "learning_rate": 0.00042732653655694726, |
| "loss": 3.284, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.425135402713877, |
| "grad_norm": 0.3348618745803833, |
| "learning_rate": 0.00042715176230702004, |
| "loss": 3.297, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.439694834313668, |
| "grad_norm": 0.3709411919116974, |
| "learning_rate": 0.0004269769880570929, |
| "loss": 3.3011, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.454254265913459, |
| "grad_norm": 0.3769262433052063, |
| "learning_rate": 0.0004268022138071657, |
| "loss": 3.2907, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.46881369751325, |
| "grad_norm": 0.3713088631629944, |
| "learning_rate": 0.00042662743955723854, |
| "loss": 3.2944, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.483373129113039, |
| "grad_norm": 0.3499068319797516, |
| "learning_rate": 0.0004264526653073114, |
| "loss": 3.2959, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.49793256071283, |
| "grad_norm": 0.3687066435813904, |
| "learning_rate": 0.00042627789105738415, |
| "loss": 3.2915, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.51249199231262, |
| "grad_norm": 0.38556233048439026, |
| "learning_rate": 0.000426103116807457, |
| "loss": 3.2961, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.527051423912411, |
| "grad_norm": 0.3844015896320343, |
| "learning_rate": 0.0004259283425575298, |
| "loss": 3.2934, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.5416108555122, |
| "grad_norm": 0.36595389246940613, |
| "learning_rate": 0.00042575356830760265, |
| "loss": 3.3006, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.556170287111991, |
| "grad_norm": 0.3643812835216522, |
| "learning_rate": 0.00042557879405767543, |
| "loss": 3.3138, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.556170287111991, |
| "eval_accuracy": 0.37170237706231696, |
| "eval_loss": 3.5477921962738037, |
| "eval_runtime": 53.7554, |
| "eval_samples_per_second": 309.606, |
| "eval_steps_per_second": 19.366, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.570729718711782, |
| "grad_norm": 0.3703961968421936, |
| "learning_rate": 0.00042540401980774826, |
| "loss": 3.2976, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.585289150311572, |
| "grad_norm": 0.34066158533096313, |
| "learning_rate": 0.0004252292455578211, |
| "loss": 3.3139, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.599848581911361, |
| "grad_norm": 0.3463113009929657, |
| "learning_rate": 0.00042505447130789393, |
| "loss": 3.3023, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.614408013511152, |
| "grad_norm": 0.3557461202144623, |
| "learning_rate": 0.00042487969705796676, |
| "loss": 3.2978, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.628967445110943, |
| "grad_norm": 0.36186400055885315, |
| "learning_rate": 0.00042470492280803954, |
| "loss": 3.3095, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.643526876710734, |
| "grad_norm": 0.33865877985954285, |
| "learning_rate": 0.0004245301485581124, |
| "loss": 3.3047, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.658086308310523, |
| "grad_norm": 0.331206351518631, |
| "learning_rate": 0.0004243553743081852, |
| "loss": 3.3206, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.672645739910314, |
| "grad_norm": 0.3369050621986389, |
| "learning_rate": 0.00042418060005825804, |
| "loss": 3.3068, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.687205171510104, |
| "grad_norm": 0.35863232612609863, |
| "learning_rate": 0.0004240058258083308, |
| "loss": 3.3162, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.701764603109895, |
| "grad_norm": 0.3646875023841858, |
| "learning_rate": 0.00042383105155840365, |
| "loss": 3.3166, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.716324034709684, |
| "grad_norm": 0.35427382588386536, |
| "learning_rate": 0.0004236562773084765, |
| "loss": 3.3133, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.730883466309475, |
| "grad_norm": 0.3502483665943146, |
| "learning_rate": 0.0004234815030585493, |
| "loss": 3.3116, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.745442897909266, |
| "grad_norm": 0.34835225343704224, |
| "learning_rate": 0.00042330672880862215, |
| "loss": 3.32, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.760002329509057, |
| "grad_norm": 0.35687556862831116, |
| "learning_rate": 0.00042313195455869493, |
| "loss": 3.3089, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.774561761108846, |
| "grad_norm": 0.3292189836502075, |
| "learning_rate": 0.00042295718030876776, |
| "loss": 3.3108, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.789121192708636, |
| "grad_norm": 0.3459155559539795, |
| "learning_rate": 0.0004227824060588406, |
| "loss": 3.3135, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.803680624308427, |
| "grad_norm": 0.3650501072406769, |
| "learning_rate": 0.0004226076318089135, |
| "loss": 3.3255, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.818240055908218, |
| "grad_norm": 0.3758101463317871, |
| "learning_rate": 0.0004224328575589863, |
| "loss": 3.3176, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.832799487508007, |
| "grad_norm": 0.39660805463790894, |
| "learning_rate": 0.00042225808330905915, |
| "loss": 3.325, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.847358919107798, |
| "grad_norm": 0.34520870447158813, |
| "learning_rate": 0.00042208330905913193, |
| "loss": 3.32, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.847358919107798, |
| "eval_accuracy": 0.37202161707589554, |
| "eval_loss": 3.540180206298828, |
| "eval_runtime": 54.0283, |
| "eval_samples_per_second": 308.042, |
| "eval_steps_per_second": 19.268, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.861918350707588, |
| "grad_norm": 0.3515894412994385, |
| "learning_rate": 0.00042190853480920476, |
| "loss": 3.3174, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.87647778230738, |
| "grad_norm": 0.3637034296989441, |
| "learning_rate": 0.0004217337605592776, |
| "loss": 3.335, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.891037213907168, |
| "grad_norm": 0.3399968445301056, |
| "learning_rate": 0.00042155898630935043, |
| "loss": 3.3259, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.905596645506959, |
| "grad_norm": 0.3715548515319824, |
| "learning_rate": 0.0004213842120594232, |
| "loss": 3.3153, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.92015607710675, |
| "grad_norm": 0.3517013192176819, |
| "learning_rate": 0.00042120943780949604, |
| "loss": 3.3186, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.93471550870654, |
| "grad_norm": 0.3491024971008301, |
| "learning_rate": 0.00042103466355956887, |
| "loss": 3.3276, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.94927494030633, |
| "grad_norm": 0.349324107170105, |
| "learning_rate": 0.0004208598893096417, |
| "loss": 3.3208, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.96383437190612, |
| "grad_norm": 0.3804023265838623, |
| "learning_rate": 0.00042068511505971454, |
| "loss": 3.3212, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.978393803505911, |
| "grad_norm": 0.3643926978111267, |
| "learning_rate": 0.0004205103408097873, |
| "loss": 3.3238, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.992953235105702, |
| "grad_norm": 0.3704342544078827, |
| "learning_rate": 0.00042033556655986015, |
| "loss": 3.3282, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.007279715799895, |
| "grad_norm": 0.375531405210495, |
| "learning_rate": 0.000420160792309933, |
| "loss": 3.2593, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.021839147399685, |
| "grad_norm": 0.3492209315299988, |
| "learning_rate": 0.0004199860180600058, |
| "loss": 3.2057, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.036398578999476, |
| "grad_norm": 0.39182865619659424, |
| "learning_rate": 0.00041981124381007865, |
| "loss": 3.2147, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.050958010599267, |
| "grad_norm": 0.3726196885108948, |
| "learning_rate": 0.00041963646956015143, |
| "loss": 3.2193, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.065517442199056, |
| "grad_norm": 0.36583876609802246, |
| "learning_rate": 0.00041946169531022426, |
| "loss": 3.2143, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.080076873798847, |
| "grad_norm": 0.35128819942474365, |
| "learning_rate": 0.0004192869210602971, |
| "loss": 3.2313, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.094636305398637, |
| "grad_norm": 0.3431715667247772, |
| "learning_rate": 0.0004191121468103699, |
| "loss": 3.2344, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.109195736998428, |
| "grad_norm": 0.360064834356308, |
| "learning_rate": 0.0004189373725604427, |
| "loss": 3.241, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.123755168598217, |
| "grad_norm": 0.3761180341243744, |
| "learning_rate": 0.00041876259831051554, |
| "loss": 3.2406, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.138314600198008, |
| "grad_norm": 0.3627549409866333, |
| "learning_rate": 0.00041858782406058837, |
| "loss": 3.2496, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.138314600198008, |
| "eval_accuracy": 0.3714227627889137, |
| "eval_loss": 3.557051420211792, |
| "eval_runtime": 54.0838, |
| "eval_samples_per_second": 307.726, |
| "eval_steps_per_second": 19.248, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.152874031797799, |
| "grad_norm": 0.33691975474357605, |
| "learning_rate": 0.0004184130498106612, |
| "loss": 3.2433, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.16743346339759, |
| "grad_norm": 0.33604761958122253, |
| "learning_rate": 0.00041823827556073404, |
| "loss": 3.2478, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.181992894997379, |
| "grad_norm": 0.34679973125457764, |
| "learning_rate": 0.0004180635013108068, |
| "loss": 3.2522, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.19655232659717, |
| "grad_norm": 0.3581389784812927, |
| "learning_rate": 0.00041788872706087965, |
| "loss": 3.2547, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.21111175819696, |
| "grad_norm": 0.33653566241264343, |
| "learning_rate": 0.0004177139528109525, |
| "loss": 3.261, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.225671189796751, |
| "grad_norm": 0.36339202523231506, |
| "learning_rate": 0.0004175391785610253, |
| "loss": 3.2535, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.24023062139654, |
| "grad_norm": 0.3384561240673065, |
| "learning_rate": 0.00041736440431109815, |
| "loss": 3.2516, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.25479005299633, |
| "grad_norm": 0.34658774733543396, |
| "learning_rate": 0.0004171896300611709, |
| "loss": 3.2719, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.269349484596122, |
| "grad_norm": 0.3711901009082794, |
| "learning_rate": 0.00041701485581124376, |
| "loss": 3.2684, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.283908916195912, |
| "grad_norm": 0.3736010193824768, |
| "learning_rate": 0.0004168400815613166, |
| "loss": 3.2595, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.298468347795701, |
| "grad_norm": 0.3565717041492462, |
| "learning_rate": 0.0004166653073113894, |
| "loss": 3.2614, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.313027779395492, |
| "grad_norm": 0.3514735698699951, |
| "learning_rate": 0.0004164905330614622, |
| "loss": 3.2719, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.327587210995283, |
| "grad_norm": 0.3635771870613098, |
| "learning_rate": 0.00041631575881153504, |
| "loss": 3.2727, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.342146642595074, |
| "grad_norm": 0.36101034283638, |
| "learning_rate": 0.00041614098456160787, |
| "loss": 3.2739, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.356706074194863, |
| "grad_norm": 0.3623729646205902, |
| "learning_rate": 0.0004159662103116807, |
| "loss": 3.2742, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.371265505794653, |
| "grad_norm": 0.3742918074131012, |
| "learning_rate": 0.00041579143606175354, |
| "loss": 3.2738, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.385824937394444, |
| "grad_norm": 0.3470838963985443, |
| "learning_rate": 0.0004156166618118263, |
| "loss": 3.2805, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.400384368994235, |
| "grad_norm": 0.3733544647693634, |
| "learning_rate": 0.00041544188756189915, |
| "loss": 3.2814, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.414943800594024, |
| "grad_norm": 0.3658877909183502, |
| "learning_rate": 0.000415267113311972, |
| "loss": 3.2687, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.429503232193815, |
| "grad_norm": 0.36594730615615845, |
| "learning_rate": 0.0004150923390620448, |
| "loss": 3.2803, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.429503232193815, |
| "eval_accuracy": 0.3717013188081283, |
| "eval_loss": 3.549906015396118, |
| "eval_runtime": 53.9513, |
| "eval_samples_per_second": 308.482, |
| "eval_steps_per_second": 19.295, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.444062663793606, |
| "grad_norm": 0.372753381729126, |
| "learning_rate": 0.0004149175648121176, |
| "loss": 3.2817, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.458622095393396, |
| "grad_norm": 0.3828512132167816, |
| "learning_rate": 0.0004147427905621904, |
| "loss": 3.2858, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.473181526993185, |
| "grad_norm": 0.3578071594238281, |
| "learning_rate": 0.00041456801631226326, |
| "loss": 3.2727, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.487740958592976, |
| "grad_norm": 0.39373624324798584, |
| "learning_rate": 0.0004143932420623361, |
| "loss": 3.2806, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.502300390192767, |
| "grad_norm": 0.38140150904655457, |
| "learning_rate": 0.0004142184678124089, |
| "loss": 3.2895, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.516859821792558, |
| "grad_norm": 0.3811562955379486, |
| "learning_rate": 0.0004140436935624817, |
| "loss": 3.2965, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.531419253392347, |
| "grad_norm": 0.3750387132167816, |
| "learning_rate": 0.00041386891931255454, |
| "loss": 3.2872, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.545978684992138, |
| "grad_norm": 0.4013814628124237, |
| "learning_rate": 0.0004136941450626274, |
| "loss": 3.2822, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.560538116591928, |
| "grad_norm": 0.3812587559223175, |
| "learning_rate": 0.00041351937081270026, |
| "loss": 3.3002, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.575097548191719, |
| "grad_norm": 0.37295323610305786, |
| "learning_rate": 0.0004133445965627731, |
| "loss": 3.2983, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.58965697979151, |
| "grad_norm": 0.3801644742488861, |
| "learning_rate": 0.0004131698223128459, |
| "loss": 3.2869, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.604216411391299, |
| "grad_norm": 0.3526298999786377, |
| "learning_rate": 0.0004129950480629187, |
| "loss": 3.2995, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.61877584299109, |
| "grad_norm": 0.3763193190097809, |
| "learning_rate": 0.00041282027381299153, |
| "loss": 3.292, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.63333527459088, |
| "grad_norm": 0.3336646556854248, |
| "learning_rate": 0.00041264549956306437, |
| "loss": 3.2837, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.64789470619067, |
| "grad_norm": 0.3432423174381256, |
| "learning_rate": 0.0004124707253131372, |
| "loss": 3.298, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.66245413779046, |
| "grad_norm": 0.37490835785865784, |
| "learning_rate": 0.00041229595106321, |
| "loss": 3.3064, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.677013569390251, |
| "grad_norm": 0.355726033449173, |
| "learning_rate": 0.0004121211768132828, |
| "loss": 3.2969, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.691573000990042, |
| "grad_norm": 0.3476122319698334, |
| "learning_rate": 0.00041194640256335565, |
| "loss": 3.2986, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.706132432589833, |
| "grad_norm": 0.33737611770629883, |
| "learning_rate": 0.0004117716283134285, |
| "loss": 3.2942, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.720691864189622, |
| "grad_norm": 0.38145211338996887, |
| "learning_rate": 0.0004115968540635013, |
| "loss": 3.2919, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.720691864189622, |
| "eval_accuracy": 0.3721127445199189, |
| "eval_loss": 3.5404856204986572, |
| "eval_runtime": 53.8872, |
| "eval_samples_per_second": 308.849, |
| "eval_steps_per_second": 19.318, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.735251295789412, |
| "grad_norm": 0.42329856753349304, |
| "learning_rate": 0.0004114220798135741, |
| "loss": 3.3079, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.749810727389203, |
| "grad_norm": 0.344635546207428, |
| "learning_rate": 0.0004112473055636469, |
| "loss": 3.3045, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.764370158988992, |
| "grad_norm": 0.38018330931663513, |
| "learning_rate": 0.00041107253131371976, |
| "loss": 3.3042, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.778929590588783, |
| "grad_norm": 0.34092721343040466, |
| "learning_rate": 0.0004108977570637926, |
| "loss": 3.303, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.793489022188574, |
| "grad_norm": 0.3967198133468628, |
| "learning_rate": 0.0004107229828138654, |
| "loss": 3.2946, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.808048453788365, |
| "grad_norm": 0.34989359974861145, |
| "learning_rate": 0.0004105482085639382, |
| "loss": 3.2874, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.822607885388155, |
| "grad_norm": 0.3509521186351776, |
| "learning_rate": 0.00041037343431401103, |
| "loss": 3.301, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.837167316987944, |
| "grad_norm": 0.3767544627189636, |
| "learning_rate": 0.00041019866006408387, |
| "loss": 3.304, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.851726748587735, |
| "grad_norm": 0.3420964777469635, |
| "learning_rate": 0.0004100238858141567, |
| "loss": 3.3212, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.866286180187526, |
| "grad_norm": 0.35542258620262146, |
| "learning_rate": 0.0004098491115642295, |
| "loss": 3.3, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.880845611787315, |
| "grad_norm": 0.34566208720207214, |
| "learning_rate": 0.0004096743373143023, |
| "loss": 3.311, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.895405043387106, |
| "grad_norm": 0.3717935383319855, |
| "learning_rate": 0.00040949956306437514, |
| "loss": 3.3017, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.909964474986896, |
| "grad_norm": 0.35940760374069214, |
| "learning_rate": 0.000409324788814448, |
| "loss": 3.3117, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.924523906586687, |
| "grad_norm": 0.3497525155544281, |
| "learning_rate": 0.0004091500145645208, |
| "loss": 3.2981, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.939083338186478, |
| "grad_norm": 0.35200536251068115, |
| "learning_rate": 0.0004089752403145936, |
| "loss": 3.3011, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.953642769786267, |
| "grad_norm": 0.35767561197280884, |
| "learning_rate": 0.0004088004660646664, |
| "loss": 3.3174, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.968202201386058, |
| "grad_norm": 0.33962079882621765, |
| "learning_rate": 0.00040862569181473926, |
| "loss": 3.3088, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.982761632985849, |
| "grad_norm": 0.33080315589904785, |
| "learning_rate": 0.0004084509175648121, |
| "loss": 3.3117, |
| "step": 54900 |
| }, |
| { |
| "epoch": 15.99732106458564, |
| "grad_norm": 0.35157039761543274, |
| "learning_rate": 0.0004082761433148849, |
| "loss": 3.31, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.011647545279832, |
| "grad_norm": 0.35152769088745117, |
| "learning_rate": 0.0004081013690649577, |
| "loss": 3.2175, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.011647545279832, |
| "eval_accuracy": 0.37205536362613384, |
| "eval_loss": 3.5515835285186768, |
| "eval_runtime": 53.9904, |
| "eval_samples_per_second": 308.259, |
| "eval_steps_per_second": 19.281, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.02620697687962, |
| "grad_norm": 0.34217002987861633, |
| "learning_rate": 0.00040792659481503053, |
| "loss": 3.2079, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.040766408479413, |
| "grad_norm": 0.3347378075122833, |
| "learning_rate": 0.00040775182056510337, |
| "loss": 3.198, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.055325840079202, |
| "grad_norm": 0.38384637236595154, |
| "learning_rate": 0.0004075770463151762, |
| "loss": 3.2018, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.069885271678995, |
| "grad_norm": 0.3489525616168976, |
| "learning_rate": 0.000407402272065249, |
| "loss": 3.2152, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.084444703278784, |
| "grad_norm": 0.4081012010574341, |
| "learning_rate": 0.0004072274978153218, |
| "loss": 3.2085, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.099004134878573, |
| "grad_norm": 0.37120628356933594, |
| "learning_rate": 0.00040705272356539464, |
| "loss": 3.2262, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.113563566478366, |
| "grad_norm": 0.4057721793651581, |
| "learning_rate": 0.0004068779493154675, |
| "loss": 3.2285, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.128122998078155, |
| "grad_norm": 0.35924482345581055, |
| "learning_rate": 0.0004067031750655403, |
| "loss": 3.2267, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.142682429677944, |
| "grad_norm": 0.41697263717651367, |
| "learning_rate": 0.0004065284008156131, |
| "loss": 3.2411, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.157241861277736, |
| "grad_norm": 0.38057592511177063, |
| "learning_rate": 0.0004063536265656859, |
| "loss": 3.226, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.171801292877525, |
| "grad_norm": 0.35883548855781555, |
| "learning_rate": 0.00040617885231575876, |
| "loss": 3.2306, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.186360724477318, |
| "grad_norm": 0.3391270339488983, |
| "learning_rate": 0.0004060040780658316, |
| "loss": 3.2337, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.200920156077107, |
| "grad_norm": 0.3992067873477936, |
| "learning_rate": 0.0004058293038159044, |
| "loss": 3.2355, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.215479587676896, |
| "grad_norm": 0.38337230682373047, |
| "learning_rate": 0.0004056545295659772, |
| "loss": 3.2416, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.23003901927669, |
| "grad_norm": 0.3764645755290985, |
| "learning_rate": 0.00040547975531605003, |
| "loss": 3.2423, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.244598450876477, |
| "grad_norm": 0.36295512318611145, |
| "learning_rate": 0.00040530498106612287, |
| "loss": 3.2453, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.259157882476266, |
| "grad_norm": 0.3418979346752167, |
| "learning_rate": 0.0004051302068161957, |
| "loss": 3.2385, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.27371731407606, |
| "grad_norm": 0.3582913875579834, |
| "learning_rate": 0.0004049554325662686, |
| "loss": 3.2475, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.288276745675848, |
| "grad_norm": 0.37994620203971863, |
| "learning_rate": 0.00040478065831634136, |
| "loss": 3.2515, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.30283617727564, |
| "grad_norm": 0.40661951899528503, |
| "learning_rate": 0.0004046058840664142, |
| "loss": 3.2547, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.30283617727564, |
| "eval_accuracy": 0.372163305553377, |
| "eval_loss": 3.5515215396881104, |
| "eval_runtime": 53.9642, |
| "eval_samples_per_second": 308.408, |
| "eval_steps_per_second": 19.291, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.31739560887543, |
| "grad_norm": 0.3635038435459137, |
| "learning_rate": 0.00040443110981648703, |
| "loss": 3.2546, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.33195504047522, |
| "grad_norm": 0.3598199486732483, |
| "learning_rate": 0.00040425633556655986, |
| "loss": 3.2543, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.34651447207501, |
| "grad_norm": 0.38117557764053345, |
| "learning_rate": 0.0004040815613166327, |
| "loss": 3.2716, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.3610739036748, |
| "grad_norm": 0.37007659673690796, |
| "learning_rate": 0.0004039067870667055, |
| "loss": 3.261, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.375633335274593, |
| "grad_norm": 0.4195094108581543, |
| "learning_rate": 0.0004037320128167783, |
| "loss": 3.2635, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.39019276687438, |
| "grad_norm": 0.38829296827316284, |
| "learning_rate": 0.00040355723856685114, |
| "loss": 3.2553, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.40475219847417, |
| "grad_norm": 0.34940147399902344, |
| "learning_rate": 0.000403382464316924, |
| "loss": 3.2697, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.419311630073963, |
| "grad_norm": 0.39590585231781006, |
| "learning_rate": 0.00040320769006699675, |
| "loss": 3.2618, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.433871061673752, |
| "grad_norm": 0.37665021419525146, |
| "learning_rate": 0.0004030329158170696, |
| "loss": 3.2748, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.44843049327354, |
| "grad_norm": 0.36239564418792725, |
| "learning_rate": 0.0004028581415671424, |
| "loss": 3.2668, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.462989924873334, |
| "grad_norm": 0.3750430941581726, |
| "learning_rate": 0.00040268336731721525, |
| "loss": 3.2781, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.477549356473123, |
| "grad_norm": 0.3569280505180359, |
| "learning_rate": 0.0004025085930672881, |
| "loss": 3.2796, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.492108788072915, |
| "grad_norm": 0.37821418046951294, |
| "learning_rate": 0.00040233381881736086, |
| "loss": 3.2652, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.506668219672704, |
| "grad_norm": 0.36736053228378296, |
| "learning_rate": 0.0004021590445674337, |
| "loss": 3.2769, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.521227651272493, |
| "grad_norm": 0.3546053171157837, |
| "learning_rate": 0.00040198427031750653, |
| "loss": 3.2711, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.535787082872286, |
| "grad_norm": 0.35950738191604614, |
| "learning_rate": 0.00040180949606757936, |
| "loss": 3.2752, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.550346514472075, |
| "grad_norm": 0.35586223006248474, |
| "learning_rate": 0.0004016347218176522, |
| "loss": 3.2702, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.564905946071864, |
| "grad_norm": 0.3526204228401184, |
| "learning_rate": 0.000401459947567725, |
| "loss": 3.2772, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.579465377671657, |
| "grad_norm": 0.36836734414100647, |
| "learning_rate": 0.0004012851733177978, |
| "loss": 3.2878, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.594024809271446, |
| "grad_norm": 0.36949554085731506, |
| "learning_rate": 0.00040111039906787064, |
| "loss": 3.2692, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.594024809271446, |
| "eval_accuracy": 0.37260836023160715, |
| "eval_loss": 3.5415875911712646, |
| "eval_runtime": 53.9908, |
| "eval_samples_per_second": 308.256, |
| "eval_steps_per_second": 19.281, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.608584240871238, |
| "grad_norm": 0.3561803698539734, |
| "learning_rate": 0.0004009356248179435, |
| "loss": 3.2791, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.623143672471027, |
| "grad_norm": 0.3576090335845947, |
| "learning_rate": 0.00040076085056801625, |
| "loss": 3.2882, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.637703104070816, |
| "grad_norm": 0.3569983243942261, |
| "learning_rate": 0.0004005860763180891, |
| "loss": 3.2774, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.65226253567061, |
| "grad_norm": 0.3748653531074524, |
| "learning_rate": 0.0004004113020681619, |
| "loss": 3.2876, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.666821967270398, |
| "grad_norm": 0.371491402387619, |
| "learning_rate": 0.00040023652781823475, |
| "loss": 3.2824, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.681381398870187, |
| "grad_norm": 0.37606826424598694, |
| "learning_rate": 0.0004000617535683076, |
| "loss": 3.2731, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.69594083046998, |
| "grad_norm": 0.3529285192489624, |
| "learning_rate": 0.00039988697931838036, |
| "loss": 3.2925, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.71050026206977, |
| "grad_norm": 0.42727458477020264, |
| "learning_rate": 0.0003997122050684532, |
| "loss": 3.2851, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.72505969366956, |
| "grad_norm": 0.3473684787750244, |
| "learning_rate": 0.00039953743081852603, |
| "loss": 3.278, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.73961912526935, |
| "grad_norm": 0.40747615694999695, |
| "learning_rate": 0.00039936265656859886, |
| "loss": 3.2638, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.75417855686914, |
| "grad_norm": 0.3650287687778473, |
| "learning_rate": 0.0003991878823186717, |
| "loss": 3.2862, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.76873798846893, |
| "grad_norm": 0.361441045999527, |
| "learning_rate": 0.0003990131080687445, |
| "loss": 3.2847, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.78329742006872, |
| "grad_norm": 0.3760271668434143, |
| "learning_rate": 0.0003988383338188173, |
| "loss": 3.2954, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.79785685166851, |
| "grad_norm": 0.4076845645904541, |
| "learning_rate": 0.00039866355956889014, |
| "loss": 3.2954, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.812416283268302, |
| "grad_norm": 0.353763222694397, |
| "learning_rate": 0.000398488785318963, |
| "loss": 3.2893, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.82697571486809, |
| "grad_norm": 0.3695422112941742, |
| "learning_rate": 0.00039831401106903575, |
| "loss": 3.2879, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.841535146467884, |
| "grad_norm": 0.3703312575817108, |
| "learning_rate": 0.0003981392368191086, |
| "loss": 3.2899, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.856094578067673, |
| "grad_norm": 0.3426973819732666, |
| "learning_rate": 0.0003979644625691814, |
| "loss": 3.297, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.87065400966746, |
| "grad_norm": 0.36928045749664307, |
| "learning_rate": 0.00039778968831925425, |
| "loss": 3.2958, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.885213441267254, |
| "grad_norm": 0.35659393668174744, |
| "learning_rate": 0.0003976149140693271, |
| "loss": 3.2916, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.885213441267254, |
| "eval_accuracy": 0.37303742351320873, |
| "eval_loss": 3.535076856613159, |
| "eval_runtime": 53.7788, |
| "eval_samples_per_second": 309.471, |
| "eval_steps_per_second": 19.357, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.899772872867043, |
| "grad_norm": 0.3856133818626404, |
| "learning_rate": 0.00039744013981939986, |
| "loss": 3.2993, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.914332304466832, |
| "grad_norm": 0.3538554012775421, |
| "learning_rate": 0.0003972653655694727, |
| "loss": 3.2902, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.928891736066625, |
| "grad_norm": 0.36010587215423584, |
| "learning_rate": 0.00039709059131954553, |
| "loss": 3.3029, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.943451167666414, |
| "grad_norm": 0.3581371605396271, |
| "learning_rate": 0.00039691581706961836, |
| "loss": 3.3035, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.958010599266206, |
| "grad_norm": 0.3781713843345642, |
| "learning_rate": 0.0003967410428196912, |
| "loss": 3.2984, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.972570030865995, |
| "grad_norm": 0.36059990525245667, |
| "learning_rate": 0.000396566268569764, |
| "loss": 3.3088, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.987129462465784, |
| "grad_norm": 0.34163540601730347, |
| "learning_rate": 0.0003963914943198368, |
| "loss": 3.3084, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.00145594315998, |
| "grad_norm": 0.41299328207969666, |
| "learning_rate": 0.00039621672006990964, |
| "loss": 3.281, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.01601537475977, |
| "grad_norm": 0.4098125696182251, |
| "learning_rate": 0.0003960419458199825, |
| "loss": 3.2017, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.03057480635956, |
| "grad_norm": 0.3501276969909668, |
| "learning_rate": 0.00039586717157005536, |
| "loss": 3.1889, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.04513423795935, |
| "grad_norm": 0.3579663336277008, |
| "learning_rate": 0.00039569239732012814, |
| "loss": 3.2048, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.05969366955914, |
| "grad_norm": 0.3546302616596222, |
| "learning_rate": 0.00039551762307020097, |
| "loss": 3.1901, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.07425310115893, |
| "grad_norm": 0.39157140254974365, |
| "learning_rate": 0.0003953428488202738, |
| "loss": 3.2057, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.08881253275872, |
| "grad_norm": 0.3777744174003601, |
| "learning_rate": 0.00039516807457034664, |
| "loss": 3.2057, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.103371964358512, |
| "grad_norm": 0.3601837158203125, |
| "learning_rate": 0.00039499330032041947, |
| "loss": 3.1967, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.1179313959583, |
| "grad_norm": 0.37118658423423767, |
| "learning_rate": 0.00039481852607049225, |
| "loss": 3.2013, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.132490827558094, |
| "grad_norm": 0.35028761625289917, |
| "learning_rate": 0.0003946437518205651, |
| "loss": 3.2105, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.147050259157883, |
| "grad_norm": 0.40827909111976624, |
| "learning_rate": 0.0003944689775706379, |
| "loss": 3.2346, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.161609690757672, |
| "grad_norm": 0.37568482756614685, |
| "learning_rate": 0.00039429420332071075, |
| "loss": 3.2186, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.176169122357464, |
| "grad_norm": 0.37248700857162476, |
| "learning_rate": 0.0003941194290707836, |
| "loss": 3.2282, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.176169122357464, |
| "eval_accuracy": 0.3720623010702595, |
| "eval_loss": 3.553311824798584, |
| "eval_runtime": 54.0404, |
| "eval_samples_per_second": 307.973, |
| "eval_steps_per_second": 19.263, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.190728553957253, |
| "grad_norm": 0.3794865608215332, |
| "learning_rate": 0.00039394465482085636, |
| "loss": 3.2065, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.205287985557042, |
| "grad_norm": 0.38951265811920166, |
| "learning_rate": 0.0003937698805709292, |
| "loss": 3.2422, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.219847417156835, |
| "grad_norm": 0.3871360123157501, |
| "learning_rate": 0.000393595106321002, |
| "loss": 3.2231, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.234406848756624, |
| "grad_norm": 0.36482304334640503, |
| "learning_rate": 0.00039342033207107486, |
| "loss": 3.2388, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.248966280356417, |
| "grad_norm": 0.3711540699005127, |
| "learning_rate": 0.00039324555782114764, |
| "loss": 3.2294, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.263525711956206, |
| "grad_norm": 0.3786524534225464, |
| "learning_rate": 0.00039307078357122047, |
| "loss": 3.2428, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.278085143555995, |
| "grad_norm": 0.39472588896751404, |
| "learning_rate": 0.0003928960093212933, |
| "loss": 3.2392, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.292644575155787, |
| "grad_norm": 0.38515424728393555, |
| "learning_rate": 0.00039272123507136614, |
| "loss": 3.2422, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.307204006755576, |
| "grad_norm": 0.3816242218017578, |
| "learning_rate": 0.00039254646082143897, |
| "loss": 3.2431, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.321763438355365, |
| "grad_norm": 0.3761007785797119, |
| "learning_rate": 0.00039237168657151175, |
| "loss": 3.2431, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.336322869955158, |
| "grad_norm": 0.38267335295677185, |
| "learning_rate": 0.0003921969123215846, |
| "loss": 3.256, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.350882301554947, |
| "grad_norm": 0.36106154322624207, |
| "learning_rate": 0.0003920221380716574, |
| "loss": 3.2497, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.36544173315474, |
| "grad_norm": 0.351624071598053, |
| "learning_rate": 0.00039184736382173025, |
| "loss": 3.2565, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.38000116475453, |
| "grad_norm": 0.3526884913444519, |
| "learning_rate": 0.000391672589571803, |
| "loss": 3.2546, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.394560596354317, |
| "grad_norm": 0.39185193181037903, |
| "learning_rate": 0.00039149781532187586, |
| "loss": 3.2386, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.40912002795411, |
| "grad_norm": 0.3673184812068939, |
| "learning_rate": 0.0003913230410719487, |
| "loss": 3.2497, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.4236794595539, |
| "grad_norm": 0.36868539452552795, |
| "learning_rate": 0.0003911482668220215, |
| "loss": 3.2442, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.438238891153688, |
| "grad_norm": 0.3652259111404419, |
| "learning_rate": 0.00039097349257209436, |
| "loss": 3.2523, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.45279832275348, |
| "grad_norm": 0.370699405670166, |
| "learning_rate": 0.00039079871832216714, |
| "loss": 3.26, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.46735775435327, |
| "grad_norm": 0.3466508090496063, |
| "learning_rate": 0.00039062394407223997, |
| "loss": 3.2584, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.46735775435327, |
| "eval_accuracy": 0.3726800863488384, |
| "eval_loss": 3.5435853004455566, |
| "eval_runtime": 54.016, |
| "eval_samples_per_second": 308.112, |
| "eval_steps_per_second": 19.272, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.481917185953062, |
| "grad_norm": 0.46203598380088806, |
| "learning_rate": 0.0003904491698223128, |
| "loss": 3.2626, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.49647661755285, |
| "grad_norm": 0.34256711602211, |
| "learning_rate": 0.00039027439557238564, |
| "loss": 3.2569, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.51103604915264, |
| "grad_norm": 0.3792712092399597, |
| "learning_rate": 0.00039009962132245847, |
| "loss": 3.2535, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.525595480752433, |
| "grad_norm": 0.39607110619544983, |
| "learning_rate": 0.00038992484707253125, |
| "loss": 3.257, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.54015491235222, |
| "grad_norm": 0.3784148693084717, |
| "learning_rate": 0.0003897500728226041, |
| "loss": 3.2686, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.55471434395201, |
| "grad_norm": 0.38679102063179016, |
| "learning_rate": 0.0003895752985726769, |
| "loss": 3.2681, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.569273775551803, |
| "grad_norm": 0.38070613145828247, |
| "learning_rate": 0.00038940052432274975, |
| "loss": 3.257, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.583833207151592, |
| "grad_norm": 0.363008588552475, |
| "learning_rate": 0.0003892257500728225, |
| "loss": 3.2573, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.598392638751385, |
| "grad_norm": 0.41815274953842163, |
| "learning_rate": 0.00038905097582289536, |
| "loss": 3.2649, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.612952070351174, |
| "grad_norm": 0.37823486328125, |
| "learning_rate": 0.0003888762015729682, |
| "loss": 3.2645, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.627511501950963, |
| "grad_norm": 0.38749924302101135, |
| "learning_rate": 0.000388701427323041, |
| "loss": 3.2577, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.642070933550755, |
| "grad_norm": 0.3703409731388092, |
| "learning_rate": 0.00038852665307311386, |
| "loss": 3.2729, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.656630365150544, |
| "grad_norm": 0.36761870980262756, |
| "learning_rate": 0.00038835187882318664, |
| "loss": 3.2796, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.671189796750333, |
| "grad_norm": 0.34877169132232666, |
| "learning_rate": 0.00038817710457325947, |
| "loss": 3.2743, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.685749228350126, |
| "grad_norm": 0.3577750623226166, |
| "learning_rate": 0.0003880023303233323, |
| "loss": 3.2672, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.700308659949915, |
| "grad_norm": 0.349612832069397, |
| "learning_rate": 0.00038782755607340514, |
| "loss": 3.2765, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.714868091549707, |
| "grad_norm": 0.4024096429347992, |
| "learning_rate": 0.00038765278182347797, |
| "loss": 3.281, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.729427523149496, |
| "grad_norm": 0.3460679352283478, |
| "learning_rate": 0.00038747800757355075, |
| "loss": 3.2651, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.743986954749285, |
| "grad_norm": 0.3731793165206909, |
| "learning_rate": 0.00038730323332362363, |
| "loss": 3.2789, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.758546386349078, |
| "grad_norm": 0.38457053899765015, |
| "learning_rate": 0.00038712845907369647, |
| "loss": 3.2724, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.758546386349078, |
| "eval_accuracy": 0.37315571281474097, |
| "eval_loss": 3.5371994972229004, |
| "eval_runtime": 54.3068, |
| "eval_samples_per_second": 306.462, |
| "eval_steps_per_second": 19.169, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.773105817948867, |
| "grad_norm": 0.38386011123657227, |
| "learning_rate": 0.0003869536848237693, |
| "loss": 3.2796, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.787665249548656, |
| "grad_norm": 0.4016534090042114, |
| "learning_rate": 0.00038677891057384213, |
| "loss": 3.2771, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.80222468114845, |
| "grad_norm": 0.33354341983795166, |
| "learning_rate": 0.0003866041363239149, |
| "loss": 3.2739, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.816784112748238, |
| "grad_norm": 0.38390499353408813, |
| "learning_rate": 0.00038642936207398774, |
| "loss": 3.2782, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.83134354434803, |
| "grad_norm": 0.33095383644104004, |
| "learning_rate": 0.0003862545878240606, |
| "loss": 3.2908, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.84590297594782, |
| "grad_norm": 0.359825074672699, |
| "learning_rate": 0.0003860798135741334, |
| "loss": 3.2845, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.860462407547608, |
| "grad_norm": 0.36744236946105957, |
| "learning_rate": 0.00038590503932420624, |
| "loss": 3.2842, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.8750218391474, |
| "grad_norm": 0.3676607310771942, |
| "learning_rate": 0.000385730265074279, |
| "loss": 3.2905, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.88958127074719, |
| "grad_norm": 0.39658308029174805, |
| "learning_rate": 0.00038555549082435186, |
| "loss": 3.2796, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.90414070234698, |
| "grad_norm": 0.3673575222492218, |
| "learning_rate": 0.0003853807165744247, |
| "loss": 3.2805, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.91870013394677, |
| "grad_norm": 0.3908085525035858, |
| "learning_rate": 0.0003852059423244975, |
| "loss": 3.2885, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.93325956554656, |
| "grad_norm": 0.4000939130783081, |
| "learning_rate": 0.00038503116807457035, |
| "loss": 3.2736, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.947818997146353, |
| "grad_norm": 0.3464169204235077, |
| "learning_rate": 0.00038485639382464313, |
| "loss": 3.2832, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.962378428746142, |
| "grad_norm": 0.3489089608192444, |
| "learning_rate": 0.00038468161957471597, |
| "loss": 3.2796, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.97693786034593, |
| "grad_norm": 0.4103230834007263, |
| "learning_rate": 0.0003845068453247888, |
| "loss": 3.2685, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.991497291945723, |
| "grad_norm": 0.387511283159256, |
| "learning_rate": 0.00038433207107486163, |
| "loss": 3.2981, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.005823772639918, |
| "grad_norm": 0.3746413588523865, |
| "learning_rate": 0.0003841572968249344, |
| "loss": 3.2403, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.020383204239707, |
| "grad_norm": 0.36936619877815247, |
| "learning_rate": 0.00038398252257500724, |
| "loss": 3.1804, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.034942635839496, |
| "grad_norm": 0.37005239725112915, |
| "learning_rate": 0.0003838077483250801, |
| "loss": 3.2085, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.04950206743929, |
| "grad_norm": 0.38752469420433044, |
| "learning_rate": 0.0003836329740751529, |
| "loss": 3.1868, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.04950206743929, |
| "eval_accuracy": 0.3726120053293681, |
| "eval_loss": 3.54896879196167, |
| "eval_runtime": 53.9519, |
| "eval_samples_per_second": 308.478, |
| "eval_steps_per_second": 19.295, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.064061499039077, |
| "grad_norm": 0.3948463201522827, |
| "learning_rate": 0.00038345819982522574, |
| "loss": 3.19, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.078620930638866, |
| "grad_norm": 0.38081789016723633, |
| "learning_rate": 0.0003832834255752985, |
| "loss": 3.1918, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.09318036223866, |
| "grad_norm": 0.3688044548034668, |
| "learning_rate": 0.00038310865132537135, |
| "loss": 3.2016, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.107739793838448, |
| "grad_norm": 0.3519236743450165, |
| "learning_rate": 0.0003829338770754442, |
| "loss": 3.1926, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.12229922543824, |
| "grad_norm": 0.36413848400115967, |
| "learning_rate": 0.000382759102825517, |
| "loss": 3.2035, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.13685865703803, |
| "grad_norm": 0.39578020572662354, |
| "learning_rate": 0.00038258432857558985, |
| "loss": 3.2006, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.15141808863782, |
| "grad_norm": 0.3735930323600769, |
| "learning_rate": 0.00038240955432566263, |
| "loss": 3.1978, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.16597752023761, |
| "grad_norm": 0.367960125207901, |
| "learning_rate": 0.00038223478007573547, |
| "loss": 3.2175, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.1805369518374, |
| "grad_norm": 0.3750097453594208, |
| "learning_rate": 0.0003820600058258083, |
| "loss": 3.2119, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.19509638343719, |
| "grad_norm": 0.37870654463768005, |
| "learning_rate": 0.00038188523157588113, |
| "loss": 3.2167, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.20965581503698, |
| "grad_norm": 0.4226834177970886, |
| "learning_rate": 0.0003817104573259539, |
| "loss": 3.2163, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.22421524663677, |
| "grad_norm": 0.3715706765651703, |
| "learning_rate": 0.00038153568307602674, |
| "loss": 3.2295, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.238774678236563, |
| "grad_norm": 0.35563722252845764, |
| "learning_rate": 0.0003813609088260996, |
| "loss": 3.2179, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.253334109836352, |
| "grad_norm": 0.38243478536605835, |
| "learning_rate": 0.0003811861345761724, |
| "loss": 3.2199, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.26789354143614, |
| "grad_norm": 0.368644654750824, |
| "learning_rate": 0.00038101136032624524, |
| "loss": 3.2204, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.282452973035934, |
| "grad_norm": 0.3634442687034607, |
| "learning_rate": 0.000380836586076318, |
| "loss": 3.2344, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.297012404635723, |
| "grad_norm": 0.3635788559913635, |
| "learning_rate": 0.00038066181182639085, |
| "loss": 3.2297, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.31157183623551, |
| "grad_norm": 0.4010733962059021, |
| "learning_rate": 0.0003804870375764637, |
| "loss": 3.2351, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.326131267835304, |
| "grad_norm": 0.42111244797706604, |
| "learning_rate": 0.0003803122633265365, |
| "loss": 3.2386, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.340690699435093, |
| "grad_norm": 0.4109092950820923, |
| "learning_rate": 0.0003801374890766093, |
| "loss": 3.2329, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.340690699435093, |
| "eval_accuracy": 0.3725866072288403, |
| "eval_loss": 3.5461974143981934, |
| "eval_runtime": 53.9315, |
| "eval_samples_per_second": 308.595, |
| "eval_steps_per_second": 19.302, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.355250131034886, |
| "grad_norm": 0.379860520362854, |
| "learning_rate": 0.00037996271482668213, |
| "loss": 3.2364, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.369809562634675, |
| "grad_norm": 0.39272770285606384, |
| "learning_rate": 0.00037978794057675497, |
| "loss": 3.2378, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.384368994234464, |
| "grad_norm": 0.4046265780925751, |
| "learning_rate": 0.0003796131663268278, |
| "loss": 3.2406, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.398928425834256, |
| "grad_norm": 0.3786957859992981, |
| "learning_rate": 0.00037943839207690063, |
| "loss": 3.2385, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.413487857434045, |
| "grad_norm": 0.3831402361392975, |
| "learning_rate": 0.0003792636178269734, |
| "loss": 3.2364, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.428047289033834, |
| "grad_norm": 0.4113953411579132, |
| "learning_rate": 0.00037908884357704624, |
| "loss": 3.2396, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.442606720633627, |
| "grad_norm": 0.3877120912075043, |
| "learning_rate": 0.0003789140693271191, |
| "loss": 3.2336, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.457166152233416, |
| "grad_norm": 0.3839344382286072, |
| "learning_rate": 0.0003787392950771919, |
| "loss": 3.2497, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.47172558383321, |
| "grad_norm": 0.38105466961860657, |
| "learning_rate": 0.00037856452082726474, |
| "loss": 3.2359, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.486285015432998, |
| "grad_norm": 0.3721942901611328, |
| "learning_rate": 0.00037838974657733763, |
| "loss": 3.2513, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.500844447032787, |
| "grad_norm": 0.3668787181377411, |
| "learning_rate": 0.0003782149723274104, |
| "loss": 3.242, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.51540387863258, |
| "grad_norm": 0.3777357339859009, |
| "learning_rate": 0.00037804019807748324, |
| "loss": 3.2484, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.529963310232368, |
| "grad_norm": 0.355179101228714, |
| "learning_rate": 0.0003778654238275561, |
| "loss": 3.2535, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.544522741832157, |
| "grad_norm": 0.37869659066200256, |
| "learning_rate": 0.0003776906495776289, |
| "loss": 3.2545, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.55908217343195, |
| "grad_norm": 0.3972564935684204, |
| "learning_rate": 0.0003775158753277017, |
| "loss": 3.2569, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.57364160503174, |
| "grad_norm": 0.3839690089225769, |
| "learning_rate": 0.0003773411010777745, |
| "loss": 3.2518, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.58820103663153, |
| "grad_norm": 0.36708375811576843, |
| "learning_rate": 0.00037716632682784735, |
| "loss": 3.2481, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.60276046823132, |
| "grad_norm": 0.3941199779510498, |
| "learning_rate": 0.0003769915525779202, |
| "loss": 3.2456, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.61731989983111, |
| "grad_norm": 0.3851776123046875, |
| "learning_rate": 0.000376816778327993, |
| "loss": 3.2572, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.631879331430902, |
| "grad_norm": 0.3694128096103668, |
| "learning_rate": 0.0003766420040780658, |
| "loss": 3.2426, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.631879331430902, |
| "eval_accuracy": 0.3727275902035293, |
| "eval_loss": 3.544032096862793, |
| "eval_runtime": 54.1155, |
| "eval_samples_per_second": 307.546, |
| "eval_steps_per_second": 19.237, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.64643876303069, |
| "grad_norm": 0.36158764362335205, |
| "learning_rate": 0.00037646722982813863, |
| "loss": 3.2605, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.66099819463048, |
| "grad_norm": 0.3997897803783417, |
| "learning_rate": 0.00037629245557821146, |
| "loss": 3.2588, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.675557626230272, |
| "grad_norm": 0.38344115018844604, |
| "learning_rate": 0.0003761176813282843, |
| "loss": 3.2558, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.69011705783006, |
| "grad_norm": 0.37368038296699524, |
| "learning_rate": 0.00037594290707835713, |
| "loss": 3.2568, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.704676489429854, |
| "grad_norm": 0.3679753541946411, |
| "learning_rate": 0.0003757681328284299, |
| "loss": 3.2441, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.719235921029643, |
| "grad_norm": 0.39784008264541626, |
| "learning_rate": 0.00037559335857850274, |
| "loss": 3.2673, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.733795352629432, |
| "grad_norm": 0.3740057349205017, |
| "learning_rate": 0.0003754185843285756, |
| "loss": 3.2658, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.748354784229225, |
| "grad_norm": 0.3893584907054901, |
| "learning_rate": 0.0003752438100786484, |
| "loss": 3.2733, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.762914215829014, |
| "grad_norm": 0.3613312542438507, |
| "learning_rate": 0.0003750690358287212, |
| "loss": 3.2595, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.777473647428806, |
| "grad_norm": 0.3821388781070709, |
| "learning_rate": 0.000374894261578794, |
| "loss": 3.269, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.792033079028595, |
| "grad_norm": 0.39411360025405884, |
| "learning_rate": 0.00037471948732886685, |
| "loss": 3.2613, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.806592510628384, |
| "grad_norm": 0.36718764901161194, |
| "learning_rate": 0.0003745447130789397, |
| "loss": 3.2631, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.821151942228177, |
| "grad_norm": 0.3988918364048004, |
| "learning_rate": 0.0003743699388290125, |
| "loss": 3.2724, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.835711373827966, |
| "grad_norm": 0.37188059091567993, |
| "learning_rate": 0.0003741951645790853, |
| "loss": 3.2802, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.850270805427755, |
| "grad_norm": 0.3327014446258545, |
| "learning_rate": 0.00037402039032915813, |
| "loss": 3.2575, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.864830237027547, |
| "grad_norm": 0.38189268112182617, |
| "learning_rate": 0.00037384561607923096, |
| "loss": 3.2629, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.879389668627336, |
| "grad_norm": 0.3511042594909668, |
| "learning_rate": 0.0003736708418293038, |
| "loss": 3.2602, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.893949100227125, |
| "grad_norm": 0.38207387924194336, |
| "learning_rate": 0.00037349606757937663, |
| "loss": 3.281, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.908508531826918, |
| "grad_norm": 0.3751682937145233, |
| "learning_rate": 0.0003733212933294494, |
| "loss": 3.2783, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.923067963426707, |
| "grad_norm": 0.39629825949668884, |
| "learning_rate": 0.00037314651907952224, |
| "loss": 3.2577, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.923067963426707, |
| "eval_accuracy": 0.3736138192946313, |
| "eval_loss": 3.5334720611572266, |
| "eval_runtime": 53.8337, |
| "eval_samples_per_second": 309.156, |
| "eval_steps_per_second": 19.337, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.9376273950265, |
| "grad_norm": 0.39632660150527954, |
| "learning_rate": 0.00037297174482959507, |
| "loss": 3.2616, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.95218682662629, |
| "grad_norm": 0.407411128282547, |
| "learning_rate": 0.0003727969705796679, |
| "loss": 3.2779, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.966746258226078, |
| "grad_norm": 0.3708113133907318, |
| "learning_rate": 0.0003726221963297407, |
| "loss": 3.2795, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.98130568982587, |
| "grad_norm": 0.37315139174461365, |
| "learning_rate": 0.0003724474220798135, |
| "loss": 3.2703, |
| "step": 65200 |
| }, |
| { |
| "epoch": 18.99586512142566, |
| "grad_norm": 0.3742704689502716, |
| "learning_rate": 0.00037227264782988635, |
| "loss": 3.286, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.010191602119853, |
| "grad_norm": 0.37877383828163147, |
| "learning_rate": 0.0003720978735799592, |
| "loss": 3.2045, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.024751033719642, |
| "grad_norm": 0.37160438299179077, |
| "learning_rate": 0.000371923099330032, |
| "loss": 3.1639, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.039310465319435, |
| "grad_norm": 0.3627760112285614, |
| "learning_rate": 0.0003717483250801048, |
| "loss": 3.1622, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.053869896919224, |
| "grad_norm": 0.3930268883705139, |
| "learning_rate": 0.00037157355083017763, |
| "loss": 3.1768, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.068429328519013, |
| "grad_norm": 0.4046318829059601, |
| "learning_rate": 0.00037139877658025046, |
| "loss": 3.1912, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.082988760118806, |
| "grad_norm": 0.37506386637687683, |
| "learning_rate": 0.0003712240023303233, |
| "loss": 3.1872, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.097548191718595, |
| "grad_norm": 0.3771907389163971, |
| "learning_rate": 0.00037104922808039607, |
| "loss": 3.2021, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.112107623318387, |
| "grad_norm": 0.37820398807525635, |
| "learning_rate": 0.0003708744538304689, |
| "loss": 3.1905, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.126667054918176, |
| "grad_norm": 0.3759934604167938, |
| "learning_rate": 0.00037069967958054174, |
| "loss": 3.1817, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.141226486517965, |
| "grad_norm": 0.3758086860179901, |
| "learning_rate": 0.00037052490533061457, |
| "loss": 3.2027, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.155785918117758, |
| "grad_norm": 0.3827981650829315, |
| "learning_rate": 0.0003703501310806874, |
| "loss": 3.1894, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.170345349717547, |
| "grad_norm": 0.4200972020626068, |
| "learning_rate": 0.0003701753568307602, |
| "loss": 3.1961, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.184904781317336, |
| "grad_norm": 0.40495339035987854, |
| "learning_rate": 0.000370000582580833, |
| "loss": 3.1961, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.19946421291713, |
| "grad_norm": 0.3779332935810089, |
| "learning_rate": 0.00036982580833090585, |
| "loss": 3.1914, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.214023644516917, |
| "grad_norm": 0.38305872678756714, |
| "learning_rate": 0.00036965103408097874, |
| "loss": 3.1949, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.214023644516917, |
| "eval_accuracy": 0.3729239551474248, |
| "eval_loss": 3.5489401817321777, |
| "eval_runtime": 53.9661, |
| "eval_samples_per_second": 308.397, |
| "eval_steps_per_second": 19.29, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.22858307611671, |
| "grad_norm": 0.4134466350078583, |
| "learning_rate": 0.00036947625983105157, |
| "loss": 3.2189, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.2431425077165, |
| "grad_norm": 0.35542577505111694, |
| "learning_rate": 0.0003693014855811244, |
| "loss": 3.2184, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.257701939316288, |
| "grad_norm": 0.37749409675598145, |
| "learning_rate": 0.0003691267113311972, |
| "loss": 3.2118, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.27226137091608, |
| "grad_norm": 0.36184874176979065, |
| "learning_rate": 0.00036895193708127, |
| "loss": 3.215, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.28682080251587, |
| "grad_norm": 0.3879592716693878, |
| "learning_rate": 0.00036877716283134285, |
| "loss": 3.22, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.30138023411566, |
| "grad_norm": 0.37836670875549316, |
| "learning_rate": 0.0003686023885814157, |
| "loss": 3.2235, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.31593966571545, |
| "grad_norm": 0.38774940371513367, |
| "learning_rate": 0.00036842761433148846, |
| "loss": 3.2293, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.33049909731524, |
| "grad_norm": 0.3813439905643463, |
| "learning_rate": 0.0003682528400815613, |
| "loss": 3.2192, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.345058528915033, |
| "grad_norm": 0.40040069818496704, |
| "learning_rate": 0.0003680780658316341, |
| "loss": 3.2372, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.35961796051482, |
| "grad_norm": 0.37192991375923157, |
| "learning_rate": 0.00036790329158170696, |
| "loss": 3.219, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.37417739211461, |
| "grad_norm": 0.42093369364738464, |
| "learning_rate": 0.0003677285173317798, |
| "loss": 3.2251, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.388736823714403, |
| "grad_norm": 0.37080198526382446, |
| "learning_rate": 0.00036755374308185257, |
| "loss": 3.2347, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.403296255314192, |
| "grad_norm": 0.3722926676273346, |
| "learning_rate": 0.0003673789688319254, |
| "loss": 3.2358, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.41785568691398, |
| "grad_norm": 0.388348788022995, |
| "learning_rate": 0.00036720419458199824, |
| "loss": 3.2263, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.432415118513774, |
| "grad_norm": 0.3911234140396118, |
| "learning_rate": 0.00036702942033207107, |
| "loss": 3.227, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.446974550113563, |
| "grad_norm": 0.38625290989875793, |
| "learning_rate": 0.0003668546460821439, |
| "loss": 3.2332, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.461533981713355, |
| "grad_norm": 0.35609227418899536, |
| "learning_rate": 0.0003666798718322167, |
| "loss": 3.2197, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.476093413313144, |
| "grad_norm": 0.4059786796569824, |
| "learning_rate": 0.0003665050975822895, |
| "loss": 3.2387, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.490652844912933, |
| "grad_norm": 0.40667444467544556, |
| "learning_rate": 0.00036633032333236235, |
| "loss": 3.238, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.505212276512726, |
| "grad_norm": 0.3943587839603424, |
| "learning_rate": 0.0003661555490824352, |
| "loss": 3.2359, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.505212276512726, |
| "eval_accuracy": 0.3730882197142643, |
| "eval_loss": 3.5423266887664795, |
| "eval_runtime": 55.0514, |
| "eval_samples_per_second": 302.318, |
| "eval_steps_per_second": 18.91, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.519771708112515, |
| "grad_norm": 0.4029746353626251, |
| "learning_rate": 0.00036598077483250796, |
| "loss": 3.2504, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.534331139712307, |
| "grad_norm": 0.38361015915870667, |
| "learning_rate": 0.0003658060005825808, |
| "loss": 3.241, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.548890571312096, |
| "grad_norm": 0.4034869372844696, |
| "learning_rate": 0.0003656312263326536, |
| "loss": 3.2417, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.563450002911885, |
| "grad_norm": 0.41540029644966125, |
| "learning_rate": 0.00036545645208272646, |
| "loss": 3.2375, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.578009434511678, |
| "grad_norm": 0.3546990156173706, |
| "learning_rate": 0.0003652816778327993, |
| "loss": 3.246, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.592568866111467, |
| "grad_norm": 0.39462944865226746, |
| "learning_rate": 0.00036510690358287207, |
| "loss": 3.2517, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.607128297711256, |
| "grad_norm": 0.39903032779693604, |
| "learning_rate": 0.0003649321293329449, |
| "loss": 3.2395, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.62168772931105, |
| "grad_norm": 0.3746011555194855, |
| "learning_rate": 0.00036475735508301774, |
| "loss": 3.2482, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.636247160910838, |
| "grad_norm": 0.40539586544036865, |
| "learning_rate": 0.00036458258083309057, |
| "loss": 3.2469, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.650806592510627, |
| "grad_norm": 0.378738135099411, |
| "learning_rate": 0.0003644078065831634, |
| "loss": 3.2508, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.66536602411042, |
| "grad_norm": 0.384886771440506, |
| "learning_rate": 0.0003642330323332362, |
| "loss": 3.2453, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.679925455710208, |
| "grad_norm": 0.38220494985580444, |
| "learning_rate": 0.000364058258083309, |
| "loss": 3.2378, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.69448488731, |
| "grad_norm": 0.40992140769958496, |
| "learning_rate": 0.00036388348383338185, |
| "loss": 3.247, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.70904431890979, |
| "grad_norm": 0.3588728606700897, |
| "learning_rate": 0.0003637087095834547, |
| "loss": 3.2447, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.72360375050958, |
| "grad_norm": 0.39845365285873413, |
| "learning_rate": 0.00036353393533352746, |
| "loss": 3.2365, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.73816318210937, |
| "grad_norm": 0.3711094260215759, |
| "learning_rate": 0.0003633591610836003, |
| "loss": 3.2589, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.75272261370916, |
| "grad_norm": 0.44665664434432983, |
| "learning_rate": 0.0003631843868336731, |
| "loss": 3.2394, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.767282045308953, |
| "grad_norm": 0.35950037837028503, |
| "learning_rate": 0.00036300961258374596, |
| "loss": 3.2603, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.781841476908742, |
| "grad_norm": 0.3663308024406433, |
| "learning_rate": 0.0003628348383338188, |
| "loss": 3.252, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.79640090850853, |
| "grad_norm": 0.3737200200557709, |
| "learning_rate": 0.00036266006408389157, |
| "loss": 3.2688, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.79640090850853, |
| "eval_accuracy": 0.3734972937500801, |
| "eval_loss": 3.5395822525024414, |
| "eval_runtime": 55.4217, |
| "eval_samples_per_second": 300.298, |
| "eval_steps_per_second": 18.783, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.810960340108323, |
| "grad_norm": 0.3715822398662567, |
| "learning_rate": 0.0003624852898339644, |
| "loss": 3.234, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.825519771708112, |
| "grad_norm": 0.4164450764656067, |
| "learning_rate": 0.00036231051558403723, |
| "loss": 3.249, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.8400792033079, |
| "grad_norm": 0.37426820397377014, |
| "learning_rate": 0.00036213574133411007, |
| "loss": 3.2556, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.854638634907694, |
| "grad_norm": 0.37632179260253906, |
| "learning_rate": 0.0003619609670841829, |
| "loss": 3.2623, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.869198066507483, |
| "grad_norm": 0.37010031938552856, |
| "learning_rate": 0.0003617861928342557, |
| "loss": 3.2537, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.883757498107276, |
| "grad_norm": 0.3916720747947693, |
| "learning_rate": 0.0003616114185843285, |
| "loss": 3.2451, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.898316929707065, |
| "grad_norm": 0.37939032912254333, |
| "learning_rate": 0.00036143664433440135, |
| "loss": 3.2707, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.912876361306854, |
| "grad_norm": 0.4042569100856781, |
| "learning_rate": 0.0003612618700844742, |
| "loss": 3.26, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.927435792906646, |
| "grad_norm": 0.35950231552124023, |
| "learning_rate": 0.00036108709583454696, |
| "loss": 3.2666, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.941995224506435, |
| "grad_norm": 0.388754665851593, |
| "learning_rate": 0.0003609123215846198, |
| "loss": 3.2571, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.956554656106224, |
| "grad_norm": 0.39523687958717346, |
| "learning_rate": 0.0003607375473346927, |
| "loss": 3.2525, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.971114087706017, |
| "grad_norm": 0.37882113456726074, |
| "learning_rate": 0.0003605627730847655, |
| "loss": 3.2668, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.985673519305806, |
| "grad_norm": 0.3885203003883362, |
| "learning_rate": 0.00036038799883483834, |
| "loss": 3.2625, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.9490720629692078, |
| "learning_rate": 0.0003602132245849112, |
| "loss": 3.2624, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.01455943159979, |
| "grad_norm": 0.36799395084381104, |
| "learning_rate": 0.00036003845033498395, |
| "loss": 3.1537, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.02911886319958, |
| "grad_norm": 0.3699009418487549, |
| "learning_rate": 0.0003598636760850568, |
| "loss": 3.1452, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.04367829479937, |
| "grad_norm": 0.3853961229324341, |
| "learning_rate": 0.0003596889018351296, |
| "loss": 3.162, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.058237726399163, |
| "grad_norm": 0.3762233257293701, |
| "learning_rate": 0.00035951412758520245, |
| "loss": 3.163, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.072797157998952, |
| "grad_norm": 0.4281475841999054, |
| "learning_rate": 0.00035933935333527523, |
| "loss": 3.1756, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.08735658959874, |
| "grad_norm": 0.40628287196159363, |
| "learning_rate": 0.00035916457908534807, |
| "loss": 3.1636, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.08735658959874, |
| "eval_accuracy": 0.37290549449102267, |
| "eval_loss": 3.550813913345337, |
| "eval_runtime": 55.4801, |
| "eval_samples_per_second": 299.981, |
| "eval_steps_per_second": 18.763, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.101916021198534, |
| "grad_norm": 0.4060651361942291, |
| "learning_rate": 0.0003589898048354209, |
| "loss": 3.1772, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.116475452798323, |
| "grad_norm": 0.35740476846694946, |
| "learning_rate": 0.00035881503058549373, |
| "loss": 3.1712, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.13103488439811, |
| "grad_norm": 0.3880947232246399, |
| "learning_rate": 0.00035864025633556656, |
| "loss": 3.1762, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.145594315997904, |
| "grad_norm": 0.38205868005752563, |
| "learning_rate": 0.00035846548208563934, |
| "loss": 3.1846, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.160153747597693, |
| "grad_norm": 0.3950273394584656, |
| "learning_rate": 0.0003582907078357122, |
| "loss": 3.1975, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.174713179197486, |
| "grad_norm": 0.4085865020751953, |
| "learning_rate": 0.000358115933585785, |
| "loss": 3.1907, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.189272610797275, |
| "grad_norm": 0.41469231247901917, |
| "learning_rate": 0.00035794115933585784, |
| "loss": 3.1888, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.203832042397064, |
| "grad_norm": 0.40203621983528137, |
| "learning_rate": 0.0003577663850859307, |
| "loss": 3.2023, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.218391473996856, |
| "grad_norm": 0.39782029390335083, |
| "learning_rate": 0.00035759161083600345, |
| "loss": 3.1859, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.232950905596645, |
| "grad_norm": 0.396565318107605, |
| "learning_rate": 0.0003574168365860763, |
| "loss": 3.202, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.247510337196434, |
| "grad_norm": 0.36073732376098633, |
| "learning_rate": 0.0003572420623361491, |
| "loss": 3.1976, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.262069768796227, |
| "grad_norm": 0.380855530500412, |
| "learning_rate": 0.00035706728808622195, |
| "loss": 3.2032, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.276629200396016, |
| "grad_norm": 0.40812456607818604, |
| "learning_rate": 0.00035689251383629473, |
| "loss": 3.2076, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.29118863199581, |
| "grad_norm": 0.38896799087524414, |
| "learning_rate": 0.00035671773958636757, |
| "loss": 3.1939, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.305748063595598, |
| "grad_norm": 0.3849445581436157, |
| "learning_rate": 0.0003565429653364404, |
| "loss": 3.2116, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.320307495195387, |
| "grad_norm": 0.444782018661499, |
| "learning_rate": 0.00035636819108651323, |
| "loss": 3.2223, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.33486692679518, |
| "grad_norm": 0.37147748470306396, |
| "learning_rate": 0.00035619341683658606, |
| "loss": 3.2055, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.349426358394968, |
| "grad_norm": 0.3865772485733032, |
| "learning_rate": 0.00035601864258665884, |
| "loss": 3.2103, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.363985789994757, |
| "grad_norm": 0.3948829770088196, |
| "learning_rate": 0.0003558438683367317, |
| "loss": 3.2175, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.37854522159455, |
| "grad_norm": 0.3815650939941406, |
| "learning_rate": 0.0003556690940868045, |
| "loss": 3.2153, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.37854522159455, |
| "eval_accuracy": 0.37331374544024726, |
| "eval_loss": 3.5450870990753174, |
| "eval_runtime": 55.4338, |
| "eval_samples_per_second": 300.232, |
| "eval_steps_per_second": 18.779, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.39310465319434, |
| "grad_norm": 0.37321028113365173, |
| "learning_rate": 0.00035549431983687734, |
| "loss": 3.2223, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.40766408479413, |
| "grad_norm": 0.42437463998794556, |
| "learning_rate": 0.0003553195455869502, |
| "loss": 3.225, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.42222351639392, |
| "grad_norm": 0.3605141341686249, |
| "learning_rate": 0.00035514477133702295, |
| "loss": 3.222, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.43678294799371, |
| "grad_norm": 0.38323965668678284, |
| "learning_rate": 0.0003549699970870958, |
| "loss": 3.2165, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.451342379593502, |
| "grad_norm": 0.3995019197463989, |
| "learning_rate": 0.0003547952228371686, |
| "loss": 3.2169, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.46590181119329, |
| "grad_norm": 0.4000272750854492, |
| "learning_rate": 0.00035462044858724145, |
| "loss": 3.2186, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.48046124279308, |
| "grad_norm": 0.3743245005607605, |
| "learning_rate": 0.00035444567433731423, |
| "loss": 3.2285, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.495020674392872, |
| "grad_norm": 0.3767825663089752, |
| "learning_rate": 0.00035427090008738706, |
| "loss": 3.2267, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.50958010599266, |
| "grad_norm": 0.41871246695518494, |
| "learning_rate": 0.0003540961258374599, |
| "loss": 3.2377, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.524139537592454, |
| "grad_norm": 0.3861671984195709, |
| "learning_rate": 0.00035392135158753273, |
| "loss": 3.234, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.538698969192243, |
| "grad_norm": 0.3660334348678589, |
| "learning_rate": 0.00035374657733760556, |
| "loss": 3.2345, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.553258400792032, |
| "grad_norm": 0.4146880805492401, |
| "learning_rate": 0.00035357180308767834, |
| "loss": 3.2271, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.567817832391825, |
| "grad_norm": 0.4045553207397461, |
| "learning_rate": 0.0003533970288377512, |
| "loss": 3.2282, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.582377263991614, |
| "grad_norm": 0.39800694584846497, |
| "learning_rate": 0.000353222254587824, |
| "loss": 3.2371, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.596936695591403, |
| "grad_norm": 0.37746721506118774, |
| "learning_rate": 0.00035304748033789684, |
| "loss": 3.242, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.611496127191195, |
| "grad_norm": 0.37762463092803955, |
| "learning_rate": 0.0003528727060879697, |
| "loss": 3.2312, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.626055558790984, |
| "grad_norm": 0.36045747995376587, |
| "learning_rate": 0.00035269793183804245, |
| "loss": 3.2324, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.640614990390777, |
| "grad_norm": 0.40877410769462585, |
| "learning_rate": 0.0003525231575881153, |
| "loss": 3.2411, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.655174421990566, |
| "grad_norm": 0.37934237718582153, |
| "learning_rate": 0.0003523483833381881, |
| "loss": 3.2351, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.669733853590355, |
| "grad_norm": 0.379980206489563, |
| "learning_rate": 0.00035217360908826095, |
| "loss": 3.2345, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.669733853590355, |
| "eval_accuracy": 0.373392056250208, |
| "eval_loss": 3.540816068649292, |
| "eval_runtime": 55.7309, |
| "eval_samples_per_second": 298.631, |
| "eval_steps_per_second": 18.679, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.684293285190147, |
| "grad_norm": 0.41839686036109924, |
| "learning_rate": 0.00035199883483833384, |
| "loss": 3.2169, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.698852716789936, |
| "grad_norm": 0.41018539667129517, |
| "learning_rate": 0.0003518240605884066, |
| "loss": 3.2481, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.713412148389725, |
| "grad_norm": 0.39199137687683105, |
| "learning_rate": 0.00035164928633847945, |
| "loss": 3.2431, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.727971579989518, |
| "grad_norm": 0.39780452847480774, |
| "learning_rate": 0.0003514745120885523, |
| "loss": 3.2461, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.742531011589307, |
| "grad_norm": 0.3732914626598358, |
| "learning_rate": 0.0003512997378386251, |
| "loss": 3.247, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.7570904431891, |
| "grad_norm": 0.3949578106403351, |
| "learning_rate": 0.00035112496358869795, |
| "loss": 3.2449, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.77164987478889, |
| "grad_norm": 0.41203632950782776, |
| "learning_rate": 0.00035095018933877073, |
| "loss": 3.2269, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.786209306388677, |
| "grad_norm": 0.40544483065605164, |
| "learning_rate": 0.00035077541508884356, |
| "loss": 3.2396, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.80076873798847, |
| "grad_norm": 0.39241766929626465, |
| "learning_rate": 0.0003506006408389164, |
| "loss": 3.2499, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.81532816958826, |
| "grad_norm": 0.390432208776474, |
| "learning_rate": 0.00035042586658898923, |
| "loss": 3.265, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.829887601188048, |
| "grad_norm": 0.4000423550605774, |
| "learning_rate": 0.00035025109233906206, |
| "loss": 3.2445, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.84444703278784, |
| "grad_norm": 0.3772542476654053, |
| "learning_rate": 0.00035007631808913484, |
| "loss": 3.2466, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.85900646438763, |
| "grad_norm": 0.3796822130680084, |
| "learning_rate": 0.00034990154383920767, |
| "loss": 3.248, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.873565895987422, |
| "grad_norm": 0.39603352546691895, |
| "learning_rate": 0.0003497267695892805, |
| "loss": 3.2477, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.88812532758721, |
| "grad_norm": 0.4122164845466614, |
| "learning_rate": 0.00034955199533935334, |
| "loss": 3.2418, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.902684759187, |
| "grad_norm": 0.41109615564346313, |
| "learning_rate": 0.0003493772210894261, |
| "loss": 3.2463, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.917244190786793, |
| "grad_norm": 0.39169979095458984, |
| "learning_rate": 0.00034920244683949895, |
| "loss": 3.2625, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.93180362238658, |
| "grad_norm": 0.42167893052101135, |
| "learning_rate": 0.0003490276725895718, |
| "loss": 3.2399, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.94636305398637, |
| "grad_norm": 0.39163246750831604, |
| "learning_rate": 0.0003488528983396446, |
| "loss": 3.252, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.960922485586163, |
| "grad_norm": 0.41562479734420776, |
| "learning_rate": 0.00034867812408971745, |
| "loss": 3.2576, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.960922485586163, |
| "eval_accuracy": 0.3742182000201539, |
| "eval_loss": 3.528524875640869, |
| "eval_runtime": 55.4191, |
| "eval_samples_per_second": 300.312, |
| "eval_steps_per_second": 18.784, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.975481917185952, |
| "grad_norm": 0.3895370662212372, |
| "learning_rate": 0.00034850334983979023, |
| "loss": 3.2408, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.990041348785745, |
| "grad_norm": 0.38841962814331055, |
| "learning_rate": 0.00034832857558986306, |
| "loss": 3.2424, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.004367829479936, |
| "grad_norm": 0.38999074697494507, |
| "learning_rate": 0.0003481538013399359, |
| "loss": 3.2279, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.018927261079728, |
| "grad_norm": 0.3888770043849945, |
| "learning_rate": 0.0003479790270900087, |
| "loss": 3.145, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.033486692679517, |
| "grad_norm": 0.3620569109916687, |
| "learning_rate": 0.0003478042528400815, |
| "loss": 3.1389, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.04804612427931, |
| "grad_norm": 0.3902473449707031, |
| "learning_rate": 0.00034762947859015434, |
| "loss": 3.1505, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.0626055558791, |
| "grad_norm": 0.3933819830417633, |
| "learning_rate": 0.00034745470434022717, |
| "loss": 3.1558, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.077164987478888, |
| "grad_norm": 0.3986400067806244, |
| "learning_rate": 0.0003472799300903, |
| "loss": 3.1592, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.09172441907868, |
| "grad_norm": 0.3890974521636963, |
| "learning_rate": 0.00034710515584037284, |
| "loss": 3.1688, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.10628385067847, |
| "grad_norm": 0.38709503412246704, |
| "learning_rate": 0.0003469303815904456, |
| "loss": 3.1723, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.12084328227826, |
| "grad_norm": 0.4304654896259308, |
| "learning_rate": 0.00034675560734051845, |
| "loss": 3.1755, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.13540271387805, |
| "grad_norm": 0.4124937653541565, |
| "learning_rate": 0.0003465808330905913, |
| "loss": 3.1816, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.14996214547784, |
| "grad_norm": 0.39741483330726624, |
| "learning_rate": 0.0003464060588406641, |
| "loss": 3.1765, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.164521577077632, |
| "grad_norm": 0.38367438316345215, |
| "learning_rate": 0.00034623128459073695, |
| "loss": 3.1732, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.17908100867742, |
| "grad_norm": 0.38440006971359253, |
| "learning_rate": 0.00034605651034080973, |
| "loss": 3.1819, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.19364044027721, |
| "grad_norm": 0.3843246102333069, |
| "learning_rate": 0.00034588173609088256, |
| "loss": 3.1871, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.208199871877003, |
| "grad_norm": 0.407388299703598, |
| "learning_rate": 0.0003457069618409554, |
| "loss": 3.1924, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.222759303476792, |
| "grad_norm": 0.38186925649642944, |
| "learning_rate": 0.0003455321875910282, |
| "loss": 3.1898, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.23731873507658, |
| "grad_norm": 0.38251790404319763, |
| "learning_rate": 0.000345357413341101, |
| "loss": 3.1994, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.251878166676374, |
| "grad_norm": 0.43736377358436584, |
| "learning_rate": 0.00034518263909117384, |
| "loss": 3.1837, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.251878166676374, |
| "eval_accuracy": 0.3736014729957636, |
| "eval_loss": 3.547090530395508, |
| "eval_runtime": 55.385, |
| "eval_samples_per_second": 300.496, |
| "eval_steps_per_second": 18.796, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.266437598276163, |
| "grad_norm": 0.3770950138568878, |
| "learning_rate": 0.00034500786484124667, |
| "loss": 3.1809, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.280997029875955, |
| "grad_norm": 0.3902764618396759, |
| "learning_rate": 0.0003448330905913195, |
| "loss": 3.2044, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.295556461475744, |
| "grad_norm": 0.37991878390312195, |
| "learning_rate": 0.00034465831634139234, |
| "loss": 3.1995, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.310115893075533, |
| "grad_norm": 0.3790615499019623, |
| "learning_rate": 0.0003444835420914651, |
| "loss": 3.2026, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.324675324675326, |
| "grad_norm": 0.36328187584877014, |
| "learning_rate": 0.00034430876784153795, |
| "loss": 3.1957, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.339234756275115, |
| "grad_norm": 0.39685186743736267, |
| "learning_rate": 0.0003441339935916108, |
| "loss": 3.1948, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.353794187874904, |
| "grad_norm": 0.384804368019104, |
| "learning_rate": 0.0003439592193416836, |
| "loss": 3.1912, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.368353619474696, |
| "grad_norm": 0.3883644640445709, |
| "learning_rate": 0.00034378444509175645, |
| "loss": 3.2182, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.382913051074485, |
| "grad_norm": 0.40188512206077576, |
| "learning_rate": 0.0003436096708418292, |
| "loss": 3.2072, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.397472482674278, |
| "grad_norm": 0.4035356640815735, |
| "learning_rate": 0.00034343489659190206, |
| "loss": 3.1982, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.412031914274067, |
| "grad_norm": 0.37305113673210144, |
| "learning_rate": 0.0003432601223419749, |
| "loss": 3.2027, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.426591345873856, |
| "grad_norm": 0.3963967263698578, |
| "learning_rate": 0.0003430853480920478, |
| "loss": 3.2171, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.44115077747365, |
| "grad_norm": 0.4038044810295105, |
| "learning_rate": 0.0003429105738421206, |
| "loss": 3.2203, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.455710209073438, |
| "grad_norm": 0.3818981647491455, |
| "learning_rate": 0.0003427357995921934, |
| "loss": 3.2169, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.470269640673227, |
| "grad_norm": 0.37264305353164673, |
| "learning_rate": 0.0003425610253422662, |
| "loss": 3.2133, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.48482907227302, |
| "grad_norm": 0.3894643783569336, |
| "learning_rate": 0.00034238625109233906, |
| "loss": 3.2316, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.499388503872808, |
| "grad_norm": 0.4069054424762726, |
| "learning_rate": 0.0003422114768424119, |
| "loss": 3.215, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.5139479354726, |
| "grad_norm": 0.4019501507282257, |
| "learning_rate": 0.0003420367025924847, |
| "loss": 3.2111, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.52850736707239, |
| "grad_norm": 0.39605239033699036, |
| "learning_rate": 0.0003418619283425575, |
| "loss": 3.2143, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.54306679867218, |
| "grad_norm": 0.38995805382728577, |
| "learning_rate": 0.00034168715409263034, |
| "loss": 3.2214, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.54306679867218, |
| "eval_accuracy": 0.37393999675233547, |
| "eval_loss": 3.5405991077423096, |
| "eval_runtime": 55.3361, |
| "eval_samples_per_second": 300.762, |
| "eval_steps_per_second": 18.812, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.55762623027197, |
| "grad_norm": 0.3883754312992096, |
| "learning_rate": 0.00034151237984270317, |
| "loss": 3.2207, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.57218566187176, |
| "grad_norm": 0.4097517132759094, |
| "learning_rate": 0.000341337605592776, |
| "loss": 3.2087, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.58674509347155, |
| "grad_norm": 0.3909226953983307, |
| "learning_rate": 0.00034116283134284883, |
| "loss": 3.219, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.601304525071342, |
| "grad_norm": 0.41672003269195557, |
| "learning_rate": 0.0003409880570929216, |
| "loss": 3.2251, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.61586395667113, |
| "grad_norm": 0.37609753012657166, |
| "learning_rate": 0.00034081328284299445, |
| "loss": 3.2111, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.630423388270923, |
| "grad_norm": 0.39157718420028687, |
| "learning_rate": 0.0003406385085930673, |
| "loss": 3.2187, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.644982819870712, |
| "grad_norm": 0.39113712310791016, |
| "learning_rate": 0.0003404637343431401, |
| "loss": 3.2149, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.6595422514705, |
| "grad_norm": 0.40942496061325073, |
| "learning_rate": 0.0003402889600932129, |
| "loss": 3.2303, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.674101683070294, |
| "grad_norm": 0.43008992075920105, |
| "learning_rate": 0.0003401141858432857, |
| "loss": 3.2299, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.688661114670083, |
| "grad_norm": 0.4003450870513916, |
| "learning_rate": 0.00033993941159335856, |
| "loss": 3.2234, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.703220546269872, |
| "grad_norm": 0.3863767981529236, |
| "learning_rate": 0.0003397646373434314, |
| "loss": 3.2188, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.717779977869665, |
| "grad_norm": 0.37120547890663147, |
| "learning_rate": 0.0003395898630935042, |
| "loss": 3.2273, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.732339409469454, |
| "grad_norm": 0.44255581498146057, |
| "learning_rate": 0.000339415088843577, |
| "loss": 3.2166, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.746898841069246, |
| "grad_norm": 0.39260992407798767, |
| "learning_rate": 0.00033924031459364983, |
| "loss": 3.2313, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.761458272669035, |
| "grad_norm": 0.38604602217674255, |
| "learning_rate": 0.00033906554034372267, |
| "loss": 3.2316, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.776017704268824, |
| "grad_norm": 0.43335166573524475, |
| "learning_rate": 0.0003388907660937955, |
| "loss": 3.2346, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.790577135868617, |
| "grad_norm": 0.4126780331134796, |
| "learning_rate": 0.0003387159918438683, |
| "loss": 3.2284, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.805136567468406, |
| "grad_norm": 0.4032544195652008, |
| "learning_rate": 0.0003385412175939411, |
| "loss": 3.2464, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.819695999068195, |
| "grad_norm": 0.3821982741355896, |
| "learning_rate": 0.00033836644334401395, |
| "loss": 3.2471, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.834255430667987, |
| "grad_norm": 0.4045506417751312, |
| "learning_rate": 0.0003381916690940868, |
| "loss": 3.2441, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.834255430667987, |
| "eval_accuracy": 0.37437952499202487, |
| "eval_loss": 3.532663345336914, |
| "eval_runtime": 55.3218, |
| "eval_samples_per_second": 300.84, |
| "eval_steps_per_second": 18.817, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.848814862267776, |
| "grad_norm": 0.38999539613723755, |
| "learning_rate": 0.0003380168948441596, |
| "loss": 3.2378, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.86337429386757, |
| "grad_norm": 0.42581823468208313, |
| "learning_rate": 0.0003378421205942324, |
| "loss": 3.2437, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.877933725467358, |
| "grad_norm": 0.3946520984172821, |
| "learning_rate": 0.0003376673463443052, |
| "loss": 3.2464, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.892493157067147, |
| "grad_norm": 0.35369718074798584, |
| "learning_rate": 0.00033749257209437806, |
| "loss": 3.2364, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.90705258866694, |
| "grad_norm": 0.3942389488220215, |
| "learning_rate": 0.0003373177978444509, |
| "loss": 3.2421, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.92161202026673, |
| "grad_norm": 0.3681791126728058, |
| "learning_rate": 0.0003371430235945237, |
| "loss": 3.2329, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.93617145186652, |
| "grad_norm": 0.39455777406692505, |
| "learning_rate": 0.0003369682493445965, |
| "loss": 3.2446, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.95073088346631, |
| "grad_norm": 0.4163340926170349, |
| "learning_rate": 0.00033679347509466933, |
| "loss": 3.233, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.9652903150661, |
| "grad_norm": 0.3981238603591919, |
| "learning_rate": 0.00033661870084474217, |
| "loss": 3.245, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.97984974666589, |
| "grad_norm": 0.3684476315975189, |
| "learning_rate": 0.000336443926594815, |
| "loss": 3.2443, |
| "step": 75500 |
| }, |
| { |
| "epoch": 21.99440917826568, |
| "grad_norm": 0.38006895780563354, |
| "learning_rate": 0.0003362691523448878, |
| "loss": 3.2413, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.008735658959875, |
| "grad_norm": 0.40769121050834656, |
| "learning_rate": 0.0003360943780949606, |
| "loss": 3.1715, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.023295090559664, |
| "grad_norm": 0.3952407240867615, |
| "learning_rate": 0.00033591960384503344, |
| "loss": 3.1341, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.037854522159456, |
| "grad_norm": 0.3831493556499481, |
| "learning_rate": 0.0003357448295951063, |
| "loss": 3.1399, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.052413953759245, |
| "grad_norm": 0.4123232960700989, |
| "learning_rate": 0.0003355700553451791, |
| "loss": 3.1501, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.066973385359034, |
| "grad_norm": 0.3950226902961731, |
| "learning_rate": 0.0003353952810952519, |
| "loss": 3.1533, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.081532816958827, |
| "grad_norm": 0.36806586384773254, |
| "learning_rate": 0.0003352205068453247, |
| "loss": 3.1547, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.096092248558616, |
| "grad_norm": 0.37511464953422546, |
| "learning_rate": 0.00033504573259539756, |
| "loss": 3.1488, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.110651680158405, |
| "grad_norm": 0.38706332445144653, |
| "learning_rate": 0.0003348709583454704, |
| "loss": 3.163, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.125211111758198, |
| "grad_norm": 0.3739766776561737, |
| "learning_rate": 0.0003346961840955432, |
| "loss": 3.162, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.125211111758198, |
| "eval_accuracy": 0.37371035559339666, |
| "eval_loss": 3.5474770069122314, |
| "eval_runtime": 55.0086, |
| "eval_samples_per_second": 302.553, |
| "eval_steps_per_second": 18.924, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.139770543357987, |
| "grad_norm": 0.40529629588127136, |
| "learning_rate": 0.000334521409845616, |
| "loss": 3.1764, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.15432997495778, |
| "grad_norm": 0.4074355661869049, |
| "learning_rate": 0.0003343466355956889, |
| "loss": 3.1671, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.168889406557568, |
| "grad_norm": 0.39302748441696167, |
| "learning_rate": 0.0003341718613457617, |
| "loss": 3.1593, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.183448838157357, |
| "grad_norm": 0.43343594670295715, |
| "learning_rate": 0.00033399708709583455, |
| "loss": 3.1704, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.19800826975715, |
| "grad_norm": 0.459338903427124, |
| "learning_rate": 0.0003338223128459074, |
| "loss": 3.1819, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.21256770135694, |
| "grad_norm": 0.43016624450683594, |
| "learning_rate": 0.00033364753859598016, |
| "loss": 3.174, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.227127132956728, |
| "grad_norm": 0.4130750000476837, |
| "learning_rate": 0.000333472764346053, |
| "loss": 3.1781, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.24168656455652, |
| "grad_norm": 0.4306490123271942, |
| "learning_rate": 0.00033329799009612583, |
| "loss": 3.1801, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.25624599615631, |
| "grad_norm": 0.38700979948043823, |
| "learning_rate": 0.00033312321584619866, |
| "loss": 3.1954, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.270805427756102, |
| "grad_norm": 0.38101381063461304, |
| "learning_rate": 0.0003329484415962715, |
| "loss": 3.1823, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.28536485935589, |
| "grad_norm": 0.3876706063747406, |
| "learning_rate": 0.0003327736673463443, |
| "loss": 3.1935, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.29992429095568, |
| "grad_norm": 0.41006049513816833, |
| "learning_rate": 0.0003325988930964171, |
| "loss": 3.1965, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.314483722555472, |
| "grad_norm": 0.3990377187728882, |
| "learning_rate": 0.00033242411884648994, |
| "loss": 3.1862, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.32904315415526, |
| "grad_norm": 0.408883661031723, |
| "learning_rate": 0.0003322493445965628, |
| "loss": 3.1911, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.34360258575505, |
| "grad_norm": 0.4165717363357544, |
| "learning_rate": 0.0003320745703466356, |
| "loss": 3.1866, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.358162017354843, |
| "grad_norm": 0.40794703364372253, |
| "learning_rate": 0.0003318997960967084, |
| "loss": 3.201, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.372721448954632, |
| "grad_norm": 0.43184053897857666, |
| "learning_rate": 0.0003317250218467812, |
| "loss": 3.1934, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.387280880554425, |
| "grad_norm": 0.3932684063911438, |
| "learning_rate": 0.00033155024759685405, |
| "loss": 3.1961, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.401840312154214, |
| "grad_norm": 0.4096086621284485, |
| "learning_rate": 0.0003313754733469269, |
| "loss": 3.2176, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.416399743754003, |
| "grad_norm": 0.39115527272224426, |
| "learning_rate": 0.00033120069909699966, |
| "loss": 3.1943, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.416399743754003, |
| "eval_accuracy": 0.37367707937835326, |
| "eval_loss": 3.5434892177581787, |
| "eval_runtime": 55.0354, |
| "eval_samples_per_second": 302.405, |
| "eval_steps_per_second": 18.915, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.430959175353795, |
| "grad_norm": 0.40539973974227905, |
| "learning_rate": 0.0003310259248470725, |
| "loss": 3.2077, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.445518606953584, |
| "grad_norm": 0.44398367404937744, |
| "learning_rate": 0.00033085115059714533, |
| "loss": 3.2003, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.460078038553373, |
| "grad_norm": 0.3839823305606842, |
| "learning_rate": 0.00033067637634721816, |
| "loss": 3.2015, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.474637470153166, |
| "grad_norm": 0.41733473539352417, |
| "learning_rate": 0.000330501602097291, |
| "loss": 3.1997, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.489196901752955, |
| "grad_norm": 0.41814976930618286, |
| "learning_rate": 0.0003303268278473638, |
| "loss": 3.2144, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.503756333352747, |
| "grad_norm": 0.4052809178829193, |
| "learning_rate": 0.0003301520535974366, |
| "loss": 3.1983, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.518315764952536, |
| "grad_norm": 0.3910194933414459, |
| "learning_rate": 0.00032997727934750944, |
| "loss": 3.2074, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.532875196552325, |
| "grad_norm": 0.4210590422153473, |
| "learning_rate": 0.0003298025050975823, |
| "loss": 3.2031, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.547434628152118, |
| "grad_norm": 0.4334495961666107, |
| "learning_rate": 0.0003296277308476551, |
| "loss": 3.2054, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.561994059751907, |
| "grad_norm": 0.3857288360595703, |
| "learning_rate": 0.0003294529565977279, |
| "loss": 3.2119, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.576553491351696, |
| "grad_norm": 0.3976559638977051, |
| "learning_rate": 0.0003292781823478007, |
| "loss": 3.1939, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.59111292295149, |
| "grad_norm": 0.3808916211128235, |
| "learning_rate": 0.00032910340809787355, |
| "loss": 3.2074, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.605672354551277, |
| "grad_norm": 0.38793352246284485, |
| "learning_rate": 0.0003289286338479464, |
| "loss": 3.2058, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.62023178615107, |
| "grad_norm": 0.40990737080574036, |
| "learning_rate": 0.00032875385959801916, |
| "loss": 3.2082, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.63479121775086, |
| "grad_norm": 0.40758588910102844, |
| "learning_rate": 0.000328579085348092, |
| "loss": 3.2192, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.649350649350648, |
| "grad_norm": 0.3899351954460144, |
| "learning_rate": 0.00032840431109816483, |
| "loss": 3.2044, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.66391008095044, |
| "grad_norm": 0.4190896153450012, |
| "learning_rate": 0.00032822953684823766, |
| "loss": 3.2246, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.67846951255023, |
| "grad_norm": 0.4242609739303589, |
| "learning_rate": 0.0003280547625983105, |
| "loss": 3.2182, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.693028944150022, |
| "grad_norm": 0.4129343330860138, |
| "learning_rate": 0.0003278799883483833, |
| "loss": 3.2331, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.70758837574981, |
| "grad_norm": 0.41839760541915894, |
| "learning_rate": 0.0003277052140984561, |
| "loss": 3.2206, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.70758837574981, |
| "eval_accuracy": 0.3743237902714222, |
| "eval_loss": 3.5353493690490723, |
| "eval_runtime": 55.1684, |
| "eval_samples_per_second": 301.676, |
| "eval_steps_per_second": 18.869, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.7221478073496, |
| "grad_norm": 0.3958353102207184, |
| "learning_rate": 0.00032753043984852894, |
| "loss": 3.2286, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.736707238949393, |
| "grad_norm": 0.4032213091850281, |
| "learning_rate": 0.0003273556655986018, |
| "loss": 3.2232, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.75126667054918, |
| "grad_norm": 0.3718810975551605, |
| "learning_rate": 0.00032718089134867455, |
| "loss": 3.225, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.76582610214897, |
| "grad_norm": 0.39873242378234863, |
| "learning_rate": 0.0003270061170987474, |
| "loss": 3.2038, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.780385533748763, |
| "grad_norm": 0.41898322105407715, |
| "learning_rate": 0.0003268313428488202, |
| "loss": 3.2325, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.794944965348552, |
| "grad_norm": 0.3915160596370697, |
| "learning_rate": 0.00032665656859889305, |
| "loss": 3.2263, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.80950439694834, |
| "grad_norm": 0.3975723087787628, |
| "learning_rate": 0.0003264817943489659, |
| "loss": 3.225, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.824063828548134, |
| "grad_norm": 0.4013522267341614, |
| "learning_rate": 0.00032630702009903866, |
| "loss": 3.2191, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.838623260147923, |
| "grad_norm": 0.40975821018218994, |
| "learning_rate": 0.0003261322458491115, |
| "loss": 3.2224, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.853182691747715, |
| "grad_norm": 0.4058254361152649, |
| "learning_rate": 0.00032595747159918433, |
| "loss": 3.2206, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.867742123347504, |
| "grad_norm": 0.414315789937973, |
| "learning_rate": 0.00032578269734925716, |
| "loss": 3.2192, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.882301554947293, |
| "grad_norm": 0.41001883149147034, |
| "learning_rate": 0.00032560792309933, |
| "loss": 3.2294, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.896860986547086, |
| "grad_norm": 0.4034368097782135, |
| "learning_rate": 0.0003254331488494029, |
| "loss": 3.2209, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.911420418146875, |
| "grad_norm": 0.39516010880470276, |
| "learning_rate": 0.00032525837459947566, |
| "loss": 3.2287, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.925979849746668, |
| "grad_norm": 0.4021008610725403, |
| "learning_rate": 0.0003250836003495485, |
| "loss": 3.2343, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.940539281346457, |
| "grad_norm": 0.4237155318260193, |
| "learning_rate": 0.0003249088260996213, |
| "loss": 3.2408, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.955098712946246, |
| "grad_norm": 0.3744175434112549, |
| "learning_rate": 0.00032473405184969416, |
| "loss": 3.2244, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.969658144546038, |
| "grad_norm": 0.4011383652687073, |
| "learning_rate": 0.00032455927759976694, |
| "loss": 3.2274, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.984217576145827, |
| "grad_norm": 0.3921768367290497, |
| "learning_rate": 0.00032438450334983977, |
| "loss": 3.2219, |
| "step": 78950 |
| }, |
| { |
| "epoch": 22.998777007745616, |
| "grad_norm": 0.41207054257392883, |
| "learning_rate": 0.0003242097290999126, |
| "loss": 3.2293, |
| "step": 79000 |
| }, |
| { |
| "epoch": 22.998777007745616, |
| "eval_accuracy": 0.3748646757456253, |
| "eval_loss": 3.5263609886169434, |
| "eval_runtime": 55.2416, |
| "eval_samples_per_second": 301.276, |
| "eval_steps_per_second": 18.844, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.01310348843981, |
| "grad_norm": 0.4697108566761017, |
| "learning_rate": 0.00032403495484998544, |
| "loss": 3.1308, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.027662920039603, |
| "grad_norm": 0.433270126581192, |
| "learning_rate": 0.00032386018060005827, |
| "loss": 3.1183, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.042222351639392, |
| "grad_norm": 0.3799859285354614, |
| "learning_rate": 0.00032368540635013105, |
| "loss": 3.1369, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.05678178323918, |
| "grad_norm": 0.4204369783401489, |
| "learning_rate": 0.0003235106321002039, |
| "loss": 3.1339, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.071341214838974, |
| "grad_norm": 0.42532503604888916, |
| "learning_rate": 0.0003233358578502767, |
| "loss": 3.1521, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.085900646438763, |
| "grad_norm": 0.39714616537094116, |
| "learning_rate": 0.00032316108360034955, |
| "loss": 3.1428, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.10046007803855, |
| "grad_norm": 0.4164256453514099, |
| "learning_rate": 0.0003229863093504224, |
| "loss": 3.1444, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.115019509638344, |
| "grad_norm": 0.41725417971611023, |
| "learning_rate": 0.00032281153510049516, |
| "loss": 3.1417, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.129578941238133, |
| "grad_norm": 0.4337908923625946, |
| "learning_rate": 0.000322636760850568, |
| "loss": 3.1551, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.144138372837926, |
| "grad_norm": 0.4192178249359131, |
| "learning_rate": 0.0003224619866006408, |
| "loss": 3.1568, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.158697804437715, |
| "grad_norm": 0.38770732283592224, |
| "learning_rate": 0.00032228721235071366, |
| "loss": 3.1595, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.173257236037504, |
| "grad_norm": 0.4365769922733307, |
| "learning_rate": 0.00032211243810078644, |
| "loss": 3.1469, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.187816667637296, |
| "grad_norm": 0.41161519289016724, |
| "learning_rate": 0.00032193766385085927, |
| "loss": 3.1547, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.202376099237085, |
| "grad_norm": 0.41916653513908386, |
| "learning_rate": 0.0003217628896009321, |
| "loss": 3.1662, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.216935530836878, |
| "grad_norm": 0.4116290807723999, |
| "learning_rate": 0.00032158811535100494, |
| "loss": 3.1636, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.231494962436667, |
| "grad_norm": 0.3962598443031311, |
| "learning_rate": 0.00032141334110107777, |
| "loss": 3.1618, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.246054394036456, |
| "grad_norm": 0.3972434401512146, |
| "learning_rate": 0.00032123856685115055, |
| "loss": 3.1679, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.26061382563625, |
| "grad_norm": 0.42853844165802, |
| "learning_rate": 0.0003210637926012234, |
| "loss": 3.1707, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.275173257236037, |
| "grad_norm": 0.4082277715206146, |
| "learning_rate": 0.0003208890183512962, |
| "loss": 3.1847, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.289732688835826, |
| "grad_norm": 0.4235358238220215, |
| "learning_rate": 0.00032071424410136905, |
| "loss": 3.1645, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.289732688835826, |
| "eval_accuracy": 0.3736458020878885, |
| "eval_loss": 3.548175811767578, |
| "eval_runtime": 55.4012, |
| "eval_samples_per_second": 300.409, |
| "eval_steps_per_second": 18.79, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.30429212043562, |
| "grad_norm": 0.4308682680130005, |
| "learning_rate": 0.0003205394698514419, |
| "loss": 3.1709, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.318851552035408, |
| "grad_norm": 0.40191811323165894, |
| "learning_rate": 0.00032036469560151466, |
| "loss": 3.1735, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.3334109836352, |
| "grad_norm": 0.43274742364883423, |
| "learning_rate": 0.0003201899213515875, |
| "loss": 3.1868, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.34797041523499, |
| "grad_norm": 0.4104597270488739, |
| "learning_rate": 0.0003200151471016603, |
| "loss": 3.1871, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.36252984683478, |
| "grad_norm": 0.419541597366333, |
| "learning_rate": 0.00031984037285173316, |
| "loss": 3.1847, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.37708927843457, |
| "grad_norm": 0.3963826596736908, |
| "learning_rate": 0.00031966559860180594, |
| "loss": 3.1771, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.39164871003436, |
| "grad_norm": 0.4283881187438965, |
| "learning_rate": 0.00031949082435187877, |
| "loss": 3.1902, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.40620814163415, |
| "grad_norm": 0.40874427556991577, |
| "learning_rate": 0.0003193160501019516, |
| "loss": 3.1907, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.42076757323394, |
| "grad_norm": 0.41835615038871765, |
| "learning_rate": 0.00031914127585202444, |
| "loss": 3.1836, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.43532700483373, |
| "grad_norm": 0.39473778009414673, |
| "learning_rate": 0.00031896650160209727, |
| "loss": 3.1851, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.449886436433523, |
| "grad_norm": 0.4216783344745636, |
| "learning_rate": 0.00031879172735217005, |
| "loss": 3.187, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.464445868033312, |
| "grad_norm": 0.4428766071796417, |
| "learning_rate": 0.0003186169531022429, |
| "loss": 3.1891, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.4790052996331, |
| "grad_norm": 0.40239444375038147, |
| "learning_rate": 0.0003184421788523157, |
| "loss": 3.1913, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.493564731232894, |
| "grad_norm": 0.4475367069244385, |
| "learning_rate": 0.00031826740460238855, |
| "loss": 3.201, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.508124162832683, |
| "grad_norm": 0.4128247797489166, |
| "learning_rate": 0.0003180926303524614, |
| "loss": 3.1934, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.522683594432472, |
| "grad_norm": 0.42478007078170776, |
| "learning_rate": 0.00031791785610253416, |
| "loss": 3.2025, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.537243026032264, |
| "grad_norm": 0.39820677042007446, |
| "learning_rate": 0.000317743081852607, |
| "loss": 3.1953, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.551802457632053, |
| "grad_norm": 0.41164451837539673, |
| "learning_rate": 0.0003175683076026798, |
| "loss": 3.2044, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.566361889231846, |
| "grad_norm": 0.42769721150398254, |
| "learning_rate": 0.00031739353335275266, |
| "loss": 3.2048, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.580921320831635, |
| "grad_norm": 0.37441954016685486, |
| "learning_rate": 0.00031721875910282544, |
| "loss": 3.201, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.580921320831635, |
| "eval_accuracy": 0.3742237264586946, |
| "eval_loss": 3.538792610168457, |
| "eval_runtime": 55.0335, |
| "eval_samples_per_second": 302.416, |
| "eval_steps_per_second": 18.916, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.595480752431424, |
| "grad_norm": 0.42640015482902527, |
| "learning_rate": 0.00031704398485289827, |
| "loss": 3.2071, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.610040184031217, |
| "grad_norm": 0.40239205956459045, |
| "learning_rate": 0.0003168692106029711, |
| "loss": 3.2035, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.624599615631006, |
| "grad_norm": 0.3871544897556305, |
| "learning_rate": 0.000316694436353044, |
| "loss": 3.2138, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.639159047230795, |
| "grad_norm": 0.3887910842895508, |
| "learning_rate": 0.0003165196621031168, |
| "loss": 3.2048, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.653718478830587, |
| "grad_norm": 0.3865518569946289, |
| "learning_rate": 0.00031634488785318966, |
| "loss": 3.214, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.668277910430376, |
| "grad_norm": 0.403149276971817, |
| "learning_rate": 0.00031617011360326243, |
| "loss": 3.2221, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.68283734203017, |
| "grad_norm": 0.40452221035957336, |
| "learning_rate": 0.00031599533935333527, |
| "loss": 3.1981, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.697396773629958, |
| "grad_norm": 0.4011068642139435, |
| "learning_rate": 0.0003158205651034081, |
| "loss": 3.1904, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.711956205229747, |
| "grad_norm": 0.4230940043926239, |
| "learning_rate": 0.00031564579085348093, |
| "loss": 3.2106, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.72651563682954, |
| "grad_norm": 0.3911479711532593, |
| "learning_rate": 0.0003154710166035537, |
| "loss": 3.2152, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.74107506842933, |
| "grad_norm": 0.41586339473724365, |
| "learning_rate": 0.00031529624235362655, |
| "loss": 3.2144, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.755634500029117, |
| "grad_norm": 0.4051753580570221, |
| "learning_rate": 0.0003151214681036994, |
| "loss": 3.2231, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.77019393162891, |
| "grad_norm": 0.41884034872055054, |
| "learning_rate": 0.0003149466938537722, |
| "loss": 3.2185, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.7847533632287, |
| "grad_norm": 0.38643860816955566, |
| "learning_rate": 0.00031477191960384504, |
| "loss": 3.2289, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.79931279482849, |
| "grad_norm": 0.3834930956363678, |
| "learning_rate": 0.0003145971453539178, |
| "loss": 3.2076, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.81387222642828, |
| "grad_norm": 0.43425455689430237, |
| "learning_rate": 0.00031442237110399066, |
| "loss": 3.2109, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.82843165802807, |
| "grad_norm": 0.40798330307006836, |
| "learning_rate": 0.0003142475968540635, |
| "loss": 3.2153, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.842991089627862, |
| "grad_norm": 0.387335866689682, |
| "learning_rate": 0.0003140728226041363, |
| "loss": 3.2292, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.85755052122765, |
| "grad_norm": 0.4191502332687378, |
| "learning_rate": 0.00031389804835420915, |
| "loss": 3.2335, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.87210995282744, |
| "grad_norm": 0.4108533263206482, |
| "learning_rate": 0.00031372327410428193, |
| "loss": 3.2211, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.87210995282744, |
| "eval_accuracy": 0.3747248686089237, |
| "eval_loss": 3.529864549636841, |
| "eval_runtime": 55.2779, |
| "eval_samples_per_second": 301.079, |
| "eval_steps_per_second": 18.832, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.886669384427233, |
| "grad_norm": 0.38015156984329224, |
| "learning_rate": 0.00031354849985435477, |
| "loss": 3.2234, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.90122881602702, |
| "grad_norm": 0.4143611490726471, |
| "learning_rate": 0.0003133737256044276, |
| "loss": 3.2185, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.915788247626814, |
| "grad_norm": 0.4278761148452759, |
| "learning_rate": 0.00031319895135450043, |
| "loss": 3.223, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.930347679226603, |
| "grad_norm": 0.3869278132915497, |
| "learning_rate": 0.0003130241771045732, |
| "loss": 3.2287, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.944907110826392, |
| "grad_norm": 0.40504610538482666, |
| "learning_rate": 0.00031284940285464604, |
| "loss": 3.2226, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.959466542426185, |
| "grad_norm": 0.4033997654914856, |
| "learning_rate": 0.0003126746286047189, |
| "loss": 3.2121, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.974025974025974, |
| "grad_norm": 0.4058132469654083, |
| "learning_rate": 0.0003124998543547917, |
| "loss": 3.2233, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.988585405625763, |
| "grad_norm": 0.4490283727645874, |
| "learning_rate": 0.00031232508010486454, |
| "loss": 3.2168, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.002911886319957, |
| "grad_norm": 0.4183407723903656, |
| "learning_rate": 0.0003121503058549373, |
| "loss": 3.2061, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.01747131791975, |
| "grad_norm": 0.42069587111473083, |
| "learning_rate": 0.00031197553160501016, |
| "loss": 3.1255, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.03203074951954, |
| "grad_norm": 0.39097297191619873, |
| "learning_rate": 0.000311800757355083, |
| "loss": 3.1343, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.046590181119328, |
| "grad_norm": 0.3835448920726776, |
| "learning_rate": 0.0003116259831051558, |
| "loss": 3.1294, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.06114961271912, |
| "grad_norm": 0.46320199966430664, |
| "learning_rate": 0.00031145120885522865, |
| "loss": 3.1294, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.07570904431891, |
| "grad_norm": 0.4252343773841858, |
| "learning_rate": 0.00031127643460530143, |
| "loss": 3.1271, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.090268475918702, |
| "grad_norm": 0.41149669885635376, |
| "learning_rate": 0.00031110166035537427, |
| "loss": 3.1377, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.10482790751849, |
| "grad_norm": 0.4045622944831848, |
| "learning_rate": 0.0003109268861054471, |
| "loss": 3.1455, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.11938733911828, |
| "grad_norm": 0.40084847807884216, |
| "learning_rate": 0.00031075211185551993, |
| "loss": 3.1424, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.133946770718072, |
| "grad_norm": 0.40093955397605896, |
| "learning_rate": 0.0003105773376055927, |
| "loss": 3.1459, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.14850620231786, |
| "grad_norm": 0.40974506735801697, |
| "learning_rate": 0.00031040256335566554, |
| "loss": 3.1353, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.16306563391765, |
| "grad_norm": 0.4075878858566284, |
| "learning_rate": 0.0003102277891057384, |
| "loss": 3.1515, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.16306563391765, |
| "eval_accuracy": 0.37420162070453156, |
| "eval_loss": 3.5482983589172363, |
| "eval_runtime": 55.4288, |
| "eval_samples_per_second": 300.259, |
| "eval_steps_per_second": 18.781, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.177625065517443, |
| "grad_norm": 0.47770339250564575, |
| "learning_rate": 0.0003100530148558112, |
| "loss": 3.14, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.192184497117232, |
| "grad_norm": 0.42492687702178955, |
| "learning_rate": 0.00030987824060588404, |
| "loss": 3.1502, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.206743928717025, |
| "grad_norm": 0.4377771317958832, |
| "learning_rate": 0.0003097034663559568, |
| "loss": 3.135, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.221303360316814, |
| "grad_norm": 0.407498300075531, |
| "learning_rate": 0.00030952869210602965, |
| "loss": 3.1564, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.235862791916603, |
| "grad_norm": 0.40489810705184937, |
| "learning_rate": 0.0003093539178561025, |
| "loss": 3.1645, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.250422223516395, |
| "grad_norm": 0.4017831087112427, |
| "learning_rate": 0.0003091791436061753, |
| "loss": 3.1623, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.264981655116184, |
| "grad_norm": 0.4044218063354492, |
| "learning_rate": 0.00030900436935624815, |
| "loss": 3.1628, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.279541086715973, |
| "grad_norm": 0.4220918118953705, |
| "learning_rate": 0.00030882959510632093, |
| "loss": 3.1707, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.294100518315766, |
| "grad_norm": 0.4163309335708618, |
| "learning_rate": 0.00030865482085639377, |
| "loss": 3.1552, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.308659949915555, |
| "grad_norm": 0.41771310567855835, |
| "learning_rate": 0.0003084800466064666, |
| "loss": 3.1748, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.323219381515347, |
| "grad_norm": 0.43180546164512634, |
| "learning_rate": 0.00030830527235653943, |
| "loss": 3.1792, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.337778813115136, |
| "grad_norm": 0.4262148141860962, |
| "learning_rate": 0.0003081304981066122, |
| "loss": 3.1772, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.352338244714925, |
| "grad_norm": 0.4312186539173126, |
| "learning_rate": 0.00030795572385668504, |
| "loss": 3.1681, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.366897676314718, |
| "grad_norm": 0.40577229857444763, |
| "learning_rate": 0.00030778094960675793, |
| "loss": 3.1762, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.381457107914507, |
| "grad_norm": 0.3947165906429291, |
| "learning_rate": 0.00030760617535683076, |
| "loss": 3.1619, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.396016539514296, |
| "grad_norm": 0.40584006905555725, |
| "learning_rate": 0.0003074314011069036, |
| "loss": 3.1753, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.41057597111409, |
| "grad_norm": 0.4310337007045746, |
| "learning_rate": 0.00030725662685697643, |
| "loss": 3.169, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.425135402713877, |
| "grad_norm": 0.4535801410675049, |
| "learning_rate": 0.0003070818526070492, |
| "loss": 3.1758, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.43969483431367, |
| "grad_norm": 0.4137560725212097, |
| "learning_rate": 0.00030690707835712204, |
| "loss": 3.1876, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.45425426591346, |
| "grad_norm": 0.4285685420036316, |
| "learning_rate": 0.0003067323041071949, |
| "loss": 3.1878, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.45425426591346, |
| "eval_accuracy": 0.37403418137512606, |
| "eval_loss": 3.5453009605407715, |
| "eval_runtime": 55.213, |
| "eval_samples_per_second": 301.432, |
| "eval_steps_per_second": 18.854, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.468813697513248, |
| "grad_norm": 0.40945473313331604, |
| "learning_rate": 0.0003065575298572677, |
| "loss": 3.1896, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.48337312911304, |
| "grad_norm": 0.44182032346725464, |
| "learning_rate": 0.00030638275560734054, |
| "loss": 3.1842, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.49793256071283, |
| "grad_norm": 0.43453431129455566, |
| "learning_rate": 0.0003062079813574133, |
| "loss": 3.1918, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.51249199231262, |
| "grad_norm": 0.3916553556919098, |
| "learning_rate": 0.00030603320710748615, |
| "loss": 3.1792, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.52705142391241, |
| "grad_norm": 0.4035761058330536, |
| "learning_rate": 0.000305858432857559, |
| "loss": 3.1862, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.5416108555122, |
| "grad_norm": 0.4316408932209015, |
| "learning_rate": 0.0003056836586076318, |
| "loss": 3.1937, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.556170287111993, |
| "grad_norm": 0.43183833360671997, |
| "learning_rate": 0.0003055088843577046, |
| "loss": 3.1832, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.57072971871178, |
| "grad_norm": 0.40193501114845276, |
| "learning_rate": 0.00030533411010777743, |
| "loss": 3.199, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.58528915031157, |
| "grad_norm": 0.4043532609939575, |
| "learning_rate": 0.00030515933585785026, |
| "loss": 3.1868, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.599848581911363, |
| "grad_norm": 0.40620195865631104, |
| "learning_rate": 0.0003049845616079231, |
| "loss": 3.1785, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.614408013511152, |
| "grad_norm": 0.41706621646881104, |
| "learning_rate": 0.00030480978735799593, |
| "loss": 3.205, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.62896744511094, |
| "grad_norm": 0.41417932510375977, |
| "learning_rate": 0.0003046350131080687, |
| "loss": 3.186, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.643526876710734, |
| "grad_norm": 0.45793968439102173, |
| "learning_rate": 0.00030446023885814154, |
| "loss": 3.1948, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.658086308310523, |
| "grad_norm": 0.4425913393497467, |
| "learning_rate": 0.0003042854646082144, |
| "loss": 3.1952, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.672645739910315, |
| "grad_norm": 0.41631364822387695, |
| "learning_rate": 0.0003041106903582872, |
| "loss": 3.1955, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.687205171510104, |
| "grad_norm": 0.395309180021286, |
| "learning_rate": 0.00030393591610836, |
| "loss": 3.2076, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.701764603109893, |
| "grad_norm": 0.4197699725627899, |
| "learning_rate": 0.0003037611418584328, |
| "loss": 3.2049, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.716324034709686, |
| "grad_norm": 0.44752877950668335, |
| "learning_rate": 0.00030358636760850565, |
| "loss": 3.2045, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.730883466309475, |
| "grad_norm": 0.43083128333091736, |
| "learning_rate": 0.0003034115933585785, |
| "loss": 3.203, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.745442897909264, |
| "grad_norm": 0.4351908564567566, |
| "learning_rate": 0.0003032368191086513, |
| "loss": 3.2018, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.745442897909264, |
| "eval_accuracy": 0.3745138056901857, |
| "eval_loss": 3.534050941467285, |
| "eval_runtime": 54.5127, |
| "eval_samples_per_second": 305.305, |
| "eval_steps_per_second": 19.096, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.760002329509057, |
| "grad_norm": 0.3845418393611908, |
| "learning_rate": 0.0003030620448587241, |
| "loss": 3.2058, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.774561761108846, |
| "grad_norm": 0.3963243365287781, |
| "learning_rate": 0.00030288727060879693, |
| "loss": 3.2008, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.789121192708638, |
| "grad_norm": 0.4321513772010803, |
| "learning_rate": 0.00030271249635886976, |
| "loss": 3.1983, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.803680624308427, |
| "grad_norm": 0.44876664876937866, |
| "learning_rate": 0.0003025377221089426, |
| "loss": 3.2135, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.818240055908216, |
| "grad_norm": 0.409246563911438, |
| "learning_rate": 0.00030236294785901543, |
| "loss": 3.217, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.83279948750801, |
| "grad_norm": 0.4117927551269531, |
| "learning_rate": 0.0003021881736090882, |
| "loss": 3.215, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.847358919107798, |
| "grad_norm": 0.40099072456359863, |
| "learning_rate": 0.00030201339935916104, |
| "loss": 3.2129, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.861918350707587, |
| "grad_norm": 0.3935149908065796, |
| "learning_rate": 0.00030183862510923387, |
| "loss": 3.218, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.87647778230738, |
| "grad_norm": 0.39060723781585693, |
| "learning_rate": 0.0003016638508593067, |
| "loss": 3.1989, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.89103721390717, |
| "grad_norm": 0.41106799244880676, |
| "learning_rate": 0.0003014890766093795, |
| "loss": 3.2128, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.90559664550696, |
| "grad_norm": 0.39192450046539307, |
| "learning_rate": 0.0003013143023594523, |
| "loss": 3.2181, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.92015607710675, |
| "grad_norm": 0.40416061878204346, |
| "learning_rate": 0.00030113952810952515, |
| "loss": 3.2211, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.93471550870654, |
| "grad_norm": 0.3965405821800232, |
| "learning_rate": 0.000300964753859598, |
| "loss": 3.2135, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.94927494030633, |
| "grad_norm": 0.42308685183525085, |
| "learning_rate": 0.0003007899796096708, |
| "loss": 3.2049, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.96383437190612, |
| "grad_norm": 0.41107550263404846, |
| "learning_rate": 0.0003006152053597436, |
| "loss": 3.2127, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.97839380350591, |
| "grad_norm": 0.42214998602867126, |
| "learning_rate": 0.00030044043110981643, |
| "loss": 3.2114, |
| "step": 85800 |
| }, |
| { |
| "epoch": 24.992953235105702, |
| "grad_norm": 0.4449729323387146, |
| "learning_rate": 0.00030026565685988926, |
| "loss": 3.2252, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.007279715799896, |
| "grad_norm": 0.4267348349094391, |
| "learning_rate": 0.0003000908826099621, |
| "loss": 3.1622, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.021839147399685, |
| "grad_norm": 0.40839192271232605, |
| "learning_rate": 0.0002999161083600349, |
| "loss": 3.106, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.036398578999474, |
| "grad_norm": 0.4623876214027405, |
| "learning_rate": 0.00029974133411010776, |
| "loss": 3.1168, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.036398578999474, |
| "eval_accuracy": 0.37416763898669575, |
| "eval_loss": 3.5501725673675537, |
| "eval_runtime": 55.1681, |
| "eval_samples_per_second": 301.678, |
| "eval_steps_per_second": 18.87, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.050958010599267, |
| "grad_norm": 0.41614830493927, |
| "learning_rate": 0.0002995665598601806, |
| "loss": 3.125, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.065517442199056, |
| "grad_norm": 0.3899344801902771, |
| "learning_rate": 0.00029939178561025337, |
| "loss": 3.1259, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.08007687379885, |
| "grad_norm": 0.4196447432041168, |
| "learning_rate": 0.0002992170113603262, |
| "loss": 3.1196, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.094636305398637, |
| "grad_norm": 0.43210721015930176, |
| "learning_rate": 0.00029904223711039904, |
| "loss": 3.1397, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.109195736998426, |
| "grad_norm": 0.4325079321861267, |
| "learning_rate": 0.00029886746286047187, |
| "loss": 3.1354, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.12375516859822, |
| "grad_norm": 0.39312827587127686, |
| "learning_rate": 0.0002986926886105447, |
| "loss": 3.1388, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.138314600198008, |
| "grad_norm": 0.4397601783275604, |
| "learning_rate": 0.0002985179143606175, |
| "loss": 3.1425, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.152874031797797, |
| "grad_norm": 0.446545273065567, |
| "learning_rate": 0.0002983431401106903, |
| "loss": 3.1282, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.16743346339759, |
| "grad_norm": 0.43214336037635803, |
| "learning_rate": 0.00029816836586076315, |
| "loss": 3.1458, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.18199289499738, |
| "grad_norm": 0.3953275680541992, |
| "learning_rate": 0.000297993591610836, |
| "loss": 3.1296, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.19655232659717, |
| "grad_norm": 0.40356674790382385, |
| "learning_rate": 0.0002978188173609088, |
| "loss": 3.1467, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.21111175819696, |
| "grad_norm": 0.41587722301483154, |
| "learning_rate": 0.0002976440431109816, |
| "loss": 3.1566, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.22567118979675, |
| "grad_norm": 0.39485782384872437, |
| "learning_rate": 0.0002974692688610544, |
| "loss": 3.1511, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.24023062139654, |
| "grad_norm": 0.41744348406791687, |
| "learning_rate": 0.00029729449461112726, |
| "loss": 3.1639, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.25479005299633, |
| "grad_norm": 0.4251966178417206, |
| "learning_rate": 0.0002971197203612001, |
| "loss": 3.1446, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.26934948459612, |
| "grad_norm": 0.4173142611980438, |
| "learning_rate": 0.00029694494611127287, |
| "loss": 3.1646, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.283908916195912, |
| "grad_norm": 0.401950478553772, |
| "learning_rate": 0.00029677017186134576, |
| "loss": 3.1517, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.2984683477957, |
| "grad_norm": 0.4227960407733917, |
| "learning_rate": 0.0002965953976114186, |
| "loss": 3.1665, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.313027779395494, |
| "grad_norm": 0.44079259037971497, |
| "learning_rate": 0.00029642062336149137, |
| "loss": 3.161, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.327587210995283, |
| "grad_norm": 0.3958124816417694, |
| "learning_rate": 0.0002962458491115642, |
| "loss": 3.158, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.327587210995283, |
| "eval_accuracy": 0.37426346978266867, |
| "eval_loss": 3.545079469680786, |
| "eval_runtime": 55.2417, |
| "eval_samples_per_second": 301.276, |
| "eval_steps_per_second": 18.844, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.342146642595072, |
| "grad_norm": 0.4321572184562683, |
| "learning_rate": 0.00029607107486163704, |
| "loss": 3.1602, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.356706074194864, |
| "grad_norm": 0.4014240503311157, |
| "learning_rate": 0.00029589630061170987, |
| "loss": 3.1579, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.371265505794653, |
| "grad_norm": 0.4316968619823456, |
| "learning_rate": 0.0002957215263617827, |
| "loss": 3.1832, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.385824937394442, |
| "grad_norm": 0.423284649848938, |
| "learning_rate": 0.0002955467521118555, |
| "loss": 3.162, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.400384368994235, |
| "grad_norm": 0.4421418011188507, |
| "learning_rate": 0.0002953719778619283, |
| "loss": 3.1639, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.414943800594024, |
| "grad_norm": 0.4119024872779846, |
| "learning_rate": 0.00029519720361200115, |
| "loss": 3.1674, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.429503232193817, |
| "grad_norm": 0.39992815256118774, |
| "learning_rate": 0.000295022429362074, |
| "loss": 3.1677, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.444062663793606, |
| "grad_norm": 0.42137086391448975, |
| "learning_rate": 0.00029484765511214676, |
| "loss": 3.1666, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.458622095393395, |
| "grad_norm": 0.42965787649154663, |
| "learning_rate": 0.0002946728808622196, |
| "loss": 3.1711, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.473181526993187, |
| "grad_norm": 0.4427332878112793, |
| "learning_rate": 0.0002944981066122924, |
| "loss": 3.1795, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.487740958592976, |
| "grad_norm": 0.43246063590049744, |
| "learning_rate": 0.00029432333236236526, |
| "loss": 3.1767, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.502300390192765, |
| "grad_norm": 0.44357001781463623, |
| "learning_rate": 0.0002941485581124381, |
| "loss": 3.1766, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.516859821792558, |
| "grad_norm": 0.4451310932636261, |
| "learning_rate": 0.00029397378386251087, |
| "loss": 3.1828, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.531419253392347, |
| "grad_norm": 0.4538399875164032, |
| "learning_rate": 0.0002937990096125837, |
| "loss": 3.1741, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.54597868499214, |
| "grad_norm": 0.4227699339389801, |
| "learning_rate": 0.00029362423536265654, |
| "loss": 3.1792, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.56053811659193, |
| "grad_norm": 0.4142102003097534, |
| "learning_rate": 0.00029344946111272937, |
| "loss": 3.1811, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.575097548191717, |
| "grad_norm": 0.4136430323123932, |
| "learning_rate": 0.0002932746868628022, |
| "loss": 3.1922, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.58965697979151, |
| "grad_norm": 0.435993492603302, |
| "learning_rate": 0.000293099912612875, |
| "loss": 3.1807, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.6042164113913, |
| "grad_norm": 0.43335679173469543, |
| "learning_rate": 0.0002929251383629478, |
| "loss": 3.1829, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.61877584299109, |
| "grad_norm": 0.4427852928638458, |
| "learning_rate": 0.00029275036411302065, |
| "loss": 3.1849, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.61877584299109, |
| "eval_accuracy": 0.37467183831569206, |
| "eval_loss": 3.540363311767578, |
| "eval_runtime": 54.1234, |
| "eval_samples_per_second": 307.501, |
| "eval_steps_per_second": 19.234, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.63333527459088, |
| "grad_norm": 0.42897364497184753, |
| "learning_rate": 0.0002925755898630935, |
| "loss": 3.1897, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.64789470619067, |
| "grad_norm": 0.42489439249038696, |
| "learning_rate": 0.0002924008156131663, |
| "loss": 3.1953, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.662454137790462, |
| "grad_norm": 0.4370625615119934, |
| "learning_rate": 0.00029222604136323915, |
| "loss": 3.1984, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.67701356939025, |
| "grad_norm": 0.4505648612976074, |
| "learning_rate": 0.000292051267113312, |
| "loss": 3.2036, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.69157300099004, |
| "grad_norm": 0.4252602458000183, |
| "learning_rate": 0.00029187649286338476, |
| "loss": 3.1819, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.706132432589833, |
| "grad_norm": 0.4348510205745697, |
| "learning_rate": 0.0002917017186134576, |
| "loss": 3.1889, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.72069186418962, |
| "grad_norm": 0.3967111110687256, |
| "learning_rate": 0.0002915269443635304, |
| "loss": 3.1895, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.73525129578941, |
| "grad_norm": 0.4344504773616791, |
| "learning_rate": 0.00029135217011360326, |
| "loss": 3.1952, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.749810727389203, |
| "grad_norm": 0.3971911072731018, |
| "learning_rate": 0.0002911773958636761, |
| "loss": 3.1879, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.764370158988992, |
| "grad_norm": 0.41786593198776245, |
| "learning_rate": 0.00029100262161374887, |
| "loss": 3.2031, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.778929590588785, |
| "grad_norm": 0.4138876497745514, |
| "learning_rate": 0.0002908278473638217, |
| "loss": 3.1932, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.793489022188574, |
| "grad_norm": 0.42914995551109314, |
| "learning_rate": 0.00029065307311389453, |
| "loss": 3.1864, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.808048453788363, |
| "grad_norm": 0.41269779205322266, |
| "learning_rate": 0.00029047829886396737, |
| "loss": 3.2, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.822607885388155, |
| "grad_norm": 0.39072921872138977, |
| "learning_rate": 0.0002903035246140402, |
| "loss": 3.195, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.837167316987944, |
| "grad_norm": 0.42368221282958984, |
| "learning_rate": 0.000290128750364113, |
| "loss": 3.1982, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.851726748587737, |
| "grad_norm": 0.38363558053970337, |
| "learning_rate": 0.0002899539761141858, |
| "loss": 3.2075, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.866286180187526, |
| "grad_norm": 0.4270095229148865, |
| "learning_rate": 0.00028977920186425864, |
| "loss": 3.2044, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.880845611787315, |
| "grad_norm": 0.4175295829772949, |
| "learning_rate": 0.0002896044276143315, |
| "loss": 3.1821, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.895405043387107, |
| "grad_norm": 0.4077088534832001, |
| "learning_rate": 0.00028942965336440426, |
| "loss": 3.1904, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.909964474986896, |
| "grad_norm": 0.41781890392303467, |
| "learning_rate": 0.0002892548791144771, |
| "loss": 3.1933, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.909964474986896, |
| "eval_accuracy": 0.37507350457218724, |
| "eval_loss": 3.53283953666687, |
| "eval_runtime": 53.8952, |
| "eval_samples_per_second": 308.803, |
| "eval_steps_per_second": 19.315, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.924523906586685, |
| "grad_norm": 0.412068247795105, |
| "learning_rate": 0.0002890801048645499, |
| "loss": 3.1933, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.939083338186478, |
| "grad_norm": 0.4057926535606384, |
| "learning_rate": 0.00028890533061462276, |
| "loss": 3.2093, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.953642769786267, |
| "grad_norm": 0.41411447525024414, |
| "learning_rate": 0.0002887305563646956, |
| "loss": 3.2055, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.968202201386056, |
| "grad_norm": 0.3971509039402008, |
| "learning_rate": 0.00028855578211476837, |
| "loss": 3.2091, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.98276163298585, |
| "grad_norm": 0.40727517008781433, |
| "learning_rate": 0.0002883810078648412, |
| "loss": 3.2159, |
| "step": 89250 |
| }, |
| { |
| "epoch": 25.997321064585638, |
| "grad_norm": 0.4136078655719757, |
| "learning_rate": 0.00028820623361491403, |
| "loss": 3.2117, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.011647545279832, |
| "grad_norm": 0.40786686539649963, |
| "learning_rate": 0.00028803145936498687, |
| "loss": 3.1265, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.02620697687962, |
| "grad_norm": 0.4166758060455322, |
| "learning_rate": 0.0002878566851150597, |
| "loss": 3.1048, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.040766408479413, |
| "grad_norm": 0.46875035762786865, |
| "learning_rate": 0.00028768191086513253, |
| "loss": 3.109, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.055325840079202, |
| "grad_norm": 0.44817620515823364, |
| "learning_rate": 0.00028750713661520536, |
| "loss": 3.1166, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.069885271678995, |
| "grad_norm": 0.423794150352478, |
| "learning_rate": 0.00028733236236527814, |
| "loss": 3.1114, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.084444703278784, |
| "grad_norm": 0.4060429036617279, |
| "learning_rate": 0.000287157588115351, |
| "loss": 3.1176, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.099004134878573, |
| "grad_norm": 0.4524560272693634, |
| "learning_rate": 0.0002869828138654238, |
| "loss": 3.1338, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.113563566478366, |
| "grad_norm": 0.40616855025291443, |
| "learning_rate": 0.00028680803961549664, |
| "loss": 3.1233, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.128122998078155, |
| "grad_norm": 0.4383051097393036, |
| "learning_rate": 0.0002866332653655695, |
| "loss": 3.1301, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.142682429677944, |
| "grad_norm": 0.4387468099594116, |
| "learning_rate": 0.00028645849111564225, |
| "loss": 3.1249, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.157241861277736, |
| "grad_norm": 0.40583929419517517, |
| "learning_rate": 0.0002862837168657151, |
| "loss": 3.1334, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.171801292877525, |
| "grad_norm": 0.4405916631221771, |
| "learning_rate": 0.0002861089426157879, |
| "loss": 3.1373, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.186360724477318, |
| "grad_norm": 0.42231664061546326, |
| "learning_rate": 0.00028593416836586075, |
| "loss": 3.1288, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.200920156077107, |
| "grad_norm": 0.4573913514614105, |
| "learning_rate": 0.0002857593941159336, |
| "loss": 3.1394, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.200920156077107, |
| "eval_accuracy": 0.3741908029950475, |
| "eval_loss": 3.55145263671875, |
| "eval_runtime": 53.8903, |
| "eval_samples_per_second": 308.831, |
| "eval_steps_per_second": 19.317, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.215479587676896, |
| "grad_norm": 0.427679181098938, |
| "learning_rate": 0.00028558461986600637, |
| "loss": 3.1458, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.23003901927669, |
| "grad_norm": 0.4170896112918854, |
| "learning_rate": 0.0002854098456160792, |
| "loss": 3.1466, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.244598450876477, |
| "grad_norm": 0.42612671852111816, |
| "learning_rate": 0.00028523507136615203, |
| "loss": 3.1484, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.259157882476266, |
| "grad_norm": 0.4112977385520935, |
| "learning_rate": 0.00028506029711622486, |
| "loss": 3.1561, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.27371731407606, |
| "grad_norm": 0.42549455165863037, |
| "learning_rate": 0.00028488552286629764, |
| "loss": 3.148, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.288276745675848, |
| "grad_norm": 0.41805240511894226, |
| "learning_rate": 0.0002847107486163705, |
| "loss": 3.1499, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.30283617727564, |
| "grad_norm": 0.42571932077407837, |
| "learning_rate": 0.0002845359743664433, |
| "loss": 3.1329, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.31739560887543, |
| "grad_norm": 0.4267009198665619, |
| "learning_rate": 0.00028436120011651614, |
| "loss": 3.1595, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.33195504047522, |
| "grad_norm": 0.402904748916626, |
| "learning_rate": 0.000284186425866589, |
| "loss": 3.1578, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.34651447207501, |
| "grad_norm": 0.41704311966896057, |
| "learning_rate": 0.00028401165161666175, |
| "loss": 3.1664, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.3610739036748, |
| "grad_norm": 0.43055227398872375, |
| "learning_rate": 0.0002838368773667346, |
| "loss": 3.1438, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.375633335274593, |
| "grad_norm": 0.42857736349105835, |
| "learning_rate": 0.0002836621031168075, |
| "loss": 3.1536, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.39019276687438, |
| "grad_norm": 0.40300217270851135, |
| "learning_rate": 0.00028348732886688025, |
| "loss": 3.1539, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.40475219847417, |
| "grad_norm": 0.4351729452610016, |
| "learning_rate": 0.0002833125546169531, |
| "loss": 3.1485, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.419311630073963, |
| "grad_norm": 0.4290107786655426, |
| "learning_rate": 0.0002831377803670259, |
| "loss": 3.1567, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.433871061673752, |
| "grad_norm": 0.45089462399482727, |
| "learning_rate": 0.00028296300611709875, |
| "loss": 3.1598, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.44843049327354, |
| "grad_norm": 0.4224042296409607, |
| "learning_rate": 0.00028278823186717153, |
| "loss": 3.1537, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.462989924873334, |
| "grad_norm": 0.42360836267471313, |
| "learning_rate": 0.00028261345761724436, |
| "loss": 3.1632, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.477549356473123, |
| "grad_norm": 0.42730751633644104, |
| "learning_rate": 0.0002824386833673172, |
| "loss": 3.1746, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.492108788072915, |
| "grad_norm": 0.4391239583492279, |
| "learning_rate": 0.00028226390911739003, |
| "loss": 3.1698, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.492108788072915, |
| "eval_accuracy": 0.3748994805500523, |
| "eval_loss": 3.539992094039917, |
| "eval_runtime": 53.9072, |
| "eval_samples_per_second": 308.734, |
| "eval_steps_per_second": 19.311, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.506668219672704, |
| "grad_norm": 0.42610499262809753, |
| "learning_rate": 0.00028208913486746286, |
| "loss": 3.1573, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.521227651272493, |
| "grad_norm": 0.44624412059783936, |
| "learning_rate": 0.00028191436061753564, |
| "loss": 3.1727, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.535787082872286, |
| "grad_norm": 0.3991565704345703, |
| "learning_rate": 0.0002817395863676085, |
| "loss": 3.1758, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.550346514472075, |
| "grad_norm": 0.4363832175731659, |
| "learning_rate": 0.0002815648121176813, |
| "loss": 3.1751, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.564905946071864, |
| "grad_norm": 0.4245220720767975, |
| "learning_rate": 0.00028139003786775414, |
| "loss": 3.1757, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.579465377671657, |
| "grad_norm": 0.4245074391365051, |
| "learning_rate": 0.000281215263617827, |
| "loss": 3.1701, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.594024809271446, |
| "grad_norm": 0.45923250913619995, |
| "learning_rate": 0.00028104048936789975, |
| "loss": 3.1626, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.608584240871238, |
| "grad_norm": 0.4250701367855072, |
| "learning_rate": 0.0002808657151179726, |
| "loss": 3.1795, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.623143672471027, |
| "grad_norm": 0.41687679290771484, |
| "learning_rate": 0.0002806909408680454, |
| "loss": 3.1825, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.637703104070816, |
| "grad_norm": 0.4420589804649353, |
| "learning_rate": 0.00028051616661811825, |
| "loss": 3.1845, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.65226253567061, |
| "grad_norm": 0.46142151951789856, |
| "learning_rate": 0.00028034139236819103, |
| "loss": 3.1727, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.666821967270398, |
| "grad_norm": 0.4562520682811737, |
| "learning_rate": 0.00028016661811826386, |
| "loss": 3.1747, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.681381398870187, |
| "grad_norm": 0.43289703130722046, |
| "learning_rate": 0.0002799918438683367, |
| "loss": 3.1772, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.69594083046998, |
| "grad_norm": 0.4018307030200958, |
| "learning_rate": 0.00027981706961840953, |
| "loss": 3.1794, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.71050026206977, |
| "grad_norm": 0.4325880706310272, |
| "learning_rate": 0.00027964229536848236, |
| "loss": 3.1831, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.72505969366956, |
| "grad_norm": 0.4957205057144165, |
| "learning_rate": 0.00027946752111855514, |
| "loss": 3.1828, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.73961912526935, |
| "grad_norm": 0.4486503303050995, |
| "learning_rate": 0.000279292746868628, |
| "loss": 3.1889, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.75417855686914, |
| "grad_norm": 0.4349845051765442, |
| "learning_rate": 0.00027911797261870086, |
| "loss": 3.1738, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.76873798846893, |
| "grad_norm": 0.39484739303588867, |
| "learning_rate": 0.00027894319836877364, |
| "loss": 3.1805, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.78329742006872, |
| "grad_norm": 0.427202433347702, |
| "learning_rate": 0.00027876842411884647, |
| "loss": 3.1911, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.78329742006872, |
| "eval_accuracy": 0.37494863057792555, |
| "eval_loss": 3.538325309753418, |
| "eval_runtime": 55.5984, |
| "eval_samples_per_second": 299.343, |
| "eval_steps_per_second": 18.724, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.79785685166851, |
| "grad_norm": 0.4337463080883026, |
| "learning_rate": 0.0002785936498689193, |
| "loss": 3.1873, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.812416283268302, |
| "grad_norm": 0.4569494426250458, |
| "learning_rate": 0.00027841887561899214, |
| "loss": 3.1808, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.82697571486809, |
| "grad_norm": 0.4768456816673279, |
| "learning_rate": 0.0002782441013690649, |
| "loss": 3.201, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.841535146467884, |
| "grad_norm": 0.4294453263282776, |
| "learning_rate": 0.00027806932711913775, |
| "loss": 3.1983, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.856094578067673, |
| "grad_norm": 0.42703986167907715, |
| "learning_rate": 0.0002778945528692106, |
| "loss": 3.1866, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.87065400966746, |
| "grad_norm": 0.4226187467575073, |
| "learning_rate": 0.0002777197786192834, |
| "loss": 3.1958, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.885213441267254, |
| "grad_norm": 0.41608643531799316, |
| "learning_rate": 0.00027754500436935625, |
| "loss": 3.1835, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.899772872867043, |
| "grad_norm": 0.4281024634838104, |
| "learning_rate": 0.00027737023011942903, |
| "loss": 3.1929, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.914332304466832, |
| "grad_norm": 0.4086182713508606, |
| "learning_rate": 0.00027719545586950186, |
| "loss": 3.1878, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.928891736066625, |
| "grad_norm": 0.4695216119289398, |
| "learning_rate": 0.0002770206816195747, |
| "loss": 3.2038, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.943451167666414, |
| "grad_norm": 0.4161894619464874, |
| "learning_rate": 0.0002768459073696475, |
| "loss": 3.1999, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.958010599266206, |
| "grad_norm": 0.4216294586658478, |
| "learning_rate": 0.00027667113311972036, |
| "loss": 3.1936, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.972570030865995, |
| "grad_norm": 0.4127732217311859, |
| "learning_rate": 0.00027649635886979314, |
| "loss": 3.2093, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.987129462465784, |
| "grad_norm": 0.40765687823295593, |
| "learning_rate": 0.00027632158461986597, |
| "loss": 3.1847, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.00145594315998, |
| "grad_norm": 0.44568103551864624, |
| "learning_rate": 0.0002761468103699388, |
| "loss": 3.203, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.01601537475977, |
| "grad_norm": 0.46214574575424194, |
| "learning_rate": 0.00027597203612001164, |
| "loss": 3.1023, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.03057480635956, |
| "grad_norm": 0.4300304651260376, |
| "learning_rate": 0.0002757972618700844, |
| "loss": 3.1, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.04513423795935, |
| "grad_norm": 0.4407559335231781, |
| "learning_rate": 0.00027562248762015725, |
| "loss": 3.1003, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.05969366955914, |
| "grad_norm": 0.45435330271720886, |
| "learning_rate": 0.0002754477133702301, |
| "loss": 3.1014, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.07425310115893, |
| "grad_norm": 0.4291175603866577, |
| "learning_rate": 0.0002752729391203029, |
| "loss": 3.099, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.07425310115893, |
| "eval_accuracy": 0.37463797418165495, |
| "eval_loss": 3.550109624862671, |
| "eval_runtime": 55.4523, |
| "eval_samples_per_second": 300.132, |
| "eval_steps_per_second": 18.773, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.08881253275872, |
| "grad_norm": 0.4241757392883301, |
| "learning_rate": 0.00027509816487037575, |
| "loss": 3.1121, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.103371964358512, |
| "grad_norm": 0.4496310353279114, |
| "learning_rate": 0.00027492339062044853, |
| "loss": 3.1109, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.1179313959583, |
| "grad_norm": 0.4374052882194519, |
| "learning_rate": 0.0002747486163705214, |
| "loss": 3.1225, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.132490827558094, |
| "grad_norm": 0.4774245321750641, |
| "learning_rate": 0.00027457384212059425, |
| "loss": 3.1188, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.147050259157883, |
| "grad_norm": 0.4365275204181671, |
| "learning_rate": 0.000274399067870667, |
| "loss": 3.1333, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.161609690757672, |
| "grad_norm": 0.429744154214859, |
| "learning_rate": 0.00027422429362073986, |
| "loss": 3.1266, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.176169122357464, |
| "grad_norm": 0.4345669746398926, |
| "learning_rate": 0.0002740495193708127, |
| "loss": 3.1184, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.190728553957253, |
| "grad_norm": 0.43556392192840576, |
| "learning_rate": 0.0002738747451208855, |
| "loss": 3.1196, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.205287985557042, |
| "grad_norm": 0.4418703317642212, |
| "learning_rate": 0.0002736999708709583, |
| "loss": 3.126, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.219847417156835, |
| "grad_norm": 0.4402376115322113, |
| "learning_rate": 0.00027352519662103114, |
| "loss": 3.1286, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.234406848756624, |
| "grad_norm": 0.42841628193855286, |
| "learning_rate": 0.00027335042237110397, |
| "loss": 3.1342, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.248966280356417, |
| "grad_norm": 0.42505958676338196, |
| "learning_rate": 0.0002731756481211768, |
| "loss": 3.1276, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.263525711956206, |
| "grad_norm": 0.4213034212589264, |
| "learning_rate": 0.00027300087387124964, |
| "loss": 3.1337, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.278085143555995, |
| "grad_norm": 0.4645802974700928, |
| "learning_rate": 0.0002728260996213224, |
| "loss": 3.1321, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.292644575155787, |
| "grad_norm": 0.4217985272407532, |
| "learning_rate": 0.00027265132537139525, |
| "loss": 3.1386, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.307204006755576, |
| "grad_norm": 0.4257556200027466, |
| "learning_rate": 0.0002724765511214681, |
| "loss": 3.1362, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.321763438355365, |
| "grad_norm": 0.4358249008655548, |
| "learning_rate": 0.0002723017768715409, |
| "loss": 3.14, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.336322869955158, |
| "grad_norm": 0.42620640993118286, |
| "learning_rate": 0.00027212700262161375, |
| "loss": 3.1548, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.350882301554947, |
| "grad_norm": 0.43325406312942505, |
| "learning_rate": 0.0002719522283716865, |
| "loss": 3.1507, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.36544173315474, |
| "grad_norm": 0.4155369997024536, |
| "learning_rate": 0.00027177745412175936, |
| "loss": 3.1493, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.36544173315474, |
| "eval_accuracy": 0.3745591930364993, |
| "eval_loss": 3.543552875518799, |
| "eval_runtime": 55.2955, |
| "eval_samples_per_second": 300.983, |
| "eval_steps_per_second": 18.826, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.38000116475453, |
| "grad_norm": 0.4649721682071686, |
| "learning_rate": 0.0002716026798718322, |
| "loss": 3.1425, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.394560596354317, |
| "grad_norm": 0.424880713224411, |
| "learning_rate": 0.000271427905621905, |
| "loss": 3.1603, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.40912002795411, |
| "grad_norm": 0.4196343421936035, |
| "learning_rate": 0.0002712531313719778, |
| "loss": 3.1546, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.4236794595539, |
| "grad_norm": 0.425191193819046, |
| "learning_rate": 0.00027107835712205064, |
| "loss": 3.1566, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.438238891153688, |
| "grad_norm": 0.44504839181900024, |
| "learning_rate": 0.00027090358287212347, |
| "loss": 3.149, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.45279832275348, |
| "grad_norm": 0.4285842478275299, |
| "learning_rate": 0.0002707288086221963, |
| "loss": 3.1469, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.46735775435327, |
| "grad_norm": 0.45686304569244385, |
| "learning_rate": 0.00027055403437226914, |
| "loss": 3.1682, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.481917185953062, |
| "grad_norm": 0.4554402530193329, |
| "learning_rate": 0.00027037926012234197, |
| "loss": 3.1603, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.49647661755285, |
| "grad_norm": 0.4219898283481598, |
| "learning_rate": 0.0002702044858724148, |
| "loss": 3.1582, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.51103604915264, |
| "grad_norm": 0.42963922023773193, |
| "learning_rate": 0.00027002971162248763, |
| "loss": 3.1599, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.525595480752433, |
| "grad_norm": 0.4354709982872009, |
| "learning_rate": 0.0002698549373725604, |
| "loss": 3.157, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.54015491235222, |
| "grad_norm": 0.4310063123703003, |
| "learning_rate": 0.00026968016312263325, |
| "loss": 3.1667, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.55471434395201, |
| "grad_norm": 0.4260794520378113, |
| "learning_rate": 0.0002695053888727061, |
| "loss": 3.1647, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.569273775551803, |
| "grad_norm": 0.4308391809463501, |
| "learning_rate": 0.0002693306146227789, |
| "loss": 3.1756, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.583833207151592, |
| "grad_norm": 0.43271583318710327, |
| "learning_rate": 0.0002691558403728517, |
| "loss": 3.1751, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.598392638751385, |
| "grad_norm": 0.4199526011943817, |
| "learning_rate": 0.0002689810661229245, |
| "loss": 3.1625, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.612952070351174, |
| "grad_norm": 0.4638511538505554, |
| "learning_rate": 0.00026880629187299736, |
| "loss": 3.1835, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.627511501950963, |
| "grad_norm": 0.4496512711048126, |
| "learning_rate": 0.0002686315176230702, |
| "loss": 3.1731, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.642070933550755, |
| "grad_norm": 0.44952234625816345, |
| "learning_rate": 0.000268456743373143, |
| "loss": 3.1743, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.656630365150544, |
| "grad_norm": 0.42717215418815613, |
| "learning_rate": 0.0002682819691232158, |
| "loss": 3.1648, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.656630365150544, |
| "eval_accuracy": 0.3751180688319096, |
| "eval_loss": 3.536431312561035, |
| "eval_runtime": 55.4927, |
| "eval_samples_per_second": 299.914, |
| "eval_steps_per_second": 18.759, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.671189796750333, |
| "grad_norm": 0.4398097097873688, |
| "learning_rate": 0.00026810719487328863, |
| "loss": 3.1798, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.685749228350126, |
| "grad_norm": 0.4571005702018738, |
| "learning_rate": 0.00026793242062336147, |
| "loss": 3.1786, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.700308659949915, |
| "grad_norm": 0.42584386467933655, |
| "learning_rate": 0.0002677576463734343, |
| "loss": 3.1811, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.714868091549707, |
| "grad_norm": 0.4381604790687561, |
| "learning_rate": 0.00026758287212350713, |
| "loss": 3.1708, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.729427523149496, |
| "grad_norm": 0.42696070671081543, |
| "learning_rate": 0.0002674080978735799, |
| "loss": 3.1845, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.743986954749285, |
| "grad_norm": 0.4467693865299225, |
| "learning_rate": 0.00026723332362365275, |
| "loss": 3.1682, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.758546386349078, |
| "grad_norm": 0.4203813076019287, |
| "learning_rate": 0.0002670585493737256, |
| "loss": 3.1724, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.773105817948867, |
| "grad_norm": 0.4817917048931122, |
| "learning_rate": 0.0002668837751237984, |
| "loss": 3.1876, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.787665249548656, |
| "grad_norm": 0.4254100024700165, |
| "learning_rate": 0.0002667090008738712, |
| "loss": 3.1805, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.80222468114845, |
| "grad_norm": 0.44012710452079773, |
| "learning_rate": 0.000266534226623944, |
| "loss": 3.1863, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.816784112748238, |
| "grad_norm": 0.45511409640312195, |
| "learning_rate": 0.00026635945237401686, |
| "loss": 3.1857, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.83134354434803, |
| "grad_norm": 0.431951105594635, |
| "learning_rate": 0.0002661846781240897, |
| "loss": 3.1765, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.84590297594782, |
| "grad_norm": 0.4615152180194855, |
| "learning_rate": 0.0002660099038741625, |
| "loss": 3.1802, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.860462407547608, |
| "grad_norm": 0.43755945563316345, |
| "learning_rate": 0.00026583512962423536, |
| "loss": 3.1821, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.8750218391474, |
| "grad_norm": 0.424096941947937, |
| "learning_rate": 0.0002656603553743082, |
| "loss": 3.1786, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.88958127074719, |
| "grad_norm": 0.44469699263572693, |
| "learning_rate": 0.000265485581124381, |
| "loss": 3.1789, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.90414070234698, |
| "grad_norm": 0.4233236610889435, |
| "learning_rate": 0.0002653108068744538, |
| "loss": 3.1792, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.91870013394677, |
| "grad_norm": 0.43293777108192444, |
| "learning_rate": 0.00026513603262452663, |
| "loss": 3.1763, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.93325956554656, |
| "grad_norm": 0.45778122544288635, |
| "learning_rate": 0.00026496125837459947, |
| "loss": 3.1841, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.947818997146353, |
| "grad_norm": 0.4322860538959503, |
| "learning_rate": 0.0002647864841246723, |
| "loss": 3.1793, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.947818997146353, |
| "eval_accuracy": 0.3751108962201865, |
| "eval_loss": 3.5336105823516846, |
| "eval_runtime": 55.4476, |
| "eval_samples_per_second": 300.157, |
| "eval_steps_per_second": 18.774, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.962378428746142, |
| "grad_norm": 0.4258585274219513, |
| "learning_rate": 0.0002646117098747451, |
| "loss": 3.1967, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.97693786034593, |
| "grad_norm": 0.4479847848415375, |
| "learning_rate": 0.0002644369356248179, |
| "loss": 3.1921, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.991497291945723, |
| "grad_norm": 0.4440501928329468, |
| "learning_rate": 0.00026426216137489074, |
| "loss": 3.1911, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.005823772639918, |
| "grad_norm": 0.4409486651420593, |
| "learning_rate": 0.0002640873871249636, |
| "loss": 3.1243, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.020383204239707, |
| "grad_norm": 0.4427194595336914, |
| "learning_rate": 0.0002639126128750364, |
| "loss": 3.0868, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.034942635839496, |
| "grad_norm": 0.43971019983291626, |
| "learning_rate": 0.0002637378386251092, |
| "loss": 3.0921, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.04950206743929, |
| "grad_norm": 0.45166823267936707, |
| "learning_rate": 0.000263563064375182, |
| "loss": 3.0929, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.064061499039077, |
| "grad_norm": 0.45439887046813965, |
| "learning_rate": 0.00026338829012525485, |
| "loss": 3.0843, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.078620930638866, |
| "grad_norm": 0.44814586639404297, |
| "learning_rate": 0.0002632135158753277, |
| "loss": 3.1028, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.09318036223866, |
| "grad_norm": 0.42964088916778564, |
| "learning_rate": 0.0002630387416254005, |
| "loss": 3.1121, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.107739793838448, |
| "grad_norm": 0.40774595737457275, |
| "learning_rate": 0.0002628639673754733, |
| "loss": 3.1128, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.12229922543824, |
| "grad_norm": 0.45805495977401733, |
| "learning_rate": 0.00026268919312554613, |
| "loss": 3.1068, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.13685865703803, |
| "grad_norm": 0.4234671890735626, |
| "learning_rate": 0.00026251441887561897, |
| "loss": 3.1048, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.15141808863782, |
| "grad_norm": 0.4316425621509552, |
| "learning_rate": 0.0002623396446256918, |
| "loss": 3.1249, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.16597752023761, |
| "grad_norm": 0.4478599429130554, |
| "learning_rate": 0.0002621648703757646, |
| "loss": 3.1169, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.1805369518374, |
| "grad_norm": 0.4409092962741852, |
| "learning_rate": 0.0002619900961258374, |
| "loss": 3.111, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.19509638343719, |
| "grad_norm": 0.48611727356910706, |
| "learning_rate": 0.00026181532187591024, |
| "loss": 3.1161, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.20965581503698, |
| "grad_norm": 0.4241076409816742, |
| "learning_rate": 0.0002616405476259831, |
| "loss": 3.1215, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.22421524663677, |
| "grad_norm": 0.44242626428604126, |
| "learning_rate": 0.0002614657733760559, |
| "loss": 3.1297, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.238774678236563, |
| "grad_norm": 0.45010676980018616, |
| "learning_rate": 0.00026129099912612874, |
| "loss": 3.1201, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.238774678236563, |
| "eval_accuracy": 0.37466113819000674, |
| "eval_loss": 3.5505800247192383, |
| "eval_runtime": 55.3733, |
| "eval_samples_per_second": 300.56, |
| "eval_steps_per_second": 18.8, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.253334109836352, |
| "grad_norm": 0.4329627752304077, |
| "learning_rate": 0.0002611162248762016, |
| "loss": 3.1274, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.26789354143614, |
| "grad_norm": 0.4276256263256073, |
| "learning_rate": 0.0002609414506262744, |
| "loss": 3.1237, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.282452973035934, |
| "grad_norm": 0.43386736512184143, |
| "learning_rate": 0.0002607666763763472, |
| "loss": 3.1408, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.297012404635723, |
| "grad_norm": 0.4516732394695282, |
| "learning_rate": 0.00026059190212642, |
| "loss": 3.1539, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.31157183623551, |
| "grad_norm": 0.4807344675064087, |
| "learning_rate": 0.00026041712787649285, |
| "loss": 3.1368, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.326131267835304, |
| "grad_norm": 0.4625699818134308, |
| "learning_rate": 0.0002602423536265657, |
| "loss": 3.1323, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.340690699435093, |
| "grad_norm": 0.4425176978111267, |
| "learning_rate": 0.00026006757937663846, |
| "loss": 3.1344, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.355250131034886, |
| "grad_norm": 0.46198466420173645, |
| "learning_rate": 0.0002598928051267113, |
| "loss": 3.1409, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.369809562634675, |
| "grad_norm": 0.4490656852722168, |
| "learning_rate": 0.00025971803087678413, |
| "loss": 3.1404, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.384368994234464, |
| "grad_norm": 0.436927855014801, |
| "learning_rate": 0.00025954325662685696, |
| "loss": 3.1442, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.398928425834256, |
| "grad_norm": 0.472896009683609, |
| "learning_rate": 0.0002593684823769298, |
| "loss": 3.1434, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.413487857434045, |
| "grad_norm": 0.4772699773311615, |
| "learning_rate": 0.0002591937081270026, |
| "loss": 3.1384, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.428047289033834, |
| "grad_norm": 0.44761937856674194, |
| "learning_rate": 0.0002590189338770754, |
| "loss": 3.1614, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.442606720633627, |
| "grad_norm": 0.4448677897453308, |
| "learning_rate": 0.00025884415962714824, |
| "loss": 3.1475, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.457166152233416, |
| "grad_norm": 0.4507066309452057, |
| "learning_rate": 0.0002586693853772211, |
| "loss": 3.1554, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.47172558383321, |
| "grad_norm": 0.4424573481082916, |
| "learning_rate": 0.0002584946111272939, |
| "loss": 3.1512, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.486285015432998, |
| "grad_norm": 0.42159804701805115, |
| "learning_rate": 0.0002583198368773667, |
| "loss": 3.14, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.500844447032787, |
| "grad_norm": 0.441180944442749, |
| "learning_rate": 0.0002581450626274395, |
| "loss": 3.1606, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.51540387863258, |
| "grad_norm": 0.4541659355163574, |
| "learning_rate": 0.00025797028837751235, |
| "loss": 3.16, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.529963310232368, |
| "grad_norm": 0.4408447742462158, |
| "learning_rate": 0.0002577955141275852, |
| "loss": 3.1451, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.529963310232368, |
| "eval_accuracy": 0.37521836781223467, |
| "eval_loss": 3.5380377769470215, |
| "eval_runtime": 55.3762, |
| "eval_samples_per_second": 300.545, |
| "eval_steps_per_second": 18.799, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.544522741832157, |
| "grad_norm": 0.42458575963974, |
| "learning_rate": 0.00025762073987765796, |
| "loss": 3.1664, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.55908217343195, |
| "grad_norm": 0.46024999022483826, |
| "learning_rate": 0.0002574459656277308, |
| "loss": 3.1775, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.57364160503174, |
| "grad_norm": 0.4679096043109894, |
| "learning_rate": 0.00025727119137780363, |
| "loss": 3.1596, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.58820103663153, |
| "grad_norm": 0.4214268922805786, |
| "learning_rate": 0.00025709641712787646, |
| "loss": 3.1469, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.60276046823132, |
| "grad_norm": 0.45428961515426636, |
| "learning_rate": 0.0002569216428779493, |
| "loss": 3.1632, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.61731989983111, |
| "grad_norm": 0.4411943256855011, |
| "learning_rate": 0.00025674686862802213, |
| "loss": 3.1604, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.631879331430902, |
| "grad_norm": 0.4553915560245514, |
| "learning_rate": 0.00025657209437809496, |
| "loss": 3.1644, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.64643876303069, |
| "grad_norm": 0.4235278069972992, |
| "learning_rate": 0.0002563973201281678, |
| "loss": 3.1465, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.66099819463048, |
| "grad_norm": 0.42616787552833557, |
| "learning_rate": 0.0002562225458782406, |
| "loss": 3.1613, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.675557626230272, |
| "grad_norm": 0.44103261828422546, |
| "learning_rate": 0.0002560477716283134, |
| "loss": 3.168, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.69011705783006, |
| "grad_norm": 0.457415908575058, |
| "learning_rate": 0.00025587299737838624, |
| "loss": 3.1648, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.704676489429854, |
| "grad_norm": 0.44758105278015137, |
| "learning_rate": 0.00025569822312845907, |
| "loss": 3.1643, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.719235921029643, |
| "grad_norm": 0.4369789659976959, |
| "learning_rate": 0.00025552344887853185, |
| "loss": 3.1629, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.733795352629432, |
| "grad_norm": 0.4466242492198944, |
| "learning_rate": 0.0002553486746286047, |
| "loss": 3.1671, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.748354784229225, |
| "grad_norm": 0.4449853301048279, |
| "learning_rate": 0.0002551739003786775, |
| "loss": 3.172, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.762914215829014, |
| "grad_norm": 0.4349530339241028, |
| "learning_rate": 0.00025499912612875035, |
| "loss": 3.1667, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.777473647428806, |
| "grad_norm": 0.4433923661708832, |
| "learning_rate": 0.0002548243518788232, |
| "loss": 3.1605, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.792033079028595, |
| "grad_norm": 0.45331159234046936, |
| "learning_rate": 0.00025464957762889596, |
| "loss": 3.1688, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.806592510628384, |
| "grad_norm": 0.4952585697174072, |
| "learning_rate": 0.0002544748033789688, |
| "loss": 3.1656, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.821151942228177, |
| "grad_norm": 0.438007116317749, |
| "learning_rate": 0.00025430002912904163, |
| "loss": 3.1765, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.821151942228177, |
| "eval_accuracy": 0.37500518838511937, |
| "eval_loss": 3.537261724472046, |
| "eval_runtime": 55.4042, |
| "eval_samples_per_second": 300.393, |
| "eval_steps_per_second": 18.789, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.821151942228177, |
| "step": 99000, |
| "total_flos": 2.068928077234176e+18, |
| "train_loss": 3.3924570856576013, |
| "train_runtime": 44486.5704, |
| "train_samples_per_second": 308.78, |
| "train_steps_per_second": 3.861 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 11 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.068928077234176e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|