| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 100, |
| "global_step": 8302, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.2810336351394653, |
| "epoch": 0.0012045290291496024, |
| "grad_norm": 836.0, |
| "learning_rate": 2.1634615384615387e-06, |
| "loss": 11.910458374023438, |
| "mean_token_accuracy": 0.15205802023410797, |
| "num_tokens": 67588.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.4772902846336364, |
| "epoch": 0.002409058058299205, |
| "grad_norm": 6368.0, |
| "learning_rate": 4.567307692307692e-06, |
| "loss": 10.921498107910157, |
| "mean_token_accuracy": 0.16450221911072732, |
| "num_tokens": 134788.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.9750785112380982, |
| "epoch": 0.0036135870874488074, |
| "grad_norm": 374.0, |
| "learning_rate": 6.9711538461538465e-06, |
| "loss": 8.950369262695313, |
| "mean_token_accuracy": 0.17044174969196318, |
| "num_tokens": 205314.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 3.4127318382263185, |
| "epoch": 0.00481811611659841, |
| "grad_norm": 117.5, |
| "learning_rate": 9.375000000000001e-06, |
| "loss": 6.107471084594726, |
| "mean_token_accuracy": 0.19763734340667724, |
| "num_tokens": 275377.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 4.608194398880005, |
| "epoch": 0.006022645145748013, |
| "grad_norm": 145.0, |
| "learning_rate": 1.1778846153846154e-05, |
| "loss": 4.998584365844726, |
| "mean_token_accuracy": 0.23358086347579957, |
| "num_tokens": 338642.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 4.467870378494263, |
| "epoch": 0.007227174174897615, |
| "grad_norm": 142.0, |
| "learning_rate": 1.4182692307692308e-05, |
| "loss": 4.630952453613281, |
| "mean_token_accuracy": 0.2546722576022148, |
| "num_tokens": 409902.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 4.305765962600708, |
| "epoch": 0.008431703204047217, |
| "grad_norm": 39.25, |
| "learning_rate": 1.6586538461538463e-05, |
| "loss": 4.396952819824219, |
| "mean_token_accuracy": 0.27402625530958175, |
| "num_tokens": 477073.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 4.207392024993896, |
| "epoch": 0.00963623223319682, |
| "grad_norm": 42.0, |
| "learning_rate": 1.8990384615384615e-05, |
| "loss": 4.2559349060058596, |
| "mean_token_accuracy": 0.282428053021431, |
| "num_tokens": 546516.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 3.8669612884521483, |
| "epoch": 0.010840761262346423, |
| "grad_norm": 864.0, |
| "learning_rate": 2.139423076923077e-05, |
| "loss": 3.9113719940185545, |
| "mean_token_accuracy": 0.319625186920166, |
| "num_tokens": 616970.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 3.7632123231887817, |
| "epoch": 0.012045290291496025, |
| "grad_norm": 37.25, |
| "learning_rate": 2.3798076923076922e-05, |
| "loss": 3.7912879943847657, |
| "mean_token_accuracy": 0.3272585391998291, |
| "num_tokens": 684911.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012045290291496025, |
| "eval_entropy": 3.9685288667678833, |
| "eval_loss": 4.026243209838867, |
| "eval_mean_token_accuracy": 0.3059713691473007, |
| "eval_num_tokens": 684911.0, |
| "eval_runtime": 0.4169, |
| "eval_samples_per_second": 38.376, |
| "eval_steps_per_second": 4.797, |
| "step": 100 |
| }, |
| { |
| "entropy": 3.7600659847259523, |
| "epoch": 0.013249819320645628, |
| "grad_norm": 31.0, |
| "learning_rate": 2.620192307692308e-05, |
| "loss": 3.7905757904052733, |
| "mean_token_accuracy": 0.3308758944272995, |
| "num_tokens": 752070.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 3.621634840965271, |
| "epoch": 0.01445434834979523, |
| "grad_norm": 16.875, |
| "learning_rate": 2.860576923076923e-05, |
| "loss": 3.638637161254883, |
| "mean_token_accuracy": 0.3428749591112137, |
| "num_tokens": 818059.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 3.7145506381988525, |
| "epoch": 0.015658877378944832, |
| "grad_norm": 426.0, |
| "learning_rate": 3.1009615384615384e-05, |
| "loss": 3.687373733520508, |
| "mean_token_accuracy": 0.33822024166584014, |
| "num_tokens": 887211.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 3.5624027252197266, |
| "epoch": 0.016863406408094434, |
| "grad_norm": 28.625, |
| "learning_rate": 3.3413461538461536e-05, |
| "loss": 3.5997623443603515, |
| "mean_token_accuracy": 0.34836446344852445, |
| "num_tokens": 957271.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 3.424220418930054, |
| "epoch": 0.018067935437244036, |
| "grad_norm": 25.0, |
| "learning_rate": 3.5817307692307695e-05, |
| "loss": 3.464914321899414, |
| "mean_token_accuracy": 0.360398867726326, |
| "num_tokens": 1028063.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 3.524876356124878, |
| "epoch": 0.01927246446639364, |
| "grad_norm": 14.5625, |
| "learning_rate": 3.8221153846153846e-05, |
| "loss": 3.4790149688720704, |
| "mean_token_accuracy": 0.36009892225265505, |
| "num_tokens": 1098655.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 3.3341711282730104, |
| "epoch": 0.020476993495543244, |
| "grad_norm": 18.5, |
| "learning_rate": 4.0625000000000005e-05, |
| "loss": 3.3776344299316405, |
| "mean_token_accuracy": 0.37702774703502656, |
| "num_tokens": 1164462.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 3.3053973436355593, |
| "epoch": 0.021681522524692846, |
| "grad_norm": 16.75, |
| "learning_rate": 4.302884615384616e-05, |
| "loss": 3.3609580993652344, |
| "mean_token_accuracy": 0.37871713638305665, |
| "num_tokens": 1235254.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 3.3385438680648805, |
| "epoch": 0.02288605155384245, |
| "grad_norm": 15.6875, |
| "learning_rate": 4.543269230769231e-05, |
| "loss": 3.347671890258789, |
| "mean_token_accuracy": 0.37770530581474304, |
| "num_tokens": 1304364.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 3.262368583679199, |
| "epoch": 0.02409058058299205, |
| "grad_norm": 18.375, |
| "learning_rate": 4.783653846153847e-05, |
| "loss": 3.2868324279785157, |
| "mean_token_accuracy": 0.3836541771888733, |
| "num_tokens": 1373184.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02409058058299205, |
| "eval_entropy": 3.401029348373413, |
| "eval_loss": 3.5328683853149414, |
| "eval_mean_token_accuracy": 0.3548418879508972, |
| "eval_num_tokens": 1373184.0, |
| "eval_runtime": 0.3788, |
| "eval_samples_per_second": 42.238, |
| "eval_steps_per_second": 5.28, |
| "step": 200 |
| }, |
| { |
| "entropy": 3.1738123893737793, |
| "epoch": 0.025295109612141653, |
| "grad_norm": 31.125, |
| "learning_rate": 5.024038461538462e-05, |
| "loss": 3.1927379608154296, |
| "mean_token_accuracy": 0.39391712546348573, |
| "num_tokens": 1442812.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 3.133743643760681, |
| "epoch": 0.026499638641291255, |
| "grad_norm": 8.625, |
| "learning_rate": 5.264423076923077e-05, |
| "loss": 3.1629384994506835, |
| "mean_token_accuracy": 0.40554835796356203, |
| "num_tokens": 1513954.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 3.1619156122207643, |
| "epoch": 0.027704167670440857, |
| "grad_norm": 14.9375, |
| "learning_rate": 5.504807692307693e-05, |
| "loss": 3.1566917419433596, |
| "mean_token_accuracy": 0.39930360019207, |
| "num_tokens": 1581774.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 3.1904892444610597, |
| "epoch": 0.02890869669959046, |
| "grad_norm": 13.4375, |
| "learning_rate": 5.7451923076923074e-05, |
| "loss": 3.2271888732910154, |
| "mean_token_accuracy": 0.38786653280258176, |
| "num_tokens": 1649999.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 3.090328550338745, |
| "epoch": 0.030113225728740062, |
| "grad_norm": 12.3125, |
| "learning_rate": 5.985576923076923e-05, |
| "loss": 3.1105745315551756, |
| "mean_token_accuracy": 0.410536527633667, |
| "num_tokens": 1720878.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 3.117401146888733, |
| "epoch": 0.031317754757889664, |
| "grad_norm": 8.0625, |
| "learning_rate": 6.225961538461539e-05, |
| "loss": 3.120888900756836, |
| "mean_token_accuracy": 0.40497787594795226, |
| "num_tokens": 1789793.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 3.114533042907715, |
| "epoch": 0.032522283787039266, |
| "grad_norm": 6.75, |
| "learning_rate": 6.466346153846154e-05, |
| "loss": 3.1246286392211915, |
| "mean_token_accuracy": 0.40853759050369265, |
| "num_tokens": 1858569.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 2.9943730354309084, |
| "epoch": 0.03372681281618887, |
| "grad_norm": 7.40625, |
| "learning_rate": 6.70673076923077e-05, |
| "loss": 3.0746133804321287, |
| "mean_token_accuracy": 0.4096983641386032, |
| "num_tokens": 1930700.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 3.0540405035018923, |
| "epoch": 0.03493134184533847, |
| "grad_norm": 17.5, |
| "learning_rate": 6.947115384615385e-05, |
| "loss": 3.002737617492676, |
| "mean_token_accuracy": 0.42073442935943606, |
| "num_tokens": 1997275.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 2.9349114656448365, |
| "epoch": 0.03613587087448807, |
| "grad_norm": 97.0, |
| "learning_rate": 7.1875e-05, |
| "loss": 2.9756820678710936, |
| "mean_token_accuracy": 0.4268068581819534, |
| "num_tokens": 2066990.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03613587087448807, |
| "eval_entropy": 3.4880975484848022, |
| "eval_loss": 3.4292285442352295, |
| "eval_mean_token_accuracy": 0.33973701298236847, |
| "eval_num_tokens": 2066990.0, |
| "eval_runtime": 0.5504, |
| "eval_samples_per_second": 29.069, |
| "eval_steps_per_second": 3.634, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.9692421674728395, |
| "epoch": 0.037340399903637675, |
| "grad_norm": 5.90625, |
| "learning_rate": 7.427884615384616e-05, |
| "loss": 2.9625797271728516, |
| "mean_token_accuracy": 0.4250692486763, |
| "num_tokens": 2137171.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 2.91152765750885, |
| "epoch": 0.03854492893278728, |
| "grad_norm": 43.0, |
| "learning_rate": 7.668269230769232e-05, |
| "loss": 2.9240610122680666, |
| "mean_token_accuracy": 0.4358662247657776, |
| "num_tokens": 2207592.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 2.8902512788772583, |
| "epoch": 0.03974945796193688, |
| "grad_norm": 6.1875, |
| "learning_rate": 7.908653846153847e-05, |
| "loss": 2.90927677154541, |
| "mean_token_accuracy": 0.4399706482887268, |
| "num_tokens": 2273887.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 2.9261450290679933, |
| "epoch": 0.04095398699108649, |
| "grad_norm": 11.0625, |
| "learning_rate": 8.149038461538462e-05, |
| "loss": 2.939039421081543, |
| "mean_token_accuracy": 0.42746458351612093, |
| "num_tokens": 2344104.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 2.9675798416137695, |
| "epoch": 0.04215851602023609, |
| "grad_norm": 4.28125, |
| "learning_rate": 8.389423076923077e-05, |
| "loss": 2.9653833389282225, |
| "mean_token_accuracy": 0.4271013975143433, |
| "num_tokens": 2413987.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.7644049644470217, |
| "epoch": 0.04336304504938569, |
| "grad_norm": 8.0625, |
| "learning_rate": 8.629807692307694e-05, |
| "loss": 2.8332645416259767, |
| "mean_token_accuracy": 0.45240907967090604, |
| "num_tokens": 2483166.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 2.9669906377792357, |
| "epoch": 0.044567574078535295, |
| "grad_norm": 7.0, |
| "learning_rate": 8.870192307692308e-05, |
| "loss": 2.9809848785400392, |
| "mean_token_accuracy": 0.42300455570220946, |
| "num_tokens": 2553919.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 2.926973581314087, |
| "epoch": 0.0457721031076849, |
| "grad_norm": 5.875, |
| "learning_rate": 9.110576923076923e-05, |
| "loss": 2.9251119613647463, |
| "mean_token_accuracy": 0.4301867544651031, |
| "num_tokens": 2621338.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 3.042341208457947, |
| "epoch": 0.0469766321368345, |
| "grad_norm": 8.0, |
| "learning_rate": 9.350961538461539e-05, |
| "loss": 3.0066806793212892, |
| "mean_token_accuracy": 0.4190910369157791, |
| "num_tokens": 2693649.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 2.855639863014221, |
| "epoch": 0.0481811611659841, |
| "grad_norm": 11.25, |
| "learning_rate": 9.591346153846154e-05, |
| "loss": 2.925531768798828, |
| "mean_token_accuracy": 0.43328951895236967, |
| "num_tokens": 2762855.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0481811611659841, |
| "eval_entropy": 3.120109796524048, |
| "eval_loss": 3.337949275970459, |
| "eval_mean_token_accuracy": 0.36517854034900665, |
| "eval_num_tokens": 2762855.0, |
| "eval_runtime": 0.4155, |
| "eval_samples_per_second": 38.509, |
| "eval_steps_per_second": 4.814, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.9335047960281373, |
| "epoch": 0.049385690195133704, |
| "grad_norm": 14.5625, |
| "learning_rate": 9.83173076923077e-05, |
| "loss": 2.9816661834716798, |
| "mean_token_accuracy": 0.4285028487443924, |
| "num_tokens": 2831768.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 2.796990966796875, |
| "epoch": 0.050590219224283306, |
| "grad_norm": 5.5625, |
| "learning_rate": 9.999996429174181e-05, |
| "loss": 2.8231491088867187, |
| "mean_token_accuracy": 0.4479815810918808, |
| "num_tokens": 2902774.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 2.800985240936279, |
| "epoch": 0.05179474825343291, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.999932947968169e-05, |
| "loss": 2.816908073425293, |
| "mean_token_accuracy": 0.44685631394386294, |
| "num_tokens": 2974752.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 2.833960461616516, |
| "epoch": 0.05299927728258251, |
| "grad_norm": 4.4375, |
| "learning_rate": 9.999790116236919e-05, |
| "loss": 2.8537731170654297, |
| "mean_token_accuracy": 0.43992237448692323, |
| "num_tokens": 3040347.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 2.80394446849823, |
| "epoch": 0.05420380631173211, |
| "grad_norm": 5.0, |
| "learning_rate": 9.999567936247218e-05, |
| "loss": 2.8184518814086914, |
| "mean_token_accuracy": 0.4442470997571945, |
| "num_tokens": 3108408.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.8903349876403808, |
| "epoch": 0.055408335340881715, |
| "grad_norm": 3.375, |
| "learning_rate": 9.999266411525132e-05, |
| "loss": 2.90649471282959, |
| "mean_token_accuracy": 0.4361670553684235, |
| "num_tokens": 3178364.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 2.770217847824097, |
| "epoch": 0.05661286437003132, |
| "grad_norm": 2.671875, |
| "learning_rate": 9.998885546855956e-05, |
| "loss": 2.7700069427490233, |
| "mean_token_accuracy": 0.4557085156440735, |
| "num_tokens": 3249289.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 2.763340950012207, |
| "epoch": 0.05781739339918092, |
| "grad_norm": 3.5, |
| "learning_rate": 9.998425348284132e-05, |
| "loss": 2.7811700820922853, |
| "mean_token_accuracy": 0.4531765550374985, |
| "num_tokens": 3318854.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 2.7287254333496094, |
| "epoch": 0.05902192242833052, |
| "grad_norm": 2.96875, |
| "learning_rate": 9.997885823113159e-05, |
| "loss": 2.790057373046875, |
| "mean_token_accuracy": 0.44789515137672425, |
| "num_tokens": 3387544.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 2.775154709815979, |
| "epoch": 0.060226451457480124, |
| "grad_norm": 6.34375, |
| "learning_rate": 9.99726697990547e-05, |
| "loss": 2.7989133834838866, |
| "mean_token_accuracy": 0.45067694783210754, |
| "num_tokens": 3457225.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.060226451457480124, |
| "eval_entropy": 3.139446258544922, |
| "eval_loss": 3.338728189468384, |
| "eval_mean_token_accuracy": 0.3554842323064804, |
| "eval_num_tokens": 3457225.0, |
| "eval_runtime": 0.4113, |
| "eval_samples_per_second": 38.903, |
| "eval_steps_per_second": 4.863, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.824716258049011, |
| "epoch": 0.061430980486629726, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.996568828482307e-05, |
| "loss": 2.8506168365478515, |
| "mean_token_accuracy": 0.4408591091632843, |
| "num_tokens": 3525902.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 2.742767095565796, |
| "epoch": 0.06263550951577933, |
| "grad_norm": 4.875, |
| "learning_rate": 9.995791379923553e-05, |
| "loss": 2.774262237548828, |
| "mean_token_accuracy": 0.45442442297935487, |
| "num_tokens": 3596373.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 2.6956833362579347, |
| "epoch": 0.06384003854492894, |
| "grad_norm": 10.5, |
| "learning_rate": 9.994934646567564e-05, |
| "loss": 2.669524383544922, |
| "mean_token_accuracy": 0.4663479655981064, |
| "num_tokens": 3664949.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 2.7131295919418337, |
| "epoch": 0.06504456757407853, |
| "grad_norm": 8.625, |
| "learning_rate": 9.99399864201097e-05, |
| "loss": 2.7482681274414062, |
| "mean_token_accuracy": 0.45665947794914247, |
| "num_tokens": 3734769.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 2.7076703310012817, |
| "epoch": 0.06624909660322814, |
| "grad_norm": 4.5, |
| "learning_rate": 9.992983381108463e-05, |
| "loss": 2.7302698135375976, |
| "mean_token_accuracy": 0.4592186540365219, |
| "num_tokens": 3804019.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.6755930185317993, |
| "epoch": 0.06745362563237774, |
| "grad_norm": 16.25, |
| "learning_rate": 9.991888879972552e-05, |
| "loss": 2.713031196594238, |
| "mean_token_accuracy": 0.4650493025779724, |
| "num_tokens": 3869181.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 2.5656904935836793, |
| "epoch": 0.06865815466152735, |
| "grad_norm": 2.75, |
| "learning_rate": 9.990715155973325e-05, |
| "loss": 2.5744911193847657, |
| "mean_token_accuracy": 0.4835150271654129, |
| "num_tokens": 3936190.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 2.6105204105377195, |
| "epoch": 0.06986268369067694, |
| "grad_norm": 4.3125, |
| "learning_rate": 9.989462227738148e-05, |
| "loss": 2.6728843688964843, |
| "mean_token_accuracy": 0.46942216753959654, |
| "num_tokens": 4003274.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 2.729413914680481, |
| "epoch": 0.07106721271982655, |
| "grad_norm": 3.265625, |
| "learning_rate": 9.988130115151392e-05, |
| "loss": 2.7112655639648438, |
| "mean_token_accuracy": 0.46558586061000823, |
| "num_tokens": 4074673.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 2.57799232006073, |
| "epoch": 0.07227174174897615, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.986718839354111e-05, |
| "loss": 2.5880659103393553, |
| "mean_token_accuracy": 0.47919810116291045, |
| "num_tokens": 4142427.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07227174174897615, |
| "eval_entropy": 3.2087725400924683, |
| "eval_loss": 3.3046178817749023, |
| "eval_mean_token_accuracy": 0.35669657588005066, |
| "eval_num_tokens": 4142427.0, |
| "eval_runtime": 0.3908, |
| "eval_samples_per_second": 40.94, |
| "eval_steps_per_second": 5.118, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.6087251901626587, |
| "epoch": 0.07347627077812575, |
| "grad_norm": 10.9375, |
| "learning_rate": 9.985228422743697e-05, |
| "loss": 2.642395782470703, |
| "mean_token_accuracy": 0.4696370273828506, |
| "num_tokens": 4208167.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 2.693037247657776, |
| "epoch": 0.07468079980727535, |
| "grad_norm": 5.96875, |
| "learning_rate": 9.983658888973537e-05, |
| "loss": 2.716563034057617, |
| "mean_token_accuracy": 0.46087419986724854, |
| "num_tokens": 4276554.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 2.598901891708374, |
| "epoch": 0.07588532883642496, |
| "grad_norm": 3.375, |
| "learning_rate": 9.982010262952629e-05, |
| "loss": 2.6250751495361326, |
| "mean_token_accuracy": 0.47428264617919924, |
| "num_tokens": 4346959.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 2.6499593019485475, |
| "epoch": 0.07708985786557455, |
| "grad_norm": 6.75, |
| "learning_rate": 9.980282570845192e-05, |
| "loss": 2.651329231262207, |
| "mean_token_accuracy": 0.4706882297992706, |
| "num_tokens": 4412980.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 2.6572098255157472, |
| "epoch": 0.07829438689472416, |
| "grad_norm": 2.8125, |
| "learning_rate": 9.978475840070251e-05, |
| "loss": 2.6689807891845705, |
| "mean_token_accuracy": 0.46909240186214446, |
| "num_tokens": 4482945.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 2.64446074962616, |
| "epoch": 0.07949891592387376, |
| "grad_norm": 4.40625, |
| "learning_rate": 9.976590099301197e-05, |
| "loss": 2.6496517181396486, |
| "mean_token_accuracy": 0.4731697618961334, |
| "num_tokens": 4550746.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 2.5607208251953124, |
| "epoch": 0.08070344495302337, |
| "grad_norm": 12.0, |
| "learning_rate": 9.974625378465337e-05, |
| "loss": 2.622728157043457, |
| "mean_token_accuracy": 0.47670555114746094, |
| "num_tokens": 4623290.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 2.594883131980896, |
| "epoch": 0.08190797398217298, |
| "grad_norm": 9.25, |
| "learning_rate": 9.97258170874341e-05, |
| "loss": 2.642950248718262, |
| "mean_token_accuracy": 0.4712438017129898, |
| "num_tokens": 4692716.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 2.5865614652633666, |
| "epoch": 0.08311250301132257, |
| "grad_norm": 4.15625, |
| "learning_rate": 9.970459122569109e-05, |
| "loss": 2.5889955520629884, |
| "mean_token_accuracy": 0.48189602196216585, |
| "num_tokens": 4758620.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 2.477654957771301, |
| "epoch": 0.08431703204047218, |
| "grad_norm": 4.125, |
| "learning_rate": 9.96825765362855e-05, |
| "loss": 2.4992355346679687, |
| "mean_token_accuracy": 0.49264668226242064, |
| "num_tokens": 4827946.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08431703204047218, |
| "eval_entropy": 3.0759323835372925, |
| "eval_loss": 3.2211337089538574, |
| "eval_mean_token_accuracy": 0.3738161623477936, |
| "eval_num_tokens": 4827946.0, |
| "eval_runtime": 0.3902, |
| "eval_samples_per_second": 41.004, |
| "eval_steps_per_second": 5.126, |
| "step": 700 |
| }, |
| { |
| "entropy": 2.6039110660552978, |
| "epoch": 0.08552156106962178, |
| "grad_norm": 4.21875, |
| "learning_rate": 9.965977336859744e-05, |
| "loss": 2.637973403930664, |
| "mean_token_accuracy": 0.4767207413911819, |
| "num_tokens": 4893902.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 2.5275394916534424, |
| "epoch": 0.08672609009877139, |
| "grad_norm": 3.390625, |
| "learning_rate": 9.963618208452044e-05, |
| "loss": 2.50467529296875, |
| "mean_token_accuracy": 0.49372389614582063, |
| "num_tokens": 4961731.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 2.5056965589523315, |
| "epoch": 0.08793061912792098, |
| "grad_norm": 3.765625, |
| "learning_rate": 9.961180305845568e-05, |
| "loss": 2.555381011962891, |
| "mean_token_accuracy": 0.4927469611167908, |
| "num_tokens": 5031232.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 2.4905244588851927, |
| "epoch": 0.08913514815707059, |
| "grad_norm": 3.1875, |
| "learning_rate": 9.958663667730603e-05, |
| "loss": 2.4839048385620117, |
| "mean_token_accuracy": 0.49619937539100645, |
| "num_tokens": 5103147.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 2.492690992355347, |
| "epoch": 0.09033967718622019, |
| "grad_norm": 2.59375, |
| "learning_rate": 9.956068334047e-05, |
| "loss": 2.5731042861938476, |
| "mean_token_accuracy": 0.4881134808063507, |
| "num_tokens": 5171231.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.4902069091796877, |
| "epoch": 0.0915442062153698, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.953394345983524e-05, |
| "loss": 2.4940427780151366, |
| "mean_token_accuracy": 0.49709913730621336, |
| "num_tokens": 5241133.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 2.554565668106079, |
| "epoch": 0.09274873524451939, |
| "grad_norm": 3.015625, |
| "learning_rate": 9.950641745977221e-05, |
| "loss": 2.5632354736328127, |
| "mean_token_accuracy": 0.484773388504982, |
| "num_tokens": 5307282.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 2.4865276336669924, |
| "epoch": 0.093953264273669, |
| "grad_norm": 2.875, |
| "learning_rate": 9.947810577712726e-05, |
| "loss": 2.5425525665283204, |
| "mean_token_accuracy": 0.49204943478107455, |
| "num_tokens": 5377403.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 2.4226608276367188, |
| "epoch": 0.0951577933028186, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.944900886121577e-05, |
| "loss": 2.4426912307739257, |
| "mean_token_accuracy": 0.503141775727272, |
| "num_tokens": 5445802.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 2.4688761949539186, |
| "epoch": 0.0963623223319682, |
| "grad_norm": 3.703125, |
| "learning_rate": 9.941912717381508e-05, |
| "loss": 2.4866867065429688, |
| "mean_token_accuracy": 0.49062854051589966, |
| "num_tokens": 5517621.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0963623223319682, |
| "eval_entropy": 3.0702708959579468, |
| "eval_loss": 3.130521297454834, |
| "eval_mean_token_accuracy": 0.3833482414484024, |
| "eval_num_tokens": 5517621.0, |
| "eval_runtime": 0.4137, |
| "eval_samples_per_second": 38.674, |
| "eval_steps_per_second": 4.834, |
| "step": 800 |
| }, |
| { |
| "entropy": 2.4292314291000365, |
| "epoch": 0.0975668513611178, |
| "grad_norm": 4.0, |
| "learning_rate": 9.938846118915706e-05, |
| "loss": 2.4692073822021485, |
| "mean_token_accuracy": 0.5002907454967499, |
| "num_tokens": 5586412.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 2.4135414600372314, |
| "epoch": 0.09877138039026741, |
| "grad_norm": 3.671875, |
| "learning_rate": 9.93570113939206e-05, |
| "loss": 2.4233545303344726, |
| "mean_token_accuracy": 0.5025378137826919, |
| "num_tokens": 5655647.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 2.394425559043884, |
| "epoch": 0.099975909419417, |
| "grad_norm": 2.84375, |
| "learning_rate": 9.9324778287224e-05, |
| "loss": 2.443798828125, |
| "mean_token_accuracy": 0.5025730848312377, |
| "num_tokens": 5723722.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 2.549955868721008, |
| "epoch": 0.10118043844856661, |
| "grad_norm": 3.109375, |
| "learning_rate": 9.929176238061687e-05, |
| "loss": 2.563458061218262, |
| "mean_token_accuracy": 0.48627259731292727, |
| "num_tokens": 5794281.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 2.468239426612854, |
| "epoch": 0.10238496747771621, |
| "grad_norm": 3.09375, |
| "learning_rate": 9.925796419807216e-05, |
| "loss": 2.4876327514648438, |
| "mean_token_accuracy": 0.49459480941295625, |
| "num_tokens": 5864854.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.542571449279785, |
| "epoch": 0.10358949650686582, |
| "grad_norm": 33.25, |
| "learning_rate": 9.922338427597777e-05, |
| "loss": 2.5742753982543944, |
| "mean_token_accuracy": 0.48645628094673155, |
| "num_tokens": 5933244.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 2.4776437759399412, |
| "epoch": 0.10479402553601541, |
| "grad_norm": 3.734375, |
| "learning_rate": 9.918802316312806e-05, |
| "loss": 2.4924917221069336, |
| "mean_token_accuracy": 0.4953279852867126, |
| "num_tokens": 5997292.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 2.3822479009628297, |
| "epoch": 0.10599855456516502, |
| "grad_norm": 2.59375, |
| "learning_rate": 9.915188142071512e-05, |
| "loss": 2.4000953674316405, |
| "mean_token_accuracy": 0.5090537935495376, |
| "num_tokens": 6072140.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 2.4909890413284304, |
| "epoch": 0.10720308359431463, |
| "grad_norm": 3.0, |
| "learning_rate": 9.91149596223199e-05, |
| "loss": 2.507156753540039, |
| "mean_token_accuracy": 0.4945272743701935, |
| "num_tokens": 6139671.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 2.3194159507751464, |
| "epoch": 0.10840761262346423, |
| "grad_norm": 3.3125, |
| "learning_rate": 9.907725835390305e-05, |
| "loss": 2.3592811584472657, |
| "mean_token_accuracy": 0.5138555377721786, |
| "num_tokens": 6211544.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10840761262346423, |
| "eval_entropy": 3.031153082847595, |
| "eval_loss": 3.127545118331909, |
| "eval_mean_token_accuracy": 0.3903745263814926, |
| "eval_num_tokens": 6211544.0, |
| "eval_runtime": 0.4145, |
| "eval_samples_per_second": 38.602, |
| "eval_steps_per_second": 4.825, |
| "step": 900 |
| }, |
| { |
| "entropy": 2.4157155036926268, |
| "epoch": 0.10961214165261383, |
| "grad_norm": 2.96875, |
| "learning_rate": 9.903877821379573e-05, |
| "loss": 2.421117401123047, |
| "mean_token_accuracy": 0.5020445615053177, |
| "num_tokens": 6278991.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 2.4330294132232666, |
| "epoch": 0.11081667068176343, |
| "grad_norm": 5.5, |
| "learning_rate": 9.899951981268995e-05, |
| "loss": 2.429648590087891, |
| "mean_token_accuracy": 0.505955895781517, |
| "num_tokens": 6346983.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 2.4191589832305906, |
| "epoch": 0.11202119971091304, |
| "grad_norm": 2.375, |
| "learning_rate": 9.895948377362905e-05, |
| "loss": 2.4224737167358397, |
| "mean_token_accuracy": 0.5058120727539063, |
| "num_tokens": 6414537.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 2.4370260000228883, |
| "epoch": 0.11322572874006263, |
| "grad_norm": 3.46875, |
| "learning_rate": 9.891867073199768e-05, |
| "loss": 2.4694145202636717, |
| "mean_token_accuracy": 0.4992352068424225, |
| "num_tokens": 6481676.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 2.3773382186889647, |
| "epoch": 0.11443025776921224, |
| "grad_norm": 2.125, |
| "learning_rate": 9.88770813355118e-05, |
| "loss": 2.416706657409668, |
| "mean_token_accuracy": 0.5077957957983017, |
| "num_tokens": 6551157.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 2.4528631925582887, |
| "epoch": 0.11563478679836184, |
| "grad_norm": 2.609375, |
| "learning_rate": 9.883471624420832e-05, |
| "loss": 2.502288818359375, |
| "mean_token_accuracy": 0.49239401519298553, |
| "num_tokens": 6620994.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 2.4143874883651733, |
| "epoch": 0.11683931582751145, |
| "grad_norm": 2.71875, |
| "learning_rate": 9.879157613043474e-05, |
| "loss": 2.4089076995849608, |
| "mean_token_accuracy": 0.5080109655857086, |
| "num_tokens": 6687906.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 2.4254562616348267, |
| "epoch": 0.11804384485666104, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.874766167883836e-05, |
| "loss": 2.4518817901611327, |
| "mean_token_accuracy": 0.4991611152887344, |
| "num_tokens": 6755467.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 2.365486240386963, |
| "epoch": 0.11924837388581065, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.870297358635547e-05, |
| "loss": 2.3928062438964846, |
| "mean_token_accuracy": 0.5166857928037644, |
| "num_tokens": 6822936.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 2.363088536262512, |
| "epoch": 0.12045290291496025, |
| "grad_norm": 4.6875, |
| "learning_rate": 9.865751256220035e-05, |
| "loss": 2.4319067001342773, |
| "mean_token_accuracy": 0.5071658194065094, |
| "num_tokens": 6890260.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12045290291496025, |
| "eval_entropy": 3.0976409912109375, |
| "eval_loss": 3.1172609329223633, |
| "eval_mean_token_accuracy": 0.3839927762746811, |
| "eval_num_tokens": 6890260.0, |
| "eval_runtime": 0.3739, |
| "eval_samples_per_second": 42.789, |
| "eval_steps_per_second": 5.349, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.466799020767212, |
| "epoch": 0.12165743194410986, |
| "grad_norm": 3.09375, |
| "learning_rate": 9.861127932785386e-05, |
| "loss": 2.4565046310424803, |
| "mean_token_accuracy": 0.4963350534439087, |
| "num_tokens": 6956554.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 2.450888824462891, |
| "epoch": 0.12286196097325945, |
| "grad_norm": 2.5, |
| "learning_rate": 9.856427461705215e-05, |
| "loss": 2.460346221923828, |
| "mean_token_accuracy": 0.4983285039663315, |
| "num_tokens": 7026362.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 2.4059686422348023, |
| "epoch": 0.12406649000240906, |
| "grad_norm": 3.234375, |
| "learning_rate": 9.851649917577492e-05, |
| "loss": 2.420361328125, |
| "mean_token_accuracy": 0.505145075917244, |
| "num_tokens": 7098658.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 2.386690592765808, |
| "epoch": 0.12527101903155866, |
| "grad_norm": 2.9375, |
| "learning_rate": 9.846795376223358e-05, |
| "loss": 2.4095857620239256, |
| "mean_token_accuracy": 0.507191401720047, |
| "num_tokens": 7168145.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 2.31386137008667, |
| "epoch": 0.12647554806070826, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.841863914685933e-05, |
| "loss": 2.3461013793945313, |
| "mean_token_accuracy": 0.5163414418697357, |
| "num_tokens": 7236180.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.3228036403656005, |
| "epoch": 0.12768007708985787, |
| "grad_norm": 6.25, |
| "learning_rate": 9.836855611229074e-05, |
| "loss": 2.3331735610961912, |
| "mean_token_accuracy": 0.5201371729373931, |
| "num_tokens": 7305057.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 2.3945634365081787, |
| "epoch": 0.12888460611900746, |
| "grad_norm": 6.96875, |
| "learning_rate": 9.831770545336151e-05, |
| "loss": 2.402456855773926, |
| "mean_token_accuracy": 0.5082537710666657, |
| "num_tokens": 7375054.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 2.424406051635742, |
| "epoch": 0.13008913514815706, |
| "grad_norm": 3.28125, |
| "learning_rate": 9.826608797708778e-05, |
| "loss": 2.4312997817993165, |
| "mean_token_accuracy": 0.5069911390542984, |
| "num_tokens": 7442873.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 2.3220206022262575, |
| "epoch": 0.13129366417730667, |
| "grad_norm": 3.859375, |
| "learning_rate": 9.821370450265529e-05, |
| "loss": 2.3608503341674805, |
| "mean_token_accuracy": 0.5150643199682235, |
| "num_tokens": 7510091.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 2.3308310985565184, |
| "epoch": 0.13249819320645628, |
| "grad_norm": 2.359375, |
| "learning_rate": 9.81605558614064e-05, |
| "loss": 2.3494470596313475, |
| "mean_token_accuracy": 0.5126039475202561, |
| "num_tokens": 7580058.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.13249819320645628, |
| "eval_entropy": 2.8751864433288574, |
| "eval_loss": 3.0592124462127686, |
| "eval_mean_token_accuracy": 0.4051549583673477, |
| "eval_num_tokens": 7580058.0, |
| "eval_runtime": 0.4025, |
| "eval_samples_per_second": 39.749, |
| "eval_steps_per_second": 4.969, |
| "step": 1100 |
| }, |
| { |
| "entropy": 2.207090878486633, |
| "epoch": 0.1337027222356059, |
| "grad_norm": 3.359375, |
| "learning_rate": 9.810664289682699e-05, |
| "loss": 2.2155645370483397, |
| "mean_token_accuracy": 0.5430127084255219, |
| "num_tokens": 7642123.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 2.3662970781326296, |
| "epoch": 0.13490725126475547, |
| "grad_norm": 3.46875, |
| "learning_rate": 9.80519664645329e-05, |
| "loss": 2.3862329483032227, |
| "mean_token_accuracy": 0.5138962984085083, |
| "num_tokens": 7710200.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 2.2497094869613647, |
| "epoch": 0.13611178029390508, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.799652743225654e-05, |
| "loss": 2.2575557708740233, |
| "mean_token_accuracy": 0.5318409651517868, |
| "num_tokens": 7781182.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 2.2917282342910767, |
| "epoch": 0.1373163093230547, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.794032667983293e-05, |
| "loss": 2.3167720794677735, |
| "mean_token_accuracy": 0.5249288141727447, |
| "num_tokens": 7849706.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 2.373197388648987, |
| "epoch": 0.1385208383522043, |
| "grad_norm": 2.609375, |
| "learning_rate": 9.78833650991859e-05, |
| "loss": 2.3759769439697265, |
| "mean_token_accuracy": 0.51370949447155, |
| "num_tokens": 7916623.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.3291378736495973, |
| "epoch": 0.13972536738135388, |
| "grad_norm": 3.796875, |
| "learning_rate": 9.782564359431385e-05, |
| "loss": 2.372611236572266, |
| "mean_token_accuracy": 0.5120194524526596, |
| "num_tokens": 7984218.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 2.244597578048706, |
| "epoch": 0.1409298964105035, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.776716308127539e-05, |
| "loss": 2.2457481384277345, |
| "mean_token_accuracy": 0.5323881536722184, |
| "num_tokens": 8053595.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 2.381738781929016, |
| "epoch": 0.1421344254396531, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.770792448817485e-05, |
| "loss": 2.382204055786133, |
| "mean_token_accuracy": 0.5139793962240219, |
| "num_tokens": 8121149.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 2.3032596588134764, |
| "epoch": 0.1433389544688027, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.764792875514756e-05, |
| "loss": 2.3296224594116213, |
| "mean_token_accuracy": 0.5209127068519592, |
| "num_tokens": 8188046.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 2.3109251260757446, |
| "epoch": 0.1445434834979523, |
| "grad_norm": 3.3125, |
| "learning_rate": 9.758717683434484e-05, |
| "loss": 2.332454872131348, |
| "mean_token_accuracy": 0.5167243659496308, |
| "num_tokens": 8258677.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1445434834979523, |
| "eval_entropy": 3.054787278175354, |
| "eval_loss": 3.034820556640625, |
| "eval_mean_token_accuracy": 0.4018402099609375, |
| "eval_num_tokens": 8258677.0, |
| "eval_runtime": 0.4108, |
| "eval_samples_per_second": 38.948, |
| "eval_steps_per_second": 4.869, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.392910861968994, |
| "epoch": 0.1457480125271019, |
| "grad_norm": 2.28125, |
| "learning_rate": 9.752566968991901e-05, |
| "loss": 2.428352928161621, |
| "mean_token_accuracy": 0.508216142654419, |
| "num_tokens": 8325649.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 2.313650631904602, |
| "epoch": 0.1469525415562515, |
| "grad_norm": 3.03125, |
| "learning_rate": 9.746340829800799e-05, |
| "loss": 2.3404510498046873, |
| "mean_token_accuracy": 0.5221627295017243, |
| "num_tokens": 8394365.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 2.341995906829834, |
| "epoch": 0.14815707058540112, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.740039364671987e-05, |
| "loss": 2.3398111343383787, |
| "mean_token_accuracy": 0.5121251910924911, |
| "num_tokens": 8464333.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 2.259868335723877, |
| "epoch": 0.1493615996145507, |
| "grad_norm": 2.484375, |
| "learning_rate": 9.733662673611719e-05, |
| "loss": 2.282248878479004, |
| "mean_token_accuracy": 0.5302321165800095, |
| "num_tokens": 8531603.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 2.3321637392044066, |
| "epoch": 0.1505661286437003, |
| "grad_norm": 3.59375, |
| "learning_rate": 9.727210857820108e-05, |
| "loss": 2.371327590942383, |
| "mean_token_accuracy": 0.516041025519371, |
| "num_tokens": 8600388.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 2.351721405982971, |
| "epoch": 0.15177065767284992, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.720684019689524e-05, |
| "loss": 2.351237487792969, |
| "mean_token_accuracy": 0.5172569751739502, |
| "num_tokens": 8672129.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 2.2653574705123902, |
| "epoch": 0.15297518670199953, |
| "grad_norm": 4.8125, |
| "learning_rate": 9.714082262802961e-05, |
| "loss": 2.297392463684082, |
| "mean_token_accuracy": 0.523885440826416, |
| "num_tokens": 8743632.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 2.293268084526062, |
| "epoch": 0.1541797157311491, |
| "grad_norm": 2.59375, |
| "learning_rate": 9.707405691932402e-05, |
| "loss": 2.3110326766967773, |
| "mean_token_accuracy": 0.5206148803234101, |
| "num_tokens": 8813353.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 2.387986731529236, |
| "epoch": 0.15538424476029872, |
| "grad_norm": 3.484375, |
| "learning_rate": 9.700654413037144e-05, |
| "loss": 2.3922283172607424, |
| "mean_token_accuracy": 0.513523331284523, |
| "num_tokens": 8878273.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 2.2815721154212953, |
| "epoch": 0.15658877378944833, |
| "grad_norm": 3.171875, |
| "learning_rate": 9.693828533262135e-05, |
| "loss": 2.2999031066894533, |
| "mean_token_accuracy": 0.5234899431467056, |
| "num_tokens": 8947692.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.15658877378944833, |
| "eval_entropy": 2.8758058547973633, |
| "eval_loss": 2.97363018989563, |
| "eval_mean_token_accuracy": 0.40523606538772583, |
| "eval_num_tokens": 8947692.0, |
| "eval_runtime": 0.4094, |
| "eval_samples_per_second": 39.084, |
| "eval_steps_per_second": 4.886, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.253899431228638, |
| "epoch": 0.15779330281859794, |
| "grad_norm": 3.015625, |
| "learning_rate": 9.686928160936252e-05, |
| "loss": 2.275619316101074, |
| "mean_token_accuracy": 0.5288082420825958, |
| "num_tokens": 9016255.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 2.2969882249832154, |
| "epoch": 0.15899783184774752, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.679953405570601e-05, |
| "loss": 2.3192108154296873, |
| "mean_token_accuracy": 0.5225553333759307, |
| "num_tokens": 9085956.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 2.290460777282715, |
| "epoch": 0.16020236087689713, |
| "grad_norm": 2.765625, |
| "learning_rate": 9.672904377856765e-05, |
| "loss": 2.307216262817383, |
| "mean_token_accuracy": 0.5200891375541687, |
| "num_tokens": 9153647.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 2.2761851072311403, |
| "epoch": 0.16140688990604674, |
| "grad_norm": 2.84375, |
| "learning_rate": 9.665781189665052e-05, |
| "loss": 2.299959182739258, |
| "mean_token_accuracy": 0.5264901399612427, |
| "num_tokens": 9225290.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 2.3141539812088014, |
| "epoch": 0.16261141893519634, |
| "grad_norm": 2.71875, |
| "learning_rate": 9.658583954042726e-05, |
| "loss": 2.3225004196166994, |
| "mean_token_accuracy": 0.5269528299570083, |
| "num_tokens": 9291378.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.2969261407852173, |
| "epoch": 0.16381594796434595, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.651312785212204e-05, |
| "loss": 2.3121112823486327, |
| "mean_token_accuracy": 0.5209438920021057, |
| "num_tokens": 9360511.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 2.303641176223755, |
| "epoch": 0.16502047699349554, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.643967798569247e-05, |
| "loss": 2.324795722961426, |
| "mean_token_accuracy": 0.5191195398569107, |
| "num_tokens": 9430201.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 2.2442052602767943, |
| "epoch": 0.16622500602264514, |
| "grad_norm": 5.59375, |
| "learning_rate": 9.636549110681125e-05, |
| "loss": 2.247611427307129, |
| "mean_token_accuracy": 0.527609360218048, |
| "num_tokens": 9497142.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 2.2580633640289305, |
| "epoch": 0.16742953505179475, |
| "grad_norm": 2.75, |
| "learning_rate": 9.629056839284778e-05, |
| "loss": 2.307590866088867, |
| "mean_token_accuracy": 0.52379210293293, |
| "num_tokens": 9562440.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 2.3624070405960085, |
| "epoch": 0.16863406408094436, |
| "grad_norm": 2.671875, |
| "learning_rate": 9.62149110328493e-05, |
| "loss": 2.3632463455200194, |
| "mean_token_accuracy": 0.5173578560352325, |
| "num_tokens": 9631227.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.16863406408094436, |
| "eval_entropy": 2.853332757949829, |
| "eval_loss": 2.9722647666931152, |
| "eval_mean_token_accuracy": 0.41226455569267273, |
| "eval_num_tokens": 9631227.0, |
| "eval_runtime": 0.4149, |
| "eval_samples_per_second": 38.564, |
| "eval_steps_per_second": 4.821, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.4244832038879394, |
| "epoch": 0.16983859311009394, |
| "grad_norm": 3.828125, |
| "learning_rate": 9.613852022752217e-05, |
| "loss": 2.4088199615478514, |
| "mean_token_accuracy": 0.5088783591985703, |
| "num_tokens": 9701139.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 2.155960202217102, |
| "epoch": 0.17104312213924355, |
| "grad_norm": 3.296875, |
| "learning_rate": 9.606139718921277e-05, |
| "loss": 2.2044837951660154, |
| "mean_token_accuracy": 0.5425004243850708, |
| "num_tokens": 9768895.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 2.2430294871330263, |
| "epoch": 0.17224765116839316, |
| "grad_norm": 2.578125, |
| "learning_rate": 9.598354314188823e-05, |
| "loss": 2.2836578369140623, |
| "mean_token_accuracy": 0.5263251423835754, |
| "num_tokens": 9843163.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 2.1803964376449585, |
| "epoch": 0.17345218019754277, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.590495932111703e-05, |
| "loss": 2.1800159454345702, |
| "mean_token_accuracy": 0.5434668481349945, |
| "num_tokens": 9917015.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 2.299999499320984, |
| "epoch": 0.17465670922669235, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.582564697404936e-05, |
| "loss": 2.309121513366699, |
| "mean_token_accuracy": 0.5198373407125473, |
| "num_tokens": 9984193.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 2.2457360506057737, |
| "epoch": 0.17586123825584196, |
| "grad_norm": 3.0, |
| "learning_rate": 9.574560735939742e-05, |
| "loss": 2.2609222412109373, |
| "mean_token_accuracy": 0.5328970372676849, |
| "num_tokens": 10053695.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 2.2804399013519285, |
| "epoch": 0.17706576728499157, |
| "grad_norm": 3.046875, |
| "learning_rate": 9.56648417474153e-05, |
| "loss": 2.2925643920898438, |
| "mean_token_accuracy": 0.5243758141994477, |
| "num_tokens": 10124469.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 2.195032811164856, |
| "epoch": 0.17827029631414118, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.558335141987895e-05, |
| "loss": 2.2206499099731447, |
| "mean_token_accuracy": 0.5339515537023545, |
| "num_tokens": 10190836.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 2.3091806173324585, |
| "epoch": 0.17947482534329076, |
| "grad_norm": 2.84375, |
| "learning_rate": 9.550113767006578e-05, |
| "loss": 2.3288848876953123, |
| "mean_token_accuracy": 0.5248405992984772, |
| "num_tokens": 10255931.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 2.261967420578003, |
| "epoch": 0.18067935437244037, |
| "grad_norm": 4.125, |
| "learning_rate": 9.541820180273414e-05, |
| "loss": 2.234074592590332, |
| "mean_token_accuracy": 0.5378178834915162, |
| "num_tokens": 10323861.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.18067935437244037, |
| "eval_entropy": 2.846497416496277, |
| "eval_loss": 2.9528520107269287, |
| "eval_mean_token_accuracy": 0.41081106662750244, |
| "eval_num_tokens": 10323861.0, |
| "eval_runtime": 0.3965, |
| "eval_samples_per_second": 40.358, |
| "eval_steps_per_second": 5.045, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.1897090792655947, |
| "epoch": 0.18188388340158998, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.533454513410258e-05, |
| "loss": 2.2186933517456056, |
| "mean_token_accuracy": 0.5345078110694885, |
| "num_tokens": 10393479.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 2.2354114055633545, |
| "epoch": 0.1830884124307396, |
| "grad_norm": 5.78125, |
| "learning_rate": 9.525016899182905e-05, |
| "loss": 2.2384716033935548, |
| "mean_token_accuracy": 0.5285776913166046, |
| "num_tokens": 10460222.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 2.316632866859436, |
| "epoch": 0.18429294145988917, |
| "grad_norm": 3.109375, |
| "learning_rate": 9.516507471498972e-05, |
| "loss": 2.3353763580322267, |
| "mean_token_accuracy": 0.5186457633972168, |
| "num_tokens": 10530315.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 2.2956483721733094, |
| "epoch": 0.18549747048903878, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.507926365405784e-05, |
| "loss": 2.2976795196533204, |
| "mean_token_accuracy": 0.5249256074428559, |
| "num_tokens": 10597431.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 2.3254905223846434, |
| "epoch": 0.1867019995181884, |
| "grad_norm": 2.171875, |
| "learning_rate": 9.499273717088221e-05, |
| "loss": 2.334694480895996, |
| "mean_token_accuracy": 0.5172825694084168, |
| "num_tokens": 10667831.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.1734044551849365, |
| "epoch": 0.187906528547338, |
| "grad_norm": 2.40625, |
| "learning_rate": 9.490549663866563e-05, |
| "loss": 2.20830078125, |
| "mean_token_accuracy": 0.5397387742996216, |
| "num_tokens": 10736684.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 2.2810161113739014, |
| "epoch": 0.1891110575764876, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.48175434419431e-05, |
| "loss": 2.2679985046386717, |
| "mean_token_accuracy": 0.5284598618745804, |
| "num_tokens": 10803794.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 2.182316446304321, |
| "epoch": 0.1903155866056372, |
| "grad_norm": 2.546875, |
| "learning_rate": 9.472887897655976e-05, |
| "loss": 2.210857963562012, |
| "mean_token_accuracy": 0.5313417106866837, |
| "num_tokens": 10873181.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 2.236132097244263, |
| "epoch": 0.1915201156347868, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.463950464964886e-05, |
| "loss": 2.2548202514648437, |
| "mean_token_accuracy": 0.5304252445697785, |
| "num_tokens": 10941894.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 2.134193646907806, |
| "epoch": 0.1927246446639364, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.454942187960943e-05, |
| "loss": 2.155434226989746, |
| "mean_token_accuracy": 0.5477861285209655, |
| "num_tokens": 11010768.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1927246446639364, |
| "eval_entropy": 2.864907145500183, |
| "eval_loss": 2.914454936981201, |
| "eval_mean_token_accuracy": 0.4183240383863449, |
| "eval_num_tokens": 11010768.0, |
| "eval_runtime": 0.4288, |
| "eval_samples_per_second": 37.316, |
| "eval_steps_per_second": 4.664, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.188663959503174, |
| "epoch": 0.19392917369308602, |
| "grad_norm": 3.0625, |
| "learning_rate": 9.445863209608364e-05, |
| "loss": 2.195932960510254, |
| "mean_token_accuracy": 0.5427460491657257, |
| "num_tokens": 11082324.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 2.2581432461738586, |
| "epoch": 0.1951337027222356, |
| "grad_norm": 2.4375, |
| "learning_rate": 9.436713673993421e-05, |
| "loss": 2.2641986846923827, |
| "mean_token_accuracy": 0.5305707901716232, |
| "num_tokens": 11153380.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 2.2770965337753295, |
| "epoch": 0.1963382317513852, |
| "grad_norm": 17.625, |
| "learning_rate": 9.427493726322151e-05, |
| "loss": 2.3116901397705076, |
| "mean_token_accuracy": 0.5252400994300842, |
| "num_tokens": 11222740.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 2.2262574195861817, |
| "epoch": 0.19754276078053482, |
| "grad_norm": 2.71875, |
| "learning_rate": 9.418203512918058e-05, |
| "loss": 2.248627281188965, |
| "mean_token_accuracy": 0.533080717921257, |
| "num_tokens": 11292074.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 2.282968246936798, |
| "epoch": 0.19874728980968442, |
| "grad_norm": 4.8125, |
| "learning_rate": 9.40884318121978e-05, |
| "loss": 2.2738475799560547, |
| "mean_token_accuracy": 0.5312909007072448, |
| "num_tokens": 11363360.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.1889445066452025, |
| "epoch": 0.199951818838834, |
| "grad_norm": 2.0, |
| "learning_rate": 9.399412879778757e-05, |
| "loss": 2.1901546478271485, |
| "mean_token_accuracy": 0.5460667550563812, |
| "num_tokens": 11434578.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 2.2750983953475954, |
| "epoch": 0.20115634786798361, |
| "grad_norm": 2.5, |
| "learning_rate": 9.389912758256869e-05, |
| "loss": 2.2871360778808594, |
| "mean_token_accuracy": 0.5282805800437927, |
| "num_tokens": 11504260.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 2.1284833669662477, |
| "epoch": 0.20236087689713322, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.380342967424066e-05, |
| "loss": 2.1631650924682617, |
| "mean_token_accuracy": 0.5499304831027985, |
| "num_tokens": 11570786.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 2.210453176498413, |
| "epoch": 0.20356540592628283, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.370703659155969e-05, |
| "loss": 2.2247928619384765, |
| "mean_token_accuracy": 0.5347439706325531, |
| "num_tokens": 11640019.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 2.2434919834136964, |
| "epoch": 0.20476993495543241, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.360994986431465e-05, |
| "loss": 2.2586835861206054, |
| "mean_token_accuracy": 0.5289602160453797, |
| "num_tokens": 11709255.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.20476993495543241, |
| "eval_entropy": 2.8340182304382324, |
| "eval_loss": 2.9152491092681885, |
| "eval_mean_token_accuracy": 0.4188874661922455, |
| "eval_num_tokens": 11709255.0, |
| "eval_runtime": 0.4315, |
| "eval_samples_per_second": 37.078, |
| "eval_steps_per_second": 4.635, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.2331860780715944, |
| "epoch": 0.20597446398458202, |
| "grad_norm": 3.78125, |
| "learning_rate": 9.351217103330276e-05, |
| "loss": 2.2479530334472657, |
| "mean_token_accuracy": 0.5310386747121811, |
| "num_tokens": 11776203.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 2.3182583332061766, |
| "epoch": 0.20717899301373163, |
| "grad_norm": 2.0, |
| "learning_rate": 9.341370165030518e-05, |
| "loss": 2.3128843307495117, |
| "mean_token_accuracy": 0.52462497651577, |
| "num_tokens": 11843796.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 2.193679094314575, |
| "epoch": 0.20838352204288124, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.331454327806228e-05, |
| "loss": 2.241754722595215, |
| "mean_token_accuracy": 0.5341065913438797, |
| "num_tokens": 11912238.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 2.163426327705383, |
| "epoch": 0.20958805107203082, |
| "grad_norm": 1.875, |
| "learning_rate": 9.321469749024895e-05, |
| "loss": 2.169114875793457, |
| "mean_token_accuracy": 0.5475439190864563, |
| "num_tokens": 11981947.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 2.2436394333839416, |
| "epoch": 0.21079258010118043, |
| "grad_norm": 2.375, |
| "learning_rate": 9.311416587144961e-05, |
| "loss": 2.248809814453125, |
| "mean_token_accuracy": 0.5373907715082169, |
| "num_tokens": 12049969.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 2.1564717292785645, |
| "epoch": 0.21199710913033004, |
| "grad_norm": 2.6875, |
| "learning_rate": 9.301295001713298e-05, |
| "loss": 2.1708261489868166, |
| "mean_token_accuracy": 0.5387319475412369, |
| "num_tokens": 12118220.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 2.1580832481384276, |
| "epoch": 0.21320163815947965, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.291105153362685e-05, |
| "loss": 2.1711267471313476, |
| "mean_token_accuracy": 0.5511200129985809, |
| "num_tokens": 12188027.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 2.1681873559951783, |
| "epoch": 0.21440616718862926, |
| "grad_norm": 2.640625, |
| "learning_rate": 9.280847203809254e-05, |
| "loss": 2.167458152770996, |
| "mean_token_accuracy": 0.5453263878822326, |
| "num_tokens": 12257397.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 2.130517268180847, |
| "epoch": 0.21561069621777884, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.270521315849928e-05, |
| "loss": 2.1511137008666994, |
| "mean_token_accuracy": 0.5517256915569305, |
| "num_tokens": 12327732.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 2.1457592606544496, |
| "epoch": 0.21681522524692845, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.260127653359826e-05, |
| "loss": 2.172018814086914, |
| "mean_token_accuracy": 0.5436123073101043, |
| "num_tokens": 12399054.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.21681522524692845, |
| "eval_entropy": 2.959686517715454, |
| "eval_loss": 2.8896708488464355, |
| "eval_mean_token_accuracy": 0.42187774181365967, |
| "eval_num_tokens": 12399054.0, |
| "eval_runtime": 0.409, |
| "eval_samples_per_second": 39.117, |
| "eval_steps_per_second": 4.89, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.2030872344970702, |
| "epoch": 0.21801975427607806, |
| "grad_norm": 3.796875, |
| "learning_rate": 9.249666381289678e-05, |
| "loss": 2.205454444885254, |
| "mean_token_accuracy": 0.5406596213579178, |
| "num_tokens": 12468799.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 2.1178462505340576, |
| "epoch": 0.21922428330522767, |
| "grad_norm": 2.65625, |
| "learning_rate": 9.239137665663201e-05, |
| "loss": 2.1415390014648437, |
| "mean_token_accuracy": 0.5523227155208588, |
| "num_tokens": 12535527.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 2.3107515215873717, |
| "epoch": 0.22042881233437725, |
| "grad_norm": 2.765625, |
| "learning_rate": 9.228541673574453e-05, |
| "loss": 2.314019775390625, |
| "mean_token_accuracy": 0.5233344733715057, |
| "num_tokens": 12604182.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 2.2338968396186827, |
| "epoch": 0.22163334136352686, |
| "grad_norm": 3.328125, |
| "learning_rate": 9.217878573185202e-05, |
| "loss": 2.26546630859375, |
| "mean_token_accuracy": 0.5350278943777085, |
| "num_tokens": 12674202.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 2.2074631214141847, |
| "epoch": 0.22283787039267647, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.207148533722234e-05, |
| "loss": 2.2138628005981444, |
| "mean_token_accuracy": 0.537048852443695, |
| "num_tokens": 12741365.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 2.224149250984192, |
| "epoch": 0.22404239942182608, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.196351725474693e-05, |
| "loss": 2.231917381286621, |
| "mean_token_accuracy": 0.5344621896743774, |
| "num_tokens": 12807216.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 2.1585075497627257, |
| "epoch": 0.22524692845097566, |
| "grad_norm": 3.859375, |
| "learning_rate": 9.185488319791352e-05, |
| "loss": 2.1571178436279297, |
| "mean_token_accuracy": 0.5523216247558593, |
| "num_tokens": 12873278.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 2.2243062257766724, |
| "epoch": 0.22645145748012527, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.174558489077917e-05, |
| "loss": 2.2570470809936523, |
| "mean_token_accuracy": 0.5269607186317444, |
| "num_tokens": 12944925.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 2.182615005970001, |
| "epoch": 0.22765598650927488, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.163562406794272e-05, |
| "loss": 2.1952205657958985, |
| "mean_token_accuracy": 0.5417321711778641, |
| "num_tokens": 13010487.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 2.111977481842041, |
| "epoch": 0.22886051553842449, |
| "grad_norm": 4.90625, |
| "learning_rate": 9.152500247451743e-05, |
| "loss": 2.155902290344238, |
| "mean_token_accuracy": 0.5541462600231171, |
| "num_tokens": 13080109.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.22886051553842449, |
| "eval_entropy": 2.8878990411758423, |
| "eval_loss": 2.8635730743408203, |
| "eval_mean_token_accuracy": 0.4313286989927292, |
| "eval_num_tokens": 13080109.0, |
| "eval_runtime": 0.4423, |
| "eval_samples_per_second": 36.175, |
| "eval_steps_per_second": 4.522, |
| "step": 1900 |
| }, |
| { |
| "entropy": 2.082282876968384, |
| "epoch": 0.23006504456757407, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.141372186610311e-05, |
| "loss": 2.0843496322631836, |
| "mean_token_accuracy": 0.5654984533786773, |
| "num_tokens": 13146870.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 2.183061623573303, |
| "epoch": 0.23126957359672368, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.13017840087584e-05, |
| "loss": 2.2101274490356446, |
| "mean_token_accuracy": 0.5382312715053559, |
| "num_tokens": 13219259.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 2.2742142200469972, |
| "epoch": 0.23247410262587329, |
| "grad_norm": 3.859375, |
| "learning_rate": 9.118919067897268e-05, |
| "loss": 2.2773338317871095, |
| "mean_token_accuracy": 0.5264018833637237, |
| "num_tokens": 13290411.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 2.0873818516731264, |
| "epoch": 0.2336786316550229, |
| "grad_norm": 2.828125, |
| "learning_rate": 9.107594366363789e-05, |
| "loss": 2.0962757110595702, |
| "mean_token_accuracy": 0.5615902185440064, |
| "num_tokens": 13357767.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 2.0739306569099427, |
| "epoch": 0.23488316068417248, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.096204476002015e-05, |
| "loss": 2.087767219543457, |
| "mean_token_accuracy": 0.5554501116275787, |
| "num_tokens": 13423509.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 2.2131853818893434, |
| "epoch": 0.23608768971332209, |
| "grad_norm": 2.515625, |
| "learning_rate": 9.084749577573128e-05, |
| "loss": 2.209881401062012, |
| "mean_token_accuracy": 0.5431510090827942, |
| "num_tokens": 13491765.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 2.1343575000762938, |
| "epoch": 0.2372922187424717, |
| "grad_norm": 2.890625, |
| "learning_rate": 9.073229852870005e-05, |
| "loss": 2.1772743225097657, |
| "mean_token_accuracy": 0.5383218407630921, |
| "num_tokens": 13559252.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 2.165915012359619, |
| "epoch": 0.2384967477716213, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.06164548471434e-05, |
| "loss": 2.1949913024902346, |
| "mean_token_accuracy": 0.5370178014039994, |
| "num_tokens": 13627185.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 2.219644045829773, |
| "epoch": 0.23970127680077088, |
| "grad_norm": 3.8125, |
| "learning_rate": 9.049996656953741e-05, |
| "loss": 2.2189016342163086, |
| "mean_token_accuracy": 0.5392367959022522, |
| "num_tokens": 13697307.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 2.163391101360321, |
| "epoch": 0.2409058058299205, |
| "grad_norm": 3.390625, |
| "learning_rate": 9.038283554458803e-05, |
| "loss": 2.1824737548828126, |
| "mean_token_accuracy": 0.5462636172771453, |
| "num_tokens": 13761202.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2409058058299205, |
| "eval_entropy": 2.7357964515686035, |
| "eval_loss": 2.8540592193603516, |
| "eval_mean_token_accuracy": 0.426966056227684, |
| "eval_num_tokens": 13761202.0, |
| "eval_runtime": 0.4079, |
| "eval_samples_per_second": 39.222, |
| "eval_steps_per_second": 4.903, |
| "step": 2000 |
| }, |
| { |
| "entropy": 2.2309467077255247, |
| "epoch": 0.2421103348590701, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.026506363120189e-05, |
| "loss": 2.2542724609375, |
| "mean_token_accuracy": 0.5328327417373657, |
| "num_tokens": 13830377.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 2.1426926970481874, |
| "epoch": 0.2433148638882197, |
| "grad_norm": 3.265625, |
| "learning_rate": 9.014665269845666e-05, |
| "loss": 2.164677047729492, |
| "mean_token_accuracy": 0.5467932879924774, |
| "num_tokens": 13900414.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 2.15235835313797, |
| "epoch": 0.24451939291736932, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.002760462557152e-05, |
| "loss": 2.1506536483764647, |
| "mean_token_accuracy": 0.5472699105739594, |
| "num_tokens": 13969474.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 2.112102437019348, |
| "epoch": 0.2457239219465189, |
| "grad_norm": 3.03125, |
| "learning_rate": 8.99079213018772e-05, |
| "loss": 2.1196800231933595, |
| "mean_token_accuracy": 0.5518890619277954, |
| "num_tokens": 14036404.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 2.1371890902519226, |
| "epoch": 0.2469284509756685, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.978760462678611e-05, |
| "loss": 2.153955841064453, |
| "mean_token_accuracy": 0.5453378796577454, |
| "num_tokens": 14107414.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 2.1680827260017397, |
| "epoch": 0.24813298000481812, |
| "grad_norm": 2.28125, |
| "learning_rate": 8.966665650976209e-05, |
| "loss": 2.1719219207763674, |
| "mean_token_accuracy": 0.5435125470161438, |
| "num_tokens": 14176407.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 2.1545125603675843, |
| "epoch": 0.24933750903396773, |
| "grad_norm": 2.140625, |
| "learning_rate": 8.954507887029026e-05, |
| "loss": 2.1777198791503904, |
| "mean_token_accuracy": 0.5442109107971191, |
| "num_tokens": 14247040.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 2.16602623462677, |
| "epoch": 0.2505420380631173, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.942287363784638e-05, |
| "loss": 2.1973800659179688, |
| "mean_token_accuracy": 0.5392501056194305, |
| "num_tokens": 14316710.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 2.174078607559204, |
| "epoch": 0.25174656709226695, |
| "grad_norm": 2.25, |
| "learning_rate": 8.930004275186634e-05, |
| "loss": 2.182669258117676, |
| "mean_token_accuracy": 0.5412575840950012, |
| "num_tokens": 14385574.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 2.18578360080719, |
| "epoch": 0.25295109612141653, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.917658816171534e-05, |
| "loss": 2.2020822525024415, |
| "mean_token_accuracy": 0.5368836164474488, |
| "num_tokens": 14458459.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.25295109612141653, |
| "eval_entropy": 2.735661745071411, |
| "eval_loss": 2.849364757537842, |
| "eval_mean_token_accuracy": 0.42470361292362213, |
| "eval_num_tokens": 14458459.0, |
| "eval_runtime": 0.4067, |
| "eval_samples_per_second": 39.339, |
| "eval_steps_per_second": 4.917, |
| "step": 2100 |
| }, |
| { |
| "entropy": 2.0653456449508667, |
| "epoch": 0.2541556251505661, |
| "grad_norm": 2.5, |
| "learning_rate": 8.905251182665694e-05, |
| "loss": 2.071195602416992, |
| "mean_token_accuracy": 0.5605430185794831, |
| "num_tokens": 14528612.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 2.1143752455711367, |
| "epoch": 0.25536015417971575, |
| "grad_norm": 37.5, |
| "learning_rate": 8.892781571582209e-05, |
| "loss": 2.1292278289794924, |
| "mean_token_accuracy": 0.5506574988365174, |
| "num_tokens": 14601246.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 2.1357630252838136, |
| "epoch": 0.25656468320886533, |
| "grad_norm": 2.984375, |
| "learning_rate": 8.880250180817765e-05, |
| "loss": 2.1510900497436523, |
| "mean_token_accuracy": 0.5498823761940003, |
| "num_tokens": 14669430.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 2.191697120666504, |
| "epoch": 0.2577692122380149, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.867657209249515e-05, |
| "loss": 2.210224914550781, |
| "mean_token_accuracy": 0.542282658815384, |
| "num_tokens": 14736231.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 2.1863580465316774, |
| "epoch": 0.25897374126716455, |
| "grad_norm": 2.265625, |
| "learning_rate": 8.855002856731927e-05, |
| "loss": 2.2168277740478515, |
| "mean_token_accuracy": 0.5371293991804122, |
| "num_tokens": 14800572.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 2.128365468978882, |
| "epoch": 0.26017827029631413, |
| "grad_norm": 1.9921875, |
| "learning_rate": 8.842287324093594e-05, |
| "loss": 2.1462024688720702, |
| "mean_token_accuracy": 0.5463616043329239, |
| "num_tokens": 14868668.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 2.1524484634399412, |
| "epoch": 0.26138279932546377, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.829510813134063e-05, |
| "loss": 2.1351104736328126, |
| "mean_token_accuracy": 0.5478598177433014, |
| "num_tokens": 14938978.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 2.198789381980896, |
| "epoch": 0.26258732835461335, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.816673526620622e-05, |
| "loss": 2.2329919815063475, |
| "mean_token_accuracy": 0.5322647511959075, |
| "num_tokens": 15007348.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 2.1342248201370237, |
| "epoch": 0.26379185738376293, |
| "grad_norm": 3.375, |
| "learning_rate": 8.80377566828509e-05, |
| "loss": 2.148569679260254, |
| "mean_token_accuracy": 0.5494184136390686, |
| "num_tokens": 15076908.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 2.0989627480506896, |
| "epoch": 0.26499638641291257, |
| "grad_norm": 2.453125, |
| "learning_rate": 8.790817442820578e-05, |
| "loss": 2.1174949645996093, |
| "mean_token_accuracy": 0.5576492011547088, |
| "num_tokens": 15145467.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.26499638641291257, |
| "eval_entropy": 2.708386778831482, |
| "eval_loss": 2.8184127807617188, |
| "eval_mean_token_accuracy": 0.42567259073257446, |
| "eval_num_tokens": 15145467.0, |
| "eval_runtime": 0.429, |
| "eval_samples_per_second": 37.293, |
| "eval_steps_per_second": 4.662, |
| "step": 2200 |
| }, |
| { |
| "entropy": 2.0987525939941407, |
| "epoch": 0.26620091544206215, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.777799055878243e-05, |
| "loss": 2.1152496337890625, |
| "mean_token_accuracy": 0.5476809203624725, |
| "num_tokens": 15215998.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 2.0828201174736023, |
| "epoch": 0.2674054444712118, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.764720714064025e-05, |
| "loss": 2.0910402297973634, |
| "mean_token_accuracy": 0.5581553757190705, |
| "num_tokens": 15284221.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 2.1438043117523193, |
| "epoch": 0.26860997350036137, |
| "grad_norm": 84.5, |
| "learning_rate": 8.751582624935366e-05, |
| "loss": 2.159671401977539, |
| "mean_token_accuracy": 0.5466212332248688, |
| "num_tokens": 15351682.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 2.1918447971343995, |
| "epoch": 0.26981450252951095, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.738384996997917e-05, |
| "loss": 2.194806671142578, |
| "mean_token_accuracy": 0.5432702243328095, |
| "num_tokens": 15419986.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 2.1559926509857177, |
| "epoch": 0.2710190315586606, |
| "grad_norm": 2.25, |
| "learning_rate": 8.725128039702226e-05, |
| "loss": 2.170298385620117, |
| "mean_token_accuracy": 0.5422053426504135, |
| "num_tokens": 15490655.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 2.112238824367523, |
| "epoch": 0.27222356058781016, |
| "grad_norm": 2.71875, |
| "learning_rate": 8.711811963440422e-05, |
| "loss": 2.1344348907470705, |
| "mean_token_accuracy": 0.5519571542739868, |
| "num_tokens": 15558940.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 2.122793698310852, |
| "epoch": 0.27342808961695975, |
| "grad_norm": 2.53125, |
| "learning_rate": 8.698436979542866e-05, |
| "loss": 2.1378543853759764, |
| "mean_token_accuracy": 0.5491474151611329, |
| "num_tokens": 15626777.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 2.1574150919914246, |
| "epoch": 0.2746326186461094, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.685003300274807e-05, |
| "loss": 2.168122100830078, |
| "mean_token_accuracy": 0.5528741180896759, |
| "num_tokens": 15695490.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 2.086066424846649, |
| "epoch": 0.27583714767525896, |
| "grad_norm": 2.578125, |
| "learning_rate": 8.671511138833002e-05, |
| "loss": 2.095340538024902, |
| "mean_token_accuracy": 0.5564016461372375, |
| "num_tokens": 15765669.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 2.1277116537094116, |
| "epoch": 0.2770416767044086, |
| "grad_norm": 1.640625, |
| "learning_rate": 8.657960709342345e-05, |
| "loss": 2.1454301834106446, |
| "mean_token_accuracy": 0.5443755030632019, |
| "num_tokens": 15837031.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2770416767044086, |
| "eval_entropy": 2.6647491455078125, |
| "eval_loss": 2.8141632080078125, |
| "eval_mean_token_accuracy": 0.42728830873966217, |
| "eval_num_tokens": 15837031.0, |
| "eval_runtime": 0.5308, |
| "eval_samples_per_second": 30.146, |
| "eval_steps_per_second": 3.768, |
| "step": 2300 |
| }, |
| { |
| "entropy": 2.0806057333946226, |
| "epoch": 0.2782462057335582, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.644352226852457e-05, |
| "loss": 2.100520133972168, |
| "mean_token_accuracy": 0.5565994262695313, |
| "num_tokens": 15905623.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 2.071013700962067, |
| "epoch": 0.27945073476270776, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.63068590733428e-05, |
| "loss": 2.0833396911621094, |
| "mean_token_accuracy": 0.5573381006717681, |
| "num_tokens": 15974613.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 2.037881815433502, |
| "epoch": 0.2806552637918574, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.616961967676653e-05, |
| "loss": 2.0440486907958983, |
| "mean_token_accuracy": 0.5672590255737304, |
| "num_tokens": 16043789.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 2.1182125926017763, |
| "epoch": 0.281859792821007, |
| "grad_norm": 2.796875, |
| "learning_rate": 8.603180625682856e-05, |
| "loss": 2.1264150619506834, |
| "mean_token_accuracy": 0.5492606639862061, |
| "num_tokens": 16113815.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 2.1378235816955566, |
| "epoch": 0.28306432185015656, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.589342100067171e-05, |
| "loss": 2.1428293228149413, |
| "mean_token_accuracy": 0.5489014446735382, |
| "num_tokens": 16184763.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 2.0700816035270693, |
| "epoch": 0.2842688508793062, |
| "grad_norm": 3.234375, |
| "learning_rate": 8.575446610451396e-05, |
| "loss": 2.101584053039551, |
| "mean_token_accuracy": 0.5581894755363465, |
| "num_tokens": 16254530.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 2.141230809688568, |
| "epoch": 0.2854733799084558, |
| "grad_norm": 2.234375, |
| "learning_rate": 8.561494377361371e-05, |
| "loss": 2.1607639312744142, |
| "mean_token_accuracy": 0.5493471801280976, |
| "num_tokens": 16322088.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 2.072463631629944, |
| "epoch": 0.2866779089376054, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.547485622223468e-05, |
| "loss": 2.0758430480957033, |
| "mean_token_accuracy": 0.5595114946365356, |
| "num_tokens": 16393167.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 2.0443200945854185, |
| "epoch": 0.287882437966755, |
| "grad_norm": 2.859375, |
| "learning_rate": 8.533420567361084e-05, |
| "loss": 2.053724479675293, |
| "mean_token_accuracy": 0.5662404716014862, |
| "num_tokens": 16460254.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 2.0922821521759034, |
| "epoch": 0.2890869669959046, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.519299435991111e-05, |
| "loss": 2.1047943115234373, |
| "mean_token_accuracy": 0.5579495280981064, |
| "num_tokens": 16525561.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2890869669959046, |
| "eval_entropy": 2.6770076751708984, |
| "eval_loss": 2.8334004878997803, |
| "eval_mean_token_accuracy": 0.4258348345756531, |
| "eval_num_tokens": 16525561.0, |
| "eval_runtime": 0.4008, |
| "eval_samples_per_second": 39.923, |
| "eval_steps_per_second": 4.99, |
| "step": 2400 |
| }, |
| { |
| "entropy": 2.0860785961151125, |
| "epoch": 0.2902914960250542, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.505122452220393e-05, |
| "loss": 2.0898111343383787, |
| "mean_token_accuracy": 0.5583101451396942, |
| "num_tokens": 16595852.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 2.0555111885070803, |
| "epoch": 0.2914960250542038, |
| "grad_norm": 1.75, |
| "learning_rate": 8.490889841042167e-05, |
| "loss": 2.070912742614746, |
| "mean_token_accuracy": 0.5591276168823243, |
| "num_tokens": 16666291.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 2.0737984776496887, |
| "epoch": 0.2927005540833534, |
| "grad_norm": 4.8125, |
| "learning_rate": 8.4766018283325e-05, |
| "loss": 2.1162538528442383, |
| "mean_token_accuracy": 0.5509154379367829, |
| "num_tokens": 16736493.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 2.1692824363708496, |
| "epoch": 0.293905083112503, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.462258640846691e-05, |
| "loss": 2.1681695938110352, |
| "mean_token_accuracy": 0.549293601512909, |
| "num_tokens": 16805384.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 2.118699336051941, |
| "epoch": 0.2951096121416526, |
| "grad_norm": 2.359375, |
| "learning_rate": 8.447860506215691e-05, |
| "loss": 2.1451501846313477, |
| "mean_token_accuracy": 0.5516745418310165, |
| "num_tokens": 16875894.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 2.084539461135864, |
| "epoch": 0.29631414117080224, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.433407652942469e-05, |
| "loss": 2.085173225402832, |
| "mean_token_accuracy": 0.560731440782547, |
| "num_tokens": 16940338.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 2.147885191440582, |
| "epoch": 0.2975186701999518, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.4189003103984e-05, |
| "loss": 2.170233726501465, |
| "mean_token_accuracy": 0.5484267741441726, |
| "num_tokens": 17009690.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 2.0895490050315857, |
| "epoch": 0.2987231992291014, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.404338708819625e-05, |
| "loss": 2.098209190368652, |
| "mean_token_accuracy": 0.5598911285400391, |
| "num_tokens": 17079699.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 2.047213041782379, |
| "epoch": 0.29992772825825104, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.389723079303387e-05, |
| "loss": 2.0544561386108398, |
| "mean_token_accuracy": 0.5627081871032715, |
| "num_tokens": 17149769.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 2.0825024962425234, |
| "epoch": 0.3011322572874006, |
| "grad_norm": 3.59375, |
| "learning_rate": 8.375053653804373e-05, |
| "loss": 2.0965812683105467, |
| "mean_token_accuracy": 0.5609676122665406, |
| "num_tokens": 17216861.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3011322572874006, |
| "eval_entropy": 2.6770654916763306, |
| "eval_loss": 2.7862625122070312, |
| "eval_mean_token_accuracy": 0.4347201734781265, |
| "eval_num_tokens": 17216861.0, |
| "eval_runtime": 0.4051, |
| "eval_samples_per_second": 39.492, |
| "eval_steps_per_second": 4.937, |
| "step": 2500 |
| }, |
| { |
| "entropy": 2.03642942905426, |
| "epoch": 0.30233678631655025, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.36033066513103e-05, |
| "loss": 2.076190376281738, |
| "mean_token_accuracy": 0.565330320596695, |
| "num_tokens": 17284775.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 2.105399656295776, |
| "epoch": 0.30354131534569984, |
| "grad_norm": 2.0, |
| "learning_rate": 8.345554346941866e-05, |
| "loss": 2.1103307723999025, |
| "mean_token_accuracy": 0.556881707906723, |
| "num_tokens": 17353646.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 1.988648521900177, |
| "epoch": 0.3047458443748494, |
| "grad_norm": 2.421875, |
| "learning_rate": 8.330724933741749e-05, |
| "loss": 2.002836990356445, |
| "mean_token_accuracy": 0.5724063634872436, |
| "num_tokens": 17418925.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 2.1860252976417542, |
| "epoch": 0.30595037340399905, |
| "grad_norm": 4.4375, |
| "learning_rate": 8.315842660878181e-05, |
| "loss": 2.1974233627319335, |
| "mean_token_accuracy": 0.5398978978395462, |
| "num_tokens": 17485994.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 2.1428608536720275, |
| "epoch": 0.30715490243314864, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.300907764537565e-05, |
| "loss": 2.1552366256713866, |
| "mean_token_accuracy": 0.5474565207958222, |
| "num_tokens": 17556847.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 2.0490808844566346, |
| "epoch": 0.3083594314622982, |
| "grad_norm": 15.75, |
| "learning_rate": 8.285920481741448e-05, |
| "loss": 2.075417137145996, |
| "mean_token_accuracy": 0.5619370639324188, |
| "num_tokens": 17623063.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 2.08027925491333, |
| "epoch": 0.30956396049144785, |
| "grad_norm": 6.4375, |
| "learning_rate": 8.270881050342775e-05, |
| "loss": 2.0900177001953124, |
| "mean_token_accuracy": 0.5613270044326782, |
| "num_tokens": 17691470.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 2.0812355279922485, |
| "epoch": 0.31076848952059744, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.255789709022104e-05, |
| "loss": 2.0924118041992186, |
| "mean_token_accuracy": 0.560305255651474, |
| "num_tokens": 17757875.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 1.9864863872528076, |
| "epoch": 0.31197301854974707, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.240646697283818e-05, |
| "loss": 2.001949119567871, |
| "mean_token_accuracy": 0.5692925453186035, |
| "num_tokens": 17825941.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 2.031517136096954, |
| "epoch": 0.31317754757889665, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.225452255452325e-05, |
| "loss": 2.0528676986694334, |
| "mean_token_accuracy": 0.5633283078670501, |
| "num_tokens": 17899022.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.31317754757889665, |
| "eval_entropy": 2.6427732706069946, |
| "eval_loss": 2.7711801528930664, |
| "eval_mean_token_accuracy": 0.4326988756656647, |
| "eval_num_tokens": 17899022.0, |
| "eval_runtime": 0.398, |
| "eval_samples_per_second": 40.197, |
| "eval_steps_per_second": 5.025, |
| "step": 2600 |
| }, |
| { |
| "entropy": 2.148902940750122, |
| "epoch": 0.31438207660804623, |
| "grad_norm": 2.421875, |
| "learning_rate": 8.210206624668249e-05, |
| "loss": 2.164246940612793, |
| "mean_token_accuracy": 0.5463122427463531, |
| "num_tokens": 17968591.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 2.1614266276359557, |
| "epoch": 0.31558660563719587, |
| "grad_norm": 1.75, |
| "learning_rate": 8.194910046884595e-05, |
| "loss": 2.161795997619629, |
| "mean_token_accuracy": 0.5490990400314331, |
| "num_tokens": 18035770.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 2.0735299348831178, |
| "epoch": 0.31679113466634545, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.179562764862918e-05, |
| "loss": 2.090145492553711, |
| "mean_token_accuracy": 0.561646330356598, |
| "num_tokens": 18102572.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 2.059981143474579, |
| "epoch": 0.31799566369549503, |
| "grad_norm": 2.25, |
| "learning_rate": 8.16416502216946e-05, |
| "loss": 2.070825958251953, |
| "mean_token_accuracy": 0.559648597240448, |
| "num_tokens": 18168404.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 2.0211940169334413, |
| "epoch": 0.31920019272464467, |
| "grad_norm": 2.46875, |
| "learning_rate": 8.148717063171292e-05, |
| "loss": 2.0402278900146484, |
| "mean_token_accuracy": 0.5664457201957702, |
| "num_tokens": 18238822.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 2.115056884288788, |
| "epoch": 0.32040472175379425, |
| "grad_norm": 2.453125, |
| "learning_rate": 8.133219133032432e-05, |
| "loss": 2.125631904602051, |
| "mean_token_accuracy": 0.5588094294071198, |
| "num_tokens": 18304127.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 2.096850049495697, |
| "epoch": 0.3216092507829439, |
| "grad_norm": 2.875, |
| "learning_rate": 8.117671477709962e-05, |
| "loss": 2.1195550918579102, |
| "mean_token_accuracy": 0.5529998481273651, |
| "num_tokens": 18368937.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 2.0709152102470396, |
| "epoch": 0.32281377981209347, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.102074343950113e-05, |
| "loss": 2.0848411560058593, |
| "mean_token_accuracy": 0.5579396069049836, |
| "num_tokens": 18438067.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 2.151025879383087, |
| "epoch": 0.32401830884124305, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.086427979284352e-05, |
| "loss": 2.1543193817138673, |
| "mean_token_accuracy": 0.5485984563827515, |
| "num_tokens": 18508650.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 2.074105453491211, |
| "epoch": 0.3252228378703927, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.070732632025464e-05, |
| "loss": 2.0990238189697266, |
| "mean_token_accuracy": 0.558820104598999, |
| "num_tokens": 18578127.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3252228378703927, |
| "eval_entropy": 2.59474778175354, |
| "eval_loss": 2.784816265106201, |
| "eval_mean_token_accuracy": 0.43576808273792267, |
| "eval_num_tokens": 18578127.0, |
| "eval_runtime": 0.4052, |
| "eval_samples_per_second": 39.487, |
| "eval_steps_per_second": 4.936, |
| "step": 2700 |
| }, |
| { |
| "entropy": 2.059889006614685, |
| "epoch": 0.32642736689954227, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.054988551263596e-05, |
| "loss": 2.0819252014160154, |
| "mean_token_accuracy": 0.5544521391391755, |
| "num_tokens": 18648090.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 2.0784488558769225, |
| "epoch": 0.3276318959286919, |
| "grad_norm": 2.234375, |
| "learning_rate": 8.039195986862317e-05, |
| "loss": 2.0795106887817383, |
| "mean_token_accuracy": 0.5613572120666503, |
| "num_tokens": 18716753.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 2.0666807651519776, |
| "epoch": 0.3288364249578415, |
| "grad_norm": 2.234375, |
| "learning_rate": 8.023355189454645e-05, |
| "loss": 2.084706497192383, |
| "mean_token_accuracy": 0.5587452292442322, |
| "num_tokens": 18787606.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 1.9979267120361328, |
| "epoch": 0.33004095398699107, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.007466410439065e-05, |
| "loss": 2.005443000793457, |
| "mean_token_accuracy": 0.570879477262497, |
| "num_tokens": 18851977.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 2.0454527735710144, |
| "epoch": 0.3312454830161407, |
| "grad_norm": 2.921875, |
| "learning_rate": 7.991529901975557e-05, |
| "loss": 2.0477304458618164, |
| "mean_token_accuracy": 0.567040553689003, |
| "num_tokens": 18922821.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 2.019996762275696, |
| "epoch": 0.3324500120452903, |
| "grad_norm": 4.46875, |
| "learning_rate": 7.97554591698157e-05, |
| "loss": 2.023045539855957, |
| "mean_token_accuracy": 0.5663474082946778, |
| "num_tokens": 18992620.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 2.016635000705719, |
| "epoch": 0.33365454107443987, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.95951470912803e-05, |
| "loss": 2.0342525482177733, |
| "mean_token_accuracy": 0.5686039090156555, |
| "num_tokens": 19059719.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 2.0925203800201415, |
| "epoch": 0.3348590701035895, |
| "grad_norm": 2.53125, |
| "learning_rate": 7.943436532835304e-05, |
| "loss": 2.0941793441772463, |
| "mean_token_accuracy": 0.5635069251060486, |
| "num_tokens": 19128166.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 2.011734688282013, |
| "epoch": 0.3360635991327391, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.927311643269157e-05, |
| "loss": 2.0319175720214844, |
| "mean_token_accuracy": 0.5642287373542786, |
| "num_tokens": 19198350.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 2.161832857131958, |
| "epoch": 0.3372681281618887, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.911140296336712e-05, |
| "loss": 2.171921730041504, |
| "mean_token_accuracy": 0.544024670124054, |
| "num_tokens": 19265054.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3372681281618887, |
| "eval_entropy": 2.61099910736084, |
| "eval_loss": 2.7639856338500977, |
| "eval_mean_token_accuracy": 0.4430377185344696, |
| "eval_num_tokens": 19265054.0, |
| "eval_runtime": 0.3884, |
| "eval_samples_per_second": 41.198, |
| "eval_steps_per_second": 5.15, |
| "step": 2800 |
| }, |
| { |
| "entropy": 1.932697057723999, |
| "epoch": 0.3384726571910383, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.894922748682387e-05, |
| "loss": 1.9408197402954102, |
| "mean_token_accuracy": 0.5819644033908844, |
| "num_tokens": 19335074.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 2.092177450656891, |
| "epoch": 0.3396771862201879, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.878659257683819e-05, |
| "loss": 2.1017913818359375, |
| "mean_token_accuracy": 0.5499498665332794, |
| "num_tokens": 19404061.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 2.06910434961319, |
| "epoch": 0.3408817152493375, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.862350081447777e-05, |
| "loss": 2.0780174255371096, |
| "mean_token_accuracy": 0.5591690957546234, |
| "num_tokens": 19470099.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 1.980467677116394, |
| "epoch": 0.3420862442784871, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.845995478806075e-05, |
| "loss": 1.994614601135254, |
| "mean_token_accuracy": 0.5735546886920929, |
| "num_tokens": 19534144.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 2.00383620262146, |
| "epoch": 0.3432907733076367, |
| "grad_norm": 3.234375, |
| "learning_rate": 7.829595709311454e-05, |
| "loss": 2.006473159790039, |
| "mean_token_accuracy": 0.5768690049648285, |
| "num_tokens": 19603676.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 2.020020580291748, |
| "epoch": 0.3444953023367863, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.813151033233469e-05, |
| "loss": 2.046031951904297, |
| "mean_token_accuracy": 0.5671627938747406, |
| "num_tokens": 19671521.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 2.0426268339157105, |
| "epoch": 0.3456998313659359, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.796661711554358e-05, |
| "loss": 2.051487350463867, |
| "mean_token_accuracy": 0.5657259941101074, |
| "num_tokens": 19740842.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 2.0123592138290407, |
| "epoch": 0.34690436039508554, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.780128005964897e-05, |
| "loss": 2.030988311767578, |
| "mean_token_accuracy": 0.569215327501297, |
| "num_tokens": 19810966.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 1.9815311908721924, |
| "epoch": 0.3481088894242351, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.763550178860249e-05, |
| "loss": 1.9935127258300782, |
| "mean_token_accuracy": 0.5781533777713775, |
| "num_tokens": 19882666.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 2.0456610202789305, |
| "epoch": 0.3493134184533847, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.746928493335798e-05, |
| "loss": 2.040866470336914, |
| "mean_token_accuracy": 0.5613146841526031, |
| "num_tokens": 19952153.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.3493134184533847, |
| "eval_entropy": 2.573462128639221, |
| "eval_loss": 2.7504706382751465, |
| "eval_mean_token_accuracy": 0.446028009057045, |
| "eval_num_tokens": 19952153.0, |
| "eval_runtime": 0.4463, |
| "eval_samples_per_second": 35.853, |
| "eval_steps_per_second": 4.482, |
| "step": 2900 |
| }, |
| { |
| "entropy": 2.051629662513733, |
| "epoch": 0.35051794748253434, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.73026321318298e-05, |
| "loss": 2.0937450408935545, |
| "mean_token_accuracy": 0.559838205575943, |
| "num_tokens": 20018290.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 2.0276250004768372, |
| "epoch": 0.3517224765116839, |
| "grad_norm": 3.4375, |
| "learning_rate": 7.713554602885086e-05, |
| "loss": 2.0289745330810547, |
| "mean_token_accuracy": 0.569722706079483, |
| "num_tokens": 20086770.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 2.0052628755569457, |
| "epoch": 0.35292700554083356, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.696802927613077e-05, |
| "loss": 2.0254388809204102, |
| "mean_token_accuracy": 0.5706159889698028, |
| "num_tokens": 20157059.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 2.06208598613739, |
| "epoch": 0.35413153456998314, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.68000845322136e-05, |
| "loss": 2.074476432800293, |
| "mean_token_accuracy": 0.5622613191604614, |
| "num_tokens": 20225520.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 2.0250381588935853, |
| "epoch": 0.3553360635991327, |
| "grad_norm": 2.9375, |
| "learning_rate": 7.663171446243582e-05, |
| "loss": 2.0442649841308596, |
| "mean_token_accuracy": 0.56498042345047, |
| "num_tokens": 20292984.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 2.0137596130371094, |
| "epoch": 0.35654059262828236, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.646292173888399e-05, |
| "loss": 2.0237215042114256, |
| "mean_token_accuracy": 0.569210535287857, |
| "num_tokens": 20364322.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 2.076921796798706, |
| "epoch": 0.35774512165743194, |
| "grad_norm": 2.65625, |
| "learning_rate": 7.629370904035227e-05, |
| "loss": 2.0826812744140626, |
| "mean_token_accuracy": 0.5618605375289917, |
| "num_tokens": 20427686.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 2.03226181268692, |
| "epoch": 0.3589496506865815, |
| "grad_norm": 2.0, |
| "learning_rate": 7.612407905229996e-05, |
| "loss": 2.067717361450195, |
| "mean_token_accuracy": 0.558159738779068, |
| "num_tokens": 20498265.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 1.9998491644859313, |
| "epoch": 0.36015417971573116, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.595403446680894e-05, |
| "loss": 2.007455825805664, |
| "mean_token_accuracy": 0.5721205115318299, |
| "num_tokens": 20564625.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 2.021931827068329, |
| "epoch": 0.36135870874488074, |
| "grad_norm": 3.03125, |
| "learning_rate": 7.578357798254076e-05, |
| "loss": 2.0407316207885744, |
| "mean_token_accuracy": 0.5663074970245361, |
| "num_tokens": 20632253.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.36135870874488074, |
| "eval_entropy": 2.6078274250030518, |
| "eval_loss": 2.7409777641296387, |
| "eval_mean_token_accuracy": 0.4436866343021393, |
| "eval_num_tokens": 20632253.0, |
| "eval_runtime": 0.505, |
| "eval_samples_per_second": 31.685, |
| "eval_steps_per_second": 3.961, |
| "step": 3000 |
| }, |
| { |
| "entropy": 2.0366801381111146, |
| "epoch": 0.3625632377740304, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.561271230469409e-05, |
| "loss": 2.0368415832519533, |
| "mean_token_accuracy": 0.5711302816867828, |
| "num_tokens": 20700182.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 2.064214277267456, |
| "epoch": 0.36376776680317996, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.544144014496148e-05, |
| "loss": 2.0829124450683594, |
| "mean_token_accuracy": 0.5591930508613586, |
| "num_tokens": 20769809.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 2.0133578896522524, |
| "epoch": 0.36497229583232954, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.52697642214866e-05, |
| "loss": 2.0229619979858398, |
| "mean_token_accuracy": 0.5690375864505768, |
| "num_tokens": 20839028.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 1.9779277443885803, |
| "epoch": 0.3661768248614792, |
| "grad_norm": 1.65625, |
| "learning_rate": 7.50976872588209e-05, |
| "loss": 2.0208606719970703, |
| "mean_token_accuracy": 0.5698799788951874, |
| "num_tokens": 20908277.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 2.101050305366516, |
| "epoch": 0.36738135389062876, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.492521198788049e-05, |
| "loss": 2.086177444458008, |
| "mean_token_accuracy": 0.5555986344814301, |
| "num_tokens": 20975369.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 1.9677135944366455, |
| "epoch": 0.36858588291977834, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.475234114590272e-05, |
| "loss": 1.979222297668457, |
| "mean_token_accuracy": 0.5747052133083344, |
| "num_tokens": 21044671.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 1.9906654000282287, |
| "epoch": 0.369790411948928, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.457907747640285e-05, |
| "loss": 2.018314170837402, |
| "mean_token_accuracy": 0.5706595480442047, |
| "num_tokens": 21114968.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 2.0226004838943483, |
| "epoch": 0.37099494097807756, |
| "grad_norm": 2.890625, |
| "learning_rate": 7.440542372913035e-05, |
| "loss": 2.0287120819091795, |
| "mean_token_accuracy": 0.5678030729293824, |
| "num_tokens": 21184669.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 2.075468158721924, |
| "epoch": 0.3721994700072272, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.42313826600254e-05, |
| "loss": 2.071501922607422, |
| "mean_token_accuracy": 0.5608702659606933, |
| "num_tokens": 21251886.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 1.97612144947052, |
| "epoch": 0.3734039990363768, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.40569570311751e-05, |
| "loss": 2.0007991790771484, |
| "mean_token_accuracy": 0.5756966292858123, |
| "num_tokens": 21321340.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3734039990363768, |
| "eval_entropy": 2.5279862880706787, |
| "eval_loss": 2.7285852432250977, |
| "eval_mean_token_accuracy": 0.4447345584630966, |
| "eval_num_tokens": 21321340.0, |
| "eval_runtime": 0.4146, |
| "eval_samples_per_second": 38.592, |
| "eval_steps_per_second": 4.824, |
| "step": 3100 |
| }, |
| { |
| "entropy": 2.1051839351654054, |
| "epoch": 0.37460852806552636, |
| "grad_norm": 1.59375, |
| "learning_rate": 7.388214961076961e-05, |
| "loss": 2.1164289474487306, |
| "mean_token_accuracy": 0.5555289804935455, |
| "num_tokens": 21388828.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 1.9983577013015748, |
| "epoch": 0.375813057094676, |
| "grad_norm": 3.96875, |
| "learning_rate": 7.370696317305828e-05, |
| "loss": 2.0253278732299806, |
| "mean_token_accuracy": 0.5709738373756409, |
| "num_tokens": 21456866.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 1.9110666394233704, |
| "epoch": 0.3770175861238256, |
| "grad_norm": 1.953125, |
| "learning_rate": 7.353140049830552e-05, |
| "loss": 1.926797103881836, |
| "mean_token_accuracy": 0.5828823685646057, |
| "num_tokens": 21526510.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 2.082789492607117, |
| "epoch": 0.3782221151529752, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.335546437274684e-05, |
| "loss": 2.072815704345703, |
| "mean_token_accuracy": 0.5597695082426071, |
| "num_tokens": 21593938.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 1.9497243881225585, |
| "epoch": 0.3794266441821248, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.317915758854445e-05, |
| "loss": 1.986886978149414, |
| "mean_token_accuracy": 0.5727407991886139, |
| "num_tokens": 21659848.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 2.0966204047203063, |
| "epoch": 0.3806311732112744, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.300248294374305e-05, |
| "loss": 2.077589225769043, |
| "mean_token_accuracy": 0.5576497077941894, |
| "num_tokens": 21730692.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 1.9753968596458436, |
| "epoch": 0.381835702240424, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.282544324222544e-05, |
| "loss": 2.011469268798828, |
| "mean_token_accuracy": 0.5723488807678223, |
| "num_tokens": 21799339.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 2.0465827584266663, |
| "epoch": 0.3830402312695736, |
| "grad_norm": 2.796875, |
| "learning_rate": 7.264804129366796e-05, |
| "loss": 2.0382017135620116, |
| "mean_token_accuracy": 0.562414237856865, |
| "num_tokens": 21869480.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 1.9501657962799073, |
| "epoch": 0.3842447602987232, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.24702799134959e-05, |
| "loss": 1.9487823486328124, |
| "mean_token_accuracy": 0.5793662428855896, |
| "num_tokens": 21937348.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 1.964065945148468, |
| "epoch": 0.3854492893278728, |
| "grad_norm": 1.6953125, |
| "learning_rate": 7.229216192283887e-05, |
| "loss": 1.9955894470214843, |
| "mean_token_accuracy": 0.5740731418132782, |
| "num_tokens": 22007091.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3854492893278728, |
| "eval_entropy": 2.611129403114319, |
| "eval_loss": 2.7415380477905273, |
| "eval_mean_token_accuracy": 0.4391617476940155, |
| "eval_num_tokens": 22007091.0, |
| "eval_runtime": 0.404, |
| "eval_samples_per_second": 39.603, |
| "eval_steps_per_second": 4.95, |
| "step": 3200 |
| }, |
| { |
| "entropy": 2.006621611118317, |
| "epoch": 0.3866538183570224, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.211369014848601e-05, |
| "loss": 1.9990266799926757, |
| "mean_token_accuracy": 0.5689637005329132, |
| "num_tokens": 22074714.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 2.0335186243057253, |
| "epoch": 0.38785834738617203, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.193486742284112e-05, |
| "loss": 2.0472951889038087, |
| "mean_token_accuracy": 0.5649256110191345, |
| "num_tokens": 22143715.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 1.8772284269332886, |
| "epoch": 0.3890628764153216, |
| "grad_norm": 2.125, |
| "learning_rate": 7.175569658387769e-05, |
| "loss": 1.885722541809082, |
| "mean_token_accuracy": 0.5915811598300934, |
| "num_tokens": 22212593.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 2.106627869606018, |
| "epoch": 0.3902674054444712, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.157618047509387e-05, |
| "loss": 2.1436986923217773, |
| "mean_token_accuracy": 0.5498576521873474, |
| "num_tokens": 22277182.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 2.0270182132720946, |
| "epoch": 0.39147193447362083, |
| "grad_norm": 2.921875, |
| "learning_rate": 7.139632194546742e-05, |
| "loss": 2.0177675247192384, |
| "mean_token_accuracy": 0.5683903455734253, |
| "num_tokens": 22346583.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 1.972713053226471, |
| "epoch": 0.3926764635027704, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.121612384941033e-05, |
| "loss": 1.9979860305786132, |
| "mean_token_accuracy": 0.5702839136123657, |
| "num_tokens": 22417261.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 2.0097543239593505, |
| "epoch": 0.39388099253192, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.103558904672368e-05, |
| "loss": 2.0144615173339844, |
| "mean_token_accuracy": 0.5682908892631531, |
| "num_tokens": 22483947.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 1.9494592428207398, |
| "epoch": 0.39508552156106963, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.085472040255218e-05, |
| "loss": 1.9564752578735352, |
| "mean_token_accuracy": 0.5814142465591431, |
| "num_tokens": 22552472.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 1.9892062902450562, |
| "epoch": 0.3962900505902192, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.067352078733872e-05, |
| "loss": 2.0238758087158204, |
| "mean_token_accuracy": 0.5674407482147217, |
| "num_tokens": 22624444.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 2.102202832698822, |
| "epoch": 0.39749457961936885, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.049199307677876e-05, |
| "loss": 2.093918800354004, |
| "mean_token_accuracy": 0.5605535507202148, |
| "num_tokens": 22693651.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.39749457961936885, |
| "eval_entropy": 2.5247026681900024, |
| "eval_loss": 2.729149341583252, |
| "eval_mean_token_accuracy": 0.4524097591638565, |
| "eval_num_tokens": 22693651.0, |
| "eval_runtime": 0.4863, |
| "eval_samples_per_second": 32.903, |
| "eval_steps_per_second": 4.113, |
| "step": 3300 |
| }, |
| { |
| "entropy": 1.9703481793403625, |
| "epoch": 0.39869910864851843, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.031014015177478e-05, |
| "loss": 1.9685224533081054, |
| "mean_token_accuracy": 0.571619188785553, |
| "num_tokens": 22763765.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 1.9734365105628968, |
| "epoch": 0.399903637677668, |
| "grad_norm": 1.9765625, |
| "learning_rate": 7.012796489839053e-05, |
| "loss": 2.009431838989258, |
| "mean_token_accuracy": 0.5690336644649505, |
| "num_tokens": 22833017.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 1.9694549560546875, |
| "epoch": 0.40110816670681765, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.994547020780516e-05, |
| "loss": 1.9559648513793946, |
| "mean_token_accuracy": 0.5767580509185791, |
| "num_tokens": 22900580.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 2.001281261444092, |
| "epoch": 0.40231269573596723, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.976265897626743e-05, |
| "loss": 2.033053398132324, |
| "mean_token_accuracy": 0.5716820240020752, |
| "num_tokens": 22968600.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 2.025617945194244, |
| "epoch": 0.40351722476511687, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.95795341050497e-05, |
| "loss": 2.0243719100952147, |
| "mean_token_accuracy": 0.5736408472061157, |
| "num_tokens": 23034468.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 2.0567389011383055, |
| "epoch": 0.40472175379426645, |
| "grad_norm": 2.453125, |
| "learning_rate": 6.93960985004019e-05, |
| "loss": 2.0726577758789064, |
| "mean_token_accuracy": 0.5588089168071747, |
| "num_tokens": 23100522.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 2.011142539978027, |
| "epoch": 0.40592628282341603, |
| "grad_norm": 2.125, |
| "learning_rate": 6.921235507350536e-05, |
| "loss": 2.0313907623291017, |
| "mean_token_accuracy": 0.5747258961200714, |
| "num_tokens": 23171707.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 1.941812264919281, |
| "epoch": 0.40713081185256567, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.902830674042667e-05, |
| "loss": 1.9646072387695312, |
| "mean_token_accuracy": 0.5840674757957458, |
| "num_tokens": 23237006.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 2.0653809905052185, |
| "epoch": 0.40833534088171525, |
| "grad_norm": 2.21875, |
| "learning_rate": 6.884395642207141e-05, |
| "loss": 2.0873439788818358, |
| "mean_token_accuracy": 0.5598014950752258, |
| "num_tokens": 23303618.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 2.088254952430725, |
| "epoch": 0.40953986991086483, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.865930704413771e-05, |
| "loss": 2.0937089920043945, |
| "mean_token_accuracy": 0.5576972186565399, |
| "num_tokens": 23371466.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.40953986991086483, |
| "eval_entropy": 2.4787381887435913, |
| "eval_loss": 2.6963882446289062, |
| "eval_mean_token_accuracy": 0.45208749175071716, |
| "eval_num_tokens": 23371466.0, |
| "eval_runtime": 0.416, |
| "eval_samples_per_second": 38.462, |
| "eval_steps_per_second": 4.808, |
| "step": 3400 |
| }, |
| { |
| "entropy": 1.9383289098739624, |
| "epoch": 0.41074439894001447, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.84743615370699e-05, |
| "loss": 1.9510780334472657, |
| "mean_token_accuracy": 0.584078460931778, |
| "num_tokens": 23438546.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 1.9791313052177428, |
| "epoch": 0.41194892796916405, |
| "grad_norm": 2.5625, |
| "learning_rate": 6.828912283601195e-05, |
| "loss": 1.9753469467163085, |
| "mean_token_accuracy": 0.5759646952152252, |
| "num_tokens": 23508413.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 1.9467034935951233, |
| "epoch": 0.4131534569983137, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.810359388076097e-05, |
| "loss": 1.9725639343261718, |
| "mean_token_accuracy": 0.5774512171745301, |
| "num_tokens": 23578028.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 2.0062270283699037, |
| "epoch": 0.41435798602746327, |
| "grad_norm": 2.421875, |
| "learning_rate": 6.79177776157204e-05, |
| "loss": 2.0015352249145506, |
| "mean_token_accuracy": 0.5665505826473236, |
| "num_tokens": 23648449.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 2.0732170820236204, |
| "epoch": 0.41556251505661285, |
| "grad_norm": 2.328125, |
| "learning_rate": 6.773167698985348e-05, |
| "loss": 2.0981536865234376, |
| "mean_token_accuracy": 0.5579517900943756, |
| "num_tokens": 23717754.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 2.0526692628860475, |
| "epoch": 0.4167670440857625, |
| "grad_norm": 2.640625, |
| "learning_rate": 6.754529495663627e-05, |
| "loss": 2.053252601623535, |
| "mean_token_accuracy": 0.5652489423751831, |
| "num_tokens": 23785096.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 2.0952977895736695, |
| "epoch": 0.41797157311491206, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.73586344740109e-05, |
| "loss": 2.1027814865112306, |
| "mean_token_accuracy": 0.5595345795154572, |
| "num_tokens": 23853741.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 2.0081144332885743, |
| "epoch": 0.41917610214406165, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.717169850433857e-05, |
| "loss": 2.0238536834716796, |
| "mean_token_accuracy": 0.5671063840389252, |
| "num_tokens": 23922684.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 1.9314401149749756, |
| "epoch": 0.4203806311732113, |
| "grad_norm": 5.875, |
| "learning_rate": 6.698449001435251e-05, |
| "loss": 1.9350923538208007, |
| "mean_token_accuracy": 0.5847425639629364, |
| "num_tokens": 23991091.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 1.9933564424514771, |
| "epoch": 0.42158516020236086, |
| "grad_norm": 2.40625, |
| "learning_rate": 6.679701197511098e-05, |
| "loss": 2.0253509521484374, |
| "mean_token_accuracy": 0.5672946512699127, |
| "num_tokens": 24062333.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.42158516020236086, |
| "eval_entropy": 2.4498034715652466, |
| "eval_loss": 2.6938982009887695, |
| "eval_mean_token_accuracy": 0.45604458451271057, |
| "eval_num_tokens": 24062333.0, |
| "eval_runtime": 0.4009, |
| "eval_samples_per_second": 39.915, |
| "eval_steps_per_second": 4.989, |
| "step": 3500 |
| }, |
| { |
| "entropy": 1.993247103691101, |
| "epoch": 0.4227896892315105, |
| "grad_norm": 2.0, |
| "learning_rate": 6.660926736195007e-05, |
| "loss": 1.985020065307617, |
| "mean_token_accuracy": 0.572957593202591, |
| "num_tokens": 24130009.0, |
| "step": 3510 |
| }, |
| { |
| "entropy": 2.004629743099213, |
| "epoch": 0.4239942182606601, |
| "grad_norm": 1.5859375, |
| "learning_rate": 6.642125915443646e-05, |
| "loss": 2.0138731002807617, |
| "mean_token_accuracy": 0.5689170718193054, |
| "num_tokens": 24198224.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 1.992073893547058, |
| "epoch": 0.42519874728980966, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.623299033632015e-05, |
| "loss": 2.003081703186035, |
| "mean_token_accuracy": 0.5710993528366088, |
| "num_tokens": 24269236.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 1.9667566299438477, |
| "epoch": 0.4264032763189593, |
| "grad_norm": 2.625, |
| "learning_rate": 6.604446389548718e-05, |
| "loss": 1.9854969024658202, |
| "mean_token_accuracy": 0.5752442955970765, |
| "num_tokens": 24336698.0, |
| "step": 3540 |
| }, |
| { |
| "entropy": 1.9823364615440369, |
| "epoch": 0.4276078053481089, |
| "grad_norm": 7.03125, |
| "learning_rate": 6.585568282391202e-05, |
| "loss": 2.002846336364746, |
| "mean_token_accuracy": 0.5752040565013885, |
| "num_tokens": 24404740.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 1.9646108746528625, |
| "epoch": 0.4288123343772585, |
| "grad_norm": 2.484375, |
| "learning_rate": 6.566665011761036e-05, |
| "loss": 1.9754671096801757, |
| "mean_token_accuracy": 0.5784339427947998, |
| "num_tokens": 24471666.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 1.9380438804626465, |
| "epoch": 0.4300168634064081, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.547736877659129e-05, |
| "loss": 1.948002815246582, |
| "mean_token_accuracy": 0.5778752982616424, |
| "num_tokens": 24541106.0, |
| "step": 3570 |
| }, |
| { |
| "entropy": 1.9597272753715516, |
| "epoch": 0.4312213924355577, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.528784180480987e-05, |
| "loss": 1.9658893585205077, |
| "mean_token_accuracy": 0.5809767782688141, |
| "num_tokens": 24610669.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 2.016139495372772, |
| "epoch": 0.4324259214647073, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.509807221011939e-05, |
| "loss": 2.0279006958007812, |
| "mean_token_accuracy": 0.5701642394065857, |
| "num_tokens": 24677668.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 1.9099430084228515, |
| "epoch": 0.4336304504938569, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.490806300422363e-05, |
| "loss": 1.9234348297119142, |
| "mean_token_accuracy": 0.5862063884735107, |
| "num_tokens": 24747672.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.4336304504938569, |
| "eval_entropy": 2.477397084236145, |
| "eval_loss": 2.695096969604492, |
| "eval_mean_token_accuracy": 0.463316410779953, |
| "eval_num_tokens": 24747672.0, |
| "eval_runtime": 0.4717, |
| "eval_samples_per_second": 33.919, |
| "eval_steps_per_second": 4.24, |
| "step": 3600 |
| }, |
| { |
| "entropy": 2.0054628014564515, |
| "epoch": 0.4348349795230065, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.47178172026291e-05, |
| "loss": 2.007233238220215, |
| "mean_token_accuracy": 0.5705864608287812, |
| "num_tokens": 24813790.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 2.009073185920715, |
| "epoch": 0.4360395085521561, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.452733782459717e-05, |
| "loss": 2.029523468017578, |
| "mean_token_accuracy": 0.5658802688121796, |
| "num_tokens": 24883706.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 1.9586613178253174, |
| "epoch": 0.4372440375813057, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.433662789309605e-05, |
| "loss": 1.9708032608032227, |
| "mean_token_accuracy": 0.5791431427001953, |
| "num_tokens": 24951765.0, |
| "step": 3630 |
| }, |
| { |
| "entropy": 1.9156463623046875, |
| "epoch": 0.43844856661045534, |
| "grad_norm": 1.625, |
| "learning_rate": 6.414569043475305e-05, |
| "loss": 1.9143449783325195, |
| "mean_token_accuracy": 0.5888265132904053, |
| "num_tokens": 25019817.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 1.9117112517356873, |
| "epoch": 0.4396530956396049, |
| "grad_norm": 1.578125, |
| "learning_rate": 6.395452847980628e-05, |
| "loss": 1.9290433883666993, |
| "mean_token_accuracy": 0.5843783140182495, |
| "num_tokens": 25088811.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 1.9448886394500733, |
| "epoch": 0.4408576246687545, |
| "grad_norm": 6.375, |
| "learning_rate": 6.376314506205675e-05, |
| "loss": 1.9512279510498047, |
| "mean_token_accuracy": 0.585245794057846, |
| "num_tokens": 25154490.0, |
| "step": 3660 |
| }, |
| { |
| "entropy": 1.9380404233932496, |
| "epoch": 0.44206215369790414, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.357154321882012e-05, |
| "loss": 1.9418190002441407, |
| "mean_token_accuracy": 0.5806894898414612, |
| "num_tokens": 25222490.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 2.007444739341736, |
| "epoch": 0.4432666827270537, |
| "grad_norm": 2.90625, |
| "learning_rate": 6.337972599087857e-05, |
| "loss": 2.007956123352051, |
| "mean_token_accuracy": 0.5707302153110504, |
| "num_tokens": 25288925.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 1.9364651441574097, |
| "epoch": 0.4444712117562033, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.318769642243245e-05, |
| "loss": 1.9320077896118164, |
| "mean_token_accuracy": 0.5831962287425995, |
| "num_tokens": 25357070.0, |
| "step": 3690 |
| }, |
| { |
| "entropy": 1.9276177763938904, |
| "epoch": 0.44567574078535294, |
| "grad_norm": 2.625, |
| "learning_rate": 6.299545756105209e-05, |
| "loss": 1.9602447509765626, |
| "mean_token_accuracy": 0.5792842030525207, |
| "num_tokens": 25426661.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.44567574078535294, |
| "eval_entropy": 2.525130867958069, |
| "eval_loss": 2.6856164932250977, |
| "eval_mean_token_accuracy": 0.46145734190940857, |
| "eval_num_tokens": 25426661.0, |
| "eval_runtime": 0.4119, |
| "eval_samples_per_second": 38.84, |
| "eval_steps_per_second": 4.855, |
| "step": 3700 |
| }, |
| { |
| "entropy": 1.9861109852790833, |
| "epoch": 0.4468802698145025, |
| "grad_norm": 2.390625, |
| "learning_rate": 6.280301245762929e-05, |
| "loss": 1.9737564086914063, |
| "mean_token_accuracy": 0.5805804789066314, |
| "num_tokens": 25493623.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 2.0100439190864563, |
| "epoch": 0.44808479884365215, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.261036416632906e-05, |
| "loss": 2.036077117919922, |
| "mean_token_accuracy": 0.5688071370124816, |
| "num_tokens": 25564579.0, |
| "step": 3720 |
| }, |
| { |
| "entropy": 2.0535164713859557, |
| "epoch": 0.44928932787280174, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.241751574454098e-05, |
| "loss": 2.054555320739746, |
| "mean_token_accuracy": 0.565549087524414, |
| "num_tokens": 25633048.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 2.027570879459381, |
| "epoch": 0.4504938569019513, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.222447025283082e-05, |
| "loss": 2.04293270111084, |
| "mean_token_accuracy": 0.5656031697988511, |
| "num_tokens": 25702607.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 2.0254459023475646, |
| "epoch": 0.45169838593110095, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.203123075489191e-05, |
| "loss": 2.023523139953613, |
| "mean_token_accuracy": 0.5712070286273956, |
| "num_tokens": 25771055.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 1.8324352741241454, |
| "epoch": 0.45290291496025054, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.183780031749649e-05, |
| "loss": 1.8483991622924805, |
| "mean_token_accuracy": 0.5993020892143249, |
| "num_tokens": 25837460.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 1.9764248132705688, |
| "epoch": 0.4541074439894001, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.164418201044709e-05, |
| "loss": 1.9906301498413086, |
| "mean_token_accuracy": 0.5752259314060211, |
| "num_tokens": 25908374.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 2.0057262182235718, |
| "epoch": 0.45531197301854975, |
| "grad_norm": 3.0, |
| "learning_rate": 6.145037890652777e-05, |
| "loss": 2.0180091857910156, |
| "mean_token_accuracy": 0.5674101829528808, |
| "num_tokens": 25976856.0, |
| "step": 3780 |
| }, |
| { |
| "entropy": 2.0065826296806337, |
| "epoch": 0.45651650204769934, |
| "grad_norm": 1.7734375, |
| "learning_rate": 6.125639408145545e-05, |
| "loss": 2.0031196594238283, |
| "mean_token_accuracy": 0.5769789397716523, |
| "num_tokens": 26044133.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 2.0009281516075133, |
| "epoch": 0.45772103107684897, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.106223061383093e-05, |
| "loss": 2.026101303100586, |
| "mean_token_accuracy": 0.5688745856285096, |
| "num_tokens": 26113453.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.45772103107684897, |
| "eval_entropy": 2.4917021989822388, |
| "eval_loss": 2.6710691452026367, |
| "eval_mean_token_accuracy": 0.45515669882297516, |
| "eval_num_tokens": 26113453.0, |
| "eval_runtime": 0.5084, |
| "eval_samples_per_second": 31.474, |
| "eval_steps_per_second": 3.934, |
| "step": 3800 |
| }, |
| { |
| "entropy": 1.9381158471107482, |
| "epoch": 0.45892556010599855, |
| "grad_norm": 2.25, |
| "learning_rate": 6.0867891585090166e-05, |
| "loss": 1.9284444808959962, |
| "mean_token_accuracy": 0.5864272952079773, |
| "num_tokens": 26184767.0, |
| "step": 3810 |
| }, |
| { |
| "entropy": 1.9594375610351562, |
| "epoch": 0.46013008913514813, |
| "grad_norm": 1.75, |
| "learning_rate": 6.067338007945531e-05, |
| "loss": 1.9852970123291016, |
| "mean_token_accuracy": 0.5773785710334778, |
| "num_tokens": 26254484.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 1.9799617648124694, |
| "epoch": 0.46133461816429777, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.04786991838858e-05, |
| "loss": 1.9897350311279296, |
| "mean_token_accuracy": 0.5737548470497131, |
| "num_tokens": 26324694.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 1.9124466061592102, |
| "epoch": 0.46253914719344735, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.028385198802935e-05, |
| "loss": 1.9315099716186523, |
| "mean_token_accuracy": 0.5868638217449188, |
| "num_tokens": 26393506.0, |
| "step": 3840 |
| }, |
| { |
| "entropy": 1.9165910959243775, |
| "epoch": 0.463743676222597, |
| "grad_norm": 1.6640625, |
| "learning_rate": 6.008884158417285e-05, |
| "loss": 1.9327356338500976, |
| "mean_token_accuracy": 0.5816730856895447, |
| "num_tokens": 26463539.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 1.947556233406067, |
| "epoch": 0.46494820525174657, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.989367106719342e-05, |
| "loss": 1.9421436309814453, |
| "mean_token_accuracy": 0.5770792663097382, |
| "num_tokens": 26533658.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 1.9143320441246032, |
| "epoch": 0.46615273428089615, |
| "grad_norm": 1.84375, |
| "learning_rate": 5.9698343534509206e-05, |
| "loss": 1.9342830657958985, |
| "mean_token_accuracy": 0.5813970685005188, |
| "num_tokens": 26603082.0, |
| "step": 3870 |
| }, |
| { |
| "entropy": 1.9642464518547058, |
| "epoch": 0.4673572633100458, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.9502862086030255e-05, |
| "loss": 1.967822265625, |
| "mean_token_accuracy": 0.5837168216705322, |
| "num_tokens": 26670867.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 2.036736857891083, |
| "epoch": 0.46856179233919537, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.930722982410928e-05, |
| "loss": 2.0509645462036135, |
| "mean_token_accuracy": 0.564321780204773, |
| "num_tokens": 26739725.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 1.9816170811653138, |
| "epoch": 0.46976632136834495, |
| "grad_norm": 4.0, |
| "learning_rate": 5.911144985349245e-05, |
| "loss": 1.9887372970581054, |
| "mean_token_accuracy": 0.5746063709259033, |
| "num_tokens": 26805376.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.46976632136834495, |
| "eval_entropy": 2.439876079559326, |
| "eval_loss": 2.6857075691223145, |
| "eval_mean_token_accuracy": 0.45507559180259705, |
| "eval_num_tokens": 26805376.0, |
| "eval_runtime": 0.5052, |
| "eval_samples_per_second": 31.67, |
| "eval_steps_per_second": 3.959, |
| "step": 3900 |
| }, |
| { |
| "entropy": 1.9576510548591615, |
| "epoch": 0.4709708503974946, |
| "grad_norm": 2.234375, |
| "learning_rate": 5.891552528127015e-05, |
| "loss": 1.9680488586425782, |
| "mean_token_accuracy": 0.5752190470695495, |
| "num_tokens": 26874810.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 1.9396398186683654, |
| "epoch": 0.47217537942664417, |
| "grad_norm": 2.015625, |
| "learning_rate": 5.871945921682762e-05, |
| "loss": 1.9553556442260742, |
| "mean_token_accuracy": 0.5795332670211792, |
| "num_tokens": 26947352.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 1.981644630432129, |
| "epoch": 0.4733799084557938, |
| "grad_norm": 1.9453125, |
| "learning_rate": 5.8523254771795635e-05, |
| "loss": 2.001514434814453, |
| "mean_token_accuracy": 0.5716517567634583, |
| "num_tokens": 27016298.0, |
| "step": 3930 |
| }, |
| { |
| "entropy": 2.0159306645393373, |
| "epoch": 0.4745844374849434, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.8326915060001076e-05, |
| "loss": 2.0092771530151365, |
| "mean_token_accuracy": 0.5699867486953736, |
| "num_tokens": 27087211.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 1.8825679063796996, |
| "epoch": 0.47578896651409297, |
| "grad_norm": 1.96875, |
| "learning_rate": 5.81304431974176e-05, |
| "loss": 1.9066276550292969, |
| "mean_token_accuracy": 0.5872237503528595, |
| "num_tokens": 27155504.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 2.0146188974380492, |
| "epoch": 0.4769934955432426, |
| "grad_norm": 2.34375, |
| "learning_rate": 5.793384230211611e-05, |
| "loss": 2.0064815521240233, |
| "mean_token_accuracy": 0.5711189568042755, |
| "num_tokens": 27221744.0, |
| "step": 3960 |
| }, |
| { |
| "entropy": 1.9668913722038268, |
| "epoch": 0.4781980245723922, |
| "grad_norm": 1.8125, |
| "learning_rate": 5.7737115494215353e-05, |
| "loss": 1.993480110168457, |
| "mean_token_accuracy": 0.5775595366954803, |
| "num_tokens": 27288564.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 1.9702805399894714, |
| "epoch": 0.47940255360154177, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.754026589583224e-05, |
| "loss": 1.9741327285766601, |
| "mean_token_accuracy": 0.5786505699157715, |
| "num_tokens": 27353144.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 1.8980592966079712, |
| "epoch": 0.4806070826306914, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.734329663103252e-05, |
| "loss": 1.9262420654296875, |
| "mean_token_accuracy": 0.5835274875164032, |
| "num_tokens": 27420105.0, |
| "step": 3990 |
| }, |
| { |
| "entropy": 1.9941051006317139, |
| "epoch": 0.481811611659841, |
| "grad_norm": 1.859375, |
| "learning_rate": 5.7146210825781e-05, |
| "loss": 1.980280303955078, |
| "mean_token_accuracy": 0.577644807100296, |
| "num_tokens": 27486818.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.481811611659841, |
| "eval_entropy": 2.42073392868042, |
| "eval_loss": 2.6700968742370605, |
| "eval_mean_token_accuracy": 0.4570968747138977, |
| "eval_num_tokens": 27486818.0, |
| "eval_runtime": 0.5345, |
| "eval_samples_per_second": 29.933, |
| "eval_steps_per_second": 3.742, |
| "step": 4000 |
| }, |
| { |
| "entropy": 1.917995858192444, |
| "epoch": 0.4830161406889906, |
| "grad_norm": 2.375, |
| "learning_rate": 5.694901160789209e-05, |
| "loss": 1.9306228637695313, |
| "mean_token_accuracy": 0.5779500126838684, |
| "num_tokens": 27555652.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 1.9785538196563721, |
| "epoch": 0.4842206697181402, |
| "grad_norm": 2.578125, |
| "learning_rate": 5.6751702106980044e-05, |
| "loss": 2.0008047103881834, |
| "mean_token_accuracy": 0.5703692853450775, |
| "num_tokens": 27623554.0, |
| "step": 4020 |
| }, |
| { |
| "entropy": 1.9732365250587462, |
| "epoch": 0.4854251987472898, |
| "grad_norm": 1.9765625, |
| "learning_rate": 5.655428545440936e-05, |
| "loss": 1.9656476974487305, |
| "mean_token_accuracy": 0.5797830998897553, |
| "num_tokens": 27692863.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 1.9249773025512695, |
| "epoch": 0.4866297277764394, |
| "grad_norm": 2.4375, |
| "learning_rate": 5.6356764783245075e-05, |
| "loss": 1.9510303497314454, |
| "mean_token_accuracy": 0.5802127003669739, |
| "num_tokens": 27761499.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 2.0097981095314026, |
| "epoch": 0.487834256805589, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.6159143228203016e-05, |
| "loss": 1.999547576904297, |
| "mean_token_accuracy": 0.5743436753749848, |
| "num_tokens": 27829573.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 1.8779245853424071, |
| "epoch": 0.48903878583473864, |
| "grad_norm": 2.421875, |
| "learning_rate": 5.59614239256001e-05, |
| "loss": 1.9048309326171875, |
| "mean_token_accuracy": 0.5902513444423676, |
| "num_tokens": 27897536.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 2.007873523235321, |
| "epoch": 0.4902433148638882, |
| "grad_norm": 1.7578125, |
| "learning_rate": 5.576361001330451e-05, |
| "loss": 2.0190666198730467, |
| "mean_token_accuracy": 0.5717886984348297, |
| "num_tokens": 27965109.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 2.008230710029602, |
| "epoch": 0.4914478438930378, |
| "grad_norm": 2.125, |
| "learning_rate": 5.5565704630685886e-05, |
| "loss": 2.011882209777832, |
| "mean_token_accuracy": 0.5669187545776367, |
| "num_tokens": 28031616.0, |
| "step": 4080 |
| }, |
| { |
| "entropy": 1.9069723725318908, |
| "epoch": 0.49265237292218744, |
| "grad_norm": 2.171875, |
| "learning_rate": 5.536771091856559e-05, |
| "loss": 1.9020198822021483, |
| "mean_token_accuracy": 0.5873600125312806, |
| "num_tokens": 28099184.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 1.9309583067893983, |
| "epoch": 0.493856901951337, |
| "grad_norm": 2.625, |
| "learning_rate": 5.516963201916674e-05, |
| "loss": 1.967962646484375, |
| "mean_token_accuracy": 0.5801371574401856, |
| "num_tokens": 28169833.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.493856901951337, |
| "eval_entropy": 2.4127514362335205, |
| "eval_loss": 2.6565656661987305, |
| "eval_mean_token_accuracy": 0.45418770611286163, |
| "eval_num_tokens": 28169833.0, |
| "eval_runtime": 0.3983, |
| "eval_samples_per_second": 40.173, |
| "eval_steps_per_second": 5.022, |
| "step": 4100 |
| }, |
| { |
| "entropy": 1.9569150805473328, |
| "epoch": 0.4950614309804866, |
| "grad_norm": 1.6328125, |
| "learning_rate": 5.4971471076064475e-05, |
| "loss": 1.9444543838500976, |
| "mean_token_accuracy": 0.5801873207092285, |
| "num_tokens": 28237930.0, |
| "step": 4110 |
| }, |
| { |
| "entropy": 1.8962748527526856, |
| "epoch": 0.49626596000963624, |
| "grad_norm": 2.671875, |
| "learning_rate": 5.4773231234135916e-05, |
| "loss": 1.9189611434936524, |
| "mean_token_accuracy": 0.5859391510486602, |
| "num_tokens": 28306591.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 1.9929224729537964, |
| "epoch": 0.4974704890387858, |
| "grad_norm": 5.40625, |
| "learning_rate": 5.457491563951037e-05, |
| "loss": 2.0031759262084963, |
| "mean_token_accuracy": 0.57361621260643, |
| "num_tokens": 28376098.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 2.015347754955292, |
| "epoch": 0.49867501806793546, |
| "grad_norm": 1.84375, |
| "learning_rate": 5.4376527439519376e-05, |
| "loss": 2.0239486694335938, |
| "mean_token_accuracy": 0.5746548473834991, |
| "num_tokens": 28443069.0, |
| "step": 4140 |
| }, |
| { |
| "entropy": 1.9625335097312928, |
| "epoch": 0.49987954709708504, |
| "grad_norm": 2.671875, |
| "learning_rate": 5.417806978264673e-05, |
| "loss": 1.9675872802734375, |
| "mean_token_accuracy": 0.5780832052230835, |
| "num_tokens": 28512775.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 1.9938682675361634, |
| "epoch": 0.5010840761262346, |
| "grad_norm": 2.625, |
| "learning_rate": 5.397954581847855e-05, |
| "loss": 1.9934600830078124, |
| "mean_token_accuracy": 0.5703189671039581, |
| "num_tokens": 28579579.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 1.9511204719543458, |
| "epoch": 0.5022886051553842, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.378095869765323e-05, |
| "loss": 1.9708211898803711, |
| "mean_token_accuracy": 0.5744782328605652, |
| "num_tokens": 28646564.0, |
| "step": 4170 |
| }, |
| { |
| "entropy": 1.9872597217559815, |
| "epoch": 0.5034931341845339, |
| "grad_norm": 1.90625, |
| "learning_rate": 5.358231157181149e-05, |
| "loss": 1.9806531906127929, |
| "mean_token_accuracy": 0.5780558466911316, |
| "num_tokens": 28716562.0, |
| "step": 4180 |
| }, |
| { |
| "entropy": 1.9043650150299072, |
| "epoch": 0.5046976632136835, |
| "grad_norm": 2.25, |
| "learning_rate": 5.338360759354639e-05, |
| "loss": 1.921070671081543, |
| "mean_token_accuracy": 0.5862109303474426, |
| "num_tokens": 28782253.0, |
| "step": 4190 |
| }, |
| { |
| "entropy": 1.9266673088073731, |
| "epoch": 0.5059021922428331, |
| "grad_norm": 1.703125, |
| "learning_rate": 5.318484991635323e-05, |
| "loss": 1.9321710586547851, |
| "mean_token_accuracy": 0.5806420803070068, |
| "num_tokens": 28853279.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.5059021922428331, |
| "eval_entropy": 2.398097038269043, |
| "eval_loss": 2.66256046295166, |
| "eval_mean_token_accuracy": 0.4587104022502899, |
| "eval_num_tokens": 28853279.0, |
| "eval_runtime": 0.4258, |
| "eval_samples_per_second": 37.58, |
| "eval_steps_per_second": 4.698, |
| "step": 4200 |
| }, |
| { |
| "entropy": 1.9215782046318055, |
| "epoch": 0.5071067212719826, |
| "grad_norm": 1.546875, |
| "learning_rate": 5.29860416945795e-05, |
| "loss": 1.927357292175293, |
| "mean_token_accuracy": 0.588092303276062, |
| "num_tokens": 28921713.0, |
| "step": 4210 |
| }, |
| { |
| "entropy": 1.890907645225525, |
| "epoch": 0.5083112503011322, |
| "grad_norm": 1.765625, |
| "learning_rate": 5.278718608337489e-05, |
| "loss": 1.911885643005371, |
| "mean_token_accuracy": 0.5910214900970459, |
| "num_tokens": 28991493.0, |
| "step": 4220 |
| }, |
| { |
| "entropy": 1.9491180300712585, |
| "epoch": 0.5095157793302819, |
| "grad_norm": 2.59375, |
| "learning_rate": 5.2588286238641146e-05, |
| "loss": 1.9617496490478517, |
| "mean_token_accuracy": 0.579128873348236, |
| "num_tokens": 29059104.0, |
| "step": 4230 |
| }, |
| { |
| "entropy": 2.0040785312652587, |
| "epoch": 0.5107203083594315, |
| "grad_norm": 2.015625, |
| "learning_rate": 5.238934531698206e-05, |
| "loss": 2.024494171142578, |
| "mean_token_accuracy": 0.5720134794712066, |
| "num_tokens": 29126675.0, |
| "step": 4240 |
| }, |
| { |
| "entropy": 1.9998656392097474, |
| "epoch": 0.5119248373885811, |
| "grad_norm": 2.140625, |
| "learning_rate": 5.21903664756533e-05, |
| "loss": 2.0020214080810548, |
| "mean_token_accuracy": 0.5735575735569001, |
| "num_tokens": 29195874.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 1.95628924369812, |
| "epoch": 0.5131293664177307, |
| "grad_norm": 3.25, |
| "learning_rate": 5.199135287251229e-05, |
| "loss": 1.9562814712524415, |
| "mean_token_accuracy": 0.585600209236145, |
| "num_tokens": 29262468.0, |
| "step": 4260 |
| }, |
| { |
| "entropy": 1.9958741307258605, |
| "epoch": 0.5143338954468802, |
| "grad_norm": 3.125, |
| "learning_rate": 5.1792307665968184e-05, |
| "loss": 2.0193979263305666, |
| "mean_token_accuracy": 0.5735933542251587, |
| "num_tokens": 29328578.0, |
| "step": 4270 |
| }, |
| { |
| "entropy": 1.9700656414031983, |
| "epoch": 0.5155384244760298, |
| "grad_norm": 1.6796875, |
| "learning_rate": 5.15932340149317e-05, |
| "loss": 1.963376808166504, |
| "mean_token_accuracy": 0.5765927374362946, |
| "num_tokens": 29395680.0, |
| "step": 4280 |
| }, |
| { |
| "entropy": 1.9745967507362365, |
| "epoch": 0.5167429535051795, |
| "grad_norm": 2.140625, |
| "learning_rate": 5.139413507876495e-05, |
| "loss": 1.9744836807250976, |
| "mean_token_accuracy": 0.5818488031625748, |
| "num_tokens": 29463513.0, |
| "step": 4290 |
| }, |
| { |
| "entropy": 1.9099359273910523, |
| "epoch": 0.5179474825343291, |
| "grad_norm": 2.40625, |
| "learning_rate": 5.1195014017231346e-05, |
| "loss": 1.9302894592285156, |
| "mean_token_accuracy": 0.5796979129314422, |
| "num_tokens": 29533253.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.5179474825343291, |
| "eval_entropy": 2.387773633003235, |
| "eval_loss": 2.6579198837280273, |
| "eval_mean_token_accuracy": 0.459276020526886, |
| "eval_num_tokens": 29533253.0, |
| "eval_runtime": 0.3914, |
| "eval_samples_per_second": 40.882, |
| "eval_steps_per_second": 5.11, |
| "step": 4300 |
| }, |
| { |
| "entropy": 2.002520430088043, |
| "epoch": 0.5191520115634787, |
| "grad_norm": 3.078125, |
| "learning_rate": 5.099587399044542e-05, |
| "loss": 2.0029012680053713, |
| "mean_token_accuracy": 0.5740825355052948, |
| "num_tokens": 29603758.0, |
| "step": 4310 |
| }, |
| { |
| "entropy": 1.9531593322753906, |
| "epoch": 0.5203565405926283, |
| "grad_norm": 2.171875, |
| "learning_rate": 5.0796718158822686e-05, |
| "loss": 1.9717971801757812, |
| "mean_token_accuracy": 0.5774181842803955, |
| "num_tokens": 29672871.0, |
| "step": 4320 |
| }, |
| { |
| "entropy": 1.9271060705184937, |
| "epoch": 0.5215610696217778, |
| "grad_norm": 1.9765625, |
| "learning_rate": 5.059754968302953e-05, |
| "loss": 1.9339466094970703, |
| "mean_token_accuracy": 0.5867386102676392, |
| "num_tokens": 29743463.0, |
| "step": 4330 |
| }, |
| { |
| "entropy": 1.918910849094391, |
| "epoch": 0.5227655986509275, |
| "grad_norm": 2.125, |
| "learning_rate": 5.039837172393297e-05, |
| "loss": 1.9289640426635741, |
| "mean_token_accuracy": 0.5850243151187897, |
| "num_tokens": 29812178.0, |
| "step": 4340 |
| }, |
| { |
| "entropy": 1.936630415916443, |
| "epoch": 0.5239701276800771, |
| "grad_norm": 1.7109375, |
| "learning_rate": 5.01991874425505e-05, |
| "loss": 1.945786476135254, |
| "mean_token_accuracy": 0.5822522580623627, |
| "num_tokens": 29881488.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 1.9208004593849182, |
| "epoch": 0.5251746567092267, |
| "grad_norm": 2.265625, |
| "learning_rate": 5e-05, |
| "loss": 1.9185230255126953, |
| "mean_token_accuracy": 0.5845731794834137, |
| "num_tokens": 29950167.0, |
| "step": 4360 |
| }, |
| { |
| "entropy": 1.9142470717430116, |
| "epoch": 0.5263791857383763, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.980081255744951e-05, |
| "loss": 1.944696044921875, |
| "mean_token_accuracy": 0.5828209400177002, |
| "num_tokens": 30018932.0, |
| "step": 4370 |
| }, |
| { |
| "entropy": 1.9780327558517456, |
| "epoch": 0.5275837147675259, |
| "grad_norm": 2.203125, |
| "learning_rate": 4.9601628276067044e-05, |
| "loss": 1.9682683944702148, |
| "mean_token_accuracy": 0.5764804124832154, |
| "num_tokens": 30087597.0, |
| "step": 4380 |
| }, |
| { |
| "entropy": 1.9604371428489684, |
| "epoch": 0.5287882437966756, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.940245031697047e-05, |
| "loss": 1.9709638595581054, |
| "mean_token_accuracy": 0.5788785338401794, |
| "num_tokens": 30157904.0, |
| "step": 4390 |
| }, |
| { |
| "entropy": 1.850344479084015, |
| "epoch": 0.5299927728258251, |
| "grad_norm": 2.40625, |
| "learning_rate": 4.920328184117731e-05, |
| "loss": 1.858159065246582, |
| "mean_token_accuracy": 0.5930382966995239, |
| "num_tokens": 30228125.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5299927728258251, |
| "eval_entropy": 2.3644343614578247, |
| "eval_loss": 2.6451668739318848, |
| "eval_mean_token_accuracy": 0.46767687797546387, |
| "eval_num_tokens": 30228125.0, |
| "eval_runtime": 0.4767, |
| "eval_samples_per_second": 33.567, |
| "eval_steps_per_second": 4.196, |
| "step": 4400 |
| }, |
| { |
| "entropy": 1.911184024810791, |
| "epoch": 0.5311973018549747, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.9004126009554605e-05, |
| "loss": 1.928396987915039, |
| "mean_token_accuracy": 0.5842796444892884, |
| "num_tokens": 30297819.0, |
| "step": 4410 |
| }, |
| { |
| "entropy": 1.9067941188812256, |
| "epoch": 0.5324018308841243, |
| "grad_norm": 1.953125, |
| "learning_rate": 4.880498598276867e-05, |
| "loss": 1.940384864807129, |
| "mean_token_accuracy": 0.5868594408035278, |
| "num_tokens": 30363867.0, |
| "step": 4420 |
| }, |
| { |
| "entropy": 2.005669319629669, |
| "epoch": 0.5336063599132739, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.860586492123506e-05, |
| "loss": 1.997481346130371, |
| "mean_token_accuracy": 0.5730924725532531, |
| "num_tokens": 30428552.0, |
| "step": 4430 |
| }, |
| { |
| "entropy": 1.8843210101127625, |
| "epoch": 0.5348108889424236, |
| "grad_norm": 2.9375, |
| "learning_rate": 4.8406765985068306e-05, |
| "loss": 1.9016788482666016, |
| "mean_token_accuracy": 0.5884897232055664, |
| "num_tokens": 30499828.0, |
| "step": 4440 |
| }, |
| { |
| "entropy": 1.9235470294952393, |
| "epoch": 0.5360154179715731, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.820769233403182e-05, |
| "loss": 1.9310134887695312, |
| "mean_token_accuracy": 0.5902485966682434, |
| "num_tokens": 30570663.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 2.002137005329132, |
| "epoch": 0.5372199470007227, |
| "grad_norm": 1.6640625, |
| "learning_rate": 4.800864712748773e-05, |
| "loss": 2.013439178466797, |
| "mean_token_accuracy": 0.5734929382801056, |
| "num_tokens": 30639768.0, |
| "step": 4460 |
| }, |
| { |
| "entropy": 1.9583630681037902, |
| "epoch": 0.5384244760298723, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.7809633524346714e-05, |
| "loss": 1.9639848709106444, |
| "mean_token_accuracy": 0.581292986869812, |
| "num_tokens": 30709439.0, |
| "step": 4470 |
| }, |
| { |
| "entropy": 1.918102788925171, |
| "epoch": 0.5396290050590219, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.7610654683017935e-05, |
| "loss": 1.93857421875, |
| "mean_token_accuracy": 0.5793308973312378, |
| "num_tokens": 30778575.0, |
| "step": 4480 |
| }, |
| { |
| "entropy": 2.008819043636322, |
| "epoch": 0.5408335340881715, |
| "grad_norm": 2.5625, |
| "learning_rate": 4.741171376135885e-05, |
| "loss": 2.0007816314697267, |
| "mean_token_accuracy": 0.5684065818786621, |
| "num_tokens": 30849170.0, |
| "step": 4490 |
| }, |
| { |
| "entropy": 1.9800901770591737, |
| "epoch": 0.5420380631173212, |
| "grad_norm": 2.46875, |
| "learning_rate": 4.721281391662513e-05, |
| "loss": 1.9896121978759767, |
| "mean_token_accuracy": 0.578561270236969, |
| "num_tokens": 30920009.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5420380631173212, |
| "eval_entropy": 2.363973617553711, |
| "eval_loss": 2.643784523010254, |
| "eval_mean_token_accuracy": 0.46993932127952576, |
| "eval_num_tokens": 30920009.0, |
| "eval_runtime": 0.4236, |
| "eval_samples_per_second": 37.772, |
| "eval_steps_per_second": 4.722, |
| "step": 4500 |
| }, |
| { |
| "entropy": 1.9288830518722535, |
| "epoch": 0.5432425921464707, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.701395830542052e-05, |
| "loss": 1.9468812942504883, |
| "mean_token_accuracy": 0.5802593350410461, |
| "num_tokens": 30988375.0, |
| "step": 4510 |
| }, |
| { |
| "entropy": 1.976506495475769, |
| "epoch": 0.5444471211756203, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.681515008364679e-05, |
| "loss": 1.9802528381347657, |
| "mean_token_accuracy": 0.5758872270584107, |
| "num_tokens": 31060076.0, |
| "step": 4520 |
| }, |
| { |
| "entropy": 1.9826183795928956, |
| "epoch": 0.5456516502047699, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.661639240645362e-05, |
| "loss": 1.9977703094482422, |
| "mean_token_accuracy": 0.5760325610637664, |
| "num_tokens": 31129178.0, |
| "step": 4530 |
| }, |
| { |
| "entropy": 1.920491099357605, |
| "epoch": 0.5468561792339195, |
| "grad_norm": 1.75, |
| "learning_rate": 4.641768842818852e-05, |
| "loss": 1.9400859832763673, |
| "mean_token_accuracy": 0.5816328704357148, |
| "num_tokens": 31195811.0, |
| "step": 4540 |
| }, |
| { |
| "entropy": 1.9592141389846802, |
| "epoch": 0.5480607082630692, |
| "grad_norm": 5.59375, |
| "learning_rate": 4.621904130234678e-05, |
| "loss": 1.9684932708740235, |
| "mean_token_accuracy": 0.5793033003807068, |
| "num_tokens": 31269666.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 1.9467454433441163, |
| "epoch": 0.5492652372922188, |
| "grad_norm": 4.03125, |
| "learning_rate": 4.6020454181521456e-05, |
| "loss": 1.9361564636230468, |
| "mean_token_accuracy": 0.5842253565788269, |
| "num_tokens": 31334999.0, |
| "step": 4560 |
| }, |
| { |
| "entropy": 1.9928619265556335, |
| "epoch": 0.5504697663213683, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.582193021735327e-05, |
| "loss": 2.0216325759887694, |
| "mean_token_accuracy": 0.5709339559078217, |
| "num_tokens": 31400897.0, |
| "step": 4570 |
| }, |
| { |
| "entropy": 1.9685459375381469, |
| "epoch": 0.5516742953505179, |
| "grad_norm": 1.6328125, |
| "learning_rate": 4.562347256048062e-05, |
| "loss": 1.9661026000976562, |
| "mean_token_accuracy": 0.5800518572330475, |
| "num_tokens": 31463610.0, |
| "step": 4580 |
| }, |
| { |
| "entropy": 1.8976296782493591, |
| "epoch": 0.5528788243796675, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.542508436048964e-05, |
| "loss": 1.901803970336914, |
| "mean_token_accuracy": 0.5900285243988037, |
| "num_tokens": 31531288.0, |
| "step": 4590 |
| }, |
| { |
| "entropy": 1.8951862573623657, |
| "epoch": 0.5540833534088172, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.5226768765864116e-05, |
| "loss": 1.9238201141357423, |
| "mean_token_accuracy": 0.5814629673957825, |
| "num_tokens": 31600252.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5540833534088172, |
| "eval_entropy": 2.4053670167922974, |
| "eval_loss": 2.6405816078186035, |
| "eval_mean_token_accuracy": 0.46436651051044464, |
| "eval_num_tokens": 31600252.0, |
| "eval_runtime": 0.4015, |
| "eval_samples_per_second": 39.851, |
| "eval_steps_per_second": 4.981, |
| "step": 4600 |
| }, |
| { |
| "entropy": 1.9277650356292724, |
| "epoch": 0.5552878824379668, |
| "grad_norm": 2.0, |
| "learning_rate": 4.502852892393555e-05, |
| "loss": 1.9219816207885743, |
| "mean_token_accuracy": 0.5852508783340454, |
| "num_tokens": 31669817.0, |
| "step": 4610 |
| }, |
| { |
| "entropy": 1.8948925614356995, |
| "epoch": 0.5564924114671164, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.483036798083327e-05, |
| "loss": 1.9002313613891602, |
| "mean_token_accuracy": 0.5929959654808045, |
| "num_tokens": 31736160.0, |
| "step": 4620 |
| }, |
| { |
| "entropy": 2.036375272274017, |
| "epoch": 0.557696940496266, |
| "grad_norm": 2.625, |
| "learning_rate": 4.4632289081434425e-05, |
| "loss": 2.051458168029785, |
| "mean_token_accuracy": 0.5620020091533661, |
| "num_tokens": 31803863.0, |
| "step": 4630 |
| }, |
| { |
| "entropy": 1.998021125793457, |
| "epoch": 0.5589014695254155, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.443429536931412e-05, |
| "loss": 2.0050621032714844, |
| "mean_token_accuracy": 0.5741972327232361, |
| "num_tokens": 31872612.0, |
| "step": 4640 |
| }, |
| { |
| "entropy": 1.9433504939079285, |
| "epoch": 0.5601059985545651, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.4236389986695506e-05, |
| "loss": 1.9620183944702148, |
| "mean_token_accuracy": 0.5792094707489014, |
| "num_tokens": 31942349.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 1.8521189928054809, |
| "epoch": 0.5613105275837148, |
| "grad_norm": 2.59375, |
| "learning_rate": 4.40385760743999e-05, |
| "loss": 1.8720680236816407, |
| "mean_token_accuracy": 0.596925801038742, |
| "num_tokens": 32012514.0, |
| "step": 4660 |
| }, |
| { |
| "entropy": 1.8461775422096252, |
| "epoch": 0.5625150566128644, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.384085677179698e-05, |
| "loss": 1.8293455123901368, |
| "mean_token_accuracy": 0.599132776260376, |
| "num_tokens": 32078200.0, |
| "step": 4670 |
| }, |
| { |
| "entropy": 1.8661665320396423, |
| "epoch": 0.563719585642014, |
| "grad_norm": 2.828125, |
| "learning_rate": 4.3643235216754937e-05, |
| "loss": 1.900370216369629, |
| "mean_token_accuracy": 0.5908865690231323, |
| "num_tokens": 32146391.0, |
| "step": 4680 |
| }, |
| { |
| "entropy": 1.9501501083374024, |
| "epoch": 0.5649241146711635, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.344571454559066e-05, |
| "loss": 1.9497032165527344, |
| "mean_token_accuracy": 0.5843803644180298, |
| "num_tokens": 32215263.0, |
| "step": 4690 |
| }, |
| { |
| "entropy": 1.999093735218048, |
| "epoch": 0.5661286437003131, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.3248297893019974e-05, |
| "loss": 1.9907657623291015, |
| "mean_token_accuracy": 0.5720624268054962, |
| "num_tokens": 32286557.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5661286437003131, |
| "eval_entropy": 2.3873685598373413, |
| "eval_loss": 2.64916729927063, |
| "eval_mean_token_accuracy": 0.4610539674758911, |
| "eval_num_tokens": 32286557.0, |
| "eval_runtime": 0.3893, |
| "eval_samples_per_second": 41.1, |
| "eval_steps_per_second": 5.137, |
| "step": 4700 |
| }, |
| { |
| "entropy": 1.8912575364112854, |
| "epoch": 0.5673331727294628, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.305098839210793e-05, |
| "loss": 1.9253583908081056, |
| "mean_token_accuracy": 0.5839666426181793, |
| "num_tokens": 32353787.0, |
| "step": 4710 |
| }, |
| { |
| "entropy": 2.0117050647735595, |
| "epoch": 0.5685377017586124, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.285378917421901e-05, |
| "loss": 2.027020263671875, |
| "mean_token_accuracy": 0.5696452796459198, |
| "num_tokens": 32422276.0, |
| "step": 4720 |
| }, |
| { |
| "entropy": 1.9269362688064575, |
| "epoch": 0.569742230787762, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.26567033689675e-05, |
| "loss": 1.9298131942749024, |
| "mean_token_accuracy": 0.5858283340930939, |
| "num_tokens": 32489865.0, |
| "step": 4730 |
| }, |
| { |
| "entropy": 1.9381520748138428, |
| "epoch": 0.5709467598169116, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.245973410416776e-05, |
| "loss": 1.9504453659057617, |
| "mean_token_accuracy": 0.5801201462745667, |
| "num_tokens": 32558311.0, |
| "step": 4740 |
| }, |
| { |
| "entropy": 1.957834541797638, |
| "epoch": 0.5721512888460611, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.226288450578466e-05, |
| "loss": 1.9595251083374023, |
| "mean_token_accuracy": 0.5809877216815948, |
| "num_tokens": 32628771.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 2.0325303077697754, |
| "epoch": 0.5733558178752108, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.206615769788388e-05, |
| "loss": 2.0349349975585938, |
| "mean_token_accuracy": 0.5715503364801406, |
| "num_tokens": 32696002.0, |
| "step": 4760 |
| }, |
| { |
| "entropy": 1.9599291920661925, |
| "epoch": 0.5745603469043604, |
| "grad_norm": 2.75, |
| "learning_rate": 4.18695568025824e-05, |
| "loss": 1.9857137680053711, |
| "mean_token_accuracy": 0.5762530505657196, |
| "num_tokens": 32765409.0, |
| "step": 4770 |
| }, |
| { |
| "entropy": 2.011521780490875, |
| "epoch": 0.57576487593351, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.167308493999895e-05, |
| "loss": 2.0376073837280275, |
| "mean_token_accuracy": 0.5651960968971252, |
| "num_tokens": 32836052.0, |
| "step": 4780 |
| }, |
| { |
| "entropy": 1.9228283405303954, |
| "epoch": 0.5769694049626596, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.1476745228204396e-05, |
| "loss": 1.912071418762207, |
| "mean_token_accuracy": 0.5910592377185822, |
| "num_tokens": 32905194.0, |
| "step": 4790 |
| }, |
| { |
| "entropy": 1.865368616580963, |
| "epoch": 0.5781739339918092, |
| "grad_norm": 2.4375, |
| "learning_rate": 4.12805407831724e-05, |
| "loss": 1.891912078857422, |
| "mean_token_accuracy": 0.5888952136039733, |
| "num_tokens": 32972498.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5781739339918092, |
| "eval_entropy": 2.3830796480178833, |
| "eval_loss": 2.640810489654541, |
| "eval_mean_token_accuracy": 0.46129730343818665, |
| "eval_num_tokens": 32972498.0, |
| "eval_runtime": 0.4087, |
| "eval_samples_per_second": 39.153, |
| "eval_steps_per_second": 4.894, |
| "step": 4800 |
| }, |
| { |
| "entropy": 1.894623613357544, |
| "epoch": 0.5793784630209589, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.1084474718729856e-05, |
| "loss": 1.8948657989501954, |
| "mean_token_accuracy": 0.590268349647522, |
| "num_tokens": 33045640.0, |
| "step": 4810 |
| }, |
| { |
| "entropy": 1.9580816626548767, |
| "epoch": 0.5805829920501084, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.0888550146507565e-05, |
| "loss": 1.9551275253295899, |
| "mean_token_accuracy": 0.5806404232978821, |
| "num_tokens": 33113472.0, |
| "step": 4820 |
| }, |
| { |
| "entropy": 2.0462084293365477, |
| "epoch": 0.581787521079258, |
| "grad_norm": 2.1875, |
| "learning_rate": 4.069277017589074e-05, |
| "loss": 2.055453872680664, |
| "mean_token_accuracy": 0.5649996995925903, |
| "num_tokens": 33184222.0, |
| "step": 4830 |
| }, |
| { |
| "entropy": 1.9837548017501831, |
| "epoch": 0.5829920501084076, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.0497137913969757e-05, |
| "loss": 1.9946985244750977, |
| "mean_token_accuracy": 0.5758336961269379, |
| "num_tokens": 33253556.0, |
| "step": 4840 |
| }, |
| { |
| "entropy": 1.9759496331214905, |
| "epoch": 0.5841965791375572, |
| "grad_norm": 3.015625, |
| "learning_rate": 4.030165646549079e-05, |
| "loss": 1.9906793594360352, |
| "mean_token_accuracy": 0.5723823666572571, |
| "num_tokens": 33322391.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 1.8935376286506653, |
| "epoch": 0.5854011081667068, |
| "grad_norm": 2.375, |
| "learning_rate": 4.010632893280659e-05, |
| "loss": 1.8841629028320312, |
| "mean_token_accuracy": 0.5903885960578918, |
| "num_tokens": 33390481.0, |
| "step": 4860 |
| }, |
| { |
| "entropy": 1.981140172481537, |
| "epoch": 0.5866056371958565, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.991115841582718e-05, |
| "loss": 1.981198501586914, |
| "mean_token_accuracy": 0.5745481014251709, |
| "num_tokens": 33458697.0, |
| "step": 4870 |
| }, |
| { |
| "entropy": 1.9192126035690307, |
| "epoch": 0.587810166225006, |
| "grad_norm": 1.625, |
| "learning_rate": 3.971614801197068e-05, |
| "loss": 1.9621315002441406, |
| "mean_token_accuracy": 0.5829608976840973, |
| "num_tokens": 33528205.0, |
| "step": 4880 |
| }, |
| { |
| "entropy": 1.8959569573402404, |
| "epoch": 0.5890146952541556, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.95213008161142e-05, |
| "loss": 1.8824783325195313, |
| "mean_token_accuracy": 0.591416871547699, |
| "num_tokens": 33597857.0, |
| "step": 4890 |
| }, |
| { |
| "entropy": 1.9513839960098267, |
| "epoch": 0.5902192242833052, |
| "grad_norm": 2.015625, |
| "learning_rate": 3.9326619920544696e-05, |
| "loss": 1.9795387268066407, |
| "mean_token_accuracy": 0.5807128429412842, |
| "num_tokens": 33666832.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5902192242833052, |
| "eval_entropy": 2.3947250843048096, |
| "eval_loss": 2.6315090656280518, |
| "eval_mean_token_accuracy": 0.46670788526535034, |
| "eval_num_tokens": 33666832.0, |
| "eval_runtime": 0.4129, |
| "eval_samples_per_second": 38.749, |
| "eval_steps_per_second": 4.844, |
| "step": 4900 |
| }, |
| { |
| "entropy": 1.9754149913787842, |
| "epoch": 0.5914237533124548, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.9132108414909846e-05, |
| "loss": 1.9650136947631835, |
| "mean_token_accuracy": 0.5761143267154694, |
| "num_tokens": 33738439.0, |
| "step": 4910 |
| }, |
| { |
| "entropy": 1.9517736196517945, |
| "epoch": 0.5926282823416045, |
| "grad_norm": 2.484375, |
| "learning_rate": 3.893776938616908e-05, |
| "loss": 1.9693525314331055, |
| "mean_token_accuracy": 0.5792171418666839, |
| "num_tokens": 33808674.0, |
| "step": 4920 |
| }, |
| { |
| "entropy": 1.9107809901237487, |
| "epoch": 0.593832811370754, |
| "grad_norm": 2.0625, |
| "learning_rate": 3.874360591854456e-05, |
| "loss": 1.9263221740722656, |
| "mean_token_accuracy": 0.5864075660705567, |
| "num_tokens": 33876011.0, |
| "step": 4930 |
| }, |
| { |
| "entropy": 1.9395878314971924, |
| "epoch": 0.5950373403999036, |
| "grad_norm": 2.078125, |
| "learning_rate": 3.8549621093472225e-05, |
| "loss": 1.9504684448242187, |
| "mean_token_accuracy": 0.5788490891456604, |
| "num_tokens": 33943521.0, |
| "step": 4940 |
| }, |
| { |
| "entropy": 1.9085129261016847, |
| "epoch": 0.5962418694290532, |
| "grad_norm": 2.328125, |
| "learning_rate": 3.8355817989552925e-05, |
| "loss": 1.9079933166503906, |
| "mean_token_accuracy": 0.5859108805656433, |
| "num_tokens": 34011166.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 1.9190776228904725, |
| "epoch": 0.5974463984582028, |
| "grad_norm": 2.125, |
| "learning_rate": 3.816219968250354e-05, |
| "loss": 1.9361600875854492, |
| "mean_token_accuracy": 0.5844547927379609, |
| "num_tokens": 34081520.0, |
| "step": 4960 |
| }, |
| { |
| "entropy": 1.9144928336143494, |
| "epoch": 0.5986509274873525, |
| "grad_norm": 2.359375, |
| "learning_rate": 3.7968769245108116e-05, |
| "loss": 1.9301385879516602, |
| "mean_token_accuracy": 0.5838036835193634, |
| "num_tokens": 34150349.0, |
| "step": 4970 |
| }, |
| { |
| "entropy": 1.9991060137748717, |
| "epoch": 0.5998554565165021, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.777552974716919e-05, |
| "loss": 2.0122554779052733, |
| "mean_token_accuracy": 0.5668920397758483, |
| "num_tokens": 34221884.0, |
| "step": 4980 |
| }, |
| { |
| "entropy": 1.9388365507125855, |
| "epoch": 0.6010599855456517, |
| "grad_norm": 1.6484375, |
| "learning_rate": 3.7582484255459036e-05, |
| "loss": 1.943490219116211, |
| "mean_token_accuracy": 0.581670206785202, |
| "num_tokens": 34290048.0, |
| "step": 4990 |
| }, |
| { |
| "entropy": 1.9693029046058654, |
| "epoch": 0.6022645145748012, |
| "grad_norm": 4.5, |
| "learning_rate": 3.7389635833670956e-05, |
| "loss": 1.9767236709594727, |
| "mean_token_accuracy": 0.576853483915329, |
| "num_tokens": 34358309.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6022645145748012, |
| "eval_entropy": 2.418100595474243, |
| "eval_loss": 2.630772113800049, |
| "eval_mean_token_accuracy": 0.4687269777059555, |
| "eval_num_tokens": 34358309.0, |
| "eval_runtime": 0.3895, |
| "eval_samples_per_second": 41.083, |
| "eval_steps_per_second": 5.135, |
| "step": 5000 |
| }, |
| { |
| "entropy": 1.9612276196479796, |
| "epoch": 0.6034690436039508, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.719698754237071e-05, |
| "loss": 1.9557125091552734, |
| "mean_token_accuracy": 0.5791131734848023, |
| "num_tokens": 34425037.0, |
| "step": 5010 |
| }, |
| { |
| "entropy": 1.9336675524711608, |
| "epoch": 0.6046735726331005, |
| "grad_norm": 2.453125, |
| "learning_rate": 3.700454243894792e-05, |
| "loss": 1.9650556564331054, |
| "mean_token_accuracy": 0.579778116941452, |
| "num_tokens": 34489640.0, |
| "step": 5020 |
| }, |
| { |
| "entropy": 1.9100453734397889, |
| "epoch": 0.6058781016622501, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.681230357756755e-05, |
| "loss": 1.9068069458007812, |
| "mean_token_accuracy": 0.58942751288414, |
| "num_tokens": 34558251.0, |
| "step": 5030 |
| }, |
| { |
| "entropy": 1.8939356923103332, |
| "epoch": 0.6070826306913997, |
| "grad_norm": 2.140625, |
| "learning_rate": 3.662027400912144e-05, |
| "loss": 1.8894643783569336, |
| "mean_token_accuracy": 0.5893211364746094, |
| "num_tokens": 34627206.0, |
| "step": 5040 |
| }, |
| { |
| "entropy": 1.8718860745429993, |
| "epoch": 0.6082871597205493, |
| "grad_norm": 1.96875, |
| "learning_rate": 3.642845678117989e-05, |
| "loss": 1.8915393829345704, |
| "mean_token_accuracy": 0.5894035339355469, |
| "num_tokens": 34698945.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 1.8748759269714355, |
| "epoch": 0.6094916887496988, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.6236854937943265e-05, |
| "loss": 1.8840362548828125, |
| "mean_token_accuracy": 0.595303225517273, |
| "num_tokens": 34767145.0, |
| "step": 5060 |
| }, |
| { |
| "entropy": 1.9592718839645387, |
| "epoch": 0.6106962177788484, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.604547152019373e-05, |
| "loss": 1.957082176208496, |
| "mean_token_accuracy": 0.5802127182483673, |
| "num_tokens": 34837448.0, |
| "step": 5070 |
| }, |
| { |
| "entropy": 1.9220492243766785, |
| "epoch": 0.6119007468079981, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.5854309565246964e-05, |
| "loss": 1.932563591003418, |
| "mean_token_accuracy": 0.5790705144405365, |
| "num_tokens": 34908463.0, |
| "step": 5080 |
| }, |
| { |
| "entropy": 1.9812603712081909, |
| "epoch": 0.6131052758371477, |
| "grad_norm": 2.390625, |
| "learning_rate": 3.5663372106903945e-05, |
| "loss": 1.991549301147461, |
| "mean_token_accuracy": 0.5749566674232482, |
| "num_tokens": 34978001.0, |
| "step": 5090 |
| }, |
| { |
| "entropy": 1.8598801732063293, |
| "epoch": 0.6143098048662973, |
| "grad_norm": 1.6640625, |
| "learning_rate": 3.547266217540285e-05, |
| "loss": 1.8648416519165039, |
| "mean_token_accuracy": 0.5984355032444, |
| "num_tokens": 35044073.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6143098048662973, |
| "eval_entropy": 2.3398125171661377, |
| "eval_loss": 2.62454891204834, |
| "eval_mean_token_accuracy": 0.46985819935798645, |
| "eval_num_tokens": 35044073.0, |
| "eval_runtime": 0.3855, |
| "eval_samples_per_second": 41.507, |
| "eval_steps_per_second": 5.188, |
| "step": 5100 |
| }, |
| { |
| "entropy": 1.8979187965393067, |
| "epoch": 0.6155143338954469, |
| "grad_norm": 1.875, |
| "learning_rate": 3.52821827973709e-05, |
| "loss": 1.929935836791992, |
| "mean_token_accuracy": 0.5857858061790466, |
| "num_tokens": 35113684.0, |
| "step": 5110 |
| }, |
| { |
| "entropy": 1.9798847556114196, |
| "epoch": 0.6167188629245964, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.509193699577638e-05, |
| "loss": 1.963231086730957, |
| "mean_token_accuracy": 0.5840341687202454, |
| "num_tokens": 35180721.0, |
| "step": 5120 |
| }, |
| { |
| "entropy": 1.8245866298675537, |
| "epoch": 0.6179233919537461, |
| "grad_norm": 1.9765625, |
| "learning_rate": 3.490192778988063e-05, |
| "loss": 1.8163463592529296, |
| "mean_token_accuracy": 0.6064060211181641, |
| "num_tokens": 35250848.0, |
| "step": 5130 |
| }, |
| { |
| "entropy": 1.9118698000907899, |
| "epoch": 0.6191279209828957, |
| "grad_norm": 1.75, |
| "learning_rate": 3.4712158195190145e-05, |
| "loss": 1.9413307189941407, |
| "mean_token_accuracy": 0.5859278023242951, |
| "num_tokens": 35321431.0, |
| "step": 5140 |
| }, |
| { |
| "entropy": 1.8520223259925843, |
| "epoch": 0.6203324500120453, |
| "grad_norm": 1.6171875, |
| "learning_rate": 3.452263122340873e-05, |
| "loss": 1.8534076690673829, |
| "mean_token_accuracy": 0.5987428843975067, |
| "num_tokens": 35393047.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 1.9185335278511046, |
| "epoch": 0.6215369790411949, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.433334988238966e-05, |
| "loss": 1.9247661590576173, |
| "mean_token_accuracy": 0.5864275336265564, |
| "num_tokens": 35458684.0, |
| "step": 5160 |
| }, |
| { |
| "entropy": 1.930286693572998, |
| "epoch": 0.6227415080703445, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.414431717608798e-05, |
| "loss": 1.9481643676757812, |
| "mean_token_accuracy": 0.5829527914524079, |
| "num_tokens": 35525629.0, |
| "step": 5170 |
| }, |
| { |
| "entropy": 1.9276029348373414, |
| "epoch": 0.6239460370994941, |
| "grad_norm": 2.765625, |
| "learning_rate": 3.395553610451284e-05, |
| "loss": 1.9277130126953126, |
| "mean_token_accuracy": 0.5824755191802978, |
| "num_tokens": 35595976.0, |
| "step": 5180 |
| }, |
| { |
| "entropy": 1.9220746278762817, |
| "epoch": 0.6251505661286437, |
| "grad_norm": 2.375, |
| "learning_rate": 3.376700966367985e-05, |
| "loss": 1.925217056274414, |
| "mean_token_accuracy": 0.5840397000312805, |
| "num_tokens": 35665532.0, |
| "step": 5190 |
| }, |
| { |
| "entropy": 1.8797456979751588, |
| "epoch": 0.6263550951577933, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.3578740845563555e-05, |
| "loss": 1.9007396697998047, |
| "mean_token_accuracy": 0.5888063788414002, |
| "num_tokens": 35737061.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.6263550951577933, |
| "eval_entropy": 2.3753281831741333, |
| "eval_loss": 2.6204867362976074, |
| "eval_mean_token_accuracy": 0.46662676334381104, |
| "eval_num_tokens": 35737061.0, |
| "eval_runtime": 0.4077, |
| "eval_samples_per_second": 39.244, |
| "eval_steps_per_second": 4.905, |
| "step": 5200 |
| }, |
| { |
| "entropy": 1.8981397032737732, |
| "epoch": 0.6275596241869429, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.339073263804994e-05, |
| "loss": 1.9124277114868165, |
| "mean_token_accuracy": 0.5864745378494263, |
| "num_tokens": 35804675.0, |
| "step": 5210 |
| }, |
| { |
| "entropy": 1.976684045791626, |
| "epoch": 0.6287641532160925, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.320298802488903e-05, |
| "loss": 1.9907527923583985, |
| "mean_token_accuracy": 0.5714164018630982, |
| "num_tokens": 35877161.0, |
| "step": 5220 |
| }, |
| { |
| "entropy": 1.8927918910980224, |
| "epoch": 0.6299686822452422, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.301550998564751e-05, |
| "loss": 1.8990289688110351, |
| "mean_token_accuracy": 0.5901903450489044, |
| "num_tokens": 35948270.0, |
| "step": 5230 |
| }, |
| { |
| "entropy": 1.9140401601791381, |
| "epoch": 0.6311732112743917, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.2828301495661456e-05, |
| "loss": 1.9242591857910156, |
| "mean_token_accuracy": 0.5829783380031586, |
| "num_tokens": 36017155.0, |
| "step": 5240 |
| }, |
| { |
| "entropy": 1.890995192527771, |
| "epoch": 0.6323777403035413, |
| "grad_norm": 2.5, |
| "learning_rate": 3.264136552598911e-05, |
| "loss": 1.8821306228637695, |
| "mean_token_accuracy": 0.5897811651229858, |
| "num_tokens": 36086123.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 1.8787309169769286, |
| "epoch": 0.6335822693326909, |
| "grad_norm": 2.59375, |
| "learning_rate": 3.245470504336374e-05, |
| "loss": 1.885854721069336, |
| "mean_token_accuracy": 0.5903471231460571, |
| "num_tokens": 36155392.0, |
| "step": 5260 |
| }, |
| { |
| "entropy": 1.8674795508384705, |
| "epoch": 0.6347867983618405, |
| "grad_norm": 2.453125, |
| "learning_rate": 3.2268323010146533e-05, |
| "loss": 1.8879039764404297, |
| "mean_token_accuracy": 0.5887195408344269, |
| "num_tokens": 36227815.0, |
| "step": 5270 |
| }, |
| { |
| "entropy": 1.9036988496780396, |
| "epoch": 0.6359913273909901, |
| "grad_norm": 2.234375, |
| "learning_rate": 3.2082222384279606e-05, |
| "loss": 1.9163215637207032, |
| "mean_token_accuracy": 0.5897344052791595, |
| "num_tokens": 36295653.0, |
| "step": 5280 |
| }, |
| { |
| "entropy": 1.8928561806678772, |
| "epoch": 0.6371958564201398, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.1896406119239056e-05, |
| "loss": 1.8859382629394532, |
| "mean_token_accuracy": 0.5898880422115326, |
| "num_tokens": 36365568.0, |
| "step": 5290 |
| }, |
| { |
| "entropy": 1.905668354034424, |
| "epoch": 0.6384003854492893, |
| "grad_norm": 1.59375, |
| "learning_rate": 3.171087716398806e-05, |
| "loss": 1.9199764251708984, |
| "mean_token_accuracy": 0.5883520424365998, |
| "num_tokens": 36435763.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.6384003854492893, |
| "eval_entropy": 2.3645377159118652, |
| "eval_loss": 2.616696834564209, |
| "eval_mean_token_accuracy": 0.46832360327243805, |
| "eval_num_tokens": 36435763.0, |
| "eval_runtime": 0.4223, |
| "eval_samples_per_second": 37.889, |
| "eval_steps_per_second": 4.736, |
| "step": 5300 |
| }, |
| { |
| "entropy": 1.907051146030426, |
| "epoch": 0.6396049144784389, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.1525638462930115e-05, |
| "loss": 1.9220357894897462, |
| "mean_token_accuracy": 0.5886389434337616, |
| "num_tokens": 36502342.0, |
| "step": 5310 |
| }, |
| { |
| "entropy": 1.9605424404144287, |
| "epoch": 0.6408094435075885, |
| "grad_norm": 2.1875, |
| "learning_rate": 3.13406929558623e-05, |
| "loss": 1.9678442001342773, |
| "mean_token_accuracy": 0.578427666425705, |
| "num_tokens": 36573941.0, |
| "step": 5320 |
| }, |
| { |
| "entropy": 1.953044855594635, |
| "epoch": 0.6420139725367381, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.115604357792861e-05, |
| "loss": 1.958037757873535, |
| "mean_token_accuracy": 0.5746124386787415, |
| "num_tokens": 36642724.0, |
| "step": 5330 |
| }, |
| { |
| "entropy": 1.8768929600715638, |
| "epoch": 0.6432185015658878, |
| "grad_norm": 1.75, |
| "learning_rate": 3.097169325957334e-05, |
| "loss": 1.8723608016967774, |
| "mean_token_accuracy": 0.5956127345561981, |
| "num_tokens": 36712778.0, |
| "step": 5340 |
| }, |
| { |
| "entropy": 1.841334068775177, |
| "epoch": 0.6444230305950374, |
| "grad_norm": 2.21875, |
| "learning_rate": 3.078764492649466e-05, |
| "loss": 1.8471950531005858, |
| "mean_token_accuracy": 0.5986435294151307, |
| "num_tokens": 36780809.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 1.9197816848754883, |
| "epoch": 0.6456275596241869, |
| "grad_norm": 1.9609375, |
| "learning_rate": 3.060390149959812e-05, |
| "loss": 1.9349895477294923, |
| "mean_token_accuracy": 0.5787651658058166, |
| "num_tokens": 36850234.0, |
| "step": 5360 |
| }, |
| { |
| "entropy": 1.929334855079651, |
| "epoch": 0.6468320886533365, |
| "grad_norm": 2.296875, |
| "learning_rate": 3.0420465894950308e-05, |
| "loss": 1.9481956481933593, |
| "mean_token_accuracy": 0.5793615520000458, |
| "num_tokens": 36919525.0, |
| "step": 5370 |
| }, |
| { |
| "entropy": 1.9005924344062806, |
| "epoch": 0.6480366176824861, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.023734102373258e-05, |
| "loss": 1.9027469635009766, |
| "mean_token_accuracy": 0.5923004388809204, |
| "num_tokens": 36984363.0, |
| "step": 5380 |
| }, |
| { |
| "entropy": 1.9164150595664977, |
| "epoch": 0.6492411467116358, |
| "grad_norm": 2.546875, |
| "learning_rate": 3.0054529792194853e-05, |
| "loss": 1.9091907501220704, |
| "mean_token_accuracy": 0.5910762488842011, |
| "num_tokens": 37054526.0, |
| "step": 5390 |
| }, |
| { |
| "entropy": 1.9193613052368164, |
| "epoch": 0.6504456757407854, |
| "grad_norm": 2.4375, |
| "learning_rate": 2.9872035101609487e-05, |
| "loss": 1.9336360931396483, |
| "mean_token_accuracy": 0.580650019645691, |
| "num_tokens": 37121669.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6504456757407854, |
| "eval_entropy": 2.350887656211853, |
| "eval_loss": 2.6190919876098633, |
| "eval_mean_token_accuracy": 0.4688080996274948, |
| "eval_num_tokens": 37121669.0, |
| "eval_runtime": 0.4531, |
| "eval_samples_per_second": 35.313, |
| "eval_steps_per_second": 4.414, |
| "step": 5400 |
| }, |
| { |
| "entropy": 1.977675235271454, |
| "epoch": 0.651650204769935, |
| "grad_norm": 1.890625, |
| "learning_rate": 2.968985984822522e-05, |
| "loss": 2.0003679275512694, |
| "mean_token_accuracy": 0.5737912714481354, |
| "num_tokens": 37186062.0, |
| "step": 5410 |
| }, |
| { |
| "entropy": 1.9420138478279114, |
| "epoch": 0.6528547337990845, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.9508006923221266e-05, |
| "loss": 1.9501548767089845, |
| "mean_token_accuracy": 0.5838521778583526, |
| "num_tokens": 37256065.0, |
| "step": 5420 |
| }, |
| { |
| "entropy": 1.9069531440734864, |
| "epoch": 0.6540592628282341, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.9326479212661306e-05, |
| "loss": 1.9059297561645507, |
| "mean_token_accuracy": 0.5911825299263, |
| "num_tokens": 37324821.0, |
| "step": 5430 |
| }, |
| { |
| "entropy": 1.893869972229004, |
| "epoch": 0.6552637918573838, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.9145279597447828e-05, |
| "loss": 1.9049043655395508, |
| "mean_token_accuracy": 0.5932099878787994, |
| "num_tokens": 37390790.0, |
| "step": 5440 |
| }, |
| { |
| "entropy": 1.88713858127594, |
| "epoch": 0.6564683208865334, |
| "grad_norm": 1.78125, |
| "learning_rate": 2.896441095327632e-05, |
| "loss": 1.902921485900879, |
| "mean_token_accuracy": 0.5844146311283112, |
| "num_tokens": 37460500.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 1.8918554067611695, |
| "epoch": 0.657672849915683, |
| "grad_norm": 2.3125, |
| "learning_rate": 2.8783876150589683e-05, |
| "loss": 1.9041791915893556, |
| "mean_token_accuracy": 0.5867816925048828, |
| "num_tokens": 37528938.0, |
| "step": 5460 |
| }, |
| { |
| "entropy": 1.8805498123168944, |
| "epoch": 0.6588773789448326, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.860367805453258e-05, |
| "loss": 1.8810306549072267, |
| "mean_token_accuracy": 0.59105264544487, |
| "num_tokens": 37600188.0, |
| "step": 5470 |
| }, |
| { |
| "entropy": 1.9223363876342774, |
| "epoch": 0.6600819079739821, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.8423819524906127e-05, |
| "loss": 1.9341358184814452, |
| "mean_token_accuracy": 0.5785215318202972, |
| "num_tokens": 37669493.0, |
| "step": 5480 |
| }, |
| { |
| "entropy": 1.9478025794029237, |
| "epoch": 0.6612864370031317, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.8244303416122315e-05, |
| "loss": 1.9668249130249023, |
| "mean_token_accuracy": 0.5811723470687866, |
| "num_tokens": 37738632.0, |
| "step": 5490 |
| }, |
| { |
| "entropy": 1.9327033162117004, |
| "epoch": 0.6624909660322814, |
| "grad_norm": 2.046875, |
| "learning_rate": 2.8065132577158893e-05, |
| "loss": 1.9399887084960938, |
| "mean_token_accuracy": 0.5822966039180756, |
| "num_tokens": 37808296.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6624909660322814, |
| "eval_entropy": 2.36837637424469, |
| "eval_loss": 2.610164165496826, |
| "eval_mean_token_accuracy": 0.4664645344018936, |
| "eval_num_tokens": 37808296.0, |
| "eval_runtime": 0.4178, |
| "eval_samples_per_second": 38.293, |
| "eval_steps_per_second": 4.787, |
| "step": 5500 |
| }, |
| { |
| "entropy": 1.9694913983345033, |
| "epoch": 0.663695495061431, |
| "grad_norm": 2.421875, |
| "learning_rate": 2.7886309851513988e-05, |
| "loss": 1.9779542922973632, |
| "mean_token_accuracy": 0.5814646899700164, |
| "num_tokens": 37876068.0, |
| "step": 5510 |
| }, |
| { |
| "entropy": 1.8879098653793336, |
| "epoch": 0.6649000240905806, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.7707838077161164e-05, |
| "loss": 1.8773269653320312, |
| "mean_token_accuracy": 0.5935602962970734, |
| "num_tokens": 37946972.0, |
| "step": 5520 |
| }, |
| { |
| "entropy": 1.89464613199234, |
| "epoch": 0.6661045531197302, |
| "grad_norm": 1.671875, |
| "learning_rate": 2.7529720086504124e-05, |
| "loss": 1.9013668060302735, |
| "mean_token_accuracy": 0.5858488619327545, |
| "num_tokens": 38011397.0, |
| "step": 5530 |
| }, |
| { |
| "entropy": 1.9149715423583984, |
| "epoch": 0.6673090821488797, |
| "grad_norm": 1.75, |
| "learning_rate": 2.7351958706332047e-05, |
| "loss": 1.9140022277832032, |
| "mean_token_accuracy": 0.5807339906692505, |
| "num_tokens": 38080836.0, |
| "step": 5540 |
| }, |
| { |
| "entropy": 1.881326472759247, |
| "epoch": 0.6685136111780294, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.7174556757774562e-05, |
| "loss": 1.8995676040649414, |
| "mean_token_accuracy": 0.587762427330017, |
| "num_tokens": 38151280.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 2.0221534729003907, |
| "epoch": 0.669718140207179, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.6997517056256937e-05, |
| "loss": 2.0501468658447264, |
| "mean_token_accuracy": 0.5636834293603897, |
| "num_tokens": 38219291.0, |
| "step": 5560 |
| }, |
| { |
| "entropy": 1.9183872818946839, |
| "epoch": 0.6709226692363286, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.682084241145556e-05, |
| "loss": 1.9107559204101563, |
| "mean_token_accuracy": 0.5877353131771088, |
| "num_tokens": 38286544.0, |
| "step": 5570 |
| }, |
| { |
| "entropy": 1.9487929224967957, |
| "epoch": 0.6721271982654782, |
| "grad_norm": 4.03125, |
| "learning_rate": 2.6644535627253157e-05, |
| "loss": 1.9566732406616212, |
| "mean_token_accuracy": 0.5794759750366211, |
| "num_tokens": 38357801.0, |
| "step": 5580 |
| }, |
| { |
| "entropy": 1.8050852179527284, |
| "epoch": 0.6733317272946278, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.646859950169448e-05, |
| "loss": 1.7916004180908203, |
| "mean_token_accuracy": 0.60407754778862, |
| "num_tokens": 38429515.0, |
| "step": 5590 |
| }, |
| { |
| "entropy": 1.9309733152389525, |
| "epoch": 0.6745362563237775, |
| "grad_norm": 2.234375, |
| "learning_rate": 2.629303682694173e-05, |
| "loss": 1.9596979141235351, |
| "mean_token_accuracy": 0.5794607996940613, |
| "num_tokens": 38496132.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6745362563237775, |
| "eval_entropy": 2.3521742820739746, |
| "eval_loss": 2.6118619441986084, |
| "eval_mean_token_accuracy": 0.4638798236846924, |
| "eval_num_tokens": 38496132.0, |
| "eval_runtime": 0.4978, |
| "eval_samples_per_second": 32.143, |
| "eval_steps_per_second": 4.018, |
| "step": 5600 |
| }, |
| { |
| "entropy": 1.9450265645980835, |
| "epoch": 0.675740785352927, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.611785038923042e-05, |
| "loss": 1.9604305267333983, |
| "mean_token_accuracy": 0.5766765594482421, |
| "num_tokens": 38565467.0, |
| "step": 5610 |
| }, |
| { |
| "entropy": 1.8920610785484313, |
| "epoch": 0.6769453143820766, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.594304296882492e-05, |
| "loss": 1.8906587600708007, |
| "mean_token_accuracy": 0.5891460299491882, |
| "num_tokens": 38636572.0, |
| "step": 5620 |
| }, |
| { |
| "entropy": 1.909081017971039, |
| "epoch": 0.6781498434112262, |
| "grad_norm": 2.4375, |
| "learning_rate": 2.5768617339974606e-05, |
| "loss": 1.9260461807250977, |
| "mean_token_accuracy": 0.5860518753528595, |
| "num_tokens": 38707209.0, |
| "step": 5630 |
| }, |
| { |
| "entropy": 1.9273210883140564, |
| "epoch": 0.6793543724403758, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.5594576270869663e-05, |
| "loss": 1.9551538467407226, |
| "mean_token_accuracy": 0.5829238772392273, |
| "num_tokens": 38776194.0, |
| "step": 5640 |
| }, |
| { |
| "entropy": 1.9627973318099976, |
| "epoch": 0.6805589014695255, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.5420922523597156e-05, |
| "loss": 1.9638336181640625, |
| "mean_token_accuracy": 0.5761943578720092, |
| "num_tokens": 38847307.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 1.896808135509491, |
| "epoch": 0.681763430498675, |
| "grad_norm": 1.625, |
| "learning_rate": 2.5247658854097277e-05, |
| "loss": 1.898649024963379, |
| "mean_token_accuracy": 0.5869676113128662, |
| "num_tokens": 38916748.0, |
| "step": 5660 |
| }, |
| { |
| "entropy": 1.9466437339782714, |
| "epoch": 0.6829679595278246, |
| "grad_norm": 2.609375, |
| "learning_rate": 2.507478801211951e-05, |
| "loss": 1.9477453231811523, |
| "mean_token_accuracy": 0.5810700714588165, |
| "num_tokens": 38982369.0, |
| "step": 5670 |
| }, |
| { |
| "entropy": 1.925716495513916, |
| "epoch": 0.6841724885569742, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.4902312741179108e-05, |
| "loss": 1.9344860076904298, |
| "mean_token_accuracy": 0.5786227762699128, |
| "num_tokens": 39052521.0, |
| "step": 5680 |
| }, |
| { |
| "entropy": 1.9241721272468566, |
| "epoch": 0.6853770175861238, |
| "grad_norm": 2.015625, |
| "learning_rate": 2.4730235778513394e-05, |
| "loss": 1.9304174423217773, |
| "mean_token_accuracy": 0.5819700300693512, |
| "num_tokens": 39116120.0, |
| "step": 5690 |
| }, |
| { |
| "entropy": 1.9030420541763307, |
| "epoch": 0.6865815466152734, |
| "grad_norm": 2.15625, |
| "learning_rate": 2.4558559855038537e-05, |
| "loss": 1.9098997116088867, |
| "mean_token_accuracy": 0.5879886984825134, |
| "num_tokens": 39185492.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6865815466152734, |
| "eval_entropy": 2.36625599861145, |
| "eval_loss": 2.6117119789123535, |
| "eval_mean_token_accuracy": 0.46299195289611816, |
| "eval_num_tokens": 39185492.0, |
| "eval_runtime": 0.5422, |
| "eval_samples_per_second": 29.509, |
| "eval_steps_per_second": 3.689, |
| "step": 5700 |
| }, |
| { |
| "entropy": 1.9261456251144409, |
| "epoch": 0.6877860756444231, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.438728769530593e-05, |
| "loss": 1.9247291564941407, |
| "mean_token_accuracy": 0.5803986310958862, |
| "num_tokens": 39249701.0, |
| "step": 5710 |
| }, |
| { |
| "entropy": 1.8991306185722352, |
| "epoch": 0.6889906046735726, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.4216422017459234e-05, |
| "loss": 1.930657958984375, |
| "mean_token_accuracy": 0.58206005692482, |
| "num_tokens": 39319024.0, |
| "step": 5720 |
| }, |
| { |
| "entropy": 1.9198801875114442, |
| "epoch": 0.6901951337027222, |
| "grad_norm": 1.8671875, |
| "learning_rate": 2.4045965533191083e-05, |
| "loss": 1.9231132507324218, |
| "mean_token_accuracy": 0.5801682949066163, |
| "num_tokens": 39389627.0, |
| "step": 5730 |
| }, |
| { |
| "entropy": 1.8921023964881898, |
| "epoch": 0.6913996627318718, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.3875920947700032e-05, |
| "loss": 1.886610221862793, |
| "mean_token_accuracy": 0.5911743700504303, |
| "num_tokens": 39459243.0, |
| "step": 5740 |
| }, |
| { |
| "entropy": 2.019665813446045, |
| "epoch": 0.6926041917610214, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.3706290959647742e-05, |
| "loss": 2.031690216064453, |
| "mean_token_accuracy": 0.568154114484787, |
| "num_tokens": 39531010.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 1.8260622501373291, |
| "epoch": 0.6938087207901711, |
| "grad_norm": 2.421875, |
| "learning_rate": 2.3537078261116007e-05, |
| "loss": 1.8358779907226563, |
| "mean_token_accuracy": 0.6017170667648315, |
| "num_tokens": 39601384.0, |
| "step": 5760 |
| }, |
| { |
| "entropy": 1.82235267162323, |
| "epoch": 0.6950132498193207, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.336828553756418e-05, |
| "loss": 1.8454484939575195, |
| "mean_token_accuracy": 0.5994343221187591, |
| "num_tokens": 39675047.0, |
| "step": 5770 |
| }, |
| { |
| "entropy": 1.953169822692871, |
| "epoch": 0.6962177788484702, |
| "grad_norm": 2.46875, |
| "learning_rate": 2.3199915467786402e-05, |
| "loss": 1.9636747360229492, |
| "mean_token_accuracy": 0.5822606801986694, |
| "num_tokens": 39742957.0, |
| "step": 5780 |
| }, |
| { |
| "entropy": 1.9094646334648133, |
| "epoch": 0.6974223078776198, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.303197072386926e-05, |
| "loss": 1.924036407470703, |
| "mean_token_accuracy": 0.5854589581489563, |
| "num_tokens": 39814717.0, |
| "step": 5790 |
| }, |
| { |
| "entropy": 1.9667348742485047, |
| "epoch": 0.6986268369067694, |
| "grad_norm": 1.7109375, |
| "learning_rate": 2.286445397114914e-05, |
| "loss": 1.9706661224365234, |
| "mean_token_accuracy": 0.5856462627649307, |
| "num_tokens": 39878350.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6986268369067694, |
| "eval_entropy": 2.358152151107788, |
| "eval_loss": 2.612285852432251, |
| "eval_mean_token_accuracy": 0.4662233889102936, |
| "eval_num_tokens": 39878350.0, |
| "eval_runtime": 0.47, |
| "eval_samples_per_second": 34.044, |
| "eval_steps_per_second": 4.255, |
| "step": 5800 |
| }, |
| { |
| "entropy": 1.862882673740387, |
| "epoch": 0.6998313659359191, |
| "grad_norm": 2.171875, |
| "learning_rate": 2.2697367868170204e-05, |
| "loss": 1.8640838623046876, |
| "mean_token_accuracy": 0.5986731469631195, |
| "num_tokens": 39948246.0, |
| "step": 5810 |
| }, |
| { |
| "entropy": 1.8730922937393188, |
| "epoch": 0.7010358949650687, |
| "grad_norm": 2.359375, |
| "learning_rate": 2.2530715066642034e-05, |
| "loss": 1.8938676834106445, |
| "mean_token_accuracy": 0.5908694744110108, |
| "num_tokens": 40016822.0, |
| "step": 5820 |
| }, |
| { |
| "entropy": 1.8263341665267945, |
| "epoch": 0.7022404239942183, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.2364498211397523e-05, |
| "loss": 1.821687889099121, |
| "mean_token_accuracy": 0.600709855556488, |
| "num_tokens": 40085243.0, |
| "step": 5830 |
| }, |
| { |
| "entropy": 1.863125741481781, |
| "epoch": 0.7034449530233678, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.2198719940351048e-05, |
| "loss": 1.8795415878295898, |
| "mean_token_accuracy": 0.5911260426044465, |
| "num_tokens": 40151875.0, |
| "step": 5840 |
| }, |
| { |
| "entropy": 1.90743248462677, |
| "epoch": 0.7046494820525174, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.203338288445642e-05, |
| "loss": 1.9162694931030273, |
| "mean_token_accuracy": 0.5857582867145539, |
| "num_tokens": 40222136.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 1.9515150785446167, |
| "epoch": 0.7058540110816671, |
| "grad_norm": 2.25, |
| "learning_rate": 2.1868489667665314e-05, |
| "loss": 1.9747983932495117, |
| "mean_token_accuracy": 0.5775373160839081, |
| "num_tokens": 40293874.0, |
| "step": 5860 |
| }, |
| { |
| "entropy": 1.9025539755821228, |
| "epoch": 0.7070585401108167, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.1704042906885457e-05, |
| "loss": 1.9145645141601562, |
| "mean_token_accuracy": 0.5868270993232727, |
| "num_tokens": 40360415.0, |
| "step": 5870 |
| }, |
| { |
| "entropy": 1.872169315814972, |
| "epoch": 0.7082630691399663, |
| "grad_norm": 2.640625, |
| "learning_rate": 2.154004521193925e-05, |
| "loss": 1.8736785888671874, |
| "mean_token_accuracy": 0.5887367010116578, |
| "num_tokens": 40430002.0, |
| "step": 5880 |
| }, |
| { |
| "entropy": 1.9155991435050965, |
| "epoch": 0.7094675981691159, |
| "grad_norm": 2.125, |
| "learning_rate": 2.1376499185522237e-05, |
| "loss": 1.9274738311767579, |
| "mean_token_accuracy": 0.5839588642120361, |
| "num_tokens": 40500557.0, |
| "step": 5890 |
| }, |
| { |
| "entropy": 1.9240437746047974, |
| "epoch": 0.7106721271982654, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.1213407423161812e-05, |
| "loss": 1.9209692001342773, |
| "mean_token_accuracy": 0.5875469863414764, |
| "num_tokens": 40566716.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7106721271982654, |
| "eval_entropy": 2.3489553928375244, |
| "eval_loss": 2.6125404834747314, |
| "eval_mean_token_accuracy": 0.4673546105623245, |
| "eval_num_tokens": 40566716.0, |
| "eval_runtime": 0.5215, |
| "eval_samples_per_second": 30.678, |
| "eval_steps_per_second": 3.835, |
| "step": 5900 |
| }, |
| { |
| "entropy": 1.9260069489479066, |
| "epoch": 0.711876656227415, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.1050772513176133e-05, |
| "loss": 1.922018051147461, |
| "mean_token_accuracy": 0.5860312581062317, |
| "num_tokens": 40633145.0, |
| "step": 5910 |
| }, |
| { |
| "entropy": 1.909291636943817, |
| "epoch": 0.7130811852565647, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.0888597036632874e-05, |
| "loss": 1.905712890625, |
| "mean_token_accuracy": 0.5870055973529815, |
| "num_tokens": 40700932.0, |
| "step": 5920 |
| }, |
| { |
| "entropy": 1.9476830124855042, |
| "epoch": 0.7142857142857143, |
| "grad_norm": 2.03125, |
| "learning_rate": 2.072688356730844e-05, |
| "loss": 1.970412826538086, |
| "mean_token_accuracy": 0.5819882094860077, |
| "num_tokens": 40771411.0, |
| "step": 5930 |
| }, |
| { |
| "entropy": 1.826759159564972, |
| "epoch": 0.7154902433148639, |
| "grad_norm": 1.875, |
| "learning_rate": 2.056563467164696e-05, |
| "loss": 1.8389589309692382, |
| "mean_token_accuracy": 0.6018831849098205, |
| "num_tokens": 40840010.0, |
| "step": 5940 |
| }, |
| { |
| "entropy": 1.8843806028366088, |
| "epoch": 0.7166947723440135, |
| "grad_norm": 1.8671875, |
| "learning_rate": 2.0404852908719698e-05, |
| "loss": 1.8859840393066407, |
| "mean_token_accuracy": 0.595301216840744, |
| "num_tokens": 40910205.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 1.912154495716095, |
| "epoch": 0.717899301373163, |
| "grad_norm": 2.3125, |
| "learning_rate": 2.0244540830184298e-05, |
| "loss": 1.9205114364624023, |
| "mean_token_accuracy": 0.5793304085731507, |
| "num_tokens": 40978311.0, |
| "step": 5960 |
| }, |
| { |
| "entropy": 1.8706116080284119, |
| "epoch": 0.7191038304023127, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.0084700980244454e-05, |
| "loss": 1.8852800369262694, |
| "mean_token_accuracy": 0.5874241411685943, |
| "num_tokens": 41048171.0, |
| "step": 5970 |
| }, |
| { |
| "entropy": 1.9051282048225402, |
| "epoch": 0.7203083594314623, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.9925335895609365e-05, |
| "loss": 1.9160747528076172, |
| "mean_token_accuracy": 0.5877177476882934, |
| "num_tokens": 41116376.0, |
| "step": 5980 |
| }, |
| { |
| "entropy": 1.971784806251526, |
| "epoch": 0.7215128884606119, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.976644810545357e-05, |
| "loss": 1.9614128112792968, |
| "mean_token_accuracy": 0.5810837090015412, |
| "num_tokens": 41186537.0, |
| "step": 5990 |
| }, |
| { |
| "entropy": 1.8875194787979126, |
| "epoch": 0.7227174174897615, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.9608040131376842e-05, |
| "loss": 1.9109247207641602, |
| "mean_token_accuracy": 0.5865307509899139, |
| "num_tokens": 41258325.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7227174174897615, |
| "eval_entropy": 2.366549849510193, |
| "eval_loss": 2.61251163482666, |
| "eval_mean_token_accuracy": 0.4671112596988678, |
| "eval_num_tokens": 41258325.0, |
| "eval_runtime": 0.5543, |
| "eval_samples_per_second": 28.867, |
| "eval_steps_per_second": 3.608, |
| "step": 6000 |
| }, |
| { |
| "entropy": 1.9798705458641053, |
| "epoch": 0.7239219465189111, |
| "grad_norm": 1.6328125, |
| "learning_rate": 1.9450114487364042e-05, |
| "loss": 1.9828369140625, |
| "mean_token_accuracy": 0.5758079171180726, |
| "num_tokens": 41326418.0, |
| "step": 6010 |
| }, |
| { |
| "entropy": 1.9611675500869752, |
| "epoch": 0.7251264755480608, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.9292673679745382e-05, |
| "loss": 1.9798452377319335, |
| "mean_token_accuracy": 0.5820157170295716, |
| "num_tokens": 41392666.0, |
| "step": 6020 |
| }, |
| { |
| "entropy": 1.909050500392914, |
| "epoch": 0.7263310045772103, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.9135720207156488e-05, |
| "loss": 1.9030824661254884, |
| "mean_token_accuracy": 0.5906743764877319, |
| "num_tokens": 41459774.0, |
| "step": 6030 |
| }, |
| { |
| "entropy": 1.9096762776374816, |
| "epoch": 0.7275355336063599, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.89792565604989e-05, |
| "loss": 1.93050537109375, |
| "mean_token_accuracy": 0.5873095750808716, |
| "num_tokens": 41531612.0, |
| "step": 6040 |
| }, |
| { |
| "entropy": 1.973576021194458, |
| "epoch": 0.7287400626355095, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.8823285222900378e-05, |
| "loss": 1.9844182968139648, |
| "mean_token_accuracy": 0.5772220075130463, |
| "num_tokens": 41600679.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 1.8222813606262207, |
| "epoch": 0.7299445916646591, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.8667808669675686e-05, |
| "loss": 1.8241180419921874, |
| "mean_token_accuracy": 0.6043347537517547, |
| "num_tokens": 41669857.0, |
| "step": 6060 |
| }, |
| { |
| "entropy": 1.9602565884590148, |
| "epoch": 0.7311491206938088, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.8512829368287106e-05, |
| "loss": 1.9717720031738282, |
| "mean_token_accuracy": 0.5753653764724731, |
| "num_tokens": 41740401.0, |
| "step": 6070 |
| }, |
| { |
| "entropy": 1.9181342720985413, |
| "epoch": 0.7323536497229584, |
| "grad_norm": 1.875, |
| "learning_rate": 1.8358349778305413e-05, |
| "loss": 1.9254276275634765, |
| "mean_token_accuracy": 0.5831866025924682, |
| "num_tokens": 41812058.0, |
| "step": 6080 |
| }, |
| { |
| "entropy": 1.8529590964317322, |
| "epoch": 0.7335581787521079, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.8204372351370837e-05, |
| "loss": 1.8680103302001954, |
| "mean_token_accuracy": 0.596212899684906, |
| "num_tokens": 41881963.0, |
| "step": 6090 |
| }, |
| { |
| "entropy": 2.052596926689148, |
| "epoch": 0.7347627077812575, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.8050899531154047e-05, |
| "loss": 2.0838245391845702, |
| "mean_token_accuracy": 0.5623102843761444, |
| "num_tokens": 41952566.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.7347627077812575, |
| "eval_entropy": 2.362101435661316, |
| "eval_loss": 2.609923839569092, |
| "eval_mean_token_accuracy": 0.4649299383163452, |
| "eval_num_tokens": 41952566.0, |
| "eval_runtime": 0.4206, |
| "eval_samples_per_second": 38.041, |
| "eval_steps_per_second": 4.755, |
| "step": 6100 |
| }, |
| { |
| "entropy": 1.9244691729545593, |
| "epoch": 0.7359672368104071, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.7897933753317524e-05, |
| "loss": 1.9250370025634767, |
| "mean_token_accuracy": 0.582695585489273, |
| "num_tokens": 42020999.0, |
| "step": 6110 |
| }, |
| { |
| "entropy": 1.9595140218734741, |
| "epoch": 0.7371717658395567, |
| "grad_norm": 3.6875, |
| "learning_rate": 1.7745477445476753e-05, |
| "loss": 1.956146240234375, |
| "mean_token_accuracy": 0.5800999224185943, |
| "num_tokens": 42089893.0, |
| "step": 6120 |
| }, |
| { |
| "entropy": 1.919549560546875, |
| "epoch": 0.7383762948687064, |
| "grad_norm": 2.09375, |
| "learning_rate": 1.759353302716184e-05, |
| "loss": 1.9333702087402345, |
| "mean_token_accuracy": 0.5832765638828278, |
| "num_tokens": 42157455.0, |
| "step": 6130 |
| }, |
| { |
| "entropy": 1.946086299419403, |
| "epoch": 0.739580823897856, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.744210290977896e-05, |
| "loss": 1.9349960327148437, |
| "mean_token_accuracy": 0.578400480747223, |
| "num_tokens": 42225162.0, |
| "step": 6140 |
| }, |
| { |
| "entropy": 1.873963975906372, |
| "epoch": 0.7407853529270055, |
| "grad_norm": 1.75, |
| "learning_rate": 1.7291189496572253e-05, |
| "loss": 1.884567642211914, |
| "mean_token_accuracy": 0.5981316328048706, |
| "num_tokens": 42294964.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 1.9695120215415955, |
| "epoch": 0.7419898819561551, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.7140795182585534e-05, |
| "loss": 1.9876461029052734, |
| "mean_token_accuracy": 0.5724166512489319, |
| "num_tokens": 42362342.0, |
| "step": 6160 |
| }, |
| { |
| "entropy": 1.934032666683197, |
| "epoch": 0.7431944109853047, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.699092235462436e-05, |
| "loss": 1.9466995239257812, |
| "mean_token_accuracy": 0.5826898992061615, |
| "num_tokens": 42432625.0, |
| "step": 6170 |
| }, |
| { |
| "entropy": 1.9191329836845399, |
| "epoch": 0.7443989400144544, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.684157339121819e-05, |
| "loss": 1.9333236694335938, |
| "mean_token_accuracy": 0.5826072692871094, |
| "num_tokens": 42498732.0, |
| "step": 6180 |
| }, |
| { |
| "entropy": 1.9656689882278442, |
| "epoch": 0.745603469043604, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.66927506625825e-05, |
| "loss": 1.9759876251220703, |
| "mean_token_accuracy": 0.5751698732376098, |
| "num_tokens": 42568957.0, |
| "step": 6190 |
| }, |
| { |
| "entropy": 1.9358470439910889, |
| "epoch": 0.7468079980727536, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.6544456530581347e-05, |
| "loss": 1.9460548400878905, |
| "mean_token_accuracy": 0.5818601489067078, |
| "num_tokens": 42640191.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.7468079980727536, |
| "eval_entropy": 2.362728238105774, |
| "eval_loss": 2.608748435974121, |
| "eval_mean_token_accuracy": 0.4665456563234329, |
| "eval_num_tokens": 42640191.0, |
| "eval_runtime": 0.4335, |
| "eval_samples_per_second": 36.906, |
| "eval_steps_per_second": 4.613, |
| "step": 6200 |
| }, |
| { |
| "entropy": 1.9571660161018372, |
| "epoch": 0.7480125271019031, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.6396693348689708e-05, |
| "loss": 1.965768814086914, |
| "mean_token_accuracy": 0.5769582033157349, |
| "num_tokens": 42709254.0, |
| "step": 6210 |
| }, |
| { |
| "entropy": 1.8060012340545655, |
| "epoch": 0.7492170561310527, |
| "grad_norm": 1.640625, |
| "learning_rate": 1.6249463461956282e-05, |
| "loss": 1.8094675064086914, |
| "mean_token_accuracy": 0.6042569816112519, |
| "num_tokens": 42779914.0, |
| "step": 6220 |
| }, |
| { |
| "entropy": 1.8862876176834107, |
| "epoch": 0.7504215851602024, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.6102769206966134e-05, |
| "loss": 1.8914718627929688, |
| "mean_token_accuracy": 0.5938247859477996, |
| "num_tokens": 42848428.0, |
| "step": 6230 |
| }, |
| { |
| "entropy": 1.8931943416595458, |
| "epoch": 0.751626114189352, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.5956612911803763e-05, |
| "loss": 1.9008750915527344, |
| "mean_token_accuracy": 0.5888782799243927, |
| "num_tokens": 42918406.0, |
| "step": 6240 |
| }, |
| { |
| "entropy": 1.9467383861541747, |
| "epoch": 0.7528306432185016, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.5810996896016013e-05, |
| "loss": 1.9549457550048828, |
| "mean_token_accuracy": 0.5805156767368317, |
| "num_tokens": 42988884.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 1.8323127388954163, |
| "epoch": 0.7540351722476512, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.5665923470575322e-05, |
| "loss": 1.8414487838745117, |
| "mean_token_accuracy": 0.6019357740879059, |
| "num_tokens": 43057529.0, |
| "step": 6260 |
| }, |
| { |
| "entropy": 1.9331823110580444, |
| "epoch": 0.7552397012768007, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.5521394937843103e-05, |
| "loss": 1.9314424514770507, |
| "mean_token_accuracy": 0.5830797135829926, |
| "num_tokens": 43130313.0, |
| "step": 6270 |
| }, |
| { |
| "entropy": 1.891178059577942, |
| "epoch": 0.7564442303059504, |
| "grad_norm": 1.9765625, |
| "learning_rate": 1.537741359153308e-05, |
| "loss": 1.888962173461914, |
| "mean_token_accuracy": 0.592195349931717, |
| "num_tokens": 43198254.0, |
| "step": 6280 |
| }, |
| { |
| "entropy": 1.8627419114112853, |
| "epoch": 0.7576487593351, |
| "grad_norm": 1.6015625, |
| "learning_rate": 1.5233981716675017e-05, |
| "loss": 1.8827104568481445, |
| "mean_token_accuracy": 0.5912132978439331, |
| "num_tokens": 43269101.0, |
| "step": 6290 |
| }, |
| { |
| "entropy": 1.9497613430023193, |
| "epoch": 0.7588532883642496, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.5091101589578333e-05, |
| "loss": 1.9523366928100585, |
| "mean_token_accuracy": 0.5823286592960357, |
| "num_tokens": 43339492.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.7588532883642496, |
| "eval_entropy": 2.3515676259994507, |
| "eval_loss": 2.6067941188812256, |
| "eval_mean_token_accuracy": 0.46921147406101227, |
| "eval_num_tokens": 43339492.0, |
| "eval_runtime": 0.4579, |
| "eval_samples_per_second": 34.943, |
| "eval_steps_per_second": 4.368, |
| "step": 6300 |
| }, |
| { |
| "entropy": 1.742299199104309, |
| "epoch": 0.7600578173933992, |
| "grad_norm": 2.375, |
| "learning_rate": 1.4948775477796095e-05, |
| "loss": 1.7332212448120117, |
| "mean_token_accuracy": 0.6186984837055206, |
| "num_tokens": 43403475.0, |
| "step": 6310 |
| }, |
| { |
| "entropy": 1.9738173604011535, |
| "epoch": 0.7612623464225488, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.48070056400889e-05, |
| "loss": 2.008579635620117, |
| "mean_token_accuracy": 0.5714421808719635, |
| "num_tokens": 43474221.0, |
| "step": 6320 |
| }, |
| { |
| "entropy": 1.8835377812385559, |
| "epoch": 0.7624668754516983, |
| "grad_norm": 2.0, |
| "learning_rate": 1.4665794326389175e-05, |
| "loss": 1.8993560791015625, |
| "mean_token_accuracy": 0.5936033010482789, |
| "num_tokens": 43538217.0, |
| "step": 6330 |
| }, |
| { |
| "entropy": 1.8982553243637086, |
| "epoch": 0.763671404480848, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.4525143777765327e-05, |
| "loss": 1.9044149398803711, |
| "mean_token_accuracy": 0.5876859664916992, |
| "num_tokens": 43607826.0, |
| "step": 6340 |
| }, |
| { |
| "entropy": 2.0122554659843446, |
| "epoch": 0.7648759335099976, |
| "grad_norm": 6.0, |
| "learning_rate": 1.4385056226386296e-05, |
| "loss": 2.0187328338623045, |
| "mean_token_accuracy": 0.5686930924654007, |
| "num_tokens": 43675060.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 1.9191255450248719, |
| "epoch": 0.7660804625391472, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.4245533895486047e-05, |
| "loss": 1.9368209838867188, |
| "mean_token_accuracy": 0.5836399495601654, |
| "num_tokens": 43746519.0, |
| "step": 6360 |
| }, |
| { |
| "entropy": 1.9163129210472107, |
| "epoch": 0.7672849915682968, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.410657899932829e-05, |
| "loss": 1.911086082458496, |
| "mean_token_accuracy": 0.5841832041740418, |
| "num_tokens": 43816456.0, |
| "step": 6370 |
| }, |
| { |
| "entropy": 1.9154442310333253, |
| "epoch": 0.7684895205974464, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.3968193743171448e-05, |
| "loss": 1.928656005859375, |
| "mean_token_accuracy": 0.5853952527046203, |
| "num_tokens": 43886694.0, |
| "step": 6380 |
| }, |
| { |
| "entropy": 1.8705685019493103, |
| "epoch": 0.769694049626596, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.3830380323233483e-05, |
| "loss": 1.873354148864746, |
| "mean_token_accuracy": 0.5968881070613861, |
| "num_tokens": 43954882.0, |
| "step": 6390 |
| }, |
| { |
| "entropy": 1.8823359847068786, |
| "epoch": 0.7708985786557456, |
| "grad_norm": 1.875, |
| "learning_rate": 1.3693140926657206e-05, |
| "loss": 1.8747835159301758, |
| "mean_token_accuracy": 0.5940642535686493, |
| "num_tokens": 44024348.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.7708985786557456, |
| "eval_entropy": 2.3486063480377197, |
| "eval_loss": 2.607100009918213, |
| "eval_mean_token_accuracy": 0.4709083139896393, |
| "eval_num_tokens": 44024348.0, |
| "eval_runtime": 0.4059, |
| "eval_samples_per_second": 39.42, |
| "eval_steps_per_second": 4.927, |
| "step": 6400 |
| }, |
| { |
| "entropy": 1.9085419058799744, |
| "epoch": 0.7721031076848952, |
| "grad_norm": 1.515625, |
| "learning_rate": 1.3556477731475436e-05, |
| "loss": 1.9057226181030273, |
| "mean_token_accuracy": 0.583581292629242, |
| "num_tokens": 44095192.0, |
| "step": 6410 |
| }, |
| { |
| "entropy": 1.9753173351287843, |
| "epoch": 0.7733076367140448, |
| "grad_norm": 2.921875, |
| "learning_rate": 1.3420392906576562e-05, |
| "loss": 2.002203178405762, |
| "mean_token_accuracy": 0.5682298004627228, |
| "num_tokens": 44166393.0, |
| "step": 6420 |
| }, |
| { |
| "entropy": 1.9596870183944701, |
| "epoch": 0.7745121657431944, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.3284888611669977e-05, |
| "loss": 1.9819063186645507, |
| "mean_token_accuracy": 0.577583122253418, |
| "num_tokens": 44236505.0, |
| "step": 6430 |
| }, |
| { |
| "entropy": 1.9110072374343872, |
| "epoch": 0.7757166947723441, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.314996699725194e-05, |
| "loss": 1.9248188018798829, |
| "mean_token_accuracy": 0.5864306092262268, |
| "num_tokens": 44302671.0, |
| "step": 6440 |
| }, |
| { |
| "entropy": 1.9159401893615722, |
| "epoch": 0.7769212238014936, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.3015630204571343e-05, |
| "loss": 1.936089324951172, |
| "mean_token_accuracy": 0.5859259188175201, |
| "num_tokens": 44371608.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 1.9239344954490663, |
| "epoch": 0.7781257528306432, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.288188036559579e-05, |
| "loss": 1.9287485122680663, |
| "mean_token_accuracy": 0.5864728391170502, |
| "num_tokens": 44439064.0, |
| "step": 6460 |
| }, |
| { |
| "entropy": 1.843450939655304, |
| "epoch": 0.7793302818597928, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.2748719602977755e-05, |
| "loss": 1.8487020492553712, |
| "mean_token_accuracy": 0.6032763719558716, |
| "num_tokens": 44509652.0, |
| "step": 6470 |
| }, |
| { |
| "entropy": 1.8990965366363526, |
| "epoch": 0.7805348108889424, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.261615003002084e-05, |
| "loss": 1.8943010330200196, |
| "mean_token_accuracy": 0.586923462152481, |
| "num_tokens": 44581457.0, |
| "step": 6480 |
| }, |
| { |
| "entropy": 1.8844643354415893, |
| "epoch": 0.7817393399180921, |
| "grad_norm": 1.9765625, |
| "learning_rate": 1.248417375064635e-05, |
| "loss": 1.8843027114868165, |
| "mean_token_accuracy": 0.5940824329853058, |
| "num_tokens": 44647497.0, |
| "step": 6490 |
| }, |
| { |
| "entropy": 1.943538236618042, |
| "epoch": 0.7829438689472417, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.2352792859359746e-05, |
| "loss": 1.951897430419922, |
| "mean_token_accuracy": 0.5817996025085449, |
| "num_tokens": 44715951.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7829438689472417, |
| "eval_entropy": 2.354966163635254, |
| "eval_loss": 2.6048130989074707, |
| "eval_mean_token_accuracy": 0.469454824924469, |
| "eval_num_tokens": 44715951.0, |
| "eval_runtime": 0.433, |
| "eval_samples_per_second": 36.955, |
| "eval_steps_per_second": 4.619, |
| "step": 6500 |
| }, |
| { |
| "entropy": 1.914371383190155, |
| "epoch": 0.7841483979763912, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.222200944121758e-05, |
| "loss": 1.9062223434448242, |
| "mean_token_accuracy": 0.5848917901515961, |
| "num_tokens": 44783665.0, |
| "step": 6510 |
| }, |
| { |
| "entropy": 1.9607685804367065, |
| "epoch": 0.7853529270055408, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.209182557179423e-05, |
| "loss": 1.9632522583007812, |
| "mean_token_accuracy": 0.5831448435783386, |
| "num_tokens": 44848951.0, |
| "step": 6520 |
| }, |
| { |
| "entropy": 1.9036452770233154, |
| "epoch": 0.7865574560346904, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.1962243317149113e-05, |
| "loss": 1.9184595108032227, |
| "mean_token_accuracy": 0.5862579584121704, |
| "num_tokens": 44918497.0, |
| "step": 6530 |
| }, |
| { |
| "entropy": 1.9451961398124695, |
| "epoch": 0.78776198506384, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.1833264733793797e-05, |
| "loss": 1.9473270416259765, |
| "mean_token_accuracy": 0.5783241093158722, |
| "num_tokens": 44986596.0, |
| "step": 6540 |
| }, |
| { |
| "entropy": 1.8859020948410035, |
| "epoch": 0.7889665140929897, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.1704891868659385e-05, |
| "loss": 1.8870750427246095, |
| "mean_token_accuracy": 0.5893312156200409, |
| "num_tokens": 45056784.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 1.891354513168335, |
| "epoch": 0.7901710431221393, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.1577126759064067e-05, |
| "loss": 1.889866828918457, |
| "mean_token_accuracy": 0.5927616477012634, |
| "num_tokens": 45126044.0, |
| "step": 6560 |
| }, |
| { |
| "entropy": 1.8748571395874023, |
| "epoch": 0.7913755721512888, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.1449971432680734e-05, |
| "loss": 1.902470016479492, |
| "mean_token_accuracy": 0.5893660247325897, |
| "num_tokens": 45197142.0, |
| "step": 6570 |
| }, |
| { |
| "entropy": 1.921328365802765, |
| "epoch": 0.7925801011804384, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.1323427907504852e-05, |
| "loss": 1.9254425048828125, |
| "mean_token_accuracy": 0.5840743720531464, |
| "num_tokens": 45264525.0, |
| "step": 6580 |
| }, |
| { |
| "entropy": 1.8484970211982727, |
| "epoch": 0.793784630209588, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.119749819182237e-05, |
| "loss": 1.8502546310424806, |
| "mean_token_accuracy": 0.6023968100547791, |
| "num_tokens": 45330367.0, |
| "step": 6590 |
| }, |
| { |
| "entropy": 1.929791271686554, |
| "epoch": 0.7949891592387377, |
| "grad_norm": 2.171875, |
| "learning_rate": 1.1072184284177928e-05, |
| "loss": 1.9453422546386718, |
| "mean_token_accuracy": 0.5831649899482727, |
| "num_tokens": 45399268.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7949891592387377, |
| "eval_entropy": 2.3468000888824463, |
| "eval_loss": 2.607370615005493, |
| "eval_mean_token_accuracy": 0.46759575605392456, |
| "eval_num_tokens": 45399268.0, |
| "eval_runtime": 0.5249, |
| "eval_samples_per_second": 30.482, |
| "eval_steps_per_second": 3.81, |
| "step": 6600 |
| }, |
| { |
| "entropy": 1.9908106565475463, |
| "epoch": 0.7961936882678873, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.0947488173343045e-05, |
| "loss": 1.9914398193359375, |
| "mean_token_accuracy": 0.5752636313438415, |
| "num_tokens": 45464999.0, |
| "step": 6610 |
| }, |
| { |
| "entropy": 1.8240914940834045, |
| "epoch": 0.7973982172970369, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.0823411838284675e-05, |
| "loss": 1.8167228698730469, |
| "mean_token_accuracy": 0.6027800858020782, |
| "num_tokens": 45536036.0, |
| "step": 6620 |
| }, |
| { |
| "entropy": 1.8865158796310424, |
| "epoch": 0.7986027463261864, |
| "grad_norm": 2.0625, |
| "learning_rate": 1.0699957248133674e-05, |
| "loss": 1.8847312927246094, |
| "mean_token_accuracy": 0.5909352362155914, |
| "num_tokens": 45605062.0, |
| "step": 6630 |
| }, |
| { |
| "entropy": 1.9506030559539795, |
| "epoch": 0.799807275355336, |
| "grad_norm": 1.75, |
| "learning_rate": 1.0577126362153616e-05, |
| "loss": 1.9742616653442382, |
| "mean_token_accuracy": 0.5732921123504638, |
| "num_tokens": 45673092.0, |
| "step": 6640 |
| }, |
| { |
| "entropy": 1.8828906178474427, |
| "epoch": 0.8010118043844857, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.0454921129709745e-05, |
| "loss": 1.8854589462280273, |
| "mean_token_accuracy": 0.5882959604263306, |
| "num_tokens": 45743944.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 1.901366901397705, |
| "epoch": 0.8022163334136353, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.0333343490237907e-05, |
| "loss": 1.914388084411621, |
| "mean_token_accuracy": 0.5854258954524993, |
| "num_tokens": 45814879.0, |
| "step": 6660 |
| }, |
| { |
| "entropy": 1.9456279039382935, |
| "epoch": 0.8034208624427849, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.0212395373213918e-05, |
| "loss": 1.9462764739990235, |
| "mean_token_accuracy": 0.5851451516151428, |
| "num_tokens": 45882249.0, |
| "step": 6670 |
| }, |
| { |
| "entropy": 1.9052425861358642, |
| "epoch": 0.8046253914719345, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.0092078698122815e-05, |
| "loss": 1.9133377075195312, |
| "mean_token_accuracy": 0.5850100219249725, |
| "num_tokens": 45951186.0, |
| "step": 6680 |
| }, |
| { |
| "entropy": 1.8830320119857789, |
| "epoch": 0.805829920501084, |
| "grad_norm": 2.171875, |
| "learning_rate": 9.9723953744285e-06, |
| "loss": 1.8878063201904296, |
| "mean_token_accuracy": 0.5881814539432526, |
| "num_tokens": 46021706.0, |
| "step": 6690 |
| }, |
| { |
| "entropy": 1.9019694447517395, |
| "epoch": 0.8070344495302337, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.853347301543343e-06, |
| "loss": 1.9098031997680665, |
| "mean_token_accuracy": 0.5850791215896607, |
| "num_tokens": 46092513.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.8070344495302337, |
| "eval_entropy": 2.344744324684143, |
| "eval_loss": 2.6055827140808105, |
| "eval_mean_token_accuracy": 0.4685647487640381, |
| "eval_num_tokens": 46092513.0, |
| "eval_runtime": 0.4373, |
| "eval_samples_per_second": 36.586, |
| "eval_steps_per_second": 4.573, |
| "step": 6700 |
| }, |
| { |
| "entropy": 1.9491977095603943, |
| "epoch": 0.8082389785593833, |
| "grad_norm": 2.609375, |
| "learning_rate": 9.73493636879813e-06, |
| "loss": 1.9542781829833984, |
| "mean_token_accuracy": 0.5842437386512757, |
| "num_tokens": 46161184.0, |
| "step": 6710 |
| }, |
| { |
| "entropy": 1.8885266542434693, |
| "epoch": 0.8094435075885329, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.617164455411987e-06, |
| "loss": 1.879570770263672, |
| "mean_token_accuracy": 0.5920573532581329, |
| "num_tokens": 46232334.0, |
| "step": 6720 |
| }, |
| { |
| "entropy": 1.8895246863365174, |
| "epoch": 0.8106480366176825, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.500033430462602e-06, |
| "loss": 1.8861270904541017, |
| "mean_token_accuracy": 0.5923787474632263, |
| "num_tokens": 46300591.0, |
| "step": 6730 |
| }, |
| { |
| "entropy": 1.8397042512893678, |
| "epoch": 0.8118525656468321, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.383545152856605e-06, |
| "loss": 1.8577287673950196, |
| "mean_token_accuracy": 0.5989863038063049, |
| "num_tokens": 46371286.0, |
| "step": 6740 |
| }, |
| { |
| "entropy": 1.8958259224891663, |
| "epoch": 0.8130570946759816, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.267701471299956e-06, |
| "loss": 1.9154817581176757, |
| "mean_token_accuracy": 0.5852463603019714, |
| "num_tokens": 46442961.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 1.9718209743499755, |
| "epoch": 0.8142616237051313, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.152504224268742e-06, |
| "loss": 1.9979925155639648, |
| "mean_token_accuracy": 0.5743964821100235, |
| "num_tokens": 46510402.0, |
| "step": 6760 |
| }, |
| { |
| "entropy": 1.9272982358932496, |
| "epoch": 0.8154661527342809, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.037955239979856e-06, |
| "loss": 1.9375040054321289, |
| "mean_token_accuracy": 0.5840645253658294, |
| "num_tokens": 46578401.0, |
| "step": 6770 |
| }, |
| { |
| "entropy": 1.969197118282318, |
| "epoch": 0.8166706817634305, |
| "grad_norm": 2.328125, |
| "learning_rate": 8.924056336362124e-06, |
| "loss": 1.9637868881225586, |
| "mean_token_accuracy": 0.5787004292011261, |
| "num_tokens": 46648189.0, |
| "step": 6780 |
| }, |
| { |
| "entropy": 1.9857244729995727, |
| "epoch": 0.8178752107925801, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.810809321027325e-06, |
| "loss": 2.0006925582885744, |
| "mean_token_accuracy": 0.573920601606369, |
| "num_tokens": 46717650.0, |
| "step": 6790 |
| }, |
| { |
| "entropy": 1.8448691248893738, |
| "epoch": 0.8190797398217297, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.69821599124161e-06, |
| "loss": 1.8462907791137695, |
| "mean_token_accuracy": 0.5970932245254517, |
| "num_tokens": 46786658.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.8190797398217297, |
| "eval_entropy": 2.3488173484802246, |
| "eval_loss": 2.6056511402130127, |
| "eval_mean_token_accuracy": 0.4687269777059555, |
| "eval_num_tokens": 46786658.0, |
| "eval_runtime": 0.5738, |
| "eval_samples_per_second": 27.886, |
| "eval_steps_per_second": 3.486, |
| "step": 6800 |
| }, |
| { |
| "entropy": 1.8751570463180542, |
| "epoch": 0.8202842688508794, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.586278133896908e-06, |
| "loss": 1.8820247650146484, |
| "mean_token_accuracy": 0.5926909029483796, |
| "num_tokens": 46852908.0, |
| "step": 6810 |
| }, |
| { |
| "entropy": 1.9577014803886414, |
| "epoch": 0.8214887978800289, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.474997525482575e-06, |
| "loss": 1.9634927749633788, |
| "mean_token_accuracy": 0.5787667095661163, |
| "num_tokens": 46916551.0, |
| "step": 6820 |
| }, |
| { |
| "entropy": 1.9611721515655518, |
| "epoch": 0.8226933269091785, |
| "grad_norm": 2.828125, |
| "learning_rate": 8.364375932057278e-06, |
| "loss": 1.970509147644043, |
| "mean_token_accuracy": 0.5789299726486206, |
| "num_tokens": 46985403.0, |
| "step": 6830 |
| }, |
| { |
| "entropy": 1.9028493762016296, |
| "epoch": 0.8238978559383281, |
| "grad_norm": 10.0, |
| "learning_rate": 8.254415109220837e-06, |
| "loss": 1.899116325378418, |
| "mean_token_accuracy": 0.590817129611969, |
| "num_tokens": 47054944.0, |
| "step": 6840 |
| }, |
| { |
| "entropy": 1.9320483922958374, |
| "epoch": 0.8251023849674777, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.145116802086489e-06, |
| "loss": 1.9514781951904296, |
| "mean_token_accuracy": 0.5878109574317932, |
| "num_tokens": 47123791.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 1.8979083180427552, |
| "epoch": 0.8263069139966274, |
| "grad_norm": 8.3125, |
| "learning_rate": 8.036482745253083e-06, |
| "loss": 1.888798141479492, |
| "mean_token_accuracy": 0.5931646287441253, |
| "num_tokens": 47190807.0, |
| "step": 6860 |
| }, |
| { |
| "entropy": 1.9247923970222474, |
| "epoch": 0.827511443025777, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.928514662777664e-06, |
| "loss": 1.9398954391479493, |
| "mean_token_accuracy": 0.582766991853714, |
| "num_tokens": 47254992.0, |
| "step": 6870 |
| }, |
| { |
| "entropy": 1.9788224577903748, |
| "epoch": 0.8287159720549265, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.821214268147997e-06, |
| "loss": 1.982382583618164, |
| "mean_token_accuracy": 0.5736671209335327, |
| "num_tokens": 47324051.0, |
| "step": 6880 |
| }, |
| { |
| "entropy": 1.891750741004944, |
| "epoch": 0.8299205010840761, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.714583264255471e-06, |
| "loss": 1.8812660217285155, |
| "mean_token_accuracy": 0.591460508108139, |
| "num_tokens": 47394995.0, |
| "step": 6890 |
| }, |
| { |
| "entropy": 1.9054150104522705, |
| "epoch": 0.8311250301132257, |
| "grad_norm": 2.671875, |
| "learning_rate": 7.6086233433680044e-06, |
| "loss": 1.9210380554199218, |
| "mean_token_accuracy": 0.587424921989441, |
| "num_tokens": 47463873.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.8311250301132257, |
| "eval_entropy": 2.3451857566833496, |
| "eval_loss": 2.604262351989746, |
| "eval_mean_token_accuracy": 0.4664645344018936, |
| "eval_num_tokens": 47463873.0, |
| "eval_runtime": 0.4522, |
| "eval_samples_per_second": 35.379, |
| "eval_steps_per_second": 4.422, |
| "step": 6900 |
| }, |
| { |
| "entropy": 1.9943472743034363, |
| "epoch": 0.8323295591423754, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.50333618710321e-06, |
| "loss": 2.015270805358887, |
| "mean_token_accuracy": 0.5674533128738404, |
| "num_tokens": 47535360.0, |
| "step": 6910 |
| }, |
| { |
| "entropy": 1.857893967628479, |
| "epoch": 0.833534088171525, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.398723466401752e-06, |
| "loss": 1.855722999572754, |
| "mean_token_accuracy": 0.5944087326526641, |
| "num_tokens": 47604620.0, |
| "step": 6920 |
| }, |
| { |
| "entropy": 1.893347978591919, |
| "epoch": 0.8347386172006745, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.294786841500739e-06, |
| "loss": 1.8694910049438476, |
| "mean_token_accuracy": 0.5950228750705719, |
| "num_tokens": 47672995.0, |
| "step": 6930 |
| }, |
| { |
| "entropy": 1.8532984614372254, |
| "epoch": 0.8359431462298241, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.1915279619074685e-06, |
| "loss": 1.8615072250366211, |
| "mean_token_accuracy": 0.5978223085403442, |
| "num_tokens": 47739483.0, |
| "step": 6940 |
| }, |
| { |
| "entropy": 1.9042423844337464, |
| "epoch": 0.8371476752589737, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.088948466373157e-06, |
| "loss": 1.9140111923217773, |
| "mean_token_accuracy": 0.5868822991847992, |
| "num_tokens": 47809508.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 1.915585207939148, |
| "epoch": 0.8383522042881233, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.9870499828670335e-06, |
| "loss": 1.9215312957763673, |
| "mean_token_accuracy": 0.5845146715641022, |
| "num_tokens": 47876262.0, |
| "step": 6960 |
| }, |
| { |
| "entropy": 1.9611434459686279, |
| "epoch": 0.839556733317273, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.8858341285504e-06, |
| "loss": 1.9818181991577148, |
| "mean_token_accuracy": 0.5766906619071961, |
| "num_tokens": 47945325.0, |
| "step": 6970 |
| }, |
| { |
| "entropy": 1.9311550855636597, |
| "epoch": 0.8407612623464226, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.785302509751057e-06, |
| "loss": 1.9468086242675782, |
| "mean_token_accuracy": 0.578011155128479, |
| "num_tokens": 48015306.0, |
| "step": 6980 |
| }, |
| { |
| "entropy": 1.9162616729736328, |
| "epoch": 0.8419657913755721, |
| "grad_norm": 2.8125, |
| "learning_rate": 6.685456721937738e-06, |
| "loss": 1.936783218383789, |
| "mean_token_accuracy": 0.5843357384204865, |
| "num_tokens": 48086370.0, |
| "step": 6990 |
| }, |
| { |
| "entropy": 1.9087663173675538, |
| "epoch": 0.8431703204047217, |
| "grad_norm": 6.125, |
| "learning_rate": 6.58629834969483e-06, |
| "loss": 1.9089078903198242, |
| "mean_token_accuracy": 0.5885790526866913, |
| "num_tokens": 48154780.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8431703204047217, |
| "eval_entropy": 2.3457889556884766, |
| "eval_loss": 2.6046531200408936, |
| "eval_mean_token_accuracy": 0.46921147406101227, |
| "eval_num_tokens": 48154780.0, |
| "eval_runtime": 0.4302, |
| "eval_samples_per_second": 37.194, |
| "eval_steps_per_second": 4.649, |
| "step": 7000 |
| }, |
| { |
| "entropy": 1.961945605278015, |
| "epoch": 0.8443748494338713, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.4878289666972405e-06, |
| "loss": 1.9694751739501952, |
| "mean_token_accuracy": 0.5779289305210114, |
| "num_tokens": 48223465.0, |
| "step": 7010 |
| }, |
| { |
| "entropy": 1.9014871001243592, |
| "epoch": 0.845579378463021, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.390050135685355e-06, |
| "loss": 1.9117996215820312, |
| "mean_token_accuracy": 0.5887616813182831, |
| "num_tokens": 48293428.0, |
| "step": 7020 |
| }, |
| { |
| "entropy": 1.8740954041481017, |
| "epoch": 0.8467839074921706, |
| "grad_norm": 2.328125, |
| "learning_rate": 6.2929634084403275e-06, |
| "loss": 1.8785751342773438, |
| "mean_token_accuracy": 0.5922869861125946, |
| "num_tokens": 48361643.0, |
| "step": 7030 |
| }, |
| { |
| "entropy": 1.8738709330558776, |
| "epoch": 0.8479884365213202, |
| "grad_norm": 3.15625, |
| "learning_rate": 6.196570325759354e-06, |
| "loss": 1.8708406448364259, |
| "mean_token_accuracy": 0.5880731344223022, |
| "num_tokens": 48431388.0, |
| "step": 7040 |
| }, |
| { |
| "entropy": 1.854604971408844, |
| "epoch": 0.8491929655504697, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.100872417431325e-06, |
| "loss": 1.8594350814819336, |
| "mean_token_accuracy": 0.5963862299919128, |
| "num_tokens": 48500006.0, |
| "step": 7050 |
| }, |
| { |
| "entropy": 1.908062756061554, |
| "epoch": 0.8503974945796193, |
| "grad_norm": 1.921875, |
| "learning_rate": 6.0058712022124374e-06, |
| "loss": 1.9291059494018554, |
| "mean_token_accuracy": 0.5871904790401459, |
| "num_tokens": 48565307.0, |
| "step": 7060 |
| }, |
| { |
| "entropy": 1.8915085315704345, |
| "epoch": 0.851602023608769, |
| "grad_norm": 1.828125, |
| "learning_rate": 5.911568187802202e-06, |
| "loss": 1.9014862060546875, |
| "mean_token_accuracy": 0.594680666923523, |
| "num_tokens": 48634315.0, |
| "step": 7070 |
| }, |
| { |
| "entropy": 1.8878782033920287, |
| "epoch": 0.8528065526379186, |
| "grad_norm": 2.046875, |
| "learning_rate": 5.8179648708194255e-06, |
| "loss": 1.8941951751708985, |
| "mean_token_accuracy": 0.5915153980255127, |
| "num_tokens": 48706116.0, |
| "step": 7080 |
| }, |
| { |
| "entropy": 1.9810489177703858, |
| "epoch": 0.8540110816670682, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.725062736778486e-06, |
| "loss": 1.9749015808105468, |
| "mean_token_accuracy": 0.5741178423166275, |
| "num_tokens": 48773243.0, |
| "step": 7090 |
| }, |
| { |
| "entropy": 1.895472002029419, |
| "epoch": 0.8552156106962178, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.632863260065802e-06, |
| "loss": 1.9204191207885741, |
| "mean_token_accuracy": 0.5905978083610535, |
| "num_tokens": 48841002.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.8552156106962178, |
| "eval_entropy": 2.3433711528778076, |
| "eval_loss": 2.607316017150879, |
| "eval_mean_token_accuracy": 0.4687269777059555, |
| "eval_num_tokens": 48841002.0, |
| "eval_runtime": 0.4384, |
| "eval_samples_per_second": 36.495, |
| "eval_steps_per_second": 4.562, |
| "step": 7100 |
| }, |
| { |
| "entropy": 1.9661857843399049, |
| "epoch": 0.8564201397253673, |
| "grad_norm": 2.5, |
| "learning_rate": 5.541367903916367e-06, |
| "loss": 1.9712324142456055, |
| "mean_token_accuracy": 0.5750096321105957, |
| "num_tokens": 48911106.0, |
| "step": 7110 |
| }, |
| { |
| "entropy": 1.9432387232780457, |
| "epoch": 0.857624668754517, |
| "grad_norm": 2.234375, |
| "learning_rate": 5.4505781203905705e-06, |
| "loss": 1.9617923736572265, |
| "mean_token_accuracy": 0.5740188658237457, |
| "num_tokens": 48979091.0, |
| "step": 7120 |
| }, |
| { |
| "entropy": 1.8688494324684144, |
| "epoch": 0.8588291977836666, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5.360495350351136e-06, |
| "loss": 1.8777385711669923, |
| "mean_token_accuracy": 0.5954580247402191, |
| "num_tokens": 49045460.0, |
| "step": 7130 |
| }, |
| { |
| "entropy": 1.9055879592895508, |
| "epoch": 0.8600337268128162, |
| "grad_norm": 2.515625, |
| "learning_rate": 5.271121023440262e-06, |
| "loss": 1.9095733642578125, |
| "mean_token_accuracy": 0.5880828917026519, |
| "num_tokens": 49113551.0, |
| "step": 7140 |
| }, |
| { |
| "entropy": 1.9325553178787231, |
| "epoch": 0.8612382558419658, |
| "grad_norm": 1.90625, |
| "learning_rate": 5.182456558056914e-06, |
| "loss": 1.9428960800170898, |
| "mean_token_accuracy": 0.5813543915748596, |
| "num_tokens": 49181943.0, |
| "step": 7150 |
| }, |
| { |
| "entropy": 1.920418655872345, |
| "epoch": 0.8624427848711154, |
| "grad_norm": 1.984375, |
| "learning_rate": 5.094503361334363e-06, |
| "loss": 1.9210506439208985, |
| "mean_token_accuracy": 0.5861852407455445, |
| "num_tokens": 49249356.0, |
| "step": 7160 |
| }, |
| { |
| "entropy": 2.001292312145233, |
| "epoch": 0.863647313900265, |
| "grad_norm": 2.71875, |
| "learning_rate": 5.007262829117793e-06, |
| "loss": 2.0109493255615236, |
| "mean_token_accuracy": 0.5729002892971039, |
| "num_tokens": 49320388.0, |
| "step": 7170 |
| }, |
| { |
| "entropy": 1.8460107445716858, |
| "epoch": 0.8648518429294146, |
| "grad_norm": 5.09375, |
| "learning_rate": 4.920736345942157e-06, |
| "loss": 1.8592243194580078, |
| "mean_token_accuracy": 0.5964674472808837, |
| "num_tokens": 49390411.0, |
| "step": 7180 |
| }, |
| { |
| "entropy": 1.8660251021385192, |
| "epoch": 0.8660563719585642, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.834925285010283e-06, |
| "loss": 1.8623886108398438, |
| "mean_token_accuracy": 0.5940340518951416, |
| "num_tokens": 49457795.0, |
| "step": 7190 |
| }, |
| { |
| "entropy": 1.8367921233177185, |
| "epoch": 0.8672609009877138, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.749831008170957e-06, |
| "loss": 1.8184185028076172, |
| "mean_token_accuracy": 0.6032820522785187, |
| "num_tokens": 49527308.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.8672609009877138, |
| "eval_entropy": 2.3435826301574707, |
| "eval_loss": 2.60599684715271, |
| "eval_mean_token_accuracy": 0.46767687797546387, |
| "eval_num_tokens": 49527308.0, |
| "eval_runtime": 0.4499, |
| "eval_samples_per_second": 35.564, |
| "eval_steps_per_second": 4.445, |
| "step": 7200 |
| }, |
| { |
| "entropy": 1.94066721200943, |
| "epoch": 0.8684654300168634, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.665454865897423e-06, |
| "loss": 1.9611551284790039, |
| "mean_token_accuracy": 0.5786782205104828, |
| "num_tokens": 49597360.0, |
| "step": 7210 |
| }, |
| { |
| "entropy": 1.9247072339057922, |
| "epoch": 0.869669959046013, |
| "grad_norm": 2.140625, |
| "learning_rate": 4.581798197265863e-06, |
| "loss": 1.9551952362060547, |
| "mean_token_accuracy": 0.5819454908370971, |
| "num_tokens": 49667178.0, |
| "step": 7220 |
| }, |
| { |
| "entropy": 1.8539777040481566, |
| "epoch": 0.8708744880751627, |
| "grad_norm": 1.6796875, |
| "learning_rate": 4.498862329934217e-06, |
| "loss": 1.860626792907715, |
| "mean_token_accuracy": 0.5921060740947723, |
| "num_tokens": 49738727.0, |
| "step": 7230 |
| }, |
| { |
| "entropy": 1.8949302673339843, |
| "epoch": 0.8720790171043122, |
| "grad_norm": 2.890625, |
| "learning_rate": 4.416648580121047e-06, |
| "loss": 1.9052955627441406, |
| "mean_token_accuracy": 0.5828841984272003, |
| "num_tokens": 49808826.0, |
| "step": 7240 |
| }, |
| { |
| "entropy": 1.9745012044906616, |
| "epoch": 0.8732835461334618, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.335158252584709e-06, |
| "loss": 1.9993856430053711, |
| "mean_token_accuracy": 0.5721430122852326, |
| "num_tokens": 49875582.0, |
| "step": 7250 |
| }, |
| { |
| "entropy": 1.9271727800369263, |
| "epoch": 0.8744880751626114, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.254392640602589e-06, |
| "loss": 1.9436689376831056, |
| "mean_token_accuracy": 0.5862739264965058, |
| "num_tokens": 49944018.0, |
| "step": 7260 |
| }, |
| { |
| "entropy": 1.8917516708374023, |
| "epoch": 0.875692604191761, |
| "grad_norm": 3.046875, |
| "learning_rate": 4.174353025950645e-06, |
| "loss": 1.9089736938476562, |
| "mean_token_accuracy": 0.587155544757843, |
| "num_tokens": 50010065.0, |
| "step": 7270 |
| }, |
| { |
| "entropy": 1.8943871140480042, |
| "epoch": 0.8768971332209107, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.095040678882989e-06, |
| "loss": 1.9053379058837892, |
| "mean_token_accuracy": 0.5885473728179932, |
| "num_tokens": 50079485.0, |
| "step": 7280 |
| }, |
| { |
| "entropy": 1.8807770133018493, |
| "epoch": 0.8781016622500603, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.016456858111778e-06, |
| "loss": 1.8898618698120118, |
| "mean_token_accuracy": 0.592292708158493, |
| "num_tokens": 50149875.0, |
| "step": 7290 |
| }, |
| { |
| "entropy": 1.9077269315719605, |
| "epoch": 0.8793061912792098, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.938602810787234e-06, |
| "loss": 1.909377098083496, |
| "mean_token_accuracy": 0.5847647190093994, |
| "num_tokens": 50215594.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.8793061912792098, |
| "eval_entropy": 2.346004843711853, |
| "eval_loss": 2.604802131652832, |
| "eval_mean_token_accuracy": 0.4670301526784897, |
| "eval_num_tokens": 50215594.0, |
| "eval_runtime": 0.4248, |
| "eval_samples_per_second": 37.664, |
| "eval_steps_per_second": 4.708, |
| "step": 7300 |
| }, |
| { |
| "entropy": 1.9532364130020141, |
| "epoch": 0.8805107203083594, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.8614797724778326e-06, |
| "loss": 1.96121826171875, |
| "mean_token_accuracy": 0.5782608270645142, |
| "num_tokens": 50282908.0, |
| "step": 7310 |
| }, |
| { |
| "entropy": 1.890508770942688, |
| "epoch": 0.881715249337509, |
| "grad_norm": 2.765625, |
| "learning_rate": 3.785088967150713e-06, |
| "loss": 1.8945865631103516, |
| "mean_token_accuracy": 0.586310613155365, |
| "num_tokens": 50351625.0, |
| "step": 7320 |
| }, |
| { |
| "entropy": 1.9098580598831176, |
| "epoch": 0.8829197783666587, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.7094316071522305e-06, |
| "loss": 1.9120573043823241, |
| "mean_token_accuracy": 0.5849235951900482, |
| "num_tokens": 50421389.0, |
| "step": 7330 |
| }, |
| { |
| "entropy": 1.8125726580619812, |
| "epoch": 0.8841243073958083, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.6345088931887482e-06, |
| "loss": 1.8193851470947267, |
| "mean_token_accuracy": 0.6028900504112243, |
| "num_tokens": 50491754.0, |
| "step": 7340 |
| }, |
| { |
| "entropy": 1.9427113771438598, |
| "epoch": 0.8853288364249579, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.5603220143075323e-06, |
| "loss": 1.9375303268432618, |
| "mean_token_accuracy": 0.5842464864253998, |
| "num_tokens": 50555309.0, |
| "step": 7350 |
| }, |
| { |
| "entropy": 1.943008029460907, |
| "epoch": 0.8865333654541074, |
| "grad_norm": 2.140625, |
| "learning_rate": 3.486872147877962e-06, |
| "loss": 1.9568832397460938, |
| "mean_token_accuracy": 0.579954844713211, |
| "num_tokens": 50623376.0, |
| "step": 7360 |
| }, |
| { |
| "entropy": 1.8467905282974244, |
| "epoch": 0.887737894483257, |
| "grad_norm": 1.5625, |
| "learning_rate": 3.414160459572746e-06, |
| "loss": 1.8543149948120117, |
| "mean_token_accuracy": 0.5997041761875153, |
| "num_tokens": 50693971.0, |
| "step": 7370 |
| }, |
| { |
| "entropy": 1.9237533926963806, |
| "epoch": 0.8889424235124066, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.3421881033494863e-06, |
| "loss": 1.9374340057373047, |
| "mean_token_accuracy": 0.5844144821166992, |
| "num_tokens": 50763595.0, |
| "step": 7380 |
| }, |
| { |
| "entropy": 1.996901023387909, |
| "epoch": 0.8901469525415563, |
| "grad_norm": 2.625, |
| "learning_rate": 3.270956221432375e-06, |
| "loss": 2.023693656921387, |
| "mean_token_accuracy": 0.5725166618824005, |
| "num_tokens": 50830108.0, |
| "step": 7390 |
| }, |
| { |
| "entropy": 1.9481045484542847, |
| "epoch": 0.8913514815707059, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.200465944294001e-06, |
| "loss": 1.974094772338867, |
| "mean_token_accuracy": 0.577884703874588, |
| "num_tokens": 50900072.0, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8913514815707059, |
| "eval_entropy": 2.3446000814437866, |
| "eval_loss": 2.6050777435302734, |
| "eval_mean_token_accuracy": 0.46913036704063416, |
| "eval_num_tokens": 50900072.0, |
| "eval_runtime": 0.4574, |
| "eval_samples_per_second": 34.977, |
| "eval_steps_per_second": 4.372, |
| "step": 7400 |
| }, |
| { |
| "entropy": 1.949066948890686, |
| "epoch": 0.8925560105998555, |
| "grad_norm": 5.21875, |
| "learning_rate": 3.1307183906374825e-06, |
| "loss": 1.9471038818359374, |
| "mean_token_accuracy": 0.5768292903900146, |
| "num_tokens": 50970210.0, |
| "step": 7410 |
| }, |
| { |
| "entropy": 1.92543066740036, |
| "epoch": 0.893760539629005, |
| "grad_norm": 2.359375, |
| "learning_rate": 3.0617146673786565e-06, |
| "loss": 1.9237152099609376, |
| "mean_token_accuracy": 0.5863450348377228, |
| "num_tokens": 51041271.0, |
| "step": 7420 |
| }, |
| { |
| "entropy": 1.9166660070419312, |
| "epoch": 0.8949650686581546, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.993455869628553e-06, |
| "loss": 1.9338464736938477, |
| "mean_token_accuracy": 0.5904297232627869, |
| "num_tokens": 51111709.0, |
| "step": 7430 |
| }, |
| { |
| "entropy": 1.922934341430664, |
| "epoch": 0.8961695976873043, |
| "grad_norm": 2.796875, |
| "learning_rate": 2.925943080675986e-06, |
| "loss": 1.9388483047485352, |
| "mean_token_accuracy": 0.5841511905193328, |
| "num_tokens": 51181794.0, |
| "step": 7440 |
| }, |
| { |
| "entropy": 1.8780438542366027, |
| "epoch": 0.8973741267164539, |
| "grad_norm": 5.03125, |
| "learning_rate": 2.859177371970384e-06, |
| "loss": 1.8922248840332032, |
| "mean_token_accuracy": 0.5913154661655426, |
| "num_tokens": 51251431.0, |
| "step": 7450 |
| }, |
| { |
| "entropy": 1.9308564901351928, |
| "epoch": 0.8985786557456035, |
| "grad_norm": 2.125, |
| "learning_rate": 2.7931598031047667e-06, |
| "loss": 1.9351591110229491, |
| "mean_token_accuracy": 0.5805219411849976, |
| "num_tokens": 51321131.0, |
| "step": 7460 |
| }, |
| { |
| "entropy": 1.8909155964851379, |
| "epoch": 0.899783184774753, |
| "grad_norm": 2.125, |
| "learning_rate": 2.7278914217989226e-06, |
| "loss": 1.9051069259643554, |
| "mean_token_accuracy": 0.5878103852272034, |
| "num_tokens": 51390078.0, |
| "step": 7470 |
| }, |
| { |
| "entropy": 1.8541254878044129, |
| "epoch": 0.9009877138039026, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.663373263882829e-06, |
| "loss": 1.858312225341797, |
| "mean_token_accuracy": 0.5975033700466156, |
| "num_tokens": 51458203.0, |
| "step": 7480 |
| }, |
| { |
| "entropy": 1.964201307296753, |
| "epoch": 0.9021922428330523, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.5996063532801427e-06, |
| "loss": 1.9741592407226562, |
| "mean_token_accuracy": 0.5768255591392517, |
| "num_tokens": 51525421.0, |
| "step": 7490 |
| }, |
| { |
| "entropy": 1.9626407384872437, |
| "epoch": 0.9033967718622019, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.5365917019920194e-06, |
| "loss": 1.9685443878173827, |
| "mean_token_accuracy": 0.5776629745960236, |
| "num_tokens": 51593989.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.9033967718622019, |
| "eval_entropy": 2.345577836036682, |
| "eval_loss": 2.6050193309783936, |
| "eval_mean_token_accuracy": 0.4687269777059555, |
| "eval_num_tokens": 51593989.0, |
| "eval_runtime": 0.4921, |
| "eval_samples_per_second": 32.513, |
| "eval_steps_per_second": 4.064, |
| "step": 7500 |
| }, |
| { |
| "entropy": 1.925170862674713, |
| "epoch": 0.9046013008913515, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.474330310080997e-06, |
| "loss": 1.941315269470215, |
| "mean_token_accuracy": 0.5777287960052491, |
| "num_tokens": 51666307.0, |
| "step": 7510 |
| }, |
| { |
| "entropy": 2.0162018299102784, |
| "epoch": 0.9058058299205011, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.4128231656551703e-06, |
| "loss": 2.0389646530151366, |
| "mean_token_accuracy": 0.5660183310508728, |
| "num_tokens": 51731751.0, |
| "step": 7520 |
| }, |
| { |
| "entropy": 1.893938374519348, |
| "epoch": 0.9070103589496507, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.3520712448524495e-06, |
| "loss": 1.905186080932617, |
| "mean_token_accuracy": 0.5870944142341614, |
| "num_tokens": 51803551.0, |
| "step": 7530 |
| }, |
| { |
| "entropy": 1.9854292273521423, |
| "epoch": 0.9082148879788002, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.2920755118251535e-06, |
| "loss": 1.9940937042236329, |
| "mean_token_accuracy": 0.5745085537433624, |
| "num_tokens": 51871883.0, |
| "step": 7540 |
| }, |
| { |
| "entropy": 1.862353754043579, |
| "epoch": 0.9094194170079499, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.2328369187246235e-06, |
| "loss": 1.8595937728881835, |
| "mean_token_accuracy": 0.5929762482643127, |
| "num_tokens": 51938996.0, |
| "step": 7550 |
| }, |
| { |
| "entropy": 1.950656282901764, |
| "epoch": 0.9106239460370995, |
| "grad_norm": 4.71875, |
| "learning_rate": 2.1743564056861564e-06, |
| "loss": 1.9565111160278321, |
| "mean_token_accuracy": 0.5811770021915436, |
| "num_tokens": 52008995.0, |
| "step": 7560 |
| }, |
| { |
| "entropy": 1.9793375253677368, |
| "epoch": 0.9118284750662491, |
| "grad_norm": 2.84375, |
| "learning_rate": 2.1166349008141017e-06, |
| "loss": 1.9869726181030274, |
| "mean_token_accuracy": 0.5761931717395783, |
| "num_tokens": 52077383.0, |
| "step": 7570 |
| }, |
| { |
| "entropy": 1.9423883318901063, |
| "epoch": 0.9130330040953987, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.0596733201670715e-06, |
| "loss": 1.9672080993652343, |
| "mean_token_accuracy": 0.5782643377780914, |
| "num_tokens": 52145346.0, |
| "step": 7580 |
| }, |
| { |
| "entropy": 1.8426369071006774, |
| "epoch": 0.9142375331245483, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.003472567743475e-06, |
| "loss": 1.854864501953125, |
| "mean_token_accuracy": 0.5952829003334046, |
| "num_tokens": 52214518.0, |
| "step": 7590 |
| }, |
| { |
| "entropy": 1.9160435318946838, |
| "epoch": 0.9154420621536979, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.948033535467103e-06, |
| "loss": 1.919775390625, |
| "mean_token_accuracy": 0.5830646812915802, |
| "num_tokens": 52282796.0, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.9154420621536979, |
| "eval_entropy": 2.34560763835907, |
| "eval_loss": 2.60562801361084, |
| "eval_mean_token_accuracy": 0.46921147406101227, |
| "eval_num_tokens": 52282796.0, |
| "eval_runtime": 0.5914, |
| "eval_samples_per_second": 27.055, |
| "eval_steps_per_second": 3.382, |
| "step": 7600 |
| }, |
| { |
| "entropy": 1.9123671293258666, |
| "epoch": 0.9166465911828475, |
| "grad_norm": 1.6640625, |
| "learning_rate": 1.893357103173027e-06, |
| "loss": 1.940965461730957, |
| "mean_token_accuracy": 0.580702805519104, |
| "num_tokens": 52352611.0, |
| "step": 7610 |
| }, |
| { |
| "entropy": 1.9102510809898376, |
| "epoch": 0.9178511202119971, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.8394441385936044e-06, |
| "loss": 1.9118707656860352, |
| "mean_token_accuracy": 0.5836511015892029, |
| "num_tokens": 52423407.0, |
| "step": 7620 |
| }, |
| { |
| "entropy": 1.8948777437210083, |
| "epoch": 0.9190556492411467, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.786295497344731e-06, |
| "loss": 1.9079843521118165, |
| "mean_token_accuracy": 0.583397525548935, |
| "num_tokens": 52492043.0, |
| "step": 7630 |
| }, |
| { |
| "entropy": 1.952082085609436, |
| "epoch": 0.9202601782702963, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.7339120229122263e-06, |
| "loss": 1.9593387603759767, |
| "mean_token_accuracy": 0.5791211247444152, |
| "num_tokens": 52558521.0, |
| "step": 7640 |
| }, |
| { |
| "entropy": 1.9148999094963073, |
| "epoch": 0.921464707299446, |
| "grad_norm": 1.5625, |
| "learning_rate": 1.6822945466384798e-06, |
| "loss": 1.9232852935791016, |
| "mean_token_accuracy": 0.5887005269527436, |
| "num_tokens": 52626715.0, |
| "step": 7650 |
| }, |
| { |
| "entropy": 1.8534759402275085, |
| "epoch": 0.9226692363285955, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.6314438877092552e-06, |
| "loss": 1.8539718627929687, |
| "mean_token_accuracy": 0.5986401557922363, |
| "num_tokens": 52694904.0, |
| "step": 7660 |
| }, |
| { |
| "entropy": 1.9236411809921266, |
| "epoch": 0.9238737653577451, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.581360853140673e-06, |
| "loss": 1.9403417587280274, |
| "mean_token_accuracy": 0.5860957503318787, |
| "num_tokens": 52764982.0, |
| "step": 7670 |
| }, |
| { |
| "entropy": 1.8666252613067627, |
| "epoch": 0.9250782943868947, |
| "grad_norm": 1.6640625, |
| "learning_rate": 1.5320462377664103e-06, |
| "loss": 1.861013412475586, |
| "mean_token_accuracy": 0.5939235925674439, |
| "num_tokens": 52834325.0, |
| "step": 7680 |
| }, |
| { |
| "entropy": 1.851730763912201, |
| "epoch": 0.9262828234160443, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.483500824225087e-06, |
| "loss": 1.8697208404541015, |
| "mean_token_accuracy": 0.594866144657135, |
| "num_tokens": 52901947.0, |
| "step": 7690 |
| }, |
| { |
| "entropy": 2.0280008792877195, |
| "epoch": 0.927487352445194, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.43572538294785e-06, |
| "loss": 2.0267152786254883, |
| "mean_token_accuracy": 0.5662186741828918, |
| "num_tokens": 52972131.0, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.927487352445194, |
| "eval_entropy": 2.3444355726242065, |
| "eval_loss": 2.6051812171936035, |
| "eval_mean_token_accuracy": 0.46985819935798645, |
| "eval_num_tokens": 52972131.0, |
| "eval_runtime": 0.4263, |
| "eval_samples_per_second": 37.536, |
| "eval_steps_per_second": 4.692, |
| "step": 7700 |
| }, |
| { |
| "entropy": 1.9256292819976806, |
| "epoch": 0.9286918814743436, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.3887206721461377e-06, |
| "loss": 1.9324100494384766, |
| "mean_token_accuracy": 0.5850838720798492, |
| "num_tokens": 53038106.0, |
| "step": 7710 |
| }, |
| { |
| "entropy": 1.9198288798332215, |
| "epoch": 0.9298964105034931, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.342487437799661e-06, |
| "loss": 1.930126953125, |
| "mean_token_accuracy": 0.5821510493755341, |
| "num_tokens": 53108130.0, |
| "step": 7720 |
| }, |
| { |
| "entropy": 2.0364713191986086, |
| "epoch": 0.9311009395326427, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.297026413644531e-06, |
| "loss": 2.0539339065551756, |
| "mean_token_accuracy": 0.5626788675785065, |
| "num_tokens": 53178667.0, |
| "step": 7730 |
| }, |
| { |
| "entropy": 1.8647327065467834, |
| "epoch": 0.9323054685617923, |
| "grad_norm": 2.0, |
| "learning_rate": 1.2523383211616557e-06, |
| "loss": 1.8728973388671875, |
| "mean_token_accuracy": 0.5942056119441986, |
| "num_tokens": 53247763.0, |
| "step": 7740 |
| }, |
| { |
| "entropy": 1.8051078200340271, |
| "epoch": 0.9335099975909419, |
| "grad_norm": 2.5, |
| "learning_rate": 1.2084238695652728e-06, |
| "loss": 1.807421875, |
| "mean_token_accuracy": 0.6024011552333832, |
| "num_tokens": 53319178.0, |
| "step": 7750 |
| }, |
| { |
| "entropy": 1.8821719527244567, |
| "epoch": 0.9347145266200916, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.1652837557916852e-06, |
| "loss": 1.8960037231445312, |
| "mean_token_accuracy": 0.5897580981254578, |
| "num_tokens": 53392119.0, |
| "step": 7760 |
| }, |
| { |
| "entropy": 1.8518799781799316, |
| "epoch": 0.9359190556492412, |
| "grad_norm": 1.75, |
| "learning_rate": 1.122918664488215e-06, |
| "loss": 1.8689956665039062, |
| "mean_token_accuracy": 0.5916654944419861, |
| "num_tokens": 53463254.0, |
| "step": 7770 |
| }, |
| { |
| "entropy": 1.904589629173279, |
| "epoch": 0.9371235846783907, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.081329268002318e-06, |
| "loss": 1.9152328491210937, |
| "mean_token_accuracy": 0.5899509906768798, |
| "num_tokens": 53534268.0, |
| "step": 7780 |
| }, |
| { |
| "entropy": 1.9228339433670043, |
| "epoch": 0.9383281137075403, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.0405162263709522e-06, |
| "loss": 1.9269737243652343, |
| "mean_token_accuracy": 0.5832758903503418, |
| "num_tokens": 53604143.0, |
| "step": 7790 |
| }, |
| { |
| "entropy": 1.9582505464553832, |
| "epoch": 0.9395326427366899, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.0004801873100488e-06, |
| "loss": 1.9452173233032226, |
| "mean_token_accuracy": 0.5848609387874604, |
| "num_tokens": 53672676.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.9395326427366899, |
| "eval_entropy": 2.345103621482849, |
| "eval_loss": 2.6041245460510254, |
| "eval_mean_token_accuracy": 0.4686458706855774, |
| "eval_num_tokens": 53672676.0, |
| "eval_runtime": 0.4458, |
| "eval_samples_per_second": 35.89, |
| "eval_steps_per_second": 4.486, |
| "step": 7800 |
| }, |
| { |
| "entropy": 1.8510336399078369, |
| "epoch": 0.9407371717658396, |
| "grad_norm": 2.75, |
| "learning_rate": 9.61221786204286e-07, |
| "loss": 1.8590290069580078, |
| "mean_token_accuracy": 0.5965212464332581, |
| "num_tokens": 53740242.0, |
| "step": 7810 |
| }, |
| { |
| "entropy": 1.9238565683364868, |
| "epoch": 0.9419417007949892, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.227416460969584e-07, |
| "loss": 1.9245628356933593, |
| "mean_token_accuracy": 0.5861930787563324, |
| "num_tokens": 53811854.0, |
| "step": 7820 |
| }, |
| { |
| "entropy": 1.8533311009407043, |
| "epoch": 0.9431462298241388, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.850403776801186e-07, |
| "loss": 1.8664436340332031, |
| "mean_token_accuracy": 0.5968495309352875, |
| "num_tokens": 53881237.0, |
| "step": 7830 |
| }, |
| { |
| "entropy": 1.8412243723869324, |
| "epoch": 0.9443507588532883, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.481185792848955e-07, |
| "loss": 1.8526018142700196, |
| "mean_token_accuracy": 0.5925304174423218, |
| "num_tokens": 53950681.0, |
| "step": 7840 |
| }, |
| { |
| "entropy": 1.9087011337280273, |
| "epoch": 0.9455552878824379, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.119768368719471e-07, |
| "loss": 1.9235469818115234, |
| "mean_token_accuracy": 0.5897546947002411, |
| "num_tokens": 54017690.0, |
| "step": 7850 |
| }, |
| { |
| "entropy": 1.8836241483688354, |
| "epoch": 0.9467598169115876, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.766157240222338e-07, |
| "loss": 1.883704376220703, |
| "mean_token_accuracy": 0.5887205183506012, |
| "num_tokens": 54087341.0, |
| "step": 7860 |
| }, |
| { |
| "entropy": 1.7967774033546449, |
| "epoch": 0.9479643459407372, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.420358019278429e-07, |
| "loss": 1.8183822631835938, |
| "mean_token_accuracy": 0.6018091261386871, |
| "num_tokens": 54157056.0, |
| "step": 7870 |
| }, |
| { |
| "entropy": 1.8912227272987365, |
| "epoch": 0.9491688749698868, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.082376193831341e-07, |
| "loss": 1.9251108169555664, |
| "mean_token_accuracy": 0.584669029712677, |
| "num_tokens": 54228217.0, |
| "step": 7880 |
| }, |
| { |
| "entropy": 1.9450337767601014, |
| "epoch": 0.9503734039990364, |
| "grad_norm": 2.46875, |
| "learning_rate": 6.752217127760085e-07, |
| "loss": 1.9651473999023437, |
| "mean_token_accuracy": 0.5786853671073914, |
| "num_tokens": 54297830.0, |
| "step": 7890 |
| }, |
| { |
| "entropy": 1.9052072405815124, |
| "epoch": 0.9515779330281859, |
| "grad_norm": 1.8515625, |
| "learning_rate": 6.429886060793977e-07, |
| "loss": 1.9121397018432618, |
| "mean_token_accuracy": 0.5886496782302857, |
| "num_tokens": 54362575.0, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.9515779330281859, |
| "eval_entropy": 2.3454304933547974, |
| "eval_loss": 2.6043336391448975, |
| "eval_mean_token_accuracy": 0.4654955416917801, |
| "eval_num_tokens": 54362575.0, |
| "eval_runtime": 0.4115, |
| "eval_samples_per_second": 38.879, |
| "eval_steps_per_second": 4.86, |
| "step": 7900 |
| }, |
| { |
| "entropy": 1.8473743200302124, |
| "epoch": 0.9527824620573356, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.115388108429598e-07, |
| "loss": 1.859954261779785, |
| "mean_token_accuracy": 0.5982777655124665, |
| "num_tokens": 54431560.0, |
| "step": 7910 |
| }, |
| { |
| "entropy": 1.8762193322181702, |
| "epoch": 0.9539869910864852, |
| "grad_norm": 2.078125, |
| "learning_rate": 5.80872826184925e-07, |
| "loss": 1.877403450012207, |
| "mean_token_accuracy": 0.5930526316165924, |
| "num_tokens": 54500261.0, |
| "step": 7920 |
| }, |
| { |
| "entropy": 1.9170417070388794, |
| "epoch": 0.9551915201156348, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.509911387842293e-07, |
| "loss": 1.9350042343139648, |
| "mean_token_accuracy": 0.5842573642730713, |
| "num_tokens": 54566244.0, |
| "step": 7930 |
| }, |
| { |
| "entropy": 1.803466534614563, |
| "epoch": 0.9563960491447844, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.218942228727486e-07, |
| "loss": 1.8177553176879884, |
| "mean_token_accuracy": 0.6064643025398254, |
| "num_tokens": 54636921.0, |
| "step": 7940 |
| }, |
| { |
| "entropy": 1.9365626573562622, |
| "epoch": 0.957600578173934, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.935825402277938e-07, |
| "loss": 1.9411260604858398, |
| "mean_token_accuracy": 0.5798054218292237, |
| "num_tokens": 54704310.0, |
| "step": 7950 |
| }, |
| { |
| "entropy": 1.9129403948783874, |
| "epoch": 0.9588051072030835, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.660565401647554e-07, |
| "loss": 1.9295969009399414, |
| "mean_token_accuracy": 0.5854752659797668, |
| "num_tokens": 54771615.0, |
| "step": 7960 |
| }, |
| { |
| "entropy": 1.9536687135696411, |
| "epoch": 0.9600096362322332, |
| "grad_norm": 3.515625, |
| "learning_rate": 4.3931665953001466e-07, |
| "loss": 1.9652423858642578, |
| "mean_token_accuracy": 0.5795530259609223, |
| "num_tokens": 54839615.0, |
| "step": 7970 |
| }, |
| { |
| "entropy": 2.003860831260681, |
| "epoch": 0.9612141652613828, |
| "grad_norm": 2.921875, |
| "learning_rate": 4.133633226939659e-07, |
| "loss": 2.010598564147949, |
| "mean_token_accuracy": 0.5707609951496124, |
| "num_tokens": 54910017.0, |
| "step": 7980 |
| }, |
| { |
| "entropy": 1.8675318241119385, |
| "epoch": 0.9624186942905324, |
| "grad_norm": 1.5234375, |
| "learning_rate": 3.8819694154432763e-07, |
| "loss": 1.862639808654785, |
| "mean_token_accuracy": 0.5950047254562378, |
| "num_tokens": 54979715.0, |
| "step": 7990 |
| }, |
| { |
| "entropy": 1.8299493789672852, |
| "epoch": 0.963623223319682, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.63817915479564e-07, |
| "loss": 1.830225372314453, |
| "mean_token_accuracy": 0.6012236654758454, |
| "num_tokens": 55049807.0, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.963623223319682, |
| "eval_entropy": 2.344685196876526, |
| "eval_loss": 2.6045424938201904, |
| "eval_mean_token_accuracy": 0.46816137433052063, |
| "eval_num_tokens": 55049807.0, |
| "eval_runtime": 0.4229, |
| "eval_samples_per_second": 37.838, |
| "eval_steps_per_second": 4.73, |
| "step": 8000 |
| }, |
| { |
| "entropy": 1.9564312815666198, |
| "epoch": 0.9648277523488316, |
| "grad_norm": 2.703125, |
| "learning_rate": 3.402266314025626e-07, |
| "loss": 1.9624727249145508, |
| "mean_token_accuracy": 0.5809799790382385, |
| "num_tokens": 55115920.0, |
| "step": 8010 |
| }, |
| { |
| "entropy": 1.9024741530418396, |
| "epoch": 0.9660322813779813, |
| "grad_norm": 2.921875, |
| "learning_rate": 3.174234637145057e-07, |
| "loss": 1.9156721115112305, |
| "mean_token_accuracy": 0.5865259766578674, |
| "num_tokens": 55188204.0, |
| "step": 8020 |
| }, |
| { |
| "entropy": 1.8939731240272522, |
| "epoch": 0.9672368104071308, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.95408774308914e-07, |
| "loss": 1.8975866317749024, |
| "mean_token_accuracy": 0.5925525784492492, |
| "num_tokens": 55256110.0, |
| "step": 8030 |
| }, |
| { |
| "entropy": 1.9126658678054809, |
| "epoch": 0.9684413394362804, |
| "grad_norm": 4.03125, |
| "learning_rate": 2.7418291256590124e-07, |
| "loss": 1.9254207611083984, |
| "mean_token_accuracy": 0.5853155791759491, |
| "num_tokens": 55322680.0, |
| "step": 8040 |
| }, |
| { |
| "entropy": 2.01700644493103, |
| "epoch": 0.96964586846543, |
| "grad_norm": 3.203125, |
| "learning_rate": 2.537462153466452e-07, |
| "loss": 2.042200469970703, |
| "mean_token_accuracy": 0.5682180523872375, |
| "num_tokens": 55392990.0, |
| "step": 8050 |
| }, |
| { |
| "entropy": 1.8897229433059692, |
| "epoch": 0.9708503974945796, |
| "grad_norm": 2.125, |
| "learning_rate": 2.3409900698802556e-07, |
| "loss": 1.900827980041504, |
| "mean_token_accuracy": 0.5886753618717193, |
| "num_tokens": 55460771.0, |
| "step": 8060 |
| }, |
| { |
| "entropy": 1.889931321144104, |
| "epoch": 0.9720549265237293, |
| "grad_norm": 2.09375, |
| "learning_rate": 2.1524159929748323e-07, |
| "loss": 1.901046371459961, |
| "mean_token_accuracy": 0.583438491821289, |
| "num_tokens": 55528176.0, |
| "step": 8070 |
| }, |
| { |
| "entropy": 1.918817901611328, |
| "epoch": 0.9732594555528788, |
| "grad_norm": 1.9765625, |
| "learning_rate": 1.971742915480801e-07, |
| "loss": 1.9224884033203125, |
| "mean_token_accuracy": 0.579836505651474, |
| "num_tokens": 55596913.0, |
| "step": 8080 |
| }, |
| { |
| "entropy": 1.919544279575348, |
| "epoch": 0.9744639845820284, |
| "grad_norm": 1.6171875, |
| "learning_rate": 1.7989737047371946e-07, |
| "loss": 1.923672866821289, |
| "mean_token_accuracy": 0.5821768641471863, |
| "num_tokens": 55666122.0, |
| "step": 8090 |
| }, |
| { |
| "entropy": 1.8848103880882263, |
| "epoch": 0.975668513611178, |
| "grad_norm": 2.09375, |
| "learning_rate": 1.6341111026464407e-07, |
| "loss": 1.8861661911010743, |
| "mean_token_accuracy": 0.5957940876483917, |
| "num_tokens": 55734211.0, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.975668513611178, |
| "eval_entropy": 2.3450320959091187, |
| "eval_loss": 2.605994701385498, |
| "eval_mean_token_accuracy": 0.46759575605392456, |
| "eval_num_tokens": 55734211.0, |
| "eval_runtime": 0.4563, |
| "eval_samples_per_second": 35.063, |
| "eval_steps_per_second": 4.383, |
| "step": 8100 |
| }, |
| { |
| "entropy": 1.9739423155784608, |
| "epoch": 0.9768730426403276, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.4771577256303404e-07, |
| "loss": 1.9796539306640626, |
| "mean_token_accuracy": 0.5786350190639495, |
| "num_tokens": 55802782.0, |
| "step": 8110 |
| }, |
| { |
| "entropy": 1.8973981261253356, |
| "epoch": 0.9780775716694773, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.3281160645889356e-07, |
| "loss": 1.9044807434082032, |
| "mean_token_accuracy": 0.5940297067165374, |
| "num_tokens": 55869507.0, |
| "step": 8120 |
| }, |
| { |
| "entropy": 1.8639543056488037, |
| "epoch": 0.9792821006986269, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.1869884848607072e-07, |
| "loss": 1.856203842163086, |
| "mean_token_accuracy": 0.5933269500732422, |
| "num_tokens": 55938519.0, |
| "step": 8130 |
| }, |
| { |
| "entropy": 1.9350976824760437, |
| "epoch": 0.9804866297277764, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.0537772261852707e-07, |
| "loss": 1.953768539428711, |
| "mean_token_accuracy": 0.5803532361984253, |
| "num_tokens": 56005239.0, |
| "step": 8140 |
| }, |
| { |
| "entropy": 1.8861400604248046, |
| "epoch": 0.981691158756926, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.284844026676287e-08, |
| "loss": 1.8933399200439454, |
| "mean_token_accuracy": 0.5899116814136505, |
| "num_tokens": 56071780.0, |
| "step": 8150 |
| }, |
| { |
| "entropy": 1.916269016265869, |
| "epoch": 0.9828956877860756, |
| "grad_norm": 5.6875, |
| "learning_rate": 8.11112002744696e-08, |
| "loss": 1.9295597076416016, |
| "mean_token_accuracy": 0.5852110207080841, |
| "num_tokens": 56141110.0, |
| "step": 8160 |
| }, |
| { |
| "entropy": 1.861630380153656, |
| "epoch": 0.9841002168152252, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.016618891538262e-08, |
| "loss": 1.8723873138427733, |
| "mean_token_accuracy": 0.5910745859146118, |
| "num_tokens": 56208947.0, |
| "step": 8170 |
| }, |
| { |
| "entropy": 1.8281081676483155, |
| "epoch": 0.9853047458443749, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.001357989030564e-08, |
| "loss": 1.8359621047973633, |
| "mean_token_accuracy": 0.5977383434772492, |
| "num_tokens": 56280892.0, |
| "step": 8180 |
| }, |
| { |
| "entropy": 1.9129138946533204, |
| "epoch": 0.9865092748735245, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.065353432436859e-08, |
| "loss": 1.9148849487304687, |
| "mean_token_accuracy": 0.5854653120040894, |
| "num_tokens": 56352290.0, |
| "step": 8190 |
| }, |
| { |
| "entropy": 1.967962348461151, |
| "epoch": 0.987713803902674, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.2086200764474004e-08, |
| "loss": 1.9775993347167968, |
| "mean_token_accuracy": 0.5787408590316773, |
| "num_tokens": 56421355.0, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.987713803902674, |
| "eval_entropy": 2.3451985120773315, |
| "eval_loss": 2.6038923263549805, |
| "eval_mean_token_accuracy": 0.4664645344018936, |
| "eval_num_tokens": 56421355.0, |
| "eval_runtime": 0.4588, |
| "eval_samples_per_second": 34.872, |
| "eval_steps_per_second": 4.359, |
| "step": 8200 |
| }, |
| { |
| "entropy": 1.8938051104545592, |
| "epoch": 0.9889183329318236, |
| "grad_norm": 1.9453125, |
| "learning_rate": 3.4311715176932327e-08, |
| "loss": 1.9075881958007812, |
| "mean_token_accuracy": 0.5848981618881226, |
| "num_tokens": 56493442.0, |
| "step": 8210 |
| }, |
| { |
| "entropy": 1.9249114871025086, |
| "epoch": 0.9901228619609732, |
| "grad_norm": 1.6484375, |
| "learning_rate": 2.7330200945296923e-08, |
| "loss": 1.9367279052734374, |
| "mean_token_accuracy": 0.578474622964859, |
| "num_tokens": 56566061.0, |
| "step": 8220 |
| }, |
| { |
| "entropy": 1.9657586932182312, |
| "epoch": 0.9913273909901229, |
| "grad_norm": 2.28125, |
| "learning_rate": 2.114176886841568e-08, |
| "loss": 1.9664051055908203, |
| "mean_token_accuracy": 0.5813057720661163, |
| "num_tokens": 56632140.0, |
| "step": 8230 |
| }, |
| { |
| "entropy": 2.01274037361145, |
| "epoch": 0.9925319200192725, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.574651715867681e-08, |
| "loss": 2.029743957519531, |
| "mean_token_accuracy": 0.5682400166988373, |
| "num_tokens": 56701631.0, |
| "step": 8240 |
| }, |
| { |
| "entropy": 1.922510302066803, |
| "epoch": 0.9937364490484221, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.1144531440437921e-08, |
| "loss": 1.9430387496948243, |
| "mean_token_accuracy": 0.5863241016864776, |
| "num_tokens": 56771393.0, |
| "step": 8250 |
| }, |
| { |
| "entropy": 1.9053786993026733, |
| "epoch": 0.9949409780775716, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.33588474867708e-09, |
| "loss": 1.9177364349365233, |
| "mean_token_accuracy": 0.5853918194770813, |
| "num_tokens": 56839912.0, |
| "step": 8260 |
| }, |
| { |
| "entropy": 1.8363399386405945, |
| "epoch": 0.9961455071067212, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.320637527827076e-09, |
| "loss": 1.855323028564453, |
| "mean_token_accuracy": 0.5942590415477753, |
| "num_tokens": 56911627.0, |
| "step": 8270 |
| }, |
| { |
| "entropy": 1.8521572947502136, |
| "epoch": 0.9973500361358709, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.098837630820638e-09, |
| "loss": 1.8712289810180665, |
| "mean_token_accuracy": 0.5923120260238648, |
| "num_tokens": 56982089.0, |
| "step": 8280 |
| }, |
| { |
| "entropy": 1.9446120381355285, |
| "epoch": 0.9985545651650205, |
| "grad_norm": 3.078125, |
| "learning_rate": 6.705203183243747e-10, |
| "loss": 1.9402387619018555, |
| "mean_token_accuracy": 0.5822612524032593, |
| "num_tokens": 57049724.0, |
| "step": 8290 |
| }, |
| { |
| "entropy": 1.8550827741622924, |
| "epoch": 0.9997590941941701, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.570825819476653e-11, |
| "loss": 1.8673707962036132, |
| "mean_token_accuracy": 0.5967185080051423, |
| "num_tokens": 57120947.0, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.9997590941941701, |
| "eval_entropy": 2.3444266319274902, |
| "eval_loss": 2.6046648025512695, |
| "eval_mean_token_accuracy": 0.4665456563234329, |
| "eval_num_tokens": 57120947.0, |
| "eval_runtime": 0.4096, |
| "eval_samples_per_second": 39.064, |
| "eval_steps_per_second": 4.883, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 2.3444266319274902, |
| "eval_loss": 2.6046648025512695, |
| "eval_mean_token_accuracy": 0.4665456563234329, |
| "eval_num_tokens": 57135597.0, |
| "eval_runtime": 0.4353, |
| "eval_samples_per_second": 36.755, |
| "eval_steps_per_second": 4.594, |
| "step": 8302 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 8302, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.583470790653158e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|