| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.0, |
| "eval_steps": 500, |
| "global_step": 10026, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 3.0024948120117188, |
| "epoch": 0.005984440454817474, |
| "grad_norm": 7.78125, |
| "learning_rate": 5.994614003590664e-05, |
| "loss": 1.692, |
| "mean_token_accuracy": 0.7139140546321869, |
| "num_tokens": 10416.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.6948025226593018, |
| "epoch": 0.011968880909634948, |
| "grad_norm": 9.0, |
| "learning_rate": 5.9886295631358466e-05, |
| "loss": 0.4662, |
| "mean_token_accuracy": 0.8840459525585175, |
| "num_tokens": 20947.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.547980785369873, |
| "epoch": 0.017953321364452424, |
| "grad_norm": 8.1875, |
| "learning_rate": 5.98264512268103e-05, |
| "loss": 0.287, |
| "mean_token_accuracy": 0.9282953619956971, |
| "num_tokens": 31370.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 2.285873460769653, |
| "epoch": 0.023937761819269897, |
| "grad_norm": 4.53125, |
| "learning_rate": 5.976660682226212e-05, |
| "loss": 0.1783, |
| "mean_token_accuracy": 0.9513040065765381, |
| "num_tokens": 41996.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 2.4315436601638796, |
| "epoch": 0.029922202274087373, |
| "grad_norm": 6.375, |
| "learning_rate": 5.9706762417713944e-05, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9531135976314544, |
| "num_tokens": 52464.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.605304408073425, |
| "epoch": 0.03590664272890485, |
| "grad_norm": 4.75, |
| "learning_rate": 5.9646918013165774e-05, |
| "loss": 0.1501, |
| "mean_token_accuracy": 0.9575372993946075, |
| "num_tokens": 63125.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 2.5587785959243776, |
| "epoch": 0.041891083183722325, |
| "grad_norm": 5.75, |
| "learning_rate": 5.95870736086176e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9575134456157685, |
| "num_tokens": 73701.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 2.4741239070892336, |
| "epoch": 0.047875523638539794, |
| "grad_norm": 4.1875, |
| "learning_rate": 5.9527229204069415e-05, |
| "loss": 0.1891, |
| "mean_token_accuracy": 0.9568629384040832, |
| "num_tokens": 84353.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 2.935908842086792, |
| "epoch": 0.05385996409335727, |
| "grad_norm": 4.0, |
| "learning_rate": 5.9467384799521245e-05, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9589068710803985, |
| "num_tokens": 94954.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 2.6647072076797484, |
| "epoch": 0.059844404548174746, |
| "grad_norm": 8.6875, |
| "learning_rate": 5.940754039497307e-05, |
| "loss": 0.1459, |
| "mean_token_accuracy": 0.962292617559433, |
| "num_tokens": 105675.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.3634035110473635, |
| "epoch": 0.06582884500299221, |
| "grad_norm": 3.65625, |
| "learning_rate": 5.93476959904249e-05, |
| "loss": 0.1093, |
| "mean_token_accuracy": 0.9729574143886566, |
| "num_tokens": 116315.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 2.365880823135376, |
| "epoch": 0.0718132854578097, |
| "grad_norm": 3.109375, |
| "learning_rate": 5.928785158587672e-05, |
| "loss": 0.0802, |
| "mean_token_accuracy": 0.9765478909015656, |
| "num_tokens": 126940.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 2.392621302604675, |
| "epoch": 0.07779772591262717, |
| "grad_norm": 4.0625, |
| "learning_rate": 5.9228007181328546e-05, |
| "loss": 0.0809, |
| "mean_token_accuracy": 0.9781265318393707, |
| "num_tokens": 137513.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 2.3370444774627686, |
| "epoch": 0.08378216636744465, |
| "grad_norm": 2.78125, |
| "learning_rate": 5.916816277678038e-05, |
| "loss": 0.1093, |
| "mean_token_accuracy": 0.9741817772388458, |
| "num_tokens": 148032.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 2.2481669664382933, |
| "epoch": 0.08976660682226212, |
| "grad_norm": 5.71875, |
| "learning_rate": 5.91083183722322e-05, |
| "loss": 0.0974, |
| "mean_token_accuracy": 0.9788482308387756, |
| "num_tokens": 158653.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.246716928482056, |
| "epoch": 0.09575104727707959, |
| "grad_norm": 3.015625, |
| "learning_rate": 5.904847396768402e-05, |
| "loss": 0.1056, |
| "mean_token_accuracy": 0.9712280333042145, |
| "num_tokens": 169103.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 2.401676082611084, |
| "epoch": 0.10173548773189707, |
| "grad_norm": 3.8125, |
| "learning_rate": 5.898862956313585e-05, |
| "loss": 0.088, |
| "mean_token_accuracy": 0.978470116853714, |
| "num_tokens": 179647.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 2.4804759502410887, |
| "epoch": 0.10771992818671454, |
| "grad_norm": 6.9375, |
| "learning_rate": 5.892878515858767e-05, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9786226749420166, |
| "num_tokens": 190046.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 2.4066271781921387, |
| "epoch": 0.11370436864153202, |
| "grad_norm": 3.71875, |
| "learning_rate": 5.88689407540395e-05, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9766816020011901, |
| "num_tokens": 200450.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 2.479763078689575, |
| "epoch": 0.11968880909634949, |
| "grad_norm": 5.65625, |
| "learning_rate": 5.8809096349491325e-05, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.9800179958343506, |
| "num_tokens": 210921.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 2.5948761463165284, |
| "epoch": 0.12567324955116696, |
| "grad_norm": 2.53125, |
| "learning_rate": 5.874925194494315e-05, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.9757415950298309, |
| "num_tokens": 221246.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 2.6720625877380373, |
| "epoch": 0.13165769000598443, |
| "grad_norm": 3.265625, |
| "learning_rate": 5.868940754039498e-05, |
| "loss": 0.0904, |
| "mean_token_accuracy": 0.9789108991622925, |
| "num_tokens": 231857.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 2.39876389503479, |
| "epoch": 0.13764213046080193, |
| "grad_norm": 4.28125, |
| "learning_rate": 5.86295631358468e-05, |
| "loss": 0.0519, |
| "mean_token_accuracy": 0.9840085744857788, |
| "num_tokens": 242592.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 2.083107316493988, |
| "epoch": 0.1436265709156194, |
| "grad_norm": 0.7734375, |
| "learning_rate": 5.856971873129862e-05, |
| "loss": 0.0667, |
| "mean_token_accuracy": 0.9842009723186493, |
| "num_tokens": 253100.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 2.1161860466003417, |
| "epoch": 0.14961101137043686, |
| "grad_norm": 4.34375, |
| "learning_rate": 5.850987432675045e-05, |
| "loss": 0.0655, |
| "mean_token_accuracy": 0.9847101211547852, |
| "num_tokens": 263645.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 2.261358118057251, |
| "epoch": 0.15559545182525433, |
| "grad_norm": 1.5703125, |
| "learning_rate": 5.845002992220227e-05, |
| "loss": 0.0641, |
| "mean_token_accuracy": 0.9861262500286102, |
| "num_tokens": 274021.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 2.2319637894630433, |
| "epoch": 0.1615798922800718, |
| "grad_norm": 4.15625, |
| "learning_rate": 5.83901855176541e-05, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9827041685581207, |
| "num_tokens": 284428.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 2.225303065776825, |
| "epoch": 0.1675643327348893, |
| "grad_norm": 3.546875, |
| "learning_rate": 5.833034111310593e-05, |
| "loss": 0.0712, |
| "mean_token_accuracy": 0.9861607074737548, |
| "num_tokens": 295043.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 2.5183464765548704, |
| "epoch": 0.17354877318970677, |
| "grad_norm": 5.59375, |
| "learning_rate": 5.827049670855775e-05, |
| "loss": 0.0594, |
| "mean_token_accuracy": 0.9836657404899597, |
| "num_tokens": 305618.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 2.3645459413528442, |
| "epoch": 0.17953321364452424, |
| "grad_norm": 2.5, |
| "learning_rate": 5.821065230400958e-05, |
| "loss": 0.0537, |
| "mean_token_accuracy": 0.9861385583877563, |
| "num_tokens": 316134.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.3220778465270997, |
| "epoch": 0.1855176540993417, |
| "grad_norm": 0.70703125, |
| "learning_rate": 5.8150807899461405e-05, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9872022569179535, |
| "num_tokens": 326594.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 2.41537606716156, |
| "epoch": 0.19150209455415917, |
| "grad_norm": 2.84375, |
| "learning_rate": 5.809096349491323e-05, |
| "loss": 0.0741, |
| "mean_token_accuracy": 0.9834691464900971, |
| "num_tokens": 337278.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 2.437514638900757, |
| "epoch": 0.19748653500897667, |
| "grad_norm": 6.1875, |
| "learning_rate": 5.803111909036505e-05, |
| "loss": 0.077, |
| "mean_token_accuracy": 0.9830995500087738, |
| "num_tokens": 347783.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 2.54942626953125, |
| "epoch": 0.20347097546379414, |
| "grad_norm": 3.015625, |
| "learning_rate": 5.7971274685816876e-05, |
| "loss": 0.0908, |
| "mean_token_accuracy": 0.9789189040660858, |
| "num_tokens": 358140.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 2.6029396533966063, |
| "epoch": 0.2094554159186116, |
| "grad_norm": 6.4375, |
| "learning_rate": 5.79114302812687e-05, |
| "loss": 0.0813, |
| "mean_token_accuracy": 0.9807312726974488, |
| "num_tokens": 368649.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.7818240880966187, |
| "epoch": 0.21543985637342908, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.785158587672053e-05, |
| "loss": 0.0516, |
| "mean_token_accuracy": 0.9896690011024475, |
| "num_tokens": 379399.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 2.6641830682754515, |
| "epoch": 0.22142429682824655, |
| "grad_norm": 5.5625, |
| "learning_rate": 5.779174147217235e-05, |
| "loss": 0.0422, |
| "mean_token_accuracy": 0.988187575340271, |
| "num_tokens": 389856.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 2.4715123176574707, |
| "epoch": 0.22740873728306404, |
| "grad_norm": 2.8125, |
| "learning_rate": 5.773189706762418e-05, |
| "loss": 0.0468, |
| "mean_token_accuracy": 0.9897852420806885, |
| "num_tokens": 400419.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 2.1948570370674134, |
| "epoch": 0.2333931777378815, |
| "grad_norm": 2.71875, |
| "learning_rate": 5.767205266307601e-05, |
| "loss": 0.0577, |
| "mean_token_accuracy": 0.9862517893314362, |
| "num_tokens": 410980.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 2.1625555634498594, |
| "epoch": 0.23937761819269898, |
| "grad_norm": 3.390625, |
| "learning_rate": 5.761220825852783e-05, |
| "loss": 0.0446, |
| "mean_token_accuracy": 0.9873133063316345, |
| "num_tokens": 421285.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.277107524871826, |
| "epoch": 0.24536205864751645, |
| "grad_norm": 2.734375, |
| "learning_rate": 5.7552363853979654e-05, |
| "loss": 0.0663, |
| "mean_token_accuracy": 0.984232223033905, |
| "num_tokens": 431743.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 2.079650855064392, |
| "epoch": 0.2513464991023339, |
| "grad_norm": 2.609375, |
| "learning_rate": 5.749251944943148e-05, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.9889630198478698, |
| "num_tokens": 442208.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 2.0632903933525086, |
| "epoch": 0.2573309395571514, |
| "grad_norm": 3.28125, |
| "learning_rate": 5.74326750448833e-05, |
| "loss": 0.0658, |
| "mean_token_accuracy": 0.9887919366359711, |
| "num_tokens": 452907.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 2.0579532027244567, |
| "epoch": 0.26331538001196886, |
| "grad_norm": 5.3125, |
| "learning_rate": 5.737283064033513e-05, |
| "loss": 0.0586, |
| "mean_token_accuracy": 0.9878389358520507, |
| "num_tokens": 463376.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 2.272001266479492, |
| "epoch": 0.26929982046678635, |
| "grad_norm": 3.671875, |
| "learning_rate": 5.7312986235786956e-05, |
| "loss": 0.0555, |
| "mean_token_accuracy": 0.9843586683273315, |
| "num_tokens": 474050.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.440229368209839, |
| "epoch": 0.27528426092160385, |
| "grad_norm": 0.765625, |
| "learning_rate": 5.725314183123878e-05, |
| "loss": 0.0539, |
| "mean_token_accuracy": 0.9886721253395081, |
| "num_tokens": 484638.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 2.22398726940155, |
| "epoch": 0.2812687013764213, |
| "grad_norm": 2.375, |
| "learning_rate": 5.719329742669061e-05, |
| "loss": 0.038, |
| "mean_token_accuracy": 0.9924259662628174, |
| "num_tokens": 495422.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 2.2964001417160036, |
| "epoch": 0.2872531418312388, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.713345302214243e-05, |
| "loss": 0.0403, |
| "mean_token_accuracy": 0.9900494039058685, |
| "num_tokens": 506057.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 2.329486632347107, |
| "epoch": 0.29323758228605623, |
| "grad_norm": 4.4375, |
| "learning_rate": 5.707360861759426e-05, |
| "loss": 0.0429, |
| "mean_token_accuracy": 0.989715838432312, |
| "num_tokens": 516463.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 2.3366709470748903, |
| "epoch": 0.2992220227408737, |
| "grad_norm": 1.0, |
| "learning_rate": 5.701376421304608e-05, |
| "loss": 0.0645, |
| "mean_token_accuracy": 0.9869139313697814, |
| "num_tokens": 526981.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.4269517421722413, |
| "epoch": 0.3052064631956912, |
| "grad_norm": 2.4375, |
| "learning_rate": 5.6953919808497904e-05, |
| "loss": 0.0475, |
| "mean_token_accuracy": 0.9899543404579163, |
| "num_tokens": 537626.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 2.1964712858200075, |
| "epoch": 0.31119090365050867, |
| "grad_norm": 1.984375, |
| "learning_rate": 5.6894075403949734e-05, |
| "loss": 0.0672, |
| "mean_token_accuracy": 0.9869764924049378, |
| "num_tokens": 548046.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 2.1039986968040467, |
| "epoch": 0.31717534410532616, |
| "grad_norm": 1.4921875, |
| "learning_rate": 5.683423099940156e-05, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9941431105136871, |
| "num_tokens": 558631.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 2.3732316970825194, |
| "epoch": 0.3231597845601436, |
| "grad_norm": 0.796875, |
| "learning_rate": 5.677438659485338e-05, |
| "loss": 0.0666, |
| "mean_token_accuracy": 0.9879998922348022, |
| "num_tokens": 569247.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 2.28140549659729, |
| "epoch": 0.3291442250149611, |
| "grad_norm": 2.3125, |
| "learning_rate": 5.671454219030521e-05, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9887649655342102, |
| "num_tokens": 579833.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.4391223430633544, |
| "epoch": 0.3351286654697786, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.6654697785757035e-05, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9919759035110474, |
| "num_tokens": 590311.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 2.4771656036376952, |
| "epoch": 0.34111310592459604, |
| "grad_norm": 2.5625, |
| "learning_rate": 5.659485338120885e-05, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9914680182933807, |
| "num_tokens": 600739.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 2.2438557863235475, |
| "epoch": 0.34709754637941354, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.653500897666068e-05, |
| "loss": 0.0469, |
| "mean_token_accuracy": 0.9890537202358246, |
| "num_tokens": 611176.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 2.1311206340789797, |
| "epoch": 0.353081986834231, |
| "grad_norm": 3.25, |
| "learning_rate": 5.6475164572112506e-05, |
| "loss": 0.0274, |
| "mean_token_accuracy": 0.9945613145828247, |
| "num_tokens": 621635.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 2.1030924916267395, |
| "epoch": 0.3590664272890485, |
| "grad_norm": 0.76171875, |
| "learning_rate": 5.641532016756434e-05, |
| "loss": 0.0312, |
| "mean_token_accuracy": 0.9910819947719574, |
| "num_tokens": 632248.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.344179320335388, |
| "epoch": 0.36505086774386597, |
| "grad_norm": 0.984375, |
| "learning_rate": 5.635547576301616e-05, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.9889743030071259, |
| "num_tokens": 642682.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 2.39926335811615, |
| "epoch": 0.3710353081986834, |
| "grad_norm": 1.9765625, |
| "learning_rate": 5.6295631358467984e-05, |
| "loss": 0.0532, |
| "mean_token_accuracy": 0.9905676782131195, |
| "num_tokens": 653445.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 2.1937442779541017, |
| "epoch": 0.3770197486535009, |
| "grad_norm": 2.203125, |
| "learning_rate": 5.6235786953919814e-05, |
| "loss": 0.0378, |
| "mean_token_accuracy": 0.991233092546463, |
| "num_tokens": 663807.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 2.0357667565345765, |
| "epoch": 0.38300418910831835, |
| "grad_norm": 2.78125, |
| "learning_rate": 5.617594254937164e-05, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9888907968997955, |
| "num_tokens": 674458.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.9218869924545288, |
| "epoch": 0.38898862956313585, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.6116098144823455e-05, |
| "loss": 0.024, |
| "mean_token_accuracy": 0.9945397853851319, |
| "num_tokens": 684951.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.7186935544013977, |
| "epoch": 0.39497307001795334, |
| "grad_norm": 2.484375, |
| "learning_rate": 5.6056253740275285e-05, |
| "loss": 0.0378, |
| "mean_token_accuracy": 0.9907946050167084, |
| "num_tokens": 695456.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.7377236485481262, |
| "epoch": 0.4009575104727708, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.599640933572711e-05, |
| "loss": 0.0589, |
| "mean_token_accuracy": 0.988366037607193, |
| "num_tokens": 706028.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 2.0943747401237487, |
| "epoch": 0.4069419509275883, |
| "grad_norm": 1.5859375, |
| "learning_rate": 5.593656493117893e-05, |
| "loss": 0.0281, |
| "mean_token_accuracy": 0.9936488032341003, |
| "num_tokens": 716721.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 2.1617534160614014, |
| "epoch": 0.4129263913824057, |
| "grad_norm": 1.0234375, |
| "learning_rate": 5.587672052663076e-05, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9885023236274719, |
| "num_tokens": 727257.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 2.031095576286316, |
| "epoch": 0.4189108318372232, |
| "grad_norm": 2.34375, |
| "learning_rate": 5.5816876122082586e-05, |
| "loss": 0.0341, |
| "mean_token_accuracy": 0.991800045967102, |
| "num_tokens": 737999.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.7404110312461853, |
| "epoch": 0.4248952722920407, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.5757031717534417e-05, |
| "loss": 0.0375, |
| "mean_token_accuracy": 0.9914985001087189, |
| "num_tokens": 748758.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.7197586655616761, |
| "epoch": 0.43087971274685816, |
| "grad_norm": 2.078125, |
| "learning_rate": 5.569718731298624e-05, |
| "loss": 0.0367, |
| "mean_token_accuracy": 0.9925475895404816, |
| "num_tokens": 759197.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.8254467248916626, |
| "epoch": 0.43686415320167565, |
| "grad_norm": 0.251953125, |
| "learning_rate": 5.563734290843806e-05, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9942022025585174, |
| "num_tokens": 769493.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 2.1383556604385374, |
| "epoch": 0.4428485936564931, |
| "grad_norm": 0.33984375, |
| "learning_rate": 5.557749850388989e-05, |
| "loss": 0.0438, |
| "mean_token_accuracy": 0.9910104990005493, |
| "num_tokens": 779975.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 2.26552300453186, |
| "epoch": 0.4488330341113106, |
| "grad_norm": 0.9296875, |
| "learning_rate": 5.551765409934171e-05, |
| "loss": 0.0396, |
| "mean_token_accuracy": 0.992530471086502, |
| "num_tokens": 790633.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.0240984559059143, |
| "epoch": 0.4548174745661281, |
| "grad_norm": 2.328125, |
| "learning_rate": 5.5457809694793535e-05, |
| "loss": 0.0387, |
| "mean_token_accuracy": 0.9894236505031586, |
| "num_tokens": 801174.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.966257667541504, |
| "epoch": 0.46080191502094553, |
| "grad_norm": 5.375, |
| "learning_rate": 5.5397965290245365e-05, |
| "loss": 0.0468, |
| "mean_token_accuracy": 0.9893896758556366, |
| "num_tokens": 811800.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 2.0768365025520326, |
| "epoch": 0.466786355475763, |
| "grad_norm": 5.8125, |
| "learning_rate": 5.533812088569719e-05, |
| "loss": 0.0328, |
| "mean_token_accuracy": 0.9934167742729187, |
| "num_tokens": 822464.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.9794872641563415, |
| "epoch": 0.47277079593058047, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.527827648114902e-05, |
| "loss": 0.0313, |
| "mean_token_accuracy": 0.9931986689567566, |
| "num_tokens": 832938.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.9216319561004638, |
| "epoch": 0.47875523638539796, |
| "grad_norm": 5.28125, |
| "learning_rate": 5.521843207660084e-05, |
| "loss": 0.053, |
| "mean_token_accuracy": 0.9898009598255157, |
| "num_tokens": 843616.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.9540860176086425, |
| "epoch": 0.48473967684021546, |
| "grad_norm": 0.76953125, |
| "learning_rate": 5.5158587672052666e-05, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9942503988742828, |
| "num_tokens": 854328.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 2.186979651451111, |
| "epoch": 0.4907241172950329, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 5.509874326750449e-05, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9971015155315399, |
| "num_tokens": 864919.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 2.058599293231964, |
| "epoch": 0.4967085577498504, |
| "grad_norm": 2.984375, |
| "learning_rate": 5.503889886295631e-05, |
| "loss": 0.013, |
| "mean_token_accuracy": 0.9951699852943421, |
| "num_tokens": 875488.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.8453468203544616, |
| "epoch": 0.5026929982046678, |
| "grad_norm": 2.21875, |
| "learning_rate": 5.497905445840814e-05, |
| "loss": 0.0298, |
| "mean_token_accuracy": 0.9947227597236633, |
| "num_tokens": 886025.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.9660922765731812, |
| "epoch": 0.5086774386594853, |
| "grad_norm": 0.294921875, |
| "learning_rate": 5.491921005385997e-05, |
| "loss": 0.0315, |
| "mean_token_accuracy": 0.9947958946228027, |
| "num_tokens": 896575.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.0921285152435303, |
| "epoch": 0.5146618791143028, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.485936564931179e-05, |
| "loss": 0.0192, |
| "mean_token_accuracy": 0.9942731440067292, |
| "num_tokens": 907186.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 2.0400147795677186, |
| "epoch": 0.5206463195691203, |
| "grad_norm": 1.7578125, |
| "learning_rate": 5.4799521244763614e-05, |
| "loss": 0.0356, |
| "mean_token_accuracy": 0.993338668346405, |
| "num_tokens": 917742.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 2.140678858757019, |
| "epoch": 0.5266307600239377, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.4739676840215445e-05, |
| "loss": 0.0227, |
| "mean_token_accuracy": 0.9957817852497101, |
| "num_tokens": 928312.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 2.284876358509064, |
| "epoch": 0.5326152004787552, |
| "grad_norm": 2.71875, |
| "learning_rate": 5.467983243566727e-05, |
| "loss": 0.0576, |
| "mean_token_accuracy": 0.9898476004600525, |
| "num_tokens": 939023.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 2.2926900386810303, |
| "epoch": 0.5385996409335727, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.461998803111909e-05, |
| "loss": 0.0218, |
| "mean_token_accuracy": 0.9931512713432312, |
| "num_tokens": 949275.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.954952347278595, |
| "epoch": 0.5445840813883902, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.4560143626570916e-05, |
| "loss": 0.0236, |
| "mean_token_accuracy": 0.993185955286026, |
| "num_tokens": 959821.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.9035270810127258, |
| "epoch": 0.5505685218432077, |
| "grad_norm": 2.140625, |
| "learning_rate": 5.450029922202274e-05, |
| "loss": 0.0413, |
| "mean_token_accuracy": 0.9900493204593659, |
| "num_tokens": 970302.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 2.1859104156494142, |
| "epoch": 0.5565529622980251, |
| "grad_norm": 1.78125, |
| "learning_rate": 5.444045481747457e-05, |
| "loss": 0.0351, |
| "mean_token_accuracy": 0.9932608604431152, |
| "num_tokens": 980883.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 2.0663935422897337, |
| "epoch": 0.5625374027528426, |
| "grad_norm": 2.34375, |
| "learning_rate": 5.438061041292639e-05, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9912481963634491, |
| "num_tokens": 991526.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.8086812853813172, |
| "epoch": 0.5685218432076601, |
| "grad_norm": 0.4453125, |
| "learning_rate": 5.432076600837822e-05, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.9955720365047455, |
| "num_tokens": 1002052.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.7378576636314391, |
| "epoch": 0.5745062836624776, |
| "grad_norm": 18.125, |
| "learning_rate": 5.426092160383005e-05, |
| "loss": 0.0207, |
| "mean_token_accuracy": 0.9957412719726563, |
| "num_tokens": 1012679.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.6939527988433838, |
| "epoch": 0.5804907241172951, |
| "grad_norm": 1.546875, |
| "learning_rate": 5.420107719928187e-05, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9954462170600891, |
| "num_tokens": 1023349.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.985690712928772, |
| "epoch": 0.5864751645721125, |
| "grad_norm": 0.357421875, |
| "learning_rate": 5.414123279473369e-05, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.995449674129486, |
| "num_tokens": 1034039.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 2.013271224498749, |
| "epoch": 0.59245960502693, |
| "grad_norm": 2.03125, |
| "learning_rate": 5.408138839018552e-05, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.992346465587616, |
| "num_tokens": 1044713.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 2.0139170050621034, |
| "epoch": 0.5984440454817475, |
| "grad_norm": 2.203125, |
| "learning_rate": 5.402154398563734e-05, |
| "loss": 0.0423, |
| "mean_token_accuracy": 0.9917679131031036, |
| "num_tokens": 1055311.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.1181130170822144, |
| "epoch": 0.604428485936565, |
| "grad_norm": 0.42578125, |
| "learning_rate": 5.396169958108917e-05, |
| "loss": 0.0331, |
| "mean_token_accuracy": 0.9931470155715942, |
| "num_tokens": 1065957.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.8641826033592224, |
| "epoch": 0.6104129263913824, |
| "grad_norm": 2.421875, |
| "learning_rate": 5.3901855176540995e-05, |
| "loss": 0.0322, |
| "mean_token_accuracy": 0.9937169075012207, |
| "num_tokens": 1076428.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 1.8986489415168761, |
| "epoch": 0.6163973668461998, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.384201077199282e-05, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9938018679618835, |
| "num_tokens": 1086893.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 2.18128616809845, |
| "epoch": 0.6223818073010173, |
| "grad_norm": 1.4921875, |
| "learning_rate": 5.378216636744465e-05, |
| "loss": 0.0384, |
| "mean_token_accuracy": 0.9915229856967926, |
| "num_tokens": 1097584.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 2.236606168746948, |
| "epoch": 0.6283662477558348, |
| "grad_norm": 0.474609375, |
| "learning_rate": 5.372232196289647e-05, |
| "loss": 0.0309, |
| "mean_token_accuracy": 0.994802838563919, |
| "num_tokens": 1108360.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.085828936100006, |
| "epoch": 0.6343506882106523, |
| "grad_norm": 1.640625, |
| "learning_rate": 5.366247755834829e-05, |
| "loss": 0.0192, |
| "mean_token_accuracy": 0.9961528956890107, |
| "num_tokens": 1118906.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 1.8950690627098083, |
| "epoch": 0.6403351286654698, |
| "grad_norm": 0.546875, |
| "learning_rate": 5.360263315380012e-05, |
| "loss": 0.0111, |
| "mean_token_accuracy": 0.9958807468414307, |
| "num_tokens": 1129629.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.7149085760116578, |
| "epoch": 0.6463195691202872, |
| "grad_norm": 0.76171875, |
| "learning_rate": 5.3542788749251944e-05, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.9933666825294495, |
| "num_tokens": 1140311.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.746055507659912, |
| "epoch": 0.6523040095751047, |
| "grad_norm": 2.03125, |
| "learning_rate": 5.3482944344703774e-05, |
| "loss": 0.0142, |
| "mean_token_accuracy": 0.9978117167949676, |
| "num_tokens": 1150847.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.9340453743934631, |
| "epoch": 0.6582884500299222, |
| "grad_norm": 3.8125, |
| "learning_rate": 5.34230999401556e-05, |
| "loss": 0.0457, |
| "mean_token_accuracy": 0.9922799825668335, |
| "num_tokens": 1161560.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 2.1036330103874206, |
| "epoch": 0.6642728904847397, |
| "grad_norm": 3.375, |
| "learning_rate": 5.336325553560742e-05, |
| "loss": 0.0294, |
| "mean_token_accuracy": 0.9932465612888336, |
| "num_tokens": 1172079.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 2.233074736595154, |
| "epoch": 0.6702573309395572, |
| "grad_norm": 0.6875, |
| "learning_rate": 5.330341113105925e-05, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9958370923995972, |
| "num_tokens": 1182693.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 2.2601813077926636, |
| "epoch": 0.6762417713943746, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 5.3243566726511075e-05, |
| "loss": 0.024, |
| "mean_token_accuracy": 0.9958759665489196, |
| "num_tokens": 1193488.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 2.336117482185364, |
| "epoch": 0.6822262118491921, |
| "grad_norm": 2.140625, |
| "learning_rate": 5.318372232196289e-05, |
| "loss": 0.025, |
| "mean_token_accuracy": 0.9926075279712677, |
| "num_tokens": 1204193.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 2.229608154296875, |
| "epoch": 0.6882106523040096, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.312387791741472e-05, |
| "loss": 0.026, |
| "mean_token_accuracy": 0.9936797678470611, |
| "num_tokens": 1214759.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.1003764390945436, |
| "epoch": 0.6941950927588271, |
| "grad_norm": 0.61328125, |
| "learning_rate": 5.3064033512866546e-05, |
| "loss": 0.0224, |
| "mean_token_accuracy": 0.9928372919559478, |
| "num_tokens": 1225411.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 1.9683186292648316, |
| "epoch": 0.7001795332136446, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5.300418910831837e-05, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9944151937961578, |
| "num_tokens": 1236020.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 1.9860737323760986, |
| "epoch": 0.706163973668462, |
| "grad_norm": 2.125, |
| "learning_rate": 5.29443447037702e-05, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9947565972805024, |
| "num_tokens": 1246629.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 1.9602022528648377, |
| "epoch": 0.7121484141232794, |
| "grad_norm": 1.0234375, |
| "learning_rate": 5.2884500299222024e-05, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9972622811794281, |
| "num_tokens": 1257242.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 1.962342917919159, |
| "epoch": 0.718132854578097, |
| "grad_norm": 3.296875, |
| "learning_rate": 5.2824655894673854e-05, |
| "loss": 0.0278, |
| "mean_token_accuracy": 0.9961660385131836, |
| "num_tokens": 1267654.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.012917125225067, |
| "epoch": 0.7241172950329144, |
| "grad_norm": 0.54296875, |
| "learning_rate": 5.276481149012568e-05, |
| "loss": 0.0294, |
| "mean_token_accuracy": 0.9938123047351837, |
| "num_tokens": 1278416.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 2.1606369018554688, |
| "epoch": 0.7301017354877319, |
| "grad_norm": 0.98046875, |
| "learning_rate": 5.2704967085577494e-05, |
| "loss": 0.0096, |
| "mean_token_accuracy": 0.9982011258602143, |
| "num_tokens": 1288836.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 2.1367745637893676, |
| "epoch": 0.7360861759425493, |
| "grad_norm": 0.58984375, |
| "learning_rate": 5.2645122681029325e-05, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9962705135345459, |
| "num_tokens": 1299376.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 2.040121853351593, |
| "epoch": 0.7420706163973668, |
| "grad_norm": 0.5546875, |
| "learning_rate": 5.258527827648115e-05, |
| "loss": 0.0071, |
| "mean_token_accuracy": 0.9982105016708374, |
| "num_tokens": 1310023.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 1.9288089156150818, |
| "epoch": 0.7480550568521843, |
| "grad_norm": 1.4609375, |
| "learning_rate": 5.252543387193297e-05, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9950582385063171, |
| "num_tokens": 1320431.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 1.9065364241600036, |
| "epoch": 0.7540394973070018, |
| "grad_norm": 1.546875, |
| "learning_rate": 5.24655894673848e-05, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9972732722759247, |
| "num_tokens": 1331070.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 1.9331876635551453, |
| "epoch": 0.7600239377618193, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.2405745062836626e-05, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9962199032306671, |
| "num_tokens": 1341780.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 1.9782670497894288, |
| "epoch": 0.7660083782166367, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.234590065828845e-05, |
| "loss": 0.015, |
| "mean_token_accuracy": 0.9963721334934235, |
| "num_tokens": 1352078.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 1.9941368341445922, |
| "epoch": 0.7719928186714542, |
| "grad_norm": 1.4140625, |
| "learning_rate": 5.228605625374028e-05, |
| "loss": 0.0171, |
| "mean_token_accuracy": 0.995795214176178, |
| "num_tokens": 1362574.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 2.0281198143959047, |
| "epoch": 0.7779772591262717, |
| "grad_norm": 1.984375, |
| "learning_rate": 5.2226211849192104e-05, |
| "loss": 0.0248, |
| "mean_token_accuracy": 0.9931159555912018, |
| "num_tokens": 1373157.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.243196725845337, |
| "epoch": 0.7839616995810892, |
| "grad_norm": 2.078125, |
| "learning_rate": 5.216636744464393e-05, |
| "loss": 0.0182, |
| "mean_token_accuracy": 0.9942145645618439, |
| "num_tokens": 1383719.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 2.1937716722488405, |
| "epoch": 0.7899461400359067, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.210652304009575e-05, |
| "loss": 0.0237, |
| "mean_token_accuracy": 0.9958289206027985, |
| "num_tokens": 1394282.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 2.4126478910446165, |
| "epoch": 0.7959305804907241, |
| "grad_norm": 0.2578125, |
| "learning_rate": 5.2046678635547574e-05, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.9958206951618195, |
| "num_tokens": 1404993.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 2.3782467603683473, |
| "epoch": 0.8019150209455416, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.1986834230999405e-05, |
| "loss": 0.0205, |
| "mean_token_accuracy": 0.995442909002304, |
| "num_tokens": 1415486.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 2.243998336791992, |
| "epoch": 0.8078994614003591, |
| "grad_norm": 5.125, |
| "learning_rate": 5.192698982645123e-05, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9941241443157196, |
| "num_tokens": 1426258.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.1216912388801576, |
| "epoch": 0.8138839018551766, |
| "grad_norm": 0.796875, |
| "learning_rate": 5.186714542190305e-05, |
| "loss": 0.0227, |
| "mean_token_accuracy": 0.9941264271736145, |
| "num_tokens": 1436794.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 2.2492347717285157, |
| "epoch": 0.8198683423099941, |
| "grad_norm": 0.26953125, |
| "learning_rate": 5.180730101735488e-05, |
| "loss": 0.0225, |
| "mean_token_accuracy": 0.995734578371048, |
| "num_tokens": 1447419.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 2.2173564434051514, |
| "epoch": 0.8258527827648114, |
| "grad_norm": 2.96875, |
| "learning_rate": 5.1747456612806706e-05, |
| "loss": 0.021, |
| "mean_token_accuracy": 0.9952861666679382, |
| "num_tokens": 1457898.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 2.240031886100769, |
| "epoch": 0.8318372232196289, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.168761220825853e-05, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.9952712118625641, |
| "num_tokens": 1468317.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 2.1114137172698975, |
| "epoch": 0.8378216636744464, |
| "grad_norm": 2.296875, |
| "learning_rate": 5.162776780371035e-05, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.9961936414241791, |
| "num_tokens": 1479024.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.1817475318908692, |
| "epoch": 0.8438061041292639, |
| "grad_norm": 0.38671875, |
| "learning_rate": 5.156792339916218e-05, |
| "loss": 0.0256, |
| "mean_token_accuracy": 0.9960037291049957, |
| "num_tokens": 1489537.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 2.067359519004822, |
| "epoch": 0.8497905445840814, |
| "grad_norm": 0.53515625, |
| "learning_rate": 5.150807899461401e-05, |
| "loss": 0.0092, |
| "mean_token_accuracy": 0.9971863865852356, |
| "num_tokens": 1500071.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 1.7851839542388916, |
| "epoch": 0.8557749850388988, |
| "grad_norm": 0.1533203125, |
| "learning_rate": 5.144823459006583e-05, |
| "loss": 0.0106, |
| "mean_token_accuracy": 0.9977741360664367, |
| "num_tokens": 1510569.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 1.6537530064582824, |
| "epoch": 0.8617594254937163, |
| "grad_norm": 0.7109375, |
| "learning_rate": 5.1388390185517654e-05, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.9965560495853424, |
| "num_tokens": 1521246.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 1.557056224346161, |
| "epoch": 0.8677438659485338, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.1328545780969485e-05, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9958473801612854, |
| "num_tokens": 1531684.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 1.686435067653656, |
| "epoch": 0.8737283064033513, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.126870137642131e-05, |
| "loss": 0.0323, |
| "mean_token_accuracy": 0.9941241443157196, |
| "num_tokens": 1542355.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 1.9768336772918702, |
| "epoch": 0.8797127468581688, |
| "grad_norm": 2.328125, |
| "learning_rate": 5.1208856971873125e-05, |
| "loss": 0.0321, |
| "mean_token_accuracy": 0.992108279466629, |
| "num_tokens": 1552858.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 2.234424018859863, |
| "epoch": 0.8856971873129862, |
| "grad_norm": 0.87890625, |
| "learning_rate": 5.1149012567324955e-05, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9972717106342316, |
| "num_tokens": 1563443.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 2.180929946899414, |
| "epoch": 0.8916816277678037, |
| "grad_norm": 1.5, |
| "learning_rate": 5.108916816277678e-05, |
| "loss": 0.0233, |
| "mean_token_accuracy": 0.9935569524765014, |
| "num_tokens": 1574132.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 2.016967070102692, |
| "epoch": 0.8976660682226212, |
| "grad_norm": 1.8515625, |
| "learning_rate": 5.102932375822861e-05, |
| "loss": 0.0151, |
| "mean_token_accuracy": 0.9970189332962036, |
| "num_tokens": 1584513.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.0316667675971987, |
| "epoch": 0.9036505086774387, |
| "grad_norm": 0.435546875, |
| "learning_rate": 5.096947935368043e-05, |
| "loss": 0.0195, |
| "mean_token_accuracy": 0.9942385494709015, |
| "num_tokens": 1594911.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 2.189347743988037, |
| "epoch": 0.9096349491322562, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.090963494913226e-05, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.995745187997818, |
| "num_tokens": 1605415.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 2.2969648122787474, |
| "epoch": 0.9156193895870736, |
| "grad_norm": 0.32421875, |
| "learning_rate": 5.084979054458409e-05, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9965317666530609, |
| "num_tokens": 1615920.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 2.4979332208633425, |
| "epoch": 0.9216038300418911, |
| "grad_norm": 0.2734375, |
| "learning_rate": 5.078994614003591e-05, |
| "loss": 0.0261, |
| "mean_token_accuracy": 0.9930246412754059, |
| "num_tokens": 1626519.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 2.381369423866272, |
| "epoch": 0.9275882704967086, |
| "grad_norm": 0.16796875, |
| "learning_rate": 5.073010173548773e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9979732036590576, |
| "num_tokens": 1637154.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.2379947185516356, |
| "epoch": 0.933572710951526, |
| "grad_norm": 0.248046875, |
| "learning_rate": 5.067025733093956e-05, |
| "loss": 0.0093, |
| "mean_token_accuracy": 0.9971390187740325, |
| "num_tokens": 1647633.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 2.245289707183838, |
| "epoch": 0.9395571514063435, |
| "grad_norm": 4.21875, |
| "learning_rate": 5.061041292639138e-05, |
| "loss": 0.0249, |
| "mean_token_accuracy": 0.9938650369644165, |
| "num_tokens": 1658248.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 2.252811074256897, |
| "epoch": 0.9455415918611609, |
| "grad_norm": 0.3203125, |
| "learning_rate": 5.0550568521843205e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.9979120135307312, |
| "num_tokens": 1668889.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 2.2055715322494507, |
| "epoch": 0.9515260323159784, |
| "grad_norm": 0.029052734375, |
| "learning_rate": 5.0490724117295035e-05, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.995630270242691, |
| "num_tokens": 1679264.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 2.1671520948410032, |
| "epoch": 0.9575104727707959, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 5.043087971274686e-05, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9956823647022247, |
| "num_tokens": 1689812.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.264160418510437, |
| "epoch": 0.9634949132256134, |
| "grad_norm": 1.390625, |
| "learning_rate": 5.037103530819869e-05, |
| "loss": 0.021, |
| "mean_token_accuracy": 0.9950319647789001, |
| "num_tokens": 1700414.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 2.3583990335464478, |
| "epoch": 0.9694793536804309, |
| "grad_norm": 0.322265625, |
| "learning_rate": 5.031119090365051e-05, |
| "loss": 0.0075, |
| "mean_token_accuracy": 0.9978622436523438, |
| "num_tokens": 1711140.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 2.2860071897506713, |
| "epoch": 0.9754637941352483, |
| "grad_norm": 0.04736328125, |
| "learning_rate": 5.025134649910233e-05, |
| "loss": 0.0071, |
| "mean_token_accuracy": 0.9978203475475311, |
| "num_tokens": 1721755.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 2.346220374107361, |
| "epoch": 0.9814482345900658, |
| "grad_norm": 0.69921875, |
| "learning_rate": 5.019150209455416e-05, |
| "loss": 0.0115, |
| "mean_token_accuracy": 0.9975032925605773, |
| "num_tokens": 1732426.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 2.3739873647689818, |
| "epoch": 0.9874326750448833, |
| "grad_norm": 1.4375, |
| "learning_rate": 5.0131657690005984e-05, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9963673233985901, |
| "num_tokens": 1742943.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.418501615524292, |
| "epoch": 0.9934171154997008, |
| "grad_norm": 2.625, |
| "learning_rate": 5.007181328545781e-05, |
| "loss": 0.0316, |
| "mean_token_accuracy": 0.9935234546661377, |
| "num_tokens": 1753459.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 2.5557403326034547, |
| "epoch": 0.9994015559545183, |
| "grad_norm": 4.90625, |
| "learning_rate": 5.001196888090964e-05, |
| "loss": 0.0184, |
| "mean_token_accuracy": 0.9944635927677155, |
| "num_tokens": 1764253.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 2.444777250289917, |
| "epoch": 1.0053859964093357, |
| "grad_norm": 0.78125, |
| "learning_rate": 4.995212447636146e-05, |
| "loss": 0.0058, |
| "mean_token_accuracy": 0.9985901474952698, |
| "num_tokens": 1774801.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 2.319710397720337, |
| "epoch": 1.0113704368641532, |
| "grad_norm": 0.94921875, |
| "learning_rate": 4.989228007181329e-05, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.9978196382522583, |
| "num_tokens": 1785099.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 2.311413550376892, |
| "epoch": 1.0173548773189707, |
| "grad_norm": 0.0703125, |
| "learning_rate": 4.9832435667265115e-05, |
| "loss": 0.0057, |
| "mean_token_accuracy": 0.9981013059616088, |
| "num_tokens": 1795558.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.040494406223297, |
| "epoch": 1.0233393177737882, |
| "grad_norm": 0.07958984375, |
| "learning_rate": 4.977259126271694e-05, |
| "loss": 0.0059, |
| "mean_token_accuracy": 0.9982348620891571, |
| "num_tokens": 1806044.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 1.9538646340370178, |
| "epoch": 1.0293237582286057, |
| "grad_norm": 0.0255126953125, |
| "learning_rate": 4.971274685816876e-05, |
| "loss": 0.0091, |
| "mean_token_accuracy": 0.9989152610301971, |
| "num_tokens": 1816591.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 1.9389848232269287, |
| "epoch": 1.0353081986834232, |
| "grad_norm": 0.10888671875, |
| "learning_rate": 4.9652902453620586e-05, |
| "loss": 0.0064, |
| "mean_token_accuracy": 0.9982307553291321, |
| "num_tokens": 1827193.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 2.034816288948059, |
| "epoch": 1.0412926391382407, |
| "grad_norm": 0.87109375, |
| "learning_rate": 4.959305804907241e-05, |
| "loss": 0.0068, |
| "mean_token_accuracy": 0.9974384307861328, |
| "num_tokens": 1837562.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 1.9168062806129456, |
| "epoch": 1.0472770795930582, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.953321364452424e-05, |
| "loss": 0.0126, |
| "mean_token_accuracy": 0.9967582404613495, |
| "num_tokens": 1848040.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 1.914523994922638, |
| "epoch": 1.0532615200478754, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 4.9473369239976064e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9989884674549103, |
| "num_tokens": 1858732.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 1.8829883575439452, |
| "epoch": 1.059245960502693, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.941352483542789e-05, |
| "loss": 0.0104, |
| "mean_token_accuracy": 0.9978753447532653, |
| "num_tokens": 1869534.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 1.9079457640647888, |
| "epoch": 1.0652304009575104, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.935368043087972e-05, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9964175879955292, |
| "num_tokens": 1880120.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 2.182413923740387, |
| "epoch": 1.071214841412328, |
| "grad_norm": 0.220703125, |
| "learning_rate": 4.929383602633154e-05, |
| "loss": 0.0081, |
| "mean_token_accuracy": 0.9970734059810639, |
| "num_tokens": 1890589.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 2.1202453851699827, |
| "epoch": 1.0771992818671454, |
| "grad_norm": 0.328125, |
| "learning_rate": 4.9233991621783365e-05, |
| "loss": 0.0095, |
| "mean_token_accuracy": 0.9975960195064545, |
| "num_tokens": 1901169.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.040514326095581, |
| "epoch": 1.083183722321963, |
| "grad_norm": 0.294921875, |
| "learning_rate": 4.917414721723519e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9978755116462708, |
| "num_tokens": 1911675.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 1.9972706317901612, |
| "epoch": 1.0891681627767804, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 4.911430281268701e-05, |
| "loss": 0.0086, |
| "mean_token_accuracy": 0.9979366779327392, |
| "num_tokens": 1922328.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 1.960794413089752, |
| "epoch": 1.095152603231598, |
| "grad_norm": 0.0263671875, |
| "learning_rate": 4.905445840813884e-05, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.9982580840587616, |
| "num_tokens": 1932945.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 1.9782156348228455, |
| "epoch": 1.1011370436864154, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 4.8994614003590666e-05, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.9974651634693146, |
| "num_tokens": 1943470.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 1.977518343925476, |
| "epoch": 1.1071214841412327, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.893476959904249e-05, |
| "loss": 0.0027, |
| "mean_token_accuracy": 0.9989652216434479, |
| "num_tokens": 1954059.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 1.8379858493804933, |
| "epoch": 1.1131059245960502, |
| "grad_norm": 0.076171875, |
| "learning_rate": 4.887492519449432e-05, |
| "loss": 0.0041, |
| "mean_token_accuracy": 0.9982218623161316, |
| "num_tokens": 1964445.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 1.8461663961410522, |
| "epoch": 1.1190903650508677, |
| "grad_norm": 0.01025390625, |
| "learning_rate": 4.8815080789946143e-05, |
| "loss": 0.0082, |
| "mean_token_accuracy": 0.9989234507083893, |
| "num_tokens": 1975144.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 1.744893491268158, |
| "epoch": 1.1250748055056852, |
| "grad_norm": 0.451171875, |
| "learning_rate": 4.875523638539796e-05, |
| "loss": 0.008, |
| "mean_token_accuracy": 0.9982535362243652, |
| "num_tokens": 1985655.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 1.8510890007019043, |
| "epoch": 1.1310592459605027, |
| "grad_norm": 2.171875, |
| "learning_rate": 4.869539198084979e-05, |
| "loss": 0.0074, |
| "mean_token_accuracy": 0.9972193837165833, |
| "num_tokens": 1996254.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 1.8942458748817443, |
| "epoch": 1.1370436864153202, |
| "grad_norm": 0.1484375, |
| "learning_rate": 4.8635547576301614e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9989045560359955, |
| "num_tokens": 2006855.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 1.8198019742965699, |
| "epoch": 1.1430281268701377, |
| "grad_norm": 1.3828125, |
| "learning_rate": 4.8575703171753445e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9992660522460938, |
| "num_tokens": 2017681.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 1.6781391501426697, |
| "epoch": 1.1490125673249552, |
| "grad_norm": 0.0291748046875, |
| "learning_rate": 4.851585876720527e-05, |
| "loss": 0.01, |
| "mean_token_accuracy": 0.9977579891681672, |
| "num_tokens": 2028065.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 1.7453859210014344, |
| "epoch": 1.1549970077797727, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.845601436265709e-05, |
| "loss": 0.0046, |
| "mean_token_accuracy": 0.9985866487026215, |
| "num_tokens": 2038744.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 1.7961567163467407, |
| "epoch": 1.1609814482345902, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 4.839616995810892e-05, |
| "loss": 0.0103, |
| "mean_token_accuracy": 0.9979851245880127, |
| "num_tokens": 2049482.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 1.811640238761902, |
| "epoch": 1.1669658886894076, |
| "grad_norm": 3.0, |
| "learning_rate": 4.8336325553560746e-05, |
| "loss": 0.0035, |
| "mean_token_accuracy": 0.99932302236557, |
| "num_tokens": 2060236.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 1.7889364123344422, |
| "epoch": 1.172950329144225, |
| "grad_norm": 0.30078125, |
| "learning_rate": 4.827648114901256e-05, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9968235552310943, |
| "num_tokens": 2070951.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 1.8480128169059753, |
| "epoch": 1.1789347695990424, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 4.821663674446439e-05, |
| "loss": 0.0103, |
| "mean_token_accuracy": 0.9978732526302337, |
| "num_tokens": 2081499.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 1.7933115482330322, |
| "epoch": 1.18491921005386, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.815679233991622e-05, |
| "loss": 0.0087, |
| "mean_token_accuracy": 0.9985387563705445, |
| "num_tokens": 2092063.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 1.7108964920043945, |
| "epoch": 1.1909036505086774, |
| "grad_norm": 0.380859375, |
| "learning_rate": 4.809694793536805e-05, |
| "loss": 0.0088, |
| "mean_token_accuracy": 0.9971836686134339, |
| "num_tokens": 2102495.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 1.7778636336326599, |
| "epoch": 1.196888090963495, |
| "grad_norm": 0.0859375, |
| "learning_rate": 4.803710353081987e-05, |
| "loss": 0.0068, |
| "mean_token_accuracy": 0.9988643646240234, |
| "num_tokens": 2112819.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 1.7624136567115785, |
| "epoch": 1.2028725314183124, |
| "grad_norm": 3.34375, |
| "learning_rate": 4.7977259126271694e-05, |
| "loss": 0.0038, |
| "mean_token_accuracy": 0.9986830711364746, |
| "num_tokens": 2123473.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 1.78448588848114, |
| "epoch": 1.20885697187313, |
| "grad_norm": 0.037353515625, |
| "learning_rate": 4.7917414721723525e-05, |
| "loss": 0.0172, |
| "mean_token_accuracy": 0.9978123724460601, |
| "num_tokens": 2134040.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 1.959210455417633, |
| "epoch": 1.2148414123279474, |
| "grad_norm": 0.08544921875, |
| "learning_rate": 4.785757031717535e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996268630027771, |
| "num_tokens": 2144616.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 1.9250733256340027, |
| "epoch": 1.220825852782765, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 4.7797725912627165e-05, |
| "loss": 0.0084, |
| "mean_token_accuracy": 0.9989728569984436, |
| "num_tokens": 2155216.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 1.9832014560699462, |
| "epoch": 1.2268102932375822, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.7737881508078995e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.9985514342784881, |
| "num_tokens": 2165907.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 2.074608051776886, |
| "epoch": 1.2327947336923997, |
| "grad_norm": 0.67578125, |
| "learning_rate": 4.767803710353082e-05, |
| "loss": 0.0073, |
| "mean_token_accuracy": 0.9982097864151, |
| "num_tokens": 2176500.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 2.164236378669739, |
| "epoch": 1.2387791741472172, |
| "grad_norm": 0.39453125, |
| "learning_rate": 4.761819269898264e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.998193335533142, |
| "num_tokens": 2186941.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 2.0435985803604124, |
| "epoch": 1.2447636146020347, |
| "grad_norm": 0.02197265625, |
| "learning_rate": 4.755834829443447e-05, |
| "loss": 0.0084, |
| "mean_token_accuracy": 0.9974483609199524, |
| "num_tokens": 2197437.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 1.7942885041236878, |
| "epoch": 1.2507480550568522, |
| "grad_norm": 0.51171875, |
| "learning_rate": 4.7498503889886297e-05, |
| "loss": 0.012, |
| "mean_token_accuracy": 0.9985887467861175, |
| "num_tokens": 2208037.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 1.7626053810119628, |
| "epoch": 1.2567324955116697, |
| "grad_norm": 0.78125, |
| "learning_rate": 4.743865948533813e-05, |
| "loss": 0.0136, |
| "mean_token_accuracy": 0.9975202858448029, |
| "num_tokens": 2218593.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 1.821165430545807, |
| "epoch": 1.2627169359664872, |
| "grad_norm": 0.6328125, |
| "learning_rate": 4.737881508078995e-05, |
| "loss": 0.0087, |
| "mean_token_accuracy": 0.9983973145484925, |
| "num_tokens": 2229361.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 1.8355701923370362, |
| "epoch": 1.2687013764213046, |
| "grad_norm": 0.060302734375, |
| "learning_rate": 4.731897067624177e-05, |
| "loss": 0.0039, |
| "mean_token_accuracy": 0.9985953152179718, |
| "num_tokens": 2240050.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 1.7403881430625916, |
| "epoch": 1.2746858168761221, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.72591262716936e-05, |
| "loss": 0.0059, |
| "mean_token_accuracy": 0.9986009895801544, |
| "num_tokens": 2250554.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 1.7938685655593871, |
| "epoch": 1.2806702573309394, |
| "grad_norm": 0.024658203125, |
| "learning_rate": 4.719928186714542e-05, |
| "loss": 0.0071, |
| "mean_token_accuracy": 0.9985599517822266, |
| "num_tokens": 2261237.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 1.8329176306724548, |
| "epoch": 1.2866546977857571, |
| "grad_norm": 0.047119140625, |
| "learning_rate": 4.7139437462597245e-05, |
| "loss": 0.0072, |
| "mean_token_accuracy": 0.9985998690128326, |
| "num_tokens": 2271802.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 1.7967113852500916, |
| "epoch": 1.2926391382405744, |
| "grad_norm": 1.375, |
| "learning_rate": 4.7079593058049075e-05, |
| "loss": 0.0038, |
| "mean_token_accuracy": 0.9993126451969147, |
| "num_tokens": 2282449.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 1.8445377826690674, |
| "epoch": 1.298623578695392, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 4.70197486535009e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9992965221405029, |
| "num_tokens": 2293084.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 1.8734865069389344, |
| "epoch": 1.3046080191502094, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 4.695990424895272e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.9989112138748169, |
| "num_tokens": 2303735.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 1.9646783590316772, |
| "epoch": 1.310592459605027, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.690005984440455e-05, |
| "loss": 0.0188, |
| "mean_token_accuracy": 0.9973492562770844, |
| "num_tokens": 2314333.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 2.0574665307998656, |
| "epoch": 1.3165769000598444, |
| "grad_norm": 0.029296875, |
| "learning_rate": 4.6840215439856376e-05, |
| "loss": 0.0058, |
| "mean_token_accuracy": 0.9989279210567474, |
| "num_tokens": 2325056.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 1.9682793974876405, |
| "epoch": 1.322561340514662, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 4.67803710353082e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9985645651817322, |
| "num_tokens": 2335451.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 1.978387975692749, |
| "epoch": 1.3285457809694794, |
| "grad_norm": 0.08740234375, |
| "learning_rate": 4.6720526630760024e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9985677063465118, |
| "num_tokens": 2345726.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 2.020477998256683, |
| "epoch": 1.334530221424297, |
| "grad_norm": 0.1005859375, |
| "learning_rate": 4.666068222621185e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9985411047935486, |
| "num_tokens": 2356376.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 1.9890066623687743, |
| "epoch": 1.3405146618791144, |
| "grad_norm": 0.036376953125, |
| "learning_rate": 4.660083782166368e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9989206194877625, |
| "num_tokens": 2367064.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 1.907881224155426, |
| "epoch": 1.3464991023339317, |
| "grad_norm": 1.4609375, |
| "learning_rate": 4.65409934171155e-05, |
| "loss": 0.0062, |
| "mean_token_accuracy": 0.9981145322322845, |
| "num_tokens": 2377443.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 1.8871219635009766, |
| "epoch": 1.3524835427887494, |
| "grad_norm": 0.048095703125, |
| "learning_rate": 4.6481149012567325e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9985643148422241, |
| "num_tokens": 2388013.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 1.8452912092208862, |
| "epoch": 1.3584679832435667, |
| "grad_norm": 0.439453125, |
| "learning_rate": 4.6421304608019155e-05, |
| "loss": 0.0112, |
| "mean_token_accuracy": 0.9982944905757904, |
| "num_tokens": 2398654.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 1.8592095136642457, |
| "epoch": 1.3644524236983842, |
| "grad_norm": 0.70703125, |
| "learning_rate": 4.636146020347098e-05, |
| "loss": 0.0102, |
| "mean_token_accuracy": 0.9971856594085693, |
| "num_tokens": 2409209.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 1.9233819246292114, |
| "epoch": 1.3704368641532017, |
| "grad_norm": 0.09033203125, |
| "learning_rate": 4.63016157989228e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.9982663273811341, |
| "num_tokens": 2419620.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 1.888377809524536, |
| "epoch": 1.3764213046080191, |
| "grad_norm": 0.16015625, |
| "learning_rate": 4.6241771394374626e-05, |
| "loss": 0.0079, |
| "mean_token_accuracy": 0.9981661736965179, |
| "num_tokens": 2430017.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 1.8767491102218627, |
| "epoch": 1.3824057450628366, |
| "grad_norm": 0.84375, |
| "learning_rate": 4.618192698982645e-05, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.9989192366600037, |
| "num_tokens": 2440671.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 1.8048344016075135, |
| "epoch": 1.3883901855176541, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.612208258527828e-05, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9961064517498016, |
| "num_tokens": 2451241.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 1.7816383957862854, |
| "epoch": 1.3943746259724716, |
| "grad_norm": 1.390625, |
| "learning_rate": 4.6062238180730103e-05, |
| "loss": 0.0123, |
| "mean_token_accuracy": 0.9972090303897858, |
| "num_tokens": 2461831.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 1.803758704662323, |
| "epoch": 1.400359066427289, |
| "grad_norm": 0.12109375, |
| "learning_rate": 4.600239377618193e-05, |
| "loss": 0.014, |
| "mean_token_accuracy": 0.9982601046562195, |
| "num_tokens": 2472433.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 1.8111782312393188, |
| "epoch": 1.4063435068821066, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.594254937163376e-05, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.997297465801239, |
| "num_tokens": 2483131.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 1.841699206829071, |
| "epoch": 1.412327947336924, |
| "grad_norm": 0.212890625, |
| "learning_rate": 4.588270496708558e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9989056468009949, |
| "num_tokens": 2493799.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 1.9263641238212585, |
| "epoch": 1.4183123877917414, |
| "grad_norm": 2.890625, |
| "learning_rate": 4.58228605625374e-05, |
| "loss": 0.0153, |
| "mean_token_accuracy": 0.9967702269554138, |
| "num_tokens": 2504450.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 1.8764405727386475, |
| "epoch": 1.424296828246559, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 4.576301615798923e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9996539771556854, |
| "num_tokens": 2515097.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 1.8099690794944763, |
| "epoch": 1.4302812687013764, |
| "grad_norm": 0.88671875, |
| "learning_rate": 4.570317175344105e-05, |
| "loss": 0.0081, |
| "mean_token_accuracy": 0.9982343196868897, |
| "num_tokens": 2525527.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 1.7947665095329284, |
| "epoch": 1.436265709156194, |
| "grad_norm": 0.023681640625, |
| "learning_rate": 4.564332734889288e-05, |
| "loss": 0.0027, |
| "mean_token_accuracy": 0.999330735206604, |
| "num_tokens": 2536145.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 1.8335639595985413, |
| "epoch": 1.4422501496110114, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.5583482944344706e-05, |
| "loss": 0.0153, |
| "mean_token_accuracy": 0.9968725681304932, |
| "num_tokens": 2546774.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 1.857043170928955, |
| "epoch": 1.4482345900658289, |
| "grad_norm": 0.39453125, |
| "learning_rate": 4.552363853979653e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9982848763465881, |
| "num_tokens": 2557417.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 1.8625115990638732, |
| "epoch": 1.4542190305206464, |
| "grad_norm": 0.1318359375, |
| "learning_rate": 4.546379413524836e-05, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.9964711248874665, |
| "num_tokens": 2567952.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 1.8767970681190491, |
| "epoch": 1.4602034709754639, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.5403949730700183e-05, |
| "loss": 0.0079, |
| "mean_token_accuracy": 0.9982850551605225, |
| "num_tokens": 2578461.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 1.8251904726028443, |
| "epoch": 1.4661879114302812, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 4.5344105326152e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9989565730094909, |
| "num_tokens": 2588841.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 1.8160880088806153, |
| "epoch": 1.4721723518850989, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 4.528426092160383e-05, |
| "loss": 0.0149, |
| "mean_token_accuracy": 0.9972167491912842, |
| "num_tokens": 2599306.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 1.8673020839691161, |
| "epoch": 1.4781567923399161, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.5224416517055654e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9992740452289581, |
| "num_tokens": 2609837.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 1.7704484939575196, |
| "epoch": 1.4841412327947336, |
| "grad_norm": 0.2734375, |
| "learning_rate": 4.516457211250748e-05, |
| "loss": 0.0037, |
| "mean_token_accuracy": 0.9985692203044891, |
| "num_tokens": 2620420.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 1.6958755135536194, |
| "epoch": 1.4901256732495511, |
| "grad_norm": 0.310546875, |
| "learning_rate": 4.510472770795931e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.9993295550346375, |
| "num_tokens": 2630937.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 1.7750421166419983, |
| "epoch": 1.4961101137043686, |
| "grad_norm": 1.5, |
| "learning_rate": 4.504488330341113e-05, |
| "loss": 0.0063, |
| "mean_token_accuracy": 0.9982142150402069, |
| "num_tokens": 2641439.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 1.7922300696372986, |
| "epoch": 1.5020945541591861, |
| "grad_norm": 0.625, |
| "learning_rate": 4.498503889886296e-05, |
| "loss": 0.0066, |
| "mean_token_accuracy": 0.9982458829879761, |
| "num_tokens": 2652117.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 1.7609119415283203, |
| "epoch": 1.5080789946140036, |
| "grad_norm": 1.625, |
| "learning_rate": 4.4925194494314786e-05, |
| "loss": 0.0041, |
| "mean_token_accuracy": 0.9981802880764008, |
| "num_tokens": 2662835.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 1.6831945180892944, |
| "epoch": 1.5140634350688211, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.48653500897666e-05, |
| "loss": 0.0101, |
| "mean_token_accuracy": 0.9968818426132202, |
| "num_tokens": 2673431.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 1.6552810072898865, |
| "epoch": 1.5200478755236384, |
| "grad_norm": 0.396484375, |
| "learning_rate": 4.480550568521843e-05, |
| "loss": 0.0095, |
| "mean_token_accuracy": 0.9975987613201142, |
| "num_tokens": 2684029.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 1.6987668871879578, |
| "epoch": 1.5260323159784561, |
| "grad_norm": 0.04931640625, |
| "learning_rate": 4.4745661280670257e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 2694474.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 1.6680127382278442, |
| "epoch": 1.5320167564332734, |
| "grad_norm": 0.041015625, |
| "learning_rate": 4.468581687612208e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9989441931247711, |
| "num_tokens": 2705076.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 1.6850673317909242, |
| "epoch": 1.5380011968880911, |
| "grad_norm": 0.8359375, |
| "learning_rate": 4.462597247157391e-05, |
| "loss": 0.0079, |
| "mean_token_accuracy": 0.9978463172912597, |
| "num_tokens": 2715754.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 1.6692217469215394, |
| "epoch": 1.5439856373429084, |
| "grad_norm": 0.048828125, |
| "learning_rate": 4.4566128067025734e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9985604822635651, |
| "num_tokens": 2726154.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 1.7106538891792298, |
| "epoch": 1.5499700777977259, |
| "grad_norm": 0.03662109375, |
| "learning_rate": 4.4506283662477564e-05, |
| "loss": 0.0064, |
| "mean_token_accuracy": 0.9978056967258453, |
| "num_tokens": 2736682.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 1.724018120765686, |
| "epoch": 1.5559545182525434, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 4.444643925792939e-05, |
| "loss": 0.0116, |
| "mean_token_accuracy": 0.9978777050971985, |
| "num_tokens": 2747261.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 1.8123233675956727, |
| "epoch": 1.5619389587073609, |
| "grad_norm": 0.345703125, |
| "learning_rate": 4.438659485338121e-05, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9970783770084382, |
| "num_tokens": 2757969.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 1.7606332778930665, |
| "epoch": 1.5679233991621784, |
| "grad_norm": 0.037353515625, |
| "learning_rate": 4.4326750448833035e-05, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.998226261138916, |
| "num_tokens": 2768599.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 1.7248129010200501, |
| "epoch": 1.5739078396169957, |
| "grad_norm": 0.033203125, |
| "learning_rate": 4.426690604428486e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.9982411623001098, |
| "num_tokens": 2779152.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 1.6984099984169005, |
| "epoch": 1.5798922800718134, |
| "grad_norm": 0.30078125, |
| "learning_rate": 4.420706163973668e-05, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.9974049687385559, |
| "num_tokens": 2789672.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 1.7985439896583557, |
| "epoch": 1.5858767205266306, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.414721723518851e-05, |
| "loss": 0.0068, |
| "mean_token_accuracy": 0.9985246062278748, |
| "num_tokens": 2800058.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 1.848671793937683, |
| "epoch": 1.5918611609814484, |
| "grad_norm": 0.0245361328125, |
| "learning_rate": 4.4087372830640336e-05, |
| "loss": 0.0047, |
| "mean_token_accuracy": 0.9985762298107147, |
| "num_tokens": 2810671.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 1.8393208622932433, |
| "epoch": 1.5978456014362656, |
| "grad_norm": 0.027587890625, |
| "learning_rate": 4.402752842609216e-05, |
| "loss": 0.0068, |
| "mean_token_accuracy": 0.9992953419685364, |
| "num_tokens": 2821280.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 1.763173222541809, |
| "epoch": 1.6038300418910831, |
| "grad_norm": 0.1279296875, |
| "learning_rate": 4.396768402154399e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.998900830745697, |
| "num_tokens": 2831716.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 1.8260416030883788, |
| "epoch": 1.6098144823459006, |
| "grad_norm": 0.0233154296875, |
| "learning_rate": 4.3907839616995814e-05, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.9986373245716095, |
| "num_tokens": 2842147.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 1.8105480790138244, |
| "epoch": 1.6157989228007181, |
| "grad_norm": 0.423828125, |
| "learning_rate": 4.384799521244764e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9993321299552917, |
| "num_tokens": 2852899.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 1.7745528101921082, |
| "epoch": 1.6217833632555356, |
| "grad_norm": 1.4296875, |
| "learning_rate": 4.378815080789946e-05, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9975007236003876, |
| "num_tokens": 2863429.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 1.7714335441589355, |
| "epoch": 1.6277678037103531, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 4.3728306403351285e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9993084073066711, |
| "num_tokens": 2873934.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 1.7836796760559082, |
| "epoch": 1.6337522441651706, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 4.3668461998803115e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.998683512210846, |
| "num_tokens": 2884469.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 1.7828012824058532, |
| "epoch": 1.639736684619988, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.360861759425494e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9992062389850617, |
| "num_tokens": 2894762.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 1.835528552532196, |
| "epoch": 1.6457211250748056, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.354877318970676e-05, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9989313840866089, |
| "num_tokens": 2905249.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 1.9072396397590636, |
| "epoch": 1.6517055655296229, |
| "grad_norm": 1.4921875, |
| "learning_rate": 4.348892878515859e-05, |
| "loss": 0.0191, |
| "mean_token_accuracy": 0.9963602304458619, |
| "num_tokens": 2915557.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 2.0372175931930543, |
| "epoch": 1.6576900059844406, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 4.3429084380610416e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9993102729320527, |
| "num_tokens": 2926153.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 2.0790203332901003, |
| "epoch": 1.6636744464392579, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.336923997606223e-05, |
| "loss": 0.0075, |
| "mean_token_accuracy": 0.9975761890411377, |
| "num_tokens": 2936597.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 1.9823689937591553, |
| "epoch": 1.6696588868940754, |
| "grad_norm": 0.0308837890625, |
| "learning_rate": 4.3309395571514063e-05, |
| "loss": 0.0045, |
| "mean_token_accuracy": 0.9985705196857453, |
| "num_tokens": 2947210.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 1.865414524078369, |
| "epoch": 1.6756433273488929, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.324955116696589e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9971051633358001, |
| "num_tokens": 2957718.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 1.8666903614997863, |
| "epoch": 1.6816277678037104, |
| "grad_norm": 0.7890625, |
| "learning_rate": 4.318970676241772e-05, |
| "loss": 0.0082, |
| "mean_token_accuracy": 0.998199051618576, |
| "num_tokens": 2968277.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 1.8577040791511537, |
| "epoch": 1.6876122082585279, |
| "grad_norm": 0.046875, |
| "learning_rate": 4.312986235786954e-05, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9985236465930939, |
| "num_tokens": 2978802.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 1.8664780139923096, |
| "epoch": 1.6935966487133451, |
| "grad_norm": 0.05810546875, |
| "learning_rate": 4.3070017953321365e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9992798209190369, |
| "num_tokens": 2989285.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 1.8037607192993164, |
| "epoch": 1.6995810891681629, |
| "grad_norm": 0.376953125, |
| "learning_rate": 4.3010173548773195e-05, |
| "loss": 0.0155, |
| "mean_token_accuracy": 0.9978967070579529, |
| "num_tokens": 2999866.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 1.7806638836860658, |
| "epoch": 1.7055655296229801, |
| "grad_norm": 2.625, |
| "learning_rate": 4.295032914422502e-05, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.995397436618805, |
| "num_tokens": 3010270.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 1.6426480889320374, |
| "epoch": 1.7115499700777979, |
| "grad_norm": 0.1328125, |
| "learning_rate": 4.2890484739676835e-05, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.997680002450943, |
| "num_tokens": 3020978.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 1.570683479309082, |
| "epoch": 1.7175344105326151, |
| "grad_norm": 0.61328125, |
| "learning_rate": 4.2830640335128666e-05, |
| "loss": 0.0141, |
| "mean_token_accuracy": 0.997533792257309, |
| "num_tokens": 3031702.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 1.5966361045837403, |
| "epoch": 1.7235188509874326, |
| "grad_norm": 0.048828125, |
| "learning_rate": 4.277079593058049e-05, |
| "loss": 0.0052, |
| "mean_token_accuracy": 0.9989168524742127, |
| "num_tokens": 3042239.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 1.699927806854248, |
| "epoch": 1.7295032914422501, |
| "grad_norm": 0.71484375, |
| "learning_rate": 4.271095152603232e-05, |
| "loss": 0.009, |
| "mean_token_accuracy": 0.9989072799682617, |
| "num_tokens": 3052721.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 1.7357771277427674, |
| "epoch": 1.7354877318970676, |
| "grad_norm": 0.2578125, |
| "learning_rate": 4.2651107121484143e-05, |
| "loss": 0.0066, |
| "mean_token_accuracy": 0.9988929867744446, |
| "num_tokens": 3063139.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 1.7626645684242248, |
| "epoch": 1.7414721723518851, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 4.259126271693597e-05, |
| "loss": 0.0039, |
| "mean_token_accuracy": 0.999007374048233, |
| "num_tokens": 3073708.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 1.7304651737213135, |
| "epoch": 1.7474566128067026, |
| "grad_norm": 1.7421875, |
| "learning_rate": 4.25314183123878e-05, |
| "loss": 0.0052, |
| "mean_token_accuracy": 0.9979542791843414, |
| "num_tokens": 3084324.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 1.6680734276771545, |
| "epoch": 1.75344105326152, |
| "grad_norm": 2.96875, |
| "learning_rate": 4.247157390783962e-05, |
| "loss": 0.0073, |
| "mean_token_accuracy": 0.9978868365287781, |
| "num_tokens": 3094786.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 1.5931068778038024, |
| "epoch": 1.7594254937163374, |
| "grad_norm": 0.63671875, |
| "learning_rate": 4.241172950329144e-05, |
| "loss": 0.0067, |
| "mean_token_accuracy": 0.9985966563224793, |
| "num_tokens": 3105311.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 1.7409747838974, |
| "epoch": 1.765409934171155, |
| "grad_norm": 0.4921875, |
| "learning_rate": 4.235188509874327e-05, |
| "loss": 0.0078, |
| "mean_token_accuracy": 0.9985540926456451, |
| "num_tokens": 3115948.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 1.7411438941955566, |
| "epoch": 1.7713943746259724, |
| "grad_norm": 0.07958984375, |
| "learning_rate": 4.229204069419509e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9996478855609894, |
| "num_tokens": 3126449.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 1.652970790863037, |
| "epoch": 1.77737881508079, |
| "grad_norm": 0.032958984375, |
| "learning_rate": 4.2232196289646915e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 3137023.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 1.6763949155807496, |
| "epoch": 1.7833632555356074, |
| "grad_norm": 0.09619140625, |
| "learning_rate": 4.2172351885098746e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 3147623.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 1.6129281163215636, |
| "epoch": 1.7893476959904249, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.211250748055057e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9985423862934113, |
| "num_tokens": 3158087.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 1.5717525839805604, |
| "epoch": 1.7953321364452424, |
| "grad_norm": 0.020263671875, |
| "learning_rate": 4.20526630760024e-05, |
| "loss": 0.0072, |
| "mean_token_accuracy": 0.9992836058139801, |
| "num_tokens": 3168795.0, |
| "step": 3000 |
| }, |
| { |
| "entropy": 1.5834394574165345, |
| "epoch": 1.8013165769000599, |
| "grad_norm": 0.0218505859375, |
| "learning_rate": 4.199281867145422e-05, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.9971341669559479, |
| "num_tokens": 3179498.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 1.6837530136108398, |
| "epoch": 1.8073010173548774, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.193297426690604e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9972832024097442, |
| "num_tokens": 3190100.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 1.8728898286819458, |
| "epoch": 1.8132854578096946, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.187312986235787e-05, |
| "loss": 0.0061, |
| "mean_token_accuracy": 0.9981972455978394, |
| "num_tokens": 3200638.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 1.7914460897445679, |
| "epoch": 1.8192698982645124, |
| "grad_norm": 0.55859375, |
| "learning_rate": 4.1813285457809694e-05, |
| "loss": 0.007, |
| "mean_token_accuracy": 0.997859263420105, |
| "num_tokens": 3211123.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 1.7274253368377686, |
| "epoch": 1.8252543387193296, |
| "grad_norm": 0.671875, |
| "learning_rate": 4.175344105326152e-05, |
| "loss": 0.0055, |
| "mean_token_accuracy": 0.9982551515102387, |
| "num_tokens": 3221763.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 1.6481835126876831, |
| "epoch": 1.8312387791741473, |
| "grad_norm": 0.0179443359375, |
| "learning_rate": 4.169359664871335e-05, |
| "loss": 0.0059, |
| "mean_token_accuracy": 0.9985887408256531, |
| "num_tokens": 3232290.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 1.6477917194366456, |
| "epoch": 1.8372232196289646, |
| "grad_norm": 0.032470703125, |
| "learning_rate": 4.163375224416517e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.9989434242248535, |
| "num_tokens": 3242758.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 1.7404741525650025, |
| "epoch": 1.8432076600837821, |
| "grad_norm": 0.044677734375, |
| "learning_rate": 4.1573907839616995e-05, |
| "loss": 0.0102, |
| "mean_token_accuracy": 0.9978300333023071, |
| "num_tokens": 3253221.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 1.7434081315994263, |
| "epoch": 1.8491921005385996, |
| "grad_norm": 0.1015625, |
| "learning_rate": 4.1514063435068826e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9996563553810119, |
| "num_tokens": 3263868.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 1.7428674221038818, |
| "epoch": 1.8551765409934171, |
| "grad_norm": 0.1640625, |
| "learning_rate": 4.145421903052065e-05, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9989525496959686, |
| "num_tokens": 3274441.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 1.7549742221832276, |
| "epoch": 1.8611609814482346, |
| "grad_norm": 0.027099609375, |
| "learning_rate": 4.139437462597247e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996551752090455, |
| "num_tokens": 3285089.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 1.7254215598106384, |
| "epoch": 1.867145421903052, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 4.1334530221424296e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.999311363697052, |
| "num_tokens": 3295802.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 1.6737028002738952, |
| "epoch": 1.8731298623578696, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.127468581687612e-05, |
| "loss": 0.0086, |
| "mean_token_accuracy": 0.9982663750648498, |
| "num_tokens": 3306563.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 1.725145435333252, |
| "epoch": 1.8791143028126869, |
| "grad_norm": 0.037841796875, |
| "learning_rate": 4.121484141232795e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9992632687091827, |
| "num_tokens": 3317087.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 1.7711864471435548, |
| "epoch": 1.8850987432675046, |
| "grad_norm": 0.224609375, |
| "learning_rate": 4.1154997007779774e-05, |
| "loss": 0.005, |
| "mean_token_accuracy": 0.9992907822132111, |
| "num_tokens": 3327581.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 1.7485621452331543, |
| "epoch": 1.8910831837223219, |
| "grad_norm": 0.609375, |
| "learning_rate": 4.10951526032316e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9996168553829193, |
| "num_tokens": 3338131.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 1.6710041880607605, |
| "epoch": 1.8970676241771396, |
| "grad_norm": 0.059326171875, |
| "learning_rate": 4.103530819868343e-05, |
| "loss": 0.0053, |
| "mean_token_accuracy": 0.9993249118328095, |
| "num_tokens": 3348685.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 1.696055793762207, |
| "epoch": 1.9030520646319569, |
| "grad_norm": 0.01708984375, |
| "learning_rate": 4.097546379413525e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9989370882511139, |
| "num_tokens": 3359299.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 1.6661385297775269, |
| "epoch": 1.9090365050867744, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.0915619389587075e-05, |
| "loss": 0.0037, |
| "mean_token_accuracy": 0.998573511838913, |
| "num_tokens": 3369753.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 1.6959844470024108, |
| "epoch": 1.9150209455415919, |
| "grad_norm": 0.369140625, |
| "learning_rate": 4.08557749850389e-05, |
| "loss": 0.0072, |
| "mean_token_accuracy": 0.9981562256813049, |
| "num_tokens": 3380273.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 1.6790688037872314, |
| "epoch": 1.9210053859964094, |
| "grad_norm": 0.0693359375, |
| "learning_rate": 4.079593058049072e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.999646645784378, |
| "num_tokens": 3390726.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 1.6851406931877135, |
| "epoch": 1.9269898264512269, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 4.073608617594255e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9992761254310608, |
| "num_tokens": 3401436.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 1.6872121214866638, |
| "epoch": 1.9329742669060441, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.0676241771394376e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9989988803863525, |
| "num_tokens": 3412090.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 1.716448712348938, |
| "epoch": 1.9389587073608618, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.06163973668462e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9992851316928864, |
| "num_tokens": 3422680.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 1.7055083990097046, |
| "epoch": 1.9449431478156791, |
| "grad_norm": 0.2109375, |
| "learning_rate": 4.055655296229803e-05, |
| "loss": 0.0037, |
| "mean_token_accuracy": 0.9986045360565186, |
| "num_tokens": 3433240.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 1.6845415115356446, |
| "epoch": 1.9509275882704968, |
| "grad_norm": 0.53515625, |
| "learning_rate": 4.0496708557749854e-05, |
| "loss": 0.006, |
| "mean_token_accuracy": 0.9979104697704315, |
| "num_tokens": 3443914.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 1.6189923405647277, |
| "epoch": 1.9569120287253141, |
| "grad_norm": 0.703125, |
| "learning_rate": 4.043686415320167e-05, |
| "loss": 0.0082, |
| "mean_token_accuracy": 0.9978470265865326, |
| "num_tokens": 3454396.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 1.6343070983886718, |
| "epoch": 1.9628964691801316, |
| "grad_norm": 0.029052734375, |
| "learning_rate": 4.03770197486535e-05, |
| "loss": 0.0034, |
| "mean_token_accuracy": 0.9992868721485137, |
| "num_tokens": 3465124.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 1.6529561638832093, |
| "epoch": 1.968880909634949, |
| "grad_norm": 0.054931640625, |
| "learning_rate": 4.0317175344105325e-05, |
| "loss": 0.0081, |
| "mean_token_accuracy": 0.998256516456604, |
| "num_tokens": 3475684.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 1.6316777467727661, |
| "epoch": 1.9748653500897666, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.0257330939557155e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9990007400512695, |
| "num_tokens": 3486230.0, |
| "step": 3300 |
| }, |
| { |
| "entropy": 1.6272296071052552, |
| "epoch": 1.980849790544584, |
| "grad_norm": 0.130859375, |
| "learning_rate": 4.019748653500898e-05, |
| "loss": 0.0099, |
| "mean_token_accuracy": 0.9979246437549592, |
| "num_tokens": 3496872.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 1.5232258319854737, |
| "epoch": 1.9868342309994016, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 4.01376421304608e-05, |
| "loss": 0.0056, |
| "mean_token_accuracy": 0.9988779187202453, |
| "num_tokens": 3507359.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 1.4930276036262513, |
| "epoch": 1.992818671454219, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 4.007779772591263e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9993079602718353, |
| "num_tokens": 3517875.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 1.473480725288391, |
| "epoch": 1.9988031119090364, |
| "grad_norm": 0.171875, |
| "learning_rate": 4.0017953321364456e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.999308729171753, |
| "num_tokens": 3528455.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 1.4143877744674682, |
| "epoch": 2.004787552363854, |
| "grad_norm": 0.35546875, |
| "learning_rate": 3.995810891681627e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9985805928707123, |
| "num_tokens": 3538886.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 1.4655809164047242, |
| "epoch": 2.0107719928186714, |
| "grad_norm": 0.10400390625, |
| "learning_rate": 3.98982645122681e-05, |
| "loss": 0.003, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 3549329.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 1.5789464473724366, |
| "epoch": 2.016756433273489, |
| "grad_norm": 0.02783203125, |
| "learning_rate": 3.983842010771993e-05, |
| "loss": 0.0046, |
| "mean_token_accuracy": 0.9988843858242035, |
| "num_tokens": 3559966.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 1.6663800716400146, |
| "epoch": 2.0227408737283064, |
| "grad_norm": 0.404296875, |
| "learning_rate": 3.977857570317175e-05, |
| "loss": 0.0022, |
| "mean_token_accuracy": 0.999292403459549, |
| "num_tokens": 3570588.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 1.5711905360221863, |
| "epoch": 2.028725314183124, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 3.971873129862358e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9993067562580109, |
| "num_tokens": 3581142.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 1.5718475818634032, |
| "epoch": 2.0347097546379413, |
| "grad_norm": 0.054931640625, |
| "learning_rate": 3.9658886894075405e-05, |
| "loss": 0.0058, |
| "mean_token_accuracy": 0.9988965094089508, |
| "num_tokens": 3591649.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 1.667174506187439, |
| "epoch": 2.0406941950927586, |
| "grad_norm": 0.095703125, |
| "learning_rate": 3.9599042489527235e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9992761373519897, |
| "num_tokens": 3602268.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 1.6831801772117614, |
| "epoch": 2.0466786355475763, |
| "grad_norm": 0.072265625, |
| "learning_rate": 3.953919808497906e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.999646645784378, |
| "num_tokens": 3613010.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 1.6643374800682067, |
| "epoch": 2.0526630760023936, |
| "grad_norm": 0.0859375, |
| "learning_rate": 3.9479353680430875e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9996296286582946, |
| "num_tokens": 3623611.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 1.5809536933898927, |
| "epoch": 2.0586475164572113, |
| "grad_norm": 0.0198974609375, |
| "learning_rate": 3.9419509275882706e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.999646645784378, |
| "num_tokens": 3634063.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 1.6075554132461547, |
| "epoch": 2.0646319569120286, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 3.935966487133453e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.998986691236496, |
| "num_tokens": 3644779.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 1.5977044224739074, |
| "epoch": 2.0706163973668463, |
| "grad_norm": 0.02734375, |
| "learning_rate": 3.929982046678635e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 3655353.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 1.5767899036407471, |
| "epoch": 2.0766008378216636, |
| "grad_norm": 0.515625, |
| "learning_rate": 3.923997606223818e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 3665953.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 1.57765634059906, |
| "epoch": 2.0825852782764813, |
| "grad_norm": 0.55078125, |
| "learning_rate": 3.918013165769001e-05, |
| "loss": 0.003, |
| "mean_token_accuracy": 0.9988885760307312, |
| "num_tokens": 3676317.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 1.5800118088722228, |
| "epoch": 2.0885697187312986, |
| "grad_norm": 0.4453125, |
| "learning_rate": 3.912028725314184e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9989434778690338, |
| "num_tokens": 3686778.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 1.5637386918067933, |
| "epoch": 2.0945541591861163, |
| "grad_norm": 0.046875, |
| "learning_rate": 3.906044284859366e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.998903077840805, |
| "num_tokens": 3697341.0, |
| "step": 3500 |
| }, |
| { |
| "entropy": 1.5040214300155639, |
| "epoch": 2.1005385996409336, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 3.900059844404548e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9992480218410492, |
| "num_tokens": 3707750.0, |
| "step": 3510 |
| }, |
| { |
| "entropy": 1.53771892786026, |
| "epoch": 2.106523040095751, |
| "grad_norm": 0.02587890625, |
| "learning_rate": 3.894075403949731e-05, |
| "loss": 0.0042, |
| "mean_token_accuracy": 0.9992437899112702, |
| "num_tokens": 3718304.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 1.5398314595222473, |
| "epoch": 2.1125074805505686, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 3.888090963494913e-05, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9985848188400268, |
| "num_tokens": 3728881.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 1.5595148921012878, |
| "epoch": 2.118491921005386, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 3.8821065230400955e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996491253376008, |
| "num_tokens": 3739469.0, |
| "step": 3540 |
| }, |
| { |
| "entropy": 1.6275775909423829, |
| "epoch": 2.1244763614602036, |
| "grad_norm": 0.08642578125, |
| "learning_rate": 3.8761220825852786e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 3750007.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 1.5797725796699524, |
| "epoch": 2.130460801915021, |
| "grad_norm": 0.0390625, |
| "learning_rate": 3.870137642130461e-05, |
| "loss": 0.0046, |
| "mean_token_accuracy": 0.998933631181717, |
| "num_tokens": 3760603.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 1.518853211402893, |
| "epoch": 2.1364452423698386, |
| "grad_norm": 0.0299072265625, |
| "learning_rate": 3.864153201675643e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 3771047.0, |
| "step": 3570 |
| }, |
| { |
| "entropy": 1.4537081837654113, |
| "epoch": 2.142429682824656, |
| "grad_norm": 0.0869140625, |
| "learning_rate": 3.858168761220826e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 3781678.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 1.4374495148658752, |
| "epoch": 2.1484141232794736, |
| "grad_norm": 0.059326171875, |
| "learning_rate": 3.852184320766009e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9989359259605408, |
| "num_tokens": 3792139.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 1.4887493014335633, |
| "epoch": 2.154398563734291, |
| "grad_norm": 0.061767578125, |
| "learning_rate": 3.846199880311191e-05, |
| "loss": 0.0048, |
| "mean_token_accuracy": 0.9985256731510163, |
| "num_tokens": 3802586.0, |
| "step": 3600 |
| }, |
| { |
| "entropy": 1.5625263094902038, |
| "epoch": 2.160383004189108, |
| "grad_norm": 0.412109375, |
| "learning_rate": 3.8402154398563734e-05, |
| "loss": 0.003, |
| "mean_token_accuracy": 0.998971951007843, |
| "num_tokens": 3813115.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 1.6105829358100892, |
| "epoch": 2.166367444643926, |
| "grad_norm": 0.12890625, |
| "learning_rate": 3.834230999401556e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9989726841449738, |
| "num_tokens": 3823587.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 1.543606126308441, |
| "epoch": 2.172351885098743, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 3.828246558946739e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9992787480354309, |
| "num_tokens": 3834170.0, |
| "step": 3630 |
| }, |
| { |
| "entropy": 1.5743085980415343, |
| "epoch": 2.178336325553561, |
| "grad_norm": 0.56640625, |
| "learning_rate": 3.822262118491921e-05, |
| "loss": 0.0035, |
| "mean_token_accuracy": 0.9985100030899048, |
| "num_tokens": 3844656.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 1.655414378643036, |
| "epoch": 2.184320766008378, |
| "grad_norm": 0.5625, |
| "learning_rate": 3.8162776780371035e-05, |
| "loss": 0.0048, |
| "mean_token_accuracy": 0.9978154480457306, |
| "num_tokens": 3855329.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 1.5091199278831482, |
| "epoch": 2.190305206463196, |
| "grad_norm": 0.00592041015625, |
| "learning_rate": 3.8102932375822866e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 3865991.0, |
| "step": 3660 |
| }, |
| { |
| "entropy": 1.4430976867675782, |
| "epoch": 2.196289646918013, |
| "grad_norm": 0.5390625, |
| "learning_rate": 3.804308797127469e-05, |
| "loss": 0.0061, |
| "mean_token_accuracy": 0.9989435613155365, |
| "num_tokens": 3876625.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 1.4487504601478576, |
| "epoch": 2.202274087372831, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 3.7983243566726506e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9992740869522094, |
| "num_tokens": 3887170.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 1.4686145186424255, |
| "epoch": 2.208258527827648, |
| "grad_norm": 0.0230712890625, |
| "learning_rate": 3.7923399162178336e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 3897911.0, |
| "step": 3690 |
| }, |
| { |
| "entropy": 1.4431620836257935, |
| "epoch": 2.2142429682824654, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.786355475763016e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9993333339691162, |
| "num_tokens": 3908336.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 1.4750329613685609, |
| "epoch": 2.220227408737283, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 3.780371035308199e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 3918781.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 1.5059397101402283, |
| "epoch": 2.2262118491921004, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 3.7743865948533814e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 3929424.0, |
| "step": 3720 |
| }, |
| { |
| "entropy": 1.464370334148407, |
| "epoch": 2.232196289646918, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 3.768402154398564e-05, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9993080615997314, |
| "num_tokens": 3939959.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 1.461155390739441, |
| "epoch": 2.2381807301017353, |
| "grad_norm": 0.06201171875, |
| "learning_rate": 3.762417713943747e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 3950544.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 1.4727044582366944, |
| "epoch": 2.244165170556553, |
| "grad_norm": 0.09033203125, |
| "learning_rate": 3.756433273488929e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9992726743221283, |
| "num_tokens": 3960981.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 1.5011940956115724, |
| "epoch": 2.2501496110113703, |
| "grad_norm": 0.25390625, |
| "learning_rate": 3.750448833034111e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9992779731750489, |
| "num_tokens": 3971564.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 1.4078798294067383, |
| "epoch": 2.256134051466188, |
| "grad_norm": 0.103515625, |
| "learning_rate": 3.744464392579294e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9996212124824524, |
| "num_tokens": 3982005.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 1.4928112626075745, |
| "epoch": 2.2621184919210053, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 3.738479952124476e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9996710538864135, |
| "num_tokens": 3992692.0, |
| "step": 3780 |
| }, |
| { |
| "entropy": 1.5147884964942933, |
| "epoch": 2.2681029323758226, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.732495511669659e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9985506594181061, |
| "num_tokens": 4003180.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 1.541051721572876, |
| "epoch": 2.2740873728306403, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 3.7265110712148416e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4013778.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 1.5526644825935363, |
| "epoch": 2.280071813285458, |
| "grad_norm": 0.58203125, |
| "learning_rate": 3.720526630760024e-05, |
| "loss": 0.0027, |
| "mean_token_accuracy": 0.999664431810379, |
| "num_tokens": 4024306.0, |
| "step": 3810 |
| }, |
| { |
| "entropy": 1.516521191596985, |
| "epoch": 2.2860562537402753, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.714542190305207e-05, |
| "loss": 0.0053, |
| "mean_token_accuracy": 0.9985815584659576, |
| "num_tokens": 4034819.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 1.4400922060012817, |
| "epoch": 2.2920406941950926, |
| "grad_norm": 0.033203125, |
| "learning_rate": 3.7085577498503894e-05, |
| "loss": 0.0041, |
| "mean_token_accuracy": 0.9993127107620239, |
| "num_tokens": 4045226.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 1.454219126701355, |
| "epoch": 2.2980251346499103, |
| "grad_norm": 0.58203125, |
| "learning_rate": 3.702573309395571e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9996282517910003, |
| "num_tokens": 4055858.0, |
| "step": 3840 |
| }, |
| { |
| "entropy": 1.423510491847992, |
| "epoch": 2.3040095751047276, |
| "grad_norm": 0.458984375, |
| "learning_rate": 3.696588868940754e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9992446184158326, |
| "num_tokens": 4066452.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 1.453119683265686, |
| "epoch": 2.3099940155595453, |
| "grad_norm": 0.037109375, |
| "learning_rate": 3.6906044284859365e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9992790877819061, |
| "num_tokens": 4077100.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 1.3985321760177611, |
| "epoch": 2.3159784560143626, |
| "grad_norm": 0.09033203125, |
| "learning_rate": 3.684619988031119e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4087538.0, |
| "step": 3870 |
| }, |
| { |
| "entropy": 1.4676225781440735, |
| "epoch": 2.3219628964691803, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 3.678635547576302e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9989520311355591, |
| "num_tokens": 4098159.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 1.4613074779510498, |
| "epoch": 2.3279473369239976, |
| "grad_norm": 0.00579833984375, |
| "learning_rate": 3.672651107121484e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9989755749702454, |
| "num_tokens": 4108693.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 1.4477983593940735, |
| "epoch": 2.3339317773788153, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 3.666666666666667e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9993051826953888, |
| "num_tokens": 4119341.0, |
| "step": 3900 |
| }, |
| { |
| "entropy": 1.442084550857544, |
| "epoch": 2.3399162178336326, |
| "grad_norm": 0.0546875, |
| "learning_rate": 3.6606822262118496e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996254682540894, |
| "num_tokens": 4129710.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 1.4917728424072265, |
| "epoch": 2.34590065828845, |
| "grad_norm": 0.064453125, |
| "learning_rate": 3.654697785757031e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.999326479434967, |
| "num_tokens": 4140404.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 1.4556031465530395, |
| "epoch": 2.3518850987432676, |
| "grad_norm": 0.138671875, |
| "learning_rate": 3.648713345302214e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 4150763.0, |
| "step": 3930 |
| }, |
| { |
| "entropy": 1.594400453567505, |
| "epoch": 2.357869539198085, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 3.642728904847397e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9992455244064331, |
| "num_tokens": 4161320.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 1.4968407869338989, |
| "epoch": 2.3638539796529026, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 3.636744464392579e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9992551147937775, |
| "num_tokens": 4171550.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 1.4865975022315978, |
| "epoch": 2.36983842010772, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 3.630760023937762e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9993265926837921, |
| "num_tokens": 4182415.0, |
| "step": 3960 |
| }, |
| { |
| "entropy": 1.42852680683136, |
| "epoch": 2.3758228605625376, |
| "grad_norm": 0.0272216796875, |
| "learning_rate": 3.6247755834829444e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9996515691280365, |
| "num_tokens": 4193040.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 1.4439242601394653, |
| "epoch": 2.381807301017355, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 3.618791143028127e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996632993221283, |
| "num_tokens": 4203681.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 1.4410004258155822, |
| "epoch": 2.3877917414721725, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 3.61280670257331e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9992619931697846, |
| "num_tokens": 4214173.0, |
| "step": 3990 |
| }, |
| { |
| "entropy": 1.3759098768234252, |
| "epoch": 2.39377618192699, |
| "grad_norm": 5.96875, |
| "learning_rate": 3.606822262118492e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9993297576904296, |
| "num_tokens": 4224654.0, |
| "step": 4000 |
| }, |
| { |
| "entropy": 1.417907428741455, |
| "epoch": 2.399760622381807, |
| "grad_norm": 0.02734375, |
| "learning_rate": 3.6008378216636746e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 4235203.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 1.4660828113555908, |
| "epoch": 2.405745062836625, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 3.594853381208857e-05, |
| "loss": 0.0034, |
| "mean_token_accuracy": 0.9993115305900574, |
| "num_tokens": 4245897.0, |
| "step": 4020 |
| }, |
| { |
| "entropy": 1.5142138838768004, |
| "epoch": 2.411729503291442, |
| "grad_norm": 0.197265625, |
| "learning_rate": 3.588868940754039e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4256656.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 1.4816847085952758, |
| "epoch": 2.41771394374626, |
| "grad_norm": 0.11474609375, |
| "learning_rate": 3.582884500299222e-05, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.9981687843799592, |
| "num_tokens": 4267267.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 1.4937491059303283, |
| "epoch": 2.423698384201077, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 3.576900059844405e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9981954574584961, |
| "num_tokens": 4277646.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 1.5693211913108827, |
| "epoch": 2.429682824655895, |
| "grad_norm": 0.126953125, |
| "learning_rate": 3.570915619389587e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996551752090455, |
| "num_tokens": 4288121.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 1.4717801094055176, |
| "epoch": 2.435667265110712, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 3.56493117893477e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9989271759986877, |
| "num_tokens": 4298628.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 1.4409169912338258, |
| "epoch": 2.44165170556553, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 3.5589467384799524e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996268630027771, |
| "num_tokens": 4309009.0, |
| "step": 4080 |
| }, |
| { |
| "entropy": 1.4519354104995728, |
| "epoch": 2.447636146020347, |
| "grad_norm": 0.353515625, |
| "learning_rate": 3.552962298025135e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4319676.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 1.46173015832901, |
| "epoch": 2.4536205864751643, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 3.546977857570317e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9989534914493561, |
| "num_tokens": 4330203.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 1.4660939931869508, |
| "epoch": 2.459605026929982, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 3.5409934171154995e-05, |
| "loss": 0.0059, |
| "mean_token_accuracy": 0.9986146986484528, |
| "num_tokens": 4340739.0, |
| "step": 4110 |
| }, |
| { |
| "entropy": 1.5220659852027894, |
| "epoch": 2.4655894673847993, |
| "grad_norm": 0.671875, |
| "learning_rate": 3.5350089766606826e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9989289164543151, |
| "num_tokens": 4351318.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 1.4552037119865417, |
| "epoch": 2.471573907839617, |
| "grad_norm": 0.234375, |
| "learning_rate": 3.529024536205865e-05, |
| "loss": 0.0035, |
| "mean_token_accuracy": 0.9989991009235382, |
| "num_tokens": 4361930.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 1.4578218817710877, |
| "epoch": 2.4775583482944343, |
| "grad_norm": 0.024169921875, |
| "learning_rate": 3.523040095751047e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996855318546295, |
| "num_tokens": 4372650.0, |
| "step": 4140 |
| }, |
| { |
| "entropy": 1.4377676606178285, |
| "epoch": 2.483542788749252, |
| "grad_norm": 0.05615234375, |
| "learning_rate": 3.51705565529623e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996784567832947, |
| "num_tokens": 4383173.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 1.453041100502014, |
| "epoch": 2.4895272292040693, |
| "grad_norm": 0.005828857421875, |
| "learning_rate": 3.511071214841413e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 4393773.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 1.4778467059135436, |
| "epoch": 2.495511669658887, |
| "grad_norm": 0.039794921875, |
| "learning_rate": 3.5050867743865943e-05, |
| "loss": 0.0079, |
| "mean_token_accuracy": 0.9985835254192352, |
| "num_tokens": 4404451.0, |
| "step": 4170 |
| }, |
| { |
| "entropy": 1.5435897827148437, |
| "epoch": 2.5014961101137043, |
| "grad_norm": 0.359375, |
| "learning_rate": 3.4991023339317774e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9992982268333435, |
| "num_tokens": 4414958.0, |
| "step": 4180 |
| }, |
| { |
| "entropy": 1.4607853293418884, |
| "epoch": 2.5074805505685216, |
| "grad_norm": 0.048828125, |
| "learning_rate": 3.49311789347696e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996515691280365, |
| "num_tokens": 4425511.0, |
| "step": 4190 |
| }, |
| { |
| "entropy": 1.5232563734054565, |
| "epoch": 2.5134649910233393, |
| "grad_norm": 0.014892578125, |
| "learning_rate": 3.487133453022143e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996666669845581, |
| "num_tokens": 4436154.0, |
| "step": 4200 |
| }, |
| { |
| "entropy": 1.5244243621826172, |
| "epoch": 2.519449431478157, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 3.481149012567325e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 4446848.0, |
| "step": 4210 |
| }, |
| { |
| "entropy": 1.456788754463196, |
| "epoch": 2.5254338719329743, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.4751645721125075e-05, |
| "loss": 0.0027, |
| "mean_token_accuracy": 0.9989286601543427, |
| "num_tokens": 4457385.0, |
| "step": 4220 |
| }, |
| { |
| "entropy": 1.5002227187156678, |
| "epoch": 2.5314183123877916, |
| "grad_norm": 1.0703125, |
| "learning_rate": 3.4691801316576905e-05, |
| "loss": 0.0072, |
| "mean_token_accuracy": 0.9986241698265076, |
| "num_tokens": 4468134.0, |
| "step": 4230 |
| }, |
| { |
| "entropy": 1.5510728359222412, |
| "epoch": 2.5374027528426093, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 3.463195691202873e-05, |
| "loss": 0.0022, |
| "mean_token_accuracy": 0.9992902100086212, |
| "num_tokens": 4478691.0, |
| "step": 4240 |
| }, |
| { |
| "entropy": 1.5891472101211548, |
| "epoch": 2.5433871932974266, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 3.4572112507480546e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.9989309251308441, |
| "num_tokens": 4489250.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 1.6060064077377318, |
| "epoch": 2.5493716337522443, |
| "grad_norm": 0.026611328125, |
| "learning_rate": 3.4512268102932376e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 4499839.0, |
| "step": 4260 |
| }, |
| { |
| "entropy": 1.5444846272468566, |
| "epoch": 2.5553560742070616, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 3.44524236983842e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996632993221283, |
| "num_tokens": 4510430.0, |
| "step": 4270 |
| }, |
| { |
| "entropy": 1.5540019631385804, |
| "epoch": 2.561340514661879, |
| "grad_norm": 0.0130615234375, |
| "learning_rate": 3.439257929383602e-05, |
| "loss": 0.0035, |
| "mean_token_accuracy": 0.9989399433135986, |
| "num_tokens": 4521077.0, |
| "step": 4280 |
| }, |
| { |
| "entropy": 1.5348394870758058, |
| "epoch": 2.5673249551166966, |
| "grad_norm": 0.03369140625, |
| "learning_rate": 3.4332734889287854e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996376812458039, |
| "num_tokens": 4531630.0, |
| "step": 4290 |
| }, |
| { |
| "entropy": 1.5469215869903565, |
| "epoch": 2.5733093955715143, |
| "grad_norm": 0.043701171875, |
| "learning_rate": 3.427289048473968e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.9985177874565124, |
| "num_tokens": 4542041.0, |
| "step": 4300 |
| }, |
| { |
| "entropy": 1.5128920078277588, |
| "epoch": 2.5792938360263316, |
| "grad_norm": 0.1015625, |
| "learning_rate": 3.421304608019151e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4552585.0, |
| "step": 4310 |
| }, |
| { |
| "entropy": 1.52936589717865, |
| "epoch": 2.585278276481149, |
| "grad_norm": 0.55859375, |
| "learning_rate": 3.415320167564333e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9992699146270752, |
| "num_tokens": 4563067.0, |
| "step": 4320 |
| }, |
| { |
| "entropy": 1.5417215347290039, |
| "epoch": 2.5912627169359665, |
| "grad_norm": 0.06005859375, |
| "learning_rate": 3.409335727109515e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9993090510368348, |
| "num_tokens": 4573914.0, |
| "step": 4330 |
| }, |
| { |
| "entropy": 1.5155917763710023, |
| "epoch": 2.597247157390784, |
| "grad_norm": 0.578125, |
| "learning_rate": 3.403351286654698e-05, |
| "loss": 0.0022, |
| "mean_token_accuracy": 0.9992700695991517, |
| "num_tokens": 4584524.0, |
| "step": 4340 |
| }, |
| { |
| "entropy": 1.5353880643844604, |
| "epoch": 2.6032315978456015, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 3.39736684619988e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4595094.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 1.5418899774551391, |
| "epoch": 2.609216038300419, |
| "grad_norm": 0.1318359375, |
| "learning_rate": 3.3913824057450626e-05, |
| "loss": 0.0061, |
| "mean_token_accuracy": 0.9981540739536285, |
| "num_tokens": 4605752.0, |
| "step": 4360 |
| }, |
| { |
| "entropy": 1.5859925389289855, |
| "epoch": 2.6152004787552365, |
| "grad_norm": 0.0595703125, |
| "learning_rate": 3.3853979652902456e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9996491253376008, |
| "num_tokens": 4616330.0, |
| "step": 4370 |
| }, |
| { |
| "entropy": 1.5407701492309571, |
| "epoch": 2.621184919210054, |
| "grad_norm": 0.2578125, |
| "learning_rate": 3.379413524835428e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.999646645784378, |
| "num_tokens": 4626881.0, |
| "step": 4380 |
| }, |
| { |
| "entropy": 1.5098812222480773, |
| "epoch": 2.6271693596648715, |
| "grad_norm": 0.04931640625, |
| "learning_rate": 3.373429084380611e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4637420.0, |
| "step": 4390 |
| }, |
| { |
| "entropy": 1.5174281120300293, |
| "epoch": 2.633153800119689, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 3.3674446439257934e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 4648257.0, |
| "step": 4400 |
| }, |
| { |
| "entropy": 1.5267613768577575, |
| "epoch": 2.639138240574506, |
| "grad_norm": 0.05859375, |
| "learning_rate": 3.361460203470975e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.9981722176074982, |
| "num_tokens": 4658785.0, |
| "step": 4410 |
| }, |
| { |
| "entropy": 1.6269310355186462, |
| "epoch": 2.645122681029324, |
| "grad_norm": 0.007781982421875, |
| "learning_rate": 3.355475763016158e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9992606043815613, |
| "num_tokens": 4669411.0, |
| "step": 4420 |
| }, |
| { |
| "entropy": 1.5434075832366942, |
| "epoch": 2.651107121484141, |
| "grad_norm": 0.275390625, |
| "learning_rate": 3.3494913225613404e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4680058.0, |
| "step": 4430 |
| }, |
| { |
| "entropy": 1.5232465744018555, |
| "epoch": 2.657091561938959, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 3.343506882106523e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9996710538864135, |
| "num_tokens": 4690722.0, |
| "step": 4440 |
| }, |
| { |
| "entropy": 1.5052083373069762, |
| "epoch": 2.663076002393776, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 3.337522441651706e-05, |
| "loss": 0.0043, |
| "mean_token_accuracy": 0.9985235571861267, |
| "num_tokens": 4701334.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 1.4890037059783936, |
| "epoch": 2.669060442848594, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.331538001196888e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.999231505393982, |
| "num_tokens": 4711696.0, |
| "step": 4460 |
| }, |
| { |
| "entropy": 1.5000533580780029, |
| "epoch": 2.675044883303411, |
| "grad_norm": 0.0546875, |
| "learning_rate": 3.3255535607420706e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9996138989925385, |
| "num_tokens": 4722098.0, |
| "step": 4470 |
| }, |
| { |
| "entropy": 1.51080322265625, |
| "epoch": 2.6810293237582288, |
| "grad_norm": 0.044189453125, |
| "learning_rate": 3.3195691202872536e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 4732902.0, |
| "step": 4480 |
| }, |
| { |
| "entropy": 1.4408185839653016, |
| "epoch": 2.687013764213046, |
| "grad_norm": 0.63671875, |
| "learning_rate": 3.313584679832436e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.9989372074604035, |
| "num_tokens": 4743519.0, |
| "step": 4490 |
| }, |
| { |
| "entropy": 1.432204270362854, |
| "epoch": 2.6929982046678633, |
| "grad_norm": 0.54296875, |
| "learning_rate": 3.307600239377618e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 4754124.0, |
| "step": 4500 |
| }, |
| { |
| "entropy": 1.4771082758903504, |
| "epoch": 2.698982645122681, |
| "grad_norm": 0.205078125, |
| "learning_rate": 3.301615798922801e-05, |
| "loss": 0.0034, |
| "mean_token_accuracy": 0.9985949337482453, |
| "num_tokens": 4764763.0, |
| "step": 4510 |
| }, |
| { |
| "entropy": 1.4384092807769775, |
| "epoch": 2.7049670855774988, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 3.295631358467983e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9989839315414428, |
| "num_tokens": 4775386.0, |
| "step": 4520 |
| }, |
| { |
| "entropy": 1.4536986112594605, |
| "epoch": 2.710951526032316, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 3.289646918013166e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9989380478858948, |
| "num_tokens": 4785874.0, |
| "step": 4530 |
| }, |
| { |
| "entropy": 1.5010742783546447, |
| "epoch": 2.7169359664871333, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 3.2836624775583484e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4796434.0, |
| "step": 4540 |
| }, |
| { |
| "entropy": 1.477715837955475, |
| "epoch": 2.722920406941951, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 3.277678037103531e-05, |
| "loss": 0.0056, |
| "mean_token_accuracy": 0.9985794126987457, |
| "num_tokens": 4806938.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 1.5574210286140442, |
| "epoch": 2.7289048473967683, |
| "grad_norm": 0.0216064453125, |
| "learning_rate": 3.271693596648714e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 4817756.0, |
| "step": 4560 |
| }, |
| { |
| "entropy": 1.5755536079406738, |
| "epoch": 2.734889287851586, |
| "grad_norm": 0.8046875, |
| "learning_rate": 3.265709156193896e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9992808401584625, |
| "num_tokens": 4828491.0, |
| "step": 4570 |
| }, |
| { |
| "entropy": 1.4965651631355286, |
| "epoch": 2.7408737283064033, |
| "grad_norm": 0.0986328125, |
| "learning_rate": 3.259724715739078e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9992878794670105, |
| "num_tokens": 4839082.0, |
| "step": 4580 |
| }, |
| { |
| "entropy": 1.4643235802650452, |
| "epoch": 2.7468581687612206, |
| "grad_norm": 0.1640625, |
| "learning_rate": 3.253740275284261e-05, |
| "loss": 0.0085, |
| "mean_token_accuracy": 0.998158860206604, |
| "num_tokens": 4849584.0, |
| "step": 4590 |
| }, |
| { |
| "entropy": 1.4505114316940309, |
| "epoch": 2.7528426092160383, |
| "grad_norm": 0.40625, |
| "learning_rate": 3.247755834829443e-05, |
| "loss": 0.0039, |
| "mean_token_accuracy": 0.9993296384811401, |
| "num_tokens": 4860264.0, |
| "step": 4600 |
| }, |
| { |
| "entropy": 1.3683255314826965, |
| "epoch": 2.758827049670856, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 3.241771394374626e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9996655523777008, |
| "num_tokens": 4870822.0, |
| "step": 4610 |
| }, |
| { |
| "entropy": 1.3456801891326904, |
| "epoch": 2.7648114901256733, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 3.235786953919809e-05, |
| "loss": 0.0052, |
| "mean_token_accuracy": 0.9979090332984925, |
| "num_tokens": 4881289.0, |
| "step": 4620 |
| }, |
| { |
| "entropy": 1.3935713291168212, |
| "epoch": 2.7707959305804906, |
| "grad_norm": 0.16796875, |
| "learning_rate": 3.229802513464991e-05, |
| "loss": 0.0036, |
| "mean_token_accuracy": 0.9988677442073822, |
| "num_tokens": 4891678.0, |
| "step": 4630 |
| }, |
| { |
| "entropy": 1.4410547733306884, |
| "epoch": 2.7767803710353083, |
| "grad_norm": 0.232421875, |
| "learning_rate": 3.223818073010174e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9989205062389374, |
| "num_tokens": 4902128.0, |
| "step": 4640 |
| }, |
| { |
| "entropy": 1.4120708227157592, |
| "epoch": 2.7827648114901256, |
| "grad_norm": 0.150390625, |
| "learning_rate": 3.2178336325553564e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.999294513463974, |
| "num_tokens": 4912785.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 1.4368455290794373, |
| "epoch": 2.7887492519449433, |
| "grad_norm": 0.09375, |
| "learning_rate": 3.211849192100538e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9992712080478668, |
| "num_tokens": 4923429.0, |
| "step": 4660 |
| }, |
| { |
| "entropy": 1.436909818649292, |
| "epoch": 2.7947336923997605, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.205864751645721e-05, |
| "loss": 0.0037, |
| "mean_token_accuracy": 0.9989143192768097, |
| "num_tokens": 4934074.0, |
| "step": 4670 |
| }, |
| { |
| "entropy": 1.5463759303092957, |
| "epoch": 2.800718132854578, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 3.1998803111909035e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.999250841140747, |
| "num_tokens": 4944751.0, |
| "step": 4680 |
| }, |
| { |
| "entropy": 1.4755305051803589, |
| "epoch": 2.8067025733093955, |
| "grad_norm": 2.015625, |
| "learning_rate": 3.1938958707360865e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9992857158184052, |
| "num_tokens": 4955387.0, |
| "step": 4690 |
| }, |
| { |
| "entropy": 1.5100136518478393, |
| "epoch": 2.8126870137642133, |
| "grad_norm": 0.11279296875, |
| "learning_rate": 3.187911430281269e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4966201.0, |
| "step": 4700 |
| }, |
| { |
| "entropy": 1.4382469534873963, |
| "epoch": 2.8186714542190305, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 3.181926989826451e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 4976551.0, |
| "step": 4710 |
| }, |
| { |
| "entropy": 1.470636510848999, |
| "epoch": 2.824655894673848, |
| "grad_norm": 0.69140625, |
| "learning_rate": 3.175942549371634e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996108949184418, |
| "num_tokens": 4987218.0, |
| "step": 4720 |
| }, |
| { |
| "entropy": 1.53121098279953, |
| "epoch": 2.8306403351286655, |
| "grad_norm": 0.0234375, |
| "learning_rate": 3.1699581089168167e-05, |
| "loss": 0.0023, |
| "mean_token_accuracy": 0.9993226230144501, |
| "num_tokens": 4997799.0, |
| "step": 4730 |
| }, |
| { |
| "entropy": 1.4530461430549622, |
| "epoch": 2.836624775583483, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 3.163973668461998e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9989462852478027, |
| "num_tokens": 5008234.0, |
| "step": 4740 |
| }, |
| { |
| "entropy": 1.4743767023086547, |
| "epoch": 2.8426092160383005, |
| "grad_norm": 0.41015625, |
| "learning_rate": 3.1579892280071814e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9992892682552338, |
| "num_tokens": 5018644.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 1.4847406983375548, |
| "epoch": 2.848593656493118, |
| "grad_norm": 0.1103515625, |
| "learning_rate": 3.152004787552364e-05, |
| "loss": 0.005, |
| "mean_token_accuracy": 0.9986034750938415, |
| "num_tokens": 5029305.0, |
| "step": 4760 |
| }, |
| { |
| "entropy": 1.4679431319236755, |
| "epoch": 2.8545780969479355, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 3.146020347097546e-05, |
| "loss": 0.002, |
| "mean_token_accuracy": 0.999646645784378, |
| "num_tokens": 5039922.0, |
| "step": 4770 |
| }, |
| { |
| "entropy": 1.5108673334121705, |
| "epoch": 2.860562537402753, |
| "grad_norm": 0.04248046875, |
| "learning_rate": 3.140035906642729e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5050363.0, |
| "step": 4780 |
| }, |
| { |
| "entropy": 1.4920130252838135, |
| "epoch": 2.8665469778575705, |
| "grad_norm": 0.0194091796875, |
| "learning_rate": 3.1340514661879115e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5061011.0, |
| "step": 4790 |
| }, |
| { |
| "entropy": 1.4699716448783875, |
| "epoch": 2.872531418312388, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 3.1280670257330945e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 5071583.0, |
| "step": 4800 |
| }, |
| { |
| "entropy": 1.5170957326889039, |
| "epoch": 2.878515858767205, |
| "grad_norm": 0.028564453125, |
| "learning_rate": 3.122082585278277e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5082159.0, |
| "step": 4810 |
| }, |
| { |
| "entropy": 1.502884578704834, |
| "epoch": 2.884500299222023, |
| "grad_norm": 0.0235595703125, |
| "learning_rate": 3.1160981448234586e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 5092755.0, |
| "step": 4820 |
| }, |
| { |
| "entropy": 1.4837321162223815, |
| "epoch": 2.89048473967684, |
| "grad_norm": 0.3984375, |
| "learning_rate": 3.1101137043686416e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996296286582946, |
| "num_tokens": 5103320.0, |
| "step": 4830 |
| }, |
| { |
| "entropy": 1.4991668343544007, |
| "epoch": 2.8964691801316578, |
| "grad_norm": 0.337890625, |
| "learning_rate": 3.104129263913824e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9984982252120972, |
| "num_tokens": 5113891.0, |
| "step": 4840 |
| }, |
| { |
| "entropy": 1.528916025161743, |
| "epoch": 2.902453620586475, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 3.098144823459006e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9992779791355133, |
| "num_tokens": 5124448.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 1.5347569346427918, |
| "epoch": 2.9084380610412928, |
| "grad_norm": 0.03515625, |
| "learning_rate": 3.0921603830041894e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 5134840.0, |
| "step": 4860 |
| }, |
| { |
| "entropy": 1.5149821162223815, |
| "epoch": 2.91442250149611, |
| "grad_norm": 0.10205078125, |
| "learning_rate": 3.086175942549372e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996666669845581, |
| "num_tokens": 5145434.0, |
| "step": 4870 |
| }, |
| { |
| "entropy": 1.4977423548698425, |
| "epoch": 2.9204069419509278, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 3.080191502094554e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9996376812458039, |
| "num_tokens": 5155919.0, |
| "step": 4880 |
| }, |
| { |
| "entropy": 1.4655091524124146, |
| "epoch": 2.926391382405745, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 3.074207061639737e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5166450.0, |
| "step": 4890 |
| }, |
| { |
| "entropy": 1.4794137835502625, |
| "epoch": 2.9323758228605623, |
| "grad_norm": 0.0299072265625, |
| "learning_rate": 3.068222621184919e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5176892.0, |
| "step": 4900 |
| }, |
| { |
| "entropy": 1.497855818271637, |
| "epoch": 2.93836026331538, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 3.062238180730102e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9989420473575592, |
| "num_tokens": 5187383.0, |
| "step": 4910 |
| }, |
| { |
| "entropy": 1.5195743203163148, |
| "epoch": 2.9443447037701977, |
| "grad_norm": 0.0238037109375, |
| "learning_rate": 3.056253740275284e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9985025763511658, |
| "num_tokens": 5197908.0, |
| "step": 4920 |
| }, |
| { |
| "entropy": 1.4769899368286132, |
| "epoch": 2.950329144225015, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 3.0502692998204666e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5208366.0, |
| "step": 4930 |
| }, |
| { |
| "entropy": 1.4623586893081666, |
| "epoch": 2.9563135846798323, |
| "grad_norm": 0.08642578125, |
| "learning_rate": 3.0442848593656496e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 5219004.0, |
| "step": 4940 |
| }, |
| { |
| "entropy": 1.4855722069740296, |
| "epoch": 2.96229802513465, |
| "grad_norm": 0.265625, |
| "learning_rate": 3.038300418910832e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.9985096752643585, |
| "num_tokens": 5229507.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 1.4608722567558288, |
| "epoch": 2.9682824655894673, |
| "grad_norm": 0.072265625, |
| "learning_rate": 3.0323159784560143e-05, |
| "loss": 0.0046, |
| "mean_token_accuracy": 0.9985809206962586, |
| "num_tokens": 5240053.0, |
| "step": 4960 |
| }, |
| { |
| "entropy": 1.4974953174591064, |
| "epoch": 2.974266906044285, |
| "grad_norm": 0.169921875, |
| "learning_rate": 3.026331538001197e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9989298462867737, |
| "num_tokens": 5250618.0, |
| "step": 4970 |
| }, |
| { |
| "entropy": 1.5181520104408264, |
| "epoch": 2.9802513464991023, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 3.0203470975463794e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 5261074.0, |
| "step": 4980 |
| }, |
| { |
| "entropy": 1.4660282492637635, |
| "epoch": 2.9862357869539196, |
| "grad_norm": 0.076171875, |
| "learning_rate": 3.0143626570915624e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5271530.0, |
| "step": 4990 |
| }, |
| { |
| "entropy": 1.506136167049408, |
| "epoch": 2.9922202274087373, |
| "grad_norm": 0.578125, |
| "learning_rate": 3.0083782166367448e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9985557436943054, |
| "num_tokens": 5282008.0, |
| "step": 5000 |
| }, |
| { |
| "entropy": 1.4818431258201599, |
| "epoch": 2.998204667863555, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.0023937761819268e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9993121862411499, |
| "num_tokens": 5292635.0, |
| "step": 5010 |
| }, |
| { |
| "entropy": 1.4319651007652283, |
| "epoch": 3.0041891083183723, |
| "grad_norm": 0.134765625, |
| "learning_rate": 2.9964093357271095e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 5303190.0, |
| "step": 5020 |
| }, |
| { |
| "entropy": 1.4162240147590637, |
| "epoch": 3.0101735487731895, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 2.9904248952722922e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5313752.0, |
| "step": 5030 |
| }, |
| { |
| "entropy": 1.4751434803009034, |
| "epoch": 3.0161579892280073, |
| "grad_norm": 0.028564453125, |
| "learning_rate": 2.984440454817475e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9996268630027771, |
| "num_tokens": 5324371.0, |
| "step": 5040 |
| }, |
| { |
| "entropy": 1.4873990535736084, |
| "epoch": 3.0221424296828245, |
| "grad_norm": 0.6328125, |
| "learning_rate": 2.978456014362657e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996563553810119, |
| "num_tokens": 5334881.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 1.4645564079284668, |
| "epoch": 3.0281268701376423, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 2.9724715739078396e-05, |
| "loss": 0.002, |
| "mean_token_accuracy": 0.9992962300777435, |
| "num_tokens": 5345310.0, |
| "step": 5060 |
| }, |
| { |
| "entropy": 1.5135003924369812, |
| "epoch": 3.0341113105924595, |
| "grad_norm": 0.0390625, |
| "learning_rate": 2.9664871334530223e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.999672132730484, |
| "num_tokens": 5355813.0, |
| "step": 5070 |
| }, |
| { |
| "entropy": 1.5595512986183167, |
| "epoch": 3.0400957510472773, |
| "grad_norm": 0.03076171875, |
| "learning_rate": 2.960502692998205e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5366358.0, |
| "step": 5080 |
| }, |
| { |
| "entropy": 1.5525766372680665, |
| "epoch": 3.0460801915020945, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 2.954518252543387e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996138989925385, |
| "num_tokens": 5376822.0, |
| "step": 5090 |
| }, |
| { |
| "entropy": 1.518768072128296, |
| "epoch": 3.0520646319569122, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 2.9485338120885697e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9992601990699768, |
| "num_tokens": 5387230.0, |
| "step": 5100 |
| }, |
| { |
| "entropy": 1.5416632294654846, |
| "epoch": 3.0580490724117295, |
| "grad_norm": 0.5859375, |
| "learning_rate": 2.9425493716337524e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9988771200180053, |
| "num_tokens": 5397736.0, |
| "step": 5110 |
| }, |
| { |
| "entropy": 1.587303638458252, |
| "epoch": 3.064033512866547, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 2.936564931178935e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996731996536254, |
| "num_tokens": 5408278.0, |
| "step": 5120 |
| }, |
| { |
| "entropy": 1.5591753721237183, |
| "epoch": 3.0700179533213645, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 2.930580490724117e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5418970.0, |
| "step": 5130 |
| }, |
| { |
| "entropy": 1.6015469908714295, |
| "epoch": 3.076002393776182, |
| "grad_norm": 0.32421875, |
| "learning_rate": 2.9245960502693e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5429604.0, |
| "step": 5140 |
| }, |
| { |
| "entropy": 1.5480207562446595, |
| "epoch": 3.0819868342309995, |
| "grad_norm": 0.62890625, |
| "learning_rate": 2.9186116098144825e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9992478489875793, |
| "num_tokens": 5440142.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 1.5280380129814148, |
| "epoch": 3.087971274685817, |
| "grad_norm": 0.314453125, |
| "learning_rate": 2.9126271693596652e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996551752090455, |
| "num_tokens": 5450721.0, |
| "step": 5160 |
| }, |
| { |
| "entropy": 1.5630383491516113, |
| "epoch": 3.0939557151406345, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 2.9066427289048473e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5461260.0, |
| "step": 5170 |
| }, |
| { |
| "entropy": 1.5174272894859313, |
| "epoch": 3.0999401555954518, |
| "grad_norm": 0.443359375, |
| "learning_rate": 2.90065828845003e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996632993221283, |
| "num_tokens": 5471701.0, |
| "step": 5180 |
| }, |
| { |
| "entropy": 1.5312930703163148, |
| "epoch": 3.1059245960502695, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.8946738479952127e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996655523777008, |
| "num_tokens": 5482325.0, |
| "step": 5190 |
| }, |
| { |
| "entropy": 1.5398496747016908, |
| "epoch": 3.1119090365050868, |
| "grad_norm": 0.007232666015625, |
| "learning_rate": 2.888689407540395e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5492966.0, |
| "step": 5200 |
| }, |
| { |
| "entropy": 1.550033414363861, |
| "epoch": 3.117893476959904, |
| "grad_norm": 0.06103515625, |
| "learning_rate": 2.8827049670855774e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9989342570304871, |
| "num_tokens": 5503575.0, |
| "step": 5210 |
| }, |
| { |
| "entropy": 1.430702805519104, |
| "epoch": 3.1238779174147218, |
| "grad_norm": 0.328125, |
| "learning_rate": 2.87672052663076e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5514038.0, |
| "step": 5220 |
| }, |
| { |
| "entropy": 1.5385449290275575, |
| "epoch": 3.129862357869539, |
| "grad_norm": 0.0390625, |
| "learning_rate": 2.8707360861759428e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996376812458039, |
| "num_tokens": 5524786.0, |
| "step": 5230 |
| }, |
| { |
| "entropy": 1.4950608015060425, |
| "epoch": 3.1358467983243568, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 2.864751645721125e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 5535199.0, |
| "step": 5240 |
| }, |
| { |
| "entropy": 1.506748330593109, |
| "epoch": 3.141831238779174, |
| "grad_norm": 0.015625, |
| "learning_rate": 2.8587672052663075e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 5545783.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 1.5544589042663575, |
| "epoch": 3.1478156792339917, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 2.8527827648114902e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9992754220962524, |
| "num_tokens": 5556377.0, |
| "step": 5260 |
| }, |
| { |
| "entropy": 1.5457674264907837, |
| "epoch": 3.153800119688809, |
| "grad_norm": 0.076171875, |
| "learning_rate": 2.846798324356673e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 5566903.0, |
| "step": 5270 |
| }, |
| { |
| "entropy": 1.5506306290626526, |
| "epoch": 3.1597845601436267, |
| "grad_norm": 0.10205078125, |
| "learning_rate": 2.8408138839018552e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5577356.0, |
| "step": 5280 |
| }, |
| { |
| "entropy": 1.5602880001068116, |
| "epoch": 3.165769000598444, |
| "grad_norm": 0.388671875, |
| "learning_rate": 2.8348294434470376e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9989531815052033, |
| "num_tokens": 5587965.0, |
| "step": 5290 |
| }, |
| { |
| "entropy": 1.563979482650757, |
| "epoch": 3.1717534410532613, |
| "grad_norm": 0.0147705078125, |
| "learning_rate": 2.8288450029922203e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5598333.0, |
| "step": 5300 |
| }, |
| { |
| "entropy": 1.5785459518432616, |
| "epoch": 3.177737881508079, |
| "grad_norm": 0.056396484375, |
| "learning_rate": 2.822860562537403e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 5608941.0, |
| "step": 5310 |
| }, |
| { |
| "entropy": 1.589229130744934, |
| "epoch": 3.1837223219628963, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 2.8168761220825854e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.999609375, |
| "num_tokens": 5619596.0, |
| "step": 5320 |
| }, |
| { |
| "entropy": 1.5809313297271728, |
| "epoch": 3.189706762417714, |
| "grad_norm": 0.0224609375, |
| "learning_rate": 2.8108916816277677e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5630232.0, |
| "step": 5330 |
| }, |
| { |
| "entropy": 1.5661462068557739, |
| "epoch": 3.1956912028725313, |
| "grad_norm": 0.024169921875, |
| "learning_rate": 2.8049072411729504e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5640742.0, |
| "step": 5340 |
| }, |
| { |
| "entropy": 1.5573038578033447, |
| "epoch": 3.201675643327349, |
| "grad_norm": 0.0302734375, |
| "learning_rate": 2.7989228007181328e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.99963099360466, |
| "num_tokens": 5651153.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 1.5646984577178955, |
| "epoch": 3.2076600837821663, |
| "grad_norm": 0.035400390625, |
| "learning_rate": 2.7929383602633155e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.999304324388504, |
| "num_tokens": 5661922.0, |
| "step": 5360 |
| }, |
| { |
| "entropy": 1.5028715133666992, |
| "epoch": 3.213644524236984, |
| "grad_norm": 0.349609375, |
| "learning_rate": 2.786953919808498e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9992653369903565, |
| "num_tokens": 5672420.0, |
| "step": 5370 |
| }, |
| { |
| "entropy": 1.5278252124786378, |
| "epoch": 3.2196289646918013, |
| "grad_norm": 0.028076171875, |
| "learning_rate": 2.7809694793536805e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5682901.0, |
| "step": 5380 |
| }, |
| { |
| "entropy": 1.564482367038727, |
| "epoch": 3.225613405146619, |
| "grad_norm": 0.408203125, |
| "learning_rate": 2.774985038898863e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 5693412.0, |
| "step": 5390 |
| }, |
| { |
| "entropy": 1.507829475402832, |
| "epoch": 3.2315978456014363, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 2.7690005984440456e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 5703833.0, |
| "step": 5400 |
| }, |
| { |
| "entropy": 1.5307364106178283, |
| "epoch": 3.2375822860562535, |
| "grad_norm": 0.057373046875, |
| "learning_rate": 2.7630161579892283e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9992765545845032, |
| "num_tokens": 5714464.0, |
| "step": 5410 |
| }, |
| { |
| "entropy": 1.5907382130622865, |
| "epoch": 3.2435667265110713, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 2.7570317175344107e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 5725108.0, |
| "step": 5420 |
| }, |
| { |
| "entropy": 1.5480499148368836, |
| "epoch": 3.2495511669658885, |
| "grad_norm": 0.02978515625, |
| "learning_rate": 2.751047277079593e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996632993221283, |
| "num_tokens": 5735618.0, |
| "step": 5430 |
| }, |
| { |
| "entropy": 1.4962282299995422, |
| "epoch": 3.2555356074207062, |
| "grad_norm": 0.404296875, |
| "learning_rate": 2.7450628366247757e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996350347995758, |
| "num_tokens": 5746133.0, |
| "step": 5440 |
| }, |
| { |
| "entropy": 1.5920798301696777, |
| "epoch": 3.2615200478755235, |
| "grad_norm": 0.10546875, |
| "learning_rate": 2.7390783961699584e-05, |
| "loss": 0.0022, |
| "mean_token_accuracy": 0.9996240615844727, |
| "num_tokens": 5756785.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 1.597427499294281, |
| "epoch": 3.2675044883303412, |
| "grad_norm": 0.01495361328125, |
| "learning_rate": 2.7330939557151408e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996062994003296, |
| "num_tokens": 5767408.0, |
| "step": 5460 |
| }, |
| { |
| "entropy": 1.5337183713912963, |
| "epoch": 3.2734889287851585, |
| "grad_norm": 0.63671875, |
| "learning_rate": 2.727109515260323e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9989434540271759, |
| "num_tokens": 5777792.0, |
| "step": 5470 |
| }, |
| { |
| "entropy": 1.564418363571167, |
| "epoch": 3.2794733692399762, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 2.7211250748055058e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5788108.0, |
| "step": 5480 |
| }, |
| { |
| "entropy": 1.6177725791931152, |
| "epoch": 3.2854578096947935, |
| "grad_norm": 0.099609375, |
| "learning_rate": 2.7151406343506885e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5798863.0, |
| "step": 5490 |
| }, |
| { |
| "entropy": 1.671896493434906, |
| "epoch": 3.2914422501496112, |
| "grad_norm": 0.0078125, |
| "learning_rate": 2.7091561938958705e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5809517.0, |
| "step": 5500 |
| }, |
| { |
| "entropy": 1.5772828936576844, |
| "epoch": 3.2974266906044285, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 2.7031717534410532e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9992832183837891, |
| "num_tokens": 5819979.0, |
| "step": 5510 |
| }, |
| { |
| "entropy": 1.6201642036437989, |
| "epoch": 3.3034111310592458, |
| "grad_norm": 0.74609375, |
| "learning_rate": 2.697187312986236e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996296286582946, |
| "num_tokens": 5830537.0, |
| "step": 5520 |
| }, |
| { |
| "entropy": 1.534337544441223, |
| "epoch": 3.3093955715140635, |
| "grad_norm": 0.03173828125, |
| "learning_rate": 2.6912028725314186e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9989047467708587, |
| "num_tokens": 5840879.0, |
| "step": 5530 |
| }, |
| { |
| "entropy": 1.4968370914459228, |
| "epoch": 3.3153800119688808, |
| "grad_norm": 0.62890625, |
| "learning_rate": 2.6852184320766007e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9993019163608551, |
| "num_tokens": 5851453.0, |
| "step": 5540 |
| }, |
| { |
| "entropy": 1.5559372425079345, |
| "epoch": 3.3213644524236985, |
| "grad_norm": 0.1640625, |
| "learning_rate": 2.6792339916217834e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5862229.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 1.4817229628562927, |
| "epoch": 3.3273488928785158, |
| "grad_norm": 0.11328125, |
| "learning_rate": 2.673249551166966e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996835470199585, |
| "num_tokens": 5872759.0, |
| "step": 5560 |
| }, |
| { |
| "entropy": 1.4754049897193908, |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.004852294921875, |
| "learning_rate": 2.6672651107121488e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9992820382118225, |
| "num_tokens": 5883416.0, |
| "step": 5570 |
| }, |
| { |
| "entropy": 1.499694275856018, |
| "epoch": 3.3393177737881508, |
| "grad_norm": 0.01019287109375, |
| "learning_rate": 2.6612806702573308e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5893944.0, |
| "step": 5580 |
| }, |
| { |
| "entropy": 1.5177325487136841, |
| "epoch": 3.3453022142429685, |
| "grad_norm": 0.006927490234375, |
| "learning_rate": 2.6552962298025135e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9993056535720826, |
| "num_tokens": 5904513.0, |
| "step": 5590 |
| }, |
| { |
| "entropy": 1.5602369070053101, |
| "epoch": 3.3512866546977857, |
| "grad_norm": 0.64453125, |
| "learning_rate": 2.6493117893476962e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.999676376581192, |
| "num_tokens": 5915107.0, |
| "step": 5600 |
| }, |
| { |
| "entropy": 1.4615213394165039, |
| "epoch": 3.357271095152603, |
| "grad_norm": 0.006103515625, |
| "learning_rate": 2.643327348892879e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996539771556854, |
| "num_tokens": 5925481.0, |
| "step": 5610 |
| }, |
| { |
| "entropy": 1.5230321168899537, |
| "epoch": 3.3632555356074207, |
| "grad_norm": 0.04736328125, |
| "learning_rate": 2.637342908438061e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 5935980.0, |
| "step": 5620 |
| }, |
| { |
| "entropy": 1.4605862855911256, |
| "epoch": 3.369239976062238, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 2.6313584679832436e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9993006646633148, |
| "num_tokens": 5946578.0, |
| "step": 5630 |
| }, |
| { |
| "entropy": 1.5047585487365722, |
| "epoch": 3.3752244165170557, |
| "grad_norm": 0.046875, |
| "learning_rate": 2.6253740275284263e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 5957159.0, |
| "step": 5640 |
| }, |
| { |
| "entropy": 1.4994407773017884, |
| "epoch": 3.381208856971873, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 2.6193895870736087e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9992932856082917, |
| "num_tokens": 5967691.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 1.5766767621040345, |
| "epoch": 3.3871932974266907, |
| "grad_norm": 0.228515625, |
| "learning_rate": 2.613405146618791e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996254682540894, |
| "num_tokens": 5978513.0, |
| "step": 5660 |
| }, |
| { |
| "entropy": 1.5303641319274903, |
| "epoch": 3.393177737881508, |
| "grad_norm": 0.359375, |
| "learning_rate": 2.6074207061639737e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.999622642993927, |
| "num_tokens": 5989129.0, |
| "step": 5670 |
| }, |
| { |
| "entropy": 1.518871784210205, |
| "epoch": 3.3991621783363257, |
| "grad_norm": 0.3125, |
| "learning_rate": 2.6014362657091564e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.999292927980423, |
| "num_tokens": 5999622.0, |
| "step": 5680 |
| }, |
| { |
| "entropy": 1.5809160351753235, |
| "epoch": 3.405146618791143, |
| "grad_norm": 0.220703125, |
| "learning_rate": 2.5954518252543388e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6010229.0, |
| "step": 5690 |
| }, |
| { |
| "entropy": 1.514602744579315, |
| "epoch": 3.4111310592459603, |
| "grad_norm": 0.59375, |
| "learning_rate": 2.589467384799521e-05, |
| "loss": 0.0022, |
| "mean_token_accuracy": 0.9993149936199188, |
| "num_tokens": 6020944.0, |
| "step": 5700 |
| }, |
| { |
| "entropy": 1.5365632772445679, |
| "epoch": 3.417115499700778, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 2.5834829443447038e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6031625.0, |
| "step": 5710 |
| }, |
| { |
| "entropy": 1.4886379003524781, |
| "epoch": 3.4230999401555953, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 2.5774985038898865e-05, |
| "loss": 0.0032, |
| "mean_token_accuracy": 0.9985065758228302, |
| "num_tokens": 6042088.0, |
| "step": 5720 |
| }, |
| { |
| "entropy": 1.5176212072372437, |
| "epoch": 3.429084380610413, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 2.571514063435069e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9996731996536254, |
| "num_tokens": 6052684.0, |
| "step": 5730 |
| }, |
| { |
| "entropy": 1.445726454257965, |
| "epoch": 3.4350688210652303, |
| "grad_norm": 0.047607421875, |
| "learning_rate": 2.5655296229802512e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6063071.0, |
| "step": 5740 |
| }, |
| { |
| "entropy": 1.516676378250122, |
| "epoch": 3.441053261520048, |
| "grad_norm": 0.0030059814453125, |
| "learning_rate": 2.559545182525434e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6073640.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 1.500998282432556, |
| "epoch": 3.4470377019748653, |
| "grad_norm": 0.053466796875, |
| "learning_rate": 2.5535607420706166e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6084262.0, |
| "step": 5760 |
| }, |
| { |
| "entropy": 1.481552255153656, |
| "epoch": 3.453022142429683, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 2.547576301615799e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996621608734131, |
| "num_tokens": 6094845.0, |
| "step": 5770 |
| }, |
| { |
| "entropy": 1.5202495098114013, |
| "epoch": 3.4590065828845002, |
| "grad_norm": 0.027587890625, |
| "learning_rate": 2.5415918611609814e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6105409.0, |
| "step": 5780 |
| }, |
| { |
| "entropy": 1.506263256072998, |
| "epoch": 3.464991023339318, |
| "grad_norm": 0.031005859375, |
| "learning_rate": 2.535607420706164e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9992824077606202, |
| "num_tokens": 6116062.0, |
| "step": 5790 |
| }, |
| { |
| "entropy": 1.5448615550994873, |
| "epoch": 3.4709754637941352, |
| "grad_norm": 0.17578125, |
| "learning_rate": 2.5296229802513464e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 6126660.0, |
| "step": 5800 |
| }, |
| { |
| "entropy": 1.4823444962501526, |
| "epoch": 3.476959904248953, |
| "grad_norm": 0.030029296875, |
| "learning_rate": 2.523638539796529e-05, |
| "loss": 0.0027, |
| "mean_token_accuracy": 0.9990337431430817, |
| "num_tokens": 6137252.0, |
| "step": 5810 |
| }, |
| { |
| "entropy": 1.461562943458557, |
| "epoch": 3.4829443447037702, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 2.5176540993417115e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9993062913417816, |
| "num_tokens": 6147825.0, |
| "step": 5820 |
| }, |
| { |
| "entropy": 1.4455949544906617, |
| "epoch": 3.4889287851585875, |
| "grad_norm": 0.0250244140625, |
| "learning_rate": 2.5116696588868942e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6158342.0, |
| "step": 5830 |
| }, |
| { |
| "entropy": 1.458959984779358, |
| "epoch": 3.4949132256134052, |
| "grad_norm": 0.04248046875, |
| "learning_rate": 2.5056852184320765e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9992868721485137, |
| "num_tokens": 6168949.0, |
| "step": 5840 |
| }, |
| { |
| "entropy": 1.5214141845703124, |
| "epoch": 3.5008976660682225, |
| "grad_norm": 0.78125, |
| "learning_rate": 2.4997007779772592e-05, |
| "loss": 0.002, |
| "mean_token_accuracy": 0.9989701211452484, |
| "num_tokens": 6179583.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 1.492224383354187, |
| "epoch": 3.50688210652304, |
| "grad_norm": 0.126953125, |
| "learning_rate": 2.493716337522442e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6190163.0, |
| "step": 5860 |
| }, |
| { |
| "entropy": 1.4495007276535035, |
| "epoch": 3.5128665469778575, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 2.4877318970676243e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6200698.0, |
| "step": 5870 |
| }, |
| { |
| "entropy": 1.4565007090568542, |
| "epoch": 3.5188509874326748, |
| "grad_norm": 0.1533203125, |
| "learning_rate": 2.4817474566128067e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 6211284.0, |
| "step": 5880 |
| }, |
| { |
| "entropy": 1.499528968334198, |
| "epoch": 3.5248354278874925, |
| "grad_norm": 0.028076171875, |
| "learning_rate": 2.4757630161579894e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996655523777008, |
| "num_tokens": 6221944.0, |
| "step": 5890 |
| }, |
| { |
| "entropy": 1.4583883881568909, |
| "epoch": 3.53081986834231, |
| "grad_norm": 0.59765625, |
| "learning_rate": 2.469778575703172e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9993514120578766, |
| "num_tokens": 6232700.0, |
| "step": 5900 |
| }, |
| { |
| "entropy": 1.5127885818481446, |
| "epoch": 3.5368043087971275, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 2.4637941352483544e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6243487.0, |
| "step": 5910 |
| }, |
| { |
| "entropy": 1.5247217893600464, |
| "epoch": 3.5427887492519448, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 2.4578096947935368e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996884763240814, |
| "num_tokens": 6254214.0, |
| "step": 5920 |
| }, |
| { |
| "entropy": 1.465972125530243, |
| "epoch": 3.5487731897067625, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 2.4518252543387195e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996563553810119, |
| "num_tokens": 6264680.0, |
| "step": 5930 |
| }, |
| { |
| "entropy": 1.4788655638694763, |
| "epoch": 3.5547576301615798, |
| "grad_norm": 0.0203857421875, |
| "learning_rate": 2.445840813883902e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6275332.0, |
| "step": 5940 |
| }, |
| { |
| "entropy": 1.4835572361946106, |
| "epoch": 3.5607420706163975, |
| "grad_norm": 0.7734375, |
| "learning_rate": 2.4398563734290842e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9992424249649048, |
| "num_tokens": 6285916.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 1.4669729709625243, |
| "epoch": 3.5667265110712147, |
| "grad_norm": 0.70703125, |
| "learning_rate": 2.433871932974267e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9985170543193818, |
| "num_tokens": 6296444.0, |
| "step": 5960 |
| }, |
| { |
| "entropy": 1.4348384976387023, |
| "epoch": 3.5727109515260325, |
| "grad_norm": 0.060302734375, |
| "learning_rate": 2.4278874925194496e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9993228852748871, |
| "num_tokens": 6307078.0, |
| "step": 5970 |
| }, |
| { |
| "entropy": 1.5015219926834107, |
| "epoch": 3.5786953919808497, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 2.4219030520646323e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6317497.0, |
| "step": 5980 |
| }, |
| { |
| "entropy": 1.56508469581604, |
| "epoch": 3.5846798324356675, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 2.4159186116098143e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6328052.0, |
| "step": 5990 |
| }, |
| { |
| "entropy": 1.4645266771316527, |
| "epoch": 3.5906642728904847, |
| "grad_norm": 0.11572265625, |
| "learning_rate": 2.409934171154997e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 6338487.0, |
| "step": 6000 |
| }, |
| { |
| "entropy": 1.4335883975028991, |
| "epoch": 3.596648713345302, |
| "grad_norm": 0.083984375, |
| "learning_rate": 2.4039497307001797e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6349011.0, |
| "step": 6010 |
| }, |
| { |
| "entropy": 1.4607133984565734, |
| "epoch": 3.6026331538001197, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 2.3979652902453624e-05, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6359518.0, |
| "step": 6020 |
| }, |
| { |
| "entropy": 1.505584192276001, |
| "epoch": 3.608617594254937, |
| "grad_norm": 0.006195068359375, |
| "learning_rate": 2.3919808497905444e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996491253376008, |
| "num_tokens": 6370041.0, |
| "step": 6030 |
| }, |
| { |
| "entropy": 1.4937905073165894, |
| "epoch": 3.6146020347097547, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 2.385996409335727e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6380815.0, |
| "step": 6040 |
| }, |
| { |
| "entropy": 1.4493468403816223, |
| "epoch": 3.620586475164572, |
| "grad_norm": 0.06005859375, |
| "learning_rate": 2.3800119688809098e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6391275.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 1.5009562849998475, |
| "epoch": 3.6265709156193897, |
| "grad_norm": 0.0150146484375, |
| "learning_rate": 2.3740275284260925e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6401857.0, |
| "step": 6060 |
| }, |
| { |
| "entropy": 1.4806856632232666, |
| "epoch": 3.632555356074207, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 2.3680430879712745e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9989627659320831, |
| "num_tokens": 6412386.0, |
| "step": 6070 |
| }, |
| { |
| "entropy": 1.4495960474014282, |
| "epoch": 3.6385397965290247, |
| "grad_norm": 0.0235595703125, |
| "learning_rate": 2.3620586475164572e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6422788.0, |
| "step": 6080 |
| }, |
| { |
| "entropy": 1.4762946724891663, |
| "epoch": 3.644524236983842, |
| "grad_norm": 0.02783203125, |
| "learning_rate": 2.35607420706164e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9992830693721771, |
| "num_tokens": 6433332.0, |
| "step": 6090 |
| }, |
| { |
| "entropy": 1.4833276271820068, |
| "epoch": 3.6505086774386593, |
| "grad_norm": 0.5703125, |
| "learning_rate": 2.3500897666068223e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 6444112.0, |
| "step": 6100 |
| }, |
| { |
| "entropy": 1.4752602219581603, |
| "epoch": 3.656493117893477, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 2.3441053261520047e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9992787480354309, |
| "num_tokens": 6454710.0, |
| "step": 6110 |
| }, |
| { |
| "entropy": 1.5009476304054261, |
| "epoch": 3.6624775583482947, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 2.3381208856971874e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6465333.0, |
| "step": 6120 |
| }, |
| { |
| "entropy": 1.49513281583786, |
| "epoch": 3.668461998803112, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 2.33213644524237e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6476047.0, |
| "step": 6130 |
| }, |
| { |
| "entropy": 1.4687823891639709, |
| "epoch": 3.6744464392579292, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 2.3261520047875524e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6486587.0, |
| "step": 6140 |
| }, |
| { |
| "entropy": 1.5005357146263123, |
| "epoch": 3.680430879712747, |
| "grad_norm": 0.138671875, |
| "learning_rate": 2.3201675643327348e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996212124824524, |
| "num_tokens": 6497319.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 1.5394233107566833, |
| "epoch": 3.6864153201675642, |
| "grad_norm": 0.0703125, |
| "learning_rate": 2.3141831238779175e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6508086.0, |
| "step": 6160 |
| }, |
| { |
| "entropy": 1.4763020753860474, |
| "epoch": 3.692399760622382, |
| "grad_norm": 0.02880859375, |
| "learning_rate": 2.3081986834231e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9992766141891479, |
| "num_tokens": 6518792.0, |
| "step": 6170 |
| }, |
| { |
| "entropy": 1.4043538212776183, |
| "epoch": 3.6983842010771992, |
| "grad_norm": 0.83203125, |
| "learning_rate": 2.3022142429682825e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.999609375, |
| "num_tokens": 6529300.0, |
| "step": 6180 |
| }, |
| { |
| "entropy": 1.4720135927200317, |
| "epoch": 3.7043686415320165, |
| "grad_norm": 0.1279296875, |
| "learning_rate": 2.296229802513465e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6539909.0, |
| "step": 6190 |
| }, |
| { |
| "entropy": 1.4578587770462037, |
| "epoch": 3.7103530819868342, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 2.2902453620586476e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9992994546890259, |
| "num_tokens": 6550391.0, |
| "step": 6200 |
| }, |
| { |
| "entropy": 1.4950265049934388, |
| "epoch": 3.716337522441652, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 2.2842609216038303e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6560907.0, |
| "step": 6210 |
| }, |
| { |
| "entropy": 1.487864351272583, |
| "epoch": 3.722321962896469, |
| "grad_norm": 0.6328125, |
| "learning_rate": 2.2782764811490126e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996282517910003, |
| "num_tokens": 6571382.0, |
| "step": 6220 |
| }, |
| { |
| "entropy": 1.4353316664695739, |
| "epoch": 3.7283064033512865, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 2.272292040694195e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996268630027771, |
| "num_tokens": 6582007.0, |
| "step": 6230 |
| }, |
| { |
| "entropy": 1.5229133486747741, |
| "epoch": 3.734290843806104, |
| "grad_norm": 0.004180908203125, |
| "learning_rate": 2.2663076002393777e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6592605.0, |
| "step": 6240 |
| }, |
| { |
| "entropy": 1.498645055294037, |
| "epoch": 3.7402752842609215, |
| "grad_norm": 0.017578125, |
| "learning_rate": 2.26032315978456e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9989560008049011, |
| "num_tokens": 6603206.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 1.4413471937179565, |
| "epoch": 3.746259724715739, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 2.2543387193297428e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 6613700.0, |
| "step": 6260 |
| }, |
| { |
| "entropy": 1.4817732572555542, |
| "epoch": 3.7522441651705565, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 2.248354278874925e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6624266.0, |
| "step": 6270 |
| }, |
| { |
| "entropy": 1.485778033733368, |
| "epoch": 3.7582286056253738, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 2.2423698384201078e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 6634667.0, |
| "step": 6280 |
| }, |
| { |
| "entropy": 1.5231386780738831, |
| "epoch": 3.7642130460801915, |
| "grad_norm": 0.08740234375, |
| "learning_rate": 2.2363853979652902e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9996153831481933, |
| "num_tokens": 6645075.0, |
| "step": 6290 |
| }, |
| { |
| "entropy": 1.46702800989151, |
| "epoch": 3.770197486535009, |
| "grad_norm": 0.126953125, |
| "learning_rate": 2.230400957510473e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9992537319660186, |
| "num_tokens": 6655535.0, |
| "step": 6300 |
| }, |
| { |
| "entropy": 1.473765778541565, |
| "epoch": 3.7761819269898265, |
| "grad_norm": 0.0458984375, |
| "learning_rate": 2.2244165170556556e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6665998.0, |
| "step": 6310 |
| }, |
| { |
| "entropy": 1.5671903252601624, |
| "epoch": 3.7821663674446437, |
| "grad_norm": 0.045654296875, |
| "learning_rate": 2.218432076600838e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6676480.0, |
| "step": 6320 |
| }, |
| { |
| "entropy": 1.5145403623580933, |
| "epoch": 3.7881508078994615, |
| "grad_norm": 0.0294189453125, |
| "learning_rate": 2.2124476361460203e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 6686964.0, |
| "step": 6330 |
| }, |
| { |
| "entropy": 1.5250129222869873, |
| "epoch": 3.7941352483542787, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 2.206463195691203e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9992864787578583, |
| "num_tokens": 6697518.0, |
| "step": 6340 |
| }, |
| { |
| "entropy": 1.533209502696991, |
| "epoch": 3.8001196888090965, |
| "grad_norm": 0.03125, |
| "learning_rate": 2.2004787552363857e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 6708198.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 1.5412673473358154, |
| "epoch": 3.8061041292639137, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 2.194494314781568e-05, |
| "loss": 0.0026, |
| "mean_token_accuracy": 0.9992578804492951, |
| "num_tokens": 6718602.0, |
| "step": 6360 |
| }, |
| { |
| "entropy": 1.5716055989265443, |
| "epoch": 3.8120885697187314, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.1885098743267504e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9986324310302734, |
| "num_tokens": 6729226.0, |
| "step": 6370 |
| }, |
| { |
| "entropy": 1.5873057842254639, |
| "epoch": 3.8180730101735487, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 2.182525433871933e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6739892.0, |
| "step": 6380 |
| }, |
| { |
| "entropy": 1.5300064086914062, |
| "epoch": 3.8240574506283664, |
| "grad_norm": 0.03955078125, |
| "learning_rate": 2.1765409934171158e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996197700500489, |
| "num_tokens": 6750433.0, |
| "step": 6390 |
| }, |
| { |
| "entropy": 1.5160815238952636, |
| "epoch": 3.8300418910831837, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 2.1705565529622978e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996183216571808, |
| "num_tokens": 6760886.0, |
| "step": 6400 |
| }, |
| { |
| "entropy": 1.5610576629638673, |
| "epoch": 3.836026331538001, |
| "grad_norm": 0.00872802734375, |
| "learning_rate": 2.1645721125074805e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6771573.0, |
| "step": 6410 |
| }, |
| { |
| "entropy": 1.554826021194458, |
| "epoch": 3.8420107719928187, |
| "grad_norm": 0.0289306640625, |
| "learning_rate": 2.1585876720526632e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 6782078.0, |
| "step": 6420 |
| }, |
| { |
| "entropy": 1.546287226676941, |
| "epoch": 3.847995212447636, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 2.152603231597846e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6792786.0, |
| "step": 6430 |
| }, |
| { |
| "entropy": 1.5601895451545715, |
| "epoch": 3.8539796529024537, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 2.146618791143028e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9992664158344269, |
| "num_tokens": 6803447.0, |
| "step": 6440 |
| }, |
| { |
| "entropy": 1.5343787312507629, |
| "epoch": 3.859964093357271, |
| "grad_norm": 0.03759765625, |
| "learning_rate": 2.1406343506882106e-05, |
| "loss": 0.003, |
| "mean_token_accuracy": 0.9992684304714203, |
| "num_tokens": 6813995.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 1.546648907661438, |
| "epoch": 3.8659485338120887, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 2.1346499102333933e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6824662.0, |
| "step": 6460 |
| }, |
| { |
| "entropy": 1.5174113750457763, |
| "epoch": 3.871932974266906, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 2.128665469778576e-05, |
| "loss": 0.0024, |
| "mean_token_accuracy": 0.9995884776115418, |
| "num_tokens": 6834963.0, |
| "step": 6470 |
| }, |
| { |
| "entropy": 1.5120728850364684, |
| "epoch": 3.8779174147217237, |
| "grad_norm": 0.1064453125, |
| "learning_rate": 2.122681029323758e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 6845537.0, |
| "step": 6480 |
| }, |
| { |
| "entropy": 1.5367425560951233, |
| "epoch": 3.883901855176541, |
| "grad_norm": 0.0279541015625, |
| "learning_rate": 2.1166965888689408e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6856030.0, |
| "step": 6490 |
| }, |
| { |
| "entropy": 1.5492040872573853, |
| "epoch": 3.8898862956313582, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 2.1107121484141235e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6866678.0, |
| "step": 6500 |
| }, |
| { |
| "entropy": 1.5331620454788208, |
| "epoch": 3.895870736086176, |
| "grad_norm": 0.039794921875, |
| "learning_rate": 2.104727707959306e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6877203.0, |
| "step": 6510 |
| }, |
| { |
| "entropy": 1.5546536684036254, |
| "epoch": 3.9018551765409937, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 2.0987432675044882e-05, |
| "loss": 0.004, |
| "mean_token_accuracy": 0.9985446095466614, |
| "num_tokens": 6887841.0, |
| "step": 6520 |
| }, |
| { |
| "entropy": 1.5250784516334535, |
| "epoch": 3.907839616995811, |
| "grad_norm": 0.01287841796875, |
| "learning_rate": 2.092758827049671e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6898328.0, |
| "step": 6530 |
| }, |
| { |
| "entropy": 1.5164906859397889, |
| "epoch": 3.9138240574506282, |
| "grad_norm": 0.8125, |
| "learning_rate": 2.0867743865948536e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996015965938568, |
| "num_tokens": 6908680.0, |
| "step": 6540 |
| }, |
| { |
| "entropy": 1.5334503293037414, |
| "epoch": 3.919808497905446, |
| "grad_norm": 0.185546875, |
| "learning_rate": 2.080789946140036e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6919228.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 1.5119147062301637, |
| "epoch": 3.925792938360263, |
| "grad_norm": 0.024658203125, |
| "learning_rate": 2.0748055056852183e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9992709934711457, |
| "num_tokens": 6929730.0, |
| "step": 6560 |
| }, |
| { |
| "entropy": 1.622498083114624, |
| "epoch": 3.931777378815081, |
| "grad_norm": 0.1044921875, |
| "learning_rate": 2.068821065230401e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6940536.0, |
| "step": 6570 |
| }, |
| { |
| "entropy": 1.6207013845443725, |
| "epoch": 3.937761819269898, |
| "grad_norm": 0.046630859375, |
| "learning_rate": 2.0628366247755837e-05, |
| "loss": 0.0031, |
| "mean_token_accuracy": 0.9993064045906067, |
| "num_tokens": 6951312.0, |
| "step": 6580 |
| }, |
| { |
| "entropy": 1.5574225544929505, |
| "epoch": 3.9437462597247155, |
| "grad_norm": 0.1220703125, |
| "learning_rate": 2.056852184320766e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996183216571808, |
| "num_tokens": 6961725.0, |
| "step": 6590 |
| }, |
| { |
| "entropy": 1.5951979756355286, |
| "epoch": 3.949730700179533, |
| "grad_norm": 0.0223388671875, |
| "learning_rate": 2.0508677438659484e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6972283.0, |
| "step": 6600 |
| }, |
| { |
| "entropy": 1.6055678009986878, |
| "epoch": 3.955715140634351, |
| "grad_norm": 0.033447265625, |
| "learning_rate": 2.044883303411131e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6982888.0, |
| "step": 6610 |
| }, |
| { |
| "entropy": 1.5691597938537598, |
| "epoch": 3.961699581089168, |
| "grad_norm": 0.039306640625, |
| "learning_rate": 2.0388988629563138e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 6993524.0, |
| "step": 6620 |
| }, |
| { |
| "entropy": 1.6033958792686462, |
| "epoch": 3.9676840215439855, |
| "grad_norm": 0.023193359375, |
| "learning_rate": 2.032914422501496e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7004124.0, |
| "step": 6630 |
| }, |
| { |
| "entropy": 1.56329288482666, |
| "epoch": 3.973668461998803, |
| "grad_norm": 0.026611328125, |
| "learning_rate": 2.0269299820466785e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7014539.0, |
| "step": 6640 |
| }, |
| { |
| "entropy": 1.5793173909187317, |
| "epoch": 3.9796529024536205, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 2.0209455415918612e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7025021.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 1.515379798412323, |
| "epoch": 3.985637342908438, |
| "grad_norm": 0.005523681640625, |
| "learning_rate": 2.014961101137044e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7035600.0, |
| "step": 6660 |
| }, |
| { |
| "entropy": 1.6099679350852967, |
| "epoch": 3.9916217833632555, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 2.0089766606822263e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996563553810119, |
| "num_tokens": 7046232.0, |
| "step": 6670 |
| }, |
| { |
| "entropy": 1.5835115551948546, |
| "epoch": 3.9976062238180727, |
| "grad_norm": 0.01312255859375, |
| "learning_rate": 2.0029922202274086e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.9993050158023834, |
| "num_tokens": 7056783.0, |
| "step": 6680 |
| }, |
| { |
| "entropy": 1.5779105305671692, |
| "epoch": 4.003590664272891, |
| "grad_norm": 0.058349609375, |
| "learning_rate": 1.9970077797725913e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996598660945892, |
| "num_tokens": 7067409.0, |
| "step": 6690 |
| }, |
| { |
| "entropy": 1.5681239366531372, |
| "epoch": 4.009575104727708, |
| "grad_norm": 0.029541015625, |
| "learning_rate": 1.9910233393177737e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7078160.0, |
| "step": 6700 |
| }, |
| { |
| "entropy": 1.4813060760498047, |
| "epoch": 4.0155595451825254, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.9850388988629564e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7088509.0, |
| "step": 6710 |
| }, |
| { |
| "entropy": 1.594699537754059, |
| "epoch": 4.021543985637343, |
| "grad_norm": 0.11328125, |
| "learning_rate": 1.9790544584081388e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 7099175.0, |
| "step": 6720 |
| }, |
| { |
| "entropy": 1.6191529512405396, |
| "epoch": 4.02752842609216, |
| "grad_norm": 0.0228271484375, |
| "learning_rate": 1.9730700179533215e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7109906.0, |
| "step": 6730 |
| }, |
| { |
| "entropy": 1.5749483346939086, |
| "epoch": 4.033512866546978, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9670855774985038e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7120497.0, |
| "step": 6740 |
| }, |
| { |
| "entropy": 1.5680004119873048, |
| "epoch": 4.039497307001795, |
| "grad_norm": 0.01104736328125, |
| "learning_rate": 1.9611011370436865e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7131051.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 1.6218904495239257, |
| "epoch": 4.045481747456613, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 1.955116696588869e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 7141748.0, |
| "step": 6760 |
| }, |
| { |
| "entropy": 1.5259785056114197, |
| "epoch": 4.05146618791143, |
| "grad_norm": 0.359375, |
| "learning_rate": 1.9491322561340516e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7152133.0, |
| "step": 6770 |
| }, |
| { |
| "entropy": 1.5659233808517456, |
| "epoch": 4.057450628366248, |
| "grad_norm": 0.031982421875, |
| "learning_rate": 1.943147815679234e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9995967745780945, |
| "num_tokens": 7162629.0, |
| "step": 6780 |
| }, |
| { |
| "entropy": 1.5872820377349854, |
| "epoch": 4.063435068821065, |
| "grad_norm": 0.486328125, |
| "learning_rate": 1.9371633752244166e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9992965221405029, |
| "num_tokens": 7173417.0, |
| "step": 6790 |
| }, |
| { |
| "entropy": 1.6818758726119996, |
| "epoch": 4.069419509275883, |
| "grad_norm": 0.037109375, |
| "learning_rate": 1.9311789347695993e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9993055462837219, |
| "num_tokens": 7184217.0, |
| "step": 6800 |
| }, |
| { |
| "entropy": 1.5825034737586976, |
| "epoch": 4.0754039497307, |
| "grad_norm": 0.12255859375, |
| "learning_rate": 1.9251944943147814e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7194696.0, |
| "step": 6810 |
| }, |
| { |
| "entropy": 1.590399932861328, |
| "epoch": 4.081388390185517, |
| "grad_norm": 0.0128173828125, |
| "learning_rate": 1.919210053859964e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7205359.0, |
| "step": 6820 |
| }, |
| { |
| "entropy": 1.5604485630989076, |
| "epoch": 4.087372830640335, |
| "grad_norm": 0.1328125, |
| "learning_rate": 1.9132256134051468e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7215909.0, |
| "step": 6830 |
| }, |
| { |
| "entropy": 1.5563522219657897, |
| "epoch": 4.093357271095153, |
| "grad_norm": 0.021484375, |
| "learning_rate": 1.9072411729503294e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7226295.0, |
| "step": 6840 |
| }, |
| { |
| "entropy": 1.549744188785553, |
| "epoch": 4.09934171154997, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 1.9012567324955115e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7236827.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 1.5607978582382203, |
| "epoch": 4.105326152004787, |
| "grad_norm": 0.16796875, |
| "learning_rate": 1.895272292040694e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996491253376008, |
| "num_tokens": 7247388.0, |
| "step": 6860 |
| }, |
| { |
| "entropy": 1.5611071705818176, |
| "epoch": 4.111310592459605, |
| "grad_norm": 0.0284423828125, |
| "learning_rate": 1.889287851585877e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9995850622653961, |
| "num_tokens": 7257914.0, |
| "step": 6870 |
| }, |
| { |
| "entropy": 1.5587551593780518, |
| "epoch": 4.117295032914423, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 1.8833034111310596e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996699690818787, |
| "num_tokens": 7268506.0, |
| "step": 6880 |
| }, |
| { |
| "entropy": 1.5508070588111877, |
| "epoch": 4.12327947336924, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 1.8773189706762416e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7278955.0, |
| "step": 6890 |
| }, |
| { |
| "entropy": 1.5777355074882506, |
| "epoch": 4.129263913824057, |
| "grad_norm": 0.04296875, |
| "learning_rate": 1.8713345302214243e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7289604.0, |
| "step": 6900 |
| }, |
| { |
| "entropy": 1.5634498238563537, |
| "epoch": 4.135248354278875, |
| "grad_norm": 0.0732421875, |
| "learning_rate": 1.865350089766607e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7300222.0, |
| "step": 6910 |
| }, |
| { |
| "entropy": 1.5446319460868836, |
| "epoch": 4.141232794733693, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 1.8593656493117897e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7310848.0, |
| "step": 6920 |
| }, |
| { |
| "entropy": 1.514329993724823, |
| "epoch": 4.14721723518851, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.8533812088569717e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996197700500489, |
| "num_tokens": 7321225.0, |
| "step": 6930 |
| }, |
| { |
| "entropy": 1.5420185565948485, |
| "epoch": 4.153201675643327, |
| "grad_norm": 0.443359375, |
| "learning_rate": 1.8473967684021544e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996666669845581, |
| "num_tokens": 7331897.0, |
| "step": 6940 |
| }, |
| { |
| "entropy": 1.5471250891685486, |
| "epoch": 4.1591861160981445, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 1.841412327947337e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7342505.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 1.5706151604652405, |
| "epoch": 4.165170556552963, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 1.8354278874925195e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7353115.0, |
| "step": 6960 |
| }, |
| { |
| "entropy": 1.5618083357810975, |
| "epoch": 4.17115499700778, |
| "grad_norm": 0.12060546875, |
| "learning_rate": 1.8294434470377018e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7363684.0, |
| "step": 6970 |
| }, |
| { |
| "entropy": 1.5535432338714599, |
| "epoch": 4.177139437462597, |
| "grad_norm": 0.037353515625, |
| "learning_rate": 1.8234590065828845e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7374096.0, |
| "step": 6980 |
| }, |
| { |
| "entropy": 1.5730727791786194, |
| "epoch": 4.1831238779174145, |
| "grad_norm": 0.361328125, |
| "learning_rate": 1.8174745661280672e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7384486.0, |
| "step": 6990 |
| }, |
| { |
| "entropy": 1.5279602527618408, |
| "epoch": 4.189108318372233, |
| "grad_norm": 0.0274658203125, |
| "learning_rate": 1.8114901256732496e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 7394884.0, |
| "step": 7000 |
| }, |
| { |
| "entropy": 1.5611447095870972, |
| "epoch": 4.19509275882705, |
| "grad_norm": 0.091796875, |
| "learning_rate": 1.805505685218432e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996794879436492, |
| "num_tokens": 7405510.0, |
| "step": 7010 |
| }, |
| { |
| "entropy": 1.5359971404075623, |
| "epoch": 4.201077199281867, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.7995212447636146e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 7416021.0, |
| "step": 7020 |
| }, |
| { |
| "entropy": 1.5243404746055602, |
| "epoch": 4.2070616397366845, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.7935368043087973e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996655523777008, |
| "num_tokens": 7426626.0, |
| "step": 7030 |
| }, |
| { |
| "entropy": 1.6068053007125855, |
| "epoch": 4.213046080191502, |
| "grad_norm": 0.0595703125, |
| "learning_rate": 1.7875523638539797e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7437210.0, |
| "step": 7040 |
| }, |
| { |
| "entropy": 1.572656273841858, |
| "epoch": 4.21903052064632, |
| "grad_norm": 0.57421875, |
| "learning_rate": 1.781567923399162e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996621608734131, |
| "num_tokens": 7447772.0, |
| "step": 7050 |
| }, |
| { |
| "entropy": 1.5639835238456725, |
| "epoch": 4.225014961101137, |
| "grad_norm": 0.0390625, |
| "learning_rate": 1.7755834829443448e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9995951414108276, |
| "num_tokens": 7458249.0, |
| "step": 7060 |
| }, |
| { |
| "entropy": 1.5189979791641235, |
| "epoch": 4.230999401555954, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 1.7695990424895274e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996108949184418, |
| "num_tokens": 7468537.0, |
| "step": 7070 |
| }, |
| { |
| "entropy": 1.575409507751465, |
| "epoch": 4.236983842010772, |
| "grad_norm": 0.01171875, |
| "learning_rate": 1.7636146020347098e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7479461.0, |
| "step": 7080 |
| }, |
| { |
| "entropy": 1.5510912537574768, |
| "epoch": 4.24296828246559, |
| "grad_norm": 0.03076171875, |
| "learning_rate": 1.757630161579892e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7489959.0, |
| "step": 7090 |
| }, |
| { |
| "entropy": 1.547593891620636, |
| "epoch": 4.248952722920407, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 1.751645721125075e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7500396.0, |
| "step": 7100 |
| }, |
| { |
| "entropy": 1.5397059679031373, |
| "epoch": 4.254937163375224, |
| "grad_norm": 0.023193359375, |
| "learning_rate": 1.7456612806702572e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7511001.0, |
| "step": 7110 |
| }, |
| { |
| "entropy": 1.566526746749878, |
| "epoch": 4.260921603830042, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 1.73967684021544e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7521548.0, |
| "step": 7120 |
| }, |
| { |
| "entropy": 1.59982568025589, |
| "epoch": 4.266906044284859, |
| "grad_norm": 0.0223388671875, |
| "learning_rate": 1.7336923997606223e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996296286582946, |
| "num_tokens": 7532357.0, |
| "step": 7130 |
| }, |
| { |
| "entropy": 1.5388713359832764, |
| "epoch": 4.272890484739677, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 1.727707959305805e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996551752090455, |
| "num_tokens": 7543029.0, |
| "step": 7140 |
| }, |
| { |
| "entropy": 1.613177978992462, |
| "epoch": 4.278874925194494, |
| "grad_norm": 0.024169921875, |
| "learning_rate": 1.7217235188509873e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7553583.0, |
| "step": 7150 |
| }, |
| { |
| "entropy": 1.5655102014541626, |
| "epoch": 4.284859365649312, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 1.71573907839617e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7564054.0, |
| "step": 7160 |
| }, |
| { |
| "entropy": 1.5656741619110108, |
| "epoch": 4.290843806104129, |
| "grad_norm": 0.00689697265625, |
| "learning_rate": 1.7097546379413524e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7574548.0, |
| "step": 7170 |
| }, |
| { |
| "entropy": 1.5077351450920105, |
| "epoch": 4.296828246558947, |
| "grad_norm": 0.027099609375, |
| "learning_rate": 1.703770197486535e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 7585027.0, |
| "step": 7180 |
| }, |
| { |
| "entropy": 1.5920394659042358, |
| "epoch": 4.302812687013764, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.6977857570317175e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7595631.0, |
| "step": 7190 |
| }, |
| { |
| "entropy": 1.5724570870399475, |
| "epoch": 4.308797127468582, |
| "grad_norm": 0.041259765625, |
| "learning_rate": 1.6918013165769e-05, |
| "loss": 0.0017, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 7606196.0, |
| "step": 7200 |
| }, |
| { |
| "entropy": 1.5241859555244446, |
| "epoch": 4.314781567923399, |
| "grad_norm": 0.03125, |
| "learning_rate": 1.6858168761220825e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996539771556854, |
| "num_tokens": 7616506.0, |
| "step": 7210 |
| }, |
| { |
| "entropy": 1.5611043930053712, |
| "epoch": 4.320766008378216, |
| "grad_norm": 0.0286865234375, |
| "learning_rate": 1.6798324356672652e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 7627126.0, |
| "step": 7220 |
| }, |
| { |
| "entropy": 1.5357125282287598, |
| "epoch": 4.326750448833034, |
| "grad_norm": 0.03955078125, |
| "learning_rate": 1.6738479952124476e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7637728.0, |
| "step": 7230 |
| }, |
| { |
| "entropy": 1.507653498649597, |
| "epoch": 4.332734889287852, |
| "grad_norm": 0.060791015625, |
| "learning_rate": 1.6678635547576303e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7648244.0, |
| "step": 7240 |
| }, |
| { |
| "entropy": 1.539263367652893, |
| "epoch": 4.338719329742669, |
| "grad_norm": 0.33984375, |
| "learning_rate": 1.661879114302813e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9986226677894592, |
| "num_tokens": 7658828.0, |
| "step": 7250 |
| }, |
| { |
| "entropy": 1.558404839038849, |
| "epoch": 4.344703770197486, |
| "grad_norm": 0.01239013671875, |
| "learning_rate": 1.655894673847995e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996478855609894, |
| "num_tokens": 7669461.0, |
| "step": 7260 |
| }, |
| { |
| "entropy": 1.5491536259651184, |
| "epoch": 4.350688210652304, |
| "grad_norm": 0.0289306640625, |
| "learning_rate": 1.6499102333931777e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7680062.0, |
| "step": 7270 |
| }, |
| { |
| "entropy": 1.5289602637290955, |
| "epoch": 4.356672651107122, |
| "grad_norm": 0.0250244140625, |
| "learning_rate": 1.6439257929383604e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 7690585.0, |
| "step": 7280 |
| }, |
| { |
| "entropy": 1.5975135564804077, |
| "epoch": 4.362657091561939, |
| "grad_norm": 0.064453125, |
| "learning_rate": 1.637941352483543e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7701131.0, |
| "step": 7290 |
| }, |
| { |
| "entropy": 1.6185077905654908, |
| "epoch": 4.368641532016756, |
| "grad_norm": 0.01171875, |
| "learning_rate": 1.631956912028725e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7711714.0, |
| "step": 7300 |
| }, |
| { |
| "entropy": 1.585240626335144, |
| "epoch": 4.3746259724715735, |
| "grad_norm": 0.107421875, |
| "learning_rate": 1.6259724715739078e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996539771556854, |
| "num_tokens": 7722427.0, |
| "step": 7310 |
| }, |
| { |
| "entropy": 1.621202325820923, |
| "epoch": 4.380610412926392, |
| "grad_norm": 0.0223388671875, |
| "learning_rate": 1.6199880311190905e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7733131.0, |
| "step": 7320 |
| }, |
| { |
| "entropy": 1.5082001328468322, |
| "epoch": 4.386594853381209, |
| "grad_norm": 0.0228271484375, |
| "learning_rate": 1.6140035906642732e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7743525.0, |
| "step": 7330 |
| }, |
| { |
| "entropy": 1.5455591917037963, |
| "epoch": 4.392579293836026, |
| "grad_norm": 0.0106201171875, |
| "learning_rate": 1.6080191502094552e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7754121.0, |
| "step": 7340 |
| }, |
| { |
| "entropy": 1.5201449394226074, |
| "epoch": 4.3985637342908435, |
| "grad_norm": 0.142578125, |
| "learning_rate": 1.602034709754638e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996078431606292, |
| "num_tokens": 7764636.0, |
| "step": 7350 |
| }, |
| { |
| "entropy": 1.5823255062103272, |
| "epoch": 4.404548174745662, |
| "grad_norm": 0.0634765625, |
| "learning_rate": 1.5960502692998206e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.999664431810379, |
| "num_tokens": 7775359.0, |
| "step": 7360 |
| }, |
| { |
| "entropy": 1.5282755970954895, |
| "epoch": 4.410532615200479, |
| "grad_norm": 0.055908203125, |
| "learning_rate": 1.5900658288450033e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7785851.0, |
| "step": 7370 |
| }, |
| { |
| "entropy": 1.5407449364662171, |
| "epoch": 4.416517055655296, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.5840813883901853e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996350347995758, |
| "num_tokens": 7796400.0, |
| "step": 7380 |
| }, |
| { |
| "entropy": 1.5680436968803406, |
| "epoch": 4.4225014961101135, |
| "grad_norm": 0.0283203125, |
| "learning_rate": 1.578096947935368e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7806854.0, |
| "step": 7390 |
| }, |
| { |
| "entropy": 1.5389506220817566, |
| "epoch": 4.428485936564931, |
| "grad_norm": 0.05908203125, |
| "learning_rate": 1.5721125074805507e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9993071675300598, |
| "num_tokens": 7817406.0, |
| "step": 7400 |
| }, |
| { |
| "entropy": 1.5411053538322448, |
| "epoch": 4.434470377019749, |
| "grad_norm": 0.005645751953125, |
| "learning_rate": 1.566128067025733e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7827914.0, |
| "step": 7410 |
| }, |
| { |
| "entropy": 1.5534313559532165, |
| "epoch": 4.440454817474566, |
| "grad_norm": 0.6640625, |
| "learning_rate": 1.5601436265709155e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996677756309509, |
| "num_tokens": 7838628.0, |
| "step": 7420 |
| }, |
| { |
| "entropy": 1.5759409070014954, |
| "epoch": 4.446439257929383, |
| "grad_norm": 0.50390625, |
| "learning_rate": 1.554159186116098e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9993143141269684, |
| "num_tokens": 7849032.0, |
| "step": 7430 |
| }, |
| { |
| "entropy": 1.5122934222221374, |
| "epoch": 4.452423698384201, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 1.548174745661281e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9993093550205231, |
| "num_tokens": 7859524.0, |
| "step": 7440 |
| }, |
| { |
| "entropy": 1.5497113466262817, |
| "epoch": 4.458408138839019, |
| "grad_norm": 0.166015625, |
| "learning_rate": 1.5421903052064632e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.999622642993927, |
| "num_tokens": 7870109.0, |
| "step": 7450 |
| }, |
| { |
| "entropy": 1.5085930705070496, |
| "epoch": 4.464392579293836, |
| "grad_norm": 0.1376953125, |
| "learning_rate": 1.5362058647516456e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7880529.0, |
| "step": 7460 |
| }, |
| { |
| "entropy": 1.5613359570503236, |
| "epoch": 4.470377019748653, |
| "grad_norm": 0.0047607421875, |
| "learning_rate": 1.5302214242968283e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996350347995758, |
| "num_tokens": 7891117.0, |
| "step": 7470 |
| }, |
| { |
| "entropy": 1.5748088836669922, |
| "epoch": 4.476361460203471, |
| "grad_norm": 0.01080322265625, |
| "learning_rate": 1.524236983842011e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7901694.0, |
| "step": 7480 |
| }, |
| { |
| "entropy": 1.543941354751587, |
| "epoch": 4.482345900658289, |
| "grad_norm": 0.0306396484375, |
| "learning_rate": 1.5182525433871932e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 7912307.0, |
| "step": 7490 |
| }, |
| { |
| "entropy": 1.5929329633712768, |
| "epoch": 4.488330341113106, |
| "grad_norm": 0.0185546875, |
| "learning_rate": 1.5122681029323759e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7923076.0, |
| "step": 7500 |
| }, |
| { |
| "entropy": 1.5893060326576234, |
| "epoch": 4.494314781567923, |
| "grad_norm": 0.046630859375, |
| "learning_rate": 1.5062836624775584e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7933915.0, |
| "step": 7510 |
| }, |
| { |
| "entropy": 1.5541257858276367, |
| "epoch": 4.500299222022741, |
| "grad_norm": 0.0147705078125, |
| "learning_rate": 1.5002992220227411e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7944357.0, |
| "step": 7520 |
| }, |
| { |
| "entropy": 1.500285267829895, |
| "epoch": 4.506283662477558, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 1.4943147815679234e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9992882609367371, |
| "num_tokens": 7954810.0, |
| "step": 7530 |
| }, |
| { |
| "entropy": 1.5186502814292908, |
| "epoch": 4.512268102932376, |
| "grad_norm": 0.031005859375, |
| "learning_rate": 1.488330341113106e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996598660945892, |
| "num_tokens": 7965436.0, |
| "step": 7540 |
| }, |
| { |
| "entropy": 1.6063961029052733, |
| "epoch": 4.518252543387193, |
| "grad_norm": 0.734375, |
| "learning_rate": 1.4823459006582885e-05, |
| "loss": 0.002, |
| "mean_token_accuracy": 0.9993270456790924, |
| "num_tokens": 7976072.0, |
| "step": 7550 |
| }, |
| { |
| "entropy": 1.5510151267051697, |
| "epoch": 4.524236983842011, |
| "grad_norm": 0.0439453125, |
| "learning_rate": 1.476361460203471e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 7986738.0, |
| "step": 7560 |
| }, |
| { |
| "entropy": 1.55672527551651, |
| "epoch": 4.530221424296828, |
| "grad_norm": 0.0478515625, |
| "learning_rate": 1.4703770197486534e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996282517910003, |
| "num_tokens": 7997279.0, |
| "step": 7570 |
| }, |
| { |
| "entropy": 1.5957432746887208, |
| "epoch": 4.536205864751645, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.4643925792938361e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 8007886.0, |
| "step": 7580 |
| }, |
| { |
| "entropy": 1.5532257080078125, |
| "epoch": 4.542190305206463, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.4584081388390185e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996478855609894, |
| "num_tokens": 8018282.0, |
| "step": 7590 |
| }, |
| { |
| "entropy": 1.5621094346046447, |
| "epoch": 4.548174745661281, |
| "grad_norm": 0.1787109375, |
| "learning_rate": 1.4524236983842012e-05, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.999331396818161, |
| "num_tokens": 8028884.0, |
| "step": 7600 |
| }, |
| { |
| "entropy": 1.5396092414855957, |
| "epoch": 4.554159186116098, |
| "grad_norm": 0.0078125, |
| "learning_rate": 1.4464392579293835e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8039399.0, |
| "step": 7610 |
| }, |
| { |
| "entropy": 1.5150254487991333, |
| "epoch": 4.560143626570916, |
| "grad_norm": 0.055908203125, |
| "learning_rate": 1.4404548174745662e-05, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996621608734131, |
| "num_tokens": 8049911.0, |
| "step": 7620 |
| }, |
| { |
| "entropy": 1.5425100445747375, |
| "epoch": 4.566128067025733, |
| "grad_norm": 0.068359375, |
| "learning_rate": 1.4344703770197486e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8060483.0, |
| "step": 7630 |
| }, |
| { |
| "entropy": 1.6269634127616883, |
| "epoch": 4.572112507480551, |
| "grad_norm": 0.0233154296875, |
| "learning_rate": 1.4284859365649313e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9988581538200378, |
| "num_tokens": 8071125.0, |
| "step": 7640 |
| }, |
| { |
| "entropy": 1.5476625084877014, |
| "epoch": 4.578096947935368, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.4225014961101136e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8081725.0, |
| "step": 7650 |
| }, |
| { |
| "entropy": 1.5690398454666137, |
| "epoch": 4.584081388390185, |
| "grad_norm": 0.4140625, |
| "learning_rate": 1.4165170556552963e-05, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.9993531167507171, |
| "num_tokens": 8092450.0, |
| "step": 7660 |
| }, |
| { |
| "entropy": 1.5727166771888732, |
| "epoch": 4.590065828845003, |
| "grad_norm": 0.1240234375, |
| "learning_rate": 1.4105326152004787e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.99963099360466, |
| "num_tokens": 8102872.0, |
| "step": 7670 |
| }, |
| { |
| "entropy": 1.6031971335411073, |
| "epoch": 4.596050269299821, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.4045481747456614e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8113419.0, |
| "step": 7680 |
| }, |
| { |
| "entropy": 1.609036648273468, |
| "epoch": 4.602034709754638, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.3985637342908439e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8124188.0, |
| "step": 7690 |
| }, |
| { |
| "entropy": 1.557384467124939, |
| "epoch": 4.608019150209455, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.3925792938360264e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8134745.0, |
| "step": 7700 |
| }, |
| { |
| "entropy": 1.6228225708007813, |
| "epoch": 4.614003590664273, |
| "grad_norm": 0.150390625, |
| "learning_rate": 1.386594853381209e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8145468.0, |
| "step": 7710 |
| }, |
| { |
| "entropy": 1.6001514673233033, |
| "epoch": 4.619988031119091, |
| "grad_norm": 0.408203125, |
| "learning_rate": 1.3806104129263913e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996153831481933, |
| "num_tokens": 8156009.0, |
| "step": 7720 |
| }, |
| { |
| "entropy": 1.5820931553840638, |
| "epoch": 4.625972471573908, |
| "grad_norm": 0.162109375, |
| "learning_rate": 1.374625972471574e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8166610.0, |
| "step": 7730 |
| }, |
| { |
| "entropy": 1.5469802737236023, |
| "epoch": 4.631956912028725, |
| "grad_norm": 0.150390625, |
| "learning_rate": 1.3686415320167564e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8177136.0, |
| "step": 7740 |
| }, |
| { |
| "entropy": 1.6631050109863281, |
| "epoch": 4.6379413524835424, |
| "grad_norm": 0.005340576171875, |
| "learning_rate": 1.3626570915619391e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8187813.0, |
| "step": 7750 |
| }, |
| { |
| "entropy": 1.6081872582435608, |
| "epoch": 4.643925792938361, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 1.3566726511071214e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8198417.0, |
| "step": 7760 |
| }, |
| { |
| "entropy": 1.6233905792236327, |
| "epoch": 4.649910233393178, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 1.3506882106523041e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8209083.0, |
| "step": 7770 |
| }, |
| { |
| "entropy": 1.5728107690811157, |
| "epoch": 4.655894673847995, |
| "grad_norm": 0.002838134765625, |
| "learning_rate": 1.3447037701974865e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 8219442.0, |
| "step": 7780 |
| }, |
| { |
| "entropy": 1.5606521964073181, |
| "epoch": 4.661879114302812, |
| "grad_norm": 0.0908203125, |
| "learning_rate": 1.3387193297426692e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8230155.0, |
| "step": 7790 |
| }, |
| { |
| "entropy": 1.572113835811615, |
| "epoch": 4.667863554757631, |
| "grad_norm": 0.033203125, |
| "learning_rate": 1.3327348892878516e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996677756309509, |
| "num_tokens": 8240827.0, |
| "step": 7800 |
| }, |
| { |
| "entropy": 1.5568471670150756, |
| "epoch": 4.673847995212448, |
| "grad_norm": 0.095703125, |
| "learning_rate": 1.3267504488330343e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8251414.0, |
| "step": 7810 |
| }, |
| { |
| "entropy": 1.5330272674560548, |
| "epoch": 4.679832435667265, |
| "grad_norm": 0.376953125, |
| "learning_rate": 1.3207660083782166e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8261779.0, |
| "step": 7820 |
| }, |
| { |
| "entropy": 1.5626088619232177, |
| "epoch": 4.685816876122082, |
| "grad_norm": 0.004425048828125, |
| "learning_rate": 1.3147815679233993e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 8272280.0, |
| "step": 7830 |
| }, |
| { |
| "entropy": 1.547228252887726, |
| "epoch": 4.6918013165769, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 1.3087971274685817e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996078431606292, |
| "num_tokens": 8282690.0, |
| "step": 7840 |
| }, |
| { |
| "entropy": 1.5491391181945802, |
| "epoch": 4.697785757031718, |
| "grad_norm": 0.04248046875, |
| "learning_rate": 1.3028126870137644e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8293364.0, |
| "step": 7850 |
| }, |
| { |
| "entropy": 1.5352839589118958, |
| "epoch": 4.703770197486535, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 1.2968282465589467e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8303770.0, |
| "step": 7860 |
| }, |
| { |
| "entropy": 1.5438146233558654, |
| "epoch": 4.709754637941352, |
| "grad_norm": 0.0174560546875, |
| "learning_rate": 1.2908438061041293e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8314295.0, |
| "step": 7870 |
| }, |
| { |
| "entropy": 1.5843072772026061, |
| "epoch": 4.71573907839617, |
| "grad_norm": 0.09033203125, |
| "learning_rate": 1.2848593656493118e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8325010.0, |
| "step": 7880 |
| }, |
| { |
| "entropy": 1.6056226730346679, |
| "epoch": 4.721723518850988, |
| "grad_norm": 0.00604248046875, |
| "learning_rate": 1.2788749251944943e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8335587.0, |
| "step": 7890 |
| }, |
| { |
| "entropy": 1.5527130126953126, |
| "epoch": 4.727707959305805, |
| "grad_norm": 0.05322265625, |
| "learning_rate": 1.2728904847396769e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 8346105.0, |
| "step": 7900 |
| }, |
| { |
| "entropy": 1.577527070045471, |
| "epoch": 4.733692399760622, |
| "grad_norm": 0.003692626953125, |
| "learning_rate": 1.2669060442848594e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8356730.0, |
| "step": 7910 |
| }, |
| { |
| "entropy": 1.5795585989952088, |
| "epoch": 4.73967684021544, |
| "grad_norm": 0.00188446044921875, |
| "learning_rate": 1.2609216038300419e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8367484.0, |
| "step": 7920 |
| }, |
| { |
| "entropy": 1.5367600798606873, |
| "epoch": 4.745661280670257, |
| "grad_norm": 0.0303955078125, |
| "learning_rate": 1.2549371633752244e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8377998.0, |
| "step": 7930 |
| }, |
| { |
| "entropy": 1.5698032140731812, |
| "epoch": 4.751645721125075, |
| "grad_norm": 0.013916015625, |
| "learning_rate": 1.248952722920407e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996503472328186, |
| "num_tokens": 8388556.0, |
| "step": 7940 |
| }, |
| { |
| "entropy": 1.5445321798324585, |
| "epoch": 4.757630161579892, |
| "grad_norm": 0.003143310546875, |
| "learning_rate": 1.2429682824655895e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8399096.0, |
| "step": 7950 |
| }, |
| { |
| "entropy": 1.5702390313148498, |
| "epoch": 4.76361460203471, |
| "grad_norm": 0.31640625, |
| "learning_rate": 1.236983842010772e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8409625.0, |
| "step": 7960 |
| }, |
| { |
| "entropy": 1.579737651348114, |
| "epoch": 4.769599042489527, |
| "grad_norm": 0.265625, |
| "learning_rate": 1.2309994015559546e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996478855609894, |
| "num_tokens": 8420123.0, |
| "step": 7970 |
| }, |
| { |
| "entropy": 1.475909948348999, |
| "epoch": 4.775583482944345, |
| "grad_norm": 0.373046875, |
| "learning_rate": 1.2250149611011371e-05, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 8430614.0, |
| "step": 7980 |
| }, |
| { |
| "entropy": 1.5520552158355714, |
| "epoch": 4.781567923399162, |
| "grad_norm": 0.00653076171875, |
| "learning_rate": 1.2190305206463196e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8441217.0, |
| "step": 7990 |
| }, |
| { |
| "entropy": 1.6095691800117493, |
| "epoch": 4.78755236385398, |
| "grad_norm": 0.28515625, |
| "learning_rate": 1.2130460801915021e-05, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9992459297180176, |
| "num_tokens": 8451802.0, |
| "step": 8000 |
| }, |
| { |
| "entropy": 1.5282339096069335, |
| "epoch": 4.793536804308797, |
| "grad_norm": 0.047607421875, |
| "learning_rate": 1.2070616397366847e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8462413.0, |
| "step": 8010 |
| }, |
| { |
| "entropy": 1.6001452922821044, |
| "epoch": 4.799521244763614, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 1.201077199281867e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8473104.0, |
| "step": 8020 |
| }, |
| { |
| "entropy": 1.5145896315574645, |
| "epoch": 4.805505685218432, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 1.1950927588270497e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8483737.0, |
| "step": 8030 |
| }, |
| { |
| "entropy": 1.5419116020202637, |
| "epoch": 4.81149012567325, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 1.1891083183722321e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8494357.0, |
| "step": 8040 |
| }, |
| { |
| "entropy": 1.549897277355194, |
| "epoch": 4.817474566128067, |
| "grad_norm": 0.08203125, |
| "learning_rate": 1.1831238779174148e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8504875.0, |
| "step": 8050 |
| }, |
| { |
| "entropy": 1.5398707032203673, |
| "epoch": 4.823459006582884, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 1.1771394374625972e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8515439.0, |
| "step": 8060 |
| }, |
| { |
| "entropy": 1.5555207967758178, |
| "epoch": 4.829443447037702, |
| "grad_norm": 0.03271484375, |
| "learning_rate": 1.1711549970077799e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996376812458039, |
| "num_tokens": 8525836.0, |
| "step": 8070 |
| }, |
| { |
| "entropy": 1.5203309297561645, |
| "epoch": 4.83542788749252, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.1651705565529622e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8536193.0, |
| "step": 8080 |
| }, |
| { |
| "entropy": 1.5281639218330383, |
| "epoch": 4.841412327947337, |
| "grad_norm": 0.0033111572265625, |
| "learning_rate": 1.1591861160981449e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8546804.0, |
| "step": 8090 |
| }, |
| { |
| "entropy": 1.5610571384429932, |
| "epoch": 4.847396768402154, |
| "grad_norm": 0.0130615234375, |
| "learning_rate": 1.1532016756433273e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8557454.0, |
| "step": 8100 |
| }, |
| { |
| "entropy": 1.5801493525505066, |
| "epoch": 4.853381208856971, |
| "grad_norm": 0.0035400390625, |
| "learning_rate": 1.14721723518851e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996699690818787, |
| "num_tokens": 8568170.0, |
| "step": 8110 |
| }, |
| { |
| "entropy": 1.559760570526123, |
| "epoch": 4.85936564931179, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 1.1412327947336923e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8578714.0, |
| "step": 8120 |
| }, |
| { |
| "entropy": 1.5482122898101807, |
| "epoch": 4.865350089766607, |
| "grad_norm": 0.0179443359375, |
| "learning_rate": 1.135248354278875e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996453881263733, |
| "num_tokens": 8589194.0, |
| "step": 8130 |
| }, |
| { |
| "entropy": 1.5715525984764098, |
| "epoch": 4.871334530221424, |
| "grad_norm": 0.099609375, |
| "learning_rate": 1.1292639138240574e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8599725.0, |
| "step": 8140 |
| }, |
| { |
| "entropy": 1.4884485244750976, |
| "epoch": 4.877318970676241, |
| "grad_norm": 0.126953125, |
| "learning_rate": 1.12327947336924e-05, |
| "loss": 0.0016, |
| "mean_token_accuracy": 0.9992669522762299, |
| "num_tokens": 8610166.0, |
| "step": 8150 |
| }, |
| { |
| "entropy": 1.5385358095169068, |
| "epoch": 4.88330341113106, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 1.1172950329144226e-05, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996153831481933, |
| "num_tokens": 8620615.0, |
| "step": 8160 |
| }, |
| { |
| "entropy": 1.5727779269218445, |
| "epoch": 4.889287851585877, |
| "grad_norm": 0.00144195556640625, |
| "learning_rate": 1.111310592459605e-05, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996666669845581, |
| "num_tokens": 8631292.0, |
| "step": 8170 |
| }, |
| { |
| "entropy": 1.4883596062660218, |
| "epoch": 4.895272292040694, |
| "grad_norm": 0.00677490234375, |
| "learning_rate": 1.1053261520047877e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8641757.0, |
| "step": 8180 |
| }, |
| { |
| "entropy": 1.6173644065856934, |
| "epoch": 4.901256732495511, |
| "grad_norm": 0.08984375, |
| "learning_rate": 1.09934171154997e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8652330.0, |
| "step": 8190 |
| }, |
| { |
| "entropy": 1.56162348985672, |
| "epoch": 4.907241172950329, |
| "grad_norm": 0.025634765625, |
| "learning_rate": 1.0933572710951527e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8662973.0, |
| "step": 8200 |
| }, |
| { |
| "entropy": 1.5082152366638184, |
| "epoch": 4.913225613405147, |
| "grad_norm": 0.011474609375, |
| "learning_rate": 1.0873728306403351e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8673530.0, |
| "step": 8210 |
| }, |
| { |
| "entropy": 1.553872811794281, |
| "epoch": 4.919210053859964, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 1.0813883901855178e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8684030.0, |
| "step": 8220 |
| }, |
| { |
| "entropy": 1.5669398188591004, |
| "epoch": 4.925194494314781, |
| "grad_norm": 0.09423828125, |
| "learning_rate": 1.0754039497307001e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8694514.0, |
| "step": 8230 |
| }, |
| { |
| "entropy": 1.540639054775238, |
| "epoch": 4.931178934769599, |
| "grad_norm": 0.12451171875, |
| "learning_rate": 1.0694195092758828e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8704968.0, |
| "step": 8240 |
| }, |
| { |
| "entropy": 1.6199857950210572, |
| "epoch": 4.937163375224417, |
| "grad_norm": 0.0419921875, |
| "learning_rate": 1.0634350688210652e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8715556.0, |
| "step": 8250 |
| }, |
| { |
| "entropy": 1.6003392696380616, |
| "epoch": 4.943147815679234, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.0574506283662479e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8726172.0, |
| "step": 8260 |
| }, |
| { |
| "entropy": 1.5800185084342957, |
| "epoch": 4.949132256134051, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 1.0514661879114303e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996621608734131, |
| "num_tokens": 8736769.0, |
| "step": 8270 |
| }, |
| { |
| "entropy": 1.5508555054664612, |
| "epoch": 4.955116696588869, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 1.045481747456613e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996031761169434, |
| "num_tokens": 8747098.0, |
| "step": 8280 |
| }, |
| { |
| "entropy": 1.5614091634750367, |
| "epoch": 4.961101137043686, |
| "grad_norm": 0.0020599365234375, |
| "learning_rate": 1.0394973070017953e-05, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8757653.0, |
| "step": 8290 |
| }, |
| { |
| "entropy": 1.6161825299263, |
| "epoch": 4.967085577498504, |
| "grad_norm": 0.4453125, |
| "learning_rate": 1.033512866546978e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8768239.0, |
| "step": 8300 |
| }, |
| { |
| "entropy": 1.5423355102539062, |
| "epoch": 4.973070017953321, |
| "grad_norm": 0.023193359375, |
| "learning_rate": 1.0275284260921604e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996376812458039, |
| "num_tokens": 8778716.0, |
| "step": 8310 |
| }, |
| { |
| "entropy": 1.5606900930404664, |
| "epoch": 4.979054458408139, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 1.0215439856373429e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9992770373821258, |
| "num_tokens": 8789340.0, |
| "step": 8320 |
| }, |
| { |
| "entropy": 1.5358200788497924, |
| "epoch": 4.985038898862956, |
| "grad_norm": 0.048583984375, |
| "learning_rate": 1.0155595451825254e-05, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8799853.0, |
| "step": 8330 |
| }, |
| { |
| "entropy": 1.5231838941574096, |
| "epoch": 4.991023339317774, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 1.009575104727708e-05, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8810489.0, |
| "step": 8340 |
| }, |
| { |
| "entropy": 1.5757528066635131, |
| "epoch": 4.997007779772591, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.0035906642728905e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8820931.0, |
| "step": 8350 |
| }, |
| { |
| "entropy": 1.5420665383338927, |
| "epoch": 5.002992220227409, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 9.97606223818073e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8831504.0, |
| "step": 8360 |
| }, |
| { |
| "entropy": 1.5548468589782716, |
| "epoch": 5.008976660682226, |
| "grad_norm": 0.05908203125, |
| "learning_rate": 9.916217833632556e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8842213.0, |
| "step": 8370 |
| }, |
| { |
| "entropy": 1.5424367904663085, |
| "epoch": 5.014961101137044, |
| "grad_norm": 0.022705078125, |
| "learning_rate": 9.85637342908438e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8852738.0, |
| "step": 8380 |
| }, |
| { |
| "entropy": 1.5555650353431703, |
| "epoch": 5.020945541591861, |
| "grad_norm": 0.042236328125, |
| "learning_rate": 9.796529024536206e-06, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 8863141.0, |
| "step": 8390 |
| }, |
| { |
| "entropy": 1.5822317361831666, |
| "epoch": 5.026929982046679, |
| "grad_norm": 0.138671875, |
| "learning_rate": 9.736684619988031e-06, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 8873691.0, |
| "step": 8400 |
| }, |
| { |
| "entropy": 1.5569300770759582, |
| "epoch": 5.032914422501496, |
| "grad_norm": 0.03955078125, |
| "learning_rate": 9.676840215439857e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8884255.0, |
| "step": 8410 |
| }, |
| { |
| "entropy": 1.6120752453804017, |
| "epoch": 5.038898862956313, |
| "grad_norm": 0.02880859375, |
| "learning_rate": 9.616995810891682e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8894845.0, |
| "step": 8420 |
| }, |
| { |
| "entropy": 1.5797799468040465, |
| "epoch": 5.044883303411131, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 9.557151406343507e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996254682540894, |
| "num_tokens": 8905268.0, |
| "step": 8430 |
| }, |
| { |
| "entropy": 1.5771249175071715, |
| "epoch": 5.050867743865949, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 9.497307001795333e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8915783.0, |
| "step": 8440 |
| }, |
| { |
| "entropy": 1.5601511240005492, |
| "epoch": 5.056852184320766, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 9.437462597247158e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8926367.0, |
| "step": 8450 |
| }, |
| { |
| "entropy": 1.5705560445785522, |
| "epoch": 5.062836624775583, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 9.377618192698983e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996441304683685, |
| "num_tokens": 8936810.0, |
| "step": 8460 |
| }, |
| { |
| "entropy": 1.5224881649017334, |
| "epoch": 5.068821065230401, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 9.317773788150807e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8947316.0, |
| "step": 8470 |
| }, |
| { |
| "entropy": 1.5385950803756714, |
| "epoch": 5.074805505685219, |
| "grad_norm": 0.1376953125, |
| "learning_rate": 9.257929383602634e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8957870.0, |
| "step": 8480 |
| }, |
| { |
| "entropy": 1.583323359489441, |
| "epoch": 5.080789946140036, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.198084979054457e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8968652.0, |
| "step": 8490 |
| }, |
| { |
| "entropy": 1.524876642227173, |
| "epoch": 5.086774386594853, |
| "grad_norm": 0.056884765625, |
| "learning_rate": 9.138240574506284e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 8979194.0, |
| "step": 8500 |
| }, |
| { |
| "entropy": 1.565677809715271, |
| "epoch": 5.09275882704967, |
| "grad_norm": 0.04345703125, |
| "learning_rate": 9.078396169958108e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 8989644.0, |
| "step": 8510 |
| }, |
| { |
| "entropy": 1.5748510122299195, |
| "epoch": 5.098743267504489, |
| "grad_norm": 0.0286865234375, |
| "learning_rate": 9.018551765409935e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9000200.0, |
| "step": 8520 |
| }, |
| { |
| "entropy": 1.556013298034668, |
| "epoch": 5.104727707959306, |
| "grad_norm": 0.0235595703125, |
| "learning_rate": 8.958707360861759e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.999622642993927, |
| "num_tokens": 9010599.0, |
| "step": 8530 |
| }, |
| { |
| "entropy": 1.5656542539596559, |
| "epoch": 5.110712148414123, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 8.898862956313585e-06, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9993050158023834, |
| "num_tokens": 9021333.0, |
| "step": 8540 |
| }, |
| { |
| "entropy": 1.597491943836212, |
| "epoch": 5.11669658886894, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 8.839018551765409e-06, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996254682540894, |
| "num_tokens": 9031830.0, |
| "step": 8550 |
| }, |
| { |
| "entropy": 1.5429089546203614, |
| "epoch": 5.122681029323759, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 8.779174147217236e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9042318.0, |
| "step": 8560 |
| }, |
| { |
| "entropy": 1.6095686435699463, |
| "epoch": 5.128665469778576, |
| "grad_norm": 0.044677734375, |
| "learning_rate": 8.71932974266906e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996539771556854, |
| "num_tokens": 9052817.0, |
| "step": 8570 |
| }, |
| { |
| "entropy": 1.5880122184753418, |
| "epoch": 5.134649910233393, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 8.659485338120887e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9063460.0, |
| "step": 8580 |
| }, |
| { |
| "entropy": 1.5641863226890564, |
| "epoch": 5.14063435068821, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 8.59964093357271e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9074005.0, |
| "step": 8590 |
| }, |
| { |
| "entropy": 1.5898365259170533, |
| "epoch": 5.146618791143029, |
| "grad_norm": 0.0078125, |
| "learning_rate": 8.539796529024537e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9084684.0, |
| "step": 8600 |
| }, |
| { |
| "entropy": 1.550035560131073, |
| "epoch": 5.152603231597846, |
| "grad_norm": 0.0703125, |
| "learning_rate": 8.479952124476363e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9095288.0, |
| "step": 8610 |
| }, |
| { |
| "entropy": 1.5923567295074463, |
| "epoch": 5.158587672052663, |
| "grad_norm": 0.02392578125, |
| "learning_rate": 8.420107719928186e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996478855609894, |
| "num_tokens": 9105815.0, |
| "step": 8620 |
| }, |
| { |
| "entropy": 1.5446609854698181, |
| "epoch": 5.16457211250748, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 8.360263315380013e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9116319.0, |
| "step": 8630 |
| }, |
| { |
| "entropy": 1.5514729619026184, |
| "epoch": 5.170556552962298, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 8.300418910831837e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9126714.0, |
| "step": 8640 |
| }, |
| { |
| "entropy": 1.5688727736473083, |
| "epoch": 5.176540993417116, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 8.240574506283664e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9137258.0, |
| "step": 8650 |
| }, |
| { |
| "entropy": 1.5155110716819764, |
| "epoch": 5.182525433871933, |
| "grad_norm": 0.019287109375, |
| "learning_rate": 8.180730101735487e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9147983.0, |
| "step": 8660 |
| }, |
| { |
| "entropy": 1.576178824901581, |
| "epoch": 5.18850987432675, |
| "grad_norm": 0.1416015625, |
| "learning_rate": 8.120885697187314e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9158424.0, |
| "step": 8670 |
| }, |
| { |
| "entropy": 1.6217318296432495, |
| "epoch": 5.194494314781568, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 8.061041292639138e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996515691280365, |
| "num_tokens": 9169127.0, |
| "step": 8680 |
| }, |
| { |
| "entropy": 1.5923057436943053, |
| "epoch": 5.200478755236386, |
| "grad_norm": 0.138671875, |
| "learning_rate": 8.001196888090965e-06, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996282517910003, |
| "num_tokens": 9179719.0, |
| "step": 8690 |
| }, |
| { |
| "entropy": 1.5494077682495118, |
| "epoch": 5.206463195691203, |
| "grad_norm": 0.4921875, |
| "learning_rate": 7.941352483542788e-06, |
| "loss": 0.0009, |
| "mean_token_accuracy": 0.9996153831481933, |
| "num_tokens": 9190368.0, |
| "step": 8700 |
| }, |
| { |
| "entropy": 1.6158392786979676, |
| "epoch": 5.21244763614602, |
| "grad_norm": 0.00604248046875, |
| "learning_rate": 7.881508078994615e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996515691280365, |
| "num_tokens": 9201014.0, |
| "step": 8710 |
| }, |
| { |
| "entropy": 1.5365092873573303, |
| "epoch": 5.218432076600838, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 7.821663674446439e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9211547.0, |
| "step": 8720 |
| }, |
| { |
| "entropy": 1.6001612663269043, |
| "epoch": 5.224416517055655, |
| "grad_norm": 0.00537109375, |
| "learning_rate": 7.761819269898266e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9222245.0, |
| "step": 8730 |
| }, |
| { |
| "entropy": 1.5610676050186156, |
| "epoch": 5.230400957510473, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 7.70197486535009e-06, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.9992975473403931, |
| "num_tokens": 9232882.0, |
| "step": 8740 |
| }, |
| { |
| "entropy": 1.5196431040763856, |
| "epoch": 5.23638539796529, |
| "grad_norm": 0.1875, |
| "learning_rate": 7.642130460801917e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9243354.0, |
| "step": 8750 |
| }, |
| { |
| "entropy": 1.5230332970619203, |
| "epoch": 5.242369838420108, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 7.58228605625374e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9253906.0, |
| "step": 8760 |
| }, |
| { |
| "entropy": 1.56902277469635, |
| "epoch": 5.248354278874925, |
| "grad_norm": 0.1689453125, |
| "learning_rate": 7.522441651705565e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9264531.0, |
| "step": 8770 |
| }, |
| { |
| "entropy": 1.5179225206375122, |
| "epoch": 5.254338719329743, |
| "grad_norm": 0.00555419921875, |
| "learning_rate": 7.462597247157391e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9275165.0, |
| "step": 8780 |
| }, |
| { |
| "entropy": 1.6126029729843139, |
| "epoch": 5.26032315978456, |
| "grad_norm": 0.138671875, |
| "learning_rate": 7.402752842609216e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996389865875244, |
| "num_tokens": 9285899.0, |
| "step": 8790 |
| }, |
| { |
| "entropy": 1.5695497632026671, |
| "epoch": 5.266307600239378, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 7.342908438061041e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 9296579.0, |
| "step": 8800 |
| }, |
| { |
| "entropy": 1.5539406895637513, |
| "epoch": 5.272292040694195, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 7.283064033512867e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9307184.0, |
| "step": 8810 |
| }, |
| { |
| "entropy": 1.6012183547019958, |
| "epoch": 5.278276481149012, |
| "grad_norm": 0.00762939453125, |
| "learning_rate": 7.223219628964692e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9317825.0, |
| "step": 8820 |
| }, |
| { |
| "entropy": 1.521978497505188, |
| "epoch": 5.28426092160383, |
| "grad_norm": 0.046875, |
| "learning_rate": 7.163375224416517e-06, |
| "loss": 0.0018, |
| "mean_token_accuracy": 0.9992892503738403, |
| "num_tokens": 9328583.0, |
| "step": 8830 |
| }, |
| { |
| "entropy": 1.5513456106185912, |
| "epoch": 5.290245362058648, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 7.1035308198683425e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9339201.0, |
| "step": 8840 |
| }, |
| { |
| "entropy": 1.5179700970649719, |
| "epoch": 5.296229802513465, |
| "grad_norm": 0.13671875, |
| "learning_rate": 7.043686415320168e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9349689.0, |
| "step": 8850 |
| }, |
| { |
| "entropy": 1.5577180981636047, |
| "epoch": 5.302214242968282, |
| "grad_norm": 0.00531005859375, |
| "learning_rate": 6.983842010771993e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9360183.0, |
| "step": 8860 |
| }, |
| { |
| "entropy": 1.584286642074585, |
| "epoch": 5.3081986834231, |
| "grad_norm": 0.0478515625, |
| "learning_rate": 6.923997606223818e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9370657.0, |
| "step": 8870 |
| }, |
| { |
| "entropy": 1.5500143885612487, |
| "epoch": 5.314183123877918, |
| "grad_norm": 0.0228271484375, |
| "learning_rate": 6.864153201675644e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9381207.0, |
| "step": 8880 |
| }, |
| { |
| "entropy": 1.563618266582489, |
| "epoch": 5.320167564332735, |
| "grad_norm": 0.10595703125, |
| "learning_rate": 6.804308797127469e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996350347995758, |
| "num_tokens": 9391788.0, |
| "step": 8890 |
| }, |
| { |
| "entropy": 1.5550428748130798, |
| "epoch": 5.326152004787552, |
| "grad_norm": 0.0260009765625, |
| "learning_rate": 6.744464392579294e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9402365.0, |
| "step": 8900 |
| }, |
| { |
| "entropy": 1.5502114057540894, |
| "epoch": 5.332136445242369, |
| "grad_norm": 0.039794921875, |
| "learning_rate": 6.6846199880311196e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9412918.0, |
| "step": 8910 |
| }, |
| { |
| "entropy": 1.4951329708099366, |
| "epoch": 5.338120885697188, |
| "grad_norm": 0.296875, |
| "learning_rate": 6.624775583482945e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 9423435.0, |
| "step": 8920 |
| }, |
| { |
| "entropy": 1.556550133228302, |
| "epoch": 5.344105326152005, |
| "grad_norm": 0.0252685546875, |
| "learning_rate": 6.56493117893477e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9433957.0, |
| "step": 8930 |
| }, |
| { |
| "entropy": 1.532106137275696, |
| "epoch": 5.350089766606822, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 6.5050867743865954e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9444403.0, |
| "step": 8940 |
| }, |
| { |
| "entropy": 1.5472781896591186, |
| "epoch": 5.356074207061639, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 6.445242369838421e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9454926.0, |
| "step": 8950 |
| }, |
| { |
| "entropy": 1.6014922738075257, |
| "epoch": 5.3620586475164576, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 6.385397965290246e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9465381.0, |
| "step": 8960 |
| }, |
| { |
| "entropy": 1.5169659018516541, |
| "epoch": 5.368043087971275, |
| "grad_norm": 0.62109375, |
| "learning_rate": 6.325553560742071e-06, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996610164642334, |
| "num_tokens": 9475976.0, |
| "step": 8970 |
| }, |
| { |
| "entropy": 1.547071349620819, |
| "epoch": 5.374027528426092, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 6.265709156193896e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9486574.0, |
| "step": 8980 |
| }, |
| { |
| "entropy": 1.5052280187606812, |
| "epoch": 5.380011968880909, |
| "grad_norm": 0.0137939453125, |
| "learning_rate": 6.205864751645721e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9496974.0, |
| "step": 8990 |
| }, |
| { |
| "entropy": 1.5267866492271422, |
| "epoch": 5.385996409335727, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 6.146020347097546e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9507510.0, |
| "step": 9000 |
| }, |
| { |
| "entropy": 1.5410394072532654, |
| "epoch": 5.391980849790545, |
| "grad_norm": 0.330078125, |
| "learning_rate": 6.086175942549372e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9518127.0, |
| "step": 9010 |
| }, |
| { |
| "entropy": 1.565923023223877, |
| "epoch": 5.397965290245362, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 6.026331538001197e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9528583.0, |
| "step": 9020 |
| }, |
| { |
| "entropy": 1.5626705288887024, |
| "epoch": 5.403949730700179, |
| "grad_norm": 0.11669921875, |
| "learning_rate": 5.966487133453022e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996240615844727, |
| "num_tokens": 9539067.0, |
| "step": 9030 |
| }, |
| { |
| "entropy": 1.5308679223060608, |
| "epoch": 5.409934171154997, |
| "grad_norm": 0.0498046875, |
| "learning_rate": 5.9066427289048475e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996428549289703, |
| "num_tokens": 9549380.0, |
| "step": 9040 |
| }, |
| { |
| "entropy": 1.5556403517723083, |
| "epoch": 5.415918611609815, |
| "grad_norm": 0.171875, |
| "learning_rate": 5.846798324356673e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9560020.0, |
| "step": 9050 |
| }, |
| { |
| "entropy": 1.497513198852539, |
| "epoch": 5.421903052064632, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 5.786953919808498e-06, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996441304683685, |
| "num_tokens": 9570282.0, |
| "step": 9060 |
| }, |
| { |
| "entropy": 1.557295060157776, |
| "epoch": 5.427887492519449, |
| "grad_norm": 0.05810546875, |
| "learning_rate": 5.727109515260323e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996282517910003, |
| "num_tokens": 9580781.0, |
| "step": 9070 |
| }, |
| { |
| "entropy": 1.5526091694831847, |
| "epoch": 5.433871932974267, |
| "grad_norm": 0.08203125, |
| "learning_rate": 5.667265110712149e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9591317.0, |
| "step": 9080 |
| }, |
| { |
| "entropy": 1.5796686291694642, |
| "epoch": 5.439856373429085, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 5.607420706163974e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 9601918.0, |
| "step": 9090 |
| }, |
| { |
| "entropy": 1.5661948442459106, |
| "epoch": 5.445840813883902, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 5.547576301615799e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9612550.0, |
| "step": 9100 |
| }, |
| { |
| "entropy": 1.5428655982017516, |
| "epoch": 5.451825254338719, |
| "grad_norm": 0.083984375, |
| "learning_rate": 5.4877318970676245e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9623018.0, |
| "step": 9110 |
| }, |
| { |
| "entropy": 1.5452999949455262, |
| "epoch": 5.457809694793537, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 5.42788749251945e-06, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 9633720.0, |
| "step": 9120 |
| }, |
| { |
| "entropy": 1.6031384706497191, |
| "epoch": 5.463794135248354, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 5.368043087971274e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9644450.0, |
| "step": 9130 |
| }, |
| { |
| "entropy": 1.6057791471481324, |
| "epoch": 5.469778575703172, |
| "grad_norm": 0.020263671875, |
| "learning_rate": 5.3081986834230996e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 9655007.0, |
| "step": 9140 |
| }, |
| { |
| "entropy": 1.5202216625213623, |
| "epoch": 5.475763016157989, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 5.248354278874925e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9665606.0, |
| "step": 9150 |
| }, |
| { |
| "entropy": 1.5834308981895446, |
| "epoch": 5.481747456612807, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 5.18850987432675e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9676299.0, |
| "step": 9160 |
| }, |
| { |
| "entropy": 1.5219863414764405, |
| "epoch": 5.487731897067624, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 5.1286654697785754e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9686557.0, |
| "step": 9170 |
| }, |
| { |
| "entropy": 1.5743539810180665, |
| "epoch": 5.493716337522442, |
| "grad_norm": 0.042236328125, |
| "learning_rate": 5.068821065230401e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9697132.0, |
| "step": 9180 |
| }, |
| { |
| "entropy": 1.5550188302993775, |
| "epoch": 5.499700777977259, |
| "grad_norm": 0.00360107421875, |
| "learning_rate": 5.008976660682226e-06, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996491253376008, |
| "num_tokens": 9707746.0, |
| "step": 9190 |
| }, |
| { |
| "entropy": 1.5604373812675476, |
| "epoch": 5.505685218432077, |
| "grad_norm": 0.01019287109375, |
| "learning_rate": 4.949132256134051e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9718292.0, |
| "step": 9200 |
| }, |
| { |
| "entropy": 1.540794813632965, |
| "epoch": 5.511669658886894, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 4.889287851585877e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9728807.0, |
| "step": 9210 |
| }, |
| { |
| "entropy": 1.5922885179519652, |
| "epoch": 5.517654099341712, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 4.829443447037702e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9739308.0, |
| "step": 9220 |
| }, |
| { |
| "entropy": 1.5752901792526246, |
| "epoch": 5.523638539796529, |
| "grad_norm": 0.0625, |
| "learning_rate": 4.769599042489527e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9749922.0, |
| "step": 9230 |
| }, |
| { |
| "entropy": 1.5295986771583556, |
| "epoch": 5.529622980251347, |
| "grad_norm": 0.01324462890625, |
| "learning_rate": 4.7097546379413525e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9760377.0, |
| "step": 9240 |
| }, |
| { |
| "entropy": 1.5306970953941346, |
| "epoch": 5.535607420706164, |
| "grad_norm": 0.005706787109375, |
| "learning_rate": 4.649910233393178e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9770647.0, |
| "step": 9250 |
| }, |
| { |
| "entropy": 1.5370991468429565, |
| "epoch": 5.541591861160981, |
| "grad_norm": 0.453125, |
| "learning_rate": 4.590065828845003e-06, |
| "loss": 0.0011, |
| "mean_token_accuracy": 0.9992578208446503, |
| "num_tokens": 9781114.0, |
| "step": 9260 |
| }, |
| { |
| "entropy": 1.5558555841445922, |
| "epoch": 5.547576301615799, |
| "grad_norm": 0.072265625, |
| "learning_rate": 4.530221424296828e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9791718.0, |
| "step": 9270 |
| }, |
| { |
| "entropy": 1.5621288776397706, |
| "epoch": 5.553560742070617, |
| "grad_norm": 0.027099609375, |
| "learning_rate": 4.470377019748653e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9802302.0, |
| "step": 9280 |
| }, |
| { |
| "entropy": 1.5378905177116393, |
| "epoch": 5.559545182525434, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 4.410532615200479e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996527791023254, |
| "num_tokens": 9812976.0, |
| "step": 9290 |
| }, |
| { |
| "entropy": 1.540464496612549, |
| "epoch": 5.565529622980251, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 4.350688210652304e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996363639831543, |
| "num_tokens": 9823503.0, |
| "step": 9300 |
| }, |
| { |
| "entropy": 1.5626393675804138, |
| "epoch": 5.571514063435069, |
| "grad_norm": 0.125, |
| "learning_rate": 4.2908438061041295e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9833941.0, |
| "step": 9310 |
| }, |
| { |
| "entropy": 1.5677518129348755, |
| "epoch": 5.5774985038898865, |
| "grad_norm": 0.0262451171875, |
| "learning_rate": 4.230999401555955e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9844468.0, |
| "step": 9320 |
| }, |
| { |
| "entropy": 1.5441372871398926, |
| "epoch": 5.583482944344704, |
| "grad_norm": 0.04638671875, |
| "learning_rate": 4.17115499700778e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9855138.0, |
| "step": 9330 |
| }, |
| { |
| "entropy": 1.5927303791046143, |
| "epoch": 5.589467384799521, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 4.111310592459605e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9865684.0, |
| "step": 9340 |
| }, |
| { |
| "entropy": 1.525864827632904, |
| "epoch": 5.595451825254338, |
| "grad_norm": 0.0301513671875, |
| "learning_rate": 4.051466187911431e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9875999.0, |
| "step": 9350 |
| }, |
| { |
| "entropy": 1.6044833660125732, |
| "epoch": 5.6014362657091565, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 3.991621783363256e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9886567.0, |
| "step": 9360 |
| }, |
| { |
| "entropy": 1.5554206848144532, |
| "epoch": 5.607420706163974, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 3.931777378815081e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 0.9996336996555328, |
| "num_tokens": 9897281.0, |
| "step": 9370 |
| }, |
| { |
| "entropy": 1.533079206943512, |
| "epoch": 5.613405146618791, |
| "grad_norm": 0.042236328125, |
| "learning_rate": 3.8719329742669066e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9907747.0, |
| "step": 9380 |
| }, |
| { |
| "entropy": 1.588419759273529, |
| "epoch": 5.619389587073608, |
| "grad_norm": 0.037109375, |
| "learning_rate": 3.8120885697187314e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996710538864135, |
| "num_tokens": 9918369.0, |
| "step": 9390 |
| }, |
| { |
| "entropy": 1.5578839302062988, |
| "epoch": 5.6253740275284265, |
| "grad_norm": 0.03369140625, |
| "learning_rate": 3.7522441651705567e-06, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996587038040161, |
| "num_tokens": 9929021.0, |
| "step": 9400 |
| }, |
| { |
| "entropy": 1.5441142320632935, |
| "epoch": 5.631358467983244, |
| "grad_norm": 0.047119140625, |
| "learning_rate": 3.692399760622382e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9939623.0, |
| "step": 9410 |
| }, |
| { |
| "entropy": 1.5249134063720704, |
| "epoch": 5.637342908438061, |
| "grad_norm": 0.0037689208984375, |
| "learning_rate": 3.6325553560742073e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9950139.0, |
| "step": 9420 |
| }, |
| { |
| "entropy": 1.5780072450637816, |
| "epoch": 5.643327348892878, |
| "grad_norm": 0.036376953125, |
| "learning_rate": 3.5727109515260326e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9960664.0, |
| "step": 9430 |
| }, |
| { |
| "entropy": 1.5765936493873596, |
| "epoch": 5.649311789347696, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 3.512866546977858e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9971332.0, |
| "step": 9440 |
| }, |
| { |
| "entropy": 1.6016755819320678, |
| "epoch": 5.655296229802514, |
| "grad_norm": 0.447265625, |
| "learning_rate": 3.453022142429683e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 0.9996254682540894, |
| "num_tokens": 9981877.0, |
| "step": 9450 |
| }, |
| { |
| "entropy": 1.5063655138015748, |
| "epoch": 5.661280670257331, |
| "grad_norm": 0.103515625, |
| "learning_rate": 3.3931777378815085e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 9992375.0, |
| "step": 9460 |
| }, |
| { |
| "entropy": 1.58121200799942, |
| "epoch": 5.667265110712148, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10003041.0, |
| "step": 9470 |
| }, |
| { |
| "entropy": 1.6163565397262574, |
| "epoch": 5.673249551166966, |
| "grad_norm": 0.283203125, |
| "learning_rate": 3.2734889287851586e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10013816.0, |
| "step": 9480 |
| }, |
| { |
| "entropy": 1.5436020016670227, |
| "epoch": 5.679233991621784, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 3.213644524236984e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10024267.0, |
| "step": 9490 |
| }, |
| { |
| "entropy": 1.528675389289856, |
| "epoch": 5.685218432076601, |
| "grad_norm": 0.1953125, |
| "learning_rate": 3.153800119688809e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996453881263733, |
| "num_tokens": 10034645.0, |
| "step": 9500 |
| }, |
| { |
| "entropy": 1.5461339116096497, |
| "epoch": 5.691202872531418, |
| "grad_norm": 0.022216796875, |
| "learning_rate": 3.0939557151406345e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10045144.0, |
| "step": 9510 |
| }, |
| { |
| "entropy": 1.5538923740386963, |
| "epoch": 5.697187312986236, |
| "grad_norm": 0.04296875, |
| "learning_rate": 3.0341113105924598e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10055781.0, |
| "step": 9520 |
| }, |
| { |
| "entropy": 1.56598619222641, |
| "epoch": 5.703171753441053, |
| "grad_norm": 0.0174560546875, |
| "learning_rate": 2.974266906044285e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10066213.0, |
| "step": 9530 |
| }, |
| { |
| "entropy": 1.5303544521331787, |
| "epoch": 5.709156193895871, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 2.9144225014961104e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10076706.0, |
| "step": 9540 |
| }, |
| { |
| "entropy": 1.5925257802009583, |
| "epoch": 5.715140634350688, |
| "grad_norm": 0.037109375, |
| "learning_rate": 2.8545780969479352e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10087369.0, |
| "step": 9550 |
| }, |
| { |
| "entropy": 1.607127583026886, |
| "epoch": 5.721125074805506, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 2.7947336923997605e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996183216571808, |
| "num_tokens": 10097967.0, |
| "step": 9560 |
| }, |
| { |
| "entropy": 1.5356394171714782, |
| "epoch": 5.727109515260323, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 2.734889287851586e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10108599.0, |
| "step": 9570 |
| }, |
| { |
| "entropy": 1.6435596108436585, |
| "epoch": 5.733093955715141, |
| "grad_norm": 0.00494384765625, |
| "learning_rate": 2.675044883303411e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.999622642993927, |
| "num_tokens": 10119292.0, |
| "step": 9580 |
| }, |
| { |
| "entropy": 1.634545588493347, |
| "epoch": 5.739078396169958, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 2.6152004787552364e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10129944.0, |
| "step": 9590 |
| }, |
| { |
| "entropy": 1.569711184501648, |
| "epoch": 5.745062836624776, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 2.5553560742070617e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10140569.0, |
| "step": 9600 |
| }, |
| { |
| "entropy": 1.5555992722511292, |
| "epoch": 5.751047277079593, |
| "grad_norm": 0.0023956298828125, |
| "learning_rate": 2.495511669658887e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10151072.0, |
| "step": 9610 |
| }, |
| { |
| "entropy": 1.5273286819458007, |
| "epoch": 5.75703171753441, |
| "grad_norm": 0.029541015625, |
| "learning_rate": 2.435667265110712e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10161447.0, |
| "step": 9620 |
| }, |
| { |
| "entropy": 1.5150355458259583, |
| "epoch": 5.763016157989228, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.375822860562537e-06, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9992917656898499, |
| "num_tokens": 10171906.0, |
| "step": 9630 |
| }, |
| { |
| "entropy": 1.6199348092079162, |
| "epoch": 5.769000598444046, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 2.315978456014363e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10182715.0, |
| "step": 9640 |
| }, |
| { |
| "entropy": 1.541762149333954, |
| "epoch": 5.774985038898863, |
| "grad_norm": 0.26171875, |
| "learning_rate": 2.256134051466188e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10193368.0, |
| "step": 9650 |
| }, |
| { |
| "entropy": 1.5536392092704774, |
| "epoch": 5.78096947935368, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 2.1962896469180134e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10203739.0, |
| "step": 9660 |
| }, |
| { |
| "entropy": 1.5519014716148376, |
| "epoch": 5.786953919808498, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 2.1364452423698387e-06, |
| "loss": 0.0013, |
| "mean_token_accuracy": 0.9996515691280365, |
| "num_tokens": 10214252.0, |
| "step": 9670 |
| }, |
| { |
| "entropy": 1.553311824798584, |
| "epoch": 5.7929383602633155, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 2.076600837821664e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10224950.0, |
| "step": 9680 |
| }, |
| { |
| "entropy": 1.5511303305625916, |
| "epoch": 5.798922800718133, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 2.0167564332734893e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10235495.0, |
| "step": 9690 |
| }, |
| { |
| "entropy": 1.5440836668014526, |
| "epoch": 5.80490724117295, |
| "grad_norm": 0.005218505859375, |
| "learning_rate": 1.956912028725314e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10246127.0, |
| "step": 9700 |
| }, |
| { |
| "entropy": 1.5713715076446533, |
| "epoch": 5.810891681627767, |
| "grad_norm": 0.025390625, |
| "learning_rate": 1.8970676241771395e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10256733.0, |
| "step": 9710 |
| }, |
| { |
| "entropy": 1.5970672011375426, |
| "epoch": 5.8168761220825855, |
| "grad_norm": 0.271484375, |
| "learning_rate": 1.8372232196289648e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996240615844727, |
| "num_tokens": 10267236.0, |
| "step": 9720 |
| }, |
| { |
| "entropy": 1.556793713569641, |
| "epoch": 5.822860562537403, |
| "grad_norm": 0.029296875, |
| "learning_rate": 1.77737881508079e-06, |
| "loss": 0.0005, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10277749.0, |
| "step": 9730 |
| }, |
| { |
| "entropy": 1.6262453079223633, |
| "epoch": 5.82884500299222, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 1.7175344105326153e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10288379.0, |
| "step": 9740 |
| }, |
| { |
| "entropy": 1.495825183391571, |
| "epoch": 5.834829443447037, |
| "grad_norm": 0.02978515625, |
| "learning_rate": 1.6576900059844404e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10298897.0, |
| "step": 9750 |
| }, |
| { |
| "entropy": 1.6074238777160645, |
| "epoch": 5.8408138839018555, |
| "grad_norm": 0.05615234375, |
| "learning_rate": 1.5978456014362657e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10309496.0, |
| "step": 9760 |
| }, |
| { |
| "entropy": 1.5544676423072814, |
| "epoch": 5.846798324356673, |
| "grad_norm": 0.0030975341796875, |
| "learning_rate": 1.538001196888091e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10320081.0, |
| "step": 9770 |
| }, |
| { |
| "entropy": 1.580102515220642, |
| "epoch": 5.85278276481149, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 1.4781567923399163e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10330768.0, |
| "step": 9780 |
| }, |
| { |
| "entropy": 1.5838265538215637, |
| "epoch": 5.858767205266307, |
| "grad_norm": 0.09423828125, |
| "learning_rate": 1.4183123877917414e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10341607.0, |
| "step": 9790 |
| }, |
| { |
| "entropy": 1.5759095907211305, |
| "epoch": 5.864751645721125, |
| "grad_norm": 0.037841796875, |
| "learning_rate": 1.3584679832435667e-06, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.9996323525905609, |
| "num_tokens": 10352215.0, |
| "step": 9800 |
| }, |
| { |
| "entropy": 1.5455079555511475, |
| "epoch": 5.870736086175943, |
| "grad_norm": 0.053466796875, |
| "learning_rate": 1.2986235786953922e-06, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.9996453881263733, |
| "num_tokens": 10362808.0, |
| "step": 9810 |
| }, |
| { |
| "entropy": 1.5143304467201233, |
| "epoch": 5.87672052663076, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 1.2387791741472175e-06, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10373485.0, |
| "step": 9820 |
| }, |
| { |
| "entropy": 1.559544062614441, |
| "epoch": 5.882704967085577, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 1.1789347695990425e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10384069.0, |
| "step": 9830 |
| }, |
| { |
| "entropy": 1.5978240847587586, |
| "epoch": 5.888689407540395, |
| "grad_norm": 0.007781982421875, |
| "learning_rate": 1.1190903650508678e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10394680.0, |
| "step": 9840 |
| }, |
| { |
| "entropy": 1.5734472513198852, |
| "epoch": 5.894673847995213, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.0592459605026931e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996575355529785, |
| "num_tokens": 10405414.0, |
| "step": 9850 |
| }, |
| { |
| "entropy": 1.5193457126617431, |
| "epoch": 5.90065828845003, |
| "grad_norm": 0.1552734375, |
| "learning_rate": 9.994015559545182e-07, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10415914.0, |
| "step": 9860 |
| }, |
| { |
| "entropy": 1.6170666337013244, |
| "epoch": 5.906642728904847, |
| "grad_norm": 0.08740234375, |
| "learning_rate": 9.395571514063435e-07, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9993006944656372, |
| "num_tokens": 10426593.0, |
| "step": 9870 |
| }, |
| { |
| "entropy": 1.579539179801941, |
| "epoch": 5.912627169359665, |
| "grad_norm": 0.5703125, |
| "learning_rate": 8.797127468581688e-07, |
| "loss": 0.0014, |
| "mean_token_accuracy": 0.9996621608734131, |
| "num_tokens": 10437212.0, |
| "step": 9880 |
| }, |
| { |
| "entropy": 1.6408529162406922, |
| "epoch": 5.918611609814482, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 8.19868342309994e-07, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10447988.0, |
| "step": 9890 |
| }, |
| { |
| "entropy": 1.5982860207557679, |
| "epoch": 5.9245960502693, |
| "grad_norm": 0.00531005859375, |
| "learning_rate": 7.600239377618194e-07, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996666669845581, |
| "num_tokens": 10458522.0, |
| "step": 9900 |
| }, |
| { |
| "entropy": 1.5749751448631286, |
| "epoch": 5.930580490724117, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 7.001795332136445e-07, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10468943.0, |
| "step": 9910 |
| }, |
| { |
| "entropy": 1.5508565783500672, |
| "epoch": 5.936564931178935, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 6.403351286654698e-07, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10479652.0, |
| "step": 9920 |
| }, |
| { |
| "entropy": 1.546996283531189, |
| "epoch": 5.942549371633753, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 5.80490724117295e-07, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10490439.0, |
| "step": 9930 |
| }, |
| { |
| "entropy": 1.5694025874137878, |
| "epoch": 5.94853381208857, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 5.206463195691203e-07, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.9996415793895721, |
| "num_tokens": 10501036.0, |
| "step": 9940 |
| }, |
| { |
| "entropy": 1.52097749710083, |
| "epoch": 5.954518252543387, |
| "grad_norm": 0.009521484375, |
| "learning_rate": 4.6080191502094555e-07, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10511443.0, |
| "step": 9950 |
| }, |
| { |
| "entropy": 1.5758405923843384, |
| "epoch": 5.960502692998205, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 4.009575104727708e-07, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.9996402859687805, |
| "num_tokens": 10521977.0, |
| "step": 9960 |
| }, |
| { |
| "entropy": 1.5352561831474305, |
| "epoch": 5.966487133453022, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 3.411131059245961e-07, |
| "loss": 0.0006, |
| "mean_token_accuracy": 0.9996710538864135, |
| "num_tokens": 10532647.0, |
| "step": 9970 |
| }, |
| { |
| "entropy": 1.564480447769165, |
| "epoch": 5.97247157390784, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 2.812687013764213e-07, |
| "loss": 0.0012, |
| "mean_token_accuracy": 0.9996138989925385, |
| "num_tokens": 10543219.0, |
| "step": 9980 |
| }, |
| { |
| "entropy": 1.5791572093963624, |
| "epoch": 5.978456014362657, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 2.2142429682824658e-07, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10553793.0, |
| "step": 9990 |
| }, |
| { |
| "entropy": 1.5388386368751525, |
| "epoch": 5.9844404548174746, |
| "grad_norm": 0.04833984375, |
| "learning_rate": 1.6157989228007181e-07, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10564325.0, |
| "step": 10000 |
| }, |
| { |
| "entropy": 1.4956823945045472, |
| "epoch": 5.990424895272292, |
| "grad_norm": 0.154296875, |
| "learning_rate": 1.0173548773189707e-07, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10574730.0, |
| "step": 10010 |
| }, |
| { |
| "entropy": 1.491278338432312, |
| "epoch": 5.99640933572711, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 4.1891083183722324e-08, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 10585227.0, |
| "step": 10020 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10026, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.72830388146217e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|