| { |
| "best_global_step": 7000, |
| "best_metric": 0.19375726580619812, |
| "best_model_checkpoint": "./sft_model/checkpoint-7000", |
| "epoch": 3.9525691699604746, |
| "eval_steps": 500, |
| "global_step": 7000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.6964506268501283, |
| "epoch": 0.00282326369282891, |
| "grad_norm": 1879.6842041015625, |
| "learning_rate": 4.999999527987105e-06, |
| "loss": 3.1196, |
| "mean_token_accuracy": 0.5124428987503051, |
| "num_tokens": 40784.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.8763649463653564, |
| "epoch": 0.00564652738565782, |
| "grad_norm": 538.1932983398438, |
| "learning_rate": 4.999997610435124e-06, |
| "loss": 1.8844, |
| "mean_token_accuracy": 0.5794065773487092, |
| "num_tokens": 81605.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.5137803554534912, |
| "epoch": 0.00846979107848673, |
| "grad_norm": 298.7164611816406, |
| "learning_rate": 4.99999421784476e-06, |
| "loss": 1.3327, |
| "mean_token_accuracy": 0.6726739525794982, |
| "num_tokens": 122393.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.8568676948547362, |
| "epoch": 0.01129305477131564, |
| "grad_norm": 846.4222412109375, |
| "learning_rate": 4.9999893502186794e-06, |
| "loss": 1.7569, |
| "mean_token_accuracy": 0.6106071174144745, |
| "num_tokens": 162845.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.479284644126892, |
| "epoch": 0.014116318464144552, |
| "grad_norm": 303.95062255859375, |
| "learning_rate": 4.999983007560715e-06, |
| "loss": 1.2604, |
| "mean_token_accuracy": 0.6862282872200012, |
| "num_tokens": 203476.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.3392301559448243, |
| "epoch": 0.01693958215697346, |
| "grad_norm": 218.93763732910156, |
| "learning_rate": 4.999975189875853e-06, |
| "loss": 1.1434, |
| "mean_token_accuracy": 0.711875069141388, |
| "num_tokens": 244047.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.3543478012084962, |
| "epoch": 0.019762845849802372, |
| "grad_norm": 238.82598876953125, |
| "learning_rate": 4.999965897170247e-06, |
| "loss": 1.1883, |
| "mean_token_accuracy": 0.7010170936584472, |
| "num_tokens": 284628.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.4084251165390014, |
| "epoch": 0.02258610954263128, |
| "grad_norm": 258.6508483886719, |
| "learning_rate": 4.999955129451204e-06, |
| "loss": 1.2144, |
| "mean_token_accuracy": 0.6951952338218689, |
| "num_tokens": 325111.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.4644812107086183, |
| "epoch": 0.025409373235460192, |
| "grad_norm": 236.89532470703125, |
| "learning_rate": 4.999942886727197e-06, |
| "loss": 1.1666, |
| "mean_token_accuracy": 0.7105009198188782, |
| "num_tokens": 365621.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.4601025104522705, |
| "epoch": 0.028232636928289104, |
| "grad_norm": 234.15811157226562, |
| "learning_rate": 4.999929169007857e-06, |
| "loss": 1.2064, |
| "mean_token_accuracy": 0.7012509942054749, |
| "num_tokens": 406422.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.385688352584839, |
| "epoch": 0.031055900621118012, |
| "grad_norm": 202.43612670898438, |
| "learning_rate": 4.999913976303975e-06, |
| "loss": 1.168, |
| "mean_token_accuracy": 0.705574119091034, |
| "num_tokens": 447053.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 1.329136037826538, |
| "epoch": 0.03387916431394692, |
| "grad_norm": 207.84481811523438, |
| "learning_rate": 4.9998973086275025e-06, |
| "loss": 1.0843, |
| "mean_token_accuracy": 0.7196493268013, |
| "num_tokens": 487480.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.398281502723694, |
| "epoch": 0.03670242800677583, |
| "grad_norm": 174.87757873535156, |
| "learning_rate": 4.999879165991553e-06, |
| "loss": 1.1706, |
| "mean_token_accuracy": 0.7041667461395263, |
| "num_tokens": 528088.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 1.43794903755188, |
| "epoch": 0.039525691699604744, |
| "grad_norm": 196.69613647460938, |
| "learning_rate": 4.999859548410398e-06, |
| "loss": 1.1689, |
| "mean_token_accuracy": 0.7039241075515748, |
| "num_tokens": 568747.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.3418872117996217, |
| "epoch": 0.042348955392433656, |
| "grad_norm": 174.0625457763672, |
| "learning_rate": 4.999838455899471e-06, |
| "loss": 1.0773, |
| "mean_token_accuracy": 0.723293948173523, |
| "num_tokens": 609459.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.4742592096328735, |
| "epoch": 0.04517221908526256, |
| "grad_norm": 193.97073364257812, |
| "learning_rate": 4.999815888475366e-06, |
| "loss": 1.1699, |
| "mean_token_accuracy": 0.7035363435745239, |
| "num_tokens": 650270.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.397952675819397, |
| "epoch": 0.04799548277809147, |
| "grad_norm": 191.52386474609375, |
| "learning_rate": 4.999791846155835e-06, |
| "loss": 1.1596, |
| "mean_token_accuracy": 0.7045308589935303, |
| "num_tokens": 690754.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 1.3352545976638794, |
| "epoch": 0.050818746470920384, |
| "grad_norm": 181.89520263671875, |
| "learning_rate": 4.999766328959792e-06, |
| "loss": 1.1273, |
| "mean_token_accuracy": 0.7095789313316345, |
| "num_tokens": 731281.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.2845103740692139, |
| "epoch": 0.053642010163749296, |
| "grad_norm": 195.17857360839844, |
| "learning_rate": 4.999739336907312e-06, |
| "loss": 1.0823, |
| "mean_token_accuracy": 0.7215634346008301, |
| "num_tokens": 770817.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 1.2769271850585937, |
| "epoch": 0.05646527385657821, |
| "grad_norm": 177.11135864257812, |
| "learning_rate": 4.999710870019629e-06, |
| "loss": 1.0288, |
| "mean_token_accuracy": 0.7328497529029846, |
| "num_tokens": 811654.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.3198055982589723, |
| "epoch": 0.05928853754940711, |
| "grad_norm": 183.05325317382812, |
| "learning_rate": 4.9996809283191375e-06, |
| "loss": 1.1065, |
| "mean_token_accuracy": 0.7168498754501342, |
| "num_tokens": 852176.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 1.2998111486434936, |
| "epoch": 0.062111801242236024, |
| "grad_norm": 173.55929565429688, |
| "learning_rate": 4.999649511829392e-06, |
| "loss": 1.0804, |
| "mean_token_accuracy": 0.7216156601905823, |
| "num_tokens": 892923.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.3105514764785766, |
| "epoch": 0.06493506493506493, |
| "grad_norm": 231.638427734375, |
| "learning_rate": 4.9996166205751075e-06, |
| "loss": 1.091, |
| "mean_token_accuracy": 0.7160016298294067, |
| "num_tokens": 933390.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 1.3287595510482788, |
| "epoch": 0.06775832862789384, |
| "grad_norm": 195.63812255859375, |
| "learning_rate": 4.9995822545821596e-06, |
| "loss": 1.113, |
| "mean_token_accuracy": 0.7149634003639221, |
| "num_tokens": 974070.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.363778567314148, |
| "epoch": 0.07058159232072275, |
| "grad_norm": 162.81434631347656, |
| "learning_rate": 4.999546413877584e-06, |
| "loss": 1.0972, |
| "mean_token_accuracy": 0.7180424332618713, |
| "num_tokens": 1014869.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.3109471559524537, |
| "epoch": 0.07340485601355166, |
| "grad_norm": 159.2827911376953, |
| "learning_rate": 4.999509098489574e-06, |
| "loss": 1.0542, |
| "mean_token_accuracy": 0.7276699781417847, |
| "num_tokens": 1055497.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.3256555795669556, |
| "epoch": 0.07622811970638058, |
| "grad_norm": 165.77072143554688, |
| "learning_rate": 4.999470308447488e-06, |
| "loss": 1.1039, |
| "mean_token_accuracy": 0.7153335094451905, |
| "num_tokens": 1096257.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 1.3324081659317017, |
| "epoch": 0.07905138339920949, |
| "grad_norm": 172.11192321777344, |
| "learning_rate": 4.99943004378184e-06, |
| "loss": 1.1212, |
| "mean_token_accuracy": 0.7077261924743652, |
| "num_tokens": 1136940.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.1795243740081787, |
| "epoch": 0.0818746470920384, |
| "grad_norm": 158.4144287109375, |
| "learning_rate": 4.999388304524306e-06, |
| "loss": 0.9704, |
| "mean_token_accuracy": 0.7433805704116822, |
| "num_tokens": 1177564.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 1.3331533193588256, |
| "epoch": 0.08469791078486731, |
| "grad_norm": 151.61849975585938, |
| "learning_rate": 4.999345090707721e-06, |
| "loss": 1.0556, |
| "mean_token_accuracy": 0.7271691083908081, |
| "num_tokens": 1218245.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.3680028915405273, |
| "epoch": 0.08752117447769622, |
| "grad_norm": 155.4457244873047, |
| "learning_rate": 4.999300402366083e-06, |
| "loss": 1.0887, |
| "mean_token_accuracy": 0.7194394826889038, |
| "num_tokens": 1259004.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 1.3031083345413208, |
| "epoch": 0.09034443817052512, |
| "grad_norm": 148.3616180419922, |
| "learning_rate": 4.999254239534546e-06, |
| "loss": 1.0329, |
| "mean_token_accuracy": 0.734139358997345, |
| "num_tokens": 1299704.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.240130877494812, |
| "epoch": 0.09316770186335403, |
| "grad_norm": 158.9844970703125, |
| "learning_rate": 4.999206602249426e-06, |
| "loss": 0.9971, |
| "mean_token_accuracy": 0.7394698977470398, |
| "num_tokens": 1340262.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 1.3012992858886718, |
| "epoch": 0.09599096555618294, |
| "grad_norm": 168.62123107910156, |
| "learning_rate": 4.999157490548199e-06, |
| "loss": 1.0457, |
| "mean_token_accuracy": 0.7285740613937378, |
| "num_tokens": 1380931.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.380574369430542, |
| "epoch": 0.09881422924901186, |
| "grad_norm": 159.16758728027344, |
| "learning_rate": 4.999106904469501e-06, |
| "loss": 1.0219, |
| "mean_token_accuracy": 0.7346728801727295, |
| "num_tokens": 1421612.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.352856206893921, |
| "epoch": 0.10163749294184077, |
| "grad_norm": 167.45785522460938, |
| "learning_rate": 4.999054844053126e-06, |
| "loss": 1.0948, |
| "mean_token_accuracy": 0.7178587436676025, |
| "num_tokens": 1462305.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.2682753562927247, |
| "epoch": 0.10446075663466968, |
| "grad_norm": 141.07215881347656, |
| "learning_rate": 4.9990013093400315e-06, |
| "loss": 1.0059, |
| "mean_token_accuracy": 0.7368964791297913, |
| "num_tokens": 1502284.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 1.2629669904708862, |
| "epoch": 0.10728402032749859, |
| "grad_norm": 175.1353302001953, |
| "learning_rate": 4.998946300372331e-06, |
| "loss": 0.9657, |
| "mean_token_accuracy": 0.7435346961021423, |
| "num_tokens": 1543011.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.3042139530181884, |
| "epoch": 0.1101072840203275, |
| "grad_norm": 138.55645751953125, |
| "learning_rate": 4.998889817193298e-06, |
| "loss": 1.0102, |
| "mean_token_accuracy": 0.7376075625419617, |
| "num_tokens": 1583669.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 1.3252872467041015, |
| "epoch": 0.11293054771315642, |
| "grad_norm": 328.7073669433594, |
| "learning_rate": 4.998831859847371e-06, |
| "loss": 1.0797, |
| "mean_token_accuracy": 0.718978488445282, |
| "num_tokens": 1624164.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.3184748888015747, |
| "epoch": 0.11575381140598531, |
| "grad_norm": 140.2963104248047, |
| "learning_rate": 4.998772428380142e-06, |
| "loss": 1.0262, |
| "mean_token_accuracy": 0.7362207889556884, |
| "num_tokens": 1664679.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 1.419853186607361, |
| "epoch": 0.11857707509881422, |
| "grad_norm": 146.3859100341797, |
| "learning_rate": 4.9987115228383654e-06, |
| "loss": 1.1075, |
| "mean_token_accuracy": 0.7169888734817504, |
| "num_tokens": 1705222.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.3418590307235718, |
| "epoch": 0.12140033879164314, |
| "grad_norm": 165.43157958984375, |
| "learning_rate": 4.9986491432699544e-06, |
| "loss": 1.0783, |
| "mean_token_accuracy": 0.7195831775665283, |
| "num_tokens": 1745798.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 1.4498067855834962, |
| "epoch": 0.12422360248447205, |
| "grad_norm": 168.71444702148438, |
| "learning_rate": 4.998585289723983e-06, |
| "loss": 1.0693, |
| "mean_token_accuracy": 0.728911018371582, |
| "num_tokens": 1786323.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.3513702630996705, |
| "epoch": 0.12704686617730096, |
| "grad_norm": 141.62327575683594, |
| "learning_rate": 4.9985199622506835e-06, |
| "loss": 1.0801, |
| "mean_token_accuracy": 0.7193790435791015, |
| "num_tokens": 1827028.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.1918700218200684, |
| "epoch": 0.12987012987012986, |
| "grad_norm": 147.93051147460938, |
| "learning_rate": 4.998453160901449e-06, |
| "loss": 0.9504, |
| "mean_token_accuracy": 0.7491826415061951, |
| "num_tokens": 1867817.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.3269594192504883, |
| "epoch": 0.13269339356295878, |
| "grad_norm": 139.9858856201172, |
| "learning_rate": 4.99838488572883e-06, |
| "loss": 1.0753, |
| "mean_token_accuracy": 0.7226388096809387, |
| "num_tokens": 1908308.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 1.3837014436721802, |
| "epoch": 0.13551665725578768, |
| "grad_norm": 139.34576416015625, |
| "learning_rate": 4.998315136786539e-06, |
| "loss": 1.0673, |
| "mean_token_accuracy": 0.7262170910835266, |
| "num_tokens": 1948948.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.2679001569747925, |
| "epoch": 0.1383399209486166, |
| "grad_norm": 115.1655502319336, |
| "learning_rate": 4.998243914129446e-06, |
| "loss": 1.0151, |
| "mean_token_accuracy": 0.736802589893341, |
| "num_tokens": 1989686.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 1.3858689785003662, |
| "epoch": 0.1411631846414455, |
| "grad_norm": 198.03237915039062, |
| "learning_rate": 4.99817121781358e-06, |
| "loss": 1.085, |
| "mean_token_accuracy": 0.7183008790016174, |
| "num_tokens": 2030103.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.3685580015182495, |
| "epoch": 0.14398644833427443, |
| "grad_norm": 134.53562927246094, |
| "learning_rate": 4.998097047896133e-06, |
| "loss": 1.0412, |
| "mean_token_accuracy": 0.7265231609344482, |
| "num_tokens": 2070842.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 1.3303281307220458, |
| "epoch": 0.14680971202710333, |
| "grad_norm": 144.44908142089844, |
| "learning_rate": 4.998021404435452e-06, |
| "loss": 1.0653, |
| "mean_token_accuracy": 0.7241496682167053, |
| "num_tokens": 2110742.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.276755118370056, |
| "epoch": 0.14963297571993225, |
| "grad_norm": 139.25717163085938, |
| "learning_rate": 4.997944287491046e-06, |
| "loss": 0.9689, |
| "mean_token_accuracy": 0.7442183256149292, |
| "num_tokens": 2151458.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 1.3310250043869019, |
| "epoch": 0.15245623941276115, |
| "grad_norm": 143.90011596679688, |
| "learning_rate": 4.997865697123579e-06, |
| "loss": 1.0313, |
| "mean_token_accuracy": 0.7334769487380981, |
| "num_tokens": 2192262.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.3676940202713013, |
| "epoch": 0.15527950310559005, |
| "grad_norm": 154.62408447265625, |
| "learning_rate": 4.99778563339488e-06, |
| "loss": 1.0256, |
| "mean_token_accuracy": 0.7308215498924255, |
| "num_tokens": 2232853.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 1.2920289278030395, |
| "epoch": 0.15810276679841898, |
| "grad_norm": 145.27914428710938, |
| "learning_rate": 4.997704096367933e-06, |
| "loss": 0.97, |
| "mean_token_accuracy": 0.748300063610077, |
| "num_tokens": 2273536.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.3298648118972778, |
| "epoch": 0.16092603049124787, |
| "grad_norm": 127.40184783935547, |
| "learning_rate": 4.997621086106883e-06, |
| "loss": 1.0013, |
| "mean_token_accuracy": 0.7374109506607056, |
| "num_tokens": 2314211.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 1.3977973222732545, |
| "epoch": 0.1637492941840768, |
| "grad_norm": 147.5937042236328, |
| "learning_rate": 4.997536602677031e-06, |
| "loss": 1.0307, |
| "mean_token_accuracy": 0.7304450035095215, |
| "num_tokens": 2355018.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.5217002391815186, |
| "epoch": 0.1665725578769057, |
| "grad_norm": 148.3541259765625, |
| "learning_rate": 4.997450646144843e-06, |
| "loss": 1.1454, |
| "mean_token_accuracy": 0.7052842378616333, |
| "num_tokens": 2394833.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 1.2209747552871704, |
| "epoch": 0.16939582156973462, |
| "grad_norm": 128.4432830810547, |
| "learning_rate": 4.997363216577937e-06, |
| "loss": 0.9285, |
| "mean_token_accuracy": 0.7535837411880493, |
| "num_tokens": 2435708.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.3434046506881714, |
| "epoch": 0.17221908526256352, |
| "grad_norm": 120.20564270019531, |
| "learning_rate": 4.997274314045093e-06, |
| "loss": 1.0367, |
| "mean_token_accuracy": 0.7294707536697388, |
| "num_tokens": 2476300.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 1.2988000869750977, |
| "epoch": 0.17504234895539245, |
| "grad_norm": 135.5797882080078, |
| "learning_rate": 4.9971839386162505e-06, |
| "loss": 1.008, |
| "mean_token_accuracy": 0.7358855485916138, |
| "num_tokens": 2517029.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.2970911979675293, |
| "epoch": 0.17786561264822134, |
| "grad_norm": 127.42134094238281, |
| "learning_rate": 4.997092090362506e-06, |
| "loss": 0.9862, |
| "mean_token_accuracy": 0.7406257271766663, |
| "num_tokens": 2557648.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 1.3222782850265502, |
| "epoch": 0.18068887634105024, |
| "grad_norm": 132.6872100830078, |
| "learning_rate": 4.996998769356116e-06, |
| "loss": 1.0062, |
| "mean_token_accuracy": 0.7342755556106567, |
| "num_tokens": 2598313.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.3237468957901002, |
| "epoch": 0.18351214003387917, |
| "grad_norm": 115.4422378540039, |
| "learning_rate": 4.996903975670495e-06, |
| "loss": 1.0351, |
| "mean_token_accuracy": 0.7323682546615601, |
| "num_tokens": 2638879.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 1.297943377494812, |
| "epoch": 0.18633540372670807, |
| "grad_norm": 142.4702911376953, |
| "learning_rate": 4.996807709380216e-06, |
| "loss": 0.9848, |
| "mean_token_accuracy": 0.7394223213195801, |
| "num_tokens": 2679280.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.3547947645187377, |
| "epoch": 0.189158667419537, |
| "grad_norm": 142.83367919921875, |
| "learning_rate": 4.996709970561011e-06, |
| "loss": 1.0522, |
| "mean_token_accuracy": 0.7255360841751098, |
| "num_tokens": 2719742.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 1.4270910263061523, |
| "epoch": 0.1919819311123659, |
| "grad_norm": 137.50009155273438, |
| "learning_rate": 4.996610759289769e-06, |
| "loss": 1.0668, |
| "mean_token_accuracy": 0.7272424578666687, |
| "num_tokens": 2760484.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.3766878366470336, |
| "epoch": 0.19480519480519481, |
| "grad_norm": 130.84567260742188, |
| "learning_rate": 4.9965100756445385e-06, |
| "loss": 1.0055, |
| "mean_token_accuracy": 0.7384996533393859, |
| "num_tokens": 2801264.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 1.306664514541626, |
| "epoch": 0.1976284584980237, |
| "grad_norm": 122.48930358886719, |
| "learning_rate": 4.996407919704527e-06, |
| "loss": 0.9605, |
| "mean_token_accuracy": 0.7464729905128479, |
| "num_tokens": 2842004.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.4950592756271361, |
| "epoch": 0.20045172219085264, |
| "grad_norm": 145.43069458007812, |
| "learning_rate": 4.9963042915500966e-06, |
| "loss": 1.055, |
| "mean_token_accuracy": 0.7293013811111451, |
| "num_tokens": 2882520.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 1.2889564514160157, |
| "epoch": 0.20327498588368154, |
| "grad_norm": 119.11095428466797, |
| "learning_rate": 4.996199191262775e-06, |
| "loss": 0.9282, |
| "mean_token_accuracy": 0.7534731745719909, |
| "num_tokens": 2923041.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.2990325212478637, |
| "epoch": 0.20609824957651043, |
| "grad_norm": 128.93939208984375, |
| "learning_rate": 4.99609261892524e-06, |
| "loss": 0.9558, |
| "mean_token_accuracy": 0.7469653010368347, |
| "num_tokens": 2963601.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 1.325394630432129, |
| "epoch": 0.20892151326933936, |
| "grad_norm": 127.29461669921875, |
| "learning_rate": 4.995984574621332e-06, |
| "loss": 0.9955, |
| "mean_token_accuracy": 0.7355307936668396, |
| "num_tokens": 3004286.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.438259530067444, |
| "epoch": 0.21174477696216826, |
| "grad_norm": 152.51705932617188, |
| "learning_rate": 4.995875058436047e-06, |
| "loss": 1.0516, |
| "mean_token_accuracy": 0.729459798336029, |
| "num_tokens": 3045092.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 1.2735749006271362, |
| "epoch": 0.21456804065499718, |
| "grad_norm": 167.75277709960938, |
| "learning_rate": 4.995764070455542e-06, |
| "loss": 0.9568, |
| "mean_token_accuracy": 0.7488296508789063, |
| "num_tokens": 3086004.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.339252519607544, |
| "epoch": 0.21739130434782608, |
| "grad_norm": 137.4264678955078, |
| "learning_rate": 4.995651610767128e-06, |
| "loss": 1.0154, |
| "mean_token_accuracy": 0.7321458339691163, |
| "num_tokens": 3126547.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 1.2893430948257447, |
| "epoch": 0.220214568040655, |
| "grad_norm": 126.54104614257812, |
| "learning_rate": 4.995537679459277e-06, |
| "loss": 0.9589, |
| "mean_token_accuracy": 0.7475377321243286, |
| "num_tokens": 3167244.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.2430254459381103, |
| "epoch": 0.2230378317334839, |
| "grad_norm": 119.59004974365234, |
| "learning_rate": 4.995422276621617e-06, |
| "loss": 0.8992, |
| "mean_token_accuracy": 0.757983124256134, |
| "num_tokens": 3207925.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 1.2922404289245606, |
| "epoch": 0.22586109542631283, |
| "grad_norm": 148.28298950195312, |
| "learning_rate": 4.995305402344933e-06, |
| "loss": 1.0058, |
| "mean_token_accuracy": 0.7375632762908936, |
| "num_tokens": 3248528.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.3221865177154541, |
| "epoch": 0.22868435911914173, |
| "grad_norm": 126.97147369384766, |
| "learning_rate": 4.995187056721171e-06, |
| "loss": 0.9874, |
| "mean_token_accuracy": 0.7407760977745056, |
| "num_tokens": 3289340.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 1.227053427696228, |
| "epoch": 0.23150762281197063, |
| "grad_norm": 113.13875579833984, |
| "learning_rate": 4.99506723984343e-06, |
| "loss": 0.8947, |
| "mean_token_accuracy": 0.760028600692749, |
| "num_tokens": 3329952.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.2992219924926758, |
| "epoch": 0.23433088650479955, |
| "grad_norm": 101.6119155883789, |
| "learning_rate": 4.994945951805969e-06, |
| "loss": 0.9453, |
| "mean_token_accuracy": 0.7457529783248902, |
| "num_tokens": 3370598.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 1.3179654836654664, |
| "epoch": 0.23715415019762845, |
| "grad_norm": 117.23041534423828, |
| "learning_rate": 4.994823192704205e-06, |
| "loss": 0.9681, |
| "mean_token_accuracy": 0.745925772190094, |
| "num_tokens": 3411148.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.2884270429611206, |
| "epoch": 0.23997741389045738, |
| "grad_norm": 118.79703521728516, |
| "learning_rate": 4.994698962634709e-06, |
| "loss": 0.9963, |
| "mean_token_accuracy": 0.7392920970916748, |
| "num_tokens": 3451712.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 1.289307141304016, |
| "epoch": 0.24280067758328627, |
| "grad_norm": 128.40139770507812, |
| "learning_rate": 4.994573261695213e-06, |
| "loss": 0.9873, |
| "mean_token_accuracy": 0.7425390005111694, |
| "num_tokens": 3492409.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.3000454664230348, |
| "epoch": 0.2456239412761152, |
| "grad_norm": 115.0731430053711, |
| "learning_rate": 4.9944460899846044e-06, |
| "loss": 0.9402, |
| "mean_token_accuracy": 0.7487621665000915, |
| "num_tokens": 3533099.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 1.3730944633483886, |
| "epoch": 0.2484472049689441, |
| "grad_norm": 123.5392837524414, |
| "learning_rate": 4.994317447602927e-06, |
| "loss": 1.0273, |
| "mean_token_accuracy": 0.7346350908279419, |
| "num_tokens": 3573865.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.3912369728088378, |
| "epoch": 0.251270468661773, |
| "grad_norm": 136.06275939941406, |
| "learning_rate": 4.994187334651382e-06, |
| "loss": 1.0047, |
| "mean_token_accuracy": 0.7363410830497742, |
| "num_tokens": 3614426.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 1.363344144821167, |
| "epoch": 0.2540937323546019, |
| "grad_norm": 120.8814468383789, |
| "learning_rate": 4.994055751232329e-06, |
| "loss": 0.9789, |
| "mean_token_accuracy": 0.7418818831443786, |
| "num_tokens": 3654639.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.1486493587493896, |
| "epoch": 0.25691699604743085, |
| "grad_norm": 99.43315124511719, |
| "learning_rate": 4.993922697449282e-06, |
| "loss": 0.8197, |
| "mean_token_accuracy": 0.7789113402366639, |
| "num_tokens": 3695164.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 1.262733268737793, |
| "epoch": 0.2597402597402597, |
| "grad_norm": 116.33341217041016, |
| "learning_rate": 4.993788173406913e-06, |
| "loss": 0.9351, |
| "mean_token_accuracy": 0.7525886178016663, |
| "num_tokens": 3736048.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.3816607236862182, |
| "epoch": 0.26256352343308864, |
| "grad_norm": 110.50990295410156, |
| "learning_rate": 4.9936521792110505e-06, |
| "loss": 0.9687, |
| "mean_token_accuracy": 0.7439742684364319, |
| "num_tokens": 3776672.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 1.2183295249938966, |
| "epoch": 0.26538678712591757, |
| "grad_norm": 123.17945861816406, |
| "learning_rate": 4.99351471496868e-06, |
| "loss": 0.8336, |
| "mean_token_accuracy": 0.7743356227874756, |
| "num_tokens": 3816553.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.3230255365371704, |
| "epoch": 0.2682100508187465, |
| "grad_norm": 112.20722198486328, |
| "learning_rate": 4.993375780787942e-06, |
| "loss": 0.9225, |
| "mean_token_accuracy": 0.7531169533729554, |
| "num_tokens": 3857251.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 1.280331516265869, |
| "epoch": 0.27103331451157536, |
| "grad_norm": 101.07537841796875, |
| "learning_rate": 4.993235376778135e-06, |
| "loss": 0.9109, |
| "mean_token_accuracy": 0.7576413989067078, |
| "num_tokens": 3897861.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.2946168661117554, |
| "epoch": 0.2738565782044043, |
| "grad_norm": 117.32846069335938, |
| "learning_rate": 4.993093503049714e-06, |
| "loss": 0.9609, |
| "mean_token_accuracy": 0.7459181666374206, |
| "num_tokens": 3938460.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 1.4364691019058227, |
| "epoch": 0.2766798418972332, |
| "grad_norm": 123.10442352294922, |
| "learning_rate": 4.992950159714288e-06, |
| "loss": 1.0288, |
| "mean_token_accuracy": 0.7280943512916564, |
| "num_tokens": 3978929.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.4275231122970582, |
| "epoch": 0.2795031055900621, |
| "grad_norm": 140.68606567382812, |
| "learning_rate": 4.992805346884624e-06, |
| "loss": 1.0296, |
| "mean_token_accuracy": 0.7304377675056457, |
| "num_tokens": 4019194.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 1.2917508125305175, |
| "epoch": 0.282326369282891, |
| "grad_norm": 104.26236724853516, |
| "learning_rate": 4.992659064674645e-06, |
| "loss": 0.9386, |
| "mean_token_accuracy": 0.7486904263496399, |
| "num_tokens": 4059701.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.282326369282891, |
| "eval_entropy": 1.3246455669403077, |
| "eval_loss": 0.9719027876853943, |
| "eval_mean_token_accuracy": 0.7504606604576111, |
| "eval_num_tokens": 4059701.0, |
| "eval_runtime": 2.4542, |
| "eval_samples_per_second": 15.891, |
| "eval_steps_per_second": 2.037, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.2074965000152589, |
| "epoch": 0.28514963297571994, |
| "grad_norm": 116.08982849121094, |
| "learning_rate": 4.992511313199429e-06, |
| "loss": 0.911, |
| "mean_token_accuracy": 0.7566827893257141, |
| "num_tokens": 4100589.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 1.1781744480133056, |
| "epoch": 0.28797289666854886, |
| "grad_norm": 117.98184204101562, |
| "learning_rate": 4.99236209257521e-06, |
| "loss": 0.8898, |
| "mean_token_accuracy": 0.761328113079071, |
| "num_tokens": 4141263.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.341305184364319, |
| "epoch": 0.29079616036137773, |
| "grad_norm": 130.23411560058594, |
| "learning_rate": 4.992211402919379e-06, |
| "loss": 0.9646, |
| "mean_token_accuracy": 0.748078465461731, |
| "num_tokens": 4181932.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 1.2573811054229735, |
| "epoch": 0.29361942405420666, |
| "grad_norm": 108.79557800292969, |
| "learning_rate": 4.992059244350481e-06, |
| "loss": 0.9193, |
| "mean_token_accuracy": 0.7583318710327148, |
| "num_tokens": 4222684.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.2942801475524903, |
| "epoch": 0.2964426877470356, |
| "grad_norm": 113.74977111816406, |
| "learning_rate": 4.991905616988217e-06, |
| "loss": 0.9397, |
| "mean_token_accuracy": 0.7517918109893799, |
| "num_tokens": 4263508.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 1.2990548849105834, |
| "epoch": 0.2992659514398645, |
| "grad_norm": 105.88493347167969, |
| "learning_rate": 4.991750520953445e-06, |
| "loss": 0.9634, |
| "mean_token_accuracy": 0.7489338755607605, |
| "num_tokens": 4304250.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.265447235107422, |
| "epoch": 0.3020892151326934, |
| "grad_norm": 117.12853240966797, |
| "learning_rate": 4.991593956368177e-06, |
| "loss": 0.9167, |
| "mean_token_accuracy": 0.7550532460212708, |
| "num_tokens": 4344911.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 1.229043436050415, |
| "epoch": 0.3049124788255223, |
| "grad_norm": 116.68711853027344, |
| "learning_rate": 4.9914359233555795e-06, |
| "loss": 0.855, |
| "mean_token_accuracy": 0.7696425437927246, |
| "num_tokens": 4385532.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.3290831565856933, |
| "epoch": 0.30773574251835123, |
| "grad_norm": 112.86345672607422, |
| "learning_rate": 4.991276422039976e-06, |
| "loss": 0.987, |
| "mean_token_accuracy": 0.7430547952651978, |
| "num_tokens": 4426255.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 1.2172236442565918, |
| "epoch": 0.3105590062111801, |
| "grad_norm": 109.52008819580078, |
| "learning_rate": 4.9911154525468446e-06, |
| "loss": 0.8654, |
| "mean_token_accuracy": 0.7682685256004333, |
| "num_tokens": 4466692.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.3038493156433106, |
| "epoch": 0.313382269904009, |
| "grad_norm": 106.18077087402344, |
| "learning_rate": 4.990953015002817e-06, |
| "loss": 0.9641, |
| "mean_token_accuracy": 0.7453461289405823, |
| "num_tokens": 4507436.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 1.3330723762512207, |
| "epoch": 0.31620553359683795, |
| "grad_norm": 120.44703674316406, |
| "learning_rate": 4.990789109535681e-06, |
| "loss": 0.991, |
| "mean_token_accuracy": 0.7382645726203918, |
| "num_tokens": 4548162.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.3030031204223633, |
| "epoch": 0.3190287972896669, |
| "grad_norm": 128.5010986328125, |
| "learning_rate": 4.99062373627438e-06, |
| "loss": 0.9184, |
| "mean_token_accuracy": 0.755344557762146, |
| "num_tokens": 4588651.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 1.3902093887329101, |
| "epoch": 0.32185206098249575, |
| "grad_norm": 111.37005615234375, |
| "learning_rate": 4.990456895349011e-06, |
| "loss": 0.988, |
| "mean_token_accuracy": 0.74136883020401, |
| "num_tokens": 4629315.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.3531365394592285, |
| "epoch": 0.3246753246753247, |
| "grad_norm": 110.8538589477539, |
| "learning_rate": 4.9902885868908264e-06, |
| "loss": 0.9974, |
| "mean_token_accuracy": 0.7447136282920838, |
| "num_tokens": 4670100.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 1.3364722967147826, |
| "epoch": 0.3274985883681536, |
| "grad_norm": 119.34349822998047, |
| "learning_rate": 4.990118811032231e-06, |
| "loss": 0.971, |
| "mean_token_accuracy": 0.742649781703949, |
| "num_tokens": 4710465.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.232515549659729, |
| "epoch": 0.33032185206098247, |
| "grad_norm": 120.2654800415039, |
| "learning_rate": 4.989947567906786e-06, |
| "loss": 0.9344, |
| "mean_token_accuracy": 0.7535375952720642, |
| "num_tokens": 4751125.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 1.2825552940368652, |
| "epoch": 0.3331451157538114, |
| "grad_norm": 121.9236831665039, |
| "learning_rate": 4.9897748576492065e-06, |
| "loss": 0.9342, |
| "mean_token_accuracy": 0.7499042987823487, |
| "num_tokens": 4791855.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.3502115964889527, |
| "epoch": 0.3359683794466403, |
| "grad_norm": 117.69750213623047, |
| "learning_rate": 4.9896006803953615e-06, |
| "loss": 0.9549, |
| "mean_token_accuracy": 0.748097813129425, |
| "num_tokens": 4831821.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 1.3454943418502807, |
| "epoch": 0.33879164313946925, |
| "grad_norm": 110.03791046142578, |
| "learning_rate": 4.9894250362822735e-06, |
| "loss": 0.9892, |
| "mean_token_accuracy": 0.7411907076835632, |
| "num_tokens": 4872146.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.338779044151306, |
| "epoch": 0.3416149068322981, |
| "grad_norm": 126.85265350341797, |
| "learning_rate": 4.989247925448122e-06, |
| "loss": 0.9397, |
| "mean_token_accuracy": 0.7515046119689941, |
| "num_tokens": 4912659.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 1.3164394855499268, |
| "epoch": 0.34443817052512704, |
| "grad_norm": 128.91824340820312, |
| "learning_rate": 4.989069348032234e-06, |
| "loss": 0.8567, |
| "mean_token_accuracy": 0.7691178798675538, |
| "num_tokens": 4952825.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.2950055837631225, |
| "epoch": 0.34726143421795597, |
| "grad_norm": 102.4195556640625, |
| "learning_rate": 4.988889304175099e-06, |
| "loss": 0.9147, |
| "mean_token_accuracy": 0.7522725224494934, |
| "num_tokens": 4993585.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 1.3483319997787475, |
| "epoch": 0.3500846979107849, |
| "grad_norm": 108.66871643066406, |
| "learning_rate": 4.988707794018351e-06, |
| "loss": 0.9596, |
| "mean_token_accuracy": 0.7457158684730529, |
| "num_tokens": 5034279.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.4173261642456054, |
| "epoch": 0.35290796160361376, |
| "grad_norm": 110.11769104003906, |
| "learning_rate": 4.988524817704784e-06, |
| "loss": 1.0058, |
| "mean_token_accuracy": 0.7339034914970398, |
| "num_tokens": 5074824.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 1.3969521045684814, |
| "epoch": 0.3557312252964427, |
| "grad_norm": 111.39070129394531, |
| "learning_rate": 4.988340375378344e-06, |
| "loss": 0.9766, |
| "mean_token_accuracy": 0.7456167697906494, |
| "num_tokens": 5115452.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.3778595924377441, |
| "epoch": 0.3585544889892716, |
| "grad_norm": 130.38401794433594, |
| "learning_rate": 4.988154467184129e-06, |
| "loss": 0.9717, |
| "mean_token_accuracy": 0.7431756496429444, |
| "num_tokens": 5156122.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 1.2438819527626037, |
| "epoch": 0.3613777526821005, |
| "grad_norm": 91.94094848632812, |
| "learning_rate": 4.98796709326839e-06, |
| "loss": 0.8557, |
| "mean_token_accuracy": 0.7728610277175904, |
| "num_tokens": 5196962.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.2364247560501098, |
| "epoch": 0.3642010163749294, |
| "grad_norm": 112.2281494140625, |
| "learning_rate": 4.987778253778532e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7657705903053283, |
| "num_tokens": 5237644.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 1.3769032955169678, |
| "epoch": 0.36702428006775834, |
| "grad_norm": 106.69970703125, |
| "learning_rate": 4.987587948863113e-06, |
| "loss": 0.9205, |
| "mean_token_accuracy": 0.7561796069145202, |
| "num_tokens": 5278368.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.3944639682769775, |
| "epoch": 0.36984754376058726, |
| "grad_norm": 104.28118133544922, |
| "learning_rate": 4.987396178671845e-06, |
| "loss": 0.9843, |
| "mean_token_accuracy": 0.7391315698623657, |
| "num_tokens": 5318926.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 1.3362845420837401, |
| "epoch": 0.37267080745341613, |
| "grad_norm": 106.64762115478516, |
| "learning_rate": 4.987202943355588e-06, |
| "loss": 0.9393, |
| "mean_token_accuracy": 0.7537869215011597, |
| "num_tokens": 5359676.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.427641224861145, |
| "epoch": 0.37549407114624506, |
| "grad_norm": 114.0298080444336, |
| "learning_rate": 4.987008243066362e-06, |
| "loss": 1.0263, |
| "mean_token_accuracy": 0.7314809441566468, |
| "num_tokens": 5400488.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 1.2906121969223023, |
| "epoch": 0.378317334839074, |
| "grad_norm": 121.60186004638672, |
| "learning_rate": 4.986812077957333e-06, |
| "loss": 0.8884, |
| "mean_token_accuracy": 0.761104142665863, |
| "num_tokens": 5440967.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.322801113128662, |
| "epoch": 0.38114059853190285, |
| "grad_norm": 100.61329650878906, |
| "learning_rate": 4.986614448182821e-06, |
| "loss": 0.9344, |
| "mean_token_accuracy": 0.7525824546813965, |
| "num_tokens": 5481714.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 1.2744395971298217, |
| "epoch": 0.3839638622247318, |
| "grad_norm": 114.0245132446289, |
| "learning_rate": 4.986415353898301e-06, |
| "loss": 0.9013, |
| "mean_token_accuracy": 0.7586809635162354, |
| "num_tokens": 5522307.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.406579351425171, |
| "epoch": 0.3867871259175607, |
| "grad_norm": 110.2708740234375, |
| "learning_rate": 4.986214795260398e-06, |
| "loss": 0.9857, |
| "mean_token_accuracy": 0.7406947612762451, |
| "num_tokens": 5563030.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 1.4012676239013673, |
| "epoch": 0.38961038961038963, |
| "grad_norm": 107.38058471679688, |
| "learning_rate": 4.986012772426887e-06, |
| "loss": 0.9898, |
| "mean_token_accuracy": 0.7373950242996216, |
| "num_tokens": 5603900.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.276191735267639, |
| "epoch": 0.3924336533032185, |
| "grad_norm": 102.94742584228516, |
| "learning_rate": 4.985809285556698e-06, |
| "loss": 0.905, |
| "mean_token_accuracy": 0.7557743310928344, |
| "num_tokens": 5644556.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 1.3815409660339355, |
| "epoch": 0.3952569169960474, |
| "grad_norm": 132.46441650390625, |
| "learning_rate": 4.9856043348099134e-06, |
| "loss": 0.945, |
| "mean_token_accuracy": 0.7497841715812683, |
| "num_tokens": 5685234.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.2497349023818969, |
| "epoch": 0.39808018068887635, |
| "grad_norm": 105.15983581542969, |
| "learning_rate": 4.9853979203477644e-06, |
| "loss": 0.8568, |
| "mean_token_accuracy": 0.7651753425598145, |
| "num_tokens": 5725995.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 1.2450405836105347, |
| "epoch": 0.4009034443817053, |
| "grad_norm": 101.96587371826172, |
| "learning_rate": 4.9851900423326335e-06, |
| "loss": 0.868, |
| "mean_token_accuracy": 0.7652771830558777, |
| "num_tokens": 5766777.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.3319795370101928, |
| "epoch": 0.40372670807453415, |
| "grad_norm": 129.9007110595703, |
| "learning_rate": 4.984980700928057e-06, |
| "loss": 0.9547, |
| "mean_token_accuracy": 0.7460948824882507, |
| "num_tokens": 5807316.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 1.266514039039612, |
| "epoch": 0.40654997176736307, |
| "grad_norm": 97.09778594970703, |
| "learning_rate": 4.9847698962987224e-06, |
| "loss": 0.8828, |
| "mean_token_accuracy": 0.7616210579872131, |
| "num_tokens": 5848130.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.2651296854019165, |
| "epoch": 0.409373235460192, |
| "grad_norm": 119.74391174316406, |
| "learning_rate": 4.984557628610465e-06, |
| "loss": 0.912, |
| "mean_token_accuracy": 0.7551288604736328, |
| "num_tokens": 5888922.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 1.2874022722244263, |
| "epoch": 0.41219649915302087, |
| "grad_norm": 110.32769012451172, |
| "learning_rate": 4.984343898030275e-06, |
| "loss": 0.887, |
| "mean_token_accuracy": 0.7651336789131165, |
| "num_tokens": 5929586.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.181906795501709, |
| "epoch": 0.4150197628458498, |
| "grad_norm": 102.93072509765625, |
| "learning_rate": 4.98412870472629e-06, |
| "loss": 0.8324, |
| "mean_token_accuracy": 0.7739112615585327, |
| "num_tokens": 5970120.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 1.2475993156433105, |
| "epoch": 0.4178430265386787, |
| "grad_norm": 96.60658264160156, |
| "learning_rate": 4.9839120488678025e-06, |
| "loss": 0.8659, |
| "mean_token_accuracy": 0.7675553560256958, |
| "num_tokens": 6010881.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.277070164680481, |
| "epoch": 0.42066629023150764, |
| "grad_norm": 96.34147644042969, |
| "learning_rate": 4.983693930625251e-06, |
| "loss": 0.8726, |
| "mean_token_accuracy": 0.7630559086799622, |
| "num_tokens": 6051720.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 1.2760138750076293, |
| "epoch": 0.4234895539243365, |
| "grad_norm": 100.36460876464844, |
| "learning_rate": 4.983474350170227e-06, |
| "loss": 0.8673, |
| "mean_token_accuracy": 0.7626476407051086, |
| "num_tokens": 6092424.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.2910543203353881, |
| "epoch": 0.42631281761716544, |
| "grad_norm": 105.85248565673828, |
| "learning_rate": 4.983253307675473e-06, |
| "loss": 0.8937, |
| "mean_token_accuracy": 0.7598207592964172, |
| "num_tokens": 6133094.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 1.4057719707489014, |
| "epoch": 0.42913608130999437, |
| "grad_norm": 107.28651428222656, |
| "learning_rate": 4.983030803314878e-06, |
| "loss": 0.9538, |
| "mean_token_accuracy": 0.7475337266921998, |
| "num_tokens": 6173931.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.2181792259216309, |
| "epoch": 0.43195934500282324, |
| "grad_norm": 106.51260375976562, |
| "learning_rate": 4.982806837263486e-06, |
| "loss": 0.8524, |
| "mean_token_accuracy": 0.7689022898674012, |
| "num_tokens": 6214629.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 1.2283953189849854, |
| "epoch": 0.43478260869565216, |
| "grad_norm": 100.91169738769531, |
| "learning_rate": 4.982581409697487e-06, |
| "loss": 0.8259, |
| "mean_token_accuracy": 0.7750494837760925, |
| "num_tokens": 6255482.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.4342861652374268, |
| "epoch": 0.4376058723884811, |
| "grad_norm": 112.93717956542969, |
| "learning_rate": 4.982354520794224e-06, |
| "loss": 0.9777, |
| "mean_token_accuracy": 0.7420659542083741, |
| "num_tokens": 6296007.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 1.2862244129180909, |
| "epoch": 0.44042913608131, |
| "grad_norm": 123.29638671875, |
| "learning_rate": 4.982126170732185e-06, |
| "loss": 0.8873, |
| "mean_token_accuracy": 0.7592768907546997, |
| "num_tokens": 6336419.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.31422700881958, |
| "epoch": 0.4432523997741389, |
| "grad_norm": 98.10787963867188, |
| "learning_rate": 4.981896359691013e-06, |
| "loss": 0.9252, |
| "mean_token_accuracy": 0.7531617879867554, |
| "num_tokens": 6377083.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 1.2349420070648194, |
| "epoch": 0.4460756634669678, |
| "grad_norm": 104.18644714355469, |
| "learning_rate": 4.981665087851495e-06, |
| "loss": 0.8817, |
| "mean_token_accuracy": 0.7612318158149719, |
| "num_tokens": 6417661.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.2322286128997804, |
| "epoch": 0.44889892715979673, |
| "grad_norm": 102.12158966064453, |
| "learning_rate": 4.981432355395572e-06, |
| "loss": 0.921, |
| "mean_token_accuracy": 0.7553870797157287, |
| "num_tokens": 6458267.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 1.353945779800415, |
| "epoch": 0.45172219085262566, |
| "grad_norm": 94.95413970947266, |
| "learning_rate": 4.9811981625063315e-06, |
| "loss": 0.9211, |
| "mean_token_accuracy": 0.7573221802711487, |
| "num_tokens": 6498460.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.3203566551208497, |
| "epoch": 0.45454545454545453, |
| "grad_norm": 87.52528381347656, |
| "learning_rate": 4.980962509368009e-06, |
| "loss": 0.8826, |
| "mean_token_accuracy": 0.7646165013313293, |
| "num_tokens": 6539065.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 1.2447147130966187, |
| "epoch": 0.45736871823828346, |
| "grad_norm": 85.17706298828125, |
| "learning_rate": 4.980725396165992e-06, |
| "loss": 0.8552, |
| "mean_token_accuracy": 0.7722663283348083, |
| "num_tokens": 6579730.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.276610255241394, |
| "epoch": 0.4601919819311124, |
| "grad_norm": 102.33468627929688, |
| "learning_rate": 4.980486823086813e-06, |
| "loss": 0.8926, |
| "mean_token_accuracy": 0.7591757655143738, |
| "num_tokens": 6620403.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 1.2437695741653443, |
| "epoch": 0.46301524562394125, |
| "grad_norm": 99.84664154052734, |
| "learning_rate": 4.980246790318156e-06, |
| "loss": 0.8763, |
| "mean_token_accuracy": 0.7644282937049866, |
| "num_tokens": 6661163.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.3313609600067138, |
| "epoch": 0.4658385093167702, |
| "grad_norm": 97.01880645751953, |
| "learning_rate": 4.98000529804885e-06, |
| "loss": 0.8914, |
| "mean_token_accuracy": 0.7625803470611572, |
| "num_tokens": 6701784.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 1.3444613218307495, |
| "epoch": 0.4686617730095991, |
| "grad_norm": 99.8399429321289, |
| "learning_rate": 4.979762346468876e-06, |
| "loss": 0.9474, |
| "mean_token_accuracy": 0.7516453862190247, |
| "num_tokens": 6742391.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.2741617679595947, |
| "epoch": 0.47148503670242803, |
| "grad_norm": 89.9829330444336, |
| "learning_rate": 4.979517935769359e-06, |
| "loss": 0.8707, |
| "mean_token_accuracy": 0.7648941993713378, |
| "num_tokens": 6783130.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 1.3345114469528199, |
| "epoch": 0.4743083003952569, |
| "grad_norm": 112.01448822021484, |
| "learning_rate": 4.979272066142576e-06, |
| "loss": 0.9033, |
| "mean_token_accuracy": 0.7620006084442139, |
| "num_tokens": 6823752.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.2797606706619262, |
| "epoch": 0.4771315640880858, |
| "grad_norm": 105.47362518310547, |
| "learning_rate": 4.97902473778195e-06, |
| "loss": 0.8948, |
| "mean_token_accuracy": 0.7626261591911316, |
| "num_tokens": 6864415.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 1.2193082809448241, |
| "epoch": 0.47995482778091475, |
| "grad_norm": 125.71626281738281, |
| "learning_rate": 4.978775950882049e-06, |
| "loss": 0.8311, |
| "mean_token_accuracy": 0.7712972402572632, |
| "num_tokens": 6905160.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.1931323289871216, |
| "epoch": 0.4827780914737436, |
| "grad_norm": 109.39054107666016, |
| "learning_rate": 4.978525705638593e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.7686418652534485, |
| "num_tokens": 6945864.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 1.217218804359436, |
| "epoch": 0.48560135516657255, |
| "grad_norm": 93.509521484375, |
| "learning_rate": 4.9782740022484455e-06, |
| "loss": 0.9275, |
| "mean_token_accuracy": 0.7530880331993103, |
| "num_tokens": 6986552.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.3541555404663086, |
| "epoch": 0.48842461885940147, |
| "grad_norm": 112.28630065917969, |
| "learning_rate": 4.978020840909619e-06, |
| "loss": 0.9492, |
| "mean_token_accuracy": 0.7504236459732055, |
| "num_tokens": 7026900.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 1.150540018081665, |
| "epoch": 0.4912478825522304, |
| "grad_norm": 91.7456283569336, |
| "learning_rate": 4.977766221821272e-06, |
| "loss": 0.8155, |
| "mean_token_accuracy": 0.779057776927948, |
| "num_tokens": 7067710.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.1883390903472901, |
| "epoch": 0.49407114624505927, |
| "grad_norm": 90.35485076904297, |
| "learning_rate": 4.977510145183713e-06, |
| "loss": 0.8517, |
| "mean_token_accuracy": 0.7698142886161804, |
| "num_tokens": 7108524.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 1.2226394414901733, |
| "epoch": 0.4968944099378882, |
| "grad_norm": 94.40557861328125, |
| "learning_rate": 4.97725261119839e-06, |
| "loss": 0.8542, |
| "mean_token_accuracy": 0.7724049091339111, |
| "num_tokens": 7149141.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.2256523609161376, |
| "epoch": 0.4997176736307171, |
| "grad_norm": 118.25550842285156, |
| "learning_rate": 4.976993620067906e-06, |
| "loss": 0.9467, |
| "mean_token_accuracy": 0.747167456150055, |
| "num_tokens": 7189872.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 1.3428989887237548, |
| "epoch": 0.502540937323546, |
| "grad_norm": 96.30892181396484, |
| "learning_rate": 4.9767331719960035e-06, |
| "loss": 0.9079, |
| "mean_token_accuracy": 0.7582340478897095, |
| "num_tokens": 7230448.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.184017586708069, |
| "epoch": 0.5053642010163749, |
| "grad_norm": 97.78132629394531, |
| "learning_rate": 4.976471267187575e-06, |
| "loss": 0.8507, |
| "mean_token_accuracy": 0.7695409536361695, |
| "num_tokens": 7271071.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 1.371737289428711, |
| "epoch": 0.5081874647092038, |
| "grad_norm": 102.5118408203125, |
| "learning_rate": 4.976207905848659e-06, |
| "loss": 0.9116, |
| "mean_token_accuracy": 0.7551624774932861, |
| "num_tokens": 7311722.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.23578884601593, |
| "epoch": 0.5110107284020328, |
| "grad_norm": 91.31993103027344, |
| "learning_rate": 4.975943088186437e-06, |
| "loss": 0.8693, |
| "mean_token_accuracy": 0.7666425943374634, |
| "num_tokens": 7352436.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 1.1599351406097411, |
| "epoch": 0.5138339920948617, |
| "grad_norm": 91.41136932373047, |
| "learning_rate": 4.9756768144092385e-06, |
| "loss": 0.8766, |
| "mean_token_accuracy": 0.7643981575965881, |
| "num_tokens": 7392980.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.2577017545700073, |
| "epoch": 0.5166572557876906, |
| "grad_norm": 108.91261291503906, |
| "learning_rate": 4.975409084726538e-06, |
| "loss": 0.8776, |
| "mean_token_accuracy": 0.7656487226486206, |
| "num_tokens": 7433717.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 1.1488389253616333, |
| "epoch": 0.5194805194805194, |
| "grad_norm": 90.68296813964844, |
| "learning_rate": 4.975139899348954e-06, |
| "loss": 0.8093, |
| "mean_token_accuracy": 0.7807123541831971, |
| "num_tokens": 7474433.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.1915344953536988, |
| "epoch": 0.5223037831733484, |
| "grad_norm": 96.44047546386719, |
| "learning_rate": 4.974869258488254e-06, |
| "loss": 0.8236, |
| "mean_token_accuracy": 0.7743138551712037, |
| "num_tokens": 7515077.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 1.250411081314087, |
| "epoch": 0.5251270468661773, |
| "grad_norm": 96.81196594238281, |
| "learning_rate": 4.9745971623573465e-06, |
| "loss": 0.8852, |
| "mean_token_accuracy": 0.7619670391082763, |
| "num_tokens": 7555778.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.1861608266830443, |
| "epoch": 0.5279503105590062, |
| "grad_norm": 88.66297912597656, |
| "learning_rate": 4.974323611170286e-06, |
| "loss": 0.8606, |
| "mean_token_accuracy": 0.7666665554046631, |
| "num_tokens": 7596538.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 1.1959111571311951, |
| "epoch": 0.5307735742518351, |
| "grad_norm": 104.4039535522461, |
| "learning_rate": 4.974048605142273e-06, |
| "loss": 0.8686, |
| "mean_token_accuracy": 0.7655381083488464, |
| "num_tokens": 7637030.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.1513007164001465, |
| "epoch": 0.5335968379446641, |
| "grad_norm": 102.5768051147461, |
| "learning_rate": 4.9737721444896506e-06, |
| "loss": 0.8039, |
| "mean_token_accuracy": 0.7838160276412964, |
| "num_tokens": 7677640.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 1.3042636871337892, |
| "epoch": 0.536420101637493, |
| "grad_norm": 78.14281463623047, |
| "learning_rate": 4.973494229429907e-06, |
| "loss": 0.9158, |
| "mean_token_accuracy": 0.7570660233497619, |
| "num_tokens": 7718349.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.266080904006958, |
| "epoch": 0.5392433653303218, |
| "grad_norm": 110.9036865234375, |
| "learning_rate": 4.973214860181675e-06, |
| "loss": 0.93, |
| "mean_token_accuracy": 0.7506848573684692, |
| "num_tokens": 7759004.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 1.1759474754333497, |
| "epoch": 0.5420666290231507, |
| "grad_norm": 108.20323944091797, |
| "learning_rate": 4.972934036964732e-06, |
| "loss": 0.7997, |
| "mean_token_accuracy": 0.7808601498603821, |
| "num_tokens": 7799425.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.2482519030570984, |
| "epoch": 0.5448898927159797, |
| "grad_norm": 100.69843292236328, |
| "learning_rate": 4.972651759999997e-06, |
| "loss": 0.8741, |
| "mean_token_accuracy": 0.7640202879905701, |
| "num_tokens": 7840200.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 1.2493391752243042, |
| "epoch": 0.5477131564088086, |
| "grad_norm": 111.9268798828125, |
| "learning_rate": 4.9723680295095335e-06, |
| "loss": 0.8481, |
| "mean_token_accuracy": 0.7669167637825012, |
| "num_tokens": 7880885.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.213208556175232, |
| "epoch": 0.5505364201016375, |
| "grad_norm": 89.9885482788086, |
| "learning_rate": 4.972082845716551e-06, |
| "loss": 0.8463, |
| "mean_token_accuracy": 0.7700656652450562, |
| "num_tokens": 7921579.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 1.2416555166244507, |
| "epoch": 0.5533596837944664, |
| "grad_norm": 94.0711669921875, |
| "learning_rate": 4.971796208845398e-06, |
| "loss": 0.8986, |
| "mean_token_accuracy": 0.759855318069458, |
| "num_tokens": 7961884.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 1.3858731746673585, |
| "epoch": 0.5561829474872954, |
| "grad_norm": 89.50563049316406, |
| "learning_rate": 4.9715081191215705e-06, |
| "loss": 0.9528, |
| "mean_token_accuracy": 0.7506495118141174, |
| "num_tokens": 8002593.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 1.2900074481964112, |
| "epoch": 0.5590062111801242, |
| "grad_norm": 98.44612884521484, |
| "learning_rate": 4.971218576771703e-06, |
| "loss": 0.9108, |
| "mean_token_accuracy": 0.7587322473526001, |
| "num_tokens": 8043253.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 1.2571572303771972, |
| "epoch": 0.5618294748729531, |
| "grad_norm": 105.00495147705078, |
| "learning_rate": 4.970927582023577e-06, |
| "loss": 0.9269, |
| "mean_token_accuracy": 0.7483970880508423, |
| "num_tokens": 8083612.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 1.245552134513855, |
| "epoch": 0.564652738565782, |
| "grad_norm": 98.6438980102539, |
| "learning_rate": 4.970635135106113e-06, |
| "loss": 0.8576, |
| "mean_token_accuracy": 0.7674945831298828, |
| "num_tokens": 8124241.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.564652738565782, |
| "eval_entropy": 1.337150764465332, |
| "eval_loss": 0.7948279976844788, |
| "eval_mean_token_accuracy": 0.7877452731132507, |
| "eval_num_tokens": 8124241.0, |
| "eval_runtime": 2.455, |
| "eval_samples_per_second": 15.886, |
| "eval_steps_per_second": 2.037, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.1067086219787599, |
| "epoch": 0.567476002258611, |
| "grad_norm": 106.83331298828125, |
| "learning_rate": 4.970341236249376e-06, |
| "loss": 0.7424, |
| "mean_token_accuracy": 0.7967899441719055, |
| "num_tokens": 8165061.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 1.2741168975830077, |
| "epoch": 0.5702992659514399, |
| "grad_norm": 98.59375762939453, |
| "learning_rate": 4.970045885684575e-06, |
| "loss": 0.8614, |
| "mean_token_accuracy": 0.7667156815528869, |
| "num_tokens": 8205689.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.3391781091690063, |
| "epoch": 0.5731225296442688, |
| "grad_norm": 99.1837387084961, |
| "learning_rate": 4.969749083644055e-06, |
| "loss": 0.9235, |
| "mean_token_accuracy": 0.7538702368736268, |
| "num_tokens": 8246286.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 1.2175718069076538, |
| "epoch": 0.5759457933370977, |
| "grad_norm": 97.83293151855469, |
| "learning_rate": 4.969450830361309e-06, |
| "loss": 0.8251, |
| "mean_token_accuracy": 0.7760945677757263, |
| "num_tokens": 8286897.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 1.2220812559127807, |
| "epoch": 0.5787690570299266, |
| "grad_norm": 84.45748901367188, |
| "learning_rate": 4.969151126070968e-06, |
| "loss": 0.8428, |
| "mean_token_accuracy": 0.7701049447059631, |
| "num_tokens": 8327581.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 1.2730301141738891, |
| "epoch": 0.5815923207227555, |
| "grad_norm": 105.70362091064453, |
| "learning_rate": 4.968849971008808e-06, |
| "loss": 0.8441, |
| "mean_token_accuracy": 0.771563708782196, |
| "num_tokens": 8368186.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 1.2853108644485474, |
| "epoch": 0.5844155844155844, |
| "grad_norm": 84.74578857421875, |
| "learning_rate": 4.968547365411742e-06, |
| "loss": 0.8923, |
| "mean_token_accuracy": 0.7610676050186157, |
| "num_tokens": 8408672.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 1.112477993965149, |
| "epoch": 0.5872388481084133, |
| "grad_norm": 90.72158813476562, |
| "learning_rate": 4.968243309517826e-06, |
| "loss": 0.7923, |
| "mean_token_accuracy": 0.7860908746719361, |
| "num_tokens": 8449222.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.3268202066421508, |
| "epoch": 0.5900621118012422, |
| "grad_norm": 86.42573547363281, |
| "learning_rate": 4.967937803566259e-06, |
| "loss": 0.8718, |
| "mean_token_accuracy": 0.7626396536827087, |
| "num_tokens": 8489553.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 1.240961480140686, |
| "epoch": 0.5928853754940712, |
| "grad_norm": 87.08723449707031, |
| "learning_rate": 4.967630847797378e-06, |
| "loss": 0.8409, |
| "mean_token_accuracy": 0.7718519926071167, |
| "num_tokens": 8530434.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 1.1958368062973022, |
| "epoch": 0.5957086391869001, |
| "grad_norm": 97.55753326416016, |
| "learning_rate": 4.967322442452661e-06, |
| "loss": 0.8397, |
| "mean_token_accuracy": 0.7687074780464173, |
| "num_tokens": 8571028.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 1.2576555967330934, |
| "epoch": 0.598531902879729, |
| "grad_norm": 97.74880981445312, |
| "learning_rate": 4.967012587774729e-06, |
| "loss": 0.8667, |
| "mean_token_accuracy": 0.7679498672485352, |
| "num_tokens": 8611332.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 1.1161712408065796, |
| "epoch": 0.6013551665725578, |
| "grad_norm": 84.0584487915039, |
| "learning_rate": 4.966701284007337e-06, |
| "loss": 0.811, |
| "mean_token_accuracy": 0.7786412715911866, |
| "num_tokens": 8651862.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 1.3346630334854126, |
| "epoch": 0.6041784302653868, |
| "grad_norm": 88.71247863769531, |
| "learning_rate": 4.966388531395388e-06, |
| "loss": 0.869, |
| "mean_token_accuracy": 0.768392026424408, |
| "num_tokens": 8692361.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.195302712917328, |
| "epoch": 0.6070016939582157, |
| "grad_norm": 80.38240051269531, |
| "learning_rate": 4.966074330184917e-06, |
| "loss": 0.7627, |
| "mean_token_accuracy": 0.790677547454834, |
| "num_tokens": 8733114.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 1.3030426979064942, |
| "epoch": 0.6098249576510446, |
| "grad_norm": 94.00823974609375, |
| "learning_rate": 4.965758680623106e-06, |
| "loss": 0.9205, |
| "mean_token_accuracy": 0.7608320355415344, |
| "num_tokens": 8773273.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.2931268215179443, |
| "epoch": 0.6126482213438735, |
| "grad_norm": 107.884033203125, |
| "learning_rate": 4.9654415829582714e-06, |
| "loss": 0.8814, |
| "mean_token_accuracy": 0.7640480279922486, |
| "num_tokens": 8813046.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 1.2563365936279296, |
| "epoch": 0.6154714850367025, |
| "grad_norm": 105.78944396972656, |
| "learning_rate": 4.965123037439869e-06, |
| "loss": 0.8697, |
| "mean_token_accuracy": 0.7670489430427552, |
| "num_tokens": 8853962.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.134628915786743, |
| "epoch": 0.6182947487295314, |
| "grad_norm": 99.70325469970703, |
| "learning_rate": 4.964803044318496e-06, |
| "loss": 0.8172, |
| "mean_token_accuracy": 0.7771477937698364, |
| "num_tokens": 8894573.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 1.2724319458007813, |
| "epoch": 0.6211180124223602, |
| "grad_norm": 93.54045104980469, |
| "learning_rate": 4.964481603845887e-06, |
| "loss": 0.8051, |
| "mean_token_accuracy": 0.7794610857963562, |
| "num_tokens": 8935263.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.3056188106536866, |
| "epoch": 0.6239412761151891, |
| "grad_norm": 107.5456771850586, |
| "learning_rate": 4.964158716274915e-06, |
| "loss": 0.9563, |
| "mean_token_accuracy": 0.7459239959716797, |
| "num_tokens": 8975867.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 1.379978322982788, |
| "epoch": 0.626764539808018, |
| "grad_norm": 95.43455505371094, |
| "learning_rate": 4.963834381859593e-06, |
| "loss": 0.9058, |
| "mean_token_accuracy": 0.7583242535591126, |
| "num_tokens": 9016727.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 1.316626739501953, |
| "epoch": 0.629587803500847, |
| "grad_norm": 94.45218658447266, |
| "learning_rate": 4.9635086008550694e-06, |
| "loss": 0.888, |
| "mean_token_accuracy": 0.7589772820472718, |
| "num_tokens": 9057474.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 1.2168570041656495, |
| "epoch": 0.6324110671936759, |
| "grad_norm": 90.01762390136719, |
| "learning_rate": 4.963181373517634e-06, |
| "loss": 0.8126, |
| "mean_token_accuracy": 0.7784791946411133, |
| "num_tokens": 9098180.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 1.3705679416656493, |
| "epoch": 0.6352343308865048, |
| "grad_norm": 89.59708404541016, |
| "learning_rate": 4.9628527001047105e-06, |
| "loss": 0.8904, |
| "mean_token_accuracy": 0.7621181845664978, |
| "num_tokens": 9138506.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 1.2591190934181213, |
| "epoch": 0.6380575945793338, |
| "grad_norm": 100.79552459716797, |
| "learning_rate": 4.9625225808748636e-06, |
| "loss": 0.8763, |
| "mean_token_accuracy": 0.7636958003044129, |
| "num_tokens": 9179140.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 1.2524430751800537, |
| "epoch": 0.6408808582721626, |
| "grad_norm": 83.18380737304688, |
| "learning_rate": 4.962191016087795e-06, |
| "loss": 0.8607, |
| "mean_token_accuracy": 0.7672532081604004, |
| "num_tokens": 9219760.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 1.302207589149475, |
| "epoch": 0.6437041219649915, |
| "grad_norm": 84.2943344116211, |
| "learning_rate": 4.961858006004342e-06, |
| "loss": 0.9161, |
| "mean_token_accuracy": 0.7545537352561951, |
| "num_tokens": 9259918.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 1.2336005210876464, |
| "epoch": 0.6465273856578204, |
| "grad_norm": 103.37287139892578, |
| "learning_rate": 4.961523550886479e-06, |
| "loss": 0.8367, |
| "mean_token_accuracy": 0.7728558421134949, |
| "num_tokens": 9300420.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 1.3373284339904785, |
| "epoch": 0.6493506493506493, |
| "grad_norm": 80.52223205566406, |
| "learning_rate": 4.9611876509973185e-06, |
| "loss": 0.8708, |
| "mean_token_accuracy": 0.7663377165794373, |
| "num_tokens": 9341081.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 1.1970614194869995, |
| "epoch": 0.6521739130434783, |
| "grad_norm": 93.88483428955078, |
| "learning_rate": 4.96085030660111e-06, |
| "loss": 0.7878, |
| "mean_token_accuracy": 0.7849771738052368, |
| "num_tokens": 9381651.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 1.1667418718338012, |
| "epoch": 0.6549971767363072, |
| "grad_norm": 94.35794830322266, |
| "learning_rate": 4.960511517963236e-06, |
| "loss": 0.7963, |
| "mean_token_accuracy": 0.7851962924003602, |
| "num_tokens": 9422500.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 1.244291114807129, |
| "epoch": 0.6578204404291361, |
| "grad_norm": 89.03894805908203, |
| "learning_rate": 4.96017128535022e-06, |
| "loss": 0.8179, |
| "mean_token_accuracy": 0.779565978050232, |
| "num_tokens": 9463027.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 1.2647228240966797, |
| "epoch": 0.6606437041219649, |
| "grad_norm": 91.51069641113281, |
| "learning_rate": 4.959829609029717e-06, |
| "loss": 0.8038, |
| "mean_token_accuracy": 0.7766362309455872, |
| "num_tokens": 9503101.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 1.3284830093383788, |
| "epoch": 0.6634669678147939, |
| "grad_norm": 94.90776062011719, |
| "learning_rate": 4.9594864892705204e-06, |
| "loss": 0.8927, |
| "mean_token_accuracy": 0.7618573546409607, |
| "num_tokens": 9543661.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 1.3220321416854859, |
| "epoch": 0.6662902315076228, |
| "grad_norm": 111.29701232910156, |
| "learning_rate": 4.959141926342559e-06, |
| "loss": 0.9056, |
| "mean_token_accuracy": 0.7612986207008362, |
| "num_tokens": 9584316.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 1.3569005489349366, |
| "epoch": 0.6691134952004517, |
| "grad_norm": 99.6978530883789, |
| "learning_rate": 4.958795920516895e-06, |
| "loss": 0.9268, |
| "mean_token_accuracy": 0.7549396872520446, |
| "num_tokens": 9624978.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 1.266989517211914, |
| "epoch": 0.6719367588932806, |
| "grad_norm": 84.81405639648438, |
| "learning_rate": 4.958448472065729e-06, |
| "loss": 0.8148, |
| "mean_token_accuracy": 0.7786801934242249, |
| "num_tokens": 9665779.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 1.2129095792770386, |
| "epoch": 0.6747600225861096, |
| "grad_norm": 97.6620864868164, |
| "learning_rate": 4.958099581262393e-06, |
| "loss": 0.8645, |
| "mean_token_accuracy": 0.7641335129737854, |
| "num_tokens": 9705918.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 1.3208855628967284, |
| "epoch": 0.6775832862789385, |
| "grad_norm": 84.83293914794922, |
| "learning_rate": 4.957749248381356e-06, |
| "loss": 0.8521, |
| "mean_token_accuracy": 0.7699254035949707, |
| "num_tokens": 9746614.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 1.2614996910095215, |
| "epoch": 0.6804065499717674, |
| "grad_norm": 90.78227996826172, |
| "learning_rate": 4.957397473698221e-06, |
| "loss": 0.8468, |
| "mean_token_accuracy": 0.7698518514633179, |
| "num_tokens": 9787436.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 1.1796211361885072, |
| "epoch": 0.6832298136645962, |
| "grad_norm": 89.12525177001953, |
| "learning_rate": 4.957044257489724e-06, |
| "loss": 0.7778, |
| "mean_token_accuracy": 0.7862819194793701, |
| "num_tokens": 9828249.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 1.2314157962799073, |
| "epoch": 0.6860530773574252, |
| "grad_norm": 83.9203109741211, |
| "learning_rate": 4.956689600033736e-06, |
| "loss": 0.8364, |
| "mean_token_accuracy": 0.7717383742332459, |
| "num_tokens": 9868382.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 1.3469780206680297, |
| "epoch": 0.6888763410502541, |
| "grad_norm": 94.05850982666016, |
| "learning_rate": 4.956333501609263e-06, |
| "loss": 0.8967, |
| "mean_token_accuracy": 0.7598065853118896, |
| "num_tokens": 9909206.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 1.4139271020889281, |
| "epoch": 0.691699604743083, |
| "grad_norm": 100.73336791992188, |
| "learning_rate": 4.955975962496443e-06, |
| "loss": 0.9323, |
| "mean_token_accuracy": 0.7512782216072083, |
| "num_tokens": 9949369.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 1.4305032253265382, |
| "epoch": 0.6945228684359119, |
| "grad_norm": 105.20040130615234, |
| "learning_rate": 4.955616982976546e-06, |
| "loss": 0.8752, |
| "mean_token_accuracy": 0.7681516885757447, |
| "num_tokens": 9989740.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 1.3764580249786378, |
| "epoch": 0.6973461321287409, |
| "grad_norm": 90.64266967773438, |
| "learning_rate": 4.95525656333198e-06, |
| "loss": 0.906, |
| "mean_token_accuracy": 0.7587046384811401, |
| "num_tokens": 10030301.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 1.3211364984512328, |
| "epoch": 0.7001693958215698, |
| "grad_norm": 80.13589477539062, |
| "learning_rate": 4.954894703846281e-06, |
| "loss": 0.8725, |
| "mean_token_accuracy": 0.7638087749481202, |
| "num_tokens": 10071097.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 1.285390853881836, |
| "epoch": 0.7029926595143986, |
| "grad_norm": 81.82262420654297, |
| "learning_rate": 4.95453140480412e-06, |
| "loss": 0.8349, |
| "mean_token_accuracy": 0.7691360473632812, |
| "num_tokens": 10111646.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 1.2638070821762084, |
| "epoch": 0.7058159232072275, |
| "grad_norm": 80.83232879638672, |
| "learning_rate": 4.954166666491299e-06, |
| "loss": 0.8681, |
| "mean_token_accuracy": 0.766269302368164, |
| "num_tokens": 10152363.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 1.2698018312454225, |
| "epoch": 0.7086391869000565, |
| "grad_norm": 92.83601379394531, |
| "learning_rate": 4.953800489194755e-06, |
| "loss": 0.864, |
| "mean_token_accuracy": 0.7671181201934815, |
| "num_tokens": 10192739.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 1.1670500040054321, |
| "epoch": 0.7114624505928854, |
| "grad_norm": 92.00151824951172, |
| "learning_rate": 4.953432873202555e-06, |
| "loss": 0.8179, |
| "mean_token_accuracy": 0.7748780965805053, |
| "num_tokens": 10232940.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 1.1253260135650636, |
| "epoch": 0.7142857142857143, |
| "grad_norm": 75.87653350830078, |
| "learning_rate": 4.953063818803897e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.8046749353408813, |
| "num_tokens": 10273648.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 1.1963707447052, |
| "epoch": 0.7171089779785432, |
| "grad_norm": 89.2532958984375, |
| "learning_rate": 4.952693326289112e-06, |
| "loss": 0.8017, |
| "mean_token_accuracy": 0.7819836020469666, |
| "num_tokens": 10314317.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 1.2884133338928223, |
| "epoch": 0.7199322416713722, |
| "grad_norm": 94.43043518066406, |
| "learning_rate": 4.9523213959496635e-06, |
| "loss": 0.8479, |
| "mean_token_accuracy": 0.7720703601837158, |
| "num_tokens": 10355042.0, |
| "step": 1275 |
| }, |
| { |
| "entropy": 1.2198226928710938, |
| "epoch": 0.722755505364201, |
| "grad_norm": 76.82041931152344, |
| "learning_rate": 4.951948028078143e-06, |
| "loss": 0.8174, |
| "mean_token_accuracy": 0.7741351842880249, |
| "num_tokens": 10395782.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 1.3100014686584474, |
| "epoch": 0.7255787690570299, |
| "grad_norm": 84.25589752197266, |
| "learning_rate": 4.951573222968275e-06, |
| "loss": 0.8215, |
| "mean_token_accuracy": 0.7780505061149597, |
| "num_tokens": 10436544.0, |
| "step": 1285 |
| }, |
| { |
| "entropy": 1.349497175216675, |
| "epoch": 0.7284020327498588, |
| "grad_norm": 93.43528747558594, |
| "learning_rate": 4.951196980914915e-06, |
| "loss": 0.8345, |
| "mean_token_accuracy": 0.7736409544944763, |
| "num_tokens": 10477168.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 1.2174330472946167, |
| "epoch": 0.7312252964426877, |
| "grad_norm": 88.33943176269531, |
| "learning_rate": 4.950819302214048e-06, |
| "loss": 0.7522, |
| "mean_token_accuracy": 0.7905766010284424, |
| "num_tokens": 10517860.0, |
| "step": 1295 |
| }, |
| { |
| "entropy": 1.318717932701111, |
| "epoch": 0.7340485601355167, |
| "grad_norm": 110.97924041748047, |
| "learning_rate": 4.950440187162788e-06, |
| "loss": 0.8235, |
| "mean_token_accuracy": 0.7764236092567444, |
| "num_tokens": 10558572.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 1.2312336444854737, |
| "epoch": 0.7368718238283456, |
| "grad_norm": 79.44239044189453, |
| "learning_rate": 4.950059636059382e-06, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.7687599420547485, |
| "num_tokens": 10599139.0, |
| "step": 1305 |
| }, |
| { |
| "entropy": 1.2320231914520263, |
| "epoch": 0.7396950875211745, |
| "grad_norm": 80.55126953125, |
| "learning_rate": 4.949677649203205e-06, |
| "loss": 0.8153, |
| "mean_token_accuracy": 0.7771285057067872, |
| "num_tokens": 10639780.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 1.259970498085022, |
| "epoch": 0.7425183512140033, |
| "grad_norm": 84.45753479003906, |
| "learning_rate": 4.949294226894759e-06, |
| "loss": 0.8074, |
| "mean_token_accuracy": 0.7815461993217468, |
| "num_tokens": 10680141.0, |
| "step": 1315 |
| }, |
| { |
| "entropy": 1.3869355916976929, |
| "epoch": 0.7453416149068323, |
| "grad_norm": 83.97404479980469, |
| "learning_rate": 4.948909369435681e-06, |
| "loss": 0.8837, |
| "mean_token_accuracy": 0.7602412819862365, |
| "num_tokens": 10720965.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 1.2961002826690673, |
| "epoch": 0.7481648785996612, |
| "grad_norm": 86.73141479492188, |
| "learning_rate": 4.948523077128732e-06, |
| "loss": 0.7997, |
| "mean_token_accuracy": 0.779944372177124, |
| "num_tokens": 10761782.0, |
| "step": 1325 |
| }, |
| { |
| "entropy": 1.3356840848922729, |
| "epoch": 0.7509881422924901, |
| "grad_norm": 99.98656463623047, |
| "learning_rate": 4.948135350277804e-06, |
| "loss": 0.8639, |
| "mean_token_accuracy": 0.7667491197586059, |
| "num_tokens": 10802436.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 1.3833496570587158, |
| "epoch": 0.753811405985319, |
| "grad_norm": 97.27751922607422, |
| "learning_rate": 4.9477461891879175e-06, |
| "loss": 0.8625, |
| "mean_token_accuracy": 0.7689119338989258, |
| "num_tokens": 10842892.0, |
| "step": 1335 |
| }, |
| { |
| "entropy": 1.3714573621749877, |
| "epoch": 0.756634669678148, |
| "grad_norm": 96.33169555664062, |
| "learning_rate": 4.9473555941652205e-06, |
| "loss": 0.8321, |
| "mean_token_accuracy": 0.7727259278297425, |
| "num_tokens": 10883719.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 1.1747460842132569, |
| "epoch": 0.7594579333709769, |
| "grad_norm": 87.69495391845703, |
| "learning_rate": 4.94696356551699e-06, |
| "loss": 0.7642, |
| "mean_token_accuracy": 0.7857497334480286, |
| "num_tokens": 10924460.0, |
| "step": 1345 |
| }, |
| { |
| "entropy": 1.302380084991455, |
| "epoch": 0.7622811970638057, |
| "grad_norm": 83.05201721191406, |
| "learning_rate": 4.946570103551629e-06, |
| "loss": 0.8617, |
| "mean_token_accuracy": 0.7670197010040283, |
| "num_tokens": 10964412.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 1.3350585222244262, |
| "epoch": 0.7651044607566346, |
| "grad_norm": 106.51172637939453, |
| "learning_rate": 4.946175208578671e-06, |
| "loss": 0.8771, |
| "mean_token_accuracy": 0.7620217323303222, |
| "num_tokens": 11004987.0, |
| "step": 1355 |
| }, |
| { |
| "entropy": 1.40216805934906, |
| "epoch": 0.7679277244494636, |
| "grad_norm": 95.75938415527344, |
| "learning_rate": 4.945778880908774e-06, |
| "loss": 0.8833, |
| "mean_token_accuracy": 0.7614566802978515, |
| "num_tokens": 11045870.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 1.266040849685669, |
| "epoch": 0.7707509881422925, |
| "grad_norm": 95.71835327148438, |
| "learning_rate": 4.945381120853725e-06, |
| "loss": 0.8284, |
| "mean_token_accuracy": 0.7741784572601318, |
| "num_tokens": 11086215.0, |
| "step": 1365 |
| }, |
| { |
| "entropy": 1.22134690284729, |
| "epoch": 0.7735742518351214, |
| "grad_norm": 83.96733093261719, |
| "learning_rate": 4.9449819287264355e-06, |
| "loss": 0.8055, |
| "mean_token_accuracy": 0.7777843117713928, |
| "num_tokens": 11126570.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 1.4879465103149414, |
| "epoch": 0.7763975155279503, |
| "grad_norm": 105.71060943603516, |
| "learning_rate": 4.944581304840948e-06, |
| "loss": 0.8712, |
| "mean_token_accuracy": 0.7661378502845764, |
| "num_tokens": 11167233.0, |
| "step": 1375 |
| }, |
| { |
| "entropy": 1.2257203221321107, |
| "epoch": 0.7792207792207793, |
| "grad_norm": 77.5268325805664, |
| "learning_rate": 4.944179249512425e-06, |
| "loss": 0.7715, |
| "mean_token_accuracy": 0.7869171380996705, |
| "num_tokens": 11207695.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 1.3683911561965942, |
| "epoch": 0.7820440429136082, |
| "grad_norm": 82.34163665771484, |
| "learning_rate": 4.943775763057162e-06, |
| "loss": 0.862, |
| "mean_token_accuracy": 0.7655371904373169, |
| "num_tokens": 11248219.0, |
| "step": 1385 |
| }, |
| { |
| "entropy": 1.1952388167381287, |
| "epoch": 0.784867306606437, |
| "grad_norm": 85.42621612548828, |
| "learning_rate": 4.943370845792576e-06, |
| "loss": 0.7864, |
| "mean_token_accuracy": 0.7840098381042481, |
| "num_tokens": 11288948.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 1.15340895652771, |
| "epoch": 0.7876905702992659, |
| "grad_norm": 76.76214599609375, |
| "learning_rate": 4.942964498037211e-06, |
| "loss": 0.7282, |
| "mean_token_accuracy": 0.7976190567016601, |
| "num_tokens": 11329661.0, |
| "step": 1395 |
| }, |
| { |
| "entropy": 1.2893347024917603, |
| "epoch": 0.7905138339920948, |
| "grad_norm": 92.17630004882812, |
| "learning_rate": 4.942556720110734e-06, |
| "loss": 0.8198, |
| "mean_token_accuracy": 0.7803659439086914, |
| "num_tokens": 11370231.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 1.2334280252456664, |
| "epoch": 0.7933370976849238, |
| "grad_norm": 75.12127685546875, |
| "learning_rate": 4.942147512333941e-06, |
| "loss": 0.7722, |
| "mean_token_accuracy": 0.787669575214386, |
| "num_tokens": 11411082.0, |
| "step": 1405 |
| }, |
| { |
| "entropy": 1.2461157083511352, |
| "epoch": 0.7961603613777527, |
| "grad_norm": 73.56942749023438, |
| "learning_rate": 4.9417368750287505e-06, |
| "loss": 0.7603, |
| "mean_token_accuracy": 0.789132559299469, |
| "num_tokens": 11451850.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 1.2484428882598877, |
| "epoch": 0.7989836250705816, |
| "grad_norm": 105.41342163085938, |
| "learning_rate": 4.941324808518204e-06, |
| "loss": 0.8172, |
| "mean_token_accuracy": 0.7764388442039489, |
| "num_tokens": 11492503.0, |
| "step": 1415 |
| }, |
| { |
| "entropy": 1.3737143993377685, |
| "epoch": 0.8018068887634106, |
| "grad_norm": 94.12542724609375, |
| "learning_rate": 4.940911313126473e-06, |
| "loss": 0.8495, |
| "mean_token_accuracy": 0.7677985429763794, |
| "num_tokens": 11533093.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 1.1714343309402466, |
| "epoch": 0.8046301524562394, |
| "grad_norm": 99.68680572509766, |
| "learning_rate": 4.9404963891788475e-06, |
| "loss": 0.7494, |
| "mean_token_accuracy": 0.7915635108947754, |
| "num_tokens": 11573883.0, |
| "step": 1425 |
| }, |
| { |
| "entropy": 1.3285836696624755, |
| "epoch": 0.8074534161490683, |
| "grad_norm": 90.7094497680664, |
| "learning_rate": 4.940080037001742e-06, |
| "loss": 0.8449, |
| "mean_token_accuracy": 0.7693374752998352, |
| "num_tokens": 11614518.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 1.169933295249939, |
| "epoch": 0.8102766798418972, |
| "grad_norm": 72.55992889404297, |
| "learning_rate": 4.939662256922698e-06, |
| "loss": 0.7506, |
| "mean_token_accuracy": 0.7904896259307861, |
| "num_tokens": 11655176.0, |
| "step": 1435 |
| }, |
| { |
| "entropy": 1.2623589515686036, |
| "epoch": 0.8130999435347261, |
| "grad_norm": 75.47972869873047, |
| "learning_rate": 4.939243049270377e-06, |
| "loss": 0.8011, |
| "mean_token_accuracy": 0.7814219117164611, |
| "num_tokens": 11695698.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 1.2184853553771973, |
| "epoch": 0.8159232072275551, |
| "grad_norm": 83.59069061279297, |
| "learning_rate": 4.938822414374564e-06, |
| "loss": 0.7559, |
| "mean_token_accuracy": 0.7949077248573303, |
| "num_tokens": 11736485.0, |
| "step": 1445 |
| }, |
| { |
| "entropy": 1.1922895908355713, |
| "epoch": 0.818746470920384, |
| "grad_norm": 81.77254486083984, |
| "learning_rate": 4.938400352566171e-06, |
| "loss": 0.7586, |
| "mean_token_accuracy": 0.7909193158149719, |
| "num_tokens": 11777142.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 1.2163417339324951, |
| "epoch": 0.8215697346132129, |
| "grad_norm": 82.5959701538086, |
| "learning_rate": 4.937976864177224e-06, |
| "loss": 0.7848, |
| "mean_token_accuracy": 0.7832719922065735, |
| "num_tokens": 11817664.0, |
| "step": 1455 |
| }, |
| { |
| "entropy": 1.3492765426635742, |
| "epoch": 0.8243929983060417, |
| "grad_norm": 90.08179473876953, |
| "learning_rate": 4.937551949540879e-06, |
| "loss": 0.8054, |
| "mean_token_accuracy": 0.7786729335784912, |
| "num_tokens": 11858278.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 1.1865896463394165, |
| "epoch": 0.8272162619988707, |
| "grad_norm": 83.99300384521484, |
| "learning_rate": 4.937125608991411e-06, |
| "loss": 0.7966, |
| "mean_token_accuracy": 0.7846225380897522, |
| "num_tokens": 11899097.0, |
| "step": 1465 |
| }, |
| { |
| "entropy": 1.29942147731781, |
| "epoch": 0.8300395256916996, |
| "grad_norm": 106.03931427001953, |
| "learning_rate": 4.936697842864218e-06, |
| "loss": 0.8261, |
| "mean_token_accuracy": 0.7756908416748047, |
| "num_tokens": 11939670.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 1.3267370939254761, |
| "epoch": 0.8328627893845285, |
| "grad_norm": 91.82074737548828, |
| "learning_rate": 4.936268651495817e-06, |
| "loss": 0.8495, |
| "mean_token_accuracy": 0.7673832535743713, |
| "num_tokens": 11980324.0, |
| "step": 1475 |
| }, |
| { |
| "entropy": 1.3590010166168214, |
| "epoch": 0.8356860530773574, |
| "grad_norm": 103.03872680664062, |
| "learning_rate": 4.935838035223848e-06, |
| "loss": 0.8074, |
| "mean_token_accuracy": 0.7801963925361634, |
| "num_tokens": 12020951.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 1.2534837961196899, |
| "epoch": 0.8385093167701864, |
| "grad_norm": 83.96383666992188, |
| "learning_rate": 4.935405994387073e-06, |
| "loss": 0.7585, |
| "mean_token_accuracy": 0.7928261280059814, |
| "num_tokens": 12061701.0, |
| "step": 1485 |
| }, |
| { |
| "entropy": 1.2540729999542237, |
| "epoch": 0.8413325804630153, |
| "grad_norm": 79.00284576416016, |
| "learning_rate": 4.9349725293253716e-06, |
| "loss": 0.7991, |
| "mean_token_accuracy": 0.778445029258728, |
| "num_tokens": 12102385.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 1.3115083932876588, |
| "epoch": 0.8441558441558441, |
| "grad_norm": 90.81023406982422, |
| "learning_rate": 4.934537640379746e-06, |
| "loss": 0.8204, |
| "mean_token_accuracy": 0.7764154672622681, |
| "num_tokens": 12142334.0, |
| "step": 1495 |
| }, |
| { |
| "entropy": 1.2142093896865844, |
| "epoch": 0.846979107848673, |
| "grad_norm": 85.30113220214844, |
| "learning_rate": 4.93410132789232e-06, |
| "loss": 0.7324, |
| "mean_token_accuracy": 0.7949307560920715, |
| "num_tokens": 12182674.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.846979107848673, |
| "eval_entropy": 1.402895951271057, |
| "eval_loss": 0.7068600058555603, |
| "eval_mean_token_accuracy": 0.8026972770690918, |
| "eval_num_tokens": 12182674.0, |
| "eval_runtime": 2.4508, |
| "eval_samples_per_second": 15.913, |
| "eval_steps_per_second": 2.04, |
| "step": 1500 |
| }, |
| { |
| "entropy": 1.3486925840377808, |
| "epoch": 0.849802371541502, |
| "grad_norm": 87.53974914550781, |
| "learning_rate": 4.933663592206334e-06, |
| "loss": 0.8008, |
| "mean_token_accuracy": 0.7834212064743042, |
| "num_tokens": 12223259.0, |
| "step": 1505 |
| }, |
| { |
| "entropy": 1.3488259077072144, |
| "epoch": 0.8526256352343309, |
| "grad_norm": 100.90879821777344, |
| "learning_rate": 4.933224433666149e-06, |
| "loss": 0.8067, |
| "mean_token_accuracy": 0.7783913731575012, |
| "num_tokens": 12263995.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 1.2334957122802734, |
| "epoch": 0.8554488989271598, |
| "grad_norm": 80.5767593383789, |
| "learning_rate": 4.932783852617246e-06, |
| "loss": 0.7855, |
| "mean_token_accuracy": 0.7817227125167847, |
| "num_tokens": 12304800.0, |
| "step": 1515 |
| }, |
| { |
| "entropy": 1.302841305732727, |
| "epoch": 0.8582721626199887, |
| "grad_norm": 86.97444915771484, |
| "learning_rate": 4.932341849406226e-06, |
| "loss": 0.8505, |
| "mean_token_accuracy": 0.7658728957176208, |
| "num_tokens": 12345368.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 1.3894311428070067, |
| "epoch": 0.8610954263128177, |
| "grad_norm": 89.71028900146484, |
| "learning_rate": 4.931898424380807e-06, |
| "loss": 0.8677, |
| "mean_token_accuracy": 0.7690507411956787, |
| "num_tokens": 12385525.0, |
| "step": 1525 |
| }, |
| { |
| "entropy": 1.4505614280700683, |
| "epoch": 0.8639186900056465, |
| "grad_norm": 77.6048355102539, |
| "learning_rate": 4.9314535778898265e-06, |
| "loss": 0.8238, |
| "mean_token_accuracy": 0.7735900044441223, |
| "num_tokens": 12426154.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 1.2836755752563476, |
| "epoch": 0.8667419536984754, |
| "grad_norm": 86.86837005615234, |
| "learning_rate": 4.931007310283239e-06, |
| "loss": 0.7734, |
| "mean_token_accuracy": 0.7858689308166504, |
| "num_tokens": 12466767.0, |
| "step": 1535 |
| }, |
| { |
| "entropy": 1.3301030158996583, |
| "epoch": 0.8695652173913043, |
| "grad_norm": 91.63949584960938, |
| "learning_rate": 4.930559621912119e-06, |
| "loss": 0.8515, |
| "mean_token_accuracy": 0.7686718940734864, |
| "num_tokens": 12507425.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 1.179738998413086, |
| "epoch": 0.8723884810841332, |
| "grad_norm": 84.38855743408203, |
| "learning_rate": 4.9301105131286575e-06, |
| "loss": 0.7372, |
| "mean_token_accuracy": 0.7955639958381653, |
| "num_tokens": 12548198.0, |
| "step": 1545 |
| }, |
| { |
| "entropy": 1.2709600925445557, |
| "epoch": 0.8752117447769622, |
| "grad_norm": 81.11436462402344, |
| "learning_rate": 4.929659984286162e-06, |
| "loss": 0.793, |
| "mean_token_accuracy": 0.77951899766922, |
| "num_tokens": 12589035.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 1.2885040760040283, |
| "epoch": 0.8780350084697911, |
| "grad_norm": 82.41911315917969, |
| "learning_rate": 4.929208035739059e-06, |
| "loss": 0.8032, |
| "mean_token_accuracy": 0.7800351619720459, |
| "num_tokens": 12629659.0, |
| "step": 1555 |
| }, |
| { |
| "entropy": 1.336904239654541, |
| "epoch": 0.88085827216262, |
| "grad_norm": 81.08649444580078, |
| "learning_rate": 4.928754667842891e-06, |
| "loss": 0.8051, |
| "mean_token_accuracy": 0.7788230299949646, |
| "num_tokens": 12670444.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 1.2858728647232056, |
| "epoch": 0.883681535855449, |
| "grad_norm": 73.97679138183594, |
| "learning_rate": 4.9282998809543184e-06, |
| "loss": 0.8002, |
| "mean_token_accuracy": 0.7809403300285339, |
| "num_tokens": 12710507.0, |
| "step": 1565 |
| }, |
| { |
| "entropy": 1.1876578330993652, |
| "epoch": 0.8865047995482778, |
| "grad_norm": 75.67999267578125, |
| "learning_rate": 4.927843675431114e-06, |
| "loss": 0.7803, |
| "mean_token_accuracy": 0.7848599433898926, |
| "num_tokens": 12751257.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 1.3983800411224365, |
| "epoch": 0.8893280632411067, |
| "grad_norm": 93.8427734375, |
| "learning_rate": 4.927386051632171e-06, |
| "loss": 0.8444, |
| "mean_token_accuracy": 0.7689159154891968, |
| "num_tokens": 12791859.0, |
| "step": 1575 |
| }, |
| { |
| "entropy": 1.3883586883544923, |
| "epoch": 0.8921513269339356, |
| "grad_norm": 72.4537353515625, |
| "learning_rate": 4.926927009917497e-06, |
| "loss": 0.8619, |
| "mean_token_accuracy": 0.7652193188667298, |
| "num_tokens": 12832587.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 1.2995959281921388, |
| "epoch": 0.8949745906267645, |
| "grad_norm": 76.91584014892578, |
| "learning_rate": 4.926466550648214e-06, |
| "loss": 0.7652, |
| "mean_token_accuracy": 0.7863158702850341, |
| "num_tokens": 12873265.0, |
| "step": 1585 |
| }, |
| { |
| "entropy": 1.3862105369567872, |
| "epoch": 0.8977978543195935, |
| "grad_norm": 83.39083862304688, |
| "learning_rate": 4.926004674186559e-06, |
| "loss": 0.8303, |
| "mean_token_accuracy": 0.7747887969017029, |
| "num_tokens": 12913987.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 1.411944603919983, |
| "epoch": 0.9006211180124224, |
| "grad_norm": 100.23735046386719, |
| "learning_rate": 4.925541380895887e-06, |
| "loss": 0.8915, |
| "mean_token_accuracy": 0.7623481512069702, |
| "num_tokens": 12954481.0, |
| "step": 1595 |
| }, |
| { |
| "entropy": 1.271065902709961, |
| "epoch": 0.9034443817052513, |
| "grad_norm": 82.51213073730469, |
| "learning_rate": 4.925076671140663e-06, |
| "loss": 0.7595, |
| "mean_token_accuracy": 0.7901308059692382, |
| "num_tokens": 12994972.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 1.3827197790145873, |
| "epoch": 0.9062676453980801, |
| "grad_norm": 96.4259033203125, |
| "learning_rate": 4.924610545286469e-06, |
| "loss": 0.8563, |
| "mean_token_accuracy": 0.7717739820480347, |
| "num_tokens": 13035763.0, |
| "step": 1605 |
| }, |
| { |
| "entropy": 1.336452317237854, |
| "epoch": 0.9090909090909091, |
| "grad_norm": 81.70915985107422, |
| "learning_rate": 4.924143003700002e-06, |
| "loss": 0.794, |
| "mean_token_accuracy": 0.7825908780097961, |
| "num_tokens": 13076206.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 1.3284541368484497, |
| "epoch": 0.911914172783738, |
| "grad_norm": 76.34697723388672, |
| "learning_rate": 4.92367404674907e-06, |
| "loss": 0.7913, |
| "mean_token_accuracy": 0.7860976576805114, |
| "num_tokens": 13116891.0, |
| "step": 1615 |
| }, |
| { |
| "entropy": 1.2646841764450074, |
| "epoch": 0.9147374364765669, |
| "grad_norm": 72.51228332519531, |
| "learning_rate": 4.923203674802598e-06, |
| "loss": 0.7325, |
| "mean_token_accuracy": 0.7984351754188538, |
| "num_tokens": 13157468.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 1.3084574460983276, |
| "epoch": 0.9175607001693958, |
| "grad_norm": 107.15888214111328, |
| "learning_rate": 4.922731888230618e-06, |
| "loss": 0.8181, |
| "mean_token_accuracy": 0.7769845604896546, |
| "num_tokens": 13198363.0, |
| "step": 1625 |
| }, |
| { |
| "entropy": 1.1560762643814086, |
| "epoch": 0.9203839638622248, |
| "grad_norm": 76.61054229736328, |
| "learning_rate": 4.922258687404285e-06, |
| "loss": 0.7701, |
| "mean_token_accuracy": 0.7877731561660767, |
| "num_tokens": 13239144.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 1.353186583518982, |
| "epoch": 0.9232072275550537, |
| "grad_norm": 89.5690689086914, |
| "learning_rate": 4.9217840726958535e-06, |
| "loss": 0.8118, |
| "mean_token_accuracy": 0.778467345237732, |
| "num_tokens": 13279908.0, |
| "step": 1635 |
| }, |
| { |
| "entropy": 1.3830771446228027, |
| "epoch": 0.9260304912478825, |
| "grad_norm": 104.98886108398438, |
| "learning_rate": 4.921308044478703e-06, |
| "loss": 0.8618, |
| "mean_token_accuracy": 0.7697537779808045, |
| "num_tokens": 13320629.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 1.1967981815338136, |
| "epoch": 0.9288537549407114, |
| "grad_norm": 73.26326751708984, |
| "learning_rate": 4.9208306031273155e-06, |
| "loss": 0.7582, |
| "mean_token_accuracy": 0.7919500350952149, |
| "num_tokens": 13361048.0, |
| "step": 1645 |
| }, |
| { |
| "entropy": 1.3969624519348145, |
| "epoch": 0.9316770186335404, |
| "grad_norm": 93.82074737548828, |
| "learning_rate": 4.920351749017291e-06, |
| "loss": 0.8547, |
| "mean_token_accuracy": 0.7672523736953736, |
| "num_tokens": 13401586.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 1.3160523414611816, |
| "epoch": 0.9345002823263693, |
| "grad_norm": 104.9430923461914, |
| "learning_rate": 4.919871482525337e-06, |
| "loss": 0.846, |
| "mean_token_accuracy": 0.7698705196380615, |
| "num_tokens": 13442383.0, |
| "step": 1655 |
| }, |
| { |
| "entropy": 1.2329437971115111, |
| "epoch": 0.9373235460191982, |
| "grad_norm": 100.20277404785156, |
| "learning_rate": 4.919389804029273e-06, |
| "loss": 0.751, |
| "mean_token_accuracy": 0.7936769366264343, |
| "num_tokens": 13482846.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 1.271721601486206, |
| "epoch": 0.9401468097120271, |
| "grad_norm": 85.16747283935547, |
| "learning_rate": 4.918906713908032e-06, |
| "loss": 0.7839, |
| "mean_token_accuracy": 0.7872077226638794, |
| "num_tokens": 13523418.0, |
| "step": 1665 |
| }, |
| { |
| "entropy": 1.2720824241638184, |
| "epoch": 0.9429700734048561, |
| "grad_norm": 87.69062805175781, |
| "learning_rate": 4.918422212541653e-06, |
| "loss": 0.8469, |
| "mean_token_accuracy": 0.7712218523025512, |
| "num_tokens": 13564069.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 1.4191228151321411, |
| "epoch": 0.9457933370976849, |
| "grad_norm": 98.06558990478516, |
| "learning_rate": 4.917936300311288e-06, |
| "loss": 0.8549, |
| "mean_token_accuracy": 0.7684029340744019, |
| "num_tokens": 13604748.0, |
| "step": 1675 |
| }, |
| { |
| "entropy": 1.38045175075531, |
| "epoch": 0.9486166007905138, |
| "grad_norm": 81.36489868164062, |
| "learning_rate": 4.9174489775991985e-06, |
| "loss": 0.841, |
| "mean_token_accuracy": 0.767688262462616, |
| "num_tokens": 13645366.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 1.307987141609192, |
| "epoch": 0.9514398644833427, |
| "grad_norm": 83.6993408203125, |
| "learning_rate": 4.916960244788755e-06, |
| "loss": 0.8074, |
| "mean_token_accuracy": 0.7801409244537354, |
| "num_tokens": 13686143.0, |
| "step": 1685 |
| }, |
| { |
| "entropy": 1.3370389699935914, |
| "epoch": 0.9542631281761716, |
| "grad_norm": 87.65348052978516, |
| "learning_rate": 4.91647010226444e-06, |
| "loss": 0.7783, |
| "mean_token_accuracy": 0.7867790102958679, |
| "num_tokens": 13726865.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 1.392472004890442, |
| "epoch": 0.9570863918690006, |
| "grad_norm": 89.71502685546875, |
| "learning_rate": 4.9159785504118405e-06, |
| "loss": 0.808, |
| "mean_token_accuracy": 0.7795929908752441, |
| "num_tokens": 13767384.0, |
| "step": 1695 |
| }, |
| { |
| "entropy": 1.4235849142074586, |
| "epoch": 0.9599096555618295, |
| "grad_norm": 82.83116149902344, |
| "learning_rate": 4.9154855896176555e-06, |
| "loss": 0.8453, |
| "mean_token_accuracy": 0.7748985409736633, |
| "num_tokens": 13808044.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 1.4003316402435302, |
| "epoch": 0.9627329192546584, |
| "grad_norm": 101.7616958618164, |
| "learning_rate": 4.9149912202696905e-06, |
| "loss": 0.8407, |
| "mean_token_accuracy": 0.7737895011901855, |
| "num_tokens": 13848853.0, |
| "step": 1705 |
| }, |
| { |
| "entropy": 1.2794543743133544, |
| "epoch": 0.9655561829474872, |
| "grad_norm": 88.42160034179688, |
| "learning_rate": 4.9144954427568615e-06, |
| "loss": 0.775, |
| "mean_token_accuracy": 0.7884101748466492, |
| "num_tokens": 13889396.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 1.3160929203033447, |
| "epoch": 0.9683794466403162, |
| "grad_norm": 89.33016204833984, |
| "learning_rate": 4.913998257469189e-06, |
| "loss": 0.7975, |
| "mean_token_accuracy": 0.7818446278572082, |
| "num_tokens": 13930171.0, |
| "step": 1715 |
| }, |
| { |
| "entropy": 1.4001628637313843, |
| "epoch": 0.9712027103331451, |
| "grad_norm": 84.03208923339844, |
| "learning_rate": 4.913499664797805e-06, |
| "loss": 0.8569, |
| "mean_token_accuracy": 0.7669258832931518, |
| "num_tokens": 13970784.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 1.3456218719482422, |
| "epoch": 0.974025974025974, |
| "grad_norm": 91.97527313232422, |
| "learning_rate": 4.912999665134944e-06, |
| "loss": 0.8883, |
| "mean_token_accuracy": 0.7597482323646545, |
| "num_tokens": 14011324.0, |
| "step": 1725 |
| }, |
| { |
| "entropy": 1.305102777481079, |
| "epoch": 0.9768492377188029, |
| "grad_norm": 78.9338150024414, |
| "learning_rate": 4.912498258873952e-06, |
| "loss": 0.7794, |
| "mean_token_accuracy": 0.7846640110015869, |
| "num_tokens": 14052054.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 1.3051819801330566, |
| "epoch": 0.9796725014116319, |
| "grad_norm": 78.96739196777344, |
| "learning_rate": 4.911995446409277e-06, |
| "loss": 0.7876, |
| "mean_token_accuracy": 0.7836643576622009, |
| "num_tokens": 14092687.0, |
| "step": 1735 |
| }, |
| { |
| "entropy": 1.3111673593521118, |
| "epoch": 0.9824957651044608, |
| "grad_norm": 70.50989532470703, |
| "learning_rate": 4.911491228136478e-06, |
| "loss": 0.7716, |
| "mean_token_accuracy": 0.7886781573295594, |
| "num_tokens": 14133371.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 1.206014060974121, |
| "epoch": 0.9853190287972897, |
| "grad_norm": 84.27452087402344, |
| "learning_rate": 4.9109856044522164e-06, |
| "loss": 0.7606, |
| "mean_token_accuracy": 0.7878808975219727, |
| "num_tokens": 14173882.0, |
| "step": 1745 |
| }, |
| { |
| "entropy": 1.2502301692962647, |
| "epoch": 0.9881422924901185, |
| "grad_norm": 78.59927368164062, |
| "learning_rate": 4.91047857575426e-06, |
| "loss": 0.7112, |
| "mean_token_accuracy": 0.8004097700119018, |
| "num_tokens": 14214526.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 1.3274275541305542, |
| "epoch": 0.9909655561829475, |
| "grad_norm": 81.658447265625, |
| "learning_rate": 4.909970142441483e-06, |
| "loss": 0.8183, |
| "mean_token_accuracy": 0.7792202591896057, |
| "num_tokens": 14255273.0, |
| "step": 1755 |
| }, |
| { |
| "entropy": 1.3543532848358155, |
| "epoch": 0.9937888198757764, |
| "grad_norm": 87.37543487548828, |
| "learning_rate": 4.909460304913863e-06, |
| "loss": 0.8334, |
| "mean_token_accuracy": 0.77262362241745, |
| "num_tokens": 14296045.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 1.2293973207473754, |
| "epoch": 0.9966120835686053, |
| "grad_norm": 78.08673095703125, |
| "learning_rate": 4.9089490635724845e-06, |
| "loss": 0.794, |
| "mean_token_accuracy": 0.7820210456848145, |
| "num_tokens": 14336723.0, |
| "step": 1765 |
| }, |
| { |
| "entropy": 1.3779594898223877, |
| "epoch": 0.9994353472614342, |
| "grad_norm": 90.49781036376953, |
| "learning_rate": 4.908436418819533e-06, |
| "loss": 0.8196, |
| "mean_token_accuracy": 0.7763382792472839, |
| "num_tokens": 14377368.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 1.2417575359344482, |
| "epoch": 1.002258610954263, |
| "grad_norm": 78.08610534667969, |
| "learning_rate": 4.907922371058302e-06, |
| "loss": 0.7171, |
| "mean_token_accuracy": 0.7971941590309143, |
| "num_tokens": 14411228.0, |
| "step": 1775 |
| }, |
| { |
| "entropy": 0.9904776334762573, |
| "epoch": 1.005081874647092, |
| "grad_norm": 89.43754577636719, |
| "learning_rate": 4.907406920693187e-06, |
| "loss": 0.6408, |
| "mean_token_accuracy": 0.8144368052482605, |
| "num_tokens": 14451550.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 1.1428791284561157, |
| "epoch": 1.007905138339921, |
| "grad_norm": 81.54706573486328, |
| "learning_rate": 4.9068900681296845e-06, |
| "loss": 0.6205, |
| "mean_token_accuracy": 0.8160752058029175, |
| "num_tokens": 14492399.0, |
| "step": 1785 |
| }, |
| { |
| "entropy": 1.0305280327796935, |
| "epoch": 1.0107284020327498, |
| "grad_norm": 83.88504028320312, |
| "learning_rate": 4.906371813774398e-06, |
| "loss": 0.5993, |
| "mean_token_accuracy": 0.8244444727897644, |
| "num_tokens": 14533135.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 1.0244199752807617, |
| "epoch": 1.0135516657255788, |
| "grad_norm": 79.7117919921875, |
| "learning_rate": 4.90585215803503e-06, |
| "loss": 0.6408, |
| "mean_token_accuracy": 0.8139601945877075, |
| "num_tokens": 14573933.0, |
| "step": 1795 |
| }, |
| { |
| "entropy": 1.0104292154312133, |
| "epoch": 1.0163749294184077, |
| "grad_norm": 85.22904205322266, |
| "learning_rate": 4.9053311013203906e-06, |
| "loss": 0.6213, |
| "mean_token_accuracy": 0.8191802263259887, |
| "num_tokens": 14614666.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 1.0330065846443177, |
| "epoch": 1.0191981931112366, |
| "grad_norm": 79.9068832397461, |
| "learning_rate": 4.904808644040388e-06, |
| "loss": 0.585, |
| "mean_token_accuracy": 0.829715096950531, |
| "num_tokens": 14655486.0, |
| "step": 1805 |
| }, |
| { |
| "entropy": 1.1681673049926757, |
| "epoch": 1.0220214568040655, |
| "grad_norm": 96.07464599609375, |
| "learning_rate": 4.90428478660603e-06, |
| "loss": 0.7047, |
| "mean_token_accuracy": 0.7974439859390259, |
| "num_tokens": 14695925.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 1.026873731613159, |
| "epoch": 1.0248447204968945, |
| "grad_norm": 83.59033203125, |
| "learning_rate": 4.9037595294294334e-06, |
| "loss": 0.6233, |
| "mean_token_accuracy": 0.817428719997406, |
| "num_tokens": 14736564.0, |
| "step": 1815 |
| }, |
| { |
| "entropy": 1.0269073605537415, |
| "epoch": 1.0276679841897234, |
| "grad_norm": 87.5685806274414, |
| "learning_rate": 4.90323287292381e-06, |
| "loss": 0.6265, |
| "mean_token_accuracy": 0.8175249814987182, |
| "num_tokens": 14777242.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 1.0845494270324707, |
| "epoch": 1.0304912478825523, |
| "grad_norm": 87.97651672363281, |
| "learning_rate": 4.902704817503474e-06, |
| "loss": 0.6521, |
| "mean_token_accuracy": 0.8123294234275817, |
| "num_tokens": 14817756.0, |
| "step": 1825 |
| }, |
| { |
| "entropy": 1.094653880596161, |
| "epoch": 1.0333145115753812, |
| "grad_norm": 73.90741729736328, |
| "learning_rate": 4.90217536358384e-06, |
| "loss": 0.6592, |
| "mean_token_accuracy": 0.8132083773612976, |
| "num_tokens": 14857979.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 1.0332510590553283, |
| "epoch": 1.0361377752682102, |
| "grad_norm": 80.54743957519531, |
| "learning_rate": 4.901644511581425e-06, |
| "loss": 0.6275, |
| "mean_token_accuracy": 0.817654836177826, |
| "num_tokens": 14898498.0, |
| "step": 1835 |
| }, |
| { |
| "entropy": 1.1335099458694458, |
| "epoch": 1.0389610389610389, |
| "grad_norm": 111.70860290527344, |
| "learning_rate": 4.901112261913841e-06, |
| "loss": 0.702, |
| "mean_token_accuracy": 0.7976698398590087, |
| "num_tokens": 14938994.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 1.0480782866477967, |
| "epoch": 1.0417843026538678, |
| "grad_norm": 65.32731628417969, |
| "learning_rate": 4.9005786149998045e-06, |
| "loss": 0.5995, |
| "mean_token_accuracy": 0.8246279358863831, |
| "num_tokens": 14979080.0, |
| "step": 1845 |
| }, |
| { |
| "entropy": 1.1315045833587647, |
| "epoch": 1.0446075663466967, |
| "grad_norm": 94.34065246582031, |
| "learning_rate": 4.90004357125913e-06, |
| "loss": 0.6734, |
| "mean_token_accuracy": 0.8067026734352112, |
| "num_tokens": 15019684.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 1.2490570545196533, |
| "epoch": 1.0474308300395256, |
| "grad_norm": 91.20641326904297, |
| "learning_rate": 4.899507131112727e-06, |
| "loss": 0.75, |
| "mean_token_accuracy": 0.7861335158348084, |
| "num_tokens": 15060297.0, |
| "step": 1855 |
| }, |
| { |
| "entropy": 1.2040926933288574, |
| "epoch": 1.0502540937323546, |
| "grad_norm": 77.99979400634766, |
| "learning_rate": 4.89896929498261e-06, |
| "loss": 0.6917, |
| "mean_token_accuracy": 0.8017707467079163, |
| "num_tokens": 15100764.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 1.2546609878540038, |
| "epoch": 1.0530773574251835, |
| "grad_norm": 84.84165954589844, |
| "learning_rate": 4.898430063291886e-06, |
| "loss": 0.6753, |
| "mean_token_accuracy": 0.8059080719947815, |
| "num_tokens": 15141502.0, |
| "step": 1865 |
| }, |
| { |
| "entropy": 1.1682037115097046, |
| "epoch": 1.0559006211180124, |
| "grad_norm": 77.86764526367188, |
| "learning_rate": 4.897889436464763e-06, |
| "loss": 0.6515, |
| "mean_token_accuracy": 0.8100051641464233, |
| "num_tokens": 15182082.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 1.063246726989746, |
| "epoch": 1.0587238848108413, |
| "grad_norm": 74.87948608398438, |
| "learning_rate": 4.8973474149265456e-06, |
| "loss": 0.6319, |
| "mean_token_accuracy": 0.8158110737800598, |
| "num_tokens": 15222792.0, |
| "step": 1875 |
| }, |
| { |
| "entropy": 1.170669722557068, |
| "epoch": 1.0615471485036703, |
| "grad_norm": 88.10620880126953, |
| "learning_rate": 4.896803999103636e-06, |
| "loss": 0.6891, |
| "mean_token_accuracy": 0.8011930465698243, |
| "num_tokens": 15263508.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 1.0335828423500062, |
| "epoch": 1.0643704121964992, |
| "grad_norm": 93.42755889892578, |
| "learning_rate": 4.896259189423533e-06, |
| "loss": 0.5967, |
| "mean_token_accuracy": 0.8240804553031922, |
| "num_tokens": 15304067.0, |
| "step": 1885 |
| }, |
| { |
| "entropy": 1.0640571594238282, |
| "epoch": 1.0671936758893281, |
| "grad_norm": 100.56329345703125, |
| "learning_rate": 4.895712986314831e-06, |
| "loss": 0.6666, |
| "mean_token_accuracy": 0.8102141976356506, |
| "num_tokens": 15344671.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 1.099123191833496, |
| "epoch": 1.070016939582157, |
| "grad_norm": 84.1073989868164, |
| "learning_rate": 4.8951653902072226e-06, |
| "loss": 0.6195, |
| "mean_token_accuracy": 0.8202435731887817, |
| "num_tokens": 15384831.0, |
| "step": 1895 |
| }, |
| { |
| "entropy": 1.1124640941619872, |
| "epoch": 1.072840203274986, |
| "grad_norm": 80.79557800292969, |
| "learning_rate": 4.894616401531495e-06, |
| "loss": 0.6671, |
| "mean_token_accuracy": 0.8074574589729309, |
| "num_tokens": 15425501.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 1.1609252452850343, |
| "epoch": 1.075663466967815, |
| "grad_norm": 73.7085189819336, |
| "learning_rate": 4.89406602071953e-06, |
| "loss": 0.7432, |
| "mean_token_accuracy": 0.7908249497413635, |
| "num_tokens": 15466052.0, |
| "step": 1905 |
| }, |
| { |
| "entropy": 1.2594902276992799, |
| "epoch": 1.0784867306606438, |
| "grad_norm": 90.67098236083984, |
| "learning_rate": 4.893514248204307e-06, |
| "loss": 0.7448, |
| "mean_token_accuracy": 0.7901643276214599, |
| "num_tokens": 15506509.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 1.089804220199585, |
| "epoch": 1.0813099943534725, |
| "grad_norm": 83.40077209472656, |
| "learning_rate": 4.892961084419899e-06, |
| "loss": 0.6137, |
| "mean_token_accuracy": 0.8197898030281067, |
| "num_tokens": 15547127.0, |
| "step": 1915 |
| }, |
| { |
| "entropy": 1.0303892850875855, |
| "epoch": 1.0841332580463015, |
| "grad_norm": 88.45729064941406, |
| "learning_rate": 4.892406529801472e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.8114518284797668, |
| "num_tokens": 15587691.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 1.036658251285553, |
| "epoch": 1.0869565217391304, |
| "grad_norm": 74.7301254272461, |
| "learning_rate": 4.8918505847852885e-06, |
| "loss": 0.6001, |
| "mean_token_accuracy": 0.8228102445602417, |
| "num_tokens": 15628447.0, |
| "step": 1925 |
| }, |
| { |
| "entropy": 1.0575623393058777, |
| "epoch": 1.0897797854319593, |
| "grad_norm": 84.69684600830078, |
| "learning_rate": 4.8912932498087035e-06, |
| "loss": 0.6356, |
| "mean_token_accuracy": 0.8159940361976623, |
| "num_tokens": 15668880.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 1.0984156847000122, |
| "epoch": 1.0926030491247882, |
| "grad_norm": 83.59989166259766, |
| "learning_rate": 4.890734525310166e-06, |
| "loss": 0.6254, |
| "mean_token_accuracy": 0.8188717722892761, |
| "num_tokens": 15709301.0, |
| "step": 1935 |
| }, |
| { |
| "entropy": 1.264682650566101, |
| "epoch": 1.0954263128176172, |
| "grad_norm": 111.08476257324219, |
| "learning_rate": 4.890174411729218e-06, |
| "loss": 0.7209, |
| "mean_token_accuracy": 0.7967895984649658, |
| "num_tokens": 15749861.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 1.0748285174369812, |
| "epoch": 1.098249576510446, |
| "grad_norm": 72.8355941772461, |
| "learning_rate": 4.889612909506495e-06, |
| "loss": 0.6334, |
| "mean_token_accuracy": 0.8148902177810669, |
| "num_tokens": 15790605.0, |
| "step": 1945 |
| }, |
| { |
| "entropy": 1.1956205368041992, |
| "epoch": 1.101072840203275, |
| "grad_norm": 76.77680206298828, |
| "learning_rate": 4.889050019083722e-06, |
| "loss": 0.6858, |
| "mean_token_accuracy": 0.8026182293891907, |
| "num_tokens": 15831241.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 1.1811149835586547, |
| "epoch": 1.103896103896104, |
| "grad_norm": 85.11073303222656, |
| "learning_rate": 4.88848574090372e-06, |
| "loss": 0.6532, |
| "mean_token_accuracy": 0.8134795784950256, |
| "num_tokens": 15871995.0, |
| "step": 1955 |
| }, |
| { |
| "entropy": 1.0546734929084778, |
| "epoch": 1.1067193675889329, |
| "grad_norm": 91.46461486816406, |
| "learning_rate": 4.8879200754104e-06, |
| "loss": 0.6257, |
| "mean_token_accuracy": 0.8180089235305786, |
| "num_tokens": 15912519.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 1.1119594335556031, |
| "epoch": 1.1095426312817618, |
| "grad_norm": 81.14483642578125, |
| "learning_rate": 4.887353023048762e-06, |
| "loss": 0.6163, |
| "mean_token_accuracy": 0.8205843210220337, |
| "num_tokens": 15953027.0, |
| "step": 1965 |
| }, |
| { |
| "entropy": 1.1416823863983154, |
| "epoch": 1.1123658949745907, |
| "grad_norm": 77.70703887939453, |
| "learning_rate": 4.886784584264903e-06, |
| "loss": 0.6972, |
| "mean_token_accuracy": 0.8091330289840698, |
| "num_tokens": 15993823.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 1.098362350463867, |
| "epoch": 1.1151891586674196, |
| "grad_norm": 70.67080688476562, |
| "learning_rate": 4.8862147595060045e-06, |
| "loss": 0.64, |
| "mean_token_accuracy": 0.8146771907806396, |
| "num_tokens": 16034476.0, |
| "step": 1975 |
| }, |
| { |
| "entropy": 1.0884705901145935, |
| "epoch": 1.1180124223602483, |
| "grad_norm": 82.03054809570312, |
| "learning_rate": 4.885643549220342e-06, |
| "loss": 0.5783, |
| "mean_token_accuracy": 0.8282935738563537, |
| "num_tokens": 16075193.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 1.2014532089233398, |
| "epoch": 1.1208356860530773, |
| "grad_norm": 75.95311737060547, |
| "learning_rate": 4.885070953857279e-06, |
| "loss": 0.7202, |
| "mean_token_accuracy": 0.7983322143554688, |
| "num_tokens": 16115418.0, |
| "step": 1985 |
| }, |
| { |
| "entropy": 1.1905818343162538, |
| "epoch": 1.1236589497459062, |
| "grad_norm": 88.39530181884766, |
| "learning_rate": 4.88449697386727e-06, |
| "loss": 0.658, |
| "mean_token_accuracy": 0.8100619435310363, |
| "num_tokens": 16156001.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 1.1695854306221007, |
| "epoch": 1.1264822134387351, |
| "grad_norm": 75.26634979248047, |
| "learning_rate": 4.883921609701858e-06, |
| "loss": 0.6497, |
| "mean_token_accuracy": 0.8133507966995239, |
| "num_tokens": 16196588.0, |
| "step": 1995 |
| }, |
| { |
| "entropy": 1.1279015779495238, |
| "epoch": 1.129305477131564, |
| "grad_norm": 79.23324584960938, |
| "learning_rate": 4.883344861813675e-06, |
| "loss": 0.6328, |
| "mean_token_accuracy": 0.8204566597938537, |
| "num_tokens": 16237363.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.129305477131564, |
| "eval_entropy": 1.276236355304718, |
| "eval_loss": 0.6153639554977417, |
| "eval_mean_token_accuracy": 0.8272382497787476, |
| "eval_num_tokens": 16237363.0, |
| "eval_runtime": 2.4525, |
| "eval_samples_per_second": 15.902, |
| "eval_steps_per_second": 2.039, |
| "step": 2000 |
| }, |
| { |
| "entropy": 1.1289226531982421, |
| "epoch": 1.132128740824393, |
| "grad_norm": 93.11396789550781, |
| "learning_rate": 4.882766730656443e-06, |
| "loss": 0.6587, |
| "mean_token_accuracy": 0.8116627216339112, |
| "num_tokens": 16278012.0, |
| "step": 2005 |
| }, |
| { |
| "entropy": 1.1176002979278565, |
| "epoch": 1.134952004517222, |
| "grad_norm": 70.02957153320312, |
| "learning_rate": 4.882187216684969e-06, |
| "loss": 0.6425, |
| "mean_token_accuracy": 0.8133005023002624, |
| "num_tokens": 16318860.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 1.1600654363632201, |
| "epoch": 1.1377752682100508, |
| "grad_norm": 87.49974060058594, |
| "learning_rate": 4.881606320355152e-06, |
| "loss": 0.6819, |
| "mean_token_accuracy": 0.8076816439628601, |
| "num_tokens": 16359499.0, |
| "step": 2015 |
| }, |
| { |
| "entropy": 1.3200592517852783, |
| "epoch": 1.1405985319028797, |
| "grad_norm": 90.8593521118164, |
| "learning_rate": 4.881024042123974e-06, |
| "loss": 0.7624, |
| "mean_token_accuracy": 0.787363862991333, |
| "num_tokens": 16400128.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 1.1962620735168457, |
| "epoch": 1.1434217955957087, |
| "grad_norm": 73.66609954833984, |
| "learning_rate": 4.880440382449508e-06, |
| "loss": 0.6969, |
| "mean_token_accuracy": 0.8003793478012085, |
| "num_tokens": 16440829.0, |
| "step": 2025 |
| }, |
| { |
| "entropy": 1.1741215705871582, |
| "epoch": 1.1462450592885376, |
| "grad_norm": 93.21961212158203, |
| "learning_rate": 4.87985534179091e-06, |
| "loss": 0.6988, |
| "mean_token_accuracy": 0.8001942276954651, |
| "num_tokens": 16481536.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 1.1626744151115418, |
| "epoch": 1.1490683229813665, |
| "grad_norm": 78.82553100585938, |
| "learning_rate": 4.879268920608428e-06, |
| "loss": 0.684, |
| "mean_token_accuracy": 0.8021666407585144, |
| "num_tokens": 16522024.0, |
| "step": 2035 |
| }, |
| { |
| "entropy": 1.2014680624008178, |
| "epoch": 1.1518915866741954, |
| "grad_norm": 69.18099212646484, |
| "learning_rate": 4.87868111936339e-06, |
| "loss": 0.6776, |
| "mean_token_accuracy": 0.8064275741577148, |
| "num_tokens": 16562819.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 1.194064235687256, |
| "epoch": 1.1547148503670244, |
| "grad_norm": 100.71419525146484, |
| "learning_rate": 4.878091938518213e-06, |
| "loss": 0.6952, |
| "mean_token_accuracy": 0.8035964369773865, |
| "num_tokens": 16603097.0, |
| "step": 2045 |
| }, |
| { |
| "entropy": 1.0866102695465087, |
| "epoch": 1.1575381140598533, |
| "grad_norm": 86.4557876586914, |
| "learning_rate": 4.877501378536398e-06, |
| "loss": 0.6586, |
| "mean_token_accuracy": 0.8083908200263977, |
| "num_tokens": 16643682.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 1.177332592010498, |
| "epoch": 1.160361377752682, |
| "grad_norm": 79.74742126464844, |
| "learning_rate": 4.876909439882533e-06, |
| "loss": 0.7028, |
| "mean_token_accuracy": 0.8003194212913514, |
| "num_tokens": 16684377.0, |
| "step": 2055 |
| }, |
| { |
| "entropy": 1.1629464387893678, |
| "epoch": 1.163184641445511, |
| "grad_norm": 71.04415130615234, |
| "learning_rate": 4.8763161230222875e-06, |
| "loss": 0.6799, |
| "mean_token_accuracy": 0.8050651669502258, |
| "num_tokens": 16725224.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 1.272722601890564, |
| "epoch": 1.1660079051383399, |
| "grad_norm": 95.0716323852539, |
| "learning_rate": 4.875721428422418e-06, |
| "loss": 0.7034, |
| "mean_token_accuracy": 0.7992398619651795, |
| "num_tokens": 16765834.0, |
| "step": 2065 |
| }, |
| { |
| "entropy": 1.1523715257644653, |
| "epoch": 1.1688311688311688, |
| "grad_norm": 71.78240966796875, |
| "learning_rate": 4.875125356550762e-06, |
| "loss": 0.6712, |
| "mean_token_accuracy": 0.8078425049781799, |
| "num_tokens": 16806092.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 1.1174561262130738, |
| "epoch": 1.1716544325239977, |
| "grad_norm": 80.72341918945312, |
| "learning_rate": 4.874527907876244e-06, |
| "loss": 0.6329, |
| "mean_token_accuracy": 0.8179211616516113, |
| "num_tokens": 16846837.0, |
| "step": 2075 |
| }, |
| { |
| "entropy": 1.1514964818954467, |
| "epoch": 1.1744776962168266, |
| "grad_norm": 97.3735580444336, |
| "learning_rate": 4.87392908286887e-06, |
| "loss": 0.673, |
| "mean_token_accuracy": 0.8057490825653076, |
| "num_tokens": 16887539.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 1.1512172937393188, |
| "epoch": 1.1773009599096556, |
| "grad_norm": 90.2729263305664, |
| "learning_rate": 4.873328881999726e-06, |
| "loss": 0.6707, |
| "mean_token_accuracy": 0.8072671890258789, |
| "num_tokens": 16928198.0, |
| "step": 2085 |
| }, |
| { |
| "entropy": 1.1971535444259644, |
| "epoch": 1.1801242236024845, |
| "grad_norm": 84.18321990966797, |
| "learning_rate": 4.872727305740986e-06, |
| "loss": 0.6697, |
| "mean_token_accuracy": 0.8083752751350403, |
| "num_tokens": 16968681.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 1.1049229860305787, |
| "epoch": 1.1829474872953134, |
| "grad_norm": 83.14247131347656, |
| "learning_rate": 4.872124354565901e-06, |
| "loss": 0.658, |
| "mean_token_accuracy": 0.8073788404464721, |
| "num_tokens": 17009546.0, |
| "step": 2095 |
| }, |
| { |
| "entropy": 1.0792413353919983, |
| "epoch": 1.1857707509881423, |
| "grad_norm": 70.80720520019531, |
| "learning_rate": 4.871520028948807e-06, |
| "loss": 0.6335, |
| "mean_token_accuracy": 0.8118250489234924, |
| "num_tokens": 17050298.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 1.1736824989318848, |
| "epoch": 1.1885940146809713, |
| "grad_norm": 81.12787628173828, |
| "learning_rate": 4.870914329365117e-06, |
| "loss": 0.724, |
| "mean_token_accuracy": 0.7910011529922485, |
| "num_tokens": 17090911.0, |
| "step": 2105 |
| }, |
| { |
| "entropy": 1.2028324127197265, |
| "epoch": 1.1914172783738002, |
| "grad_norm": 89.57659912109375, |
| "learning_rate": 4.870307256291331e-06, |
| "loss": 0.7212, |
| "mean_token_accuracy": 0.7946621179580688, |
| "num_tokens": 17131732.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 1.023105502128601, |
| "epoch": 1.194240542066629, |
| "grad_norm": 77.13057708740234, |
| "learning_rate": 4.869698810205025e-06, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.8203395962715149, |
| "num_tokens": 17172456.0, |
| "step": 2115 |
| }, |
| { |
| "entropy": 1.155041766166687, |
| "epoch": 1.1970638057594578, |
| "grad_norm": 73.1446304321289, |
| "learning_rate": 4.869088991584854e-06, |
| "loss": 0.6336, |
| "mean_token_accuracy": 0.8165818333625794, |
| "num_tokens": 17212966.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 1.1935111284255981, |
| "epoch": 1.199887069452287, |
| "grad_norm": 81.32876586914062, |
| "learning_rate": 4.8684778009105596e-06, |
| "loss": 0.6916, |
| "mean_token_accuracy": 0.79750075340271, |
| "num_tokens": 17253526.0, |
| "step": 2125 |
| }, |
| { |
| "entropy": 1.0915156602859497, |
| "epoch": 1.2027103331451157, |
| "grad_norm": 91.79847717285156, |
| "learning_rate": 4.867865238662954e-06, |
| "loss": 0.6711, |
| "mean_token_accuracy": 0.8062323927879333, |
| "num_tokens": 17293948.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 1.118806290626526, |
| "epoch": 1.2055335968379446, |
| "grad_norm": 73.3872299194336, |
| "learning_rate": 4.867251305323935e-06, |
| "loss": 0.6266, |
| "mean_token_accuracy": 0.8176208019256592, |
| "num_tokens": 17334493.0, |
| "step": 2135 |
| }, |
| { |
| "entropy": 1.1635672569274902, |
| "epoch": 1.2083568605307735, |
| "grad_norm": 89.14440155029297, |
| "learning_rate": 4.866636001376475e-06, |
| "loss": 0.679, |
| "mean_token_accuracy": 0.8068656921386719, |
| "num_tokens": 17375343.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 1.1096044063568116, |
| "epoch": 1.2111801242236024, |
| "grad_norm": 77.38127136230469, |
| "learning_rate": 4.8660193273046295e-06, |
| "loss": 0.6625, |
| "mean_token_accuracy": 0.8062254667282105, |
| "num_tokens": 17415968.0, |
| "step": 2145 |
| }, |
| { |
| "entropy": 1.2033586382865906, |
| "epoch": 1.2140033879164314, |
| "grad_norm": 89.66434478759766, |
| "learning_rate": 4.865401283593525e-06, |
| "loss": 0.6891, |
| "mean_token_accuracy": 0.8035327792167664, |
| "num_tokens": 17456339.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 1.1876700401306153, |
| "epoch": 1.2168266516092603, |
| "grad_norm": 76.18563842773438, |
| "learning_rate": 4.864781870729371e-06, |
| "loss": 0.6963, |
| "mean_token_accuracy": 0.8002577424049377, |
| "num_tokens": 17496415.0, |
| "step": 2155 |
| }, |
| { |
| "entropy": 1.2191560983657836, |
| "epoch": 1.2196499153020892, |
| "grad_norm": 70.3829574584961, |
| "learning_rate": 4.864161089199453e-06, |
| "loss": 0.7123, |
| "mean_token_accuracy": 0.7971852898597718, |
| "num_tokens": 17537171.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 1.0485622882843018, |
| "epoch": 1.2224731789949181, |
| "grad_norm": 72.504150390625, |
| "learning_rate": 4.86353893949213e-06, |
| "loss": 0.5931, |
| "mean_token_accuracy": 0.827324640750885, |
| "num_tokens": 17577545.0, |
| "step": 2165 |
| }, |
| { |
| "entropy": 1.0504500150680542, |
| "epoch": 1.225296442687747, |
| "grad_norm": 79.71308135986328, |
| "learning_rate": 4.862915422096842e-06, |
| "loss": 0.6625, |
| "mean_token_accuracy": 0.8063273549079895, |
| "num_tokens": 17618424.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 1.089062762260437, |
| "epoch": 1.228119706380576, |
| "grad_norm": 76.05992889404297, |
| "learning_rate": 4.862290537504102e-06, |
| "loss": 0.6293, |
| "mean_token_accuracy": 0.8189480900764465, |
| "num_tokens": 17659160.0, |
| "step": 2175 |
| }, |
| { |
| "entropy": 1.1212316513061524, |
| "epoch": 1.230942970073405, |
| "grad_norm": 84.9821548461914, |
| "learning_rate": 4.861664286205499e-06, |
| "loss": 0.6649, |
| "mean_token_accuracy": 0.8077459692955017, |
| "num_tokens": 17699632.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 1.063609516620636, |
| "epoch": 1.2337662337662338, |
| "grad_norm": 75.26050567626953, |
| "learning_rate": 4.861036668693698e-06, |
| "loss": 0.6225, |
| "mean_token_accuracy": 0.8181837797164917, |
| "num_tokens": 17740472.0, |
| "step": 2185 |
| }, |
| { |
| "entropy": 1.2392653703689576, |
| "epoch": 1.2365894974590628, |
| "grad_norm": 82.38720703125, |
| "learning_rate": 4.860407685462438e-06, |
| "loss": 0.7298, |
| "mean_token_accuracy": 0.7937769293785095, |
| "num_tokens": 17780447.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 1.1494086742401124, |
| "epoch": 1.2394127611518915, |
| "grad_norm": 84.39624786376953, |
| "learning_rate": 4.859777337006533e-06, |
| "loss": 0.6817, |
| "mean_token_accuracy": 0.8082091212272644, |
| "num_tokens": 17821131.0, |
| "step": 2195 |
| }, |
| { |
| "entropy": 1.169661521911621, |
| "epoch": 1.2422360248447206, |
| "grad_norm": 80.15399169921875, |
| "learning_rate": 4.85914562382187e-06, |
| "loss": 0.6588, |
| "mean_token_accuracy": 0.8093379259109497, |
| "num_tokens": 17861753.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 1.1408036470413208, |
| "epoch": 1.2450592885375493, |
| "grad_norm": 71.21530151367188, |
| "learning_rate": 4.858512546405411e-06, |
| "loss": 0.6404, |
| "mean_token_accuracy": 0.8124599695205689, |
| "num_tokens": 17902320.0, |
| "step": 2205 |
| }, |
| { |
| "entropy": 1.0396205544471742, |
| "epoch": 1.2478825522303783, |
| "grad_norm": 94.92953491210938, |
| "learning_rate": 4.857878105255189e-06, |
| "loss": 0.6568, |
| "mean_token_accuracy": 0.8109822630882263, |
| "num_tokens": 17943114.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 1.1258668422698974, |
| "epoch": 1.2507058159232072, |
| "grad_norm": 84.43074798583984, |
| "learning_rate": 4.857242300870313e-06, |
| "loss": 0.6687, |
| "mean_token_accuracy": 0.8072779536247253, |
| "num_tokens": 17983284.0, |
| "step": 2215 |
| }, |
| { |
| "entropy": 1.2762274265289306, |
| "epoch": 1.253529079616036, |
| "grad_norm": 80.23455810546875, |
| "learning_rate": 4.8566051337509626e-06, |
| "loss": 0.7305, |
| "mean_token_accuracy": 0.7920625567436218, |
| "num_tokens": 18023783.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 1.1090112805366517, |
| "epoch": 1.256352343308865, |
| "grad_norm": 83.74242401123047, |
| "learning_rate": 4.8559666043983886e-06, |
| "loss": 0.6528, |
| "mean_token_accuracy": 0.8134839653968811, |
| "num_tokens": 18064489.0, |
| "step": 2225 |
| }, |
| { |
| "entropy": 1.1557289123535157, |
| "epoch": 1.259175607001694, |
| "grad_norm": 76.00527954101562, |
| "learning_rate": 4.855326713314916e-06, |
| "loss": 0.6507, |
| "mean_token_accuracy": 0.8127437829971313, |
| "num_tokens": 18105144.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 1.1039715051651, |
| "epoch": 1.2619988706945229, |
| "grad_norm": 84.07186126708984, |
| "learning_rate": 4.854685461003939e-06, |
| "loss": 0.6655, |
| "mean_token_accuracy": 0.8063163757324219, |
| "num_tokens": 18145903.0, |
| "step": 2235 |
| }, |
| { |
| "entropy": 1.129080843925476, |
| "epoch": 1.2648221343873518, |
| "grad_norm": 75.82025909423828, |
| "learning_rate": 4.854042847969921e-06, |
| "loss": 0.6461, |
| "mean_token_accuracy": 0.8113402366638184, |
| "num_tokens": 18186731.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 1.0956727743148804, |
| "epoch": 1.2676453980801807, |
| "grad_norm": 74.54117584228516, |
| "learning_rate": 4.8533988747184e-06, |
| "loss": 0.6425, |
| "mean_token_accuracy": 0.8132710099220276, |
| "num_tokens": 18227163.0, |
| "step": 2245 |
| }, |
| { |
| "entropy": 1.207816481590271, |
| "epoch": 1.2704686617730097, |
| "grad_norm": 69.57730865478516, |
| "learning_rate": 4.852753541755983e-06, |
| "loss": 0.6468, |
| "mean_token_accuracy": 0.8123814582824707, |
| "num_tokens": 18267863.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 1.2019951105117799, |
| "epoch": 1.2732919254658386, |
| "grad_norm": 77.39385986328125, |
| "learning_rate": 4.852106849590344e-06, |
| "loss": 0.6667, |
| "mean_token_accuracy": 0.8075017094612121, |
| "num_tokens": 18308454.0, |
| "step": 2255 |
| }, |
| { |
| "entropy": 1.2332281589508056, |
| "epoch": 1.2761151891586673, |
| "grad_norm": 82.1375503540039, |
| "learning_rate": 4.8514587987302295e-06, |
| "loss": 0.7035, |
| "mean_token_accuracy": 0.8001717686653137, |
| "num_tokens": 18349213.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 1.103496754169464, |
| "epoch": 1.2789384528514964, |
| "grad_norm": 77.35172271728516, |
| "learning_rate": 4.850809389685452e-06, |
| "loss": 0.6464, |
| "mean_token_accuracy": 0.8140517354011536, |
| "num_tokens": 18390006.0, |
| "step": 2265 |
| }, |
| { |
| "entropy": 1.0481646060943604, |
| "epoch": 1.2817617165443251, |
| "grad_norm": 74.0533447265625, |
| "learning_rate": 4.8501586229668955e-06, |
| "loss": 0.5638, |
| "mean_token_accuracy": 0.8321304798126221, |
| "num_tokens": 18430668.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 1.1348963975906372, |
| "epoch": 1.2845849802371543, |
| "grad_norm": 74.89289093017578, |
| "learning_rate": 4.849506499086509e-06, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.8209902882575989, |
| "num_tokens": 18471333.0, |
| "step": 2275 |
| }, |
| { |
| "entropy": 1.181236457824707, |
| "epoch": 1.287408243929983, |
| "grad_norm": 77.86975860595703, |
| "learning_rate": 4.848853018557311e-06, |
| "loss": 0.6914, |
| "mean_token_accuracy": 0.7989266395568848, |
| "num_tokens": 18511771.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 1.1591632604598998, |
| "epoch": 1.290231507622812, |
| "grad_norm": 91.24014282226562, |
| "learning_rate": 4.848198181893388e-06, |
| "loss": 0.64, |
| "mean_token_accuracy": 0.8149643659591674, |
| "num_tokens": 18552129.0, |
| "step": 2285 |
| }, |
| { |
| "entropy": 1.2816951274871826, |
| "epoch": 1.2930547713156408, |
| "grad_norm": 86.56483459472656, |
| "learning_rate": 4.847541989609891e-06, |
| "loss": 0.7635, |
| "mean_token_accuracy": 0.7828594326972962, |
| "num_tokens": 18592730.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 1.1080434799194336, |
| "epoch": 1.2958780350084698, |
| "grad_norm": 82.74638366699219, |
| "learning_rate": 4.846884442223038e-06, |
| "loss": 0.6774, |
| "mean_token_accuracy": 0.8058300733566284, |
| "num_tokens": 18633524.0, |
| "step": 2295 |
| }, |
| { |
| "entropy": 1.1663718938827514, |
| "epoch": 1.2987012987012987, |
| "grad_norm": 75.71199035644531, |
| "learning_rate": 4.8462255402501155e-06, |
| "loss": 0.7107, |
| "mean_token_accuracy": 0.7965960144996643, |
| "num_tokens": 18674190.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 1.2041790723800658, |
| "epoch": 1.3015245623941276, |
| "grad_norm": 80.01366424560547, |
| "learning_rate": 4.8455652842094735e-06, |
| "loss": 0.655, |
| "mean_token_accuracy": 0.8082357287406922, |
| "num_tokens": 18714763.0, |
| "step": 2305 |
| }, |
| { |
| "entropy": 1.097007966041565, |
| "epoch": 1.3043478260869565, |
| "grad_norm": 77.5677261352539, |
| "learning_rate": 4.8449036746205266e-06, |
| "loss": 0.657, |
| "mean_token_accuracy": 0.8100764632225037, |
| "num_tokens": 18755405.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 1.173832392692566, |
| "epoch": 1.3071710897797855, |
| "grad_norm": 79.28545379638672, |
| "learning_rate": 4.844240712003756e-06, |
| "loss": 0.6431, |
| "mean_token_accuracy": 0.8142745494842529, |
| "num_tokens": 18796177.0, |
| "step": 2315 |
| }, |
| { |
| "entropy": 1.1786026000976562, |
| "epoch": 1.3099943534726144, |
| "grad_norm": 85.30624389648438, |
| "learning_rate": 4.843576396880707e-06, |
| "loss": 0.6551, |
| "mean_token_accuracy": 0.8108649253845215, |
| "num_tokens": 18836972.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 1.0986263751983643, |
| "epoch": 1.3128176171654433, |
| "grad_norm": 84.06642150878906, |
| "learning_rate": 4.8429107297739875e-06, |
| "loss": 0.6553, |
| "mean_token_accuracy": 0.8121555089950562, |
| "num_tokens": 18877396.0, |
| "step": 2325 |
| }, |
| { |
| "entropy": 1.1171608209609984, |
| "epoch": 1.3156408808582722, |
| "grad_norm": 74.25096130371094, |
| "learning_rate": 4.84224371120727e-06, |
| "loss": 0.6276, |
| "mean_token_accuracy": 0.8181084632873535, |
| "num_tokens": 18918291.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 1.1926301956176757, |
| "epoch": 1.318464144551101, |
| "grad_norm": 68.66618347167969, |
| "learning_rate": 4.84157534170529e-06, |
| "loss": 0.6589, |
| "mean_token_accuracy": 0.8088850378990173, |
| "num_tokens": 18958467.0, |
| "step": 2335 |
| }, |
| { |
| "entropy": 1.2194635629653932, |
| "epoch": 1.32128740824393, |
| "grad_norm": 80.56359100341797, |
| "learning_rate": 4.8409056217938465e-06, |
| "loss": 0.6687, |
| "mean_token_accuracy": 0.8101451516151428, |
| "num_tokens": 18999140.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 1.1420178532600402, |
| "epoch": 1.3241106719367588, |
| "grad_norm": 89.24362182617188, |
| "learning_rate": 4.8402345519998e-06, |
| "loss": 0.657, |
| "mean_token_accuracy": 0.8115506768226624, |
| "num_tokens": 19039704.0, |
| "step": 2345 |
| }, |
| { |
| "entropy": 1.297519588470459, |
| "epoch": 1.3269339356295877, |
| "grad_norm": 69.71318054199219, |
| "learning_rate": 4.839562132851073e-06, |
| "loss": 0.7476, |
| "mean_token_accuracy": 0.788333511352539, |
| "num_tokens": 19080331.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 1.158388113975525, |
| "epoch": 1.3297571993224166, |
| "grad_norm": 93.63565826416016, |
| "learning_rate": 4.8388883648766495e-06, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.8003996729850769, |
| "num_tokens": 19120683.0, |
| "step": 2355 |
| }, |
| { |
| "entropy": 1.1665226459503173, |
| "epoch": 1.3325804630152456, |
| "grad_norm": 71.25414276123047, |
| "learning_rate": 4.838213248606575e-06, |
| "loss": 0.7153, |
| "mean_token_accuracy": 0.7993328690528869, |
| "num_tokens": 19161294.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 1.2286199092864991, |
| "epoch": 1.3354037267080745, |
| "grad_norm": 85.82389068603516, |
| "learning_rate": 4.837536784571955e-06, |
| "loss": 0.7382, |
| "mean_token_accuracy": 0.7923282504081726, |
| "num_tokens": 19201996.0, |
| "step": 2365 |
| }, |
| { |
| "entropy": 1.2367011785507203, |
| "epoch": 1.3382269904009034, |
| "grad_norm": 78.86309051513672, |
| "learning_rate": 4.836858973304957e-06, |
| "loss": 0.7283, |
| "mean_token_accuracy": 0.7932451248168946, |
| "num_tokens": 19242804.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 1.1243971109390258, |
| "epoch": 1.3410502540937324, |
| "grad_norm": 80.71333312988281, |
| "learning_rate": 4.836179815338805e-06, |
| "loss": 0.6538, |
| "mean_token_accuracy": 0.8101875901222229, |
| "num_tokens": 19283424.0, |
| "step": 2375 |
| }, |
| { |
| "entropy": 1.1276620268821715, |
| "epoch": 1.3438735177865613, |
| "grad_norm": 67.6820068359375, |
| "learning_rate": 4.835499311207788e-06, |
| "loss": 0.6764, |
| "mean_token_accuracy": 0.8059876799583435, |
| "num_tokens": 19324228.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 1.085622775554657, |
| "epoch": 1.3466967814793902, |
| "grad_norm": 66.39501953125, |
| "learning_rate": 4.8348174614472465e-06, |
| "loss": 0.6318, |
| "mean_token_accuracy": 0.8171636939048768, |
| "num_tokens": 19364593.0, |
| "step": 2385 |
| }, |
| { |
| "entropy": 1.1541183471679688, |
| "epoch": 1.3495200451722191, |
| "grad_norm": 92.84614562988281, |
| "learning_rate": 4.834134266593586e-06, |
| "loss": 0.6552, |
| "mean_token_accuracy": 0.8120688199996948, |
| "num_tokens": 19405413.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 1.199662709236145, |
| "epoch": 1.352343308865048, |
| "grad_norm": 80.65885162353516, |
| "learning_rate": 4.833449727184267e-06, |
| "loss": 0.7151, |
| "mean_token_accuracy": 0.7959373831748963, |
| "num_tokens": 19446043.0, |
| "step": 2395 |
| }, |
| { |
| "entropy": 1.153480863571167, |
| "epoch": 1.355166572557877, |
| "grad_norm": 79.00860595703125, |
| "learning_rate": 4.832763843757809e-06, |
| "loss": 0.6588, |
| "mean_token_accuracy": 0.8133233785629272, |
| "num_tokens": 19486895.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 1.0923040628433227, |
| "epoch": 1.357989836250706, |
| "grad_norm": 73.47067260742188, |
| "learning_rate": 4.832076616853788e-06, |
| "loss": 0.6212, |
| "mean_token_accuracy": 0.8210492372512818, |
| "num_tokens": 19527502.0, |
| "step": 2405 |
| }, |
| { |
| "entropy": 1.1808124303817749, |
| "epoch": 1.3608130999435346, |
| "grad_norm": 86.0345687866211, |
| "learning_rate": 4.831388047012836e-06, |
| "loss": 0.7206, |
| "mean_token_accuracy": 0.7948956370353699, |
| "num_tokens": 19568274.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 1.201215958595276, |
| "epoch": 1.3636363636363638, |
| "grad_norm": 82.31108856201172, |
| "learning_rate": 4.830698134776647e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.8000857710838318, |
| "num_tokens": 19608913.0, |
| "step": 2415 |
| }, |
| { |
| "entropy": 1.1461955547332763, |
| "epoch": 1.3664596273291925, |
| "grad_norm": 80.74882507324219, |
| "learning_rate": 4.830006880687961e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.8095049500465393, |
| "num_tokens": 19649662.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 1.2065489768981934, |
| "epoch": 1.3692828910220214, |
| "grad_norm": 85.66714477539062, |
| "learning_rate": 4.829314285290584e-06, |
| "loss": 0.7143, |
| "mean_token_accuracy": 0.7959868550300598, |
| "num_tokens": 19689993.0, |
| "step": 2425 |
| }, |
| { |
| "entropy": 1.148555874824524, |
| "epoch": 1.3721061547148503, |
| "grad_norm": 77.31533813476562, |
| "learning_rate": 4.8286203491293706e-06, |
| "loss": 0.6792, |
| "mean_token_accuracy": 0.805145263671875, |
| "num_tokens": 19730308.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 1.1263495683670044, |
| "epoch": 1.3749294184076792, |
| "grad_norm": 72.85306549072266, |
| "learning_rate": 4.827925072750232e-06, |
| "loss": 0.6206, |
| "mean_token_accuracy": 0.819097375869751, |
| "num_tokens": 19770873.0, |
| "step": 2435 |
| }, |
| { |
| "entropy": 1.1379968404769898, |
| "epoch": 1.3777526821005082, |
| "grad_norm": 74.16294860839844, |
| "learning_rate": 4.827228456700135e-06, |
| "loss": 0.672, |
| "mean_token_accuracy": 0.8076503992080688, |
| "num_tokens": 19811620.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 1.2445213794708252, |
| "epoch": 1.380575945793337, |
| "grad_norm": 96.99238586425781, |
| "learning_rate": 4.826530501527097e-06, |
| "loss": 0.7006, |
| "mean_token_accuracy": 0.8044170498847961, |
| "num_tokens": 19852266.0, |
| "step": 2445 |
| }, |
| { |
| "entropy": 1.3386611223220826, |
| "epoch": 1.383399209486166, |
| "grad_norm": 74.49800872802734, |
| "learning_rate": 4.825831207780193e-06, |
| "loss": 0.6981, |
| "mean_token_accuracy": 0.8017369747161865, |
| "num_tokens": 19892609.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 1.2711436033248902, |
| "epoch": 1.386222473178995, |
| "grad_norm": 90.28303527832031, |
| "learning_rate": 4.82513057600955e-06, |
| "loss": 0.6718, |
| "mean_token_accuracy": 0.8058541059494019, |
| "num_tokens": 19932916.0, |
| "step": 2455 |
| }, |
| { |
| "entropy": 1.1395226001739502, |
| "epoch": 1.3890457368718239, |
| "grad_norm": 81.18891143798828, |
| "learning_rate": 4.8244286067663435e-06, |
| "loss": 0.6496, |
| "mean_token_accuracy": 0.8136471390724183, |
| "num_tokens": 19973568.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 1.2806391954421996, |
| "epoch": 1.3918690005646528, |
| "grad_norm": 74.18885040283203, |
| "learning_rate": 4.823725300602807e-06, |
| "loss": 0.7242, |
| "mean_token_accuracy": 0.7983367204666137, |
| "num_tokens": 20014256.0, |
| "step": 2465 |
| }, |
| { |
| "entropy": 1.2575560808181763, |
| "epoch": 1.3946922642574817, |
| "grad_norm": 83.14079284667969, |
| "learning_rate": 4.823020658072222e-06, |
| "loss": 0.7266, |
| "mean_token_accuracy": 0.7935426712036133, |
| "num_tokens": 20054679.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 1.1245292901992798, |
| "epoch": 1.3975155279503104, |
| "grad_norm": 86.17759704589844, |
| "learning_rate": 4.8223146797289235e-06, |
| "loss": 0.6472, |
| "mean_token_accuracy": 0.8125198125839234, |
| "num_tokens": 20095540.0, |
| "step": 2475 |
| }, |
| { |
| "entropy": 1.2133588790893555, |
| "epoch": 1.4003387916431396, |
| "grad_norm": 67.61619567871094, |
| "learning_rate": 4.8216073661282945e-06, |
| "loss": 0.6501, |
| "mean_token_accuracy": 0.8147984504699707, |
| "num_tokens": 20136277.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 1.1730206727981567, |
| "epoch": 1.4031620553359683, |
| "grad_norm": 70.51668548583984, |
| "learning_rate": 4.820898717826772e-06, |
| "loss": 0.6244, |
| "mean_token_accuracy": 0.8181105494499207, |
| "num_tokens": 20177010.0, |
| "step": 2485 |
| }, |
| { |
| "entropy": 1.1870003461837768, |
| "epoch": 1.4059853190287974, |
| "grad_norm": 65.29812622070312, |
| "learning_rate": 4.82018873538184e-06, |
| "loss": 0.6537, |
| "mean_token_accuracy": 0.8110229969024658, |
| "num_tokens": 20217605.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 1.1333913803100586, |
| "epoch": 1.4088085827216261, |
| "grad_norm": 71.14463806152344, |
| "learning_rate": 4.819477419352034e-06, |
| "loss": 0.6183, |
| "mean_token_accuracy": 0.8201487541198731, |
| "num_tokens": 20258267.0, |
| "step": 2495 |
| }, |
| { |
| "entropy": 1.1184187650680542, |
| "epoch": 1.411631846414455, |
| "grad_norm": 68.24232482910156, |
| "learning_rate": 4.818764770296938e-06, |
| "loss": 0.6249, |
| "mean_token_accuracy": 0.8183603525161743, |
| "num_tokens": 20299023.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.411631846414455, |
| "eval_entropy": 1.3717174053192138, |
| "eval_loss": 0.6318865418434143, |
| "eval_mean_token_accuracy": 0.8233636379241943, |
| "eval_num_tokens": 20299023.0, |
| "eval_runtime": 2.457, |
| "eval_samples_per_second": 15.873, |
| "eval_steps_per_second": 2.035, |
| "step": 2500 |
| }, |
| { |
| "entropy": 1.2598673343658446, |
| "epoch": 1.414455110107284, |
| "grad_norm": 71.59434509277344, |
| "learning_rate": 4.8180507887771835e-06, |
| "loss": 0.6998, |
| "mean_token_accuracy": 0.8029447555541992, |
| "num_tokens": 20339637.0, |
| "step": 2505 |
| }, |
| { |
| "entropy": 1.2127498626708983, |
| "epoch": 1.417278373800113, |
| "grad_norm": 77.03498077392578, |
| "learning_rate": 4.8173354753544524e-06, |
| "loss": 0.6909, |
| "mean_token_accuracy": 0.804262387752533, |
| "num_tokens": 20380116.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 1.237819457054138, |
| "epoch": 1.4201016374929418, |
| "grad_norm": 76.66390991210938, |
| "learning_rate": 4.816618830591473e-06, |
| "loss": 0.689, |
| "mean_token_accuracy": 0.8024984955787658, |
| "num_tokens": 20420739.0, |
| "step": 2515 |
| }, |
| { |
| "entropy": 1.206669521331787, |
| "epoch": 1.4229249011857708, |
| "grad_norm": 82.84412384033203, |
| "learning_rate": 4.815900855052021e-06, |
| "loss": 0.684, |
| "mean_token_accuracy": 0.8014244318008423, |
| "num_tokens": 20461380.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 1.1622983455657958, |
| "epoch": 1.4257481648785997, |
| "grad_norm": 69.7125244140625, |
| "learning_rate": 4.8151815493009186e-06, |
| "loss": 0.7123, |
| "mean_token_accuracy": 0.7954689502716065, |
| "num_tokens": 20502242.0, |
| "step": 2525 |
| }, |
| { |
| "entropy": 1.2152176380157471, |
| "epoch": 1.4285714285714286, |
| "grad_norm": 71.72361755371094, |
| "learning_rate": 4.814460913904036e-06, |
| "loss": 0.689, |
| "mean_token_accuracy": 0.8025439500808715, |
| "num_tokens": 20543050.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 1.1802116870880126, |
| "epoch": 1.4313946922642575, |
| "grad_norm": 83.00234985351562, |
| "learning_rate": 4.813738949428289e-06, |
| "loss": 0.6551, |
| "mean_token_accuracy": 0.8088852882385253, |
| "num_tokens": 20583787.0, |
| "step": 2535 |
| }, |
| { |
| "entropy": 1.1717280864715576, |
| "epoch": 1.4342179559570865, |
| "grad_norm": 74.20233154296875, |
| "learning_rate": 4.8130156564416374e-06, |
| "loss": 0.6143, |
| "mean_token_accuracy": 0.8246188163757324, |
| "num_tokens": 20624442.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 1.1795841693878173, |
| "epoch": 1.4370412196499154, |
| "grad_norm": 89.12123107910156, |
| "learning_rate": 4.812291035513088e-06, |
| "loss": 0.6578, |
| "mean_token_accuracy": 0.8118916988372803, |
| "num_tokens": 20664910.0, |
| "step": 2545 |
| }, |
| { |
| "entropy": 1.3075191736221314, |
| "epoch": 1.439864483342744, |
| "grad_norm": 76.54231262207031, |
| "learning_rate": 4.811565087212691e-06, |
| "loss": 0.7455, |
| "mean_token_accuracy": 0.7889067530632019, |
| "num_tokens": 20705591.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 1.1278613567352296, |
| "epoch": 1.4426877470355732, |
| "grad_norm": 66.26628112792969, |
| "learning_rate": 4.810837812111541e-06, |
| "loss": 0.5953, |
| "mean_token_accuracy": 0.8274539947509766, |
| "num_tokens": 20746404.0, |
| "step": 2555 |
| }, |
| { |
| "entropy": 1.1617911815643311, |
| "epoch": 1.445511010728402, |
| "grad_norm": 82.94552612304688, |
| "learning_rate": 4.810109210781778e-06, |
| "loss": 0.6439, |
| "mean_token_accuracy": 0.8134812235832214, |
| "num_tokens": 20787015.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 1.323712158203125, |
| "epoch": 1.4483342744212309, |
| "grad_norm": 67.05265808105469, |
| "learning_rate": 4.809379283796582e-06, |
| "loss": 0.6898, |
| "mean_token_accuracy": 0.8058351755142212, |
| "num_tokens": 20827604.0, |
| "step": 2565 |
| }, |
| { |
| "entropy": 1.1775886774063111, |
| "epoch": 1.4511575381140598, |
| "grad_norm": 98.45421600341797, |
| "learning_rate": 4.80864803173018e-06, |
| "loss": 0.6564, |
| "mean_token_accuracy": 0.8137942314147949, |
| "num_tokens": 20868096.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 1.272672414779663, |
| "epoch": 1.4539808018068887, |
| "grad_norm": 84.572509765625, |
| "learning_rate": 4.807915455157839e-06, |
| "loss": 0.643, |
| "mean_token_accuracy": 0.8139849543571472, |
| "num_tokens": 20907891.0, |
| "step": 2575 |
| }, |
| { |
| "entropy": 1.1502018213272094, |
| "epoch": 1.4568040654997176, |
| "grad_norm": 85.15162658691406, |
| "learning_rate": 4.807181554655866e-06, |
| "loss": 0.6461, |
| "mean_token_accuracy": 0.8113275051116944, |
| "num_tokens": 20948609.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 1.240171766281128, |
| "epoch": 1.4596273291925466, |
| "grad_norm": 73.13277435302734, |
| "learning_rate": 4.8064463308016154e-06, |
| "loss": 0.7099, |
| "mean_token_accuracy": 0.7971065878868103, |
| "num_tokens": 20989030.0, |
| "step": 2585 |
| }, |
| { |
| "entropy": 1.210387110710144, |
| "epoch": 1.4624505928853755, |
| "grad_norm": 80.91059875488281, |
| "learning_rate": 4.805709784173477e-06, |
| "loss": 0.6892, |
| "mean_token_accuracy": 0.8043756723403931, |
| "num_tokens": 21029630.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 1.1126335978507995, |
| "epoch": 1.4652738565782044, |
| "grad_norm": 76.07735443115234, |
| "learning_rate": 4.804971915350882e-06, |
| "loss": 0.593, |
| "mean_token_accuracy": 0.8269088268280029, |
| "num_tokens": 21069981.0, |
| "step": 2595 |
| }, |
| { |
| "entropy": 1.153779721260071, |
| "epoch": 1.4680971202710333, |
| "grad_norm": 72.62789154052734, |
| "learning_rate": 4.804232724914306e-06, |
| "loss": 0.6317, |
| "mean_token_accuracy": 0.8180077075958252, |
| "num_tokens": 21110649.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 1.2609638452529908, |
| "epoch": 1.4709203839638623, |
| "grad_norm": 82.38737487792969, |
| "learning_rate": 4.803492213445259e-06, |
| "loss": 0.6631, |
| "mean_token_accuracy": 0.8072648644447327, |
| "num_tokens": 21151346.0, |
| "step": 2605 |
| }, |
| { |
| "entropy": 1.251114010810852, |
| "epoch": 1.4737436476566912, |
| "grad_norm": 83.15131378173828, |
| "learning_rate": 4.802750381526294e-06, |
| "loss": 0.6537, |
| "mean_token_accuracy": 0.810612428188324, |
| "num_tokens": 21191808.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 1.2838777303695679, |
| "epoch": 1.4765669113495201, |
| "grad_norm": 72.97293090820312, |
| "learning_rate": 4.802007229741001e-06, |
| "loss": 0.6986, |
| "mean_token_accuracy": 0.8005695700645447, |
| "num_tokens": 21232422.0, |
| "step": 2615 |
| }, |
| { |
| "entropy": 1.2007624864578248, |
| "epoch": 1.479390175042349, |
| "grad_norm": 76.61715698242188, |
| "learning_rate": 4.801262758674009e-06, |
| "loss": 0.6429, |
| "mean_token_accuracy": 0.8100405931472778, |
| "num_tokens": 21273249.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 1.20713312625885, |
| "epoch": 1.4822134387351777, |
| "grad_norm": 66.92652130126953, |
| "learning_rate": 4.800516968910984e-06, |
| "loss": 0.6362, |
| "mean_token_accuracy": 0.8153573513031006, |
| "num_tokens": 21313887.0, |
| "step": 2625 |
| }, |
| { |
| "entropy": 1.246811294555664, |
| "epoch": 1.485036702428007, |
| "grad_norm": 75.15606689453125, |
| "learning_rate": 4.79976986103863e-06, |
| "loss": 0.7113, |
| "mean_token_accuracy": 0.7945464491844177, |
| "num_tokens": 21354393.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 1.2210959911346435, |
| "epoch": 1.4878599661208356, |
| "grad_norm": 72.59530639648438, |
| "learning_rate": 4.799021435644687e-06, |
| "loss": 0.6313, |
| "mean_token_accuracy": 0.813930869102478, |
| "num_tokens": 21395084.0, |
| "step": 2635 |
| }, |
| { |
| "entropy": 1.3068056583404541, |
| "epoch": 1.4906832298136645, |
| "grad_norm": 80.09661102294922, |
| "learning_rate": 4.798271693317935e-06, |
| "loss": 0.6833, |
| "mean_token_accuracy": 0.8025308609008789, |
| "num_tokens": 21435779.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 1.1782183170318603, |
| "epoch": 1.4935064935064934, |
| "grad_norm": 67.90818786621094, |
| "learning_rate": 4.797520634648185e-06, |
| "loss": 0.656, |
| "mean_token_accuracy": 0.811971914768219, |
| "num_tokens": 21476448.0, |
| "step": 2645 |
| }, |
| { |
| "entropy": 1.2379778146743774, |
| "epoch": 1.4963297571993224, |
| "grad_norm": 82.61946868896484, |
| "learning_rate": 4.7967682602262866e-06, |
| "loss": 0.7022, |
| "mean_token_accuracy": 0.8024298191070557, |
| "num_tokens": 21517170.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 1.2261282444000243, |
| "epoch": 1.4991530208921513, |
| "grad_norm": 82.52174377441406, |
| "learning_rate": 4.796014570644123e-06, |
| "loss": 0.6638, |
| "mean_token_accuracy": 0.8089081048965454, |
| "num_tokens": 21558007.0, |
| "step": 2655 |
| }, |
| { |
| "entropy": 1.1248329162597657, |
| "epoch": 1.5019762845849802, |
| "grad_norm": 70.8136978149414, |
| "learning_rate": 4.795259566494615e-06, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.8185378551483155, |
| "num_tokens": 21598768.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 1.1765445470809937, |
| "epoch": 1.5047995482778092, |
| "grad_norm": 85.58080291748047, |
| "learning_rate": 4.794503248371715e-06, |
| "loss": 0.6518, |
| "mean_token_accuracy": 0.8130599617958069, |
| "num_tokens": 21639426.0, |
| "step": 2665 |
| }, |
| { |
| "entropy": 1.2815346717834473, |
| "epoch": 1.507622811970638, |
| "grad_norm": 92.36046600341797, |
| "learning_rate": 4.7937456168704075e-06, |
| "loss": 0.7405, |
| "mean_token_accuracy": 0.7926273345947266, |
| "num_tokens": 21680145.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 1.3271814823150634, |
| "epoch": 1.510446075663467, |
| "grad_norm": 63.72608184814453, |
| "learning_rate": 4.792986672586715e-06, |
| "loss": 0.6925, |
| "mean_token_accuracy": 0.8034780144691467, |
| "num_tokens": 21720157.0, |
| "step": 2675 |
| }, |
| { |
| "entropy": 1.1895153999328614, |
| "epoch": 1.513269339356296, |
| "grad_norm": 66.45240783691406, |
| "learning_rate": 4.7922264161176865e-06, |
| "loss": 0.6386, |
| "mean_token_accuracy": 0.8131694197654724, |
| "num_tokens": 21760810.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 1.2537084341049194, |
| "epoch": 1.5160926030491249, |
| "grad_norm": 99.64230346679688, |
| "learning_rate": 4.79146484806141e-06, |
| "loss": 0.6932, |
| "mean_token_accuracy": 0.8014739036560059, |
| "num_tokens": 21801436.0, |
| "step": 2685 |
| }, |
| { |
| "entropy": 1.243509340286255, |
| "epoch": 1.5189158667419536, |
| "grad_norm": 76.32308197021484, |
| "learning_rate": 4.7907019690169995e-06, |
| "loss": 0.6595, |
| "mean_token_accuracy": 0.8102675080299377, |
| "num_tokens": 21842068.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 1.1510627508163451, |
| "epoch": 1.5217391304347827, |
| "grad_norm": 65.6766128540039, |
| "learning_rate": 4.789937779584606e-06, |
| "loss": 0.5932, |
| "mean_token_accuracy": 0.8254171371459961, |
| "num_tokens": 21882781.0, |
| "step": 2695 |
| }, |
| { |
| "entropy": 1.3232401371002198, |
| "epoch": 1.5245623941276114, |
| "grad_norm": 82.84342956542969, |
| "learning_rate": 4.789172280365405e-06, |
| "loss": 0.7381, |
| "mean_token_accuracy": 0.7899990320205689, |
| "num_tokens": 21923429.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 1.1786120891571046, |
| "epoch": 1.5273856578204406, |
| "grad_norm": 75.24930572509766, |
| "learning_rate": 4.788405471961607e-06, |
| "loss": 0.6657, |
| "mean_token_accuracy": 0.8080781698226929, |
| "num_tokens": 21964070.0, |
| "step": 2705 |
| }, |
| { |
| "entropy": 1.3117302417755128, |
| "epoch": 1.5302089215132693, |
| "grad_norm": 82.70320129394531, |
| "learning_rate": 4.787637354976451e-06, |
| "loss": 0.7032, |
| "mean_token_accuracy": 0.7976889133453369, |
| "num_tokens": 22004553.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 1.159567379951477, |
| "epoch": 1.5330321852060984, |
| "grad_norm": 77.04248809814453, |
| "learning_rate": 4.7868679300142075e-06, |
| "loss": 0.6286, |
| "mean_token_accuracy": 0.8175629734992981, |
| "num_tokens": 22045280.0, |
| "step": 2715 |
| }, |
| { |
| "entropy": 1.240707230567932, |
| "epoch": 1.5358554488989271, |
| "grad_norm": 63.95314025878906, |
| "learning_rate": 4.7860971976801705e-06, |
| "loss": 0.6701, |
| "mean_token_accuracy": 0.8070308327674866, |
| "num_tokens": 22085964.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 1.1840417623519897, |
| "epoch": 1.538678712591756, |
| "grad_norm": 92.01813507080078, |
| "learning_rate": 4.785325158580667e-06, |
| "loss": 0.6363, |
| "mean_token_accuracy": 0.8143904328346252, |
| "num_tokens": 22126478.0, |
| "step": 2725 |
| }, |
| { |
| "entropy": 1.1806485176086425, |
| "epoch": 1.541501976284585, |
| "grad_norm": 78.07498931884766, |
| "learning_rate": 4.784551813323053e-06, |
| "loss": 0.623, |
| "mean_token_accuracy": 0.8173678278923034, |
| "num_tokens": 22167400.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 1.2515090465545655, |
| "epoch": 1.5443252399774139, |
| "grad_norm": 81.79811096191406, |
| "learning_rate": 4.783777162515708e-06, |
| "loss": 0.6986, |
| "mean_token_accuracy": 0.7993507623672486, |
| "num_tokens": 22208168.0, |
| "step": 2735 |
| }, |
| { |
| "entropy": 1.2478914260864258, |
| "epoch": 1.5471485036702428, |
| "grad_norm": 78.49143981933594, |
| "learning_rate": 4.783001206768042e-06, |
| "loss": 0.6894, |
| "mean_token_accuracy": 0.8029946327209473, |
| "num_tokens": 22248225.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 1.2676709175109864, |
| "epoch": 1.5499717673630717, |
| "grad_norm": 79.54261016845703, |
| "learning_rate": 4.7822239466904885e-06, |
| "loss": 0.6677, |
| "mean_token_accuracy": 0.8095703125, |
| "num_tokens": 22288552.0, |
| "step": 2745 |
| }, |
| { |
| "entropy": 1.2544928550720216, |
| "epoch": 1.5527950310559007, |
| "grad_norm": 80.72503662109375, |
| "learning_rate": 4.781445382894511e-06, |
| "loss": 0.6714, |
| "mean_token_accuracy": 0.8068174123764038, |
| "num_tokens": 22329056.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 1.1922768592834472, |
| "epoch": 1.5556182947487294, |
| "grad_norm": 70.30680084228516, |
| "learning_rate": 4.780665515992594e-06, |
| "loss": 0.6414, |
| "mean_token_accuracy": 0.8172025442123413, |
| "num_tokens": 22369641.0, |
| "step": 2755 |
| }, |
| { |
| "entropy": 1.1814116358757019, |
| "epoch": 1.5584415584415585, |
| "grad_norm": 58.10313415527344, |
| "learning_rate": 4.779884346598251e-06, |
| "loss": 0.619, |
| "mean_token_accuracy": 0.8186899900436402, |
| "num_tokens": 22410436.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 1.187838339805603, |
| "epoch": 1.5612648221343872, |
| "grad_norm": 80.88594818115234, |
| "learning_rate": 4.7791018753260186e-06, |
| "loss": 0.68, |
| "mean_token_accuracy": 0.8037778973579407, |
| "num_tokens": 22451071.0, |
| "step": 2765 |
| }, |
| { |
| "entropy": 1.1621285438537599, |
| "epoch": 1.5640880858272164, |
| "grad_norm": 74.58294677734375, |
| "learning_rate": 4.778318102791458e-06, |
| "loss": 0.663, |
| "mean_token_accuracy": 0.8116282105445862, |
| "num_tokens": 22491576.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 1.262639617919922, |
| "epoch": 1.566911349520045, |
| "grad_norm": 85.41896057128906, |
| "learning_rate": 4.777533029611152e-06, |
| "loss": 0.692, |
| "mean_token_accuracy": 0.8043246865272522, |
| "num_tokens": 22532330.0, |
| "step": 2775 |
| }, |
| { |
| "entropy": 1.1400727033615112, |
| "epoch": 1.5697346132128742, |
| "grad_norm": 73.55253601074219, |
| "learning_rate": 4.77674665640271e-06, |
| "loss": 0.6136, |
| "mean_token_accuracy": 0.8233813762664794, |
| "num_tokens": 22573071.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 1.1831058979034423, |
| "epoch": 1.572557876905703, |
| "grad_norm": 71.34568786621094, |
| "learning_rate": 4.775958983784762e-06, |
| "loss": 0.6038, |
| "mean_token_accuracy": 0.8231364607810974, |
| "num_tokens": 22613586.0, |
| "step": 2785 |
| }, |
| { |
| "entropy": 1.2381763458251953, |
| "epoch": 1.5753811405985318, |
| "grad_norm": 89.24989318847656, |
| "learning_rate": 4.7751700123769615e-06, |
| "loss": 0.7051, |
| "mean_token_accuracy": 0.7986539959907532, |
| "num_tokens": 22654078.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 1.2196001529693603, |
| "epoch": 1.5782044042913608, |
| "grad_norm": 75.8842544555664, |
| "learning_rate": 4.774379742799982e-06, |
| "loss": 0.6416, |
| "mean_token_accuracy": 0.8148429989814758, |
| "num_tokens": 22694488.0, |
| "step": 2795 |
| }, |
| { |
| "entropy": 1.1762643814086915, |
| "epoch": 1.5810276679841897, |
| "grad_norm": 70.9981460571289, |
| "learning_rate": 4.773588175675519e-06, |
| "loss": 0.6995, |
| "mean_token_accuracy": 0.801743483543396, |
| "num_tokens": 22735246.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 1.3188316106796265, |
| "epoch": 1.5838509316770186, |
| "grad_norm": 77.98265075683594, |
| "learning_rate": 4.77279531162629e-06, |
| "loss": 0.7117, |
| "mean_token_accuracy": 0.7977353572845459, |
| "num_tokens": 22776009.0, |
| "step": 2805 |
| }, |
| { |
| "entropy": 1.1862847805023193, |
| "epoch": 1.5866741953698476, |
| "grad_norm": 73.0948486328125, |
| "learning_rate": 4.772001151276031e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8189311504364014, |
| "num_tokens": 22816752.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 1.2885456085205078, |
| "epoch": 1.5894974590626765, |
| "grad_norm": 96.79470825195312, |
| "learning_rate": 4.771205695249498e-06, |
| "loss": 0.6912, |
| "mean_token_accuracy": 0.804690134525299, |
| "num_tokens": 22857463.0, |
| "step": 2815 |
| }, |
| { |
| "entropy": 1.2456789016723633, |
| "epoch": 1.5923207227555054, |
| "grad_norm": 98.59661865234375, |
| "learning_rate": 4.770408944172468e-06, |
| "loss": 0.7048, |
| "mean_token_accuracy": 0.7998390793800354, |
| "num_tokens": 22898192.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 1.263387417793274, |
| "epoch": 1.5951439864483343, |
| "grad_norm": 86.11984252929688, |
| "learning_rate": 4.769610898671735e-06, |
| "loss": 0.6847, |
| "mean_token_accuracy": 0.8015385270118713, |
| "num_tokens": 22938960.0, |
| "step": 2825 |
| }, |
| { |
| "entropy": 1.1750084161758423, |
| "epoch": 1.597967250141163, |
| "grad_norm": 68.83621215820312, |
| "learning_rate": 4.768811559375112e-06, |
| "loss": 0.644, |
| "mean_token_accuracy": 0.8148982763290405, |
| "num_tokens": 22979853.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 1.2434749603271484, |
| "epoch": 1.6007905138339922, |
| "grad_norm": 63.998146057128906, |
| "learning_rate": 4.76801092691143e-06, |
| "loss": 0.6424, |
| "mean_token_accuracy": 0.8153539896011353, |
| "num_tokens": 23020582.0, |
| "step": 2835 |
| }, |
| { |
| "entropy": 1.180994987487793, |
| "epoch": 1.6036137775268209, |
| "grad_norm": 75.83289337158203, |
| "learning_rate": 4.7672090019105365e-06, |
| "loss": 0.6582, |
| "mean_token_accuracy": 0.8138703823089599, |
| "num_tokens": 23061192.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 1.41214861869812, |
| "epoch": 1.60643704121965, |
| "grad_norm": 80.3683853149414, |
| "learning_rate": 4.7664057850032974e-06, |
| "loss": 0.7149, |
| "mean_token_accuracy": 0.7960960865020752, |
| "num_tokens": 23101697.0, |
| "step": 2845 |
| }, |
| { |
| "entropy": 1.2189382076263429, |
| "epoch": 1.6092603049124787, |
| "grad_norm": 85.0681381225586, |
| "learning_rate": 4.765601276821593e-06, |
| "loss": 0.6258, |
| "mean_token_accuracy": 0.8162513375282288, |
| "num_tokens": 23142380.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 1.2289648056030273, |
| "epoch": 1.6120835686053079, |
| "grad_norm": 72.83939361572266, |
| "learning_rate": 4.76479547799832e-06, |
| "loss": 0.6472, |
| "mean_token_accuracy": 0.8116074323654174, |
| "num_tokens": 23183087.0, |
| "step": 2855 |
| }, |
| { |
| "entropy": 1.1334474802017211, |
| "epoch": 1.6149068322981366, |
| "grad_norm": 76.32488250732422, |
| "learning_rate": 4.763988389167392e-06, |
| "loss": 0.6349, |
| "mean_token_accuracy": 0.8176516652107239, |
| "num_tokens": 23223891.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 1.2608877182006837, |
| "epoch": 1.6177300959909655, |
| "grad_norm": 80.27566528320312, |
| "learning_rate": 4.763180010963735e-06, |
| "loss": 0.6582, |
| "mean_token_accuracy": 0.8102393507957458, |
| "num_tokens": 23264505.0, |
| "step": 2865 |
| }, |
| { |
| "entropy": 1.3712619066238403, |
| "epoch": 1.6205533596837944, |
| "grad_norm": 75.8729476928711, |
| "learning_rate": 4.762370344023291e-06, |
| "loss": 0.7595, |
| "mean_token_accuracy": 0.7833655118942261, |
| "num_tokens": 23304834.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 1.1733553886413575, |
| "epoch": 1.6233766233766234, |
| "grad_norm": 80.17864990234375, |
| "learning_rate": 4.761559388983017e-06, |
| "loss": 0.6424, |
| "mean_token_accuracy": 0.8149887919425964, |
| "num_tokens": 23344930.0, |
| "step": 2875 |
| }, |
| { |
| "entropy": 1.2769611835479737, |
| "epoch": 1.6261998870694523, |
| "grad_norm": 69.23902130126953, |
| "learning_rate": 4.760747146480879e-06, |
| "loss": 0.6904, |
| "mean_token_accuracy": 0.8037740349769592, |
| "num_tokens": 23385656.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 1.2944100141525268, |
| "epoch": 1.6290231507622812, |
| "grad_norm": 74.3133544921875, |
| "learning_rate": 4.75993361715586e-06, |
| "loss": 0.7092, |
| "mean_token_accuracy": 0.7989227890968322, |
| "num_tokens": 23425661.0, |
| "step": 2885 |
| }, |
| { |
| "entropy": 1.1579317331314087, |
| "epoch": 1.6318464144551101, |
| "grad_norm": 60.908485412597656, |
| "learning_rate": 4.759118801647955e-06, |
| "loss": 0.5889, |
| "mean_token_accuracy": 0.8264386534690857, |
| "num_tokens": 23466543.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 1.2280898094177246, |
| "epoch": 1.634669678147939, |
| "grad_norm": 78.14270782470703, |
| "learning_rate": 4.758302700598166e-06, |
| "loss": 0.6932, |
| "mean_token_accuracy": 0.802795660495758, |
| "num_tokens": 23506666.0, |
| "step": 2895 |
| }, |
| { |
| "entropy": 1.2708383798599243, |
| "epoch": 1.637492941840768, |
| "grad_norm": 63.11326217651367, |
| "learning_rate": 4.757485314648514e-06, |
| "loss": 0.6657, |
| "mean_token_accuracy": 0.8090845704078674, |
| "num_tokens": 23547306.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 1.1924805402755738, |
| "epoch": 1.6403162055335967, |
| "grad_norm": 86.47279357910156, |
| "learning_rate": 4.756666644442024e-06, |
| "loss": 0.6424, |
| "mean_token_accuracy": 0.8108976602554321, |
| "num_tokens": 23588034.0, |
| "step": 2905 |
| }, |
| { |
| "entropy": 1.230809736251831, |
| "epoch": 1.6431394692264258, |
| "grad_norm": 73.81243896484375, |
| "learning_rate": 4.755846690622736e-06, |
| "loss": 0.6891, |
| "mean_token_accuracy": 0.8049345135688781, |
| "num_tokens": 23628731.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 1.129303252696991, |
| "epoch": 1.6459627329192545, |
| "grad_norm": 79.32695770263672, |
| "learning_rate": 4.755025453835698e-06, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8155039429664612, |
| "num_tokens": 23669377.0, |
| "step": 2915 |
| }, |
| { |
| "entropy": 1.1930960178375245, |
| "epoch": 1.6487859966120837, |
| "grad_norm": 79.17987823486328, |
| "learning_rate": 4.754202934726965e-06, |
| "loss": 0.6437, |
| "mean_token_accuracy": 0.8137888431549072, |
| "num_tokens": 23710215.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 1.1872772932052613, |
| "epoch": 1.6516092603049124, |
| "grad_norm": 73.08032989501953, |
| "learning_rate": 4.753379133943606e-06, |
| "loss": 0.6098, |
| "mean_token_accuracy": 0.8207122087478638, |
| "num_tokens": 23750886.0, |
| "step": 2925 |
| }, |
| { |
| "entropy": 1.2139918804168701, |
| "epoch": 1.6544325239977415, |
| "grad_norm": 85.57366180419922, |
| "learning_rate": 4.752554052133693e-06, |
| "loss": 0.6708, |
| "mean_token_accuracy": 0.8065659999847412, |
| "num_tokens": 23791517.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 1.2267068386077882, |
| "epoch": 1.6572557876905702, |
| "grad_norm": 82.90193176269531, |
| "learning_rate": 4.751727689946309e-06, |
| "loss": 0.6589, |
| "mean_token_accuracy": 0.8087624788284302, |
| "num_tokens": 23832183.0, |
| "step": 2935 |
| }, |
| { |
| "entropy": 1.1001429557800293, |
| "epoch": 1.6600790513833992, |
| "grad_norm": 74.51642608642578, |
| "learning_rate": 4.750900048031543e-06, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.8216747164726257, |
| "num_tokens": 23873086.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 1.2163544416427612, |
| "epoch": 1.662902315076228, |
| "grad_norm": 70.33753967285156, |
| "learning_rate": 4.750071127040493e-06, |
| "loss": 0.644, |
| "mean_token_accuracy": 0.8124571919441224, |
| "num_tokens": 23913676.0, |
| "step": 2945 |
| }, |
| { |
| "entropy": 1.2160048484802246, |
| "epoch": 1.665725578769057, |
| "grad_norm": 63.329654693603516, |
| "learning_rate": 4.749240927625258e-06, |
| "loss": 0.6378, |
| "mean_token_accuracy": 0.8146205306053161, |
| "num_tokens": 23954446.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 1.3222097873687744, |
| "epoch": 1.668548842461886, |
| "grad_norm": 78.38276672363281, |
| "learning_rate": 4.748409450438948e-06, |
| "loss": 0.6993, |
| "mean_token_accuracy": 0.8010660886764527, |
| "num_tokens": 23994980.0, |
| "step": 2955 |
| }, |
| { |
| "entropy": 1.3176050662994385, |
| "epoch": 1.6713721061547149, |
| "grad_norm": 73.32920837402344, |
| "learning_rate": 4.747576696135676e-06, |
| "loss": 0.7089, |
| "mean_token_accuracy": 0.7956379055976868, |
| "num_tokens": 24035743.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 1.1523848533630372, |
| "epoch": 1.6741953698475438, |
| "grad_norm": 74.85565185546875, |
| "learning_rate": 4.746742665370561e-06, |
| "loss": 0.6256, |
| "mean_token_accuracy": 0.8195020914077759, |
| "num_tokens": 24076284.0, |
| "step": 2965 |
| }, |
| { |
| "entropy": 1.1982210397720336, |
| "epoch": 1.6770186335403725, |
| "grad_norm": 66.0778579711914, |
| "learning_rate": 4.7459073587997215e-06, |
| "loss": 0.6389, |
| "mean_token_accuracy": 0.8123613357543945, |
| "num_tokens": 24117006.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 1.3042704105377196, |
| "epoch": 1.6798418972332017, |
| "grad_norm": 73.61102294921875, |
| "learning_rate": 4.745070777080288e-06, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.7976557850837708, |
| "num_tokens": 24157561.0, |
| "step": 2975 |
| }, |
| { |
| "entropy": 1.222891139984131, |
| "epoch": 1.6826651609260304, |
| "grad_norm": 72.3819580078125, |
| "learning_rate": 4.744232920870387e-06, |
| "loss": 0.6421, |
| "mean_token_accuracy": 0.8140509843826294, |
| "num_tokens": 24198058.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 1.3084670066833497, |
| "epoch": 1.6854884246188595, |
| "grad_norm": 114.97191619873047, |
| "learning_rate": 4.743393790829149e-06, |
| "loss": 0.676, |
| "mean_token_accuracy": 0.8067424893379211, |
| "num_tokens": 24238634.0, |
| "step": 2985 |
| }, |
| { |
| "entropy": 1.318178415298462, |
| "epoch": 1.6883116883116882, |
| "grad_norm": 75.34236907958984, |
| "learning_rate": 4.742553387616709e-06, |
| "loss": 0.7423, |
| "mean_token_accuracy": 0.7923501610755921, |
| "num_tokens": 24279411.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 1.2532196283340453, |
| "epoch": 1.6911349520045174, |
| "grad_norm": 86.58775329589844, |
| "learning_rate": 4.741711711894203e-06, |
| "loss": 0.7164, |
| "mean_token_accuracy": 0.7993324875831604, |
| "num_tokens": 24319998.0, |
| "step": 2995 |
| }, |
| { |
| "entropy": 1.1841705083847045, |
| "epoch": 1.693958215697346, |
| "grad_norm": 86.2491683959961, |
| "learning_rate": 4.740868764323765e-06, |
| "loss": 0.6665, |
| "mean_token_accuracy": 0.8101414680480957, |
| "num_tokens": 24360590.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.693958215697346, |
| "eval_entropy": 1.3606500387191773, |
| "eval_loss": 0.5422791838645935, |
| "eval_mean_token_accuracy": 0.8444921255111695, |
| "eval_num_tokens": 24360590.0, |
| "eval_runtime": 2.4503, |
| "eval_samples_per_second": 15.916, |
| "eval_steps_per_second": 2.041, |
| "step": 3000 |
| }, |
| { |
| "entropy": 1.3304717063903808, |
| "epoch": 1.6967814793901752, |
| "grad_norm": 78.38925170898438, |
| "learning_rate": 4.740024545568535e-06, |
| "loss": 0.726, |
| "mean_token_accuracy": 0.7918658256530762, |
| "num_tokens": 24401025.0, |
| "step": 3005 |
| }, |
| { |
| "entropy": 1.2585272312164306, |
| "epoch": 1.699604743083004, |
| "grad_norm": 69.97944641113281, |
| "learning_rate": 4.739179056292647e-06, |
| "loss": 0.6643, |
| "mean_token_accuracy": 0.8107133388519288, |
| "num_tokens": 24441667.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 1.2609277486801147, |
| "epoch": 1.7024280067758328, |
| "grad_norm": 77.39826202392578, |
| "learning_rate": 4.738332297161239e-06, |
| "loss": 0.6728, |
| "mean_token_accuracy": 0.8045721054077148, |
| "num_tokens": 24482605.0, |
| "step": 3015 |
| }, |
| { |
| "entropy": 1.425962710380554, |
| "epoch": 1.7052512704686618, |
| "grad_norm": 83.03507995605469, |
| "learning_rate": 4.737484268840446e-06, |
| "loss": 0.7411, |
| "mean_token_accuracy": 0.7898955702781677, |
| "num_tokens": 24523306.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 1.1658933877944946, |
| "epoch": 1.7080745341614907, |
| "grad_norm": 67.21765899658203, |
| "learning_rate": 4.736634971997401e-06, |
| "loss": 0.6395, |
| "mean_token_accuracy": 0.8146594166755676, |
| "num_tokens": 24563887.0, |
| "step": 3025 |
| }, |
| { |
| "entropy": 1.106322956085205, |
| "epoch": 1.7108977978543196, |
| "grad_norm": 86.37903594970703, |
| "learning_rate": 4.735784407300238e-06, |
| "loss": 0.6227, |
| "mean_token_accuracy": 0.8204713344573975, |
| "num_tokens": 24604633.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 1.141917634010315, |
| "epoch": 1.7137210615471485, |
| "grad_norm": 85.14436340332031, |
| "learning_rate": 4.734932575418084e-06, |
| "loss": 0.6376, |
| "mean_token_accuracy": 0.8149391174316406, |
| "num_tokens": 24645098.0, |
| "step": 3035 |
| }, |
| { |
| "entropy": 1.1664824724197387, |
| "epoch": 1.7165443252399775, |
| "grad_norm": 61.968544006347656, |
| "learning_rate": 4.734079477021065e-06, |
| "loss": 0.6516, |
| "mean_token_accuracy": 0.8103834867477417, |
| "num_tokens": 24685871.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 1.171910786628723, |
| "epoch": 1.7193675889328062, |
| "grad_norm": 75.73184967041016, |
| "learning_rate": 4.733225112780305e-06, |
| "loss": 0.6435, |
| "mean_token_accuracy": 0.8168030619621277, |
| "num_tokens": 24726477.0, |
| "step": 3045 |
| }, |
| { |
| "entropy": 1.3494631767272949, |
| "epoch": 1.7221908526256353, |
| "grad_norm": 89.85262298583984, |
| "learning_rate": 4.73236948336792e-06, |
| "loss": 0.6898, |
| "mean_token_accuracy": 0.8040600776672363, |
| "num_tokens": 24767322.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 1.3355523347854614, |
| "epoch": 1.725014116318464, |
| "grad_norm": 76.07821655273438, |
| "learning_rate": 4.731512589457026e-06, |
| "loss": 0.7062, |
| "mean_token_accuracy": 0.801560926437378, |
| "num_tokens": 24807997.0, |
| "step": 3055 |
| }, |
| { |
| "entropy": 1.3271613121032715, |
| "epoch": 1.7278373800112932, |
| "grad_norm": 69.96381378173828, |
| "learning_rate": 4.7306544317217295e-06, |
| "loss": 0.6768, |
| "mean_token_accuracy": 0.8040451407432556, |
| "num_tokens": 24848529.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 1.173279881477356, |
| "epoch": 1.7306606437041219, |
| "grad_norm": 67.66761779785156, |
| "learning_rate": 4.729795010837134e-06, |
| "loss": 0.6449, |
| "mean_token_accuracy": 0.8127846121788025, |
| "num_tokens": 24889192.0, |
| "step": 3065 |
| }, |
| { |
| "entropy": 1.2362797021865846, |
| "epoch": 1.733483907396951, |
| "grad_norm": 71.30282592773438, |
| "learning_rate": 4.728934327479335e-06, |
| "loss": 0.6755, |
| "mean_token_accuracy": 0.8028325915336609, |
| "num_tokens": 24929839.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 1.2514007568359375, |
| "epoch": 1.7363071710897797, |
| "grad_norm": 62.189842224121094, |
| "learning_rate": 4.728072382325423e-06, |
| "loss": 0.6584, |
| "mean_token_accuracy": 0.8121908187866211, |
| "num_tokens": 24970329.0, |
| "step": 3075 |
| }, |
| { |
| "entropy": 1.3371544361114502, |
| "epoch": 1.7391304347826086, |
| "grad_norm": 77.2507553100586, |
| "learning_rate": 4.727209176053478e-06, |
| "loss": 0.7225, |
| "mean_token_accuracy": 0.7927798509597779, |
| "num_tokens": 25011018.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 1.2675202369689942, |
| "epoch": 1.7419536984754376, |
| "grad_norm": 79.65899658203125, |
| "learning_rate": 4.726344709342576e-06, |
| "loss": 0.6745, |
| "mean_token_accuracy": 0.8066824793815612, |
| "num_tokens": 25051589.0, |
| "step": 3085 |
| }, |
| { |
| "entropy": 1.3616349935531615, |
| "epoch": 1.7447769621682665, |
| "grad_norm": 76.38388061523438, |
| "learning_rate": 4.725478982872782e-06, |
| "loss": 0.6926, |
| "mean_token_accuracy": 0.8023685574531555, |
| "num_tokens": 25092165.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 1.2535524129867555, |
| "epoch": 1.7476002258610954, |
| "grad_norm": 85.11703491210938, |
| "learning_rate": 4.724611997325153e-06, |
| "loss": 0.7213, |
| "mean_token_accuracy": 0.7959471702575683, |
| "num_tokens": 25132923.0, |
| "step": 3095 |
| }, |
| { |
| "entropy": 1.2510310173034669, |
| "epoch": 1.7504234895539243, |
| "grad_norm": 78.4201889038086, |
| "learning_rate": 4.723743753381736e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.799690055847168, |
| "num_tokens": 25173566.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 1.2111499547958373, |
| "epoch": 1.7532467532467533, |
| "grad_norm": 68.65093994140625, |
| "learning_rate": 4.7228742517255684e-06, |
| "loss": 0.6093, |
| "mean_token_accuracy": 0.8221887230873108, |
| "num_tokens": 25214439.0, |
| "step": 3105 |
| }, |
| { |
| "entropy": 1.2008164405822754, |
| "epoch": 1.7560700169395822, |
| "grad_norm": 69.03833770751953, |
| "learning_rate": 4.722003493040676e-06, |
| "loss": 0.6276, |
| "mean_token_accuracy": 0.8147233247756958, |
| "num_tokens": 25254743.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 1.2858773469924927, |
| "epoch": 1.7588932806324111, |
| "grad_norm": 83.34830474853516, |
| "learning_rate": 4.721131478012076e-06, |
| "loss": 0.7253, |
| "mean_token_accuracy": 0.7950618982315063, |
| "num_tokens": 25295419.0, |
| "step": 3115 |
| }, |
| { |
| "entropy": 1.3381348848342896, |
| "epoch": 1.7617165443252398, |
| "grad_norm": 78.60142517089844, |
| "learning_rate": 4.720258207325771e-06, |
| "loss": 0.6849, |
| "mean_token_accuracy": 0.8035218834877014, |
| "num_tokens": 25336300.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 1.1491690158843995, |
| "epoch": 1.764539808018069, |
| "grad_norm": 72.92134094238281, |
| "learning_rate": 4.7193836816687525e-06, |
| "loss": 0.6543, |
| "mean_token_accuracy": 0.8121623754501343, |
| "num_tokens": 25376953.0, |
| "step": 3125 |
| }, |
| { |
| "entropy": 1.2614521503448486, |
| "epoch": 1.7673630717108977, |
| "grad_norm": 82.32136535644531, |
| "learning_rate": 4.718507901729001e-06, |
| "loss": 0.675, |
| "mean_token_accuracy": 0.8051144242286682, |
| "num_tokens": 25417751.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 1.2652166366577149, |
| "epoch": 1.7701863354037268, |
| "grad_norm": 61.305721282958984, |
| "learning_rate": 4.717630868195481e-06, |
| "loss": 0.6639, |
| "mean_token_accuracy": 0.8103567004203797, |
| "num_tokens": 25458579.0, |
| "step": 3135 |
| }, |
| { |
| "entropy": 1.2356239557266235, |
| "epoch": 1.7730095990965555, |
| "grad_norm": 79.6635513305664, |
| "learning_rate": 4.716752581758144e-06, |
| "loss": 0.6785, |
| "mean_token_accuracy": 0.8023215413093567, |
| "num_tokens": 25499394.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 1.2099516153335572, |
| "epoch": 1.7758328627893847, |
| "grad_norm": 81.50817108154297, |
| "learning_rate": 4.715873043107928e-06, |
| "loss": 0.6561, |
| "mean_token_accuracy": 0.8066185235977172, |
| "num_tokens": 25539969.0, |
| "step": 3145 |
| }, |
| { |
| "entropy": 1.218087124824524, |
| "epoch": 1.7786561264822134, |
| "grad_norm": 89.5859146118164, |
| "learning_rate": 4.714992252936757e-06, |
| "loss": 0.7023, |
| "mean_token_accuracy": 0.7991890192031861, |
| "num_tokens": 25580664.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 1.205983853340149, |
| "epoch": 1.7814793901750423, |
| "grad_norm": 77.29006958007812, |
| "learning_rate": 4.714110211937536e-06, |
| "loss": 0.6169, |
| "mean_token_accuracy": 0.8210288166999817, |
| "num_tokens": 25621449.0, |
| "step": 3155 |
| }, |
| { |
| "entropy": 1.2510562658309936, |
| "epoch": 1.7843026538678712, |
| "grad_norm": 76.20855712890625, |
| "learning_rate": 4.713226920804157e-06, |
| "loss": 0.6714, |
| "mean_token_accuracy": 0.8030560255050659, |
| "num_tokens": 25662023.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 1.3263770818710328, |
| "epoch": 1.7871259175607002, |
| "grad_norm": 78.84805297851562, |
| "learning_rate": 4.712342380231494e-06, |
| "loss": 0.6848, |
| "mean_token_accuracy": 0.8034481525421142, |
| "num_tokens": 25702546.0, |
| "step": 3165 |
| }, |
| { |
| "entropy": 1.2178564548492432, |
| "epoch": 1.789949181253529, |
| "grad_norm": 67.33372497558594, |
| "learning_rate": 4.711456590915406e-06, |
| "loss": 0.6608, |
| "mean_token_accuracy": 0.8104692459106445, |
| "num_tokens": 25743265.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 1.2496137619018555, |
| "epoch": 1.792772444946358, |
| "grad_norm": 74.43345642089844, |
| "learning_rate": 4.710569553552733e-06, |
| "loss": 0.714, |
| "mean_token_accuracy": 0.7992898225784302, |
| "num_tokens": 25784091.0, |
| "step": 3175 |
| }, |
| { |
| "entropy": 1.2508557558059692, |
| "epoch": 1.795595708639187, |
| "grad_norm": 79.79424285888672, |
| "learning_rate": 4.709681268841295e-06, |
| "loss": 0.6524, |
| "mean_token_accuracy": 0.81306471824646, |
| "num_tokens": 25824786.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 1.3131820201873778, |
| "epoch": 1.7984189723320159, |
| "grad_norm": 77.71157836914062, |
| "learning_rate": 4.708791737479897e-06, |
| "loss": 0.6898, |
| "mean_token_accuracy": 0.8016238570213318, |
| "num_tokens": 25865374.0, |
| "step": 3185 |
| }, |
| { |
| "entropy": 1.3496314764022828, |
| "epoch": 1.8012422360248448, |
| "grad_norm": 78.83152770996094, |
| "learning_rate": 4.707900960168322e-06, |
| "loss": 0.6688, |
| "mean_token_accuracy": 0.805900776386261, |
| "num_tokens": 25906110.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 1.3347065925598145, |
| "epoch": 1.8040654997176735, |
| "grad_norm": 72.0447769165039, |
| "learning_rate": 4.707008937607333e-06, |
| "loss": 0.6904, |
| "mean_token_accuracy": 0.8011209011077881, |
| "num_tokens": 25946487.0, |
| "step": 3195 |
| }, |
| { |
| "entropy": 1.1633864879608153, |
| "epoch": 1.8068887634105026, |
| "grad_norm": 83.76029205322266, |
| "learning_rate": 4.7061156704986746e-06, |
| "loss": 0.6679, |
| "mean_token_accuracy": 0.8113258719444275, |
| "num_tokens": 25987147.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 1.259621810913086, |
| "epoch": 1.8097120271033313, |
| "grad_norm": 69.44400787353516, |
| "learning_rate": 4.70522115954507e-06, |
| "loss": 0.6571, |
| "mean_token_accuracy": 0.8144752264022828, |
| "num_tokens": 26027930.0, |
| "step": 3205 |
| }, |
| { |
| "entropy": 1.330192494392395, |
| "epoch": 1.8125352907961605, |
| "grad_norm": 76.76569366455078, |
| "learning_rate": 4.704325405450219e-06, |
| "loss": 0.6999, |
| "mean_token_accuracy": 0.8003972887992858, |
| "num_tokens": 26068498.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 1.2917050123214722, |
| "epoch": 1.8153585544889892, |
| "grad_norm": 77.54668426513672, |
| "learning_rate": 4.703428408918801e-06, |
| "loss": 0.6613, |
| "mean_token_accuracy": 0.8059993982315063, |
| "num_tokens": 26109014.0, |
| "step": 3215 |
| }, |
| { |
| "entropy": 1.247635555267334, |
| "epoch": 1.8181818181818183, |
| "grad_norm": 74.58939361572266, |
| "learning_rate": 4.702530170656473e-06, |
| "loss": 0.6191, |
| "mean_token_accuracy": 0.8206661820411683, |
| "num_tokens": 26149710.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 1.3052506685256957, |
| "epoch": 1.821005081874647, |
| "grad_norm": 78.04215240478516, |
| "learning_rate": 4.7016306913698684e-06, |
| "loss": 0.668, |
| "mean_token_accuracy": 0.8105997920036316, |
| "num_tokens": 26190345.0, |
| "step": 3225 |
| }, |
| { |
| "entropy": 1.4105382680892944, |
| "epoch": 1.823828345567476, |
| "grad_norm": 66.63508605957031, |
| "learning_rate": 4.700729971766597e-06, |
| "loss": 0.6894, |
| "mean_token_accuracy": 0.8041513562202454, |
| "num_tokens": 26230630.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 1.2475630044937134, |
| "epoch": 1.826651609260305, |
| "grad_norm": 71.62044525146484, |
| "learning_rate": 4.6998280125552435e-06, |
| "loss": 0.6067, |
| "mean_token_accuracy": 0.8237750291824341, |
| "num_tokens": 26271035.0, |
| "step": 3235 |
| }, |
| { |
| "entropy": 1.4181864261627197, |
| "epoch": 1.8294748729531338, |
| "grad_norm": 70.9604263305664, |
| "learning_rate": 4.6989248144453695e-06, |
| "loss": 0.7388, |
| "mean_token_accuracy": 0.791705870628357, |
| "num_tokens": 26311785.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 1.4545698642730713, |
| "epoch": 1.8322981366459627, |
| "grad_norm": 82.96871185302734, |
| "learning_rate": 4.698020378147509e-06, |
| "loss": 0.7193, |
| "mean_token_accuracy": 0.7931790471076965, |
| "num_tokens": 26352651.0, |
| "step": 3245 |
| }, |
| { |
| "entropy": 1.288289976119995, |
| "epoch": 1.8351214003387917, |
| "grad_norm": 76.78516387939453, |
| "learning_rate": 4.6971147043731725e-06, |
| "loss": 0.6374, |
| "mean_token_accuracy": 0.8140319347381592, |
| "num_tokens": 26393263.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 1.287929368019104, |
| "epoch": 1.8379446640316206, |
| "grad_norm": 73.65550231933594, |
| "learning_rate": 4.696207793834843e-06, |
| "loss": 0.6379, |
| "mean_token_accuracy": 0.811824631690979, |
| "num_tokens": 26434015.0, |
| "step": 3255 |
| }, |
| { |
| "entropy": 1.2089492917060851, |
| "epoch": 1.8407679277244493, |
| "grad_norm": 83.56748962402344, |
| "learning_rate": 4.695299647245975e-06, |
| "loss": 0.6485, |
| "mean_token_accuracy": 0.8146473169326782, |
| "num_tokens": 26474409.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 1.2106802225112916, |
| "epoch": 1.8435911914172785, |
| "grad_norm": 73.27950286865234, |
| "learning_rate": 4.694390265320997e-06, |
| "loss": 0.6976, |
| "mean_token_accuracy": 0.8010765910148621, |
| "num_tokens": 26515206.0, |
| "step": 3265 |
| }, |
| { |
| "entropy": 1.2146219491958619, |
| "epoch": 1.8464144551101072, |
| "grad_norm": 70.8027114868164, |
| "learning_rate": 4.6934796487753095e-06, |
| "loss": 0.6656, |
| "mean_token_accuracy": 0.8075640559196472, |
| "num_tokens": 26555638.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 1.2627388954162597, |
| "epoch": 1.8492377188029363, |
| "grad_norm": 82.94084167480469, |
| "learning_rate": 4.6925677983252836e-06, |
| "loss": 0.677, |
| "mean_token_accuracy": 0.8059833526611329, |
| "num_tokens": 26596276.0, |
| "step": 3275 |
| }, |
| { |
| "entropy": 1.2478217363357544, |
| "epoch": 1.852060982495765, |
| "grad_norm": 69.82963562011719, |
| "learning_rate": 4.69165471468826e-06, |
| "loss": 0.6692, |
| "mean_token_accuracy": 0.8066045045852661, |
| "num_tokens": 26637010.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 1.3499293088912965, |
| "epoch": 1.8548842461885942, |
| "grad_norm": 92.67427062988281, |
| "learning_rate": 4.690740398582554e-06, |
| "loss": 0.686, |
| "mean_token_accuracy": 0.8015271663665772, |
| "num_tokens": 26677722.0, |
| "step": 3285 |
| }, |
| { |
| "entropy": 1.312140154838562, |
| "epoch": 1.8577075098814229, |
| "grad_norm": 80.48558044433594, |
| "learning_rate": 4.689824850727443e-06, |
| "loss": 0.644, |
| "mean_token_accuracy": 0.8171864151954651, |
| "num_tokens": 26718495.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 1.1620086908340455, |
| "epoch": 1.8605307735742518, |
| "grad_norm": 64.68397521972656, |
| "learning_rate": 4.68890807184318e-06, |
| "loss": 0.6173, |
| "mean_token_accuracy": 0.8194415926933288, |
| "num_tokens": 26759062.0, |
| "step": 3295 |
| }, |
| { |
| "entropy": 1.183970856666565, |
| "epoch": 1.8633540372670807, |
| "grad_norm": 78.46247100830078, |
| "learning_rate": 4.687990062650986e-06, |
| "loss": 0.6762, |
| "mean_token_accuracy": 0.8039267778396606, |
| "num_tokens": 26799736.0, |
| "step": 3300 |
| }, |
| { |
| "entropy": 1.2066641569137573, |
| "epoch": 1.8661773009599096, |
| "grad_norm": 78.06271362304688, |
| "learning_rate": 4.687070823873044e-06, |
| "loss": 0.6518, |
| "mean_token_accuracy": 0.8096506357192993, |
| "num_tokens": 26840464.0, |
| "step": 3305 |
| }, |
| { |
| "entropy": 1.2104302644729614, |
| "epoch": 1.8690005646527386, |
| "grad_norm": 60.512386322021484, |
| "learning_rate": 4.68615035623251e-06, |
| "loss": 0.6688, |
| "mean_token_accuracy": 0.8049030423164367, |
| "num_tokens": 26881179.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 1.309780240058899, |
| "epoch": 1.8718238283455675, |
| "grad_norm": 89.17145538330078, |
| "learning_rate": 4.685228660453505e-06, |
| "loss": 0.7278, |
| "mean_token_accuracy": 0.7917387366294861, |
| "num_tokens": 26921890.0, |
| "step": 3315 |
| }, |
| { |
| "entropy": 1.2725276470184326, |
| "epoch": 1.8746470920383964, |
| "grad_norm": 73.05801391601562, |
| "learning_rate": 4.684305737261116e-06, |
| "loss": 0.6943, |
| "mean_token_accuracy": 0.7972886681556701, |
| "num_tokens": 26962473.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 1.2042244672775269, |
| "epoch": 1.8774703557312253, |
| "grad_norm": 75.5986328125, |
| "learning_rate": 4.683381587381396e-06, |
| "loss": 0.6685, |
| "mean_token_accuracy": 0.8059680342674256, |
| "num_tokens": 27002893.0, |
| "step": 3325 |
| }, |
| { |
| "entropy": 1.297372031211853, |
| "epoch": 1.8802936194240543, |
| "grad_norm": 60.600040435791016, |
| "learning_rate": 4.682456211541363e-06, |
| "loss": 0.7187, |
| "mean_token_accuracy": 0.7960585474967956, |
| "num_tokens": 27043613.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 1.2372675895690919, |
| "epoch": 1.883116883116883, |
| "grad_norm": 77.02068328857422, |
| "learning_rate": 4.681529610468999e-06, |
| "loss": 0.6627, |
| "mean_token_accuracy": 0.8110163450241089, |
| "num_tokens": 27084223.0, |
| "step": 3335 |
| }, |
| { |
| "entropy": 1.2115526437759399, |
| "epoch": 1.8859401468097121, |
| "grad_norm": 71.88720703125, |
| "learning_rate": 4.68060178489325e-06, |
| "loss": 0.6416, |
| "mean_token_accuracy": 0.8154277205467224, |
| "num_tokens": 27124888.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 1.1731568574905396, |
| "epoch": 1.8887634105025408, |
| "grad_norm": 75.11717224121094, |
| "learning_rate": 4.679672735544024e-06, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.817597496509552, |
| "num_tokens": 27165493.0, |
| "step": 3345 |
| }, |
| { |
| "entropy": 1.2224584341049194, |
| "epoch": 1.89158667419537, |
| "grad_norm": 79.28688049316406, |
| "learning_rate": 4.678742463152196e-06, |
| "loss": 0.6627, |
| "mean_token_accuracy": 0.807835042476654, |
| "num_tokens": 27205997.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 1.1851969718933106, |
| "epoch": 1.8944099378881987, |
| "grad_norm": 77.38661193847656, |
| "learning_rate": 4.677810968449598e-06, |
| "loss": 0.6489, |
| "mean_token_accuracy": 0.8135580539703369, |
| "num_tokens": 27246679.0, |
| "step": 3355 |
| }, |
| { |
| "entropy": 1.2727444410324096, |
| "epoch": 1.8972332015810278, |
| "grad_norm": 64.15800476074219, |
| "learning_rate": 4.676878252169025e-06, |
| "loss": 0.6672, |
| "mean_token_accuracy": 0.8050463199615479, |
| "num_tokens": 27287296.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 1.3533100366592408, |
| "epoch": 1.9000564652738565, |
| "grad_norm": 74.59213256835938, |
| "learning_rate": 4.6759443150442375e-06, |
| "loss": 0.7046, |
| "mean_token_accuracy": 0.7972202181816102, |
| "num_tokens": 27327899.0, |
| "step": 3365 |
| }, |
| { |
| "entropy": 1.2930695295333863, |
| "epoch": 1.9028797289666854, |
| "grad_norm": 74.31491088867188, |
| "learning_rate": 4.675009157809949e-06, |
| "loss": 0.6536, |
| "mean_token_accuracy": 0.810335111618042, |
| "num_tokens": 27368677.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 1.267875075340271, |
| "epoch": 1.9057029926595144, |
| "grad_norm": 91.21916198730469, |
| "learning_rate": 4.67407278120184e-06, |
| "loss": 0.6716, |
| "mean_token_accuracy": 0.8077351331710816, |
| "num_tokens": 27408549.0, |
| "step": 3375 |
| }, |
| { |
| "entropy": 1.2228721261024476, |
| "epoch": 1.9085262563523433, |
| "grad_norm": 76.48754119873047, |
| "learning_rate": 4.6731351859565435e-06, |
| "loss": 0.6306, |
| "mean_token_accuracy": 0.8189470052719117, |
| "num_tokens": 27449132.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 1.3247106790542602, |
| "epoch": 1.9113495200451722, |
| "grad_norm": 70.40728759765625, |
| "learning_rate": 4.672196372811656e-06, |
| "loss": 0.7196, |
| "mean_token_accuracy": 0.7973332285881043, |
| "num_tokens": 27489492.0, |
| "step": 3385 |
| }, |
| { |
| "entropy": 1.1619846105575562, |
| "epoch": 1.9141727837380011, |
| "grad_norm": 69.71743774414062, |
| "learning_rate": 4.671256342505731e-06, |
| "loss": 0.6233, |
| "mean_token_accuracy": 0.8190138101577759, |
| "num_tokens": 27530158.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 1.2606486558914185, |
| "epoch": 1.91699604743083, |
| "grad_norm": 71.09142303466797, |
| "learning_rate": 4.6703150957782795e-06, |
| "loss": 0.7106, |
| "mean_token_accuracy": 0.7955459117889404, |
| "num_tokens": 27570628.0, |
| "step": 3395 |
| }, |
| { |
| "entropy": 1.2582112312316895, |
| "epoch": 1.919819311123659, |
| "grad_norm": 71.53373718261719, |
| "learning_rate": 4.669372633369769e-06, |
| "loss": 0.6667, |
| "mean_token_accuracy": 0.8072426319122314, |
| "num_tokens": 27611277.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 1.3039365530014038, |
| "epoch": 1.922642574816488, |
| "grad_norm": 81.98397827148438, |
| "learning_rate": 4.668428956021622e-06, |
| "loss": 0.703, |
| "mean_token_accuracy": 0.8007499933242798, |
| "num_tokens": 27651697.0, |
| "step": 3405 |
| }, |
| { |
| "entropy": 1.3271862268447876, |
| "epoch": 1.9254658385093166, |
| "grad_norm": 73.20526885986328, |
| "learning_rate": 4.667484064476219e-06, |
| "loss": 0.6535, |
| "mean_token_accuracy": 0.8104861855506897, |
| "num_tokens": 27692305.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 1.3667465209960938, |
| "epoch": 1.9282891022021458, |
| "grad_norm": 79.74845123291016, |
| "learning_rate": 4.666537959476897e-06, |
| "loss": 0.6962, |
| "mean_token_accuracy": 0.7966463446617127, |
| "num_tokens": 27732690.0, |
| "step": 3415 |
| }, |
| { |
| "entropy": 1.2887543439865112, |
| "epoch": 1.9311123658949745, |
| "grad_norm": 66.48158264160156, |
| "learning_rate": 4.665590641767943e-06, |
| "loss": 0.621, |
| "mean_token_accuracy": 0.8193240642547608, |
| "num_tokens": 27773352.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 1.1709359884262085, |
| "epoch": 1.9339356295878036, |
| "grad_norm": 72.53215026855469, |
| "learning_rate": 4.664642112094601e-06, |
| "loss": 0.5945, |
| "mean_token_accuracy": 0.8274150848388672, |
| "num_tokens": 27813931.0, |
| "step": 3425 |
| }, |
| { |
| "entropy": 1.322715401649475, |
| "epoch": 1.9367588932806323, |
| "grad_norm": 75.37449645996094, |
| "learning_rate": 4.66369237120307e-06, |
| "loss": 0.7161, |
| "mean_token_accuracy": 0.7968868970870971, |
| "num_tokens": 27854738.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 1.4540338277816773, |
| "epoch": 1.9395821569734615, |
| "grad_norm": 84.57624053955078, |
| "learning_rate": 4.662741419840497e-06, |
| "loss": 0.7683, |
| "mean_token_accuracy": 0.7849366426467895, |
| "num_tokens": 27895401.0, |
| "step": 3435 |
| }, |
| { |
| "entropy": 1.200544047355652, |
| "epoch": 1.9424054206662902, |
| "grad_norm": 61.72697067260742, |
| "learning_rate": 4.6617892587549865e-06, |
| "loss": 0.611, |
| "mean_token_accuracy": 0.8247037768363953, |
| "num_tokens": 27935721.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 1.2894334077835083, |
| "epoch": 1.945228684359119, |
| "grad_norm": 70.79447937011719, |
| "learning_rate": 4.66083588869559e-06, |
| "loss": 0.6648, |
| "mean_token_accuracy": 0.8107550263404846, |
| "num_tokens": 27976426.0, |
| "step": 3445 |
| }, |
| { |
| "entropy": 1.2915485858917237, |
| "epoch": 1.948051948051948, |
| "grad_norm": 77.85231018066406, |
| "learning_rate": 4.659881310412316e-06, |
| "loss": 0.6462, |
| "mean_token_accuracy": 0.8118519902229309, |
| "num_tokens": 28017120.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 1.2960253000259399, |
| "epoch": 1.950875211744777, |
| "grad_norm": 91.22349548339844, |
| "learning_rate": 4.658925524656117e-06, |
| "loss": 0.7125, |
| "mean_token_accuracy": 0.7955609560012817, |
| "num_tokens": 28056932.0, |
| "step": 3455 |
| }, |
| { |
| "entropy": 1.2702165603637696, |
| "epoch": 1.9536984754376059, |
| "grad_norm": 60.86642837524414, |
| "learning_rate": 4.657968532178899e-06, |
| "loss": 0.615, |
| "mean_token_accuracy": 0.8187849164009094, |
| "num_tokens": 28097762.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 1.2905339479446412, |
| "epoch": 1.9565217391304348, |
| "grad_norm": 64.9665756225586, |
| "learning_rate": 4.657010333733517e-06, |
| "loss": 0.6653, |
| "mean_token_accuracy": 0.8115843653678894, |
| "num_tokens": 28138448.0, |
| "step": 3465 |
| }, |
| { |
| "entropy": 1.3494179725646973, |
| "epoch": 1.9593450028232637, |
| "grad_norm": 81.62896728515625, |
| "learning_rate": 4.656050930073775e-06, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.7997290372848511, |
| "num_tokens": 28179033.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 1.2266453981399537, |
| "epoch": 1.9621682665160924, |
| "grad_norm": 71.46363830566406, |
| "learning_rate": 4.655090321954422e-06, |
| "loss": 0.6402, |
| "mean_token_accuracy": 0.8146908640861511, |
| "num_tokens": 28219678.0, |
| "step": 3475 |
| }, |
| { |
| "entropy": 1.2505339860916138, |
| "epoch": 1.9649915302089216, |
| "grad_norm": 62.542510986328125, |
| "learning_rate": 4.654128510131159e-06, |
| "loss": 0.6569, |
| "mean_token_accuracy": 0.8119516491889953, |
| "num_tokens": 28260389.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 1.2989205360412597, |
| "epoch": 1.9678147939017503, |
| "grad_norm": 73.96013641357422, |
| "learning_rate": 4.653165495360632e-06, |
| "loss": 0.6984, |
| "mean_token_accuracy": 0.8002416968345643, |
| "num_tokens": 28301129.0, |
| "step": 3485 |
| }, |
| { |
| "entropy": 1.2366380214691162, |
| "epoch": 1.9706380575945794, |
| "grad_norm": 92.0043716430664, |
| "learning_rate": 4.652201278400432e-06, |
| "loss": 0.7118, |
| "mean_token_accuracy": 0.7954460859298706, |
| "num_tokens": 28341239.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 1.2122334718704224, |
| "epoch": 1.9734613212874081, |
| "grad_norm": 68.99957275390625, |
| "learning_rate": 4.651235860009099e-06, |
| "loss": 0.6665, |
| "mean_token_accuracy": 0.810545551776886, |
| "num_tokens": 28381589.0, |
| "step": 3495 |
| }, |
| { |
| "entropy": 1.3472963571548462, |
| "epoch": 1.9762845849802373, |
| "grad_norm": 69.63176727294922, |
| "learning_rate": 4.650269240946115e-06, |
| "loss": 0.6928, |
| "mean_token_accuracy": 0.8018389105796814, |
| "num_tokens": 28422151.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.9762845849802373, |
| "eval_entropy": 1.3732946634292602, |
| "eval_loss": 0.4887203872203827, |
| "eval_mean_token_accuracy": 0.8582573294639587, |
| "eval_num_tokens": 28422151.0, |
| "eval_runtime": 2.4536, |
| "eval_samples_per_second": 15.895, |
| "eval_steps_per_second": 2.038, |
| "step": 3500 |
| }, |
| { |
| "entropy": 1.1886157512664794, |
| "epoch": 1.979107848673066, |
| "grad_norm": 80.48574829101562, |
| "learning_rate": 4.6493014219719064e-06, |
| "loss": 0.6651, |
| "mean_token_accuracy": 0.8061770439147949, |
| "num_tokens": 28462742.0, |
| "step": 3505 |
| }, |
| { |
| "entropy": 1.2385428190231322, |
| "epoch": 1.981931112365895, |
| "grad_norm": 68.21446990966797, |
| "learning_rate": 4.648332403847849e-06, |
| "loss": 0.6922, |
| "mean_token_accuracy": 0.8008740901947021, |
| "num_tokens": 28503283.0, |
| "step": 3510 |
| }, |
| { |
| "entropy": 1.185453176498413, |
| "epoch": 1.9847543760587238, |
| "grad_norm": 58.8889274597168, |
| "learning_rate": 4.6473621873362525e-06, |
| "loss": 0.5914, |
| "mean_token_accuracy": 0.8272942423820495, |
| "num_tokens": 28544089.0, |
| "step": 3515 |
| }, |
| { |
| "entropy": 1.3709485530853271, |
| "epoch": 1.9875776397515528, |
| "grad_norm": 72.34013366699219, |
| "learning_rate": 4.64639077320038e-06, |
| "loss": 0.7202, |
| "mean_token_accuracy": 0.7945428967475892, |
| "num_tokens": 28584668.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 1.3020499706268311, |
| "epoch": 1.9904009034443817, |
| "grad_norm": 77.33309173583984, |
| "learning_rate": 4.645418162204427e-06, |
| "loss": 0.6731, |
| "mean_token_accuracy": 0.8073466181755066, |
| "num_tokens": 28625384.0, |
| "step": 3525 |
| }, |
| { |
| "entropy": 1.331102156639099, |
| "epoch": 1.9932241671372106, |
| "grad_norm": 62.944297790527344, |
| "learning_rate": 4.644444355113538e-06, |
| "loss": 0.6745, |
| "mean_token_accuracy": 0.8051925539970398, |
| "num_tokens": 28665929.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 1.3355406045913696, |
| "epoch": 1.9960474308300395, |
| "grad_norm": 82.5616683959961, |
| "learning_rate": 4.643469352693793e-06, |
| "loss": 0.7338, |
| "mean_token_accuracy": 0.7874128460884094, |
| "num_tokens": 28706452.0, |
| "step": 3535 |
| }, |
| { |
| "entropy": 1.2611999750137328, |
| "epoch": 1.9988706945228685, |
| "grad_norm": 70.33882141113281, |
| "learning_rate": 4.642493155712218e-06, |
| "loss": 0.6555, |
| "mean_token_accuracy": 0.8140320301055908, |
| "num_tokens": 28747167.0, |
| "step": 3540 |
| }, |
| { |
| "entropy": 1.2613008499145508, |
| "epoch": 2.0016939582156974, |
| "grad_norm": 62.02672576904297, |
| "learning_rate": 4.641515764936774e-06, |
| "loss": 0.5603, |
| "mean_token_accuracy": 0.8346445918083191, |
| "num_tokens": 28781356.0, |
| "step": 3545 |
| }, |
| { |
| "entropy": 1.1375208854675294, |
| "epoch": 2.004517221908526, |
| "grad_norm": 90.73487091064453, |
| "learning_rate": 4.640537181136361e-06, |
| "loss": 0.5311, |
| "mean_token_accuracy": 0.8399060368537903, |
| "num_tokens": 28822142.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 1.1094812870025634, |
| "epoch": 2.0073404856013553, |
| "grad_norm": 72.12708282470703, |
| "learning_rate": 4.639557405080822e-06, |
| "loss": 0.4815, |
| "mean_token_accuracy": 0.8520576357841492, |
| "num_tokens": 28862874.0, |
| "step": 3555 |
| }, |
| { |
| "entropy": 1.0384588479995727, |
| "epoch": 2.010163749294184, |
| "grad_norm": 88.40074157714844, |
| "learning_rate": 4.638576437540935e-06, |
| "loss": 0.4597, |
| "mean_token_accuracy": 0.8593630313873291, |
| "num_tokens": 28903529.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 0.9648947477340698, |
| "epoch": 2.012987012987013, |
| "grad_norm": 84.00352478027344, |
| "learning_rate": 4.637594279288412e-06, |
| "loss": 0.4597, |
| "mean_token_accuracy": 0.8570084929466247, |
| "num_tokens": 28944097.0, |
| "step": 3565 |
| }, |
| { |
| "entropy": 1.0692477583885194, |
| "epoch": 2.015810276679842, |
| "grad_norm": 79.34220886230469, |
| "learning_rate": 4.63661093109591e-06, |
| "loss": 0.4578, |
| "mean_token_accuracy": 0.8585654139518738, |
| "num_tokens": 28984715.0, |
| "step": 3570 |
| }, |
| { |
| "entropy": 0.9981315493583679, |
| "epoch": 2.018633540372671, |
| "grad_norm": 95.38790130615234, |
| "learning_rate": 4.635626393737015e-06, |
| "loss": 0.4884, |
| "mean_token_accuracy": 0.8481999158859252, |
| "num_tokens": 29025080.0, |
| "step": 3575 |
| }, |
| { |
| "entropy": 1.0125423192977905, |
| "epoch": 2.0214568040654997, |
| "grad_norm": 73.60315704345703, |
| "learning_rate": 4.634640667986251e-06, |
| "loss": 0.4606, |
| "mean_token_accuracy": 0.8575116157531738, |
| "num_tokens": 29065999.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 1.0129147291183471, |
| "epoch": 2.024280067758329, |
| "grad_norm": 70.65982818603516, |
| "learning_rate": 4.633653754619076e-06, |
| "loss": 0.4568, |
| "mean_token_accuracy": 0.8609514951705932, |
| "num_tokens": 29106693.0, |
| "step": 3585 |
| }, |
| { |
| "entropy": 1.0722288131713866, |
| "epoch": 2.0271033314511575, |
| "grad_norm": 80.4547348022461, |
| "learning_rate": 4.632665654411885e-06, |
| "loss": 0.4978, |
| "mean_token_accuracy": 0.8478749513626098, |
| "num_tokens": 29147326.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 0.9768878698349, |
| "epoch": 2.0299265951439867, |
| "grad_norm": 72.7870864868164, |
| "learning_rate": 4.631676368142003e-06, |
| "loss": 0.4441, |
| "mean_token_accuracy": 0.862525200843811, |
| "num_tokens": 29188095.0, |
| "step": 3595 |
| }, |
| { |
| "entropy": 0.9430813789367676, |
| "epoch": 2.0327498588368154, |
| "grad_norm": 67.3983383178711, |
| "learning_rate": 4.630685896587691e-06, |
| "loss": 0.4408, |
| "mean_token_accuracy": 0.8643229246139527, |
| "num_tokens": 29228722.0, |
| "step": 3600 |
| }, |
| { |
| "entropy": 0.9772986054420472, |
| "epoch": 2.035573122529644, |
| "grad_norm": 73.83796691894531, |
| "learning_rate": 4.6296942405281405e-06, |
| "loss": 0.443, |
| "mean_token_accuracy": 0.8627371668815613, |
| "num_tokens": 29269343.0, |
| "step": 3605 |
| }, |
| { |
| "entropy": 0.9941539525985718, |
| "epoch": 2.038396386222473, |
| "grad_norm": 69.57147979736328, |
| "learning_rate": 4.628701400743475e-06, |
| "loss": 0.4446, |
| "mean_token_accuracy": 0.8636790156364441, |
| "num_tokens": 29310171.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 1.0042089581489564, |
| "epoch": 2.041219649915302, |
| "grad_norm": 74.8926010131836, |
| "learning_rate": 4.627707378014751e-06, |
| "loss": 0.4682, |
| "mean_token_accuracy": 0.8554182171821594, |
| "num_tokens": 29350746.0, |
| "step": 3615 |
| }, |
| { |
| "entropy": 1.0139081001281738, |
| "epoch": 2.044042913608131, |
| "grad_norm": 71.61470031738281, |
| "learning_rate": 4.626712173123953e-06, |
| "loss": 0.4986, |
| "mean_token_accuracy": 0.8487652540206909, |
| "num_tokens": 29391411.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 1.0470230221748351, |
| "epoch": 2.0468661773009598, |
| "grad_norm": 83.09550476074219, |
| "learning_rate": 4.625715786853999e-06, |
| "loss": 0.5225, |
| "mean_token_accuracy": 0.8379368543624878, |
| "num_tokens": 29432182.0, |
| "step": 3625 |
| }, |
| { |
| "entropy": 1.1465112209320067, |
| "epoch": 2.049689440993789, |
| "grad_norm": 80.86943817138672, |
| "learning_rate": 4.624718219988732e-06, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.8471787929534912, |
| "num_tokens": 29472990.0, |
| "step": 3630 |
| }, |
| { |
| "entropy": 0.9782078504562378, |
| "epoch": 2.0525127046866176, |
| "grad_norm": 84.73113250732422, |
| "learning_rate": 4.623719473312928e-06, |
| "loss": 0.4619, |
| "mean_token_accuracy": 0.8561048865318298, |
| "num_tokens": 29513219.0, |
| "step": 3635 |
| }, |
| { |
| "entropy": 1.056278944015503, |
| "epoch": 2.0553359683794468, |
| "grad_norm": 73.62364959716797, |
| "learning_rate": 4.622719547612288e-06, |
| "loss": 0.4899, |
| "mean_token_accuracy": 0.8500555634498597, |
| "num_tokens": 29553939.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 1.126291823387146, |
| "epoch": 2.0581592320722755, |
| "grad_norm": 73.61604309082031, |
| "learning_rate": 4.621718443673442e-06, |
| "loss": 0.5254, |
| "mean_token_accuracy": 0.8411227583885192, |
| "num_tokens": 29594657.0, |
| "step": 3645 |
| }, |
| { |
| "entropy": 0.9695918083190918, |
| "epoch": 2.0609824957651046, |
| "grad_norm": 67.4301528930664, |
| "learning_rate": 4.620716162283945e-06, |
| "loss": 0.4758, |
| "mean_token_accuracy": 0.8537283182144165, |
| "num_tokens": 29635355.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 0.9435236334800721, |
| "epoch": 2.0638057594579333, |
| "grad_norm": 80.18263244628906, |
| "learning_rate": 4.619712704232283e-06, |
| "loss": 0.4425, |
| "mean_token_accuracy": 0.8651264548301697, |
| "num_tokens": 29676040.0, |
| "step": 3655 |
| }, |
| { |
| "entropy": 1.0257262706756591, |
| "epoch": 2.0666290231507625, |
| "grad_norm": 83.10700225830078, |
| "learning_rate": 4.618708070307863e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.8504552364349365, |
| "num_tokens": 29716831.0, |
| "step": 3660 |
| }, |
| { |
| "entropy": 1.0027997612953186, |
| "epoch": 2.069452286843591, |
| "grad_norm": 68.62332153320312, |
| "learning_rate": 4.617702261301018e-06, |
| "loss": 0.4656, |
| "mean_token_accuracy": 0.8585775852203369, |
| "num_tokens": 29757384.0, |
| "step": 3665 |
| }, |
| { |
| "entropy": 1.088618552684784, |
| "epoch": 2.0722755505364203, |
| "grad_norm": 74.5041275024414, |
| "learning_rate": 4.616695278003006e-06, |
| "loss": 0.4726, |
| "mean_token_accuracy": 0.8526723146438598, |
| "num_tokens": 29797784.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 1.1112963557243347, |
| "epoch": 2.075098814229249, |
| "grad_norm": 79.9881362915039, |
| "learning_rate": 4.61568712120601e-06, |
| "loss": 0.4785, |
| "mean_token_accuracy": 0.8534006595611572, |
| "num_tokens": 29838386.0, |
| "step": 3675 |
| }, |
| { |
| "entropy": 0.9924672245979309, |
| "epoch": 2.0779220779220777, |
| "grad_norm": 75.17504119873047, |
| "learning_rate": 4.614677791703134e-06, |
| "loss": 0.4477, |
| "mean_token_accuracy": 0.8596629619598388, |
| "num_tokens": 29879193.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 0.996102774143219, |
| "epoch": 2.080745341614907, |
| "grad_norm": 75.02816009521484, |
| "learning_rate": 4.613667290288406e-06, |
| "loss": 0.489, |
| "mean_token_accuracy": 0.8493238091468811, |
| "num_tokens": 29919999.0, |
| "step": 3685 |
| }, |
| { |
| "entropy": 1.0089183807373048, |
| "epoch": 2.0835686053077356, |
| "grad_norm": 68.33311462402344, |
| "learning_rate": 4.612655617756776e-06, |
| "loss": 0.4584, |
| "mean_token_accuracy": 0.8591845631599426, |
| "num_tokens": 29960014.0, |
| "step": 3690 |
| }, |
| { |
| "entropy": 1.080802845954895, |
| "epoch": 2.0863918690005647, |
| "grad_norm": 74.33163452148438, |
| "learning_rate": 4.611642774904113e-06, |
| "loss": 0.4702, |
| "mean_token_accuracy": 0.8538957357406616, |
| "num_tokens": 30000495.0, |
| "step": 3695 |
| }, |
| { |
| "entropy": 1.1036657571792603, |
| "epoch": 2.0892151326933934, |
| "grad_norm": 77.43889617919922, |
| "learning_rate": 4.6106287625272106e-06, |
| "loss": 0.5021, |
| "mean_token_accuracy": 0.8491263151168823, |
| "num_tokens": 30041011.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 1.0965331435203551, |
| "epoch": 2.0920383963862226, |
| "grad_norm": 102.71356964111328, |
| "learning_rate": 4.609613581423779e-06, |
| "loss": 0.5103, |
| "mean_token_accuracy": 0.8435364127159118, |
| "num_tokens": 30081604.0, |
| "step": 3705 |
| }, |
| { |
| "entropy": 1.1061514854431151, |
| "epoch": 2.0948616600790513, |
| "grad_norm": 75.31636047363281, |
| "learning_rate": 4.6085972323924485e-06, |
| "loss": 0.5042, |
| "mean_token_accuracy": 0.8477765440940856, |
| "num_tokens": 30121410.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 1.0610831737518311, |
| "epoch": 2.0976849237718804, |
| "grad_norm": 76.53677368164062, |
| "learning_rate": 4.607579716232771e-06, |
| "loss": 0.4706, |
| "mean_token_accuracy": 0.8552014946937561, |
| "num_tokens": 30162093.0, |
| "step": 3715 |
| }, |
| { |
| "entropy": 1.0802299499511718, |
| "epoch": 2.100508187464709, |
| "grad_norm": 70.81407928466797, |
| "learning_rate": 4.606561033745213e-06, |
| "loss": 0.5047, |
| "mean_token_accuracy": 0.847411835193634, |
| "num_tokens": 30202390.0, |
| "step": 3720 |
| }, |
| { |
| "entropy": 1.0219064354896545, |
| "epoch": 2.1033314511575383, |
| "grad_norm": 68.52303314208984, |
| "learning_rate": 4.6055411857311605e-06, |
| "loss": 0.5117, |
| "mean_token_accuracy": 0.8439998030662537, |
| "num_tokens": 30243104.0, |
| "step": 3725 |
| }, |
| { |
| "entropy": 1.0835317373275757, |
| "epoch": 2.106154714850367, |
| "grad_norm": 80.77493286132812, |
| "learning_rate": 4.6045201729929145e-06, |
| "loss": 0.4638, |
| "mean_token_accuracy": 0.8567952990531922, |
| "num_tokens": 30283599.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 0.975482976436615, |
| "epoch": 2.108977978543196, |
| "grad_norm": 72.1488265991211, |
| "learning_rate": 4.603497996333695e-06, |
| "loss": 0.457, |
| "mean_token_accuracy": 0.8579200267791748, |
| "num_tokens": 30324432.0, |
| "step": 3735 |
| }, |
| { |
| "entropy": 0.9810956478118896, |
| "epoch": 2.111801242236025, |
| "grad_norm": 69.67237091064453, |
| "learning_rate": 4.602474656557636e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.846557891368866, |
| "num_tokens": 30365264.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 1.076551580429077, |
| "epoch": 2.1146245059288535, |
| "grad_norm": 78.01778411865234, |
| "learning_rate": 4.601450154469786e-06, |
| "loss": 0.4761, |
| "mean_token_accuracy": 0.8549446344375611, |
| "num_tokens": 30405939.0, |
| "step": 3745 |
| }, |
| { |
| "entropy": 1.0331781387329102, |
| "epoch": 2.1174477696216827, |
| "grad_norm": 78.21675109863281, |
| "learning_rate": 4.60042449087611e-06, |
| "loss": 0.5095, |
| "mean_token_accuracy": 0.8455055952072144, |
| "num_tokens": 30446514.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 1.047632908821106, |
| "epoch": 2.1202710333145114, |
| "grad_norm": 62.341732025146484, |
| "learning_rate": 4.599397666583484e-06, |
| "loss": 0.4864, |
| "mean_token_accuracy": 0.850367295742035, |
| "num_tokens": 30487017.0, |
| "step": 3755 |
| }, |
| { |
| "entropy": 1.0679221272468566, |
| "epoch": 2.1230942970073405, |
| "grad_norm": 74.56029510498047, |
| "learning_rate": 4.598369682399699e-06, |
| "loss": 0.4839, |
| "mean_token_accuracy": 0.8506836295127869, |
| "num_tokens": 30527623.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 1.043572473526001, |
| "epoch": 2.1259175607001692, |
| "grad_norm": 70.33014678955078, |
| "learning_rate": 4.597340539133459e-06, |
| "loss": 0.4852, |
| "mean_token_accuracy": 0.8513245105743408, |
| "num_tokens": 30568412.0, |
| "step": 3765 |
| }, |
| { |
| "entropy": 1.1451680421829225, |
| "epoch": 2.1287408243929984, |
| "grad_norm": 79.97354888916016, |
| "learning_rate": 4.5963102375943775e-06, |
| "loss": 0.5012, |
| "mean_token_accuracy": 0.8455309748649598, |
| "num_tokens": 30609183.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 1.1353947281837464, |
| "epoch": 2.131564088085827, |
| "grad_norm": 76.01318359375, |
| "learning_rate": 4.59527877859298e-06, |
| "loss": 0.4989, |
| "mean_token_accuracy": 0.847420847415924, |
| "num_tokens": 30649821.0, |
| "step": 3775 |
| }, |
| { |
| "entropy": 1.1371727347373963, |
| "epoch": 2.1343873517786562, |
| "grad_norm": 66.74515533447266, |
| "learning_rate": 4.594246162940705e-06, |
| "loss": 0.4459, |
| "mean_token_accuracy": 0.8620406270027161, |
| "num_tokens": 30690300.0, |
| "step": 3780 |
| }, |
| { |
| "entropy": 1.032710576057434, |
| "epoch": 2.137210615471485, |
| "grad_norm": 68.04978942871094, |
| "learning_rate": 4.593212391449897e-06, |
| "loss": 0.5073, |
| "mean_token_accuracy": 0.8437552571296691, |
| "num_tokens": 30731044.0, |
| "step": 3785 |
| }, |
| { |
| "entropy": 1.0063154816627502, |
| "epoch": 2.140033879164314, |
| "grad_norm": 72.64938354492188, |
| "learning_rate": 4.592177464933814e-06, |
| "loss": 0.4871, |
| "mean_token_accuracy": 0.8506463766098022, |
| "num_tokens": 30771802.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 1.0658278465270996, |
| "epoch": 2.142857142857143, |
| "grad_norm": 64.7853012084961, |
| "learning_rate": 4.591141384206619e-06, |
| "loss": 0.4997, |
| "mean_token_accuracy": 0.8490427970886231, |
| "num_tokens": 30812323.0, |
| "step": 3795 |
| }, |
| { |
| "entropy": 1.0591883182525634, |
| "epoch": 2.145680406549972, |
| "grad_norm": 60.86104202270508, |
| "learning_rate": 4.590104150083383e-06, |
| "loss": 0.4585, |
| "mean_token_accuracy": 0.8590384602546692, |
| "num_tokens": 30852654.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 1.0125213027000428, |
| "epoch": 2.1485036702428006, |
| "grad_norm": 72.03414916992188, |
| "learning_rate": 4.5890657633800885e-06, |
| "loss": 0.4658, |
| "mean_token_accuracy": 0.8582098603248596, |
| "num_tokens": 30893491.0, |
| "step": 3805 |
| }, |
| { |
| "entropy": 1.1036086440086366, |
| "epoch": 2.15132693393563, |
| "grad_norm": 79.08403778076172, |
| "learning_rate": 4.588026224913621e-06, |
| "loss": 0.4996, |
| "mean_token_accuracy": 0.8465798616409301, |
| "num_tokens": 30934006.0, |
| "step": 3810 |
| }, |
| { |
| "entropy": 1.000651216506958, |
| "epoch": 2.1541501976284585, |
| "grad_norm": 76.18484497070312, |
| "learning_rate": 4.586985535501772e-06, |
| "loss": 0.4644, |
| "mean_token_accuracy": 0.8542026162147522, |
| "num_tokens": 30974555.0, |
| "step": 3815 |
| }, |
| { |
| "entropy": 1.0279491305351258, |
| "epoch": 2.1569734613212876, |
| "grad_norm": 82.19313049316406, |
| "learning_rate": 4.585943695963241e-06, |
| "loss": 0.4713, |
| "mean_token_accuracy": 0.8553337931632996, |
| "num_tokens": 31015251.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 1.0351431369781494, |
| "epoch": 2.1597967250141163, |
| "grad_norm": 74.23955535888672, |
| "learning_rate": 4.584900707117631e-06, |
| "loss": 0.4949, |
| "mean_token_accuracy": 0.8505377411842346, |
| "num_tokens": 31055845.0, |
| "step": 3825 |
| }, |
| { |
| "entropy": 1.0322145819664001, |
| "epoch": 2.162619988706945, |
| "grad_norm": 88.02880859375, |
| "learning_rate": 4.583856569785447e-06, |
| "loss": 0.4934, |
| "mean_token_accuracy": 0.8474498748779297, |
| "num_tokens": 31096141.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 0.9805418610572815, |
| "epoch": 2.165443252399774, |
| "grad_norm": 64.54179382324219, |
| "learning_rate": 4.582811284788101e-06, |
| "loss": 0.4624, |
| "mean_token_accuracy": 0.8562976837158203, |
| "num_tokens": 31136904.0, |
| "step": 3835 |
| }, |
| { |
| "entropy": 1.0411486148834228, |
| "epoch": 2.168266516092603, |
| "grad_norm": 71.99884033203125, |
| "learning_rate": 4.581764852947906e-06, |
| "loss": 0.4996, |
| "mean_token_accuracy": 0.8462500929832458, |
| "num_tokens": 31177433.0, |
| "step": 3840 |
| }, |
| { |
| "entropy": 1.0053257465362548, |
| "epoch": 2.171089779785432, |
| "grad_norm": 90.74437713623047, |
| "learning_rate": 4.580717275088077e-06, |
| "loss": 0.4753, |
| "mean_token_accuracy": 0.8529630064964294, |
| "num_tokens": 31218132.0, |
| "step": 3845 |
| }, |
| { |
| "entropy": 1.1293895483016967, |
| "epoch": 2.1739130434782608, |
| "grad_norm": 75.32796478271484, |
| "learning_rate": 4.5796685520327326e-06, |
| "loss": 0.5193, |
| "mean_token_accuracy": 0.8408478140830994, |
| "num_tokens": 31258670.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 1.0781844973564148, |
| "epoch": 2.17673630717109, |
| "grad_norm": 77.02909851074219, |
| "learning_rate": 4.578618684606889e-06, |
| "loss": 0.4716, |
| "mean_token_accuracy": 0.853575599193573, |
| "num_tokens": 31299497.0, |
| "step": 3855 |
| }, |
| { |
| "entropy": 1.1342519521713257, |
| "epoch": 2.1795595708639186, |
| "grad_norm": 82.19872283935547, |
| "learning_rate": 4.5775676736364664e-06, |
| "loss": 0.4946, |
| "mean_token_accuracy": 0.8480924963951111, |
| "num_tokens": 31340229.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 1.0584368586540223, |
| "epoch": 2.1823828345567478, |
| "grad_norm": 87.4374008178711, |
| "learning_rate": 4.57651551994828e-06, |
| "loss": 0.5106, |
| "mean_token_accuracy": 0.8445464015007019, |
| "num_tokens": 31380786.0, |
| "step": 3865 |
| }, |
| { |
| "entropy": 1.0729819297790528, |
| "epoch": 2.1852060982495765, |
| "grad_norm": 71.60746765136719, |
| "learning_rate": 4.575462224370048e-06, |
| "loss": 0.4664, |
| "mean_token_accuracy": 0.8555550336837768, |
| "num_tokens": 31421341.0, |
| "step": 3870 |
| }, |
| { |
| "entropy": 1.0335511565208435, |
| "epoch": 2.1880293619424056, |
| "grad_norm": 85.08560943603516, |
| "learning_rate": 4.574407787730387e-06, |
| "loss": 0.4892, |
| "mean_token_accuracy": 0.8508115291595459, |
| "num_tokens": 31461914.0, |
| "step": 3875 |
| }, |
| { |
| "entropy": 1.0181528091430665, |
| "epoch": 2.1908526256352343, |
| "grad_norm": 76.76180267333984, |
| "learning_rate": 4.573352210858808e-06, |
| "loss": 0.4807, |
| "mean_token_accuracy": 0.8531227588653565, |
| "num_tokens": 31502338.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 1.0412158966064453, |
| "epoch": 2.1936758893280635, |
| "grad_norm": 69.1751480102539, |
| "learning_rate": 4.57229549458572e-06, |
| "loss": 0.474, |
| "mean_token_accuracy": 0.8575033068656921, |
| "num_tokens": 31543167.0, |
| "step": 3885 |
| }, |
| { |
| "entropy": 1.0609189033508302, |
| "epoch": 2.196499153020892, |
| "grad_norm": 81.97909545898438, |
| "learning_rate": 4.571237639742432e-06, |
| "loss": 0.4969, |
| "mean_token_accuracy": 0.8498437285423279, |
| "num_tokens": 31583456.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 1.244982409477234, |
| "epoch": 2.199322416713721, |
| "grad_norm": 74.12896728515625, |
| "learning_rate": 4.570178647161144e-06, |
| "loss": 0.5426, |
| "mean_token_accuracy": 0.8337196350097656, |
| "num_tokens": 31624063.0, |
| "step": 3895 |
| }, |
| { |
| "entropy": 1.0766704201698303, |
| "epoch": 2.20214568040655, |
| "grad_norm": 73.96541595458984, |
| "learning_rate": 4.5691185176749524e-06, |
| "loss": 0.5046, |
| "mean_token_accuracy": 0.8464478969573974, |
| "num_tokens": 31664720.0, |
| "step": 3900 |
| }, |
| { |
| "entropy": 1.108527374267578, |
| "epoch": 2.2049689440993787, |
| "grad_norm": 84.28549194335938, |
| "learning_rate": 4.568057252117849e-06, |
| "loss": 0.5143, |
| "mean_token_accuracy": 0.8434839248657227, |
| "num_tokens": 31705454.0, |
| "step": 3905 |
| }, |
| { |
| "entropy": 1.099939227104187, |
| "epoch": 2.207792207792208, |
| "grad_norm": 75.40135192871094, |
| "learning_rate": 4.56699485132472e-06, |
| "loss": 0.4983, |
| "mean_token_accuracy": 0.8485525488853455, |
| "num_tokens": 31745936.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 1.175764226913452, |
| "epoch": 2.2106154714850366, |
| "grad_norm": 77.33585357666016, |
| "learning_rate": 4.565931316131344e-06, |
| "loss": 0.4914, |
| "mean_token_accuracy": 0.848324978351593, |
| "num_tokens": 31786678.0, |
| "step": 3915 |
| }, |
| { |
| "entropy": 1.0288933634757995, |
| "epoch": 2.2134387351778657, |
| "grad_norm": 87.8160400390625, |
| "learning_rate": 4.564866647374388e-06, |
| "loss": 0.4807, |
| "mean_token_accuracy": 0.8544167518615723, |
| "num_tokens": 31827383.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 1.101294755935669, |
| "epoch": 2.2162619988706944, |
| "grad_norm": 70.5611343383789, |
| "learning_rate": 4.5638008458914164e-06, |
| "loss": 0.4902, |
| "mean_token_accuracy": 0.8476052165031434, |
| "num_tokens": 31867913.0, |
| "step": 3925 |
| }, |
| { |
| "entropy": 1.151436412334442, |
| "epoch": 2.2190852625635236, |
| "grad_norm": 66.06202697753906, |
| "learning_rate": 4.562733912520883e-06, |
| "loss": 0.5328, |
| "mean_token_accuracy": 0.8389103293418885, |
| "num_tokens": 31908537.0, |
| "step": 3930 |
| }, |
| { |
| "entropy": 1.0838784456253052, |
| "epoch": 2.2219085262563523, |
| "grad_norm": 71.66177368164062, |
| "learning_rate": 4.5616658481021315e-06, |
| "loss": 0.4674, |
| "mean_token_accuracy": 0.8557949662208557, |
| "num_tokens": 31949054.0, |
| "step": 3935 |
| }, |
| { |
| "entropy": 1.0479053497314452, |
| "epoch": 2.2247317899491814, |
| "grad_norm": 67.57572174072266, |
| "learning_rate": 4.560596653475394e-06, |
| "loss": 0.478, |
| "mean_token_accuracy": 0.8529563784599304, |
| "num_tokens": 31989703.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 1.0893271923065186, |
| "epoch": 2.22755505364201, |
| "grad_norm": 77.84688568115234, |
| "learning_rate": 4.559526329481796e-06, |
| "loss": 0.4716, |
| "mean_token_accuracy": 0.8553184747695923, |
| "num_tokens": 32030477.0, |
| "step": 3945 |
| }, |
| { |
| "entropy": 1.123781180381775, |
| "epoch": 2.2303783173348393, |
| "grad_norm": 79.98429107666016, |
| "learning_rate": 4.5584548769633465e-06, |
| "loss": 0.5369, |
| "mean_token_accuracy": 0.8361510276794434, |
| "num_tokens": 32071224.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 1.1011250257492065, |
| "epoch": 2.233201581027668, |
| "grad_norm": 74.83203125, |
| "learning_rate": 4.557382296762946e-06, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.8503419160842896, |
| "num_tokens": 32111863.0, |
| "step": 3955 |
| }, |
| { |
| "entropy": 1.028159761428833, |
| "epoch": 2.2360248447204967, |
| "grad_norm": 81.0724868774414, |
| "learning_rate": 4.556308589724379e-06, |
| "loss": 0.4352, |
| "mean_token_accuracy": 0.8625659227371216, |
| "num_tokens": 32152470.0, |
| "step": 3960 |
| }, |
| { |
| "entropy": 1.0775118827819825, |
| "epoch": 2.238848108413326, |
| "grad_norm": 59.63459396362305, |
| "learning_rate": 4.555233756692319e-06, |
| "loss": 0.4762, |
| "mean_token_accuracy": 0.8518986225128173, |
| "num_tokens": 32193281.0, |
| "step": 3965 |
| }, |
| { |
| "entropy": 1.057163155078888, |
| "epoch": 2.2416713721061545, |
| "grad_norm": 95.08134460449219, |
| "learning_rate": 4.5541577985123245e-06, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.847400176525116, |
| "num_tokens": 32233871.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 1.021597111225128, |
| "epoch": 2.2444946357989837, |
| "grad_norm": 91.9679183959961, |
| "learning_rate": 4.553080716030838e-06, |
| "loss": 0.474, |
| "mean_token_accuracy": 0.8553819775581359, |
| "num_tokens": 32274313.0, |
| "step": 3975 |
| }, |
| { |
| "entropy": 1.0437519192695617, |
| "epoch": 2.2473178994918124, |
| "grad_norm": 66.00928497314453, |
| "learning_rate": 4.552002510095189e-06, |
| "loss": 0.5062, |
| "mean_token_accuracy": 0.8446116924285889, |
| "num_tokens": 32315012.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 1.0313972234725952, |
| "epoch": 2.2501411631846415, |
| "grad_norm": 85.3726577758789, |
| "learning_rate": 4.550923181553588e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.8493191599845886, |
| "num_tokens": 32355634.0, |
| "step": 3985 |
| }, |
| { |
| "entropy": 1.1947339057922364, |
| "epoch": 2.2529644268774702, |
| "grad_norm": 83.1342544555664, |
| "learning_rate": 4.5498427312551316e-06, |
| "loss": 0.5782, |
| "mean_token_accuracy": 0.8298458218574524, |
| "num_tokens": 32396388.0, |
| "step": 3990 |
| }, |
| { |
| "entropy": 1.0725741267204285, |
| "epoch": 2.2557876905702994, |
| "grad_norm": 82.63168334960938, |
| "learning_rate": 4.548761160049796e-06, |
| "loss": 0.5244, |
| "mean_token_accuracy": 0.8417415022850037, |
| "num_tokens": 32437109.0, |
| "step": 3995 |
| }, |
| { |
| "entropy": 1.0708600401878356, |
| "epoch": 2.258610954263128, |
| "grad_norm": 82.65215301513672, |
| "learning_rate": 4.54767846878844e-06, |
| "loss": 0.5044, |
| "mean_token_accuracy": 0.8447011232376098, |
| "num_tokens": 32477860.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.258610954263128, |
| "eval_entropy": 1.2540549159049987, |
| "eval_loss": 0.44106096029281616, |
| "eval_mean_token_accuracy": 0.8703569531440735, |
| "eval_num_tokens": 32477860.0, |
| "eval_runtime": 2.4511, |
| "eval_samples_per_second": 15.911, |
| "eval_steps_per_second": 2.04, |
| "step": 4000 |
| }, |
| { |
| "entropy": 1.1180259704589843, |
| "epoch": 2.2614342179559572, |
| "grad_norm": 75.52294158935547, |
| "learning_rate": 4.546594658322806e-06, |
| "loss": 0.4942, |
| "mean_token_accuracy": 0.849030327796936, |
| "num_tokens": 32518346.0, |
| "step": 4005 |
| }, |
| { |
| "entropy": 1.1107259273529053, |
| "epoch": 2.264257481648786, |
| "grad_norm": 77.14009857177734, |
| "learning_rate": 4.545509729505513e-06, |
| "loss": 0.5014, |
| "mean_token_accuracy": 0.8447972536087036, |
| "num_tokens": 32558964.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 1.1611681938171388, |
| "epoch": 2.267080745341615, |
| "grad_norm": 69.12362670898438, |
| "learning_rate": 4.544423683190061e-06, |
| "loss": 0.4876, |
| "mean_token_accuracy": 0.8494725942611694, |
| "num_tokens": 32599726.0, |
| "step": 4015 |
| }, |
| { |
| "entropy": 1.1129327774047852, |
| "epoch": 2.269904009034444, |
| "grad_norm": 85.86442565917969, |
| "learning_rate": 4.543336520230831e-06, |
| "loss": 0.484, |
| "mean_token_accuracy": 0.8541438221931458, |
| "num_tokens": 32640450.0, |
| "step": 4020 |
| }, |
| { |
| "entropy": 1.1703894138336182, |
| "epoch": 2.2727272727272725, |
| "grad_norm": 73.08917236328125, |
| "learning_rate": 4.542248241483083e-06, |
| "loss": 0.4916, |
| "mean_token_accuracy": 0.8498480677604675, |
| "num_tokens": 32680488.0, |
| "step": 4025 |
| }, |
| { |
| "entropy": 1.0600883603096007, |
| "epoch": 2.2755505364201016, |
| "grad_norm": 77.29441833496094, |
| "learning_rate": 4.541158847802949e-06, |
| "loss": 0.4882, |
| "mean_token_accuracy": 0.8504791259765625, |
| "num_tokens": 32721056.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 1.041144812107086, |
| "epoch": 2.278373800112931, |
| "grad_norm": 76.53392028808594, |
| "learning_rate": 4.540068340047446e-06, |
| "loss": 0.4705, |
| "mean_token_accuracy": 0.8558522939682007, |
| "num_tokens": 32761727.0, |
| "step": 4035 |
| }, |
| { |
| "entropy": 1.0352703213691712, |
| "epoch": 2.2811970638057595, |
| "grad_norm": 62.13428497314453, |
| "learning_rate": 4.53897671907446e-06, |
| "loss": 0.4904, |
| "mean_token_accuracy": 0.8505735397338867, |
| "num_tokens": 32802000.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 1.0567790031433106, |
| "epoch": 2.284020327498588, |
| "grad_norm": 81.95452117919922, |
| "learning_rate": 4.537883985742759e-06, |
| "loss": 0.5078, |
| "mean_token_accuracy": 0.8440911769866943, |
| "num_tokens": 32842578.0, |
| "step": 4045 |
| }, |
| { |
| "entropy": 1.1024208545684815, |
| "epoch": 2.2868435911914173, |
| "grad_norm": 67.22496032714844, |
| "learning_rate": 4.536790140911982e-06, |
| "loss": 0.5114, |
| "mean_token_accuracy": 0.8444732785224914, |
| "num_tokens": 32882686.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 1.0241257309913636, |
| "epoch": 2.289666854884246, |
| "grad_norm": 66.01680755615234, |
| "learning_rate": 4.535695185442644e-06, |
| "loss": 0.5025, |
| "mean_token_accuracy": 0.8479160904884339, |
| "num_tokens": 32923226.0, |
| "step": 4055 |
| }, |
| { |
| "entropy": 1.0995964646339416, |
| "epoch": 2.292490118577075, |
| "grad_norm": 83.64592742919922, |
| "learning_rate": 4.534599120196134e-06, |
| "loss": 0.5606, |
| "mean_token_accuracy": 0.8310135126113891, |
| "num_tokens": 32963815.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 1.085460638999939, |
| "epoch": 2.295313382269904, |
| "grad_norm": 77.6670150756836, |
| "learning_rate": 4.533501946034712e-06, |
| "loss": 0.4963, |
| "mean_token_accuracy": 0.8473443269729615, |
| "num_tokens": 33004619.0, |
| "step": 4065 |
| }, |
| { |
| "entropy": 1.0653952002525329, |
| "epoch": 2.298136645962733, |
| "grad_norm": 83.31126403808594, |
| "learning_rate": 4.532403663821513e-06, |
| "loss": 0.4863, |
| "mean_token_accuracy": 0.8500688552856446, |
| "num_tokens": 33045463.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 1.245663857460022, |
| "epoch": 2.3009599096555617, |
| "grad_norm": 86.89364624023438, |
| "learning_rate": 4.5313042744205436e-06, |
| "loss": 0.5591, |
| "mean_token_accuracy": 0.8307137608528137, |
| "num_tokens": 33086131.0, |
| "step": 4075 |
| }, |
| { |
| "entropy": 1.157560443878174, |
| "epoch": 2.303783173348391, |
| "grad_norm": 74.83079528808594, |
| "learning_rate": 4.530203778696679e-06, |
| "loss": 0.517, |
| "mean_token_accuracy": 0.8417824983596802, |
| "num_tokens": 33126676.0, |
| "step": 4080 |
| }, |
| { |
| "entropy": 1.0899484157562256, |
| "epoch": 2.3066064370412196, |
| "grad_norm": 70.4035415649414, |
| "learning_rate": 4.529102177515666e-06, |
| "loss": 0.4936, |
| "mean_token_accuracy": 0.8485661268234252, |
| "num_tokens": 33167352.0, |
| "step": 4085 |
| }, |
| { |
| "entropy": 1.0630066752433778, |
| "epoch": 2.3094297007340487, |
| "grad_norm": 74.47498321533203, |
| "learning_rate": 4.5279994717441235e-06, |
| "loss": 0.4873, |
| "mean_token_accuracy": 0.8489712834358215, |
| "num_tokens": 33208163.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 1.1108952403068542, |
| "epoch": 2.3122529644268774, |
| "grad_norm": 62.73479080200195, |
| "learning_rate": 4.526895662249534e-06, |
| "loss": 0.509, |
| "mean_token_accuracy": 0.8456613183021545, |
| "num_tokens": 33248736.0, |
| "step": 4095 |
| }, |
| { |
| "entropy": 1.0792955875396728, |
| "epoch": 2.3150762281197066, |
| "grad_norm": 70.62771606445312, |
| "learning_rate": 4.525790749900252e-06, |
| "loss": 0.5138, |
| "mean_token_accuracy": 0.8433370232582093, |
| "num_tokens": 33289452.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 1.1269087076187134, |
| "epoch": 2.3178994918125353, |
| "grad_norm": 78.53477478027344, |
| "learning_rate": 4.5246847355655e-06, |
| "loss": 0.497, |
| "mean_token_accuracy": 0.8441904664039612, |
| "num_tokens": 33330181.0, |
| "step": 4105 |
| }, |
| { |
| "entropy": 1.0207838654518127, |
| "epoch": 2.320722755505364, |
| "grad_norm": 76.28987121582031, |
| "learning_rate": 4.523577620115367e-06, |
| "loss": 0.4814, |
| "mean_token_accuracy": 0.8532437562942505, |
| "num_tokens": 33370966.0, |
| "step": 4110 |
| }, |
| { |
| "entropy": 1.0918394804000855, |
| "epoch": 2.323546019198193, |
| "grad_norm": 89.29618835449219, |
| "learning_rate": 4.522469404420805e-06, |
| "loss": 0.45, |
| "mean_token_accuracy": 0.8594119310379028, |
| "num_tokens": 33411762.0, |
| "step": 4115 |
| }, |
| { |
| "entropy": 1.079656708240509, |
| "epoch": 2.326369282891022, |
| "grad_norm": 76.26483917236328, |
| "learning_rate": 4.521360089353635e-06, |
| "loss": 0.4774, |
| "mean_token_accuracy": 0.8538398027420044, |
| "num_tokens": 33452518.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 1.1360549926757812, |
| "epoch": 2.329192546583851, |
| "grad_norm": 74.6915283203125, |
| "learning_rate": 4.520249675786544e-06, |
| "loss": 0.5148, |
| "mean_token_accuracy": 0.8441103577613831, |
| "num_tokens": 33493150.0, |
| "step": 4125 |
| }, |
| { |
| "entropy": 1.0715775847434998, |
| "epoch": 2.3320158102766797, |
| "grad_norm": 80.18423461914062, |
| "learning_rate": 4.519138164593081e-06, |
| "loss": 0.4907, |
| "mean_token_accuracy": 0.851858401298523, |
| "num_tokens": 33533504.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 1.1397679209709168, |
| "epoch": 2.334839073969509, |
| "grad_norm": 68.14742279052734, |
| "learning_rate": 4.518025556647656e-06, |
| "loss": 0.5158, |
| "mean_token_accuracy": 0.8431363701820374, |
| "num_tokens": 33573938.0, |
| "step": 4135 |
| }, |
| { |
| "entropy": 1.106320285797119, |
| "epoch": 2.3376623376623376, |
| "grad_norm": 74.76013946533203, |
| "learning_rate": 4.5169118528255455e-06, |
| "loss": 0.5011, |
| "mean_token_accuracy": 0.845686161518097, |
| "num_tokens": 33614513.0, |
| "step": 4140 |
| }, |
| { |
| "entropy": 1.10515398979187, |
| "epoch": 2.3404856013551667, |
| "grad_norm": 81.91299438476562, |
| "learning_rate": 4.515797054002888e-06, |
| "loss": 0.4993, |
| "mean_token_accuracy": 0.8454554438591003, |
| "num_tokens": 33655239.0, |
| "step": 4145 |
| }, |
| { |
| "entropy": 1.0563299179077148, |
| "epoch": 2.3433088650479954, |
| "grad_norm": 76.27633666992188, |
| "learning_rate": 4.5146811610566825e-06, |
| "loss": 0.5093, |
| "mean_token_accuracy": 0.8462665915489197, |
| "num_tokens": 33695649.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 1.1209071040153504, |
| "epoch": 2.3461321287408246, |
| "grad_norm": 74.98118591308594, |
| "learning_rate": 4.513564174864789e-06, |
| "loss": 0.4801, |
| "mean_token_accuracy": 0.854029405117035, |
| "num_tokens": 33735838.0, |
| "step": 4155 |
| }, |
| { |
| "entropy": 1.158321738243103, |
| "epoch": 2.3489553924336533, |
| "grad_norm": 80.96183013916016, |
| "learning_rate": 4.512446096305924e-06, |
| "loss": 0.5152, |
| "mean_token_accuracy": 0.8451248645782471, |
| "num_tokens": 33776577.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 1.3016461849212646, |
| "epoch": 2.3517786561264824, |
| "grad_norm": 82.59611511230469, |
| "learning_rate": 4.511326926259672e-06, |
| "loss": 0.5629, |
| "mean_token_accuracy": 0.8303740382194519, |
| "num_tokens": 33817458.0, |
| "step": 4165 |
| }, |
| { |
| "entropy": 1.1589308381080627, |
| "epoch": 2.354601919819311, |
| "grad_norm": 73.16849517822266, |
| "learning_rate": 4.510206665606467e-06, |
| "loss": 0.5079, |
| "mean_token_accuracy": 0.8437391996383667, |
| "num_tokens": 33858157.0, |
| "step": 4170 |
| }, |
| { |
| "entropy": 1.0495543360710144, |
| "epoch": 2.35742518351214, |
| "grad_norm": 74.79431915283203, |
| "learning_rate": 4.509085315227606e-06, |
| "loss": 0.5483, |
| "mean_token_accuracy": 0.8367022156715394, |
| "num_tokens": 33898868.0, |
| "step": 4175 |
| }, |
| { |
| "entropy": 1.1106764674186707, |
| "epoch": 2.360248447204969, |
| "grad_norm": 85.37079620361328, |
| "learning_rate": 4.507962876005241e-06, |
| "loss": 0.5271, |
| "mean_token_accuracy": 0.8417307376861572, |
| "num_tokens": 33939206.0, |
| "step": 4180 |
| }, |
| { |
| "entropy": 1.135885977745056, |
| "epoch": 2.3630717108977977, |
| "grad_norm": 93.99333190917969, |
| "learning_rate": 4.506839348822384e-06, |
| "loss": 0.5443, |
| "mean_token_accuracy": 0.8377628087997436, |
| "num_tokens": 33979854.0, |
| "step": 4185 |
| }, |
| { |
| "entropy": 1.1180438637733459, |
| "epoch": 2.365894974590627, |
| "grad_norm": 60.93057632446289, |
| "learning_rate": 4.5057147345628985e-06, |
| "loss": 0.5308, |
| "mean_token_accuracy": 0.8384897470474243, |
| "num_tokens": 34020341.0, |
| "step": 4190 |
| }, |
| { |
| "entropy": 1.1248787641525269, |
| "epoch": 2.3687182382834555, |
| "grad_norm": 72.28102111816406, |
| "learning_rate": 4.504589034111505e-06, |
| "loss": 0.4869, |
| "mean_token_accuracy": 0.8506036043167114, |
| "num_tokens": 34061022.0, |
| "step": 4195 |
| }, |
| { |
| "entropy": 1.140468454360962, |
| "epoch": 2.3715415019762847, |
| "grad_norm": 78.96949005126953, |
| "learning_rate": 4.503462248353781e-06, |
| "loss": 0.5261, |
| "mean_token_accuracy": 0.8412546992301941, |
| "num_tokens": 34101705.0, |
| "step": 4200 |
| }, |
| { |
| "entropy": 1.1987942218780518, |
| "epoch": 2.3743647656691134, |
| "grad_norm": 77.41453552246094, |
| "learning_rate": 4.5023343781761516e-06, |
| "loss": 0.5068, |
| "mean_token_accuracy": 0.8431342601776123, |
| "num_tokens": 34142280.0, |
| "step": 4205 |
| }, |
| { |
| "entropy": 1.128602349758148, |
| "epoch": 2.3771880293619425, |
| "grad_norm": 75.12047576904297, |
| "learning_rate": 4.501205424465902e-06, |
| "loss": 0.5026, |
| "mean_token_accuracy": 0.8446843266487122, |
| "num_tokens": 34181976.0, |
| "step": 4210 |
| }, |
| { |
| "entropy": 1.1833925724029541, |
| "epoch": 2.380011293054771, |
| "grad_norm": 75.71656799316406, |
| "learning_rate": 4.500075388111167e-06, |
| "loss": 0.5242, |
| "mean_token_accuracy": 0.8402136087417602, |
| "num_tokens": 34222876.0, |
| "step": 4215 |
| }, |
| { |
| "entropy": 1.1418941140174865, |
| "epoch": 2.3828345567476004, |
| "grad_norm": 73.26214599609375, |
| "learning_rate": 4.498944270000931e-06, |
| "loss": 0.4817, |
| "mean_token_accuracy": 0.8517927408218384, |
| "num_tokens": 34263315.0, |
| "step": 4220 |
| }, |
| { |
| "entropy": 1.1809669256210327, |
| "epoch": 2.385657820440429, |
| "grad_norm": 86.13017272949219, |
| "learning_rate": 4.497812071025031e-06, |
| "loss": 0.5071, |
| "mean_token_accuracy": 0.8443432688713074, |
| "num_tokens": 34303434.0, |
| "step": 4225 |
| }, |
| { |
| "entropy": 1.175835919380188, |
| "epoch": 2.388481084133258, |
| "grad_norm": 64.92132568359375, |
| "learning_rate": 4.496678792074157e-06, |
| "loss": 0.5243, |
| "mean_token_accuracy": 0.8411461234092712, |
| "num_tokens": 34344189.0, |
| "step": 4230 |
| }, |
| { |
| "entropy": 1.0863433599472045, |
| "epoch": 2.391304347826087, |
| "grad_norm": 76.76641845703125, |
| "learning_rate": 4.495544434039843e-06, |
| "loss": 0.4991, |
| "mean_token_accuracy": 0.8467756628990173, |
| "num_tokens": 34384662.0, |
| "step": 4235 |
| }, |
| { |
| "entropy": 1.1002304792404174, |
| "epoch": 2.3941276115189156, |
| "grad_norm": 66.72636413574219, |
| "learning_rate": 4.494408997814478e-06, |
| "loss": 0.4921, |
| "mean_token_accuracy": 0.8480172276496887, |
| "num_tokens": 34425282.0, |
| "step": 4240 |
| }, |
| { |
| "entropy": 1.0731504201889037, |
| "epoch": 2.3969508752117448, |
| "grad_norm": 72.19840240478516, |
| "learning_rate": 4.493272484291293e-06, |
| "loss": 0.486, |
| "mean_token_accuracy": 0.8504210710525513, |
| "num_tokens": 34465824.0, |
| "step": 4245 |
| }, |
| { |
| "entropy": 1.068647289276123, |
| "epoch": 2.399774138904574, |
| "grad_norm": 74.20718383789062, |
| "learning_rate": 4.4921348943643736e-06, |
| "loss": 0.4766, |
| "mean_token_accuracy": 0.8515720963478088, |
| "num_tokens": 34506561.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 1.1240432024002076, |
| "epoch": 2.4025974025974026, |
| "grad_norm": 78.96634674072266, |
| "learning_rate": 4.490996228928645e-06, |
| "loss": 0.5126, |
| "mean_token_accuracy": 0.8429255843162536, |
| "num_tokens": 34547280.0, |
| "step": 4255 |
| }, |
| { |
| "entropy": 1.026216447353363, |
| "epoch": 2.4054206662902313, |
| "grad_norm": 69.10794067382812, |
| "learning_rate": 4.489856488879882e-06, |
| "loss": 0.4543, |
| "mean_token_accuracy": 0.8586987733840943, |
| "num_tokens": 34587754.0, |
| "step": 4260 |
| }, |
| { |
| "entropy": 1.0782729625701903, |
| "epoch": 2.4082439299830605, |
| "grad_norm": 74.8205337524414, |
| "learning_rate": 4.488715675114706e-06, |
| "loss": 0.5149, |
| "mean_token_accuracy": 0.8438615202903748, |
| "num_tokens": 34628435.0, |
| "step": 4265 |
| }, |
| { |
| "entropy": 1.1428865432739257, |
| "epoch": 2.411067193675889, |
| "grad_norm": 74.1771011352539, |
| "learning_rate": 4.4875737885305825e-06, |
| "loss": 0.5406, |
| "mean_token_accuracy": 0.8364495754241943, |
| "num_tokens": 34669206.0, |
| "step": 4270 |
| }, |
| { |
| "entropy": 1.106465208530426, |
| "epoch": 2.4138904573687183, |
| "grad_norm": 65.03327178955078, |
| "learning_rate": 4.486430830025818e-06, |
| "loss": 0.5122, |
| "mean_token_accuracy": 0.8434538841247559, |
| "num_tokens": 34709829.0, |
| "step": 4275 |
| }, |
| { |
| "entropy": 1.099475383758545, |
| "epoch": 2.416713721061547, |
| "grad_norm": 75.65325164794922, |
| "learning_rate": 4.485286800499564e-06, |
| "loss": 0.486, |
| "mean_token_accuracy": 0.8493196606636048, |
| "num_tokens": 34750478.0, |
| "step": 4280 |
| }, |
| { |
| "entropy": 1.1192373633384705, |
| "epoch": 2.419536984754376, |
| "grad_norm": 91.78784942626953, |
| "learning_rate": 4.484141700851819e-06, |
| "loss": 0.5286, |
| "mean_token_accuracy": 0.841631555557251, |
| "num_tokens": 34791351.0, |
| "step": 4285 |
| }, |
| { |
| "entropy": 1.1912834763526916, |
| "epoch": 2.422360248447205, |
| "grad_norm": 60.360713958740234, |
| "learning_rate": 4.482995531983414e-06, |
| "loss": 0.5217, |
| "mean_token_accuracy": 0.8427740693092346, |
| "num_tokens": 34831811.0, |
| "step": 4290 |
| }, |
| { |
| "entropy": 1.1366937160491943, |
| "epoch": 2.425183512140034, |
| "grad_norm": 73.9558334350586, |
| "learning_rate": 4.48184829479603e-06, |
| "loss": 0.5532, |
| "mean_token_accuracy": 0.8314054369926452, |
| "num_tokens": 34872689.0, |
| "step": 4295 |
| }, |
| { |
| "entropy": 1.0390868544578553, |
| "epoch": 2.4280067758328627, |
| "grad_norm": 74.2223892211914, |
| "learning_rate": 4.480699990192184e-06, |
| "loss": 0.4752, |
| "mean_token_accuracy": 0.8538849472999572, |
| "num_tokens": 34913355.0, |
| "step": 4300 |
| }, |
| { |
| "entropy": 1.1691401720046997, |
| "epoch": 2.430830039525692, |
| "grad_norm": 70.82209777832031, |
| "learning_rate": 4.479550619075233e-06, |
| "loss": 0.5103, |
| "mean_token_accuracy": 0.843414044380188, |
| "num_tokens": 34953912.0, |
| "step": 4305 |
| }, |
| { |
| "entropy": 1.032659935951233, |
| "epoch": 2.4336533032185206, |
| "grad_norm": 75.93842315673828, |
| "learning_rate": 4.478400182349374e-06, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.848256242275238, |
| "num_tokens": 34994417.0, |
| "step": 4310 |
| }, |
| { |
| "entropy": 1.110754704475403, |
| "epoch": 2.4364765669113497, |
| "grad_norm": 84.19135284423828, |
| "learning_rate": 4.477248680919643e-06, |
| "loss": 0.5161, |
| "mean_token_accuracy": 0.8396423697471619, |
| "num_tokens": 35034735.0, |
| "step": 4315 |
| }, |
| { |
| "entropy": 1.1798797607421876, |
| "epoch": 2.4392998306041784, |
| "grad_norm": 83.10535430908203, |
| "learning_rate": 4.476096115691909e-06, |
| "loss": 0.5277, |
| "mean_token_accuracy": 0.8388562560081482, |
| "num_tokens": 35075100.0, |
| "step": 4320 |
| }, |
| { |
| "entropy": 1.1197029232978821, |
| "epoch": 2.442123094297007, |
| "grad_norm": 74.39781188964844, |
| "learning_rate": 4.474942487572886e-06, |
| "loss": 0.478, |
| "mean_token_accuracy": 0.852222740650177, |
| "num_tokens": 35115732.0, |
| "step": 4325 |
| }, |
| { |
| "entropy": 1.1389746904373168, |
| "epoch": 2.4449463579898363, |
| "grad_norm": 77.96908569335938, |
| "learning_rate": 4.473787797470117e-06, |
| "loss": 0.5159, |
| "mean_token_accuracy": 0.8402628898620605, |
| "num_tokens": 35156421.0, |
| "step": 4330 |
| }, |
| { |
| "entropy": 1.121785569190979, |
| "epoch": 2.447769621682665, |
| "grad_norm": 77.48990631103516, |
| "learning_rate": 4.472632046291984e-06, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.8415657639503479, |
| "num_tokens": 35196916.0, |
| "step": 4335 |
| }, |
| { |
| "entropy": 1.0282376170158387, |
| "epoch": 2.450592885375494, |
| "grad_norm": 85.62969970703125, |
| "learning_rate": 4.471475234947701e-06, |
| "loss": 0.4711, |
| "mean_token_accuracy": 0.8563955187797546, |
| "num_tokens": 35237302.0, |
| "step": 4340 |
| }, |
| { |
| "entropy": 1.0324091553688048, |
| "epoch": 2.453416149068323, |
| "grad_norm": 73.95626831054688, |
| "learning_rate": 4.470317364347321e-06, |
| "loss": 0.4628, |
| "mean_token_accuracy": 0.856763768196106, |
| "num_tokens": 35278024.0, |
| "step": 4345 |
| }, |
| { |
| "entropy": 1.2146705389022827, |
| "epoch": 2.456239412761152, |
| "grad_norm": 90.1251220703125, |
| "learning_rate": 4.469158435401723e-06, |
| "loss": 0.5288, |
| "mean_token_accuracy": 0.8385349392890931, |
| "num_tokens": 35317872.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 1.168654775619507, |
| "epoch": 2.4590626764539807, |
| "grad_norm": 66.60228729248047, |
| "learning_rate": 4.467998449022626e-06, |
| "loss": 0.4924, |
| "mean_token_accuracy": 0.847309160232544, |
| "num_tokens": 35358677.0, |
| "step": 4355 |
| }, |
| { |
| "entropy": 1.1382317781448363, |
| "epoch": 2.46188594014681, |
| "grad_norm": 74.64222717285156, |
| "learning_rate": 4.466837406122576e-06, |
| "loss": 0.512, |
| "mean_token_accuracy": 0.8408301472663879, |
| "num_tokens": 35399398.0, |
| "step": 4360 |
| }, |
| { |
| "entropy": 1.1394990921020507, |
| "epoch": 2.4647092038396385, |
| "grad_norm": 77.15666198730469, |
| "learning_rate": 4.465675307614952e-06, |
| "loss": 0.5254, |
| "mean_token_accuracy": 0.840006697177887, |
| "num_tokens": 35440245.0, |
| "step": 4365 |
| }, |
| { |
| "entropy": 1.1206412315368652, |
| "epoch": 2.4675324675324677, |
| "grad_norm": 66.12518310546875, |
| "learning_rate": 4.464512154413963e-06, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.8488071084022522, |
| "num_tokens": 35480513.0, |
| "step": 4370 |
| }, |
| { |
| "entropy": 1.2848740220069885, |
| "epoch": 2.4703557312252964, |
| "grad_norm": 75.88241577148438, |
| "learning_rate": 4.463347947434647e-06, |
| "loss": 0.5434, |
| "mean_token_accuracy": 0.8377517104148865, |
| "num_tokens": 35521108.0, |
| "step": 4375 |
| }, |
| { |
| "entropy": 1.170305109024048, |
| "epoch": 2.4731789949181255, |
| "grad_norm": 76.86499786376953, |
| "learning_rate": 4.462182687592875e-06, |
| "loss": 0.5246, |
| "mean_token_accuracy": 0.8381713032722473, |
| "num_tokens": 35561905.0, |
| "step": 4380 |
| }, |
| { |
| "entropy": 1.0816607832908631, |
| "epoch": 2.4760022586109542, |
| "grad_norm": 69.17800903320312, |
| "learning_rate": 4.4610163758053385e-06, |
| "loss": 0.4845, |
| "mean_token_accuracy": 0.8541519999504089, |
| "num_tokens": 35602692.0, |
| "step": 4385 |
| }, |
| { |
| "entropy": 1.1398390531539917, |
| "epoch": 2.478825522303783, |
| "grad_norm": 85.82377624511719, |
| "learning_rate": 4.459849012989564e-06, |
| "loss": 0.5134, |
| "mean_token_accuracy": 0.8435853004455567, |
| "num_tokens": 35643253.0, |
| "step": 4390 |
| }, |
| { |
| "entropy": 1.1162615537643432, |
| "epoch": 2.481648785996612, |
| "grad_norm": 72.97199249267578, |
| "learning_rate": 4.458680600063902e-06, |
| "loss": 0.4924, |
| "mean_token_accuracy": 0.8486401915550232, |
| "num_tokens": 35684015.0, |
| "step": 4395 |
| }, |
| { |
| "entropy": 1.1746024131774901, |
| "epoch": 2.4844720496894412, |
| "grad_norm": 78.95182800292969, |
| "learning_rate": 4.457511137947528e-06, |
| "loss": 0.5611, |
| "mean_token_accuracy": 0.8314670324325562, |
| "num_tokens": 35724408.0, |
| "step": 4400 |
| }, |
| { |
| "entropy": 1.1452197551727294, |
| "epoch": 2.48729531338227, |
| "grad_norm": 75.60063171386719, |
| "learning_rate": 4.456340627560444e-06, |
| "loss": 0.5306, |
| "mean_token_accuracy": 0.8401831388473511, |
| "num_tokens": 35764983.0, |
| "step": 4405 |
| }, |
| { |
| "entropy": 1.1957187175750732, |
| "epoch": 2.4901185770750986, |
| "grad_norm": 72.1159439086914, |
| "learning_rate": 4.4551690698234774e-06, |
| "loss": 0.5057, |
| "mean_token_accuracy": 0.8457194209098816, |
| "num_tokens": 35805660.0, |
| "step": 4410 |
| }, |
| { |
| "entropy": 1.1455501317977905, |
| "epoch": 2.492941840767928, |
| "grad_norm": 62.46710205078125, |
| "learning_rate": 4.4539964656582795e-06, |
| "loss": 0.5132, |
| "mean_token_accuracy": 0.8429795026779174, |
| "num_tokens": 35846379.0, |
| "step": 4415 |
| }, |
| { |
| "entropy": 1.155365228652954, |
| "epoch": 2.4957651044607565, |
| "grad_norm": 74.61618041992188, |
| "learning_rate": 4.452822815987322e-06, |
| "loss": 0.5407, |
| "mean_token_accuracy": 0.8346656799316406, |
| "num_tokens": 35886996.0, |
| "step": 4420 |
| }, |
| { |
| "entropy": 1.139185333251953, |
| "epoch": 2.4985883681535856, |
| "grad_norm": 73.73220825195312, |
| "learning_rate": 4.4516481217339035e-06, |
| "loss": 0.5251, |
| "mean_token_accuracy": 0.8383039832115173, |
| "num_tokens": 35927734.0, |
| "step": 4425 |
| }, |
| { |
| "entropy": 1.0487828493118285, |
| "epoch": 2.5014116318464144, |
| "grad_norm": 82.3303451538086, |
| "learning_rate": 4.45047238382214e-06, |
| "loss": 0.5166, |
| "mean_token_accuracy": 0.84206303358078, |
| "num_tokens": 35968230.0, |
| "step": 4430 |
| }, |
| { |
| "entropy": 1.1584802865982056, |
| "epoch": 2.5042348955392435, |
| "grad_norm": 84.05821990966797, |
| "learning_rate": 4.449295603176972e-06, |
| "loss": 0.4825, |
| "mean_token_accuracy": 0.8500637769699096, |
| "num_tokens": 36008859.0, |
| "step": 4435 |
| }, |
| { |
| "entropy": 1.091279971599579, |
| "epoch": 2.507058159232072, |
| "grad_norm": 70.52880859375, |
| "learning_rate": 4.448117780724157e-06, |
| "loss": 0.4574, |
| "mean_token_accuracy": 0.8563731193542481, |
| "num_tokens": 36049479.0, |
| "step": 4440 |
| }, |
| { |
| "entropy": 1.172715425491333, |
| "epoch": 2.5098814229249014, |
| "grad_norm": 76.75308227539062, |
| "learning_rate": 4.446938917390276e-06, |
| "loss": 0.505, |
| "mean_token_accuracy": 0.8474242210388183, |
| "num_tokens": 36090217.0, |
| "step": 4445 |
| }, |
| { |
| "entropy": 1.1646355390548706, |
| "epoch": 2.51270468661773, |
| "grad_norm": 74.46258544921875, |
| "learning_rate": 4.445759014102726e-06, |
| "loss": 0.4888, |
| "mean_token_accuracy": 0.8501151442527771, |
| "num_tokens": 36130610.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 1.1875864028930665, |
| "epoch": 2.5155279503105588, |
| "grad_norm": 80.89794158935547, |
| "learning_rate": 4.444578071789724e-06, |
| "loss": 0.507, |
| "mean_token_accuracy": 0.8434122800827026, |
| "num_tokens": 36171102.0, |
| "step": 4455 |
| }, |
| { |
| "entropy": 1.1913317441940308, |
| "epoch": 2.518351214003388, |
| "grad_norm": 70.1068344116211, |
| "learning_rate": 4.443396091380301e-06, |
| "loss": 0.532, |
| "mean_token_accuracy": 0.8388489723205567, |
| "num_tokens": 36211713.0, |
| "step": 4460 |
| }, |
| { |
| "entropy": 1.0263278841972352, |
| "epoch": 2.521174477696217, |
| "grad_norm": 68.75521850585938, |
| "learning_rate": 4.4422130738043085e-06, |
| "loss": 0.4724, |
| "mean_token_accuracy": 0.8537720799446106, |
| "num_tokens": 36252422.0, |
| "step": 4465 |
| }, |
| { |
| "entropy": 1.1719030618667603, |
| "epoch": 2.5239977413890458, |
| "grad_norm": 74.74054718017578, |
| "learning_rate": 4.4410290199924124e-06, |
| "loss": 0.5065, |
| "mean_token_accuracy": 0.8453158020973206, |
| "num_tokens": 36292740.0, |
| "step": 4470 |
| }, |
| { |
| "entropy": 1.153631329536438, |
| "epoch": 2.5268210050818745, |
| "grad_norm": 69.42818450927734, |
| "learning_rate": 4.439843930876093e-06, |
| "loss": 0.4845, |
| "mean_token_accuracy": 0.8510221719741822, |
| "num_tokens": 36333469.0, |
| "step": 4475 |
| }, |
| { |
| "entropy": 1.1073312044143677, |
| "epoch": 2.5296442687747036, |
| "grad_norm": 73.57260131835938, |
| "learning_rate": 4.4386578073876475e-06, |
| "loss": 0.4914, |
| "mean_token_accuracy": 0.850137197971344, |
| "num_tokens": 36374242.0, |
| "step": 4480 |
| }, |
| { |
| "entropy": 1.1316781520843506, |
| "epoch": 2.5324675324675323, |
| "grad_norm": 72.3252944946289, |
| "learning_rate": 4.437470650460183e-06, |
| "loss": 0.5267, |
| "mean_token_accuracy": 0.8415279865264893, |
| "num_tokens": 36414827.0, |
| "step": 4485 |
| }, |
| { |
| "entropy": 1.1887487888336181, |
| "epoch": 2.5352907961603615, |
| "grad_norm": 74.52176666259766, |
| "learning_rate": 4.4362824610276234e-06, |
| "loss": 0.5024, |
| "mean_token_accuracy": 0.8490213751792908, |
| "num_tokens": 36455421.0, |
| "step": 4490 |
| }, |
| { |
| "entropy": 1.1459205865859985, |
| "epoch": 2.53811405985319, |
| "grad_norm": 71.28825378417969, |
| "learning_rate": 4.435093240024702e-06, |
| "loss": 0.5353, |
| "mean_token_accuracy": 0.8379881143569946, |
| "num_tokens": 36496056.0, |
| "step": 4495 |
| }, |
| { |
| "entropy": 0.9882930994033814, |
| "epoch": 2.5409373235460193, |
| "grad_norm": 66.19135284423828, |
| "learning_rate": 4.433902988386966e-06, |
| "loss": 0.4861, |
| "mean_token_accuracy": 0.8508020997047424, |
| "num_tokens": 36536748.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.5409373235460193, |
| "eval_entropy": 1.29656081199646, |
| "eval_loss": 0.3836705982685089, |
| "eval_mean_token_accuracy": 0.8863630533218384, |
| "eval_num_tokens": 36536748.0, |
| "eval_runtime": 2.456, |
| "eval_samples_per_second": 15.879, |
| "eval_steps_per_second": 2.036, |
| "step": 4500 |
| }, |
| { |
| "entropy": 1.126239013671875, |
| "epoch": 2.543760587238848, |
| "grad_norm": 78.8668441772461, |
| "learning_rate": 4.432711707050772e-06, |
| "loss": 0.499, |
| "mean_token_accuracy": 0.8451325535774231, |
| "num_tokens": 36577256.0, |
| "step": 4505 |
| }, |
| { |
| "entropy": 1.1737711906433106, |
| "epoch": 2.546583850931677, |
| "grad_norm": 77.37610626220703, |
| "learning_rate": 4.431519396953287e-06, |
| "loss": 0.506, |
| "mean_token_accuracy": 0.8457542061805725, |
| "num_tokens": 36617975.0, |
| "step": 4510 |
| }, |
| { |
| "entropy": 1.240240716934204, |
| "epoch": 2.549407114624506, |
| "grad_norm": 97.81411743164062, |
| "learning_rate": 4.430326059032486e-06, |
| "loss": 0.5605, |
| "mean_token_accuracy": 0.8328823566436767, |
| "num_tokens": 36658764.0, |
| "step": 4515 |
| }, |
| { |
| "entropy": 1.1290704488754273, |
| "epoch": 2.5522303783173346, |
| "grad_norm": 67.75629425048828, |
| "learning_rate": 4.429131694227155e-06, |
| "loss": 0.4866, |
| "mean_token_accuracy": 0.8508628010749817, |
| "num_tokens": 36699563.0, |
| "step": 4520 |
| }, |
| { |
| "entropy": 1.0336142778396606, |
| "epoch": 2.5550536420101637, |
| "grad_norm": 64.22755432128906, |
| "learning_rate": 4.427936303476886e-06, |
| "loss": 0.4984, |
| "mean_token_accuracy": 0.8513097763061523, |
| "num_tokens": 36740143.0, |
| "step": 4525 |
| }, |
| { |
| "entropy": 1.1085405230522156, |
| "epoch": 2.557876905702993, |
| "grad_norm": 81.93550872802734, |
| "learning_rate": 4.426739887722079e-06, |
| "loss": 0.5394, |
| "mean_token_accuracy": 0.83584965467453, |
| "num_tokens": 36780717.0, |
| "step": 4530 |
| }, |
| { |
| "entropy": 1.146517848968506, |
| "epoch": 2.5607001693958216, |
| "grad_norm": 68.24474334716797, |
| "learning_rate": 4.4255424479039414e-06, |
| "loss": 0.5027, |
| "mean_token_accuracy": 0.8451008915901184, |
| "num_tokens": 36821365.0, |
| "step": 4535 |
| }, |
| { |
| "entropy": 1.1648792028427124, |
| "epoch": 2.5635234330886503, |
| "grad_norm": 71.85755157470703, |
| "learning_rate": 4.424343984964483e-06, |
| "loss": 0.4687, |
| "mean_token_accuracy": 0.8573217153549194, |
| "num_tokens": 36862029.0, |
| "step": 4540 |
| }, |
| { |
| "entropy": 1.3023412704467774, |
| "epoch": 2.5663466967814794, |
| "grad_norm": 77.11023712158203, |
| "learning_rate": 4.42314449984652e-06, |
| "loss": 0.5796, |
| "mean_token_accuracy": 0.8275646448135376, |
| "num_tokens": 36902879.0, |
| "step": 4545 |
| }, |
| { |
| "entropy": 1.1683407545089721, |
| "epoch": 2.5691699604743086, |
| "grad_norm": 83.67930603027344, |
| "learning_rate": 4.421943993493676e-06, |
| "loss": 0.5168, |
| "mean_token_accuracy": 0.8405487775802613, |
| "num_tokens": 36943275.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 1.1550110578536987, |
| "epoch": 2.5719932241671373, |
| "grad_norm": 82.8023452758789, |
| "learning_rate": 4.4207424668503715e-06, |
| "loss": 0.535, |
| "mean_token_accuracy": 0.8371553182601928, |
| "num_tokens": 36984050.0, |
| "step": 4555 |
| }, |
| { |
| "entropy": 1.1033604502677918, |
| "epoch": 2.574816487859966, |
| "grad_norm": 66.57779693603516, |
| "learning_rate": 4.4195399208618354e-06, |
| "loss": 0.5156, |
| "mean_token_accuracy": 0.8456992626190185, |
| "num_tokens": 37024707.0, |
| "step": 4560 |
| }, |
| { |
| "entropy": 1.1740542650222778, |
| "epoch": 2.577639751552795, |
| "grad_norm": 78.03885650634766, |
| "learning_rate": 4.418336356474097e-06, |
| "loss": 0.5059, |
| "mean_token_accuracy": 0.8445262908935547, |
| "num_tokens": 37065406.0, |
| "step": 4565 |
| }, |
| { |
| "entropy": 1.096956205368042, |
| "epoch": 2.580463015245624, |
| "grad_norm": 75.66728973388672, |
| "learning_rate": 4.4171317746339846e-06, |
| "loss": 0.5026, |
| "mean_token_accuracy": 0.844276738166809, |
| "num_tokens": 37105960.0, |
| "step": 4570 |
| }, |
| { |
| "entropy": 0.9889927744865418, |
| "epoch": 2.583286278938453, |
| "grad_norm": 59.227474212646484, |
| "learning_rate": 4.415926176289128e-06, |
| "loss": 0.4506, |
| "mean_token_accuracy": 0.8623033881187439, |
| "num_tokens": 37146789.0, |
| "step": 4575 |
| }, |
| { |
| "entropy": 1.1664816856384277, |
| "epoch": 2.5861095426312817, |
| "grad_norm": 70.5433120727539, |
| "learning_rate": 4.414719562387959e-06, |
| "loss": 0.5092, |
| "mean_token_accuracy": 0.8431604027748107, |
| "num_tokens": 37187240.0, |
| "step": 4580 |
| }, |
| { |
| "entropy": 1.1500779867172242, |
| "epoch": 2.588932806324111, |
| "grad_norm": 76.07451629638672, |
| "learning_rate": 4.413511933879705e-06, |
| "loss": 0.54, |
| "mean_token_accuracy": 0.8370203971862793, |
| "num_tokens": 37227801.0, |
| "step": 4585 |
| }, |
| { |
| "entropy": 1.150613832473755, |
| "epoch": 2.5917560700169395, |
| "grad_norm": 65.93510437011719, |
| "learning_rate": 4.412303291714394e-06, |
| "loss": 0.518, |
| "mean_token_accuracy": 0.8397469758987427, |
| "num_tokens": 37268477.0, |
| "step": 4590 |
| }, |
| { |
| "entropy": 1.0623825907707214, |
| "epoch": 2.5945793337097687, |
| "grad_norm": 59.88905334472656, |
| "learning_rate": 4.4110936368428505e-06, |
| "loss": 0.4683, |
| "mean_token_accuracy": 0.8573758602142334, |
| "num_tokens": 37309299.0, |
| "step": 4595 |
| }, |
| { |
| "entropy": 1.1400325775146485, |
| "epoch": 2.5974025974025974, |
| "grad_norm": 81.33668518066406, |
| "learning_rate": 4.4098829702166945e-06, |
| "loss": 0.5115, |
| "mean_token_accuracy": 0.8455300331115723, |
| "num_tokens": 37350031.0, |
| "step": 4600 |
| }, |
| { |
| "entropy": 1.1422762393951416, |
| "epoch": 2.600225861095426, |
| "grad_norm": 72.15546417236328, |
| "learning_rate": 4.408671292788343e-06, |
| "loss": 0.5096, |
| "mean_token_accuracy": 0.8442867636680603, |
| "num_tokens": 37390863.0, |
| "step": 4605 |
| }, |
| { |
| "entropy": 1.0977005004882812, |
| "epoch": 2.6030491247882552, |
| "grad_norm": 72.4885482788086, |
| "learning_rate": 4.40745860551101e-06, |
| "loss": 0.5053, |
| "mean_token_accuracy": 0.8472009539604187, |
| "num_tokens": 37431621.0, |
| "step": 4610 |
| }, |
| { |
| "entropy": 1.159875512123108, |
| "epoch": 2.6058723884810844, |
| "grad_norm": 78.24819946289062, |
| "learning_rate": 4.4062449093387e-06, |
| "loss": 0.5253, |
| "mean_token_accuracy": 0.8395026326179504, |
| "num_tokens": 37472515.0, |
| "step": 4615 |
| }, |
| { |
| "entropy": 1.0633836030960082, |
| "epoch": 2.608695652173913, |
| "grad_norm": 68.64786529541016, |
| "learning_rate": 4.405030205226217e-06, |
| "loss": 0.504, |
| "mean_token_accuracy": 0.8456165671348572, |
| "num_tokens": 37513418.0, |
| "step": 4620 |
| }, |
| { |
| "entropy": 1.1453440189361572, |
| "epoch": 2.611518915866742, |
| "grad_norm": 72.81915283203125, |
| "learning_rate": 4.40381449412915e-06, |
| "loss": 0.5209, |
| "mean_token_accuracy": 0.8431556344032287, |
| "num_tokens": 37554168.0, |
| "step": 4625 |
| }, |
| { |
| "entropy": 1.1775591611862182, |
| "epoch": 2.614342179559571, |
| "grad_norm": 67.98880004882812, |
| "learning_rate": 4.402597777003886e-06, |
| "loss": 0.4841, |
| "mean_token_accuracy": 0.8511347770690918, |
| "num_tokens": 37594939.0, |
| "step": 4630 |
| }, |
| { |
| "entropy": 1.1267094373703004, |
| "epoch": 2.6171654432523996, |
| "grad_norm": 72.29106140136719, |
| "learning_rate": 4.401380054807603e-06, |
| "loss": 0.528, |
| "mean_token_accuracy": 0.8410808086395264, |
| "num_tokens": 37635516.0, |
| "step": 4635 |
| }, |
| { |
| "entropy": 1.1401848554611207, |
| "epoch": 2.619988706945229, |
| "grad_norm": 84.17682647705078, |
| "learning_rate": 4.400161328498269e-06, |
| "loss": 0.5462, |
| "mean_token_accuracy": 0.8344476222991943, |
| "num_tokens": 37676273.0, |
| "step": 4640 |
| }, |
| { |
| "entropy": 1.138647985458374, |
| "epoch": 2.6228119706380575, |
| "grad_norm": 82.6788330078125, |
| "learning_rate": 4.398941599034639e-06, |
| "loss": 0.5338, |
| "mean_token_accuracy": 0.8394053339958191, |
| "num_tokens": 37716664.0, |
| "step": 4645 |
| }, |
| { |
| "entropy": 1.2400930404663086, |
| "epoch": 2.6256352343308866, |
| "grad_norm": 75.30484008789062, |
| "learning_rate": 4.397720867376262e-06, |
| "loss": 0.4985, |
| "mean_token_accuracy": 0.8481818795204162, |
| "num_tokens": 37757396.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 1.2581698894500732, |
| "epoch": 2.6284584980237153, |
| "grad_norm": 84.51012420654297, |
| "learning_rate": 4.396499134483472e-06, |
| "loss": 0.5403, |
| "mean_token_accuracy": 0.8340572237968444, |
| "num_tokens": 37797918.0, |
| "step": 4655 |
| }, |
| { |
| "entropy": 1.1882836818695068, |
| "epoch": 2.6312817617165445, |
| "grad_norm": 70.45535278320312, |
| "learning_rate": 4.395276401317392e-06, |
| "loss": 0.4958, |
| "mean_token_accuracy": 0.8483729481697082, |
| "num_tokens": 37838614.0, |
| "step": 4660 |
| }, |
| { |
| "entropy": 1.1779079675674438, |
| "epoch": 2.634105025409373, |
| "grad_norm": 75.23043060302734, |
| "learning_rate": 4.394052668839931e-06, |
| "loss": 0.504, |
| "mean_token_accuracy": 0.8456653475761413, |
| "num_tokens": 37879171.0, |
| "step": 4665 |
| }, |
| { |
| "entropy": 1.1830149650573731, |
| "epoch": 2.636928289102202, |
| "grad_norm": 75.51518249511719, |
| "learning_rate": 4.392827938013786e-06, |
| "loss": 0.5183, |
| "mean_token_accuracy": 0.8426408410072327, |
| "num_tokens": 37919636.0, |
| "step": 4670 |
| }, |
| { |
| "entropy": 1.1361161470413208, |
| "epoch": 2.639751552795031, |
| "grad_norm": 81.5411376953125, |
| "learning_rate": 4.3916022098024395e-06, |
| "loss": 0.4845, |
| "mean_token_accuracy": 0.8498412609100342, |
| "num_tokens": 37960560.0, |
| "step": 4675 |
| }, |
| { |
| "entropy": 1.1308854579925538, |
| "epoch": 2.64257481648786, |
| "grad_norm": 77.25973510742188, |
| "learning_rate": 4.390375485170154e-06, |
| "loss": 0.5472, |
| "mean_token_accuracy": 0.8356089353561401, |
| "num_tokens": 38001197.0, |
| "step": 4680 |
| }, |
| { |
| "entropy": 1.222540831565857, |
| "epoch": 2.645398080180689, |
| "grad_norm": 76.51905059814453, |
| "learning_rate": 4.3891477650819805e-06, |
| "loss": 0.5798, |
| "mean_token_accuracy": 0.8243926763534546, |
| "num_tokens": 38041972.0, |
| "step": 4685 |
| }, |
| { |
| "entropy": 1.131513738632202, |
| "epoch": 2.6482213438735176, |
| "grad_norm": 81.08806610107422, |
| "learning_rate": 4.387919050503754e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.8490037560462952, |
| "num_tokens": 38082599.0, |
| "step": 4690 |
| }, |
| { |
| "entropy": 1.0864753246307373, |
| "epoch": 2.6510446075663467, |
| "grad_norm": 76.36005401611328, |
| "learning_rate": 4.386689342402086e-06, |
| "loss": 0.4748, |
| "mean_token_accuracy": 0.8526203989982605, |
| "num_tokens": 38123268.0, |
| "step": 4695 |
| }, |
| { |
| "entropy": 1.1534552812576293, |
| "epoch": 2.6538678712591754, |
| "grad_norm": 82.96961975097656, |
| "learning_rate": 4.385458641744376e-06, |
| "loss": 0.5053, |
| "mean_token_accuracy": 0.8451831459999084, |
| "num_tokens": 38164005.0, |
| "step": 4700 |
| }, |
| { |
| "entropy": 1.2800405502319336, |
| "epoch": 2.6566911349520046, |
| "grad_norm": 89.06993865966797, |
| "learning_rate": 4.3842269494988005e-06, |
| "loss": 0.564, |
| "mean_token_accuracy": 0.8315343141555787, |
| "num_tokens": 38204063.0, |
| "step": 4705 |
| }, |
| { |
| "entropy": 1.168971014022827, |
| "epoch": 2.6595143986448333, |
| "grad_norm": 72.02811431884766, |
| "learning_rate": 4.382994266634317e-06, |
| "loss": 0.5092, |
| "mean_token_accuracy": 0.8427629709243775, |
| "num_tokens": 38244627.0, |
| "step": 4710 |
| }, |
| { |
| "entropy": 1.1421557903289794, |
| "epoch": 2.6623376623376624, |
| "grad_norm": 76.09336853027344, |
| "learning_rate": 4.381760594120664e-06, |
| "loss": 0.4969, |
| "mean_token_accuracy": 0.8478555679321289, |
| "num_tokens": 38285225.0, |
| "step": 4715 |
| }, |
| { |
| "entropy": 1.2609493494033814, |
| "epoch": 2.665160926030491, |
| "grad_norm": 80.14908599853516, |
| "learning_rate": 4.380525932928355e-06, |
| "loss": 0.5985, |
| "mean_token_accuracy": 0.8232489943504333, |
| "num_tokens": 38325900.0, |
| "step": 4720 |
| }, |
| { |
| "entropy": 1.1968196153640747, |
| "epoch": 2.6679841897233203, |
| "grad_norm": 80.70729064941406, |
| "learning_rate": 4.379290284028685e-06, |
| "loss": 0.5013, |
| "mean_token_accuracy": 0.847633171081543, |
| "num_tokens": 38366526.0, |
| "step": 4725 |
| }, |
| { |
| "entropy": 1.087080430984497, |
| "epoch": 2.670807453416149, |
| "grad_norm": 74.0415267944336, |
| "learning_rate": 4.378053648393724e-06, |
| "loss": 0.4941, |
| "mean_token_accuracy": 0.8482417821884155, |
| "num_tokens": 38407344.0, |
| "step": 4730 |
| }, |
| { |
| "entropy": 1.0690407276153564, |
| "epoch": 2.6736307171089777, |
| "grad_norm": 89.44586181640625, |
| "learning_rate": 4.376816026996317e-06, |
| "loss": 0.5069, |
| "mean_token_accuracy": 0.8457733273506165, |
| "num_tokens": 38448103.0, |
| "step": 4735 |
| }, |
| { |
| "entropy": 1.196146512031555, |
| "epoch": 2.676453980801807, |
| "grad_norm": 67.38997650146484, |
| "learning_rate": 4.375577420810089e-06, |
| "loss": 0.5479, |
| "mean_token_accuracy": 0.8355863332748413, |
| "num_tokens": 38488784.0, |
| "step": 4740 |
| }, |
| { |
| "entropy": 1.1895936965942382, |
| "epoch": 2.679277244494636, |
| "grad_norm": 80.26905059814453, |
| "learning_rate": 4.374337830809434e-06, |
| "loss": 0.5418, |
| "mean_token_accuracy": 0.8347990155220032, |
| "num_tokens": 38529528.0, |
| "step": 4745 |
| }, |
| { |
| "entropy": 1.1369389533996581, |
| "epoch": 2.6821005081874647, |
| "grad_norm": 81.48678588867188, |
| "learning_rate": 4.373097257969523e-06, |
| "loss": 0.5226, |
| "mean_token_accuracy": 0.8407035112380982, |
| "num_tokens": 38570284.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 1.1514408111572265, |
| "epoch": 2.6849237718802934, |
| "grad_norm": 70.67035675048828, |
| "learning_rate": 4.3718557032663025e-06, |
| "loss": 0.5391, |
| "mean_token_accuracy": 0.8364932179450989, |
| "num_tokens": 38611100.0, |
| "step": 4755 |
| }, |
| { |
| "entropy": 1.2531227827072144, |
| "epoch": 2.6877470355731226, |
| "grad_norm": 72.68204498291016, |
| "learning_rate": 4.370613167676486e-06, |
| "loss": 0.5643, |
| "mean_token_accuracy": 0.8283812403678894, |
| "num_tokens": 38651605.0, |
| "step": 4760 |
| }, |
| { |
| "entropy": 1.1133297443389893, |
| "epoch": 2.6905702992659517, |
| "grad_norm": 82.70066833496094, |
| "learning_rate": 4.369369652177563e-06, |
| "loss": 0.559, |
| "mean_token_accuracy": 0.8325229167938233, |
| "num_tokens": 38692228.0, |
| "step": 4765 |
| }, |
| { |
| "entropy": 1.1255622148513793, |
| "epoch": 2.6933935629587804, |
| "grad_norm": 74.11168670654297, |
| "learning_rate": 4.368125157747792e-06, |
| "loss": 0.5061, |
| "mean_token_accuracy": 0.8454763174057007, |
| "num_tokens": 38732867.0, |
| "step": 4770 |
| }, |
| { |
| "entropy": 1.0742725372314452, |
| "epoch": 2.696216826651609, |
| "grad_norm": 68.80258178710938, |
| "learning_rate": 4.366879685366202e-06, |
| "loss": 0.4704, |
| "mean_token_accuracy": 0.8558419108390808, |
| "num_tokens": 38773697.0, |
| "step": 4775 |
| }, |
| { |
| "entropy": 1.0947855234146118, |
| "epoch": 2.6990400903444383, |
| "grad_norm": 85.78025817871094, |
| "learning_rate": 4.365633236012592e-06, |
| "loss": 0.4765, |
| "mean_token_accuracy": 0.8518133997917176, |
| "num_tokens": 38814311.0, |
| "step": 4780 |
| }, |
| { |
| "entropy": 1.1548483610153197, |
| "epoch": 2.701863354037267, |
| "grad_norm": 70.87236785888672, |
| "learning_rate": 4.364385810667528e-06, |
| "loss": 0.5273, |
| "mean_token_accuracy": 0.8392892003059387, |
| "num_tokens": 38854978.0, |
| "step": 4785 |
| }, |
| { |
| "entropy": 1.131790852546692, |
| "epoch": 2.704686617730096, |
| "grad_norm": 74.22408294677734, |
| "learning_rate": 4.363137410312345e-06, |
| "loss": 0.4883, |
| "mean_token_accuracy": 0.8489778161048889, |
| "num_tokens": 38895851.0, |
| "step": 4790 |
| }, |
| { |
| "entropy": 1.1404380559921266, |
| "epoch": 2.707509881422925, |
| "grad_norm": 78.29373931884766, |
| "learning_rate": 4.361888035929144e-06, |
| "loss": 0.4853, |
| "mean_token_accuracy": 0.8509568572044373, |
| "num_tokens": 38936622.0, |
| "step": 4795 |
| }, |
| { |
| "entropy": 1.1203672647476197, |
| "epoch": 2.710333145115754, |
| "grad_norm": 81.51094818115234, |
| "learning_rate": 4.360637688500795e-06, |
| "loss": 0.4953, |
| "mean_token_accuracy": 0.8473028421401978, |
| "num_tokens": 38977231.0, |
| "step": 4800 |
| }, |
| { |
| "entropy": 1.191361141204834, |
| "epoch": 2.7131564088085827, |
| "grad_norm": 82.80384063720703, |
| "learning_rate": 4.35938636901093e-06, |
| "loss": 0.5602, |
| "mean_token_accuracy": 0.8294562697410583, |
| "num_tokens": 39017864.0, |
| "step": 4805 |
| }, |
| { |
| "entropy": 1.1114810943603515, |
| "epoch": 2.715979672501412, |
| "grad_norm": 67.35503387451172, |
| "learning_rate": 4.358134078443948e-06, |
| "loss": 0.5224, |
| "mean_token_accuracy": 0.8406103849411011, |
| "num_tokens": 39058519.0, |
| "step": 4810 |
| }, |
| { |
| "entropy": 1.1259804010391234, |
| "epoch": 2.7188029361942405, |
| "grad_norm": 73.83805084228516, |
| "learning_rate": 4.35688081778501e-06, |
| "loss": 0.5203, |
| "mean_token_accuracy": 0.8408207297325134, |
| "num_tokens": 39099066.0, |
| "step": 4815 |
| }, |
| { |
| "entropy": 1.1642281055450439, |
| "epoch": 2.721626199887069, |
| "grad_norm": 69.04283905029297, |
| "learning_rate": 4.355626588020042e-06, |
| "loss": 0.5417, |
| "mean_token_accuracy": 0.8353393197059631, |
| "num_tokens": 39139740.0, |
| "step": 4820 |
| }, |
| { |
| "entropy": 1.144355297088623, |
| "epoch": 2.7244494635798984, |
| "grad_norm": 80.45765686035156, |
| "learning_rate": 4.354371390135732e-06, |
| "loss": 0.4747, |
| "mean_token_accuracy": 0.850922417640686, |
| "num_tokens": 39180427.0, |
| "step": 4825 |
| }, |
| { |
| "entropy": 1.097638154029846, |
| "epoch": 2.7272727272727275, |
| "grad_norm": 65.31901550292969, |
| "learning_rate": 4.3531152251195286e-06, |
| "loss": 0.5098, |
| "mean_token_accuracy": 0.8428983092308044, |
| "num_tokens": 39220839.0, |
| "step": 4830 |
| }, |
| { |
| "entropy": 1.0980995416641235, |
| "epoch": 2.730095990965556, |
| "grad_norm": 83.73595428466797, |
| "learning_rate": 4.351858093959642e-06, |
| "loss": 0.5119, |
| "mean_token_accuracy": 0.8436989903450012, |
| "num_tokens": 39261561.0, |
| "step": 4835 |
| }, |
| { |
| "entropy": 1.2633298873901366, |
| "epoch": 2.732919254658385, |
| "grad_norm": 88.29671478271484, |
| "learning_rate": 4.350599997645044e-06, |
| "loss": 0.5693, |
| "mean_token_accuracy": 0.828402316570282, |
| "num_tokens": 39302258.0, |
| "step": 4840 |
| }, |
| { |
| "entropy": 1.2692286968231201, |
| "epoch": 2.735742518351214, |
| "grad_norm": 71.21768188476562, |
| "learning_rate": 4.349340937165462e-06, |
| "loss": 0.5245, |
| "mean_token_accuracy": 0.8410576820373535, |
| "num_tokens": 39342662.0, |
| "step": 4845 |
| }, |
| { |
| "entropy": 1.2515573263168336, |
| "epoch": 2.7385657820440428, |
| "grad_norm": 81.95990753173828, |
| "learning_rate": 4.348080913511383e-06, |
| "loss": 0.5276, |
| "mean_token_accuracy": 0.8408760786056518, |
| "num_tokens": 39383426.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 1.178449559211731, |
| "epoch": 2.741389045736872, |
| "grad_norm": 71.170166015625, |
| "learning_rate": 4.3468199276740565e-06, |
| "loss": 0.5232, |
| "mean_token_accuracy": 0.8398305416107178, |
| "num_tokens": 39424010.0, |
| "step": 4855 |
| }, |
| { |
| "entropy": 1.1536587238311768, |
| "epoch": 2.7442123094297006, |
| "grad_norm": 76.049072265625, |
| "learning_rate": 4.3455579806454814e-06, |
| "loss": 0.5285, |
| "mean_token_accuracy": 0.8364124417304992, |
| "num_tokens": 39464190.0, |
| "step": 4860 |
| }, |
| { |
| "entropy": 1.1046669960021973, |
| "epoch": 2.7470355731225298, |
| "grad_norm": 76.04798126220703, |
| "learning_rate": 4.344295073418419e-06, |
| "loss": 0.5186, |
| "mean_token_accuracy": 0.8435781955718994, |
| "num_tokens": 39504820.0, |
| "step": 4865 |
| }, |
| { |
| "entropy": 1.1494717836380004, |
| "epoch": 2.7498588368153585, |
| "grad_norm": 78.44160461425781, |
| "learning_rate": 4.343031206986382e-06, |
| "loss": 0.535, |
| "mean_token_accuracy": 0.8405123353004456, |
| "num_tokens": 39545451.0, |
| "step": 4870 |
| }, |
| { |
| "entropy": 1.2063377141952514, |
| "epoch": 2.7526821005081876, |
| "grad_norm": 64.78626251220703, |
| "learning_rate": 4.3417663823436395e-06, |
| "loss": 0.5318, |
| "mean_token_accuracy": 0.8373534440994262, |
| "num_tokens": 39586024.0, |
| "step": 4875 |
| }, |
| { |
| "entropy": 1.2055204868316651, |
| "epoch": 2.7555053642010163, |
| "grad_norm": 79.77772521972656, |
| "learning_rate": 4.340500600485213e-06, |
| "loss": 0.5334, |
| "mean_token_accuracy": 0.8385087370872497, |
| "num_tokens": 39626675.0, |
| "step": 4880 |
| }, |
| { |
| "entropy": 1.2107580423355102, |
| "epoch": 2.758328627893845, |
| "grad_norm": 82.31134033203125, |
| "learning_rate": 4.339233862406882e-06, |
| "loss": 0.4883, |
| "mean_token_accuracy": 0.8484996676445007, |
| "num_tokens": 39667244.0, |
| "step": 4885 |
| }, |
| { |
| "entropy": 1.0674038529396057, |
| "epoch": 2.761151891586674, |
| "grad_norm": 79.40963745117188, |
| "learning_rate": 4.3379661691051695e-06, |
| "loss": 0.5143, |
| "mean_token_accuracy": 0.8448100090026855, |
| "num_tokens": 39707813.0, |
| "step": 4890 |
| }, |
| { |
| "entropy": 1.0879161715507508, |
| "epoch": 2.7639751552795033, |
| "grad_norm": 71.71478271484375, |
| "learning_rate": 4.3366975215773564e-06, |
| "loss": 0.4987, |
| "mean_token_accuracy": 0.8481094598770141, |
| "num_tokens": 39748415.0, |
| "step": 4895 |
| }, |
| { |
| "entropy": 1.1267621994018555, |
| "epoch": 2.766798418972332, |
| "grad_norm": 87.01432037353516, |
| "learning_rate": 4.335427920821474e-06, |
| "loss": 0.5392, |
| "mean_token_accuracy": 0.837181282043457, |
| "num_tokens": 39789148.0, |
| "step": 4900 |
| }, |
| { |
| "entropy": 1.1177945613861084, |
| "epoch": 2.7696216826651607, |
| "grad_norm": 76.5605239868164, |
| "learning_rate": 4.334157367836301e-06, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.845607578754425, |
| "num_tokens": 39829885.0, |
| "step": 4905 |
| }, |
| { |
| "entropy": 1.1427584886550903, |
| "epoch": 2.77244494635799, |
| "grad_norm": 68.80554962158203, |
| "learning_rate": 4.332885863621367e-06, |
| "loss": 0.5072, |
| "mean_token_accuracy": 0.8441779494285584, |
| "num_tokens": 39870465.0, |
| "step": 4910 |
| }, |
| { |
| "entropy": 1.111513876914978, |
| "epoch": 2.7752682100508186, |
| "grad_norm": 78.51546478271484, |
| "learning_rate": 4.331613409176948e-06, |
| "loss": 0.5141, |
| "mean_token_accuracy": 0.8416643738746643, |
| "num_tokens": 39911218.0, |
| "step": 4915 |
| }, |
| { |
| "entropy": 1.125947070121765, |
| "epoch": 2.7780914737436477, |
| "grad_norm": 100.48128509521484, |
| "learning_rate": 4.330340005504069e-06, |
| "loss": 0.5525, |
| "mean_token_accuracy": 0.8351372361183167, |
| "num_tokens": 39951838.0, |
| "step": 4920 |
| }, |
| { |
| "entropy": 1.1707452058792114, |
| "epoch": 2.7809147374364764, |
| "grad_norm": 76.68048858642578, |
| "learning_rate": 4.329065653604504e-06, |
| "loss": 0.5287, |
| "mean_token_accuracy": 0.8395331859588623, |
| "num_tokens": 39992459.0, |
| "step": 4925 |
| }, |
| { |
| "entropy": 1.0487358808517455, |
| "epoch": 2.7837380011293056, |
| "grad_norm": 82.47752380371094, |
| "learning_rate": 4.327790354480767e-06, |
| "loss": 0.4733, |
| "mean_token_accuracy": 0.8563852667808532, |
| "num_tokens": 40033259.0, |
| "step": 4930 |
| }, |
| { |
| "entropy": 1.1915284037590026, |
| "epoch": 2.7865612648221343, |
| "grad_norm": 69.0811767578125, |
| "learning_rate": 4.326514109136124e-06, |
| "loss": 0.5118, |
| "mean_token_accuracy": 0.8428994297981263, |
| "num_tokens": 40073922.0, |
| "step": 4935 |
| }, |
| { |
| "entropy": 1.2053372740745545, |
| "epoch": 2.7893845285149634, |
| "grad_norm": 84.73222351074219, |
| "learning_rate": 4.325236918574579e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.8382733345031739, |
| "num_tokens": 40114429.0, |
| "step": 4940 |
| }, |
| { |
| "entropy": 1.251980471611023, |
| "epoch": 2.792207792207792, |
| "grad_norm": 79.34503173828125, |
| "learning_rate": 4.3239587838008854e-06, |
| "loss": 0.5302, |
| "mean_token_accuracy": 0.8375539660453797, |
| "num_tokens": 40155089.0, |
| "step": 4945 |
| }, |
| { |
| "entropy": 1.2567893505096435, |
| "epoch": 2.795031055900621, |
| "grad_norm": 73.0334701538086, |
| "learning_rate": 4.322679705820536e-06, |
| "loss": 0.575, |
| "mean_token_accuracy": 0.8274977684020997, |
| "num_tokens": 40195605.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 1.2066476345062256, |
| "epoch": 2.79785431959345, |
| "grad_norm": 75.87010192871094, |
| "learning_rate": 4.321399685639764e-06, |
| "loss": 0.525, |
| "mean_token_accuracy": 0.8371191024780273, |
| "num_tokens": 40236108.0, |
| "step": 4955 |
| }, |
| { |
| "entropy": 1.1763107776641846, |
| "epoch": 2.800677583286279, |
| "grad_norm": 74.75856018066406, |
| "learning_rate": 4.320118724265549e-06, |
| "loss": 0.5465, |
| "mean_token_accuracy": 0.8359721064567566, |
| "num_tokens": 40276738.0, |
| "step": 4960 |
| }, |
| { |
| "entropy": 1.0980966806411743, |
| "epoch": 2.803500846979108, |
| "grad_norm": 78.28801727294922, |
| "learning_rate": 4.318836822705607e-06, |
| "loss": 0.5079, |
| "mean_token_accuracy": 0.8445510983467102, |
| "num_tokens": 40317366.0, |
| "step": 4965 |
| }, |
| { |
| "entropy": 1.031636393070221, |
| "epoch": 2.8063241106719365, |
| "grad_norm": 73.09954071044922, |
| "learning_rate": 4.317553981968394e-06, |
| "loss": 0.4553, |
| "mean_token_accuracy": 0.8609274625778198, |
| "num_tokens": 40358076.0, |
| "step": 4970 |
| }, |
| { |
| "entropy": 1.161788272857666, |
| "epoch": 2.8091473743647657, |
| "grad_norm": 71.20985412597656, |
| "learning_rate": 4.3162702030631074e-06, |
| "loss": 0.5243, |
| "mean_token_accuracy": 0.8453732013702393, |
| "num_tokens": 40398739.0, |
| "step": 4975 |
| }, |
| { |
| "entropy": 1.1623048067092896, |
| "epoch": 2.811970638057595, |
| "grad_norm": 75.88655090332031, |
| "learning_rate": 4.314985486999679e-06, |
| "loss": 0.5438, |
| "mean_token_accuracy": 0.8361968874931336, |
| "num_tokens": 40439544.0, |
| "step": 4980 |
| }, |
| { |
| "entropy": 1.1252004146575927, |
| "epoch": 2.8147939017504235, |
| "grad_norm": 64.87108612060547, |
| "learning_rate": 4.313699834788781e-06, |
| "loss": 0.5267, |
| "mean_token_accuracy": 0.8420312166213989, |
| "num_tokens": 40480154.0, |
| "step": 4985 |
| }, |
| { |
| "entropy": 1.1458193778991699, |
| "epoch": 2.8176171654432522, |
| "grad_norm": 81.05680847167969, |
| "learning_rate": 4.312413247441819e-06, |
| "loss": 0.5012, |
| "mean_token_accuracy": 0.8471116781234741, |
| "num_tokens": 40519961.0, |
| "step": 4990 |
| }, |
| { |
| "entropy": 1.1213873147964477, |
| "epoch": 2.8204404291360814, |
| "grad_norm": 71.18330383300781, |
| "learning_rate": 4.311125725970938e-06, |
| "loss": 0.5273, |
| "mean_token_accuracy": 0.8355283141136169, |
| "num_tokens": 40560630.0, |
| "step": 4995 |
| }, |
| { |
| "entropy": 1.1988248109817505, |
| "epoch": 2.82326369282891, |
| "grad_norm": 94.21017456054688, |
| "learning_rate": 4.309837271389015e-06, |
| "loss": 0.5374, |
| "mean_token_accuracy": 0.8381911158561707, |
| "num_tokens": 40601177.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.82326369282891, |
| "eval_entropy": 1.349253487586975, |
| "eval_loss": 0.35211896896362305, |
| "eval_mean_token_accuracy": 0.8927880883216858, |
| "eval_num_tokens": 40601177.0, |
| "eval_runtime": 2.4557, |
| "eval_samples_per_second": 15.882, |
| "eval_steps_per_second": 2.036, |
| "step": 5000 |
| }, |
| { |
| "entropy": 1.1538643002510072, |
| "epoch": 2.8260869565217392, |
| "grad_norm": 74.06842803955078, |
| "learning_rate": 4.308547884709662e-06, |
| "loss": 0.501, |
| "mean_token_accuracy": 0.848921275138855, |
| "num_tokens": 40641757.0, |
| "step": 5005 |
| }, |
| { |
| "entropy": 1.1794851779937745, |
| "epoch": 2.828910220214568, |
| "grad_norm": 76.33892822265625, |
| "learning_rate": 4.307257566947225e-06, |
| "loss": 0.5416, |
| "mean_token_accuracy": 0.836717689037323, |
| "num_tokens": 40682470.0, |
| "step": 5010 |
| }, |
| { |
| "entropy": 1.1910365343093872, |
| "epoch": 2.831733483907397, |
| "grad_norm": 68.9273452758789, |
| "learning_rate": 4.30596631911678e-06, |
| "loss": 0.5393, |
| "mean_token_accuracy": 0.8378082513809204, |
| "num_tokens": 40722298.0, |
| "step": 5015 |
| }, |
| { |
| "entropy": 1.0947319865226746, |
| "epoch": 2.834556747600226, |
| "grad_norm": 77.48676300048828, |
| "learning_rate": 4.304674142234137e-06, |
| "loss": 0.4782, |
| "mean_token_accuracy": 0.853443443775177, |
| "num_tokens": 40762940.0, |
| "step": 5020 |
| }, |
| { |
| "entropy": 1.157103681564331, |
| "epoch": 2.837380011293055, |
| "grad_norm": 74.99523162841797, |
| "learning_rate": 4.303381037315837e-06, |
| "loss": 0.5632, |
| "mean_token_accuracy": 0.8314770102500916, |
| "num_tokens": 40803373.0, |
| "step": 5025 |
| }, |
| { |
| "entropy": 1.0990593075752257, |
| "epoch": 2.8402032749858837, |
| "grad_norm": 58.09937286376953, |
| "learning_rate": 4.30208700537915e-06, |
| "loss": 0.4981, |
| "mean_token_accuracy": 0.8482685565948487, |
| "num_tokens": 40844047.0, |
| "step": 5030 |
| }, |
| { |
| "entropy": 1.1404879331588744, |
| "epoch": 2.8430265386787124, |
| "grad_norm": 72.91802215576172, |
| "learning_rate": 4.300792047442074e-06, |
| "loss": 0.495, |
| "mean_token_accuracy": 0.8491962909698486, |
| "num_tokens": 40884825.0, |
| "step": 5035 |
| }, |
| { |
| "entropy": 1.1204862236976623, |
| "epoch": 2.8458498023715415, |
| "grad_norm": 61.5856819152832, |
| "learning_rate": 4.29949616452334e-06, |
| "loss": 0.5091, |
| "mean_token_accuracy": 0.8451560616493226, |
| "num_tokens": 40925345.0, |
| "step": 5040 |
| }, |
| { |
| "entropy": 1.1306064128875732, |
| "epoch": 2.8486730660643707, |
| "grad_norm": 72.08314514160156, |
| "learning_rate": 4.2981993576424015e-06, |
| "loss": 0.5112, |
| "mean_token_accuracy": 0.845184576511383, |
| "num_tokens": 40966061.0, |
| "step": 5045 |
| }, |
| { |
| "entropy": 1.1932409048080443, |
| "epoch": 2.8514963297571994, |
| "grad_norm": 66.057861328125, |
| "learning_rate": 4.296901627819442e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.8331049919128418, |
| "num_tokens": 41005966.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 1.1901793003082275, |
| "epoch": 2.854319593450028, |
| "grad_norm": 78.77214050292969, |
| "learning_rate": 4.29560297607537e-06, |
| "loss": 0.5656, |
| "mean_token_accuracy": 0.8264202117919922, |
| "num_tokens": 41046552.0, |
| "step": 5055 |
| }, |
| { |
| "entropy": 1.1799126148223877, |
| "epoch": 2.857142857142857, |
| "grad_norm": 78.53099822998047, |
| "learning_rate": 4.2943034034318185e-06, |
| "loss": 0.5488, |
| "mean_token_accuracy": 0.8338027834892273, |
| "num_tokens": 41087072.0, |
| "step": 5060 |
| }, |
| { |
| "entropy": 1.099338126182556, |
| "epoch": 2.859966120835686, |
| "grad_norm": 68.20406341552734, |
| "learning_rate": 4.293002910911147e-06, |
| "loss": 0.4488, |
| "mean_token_accuracy": 0.8597991704940796, |
| "num_tokens": 41127628.0, |
| "step": 5065 |
| }, |
| { |
| "entropy": 1.1174504041671753, |
| "epoch": 2.862789384528515, |
| "grad_norm": 69.75657653808594, |
| "learning_rate": 4.291701499536438e-06, |
| "loss": 0.4756, |
| "mean_token_accuracy": 0.8555531144142151, |
| "num_tokens": 41168327.0, |
| "step": 5070 |
| }, |
| { |
| "entropy": 1.155361032485962, |
| "epoch": 2.8656126482213438, |
| "grad_norm": 72.0925521850586, |
| "learning_rate": 4.2903991703314954e-06, |
| "loss": 0.5072, |
| "mean_token_accuracy": 0.8479303359985352, |
| "num_tokens": 41208824.0, |
| "step": 5075 |
| }, |
| { |
| "entropy": 1.1240201950073243, |
| "epoch": 2.868435911914173, |
| "grad_norm": 80.29562377929688, |
| "learning_rate": 4.289095924320846e-06, |
| "loss": 0.5227, |
| "mean_token_accuracy": 0.8412837505340576, |
| "num_tokens": 41249209.0, |
| "step": 5080 |
| }, |
| { |
| "entropy": 1.1458587884902953, |
| "epoch": 2.8712591756070016, |
| "grad_norm": 84.07976531982422, |
| "learning_rate": 4.287791762529738e-06, |
| "loss": 0.567, |
| "mean_token_accuracy": 0.8279375076293946, |
| "num_tokens": 41289873.0, |
| "step": 5085 |
| }, |
| { |
| "entropy": 1.1406369805335999, |
| "epoch": 2.8740824392998308, |
| "grad_norm": 75.96171569824219, |
| "learning_rate": 4.286486685984142e-06, |
| "loss": 0.5312, |
| "mean_token_accuracy": 0.8363900184631348, |
| "num_tokens": 41330717.0, |
| "step": 5090 |
| }, |
| { |
| "entropy": 1.117619562149048, |
| "epoch": 2.8769057029926595, |
| "grad_norm": 73.16439056396484, |
| "learning_rate": 4.285180695710742e-06, |
| "loss": 0.5419, |
| "mean_token_accuracy": 0.8377798557281494, |
| "num_tokens": 41371121.0, |
| "step": 5095 |
| }, |
| { |
| "entropy": 1.2503567218780518, |
| "epoch": 2.879728966685488, |
| "grad_norm": 80.75735473632812, |
| "learning_rate": 4.28387379273695e-06, |
| "loss": 0.5512, |
| "mean_token_accuracy": 0.83378164768219, |
| "num_tokens": 41411741.0, |
| "step": 5100 |
| }, |
| { |
| "entropy": 1.1296322226524353, |
| "epoch": 2.8825522303783173, |
| "grad_norm": 72.27934265136719, |
| "learning_rate": 4.282565978090888e-06, |
| "loss": 0.52, |
| "mean_token_accuracy": 0.8386531591415405, |
| "num_tokens": 41452180.0, |
| "step": 5105 |
| }, |
| { |
| "entropy": 1.1246942281723022, |
| "epoch": 2.8853754940711465, |
| "grad_norm": 72.93498992919922, |
| "learning_rate": 4.281257252801399e-06, |
| "loss": 0.5144, |
| "mean_token_accuracy": 0.8428778409957886, |
| "num_tokens": 41492779.0, |
| "step": 5110 |
| }, |
| { |
| "entropy": 1.1266536712646484, |
| "epoch": 2.888198757763975, |
| "grad_norm": 67.28563690185547, |
| "learning_rate": 4.279947617898042e-06, |
| "loss": 0.4951, |
| "mean_token_accuracy": 0.8494343519210815, |
| "num_tokens": 41533409.0, |
| "step": 5115 |
| }, |
| { |
| "entropy": 1.1207608938217164, |
| "epoch": 2.891022021456804, |
| "grad_norm": 76.63964080810547, |
| "learning_rate": 4.27863707441109e-06, |
| "loss": 0.5156, |
| "mean_token_accuracy": 0.8436846733093262, |
| "num_tokens": 41573962.0, |
| "step": 5120 |
| }, |
| { |
| "entropy": 1.1186698794364929, |
| "epoch": 2.893845285149633, |
| "grad_norm": 69.15577697753906, |
| "learning_rate": 4.277325623371534e-06, |
| "loss": 0.4647, |
| "mean_token_accuracy": 0.8576582551002503, |
| "num_tokens": 41614674.0, |
| "step": 5125 |
| }, |
| { |
| "entropy": 1.2201684951782226, |
| "epoch": 2.8966685488424617, |
| "grad_norm": 65.40320587158203, |
| "learning_rate": 4.276013265811075e-06, |
| "loss": 0.5036, |
| "mean_token_accuracy": 0.8451929330825806, |
| "num_tokens": 41655535.0, |
| "step": 5130 |
| }, |
| { |
| "entropy": 1.1049484729766845, |
| "epoch": 2.899491812535291, |
| "grad_norm": 69.20492553710938, |
| "learning_rate": 4.274700002762131e-06, |
| "loss": 0.5007, |
| "mean_token_accuracy": 0.848007595539093, |
| "num_tokens": 41696289.0, |
| "step": 5135 |
| }, |
| { |
| "entropy": 1.1586164236068726, |
| "epoch": 2.9023150762281196, |
| "grad_norm": 79.74908447265625, |
| "learning_rate": 4.273385835257829e-06, |
| "loss": 0.5178, |
| "mean_token_accuracy": 0.8397310614585877, |
| "num_tokens": 41736100.0, |
| "step": 5140 |
| }, |
| { |
| "entropy": 1.2396832942962646, |
| "epoch": 2.9051383399209487, |
| "grad_norm": 82.225341796875, |
| "learning_rate": 4.272070764332009e-06, |
| "loss": 0.5261, |
| "mean_token_accuracy": 0.8408646941184997, |
| "num_tokens": 41776894.0, |
| "step": 5145 |
| }, |
| { |
| "entropy": 1.249162983894348, |
| "epoch": 2.9079616036137774, |
| "grad_norm": 76.64154052734375, |
| "learning_rate": 4.270754791019224e-06, |
| "loss": 0.5298, |
| "mean_token_accuracy": 0.8383278131484986, |
| "num_tokens": 41817499.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 1.270682430267334, |
| "epoch": 2.9107848673066066, |
| "grad_norm": 81.33303833007812, |
| "learning_rate": 4.2694379163547315e-06, |
| "loss": 0.5679, |
| "mean_token_accuracy": 0.8281817555427551, |
| "num_tokens": 41858247.0, |
| "step": 5155 |
| }, |
| { |
| "entropy": 1.2870656728744507, |
| "epoch": 2.9136081309994353, |
| "grad_norm": 78.17098999023438, |
| "learning_rate": 4.268120141374503e-06, |
| "loss": 0.5402, |
| "mean_token_accuracy": 0.8346587181091308, |
| "num_tokens": 41898836.0, |
| "step": 5160 |
| }, |
| { |
| "entropy": 1.2478781938552856, |
| "epoch": 2.9164313946922644, |
| "grad_norm": 78.640869140625, |
| "learning_rate": 4.266801467115215e-06, |
| "loss": 0.5031, |
| "mean_token_accuracy": 0.8475107431411744, |
| "num_tokens": 41939549.0, |
| "step": 5165 |
| }, |
| { |
| "entropy": 1.2414112091064453, |
| "epoch": 2.919254658385093, |
| "grad_norm": 75.24764251708984, |
| "learning_rate": 4.265481894614255e-06, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.8469999432563782, |
| "num_tokens": 41980269.0, |
| "step": 5170 |
| }, |
| { |
| "entropy": 1.2622466325759887, |
| "epoch": 2.9220779220779223, |
| "grad_norm": 82.20746612548828, |
| "learning_rate": 4.264161424909713e-06, |
| "loss": 0.5352, |
| "mean_token_accuracy": 0.8388458490371704, |
| "num_tokens": 42020982.0, |
| "step": 5175 |
| }, |
| { |
| "entropy": 1.1029718399047852, |
| "epoch": 2.924901185770751, |
| "grad_norm": 69.88716888427734, |
| "learning_rate": 4.262840059040388e-06, |
| "loss": 0.4763, |
| "mean_token_accuracy": 0.8518707990646363, |
| "num_tokens": 42061751.0, |
| "step": 5180 |
| }, |
| { |
| "entropy": 1.12943195104599, |
| "epoch": 2.9277244494635797, |
| "grad_norm": 78.41839599609375, |
| "learning_rate": 4.261517798045783e-06, |
| "loss": 0.5058, |
| "mean_token_accuracy": 0.84534991979599, |
| "num_tokens": 42102450.0, |
| "step": 5185 |
| }, |
| { |
| "entropy": 1.0798253417015076, |
| "epoch": 2.930547713156409, |
| "grad_norm": 67.44549560546875, |
| "learning_rate": 4.260194642966105e-06, |
| "loss": 0.46, |
| "mean_token_accuracy": 0.8567140102386475, |
| "num_tokens": 42142939.0, |
| "step": 5190 |
| }, |
| { |
| "entropy": 1.199964690208435, |
| "epoch": 2.933370976849238, |
| "grad_norm": 78.28135681152344, |
| "learning_rate": 4.258870594842262e-06, |
| "loss": 0.5382, |
| "mean_token_accuracy": 0.8380404353141785, |
| "num_tokens": 42183415.0, |
| "step": 5195 |
| }, |
| { |
| "entropy": 1.2671225309371947, |
| "epoch": 2.9361942405420667, |
| "grad_norm": 74.52027893066406, |
| "learning_rate": 4.257545654715872e-06, |
| "loss": 0.5074, |
| "mean_token_accuracy": 0.8436587452888489, |
| "num_tokens": 42224156.0, |
| "step": 5200 |
| }, |
| { |
| "entropy": 1.1756804943084718, |
| "epoch": 2.9390175042348954, |
| "grad_norm": 66.07868194580078, |
| "learning_rate": 4.256219823629244e-06, |
| "loss": 0.5174, |
| "mean_token_accuracy": 0.841261339187622, |
| "num_tokens": 42264867.0, |
| "step": 5205 |
| }, |
| { |
| "entropy": 1.3112341165542603, |
| "epoch": 2.9418407679277245, |
| "grad_norm": 79.60833740234375, |
| "learning_rate": 4.254893102625398e-06, |
| "loss": 0.5779, |
| "mean_token_accuracy": 0.825363290309906, |
| "num_tokens": 42305366.0, |
| "step": 5210 |
| }, |
| { |
| "entropy": 1.2153887033462525, |
| "epoch": 2.9446640316205532, |
| "grad_norm": 74.00979614257812, |
| "learning_rate": 4.253565492748048e-06, |
| "loss": 0.5067, |
| "mean_token_accuracy": 0.8458914637565613, |
| "num_tokens": 42345890.0, |
| "step": 5215 |
| }, |
| { |
| "entropy": 1.1424734592437744, |
| "epoch": 2.9474872953133824, |
| "grad_norm": 59.49595642089844, |
| "learning_rate": 4.252236995041609e-06, |
| "loss": 0.5383, |
| "mean_token_accuracy": 0.8360925078392029, |
| "num_tokens": 42386578.0, |
| "step": 5220 |
| }, |
| { |
| "entropy": 1.1367671012878418, |
| "epoch": 2.950310559006211, |
| "grad_norm": 69.0216064453125, |
| "learning_rate": 4.250907610551193e-06, |
| "loss": 0.5681, |
| "mean_token_accuracy": 0.8281669616699219, |
| "num_tokens": 42426996.0, |
| "step": 5225 |
| }, |
| { |
| "entropy": 1.160671877861023, |
| "epoch": 2.9531338226990402, |
| "grad_norm": 69.35001373291016, |
| "learning_rate": 4.249577340322612e-06, |
| "loss": 0.5345, |
| "mean_token_accuracy": 0.8406946778297424, |
| "num_tokens": 42467518.0, |
| "step": 5230 |
| }, |
| { |
| "entropy": 1.21805260181427, |
| "epoch": 2.955957086391869, |
| "grad_norm": 76.58956909179688, |
| "learning_rate": 4.248246185402376e-06, |
| "loss": 0.5615, |
| "mean_token_accuracy": 0.827870512008667, |
| "num_tokens": 42508111.0, |
| "step": 5235 |
| }, |
| { |
| "entropy": 1.1750145196914672, |
| "epoch": 2.958780350084698, |
| "grad_norm": 69.74154663085938, |
| "learning_rate": 4.246914146837686e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.8525615811347962, |
| "num_tokens": 42548839.0, |
| "step": 5240 |
| }, |
| { |
| "entropy": 1.1301527976989747, |
| "epoch": 2.961603613777527, |
| "grad_norm": 70.07862091064453, |
| "learning_rate": 4.245581225676443e-06, |
| "loss": 0.4933, |
| "mean_token_accuracy": 0.8495101571083069, |
| "num_tokens": 42589430.0, |
| "step": 5245 |
| }, |
| { |
| "entropy": 1.2715572595596314, |
| "epoch": 2.9644268774703555, |
| "grad_norm": 66.82747650146484, |
| "learning_rate": 4.244247422967237e-06, |
| "loss": 0.5695, |
| "mean_token_accuracy": 0.8271044254302978, |
| "num_tokens": 42630131.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 1.203652834892273, |
| "epoch": 2.9672501411631846, |
| "grad_norm": 79.75418090820312, |
| "learning_rate": 4.2429127397593585e-06, |
| "loss": 0.5481, |
| "mean_token_accuracy": 0.8342503547668457, |
| "num_tokens": 42670817.0, |
| "step": 5255 |
| }, |
| { |
| "entropy": 1.2326645374298095, |
| "epoch": 2.970073404856014, |
| "grad_norm": 70.31961059570312, |
| "learning_rate": 4.241577177102785e-06, |
| "loss": 0.5606, |
| "mean_token_accuracy": 0.830009937286377, |
| "num_tokens": 42711537.0, |
| "step": 5260 |
| }, |
| { |
| "entropy": 1.2372756958007813, |
| "epoch": 2.9728966685488425, |
| "grad_norm": 77.57453918457031, |
| "learning_rate": 4.240240736048188e-06, |
| "loss": 0.5462, |
| "mean_token_accuracy": 0.8349868655204773, |
| "num_tokens": 42751836.0, |
| "step": 5265 |
| }, |
| { |
| "entropy": 1.0865499377250671, |
| "epoch": 2.975719932241671, |
| "grad_norm": 66.26081085205078, |
| "learning_rate": 4.23890341764693e-06, |
| "loss": 0.5199, |
| "mean_token_accuracy": 0.8439796090126037, |
| "num_tokens": 42792490.0, |
| "step": 5270 |
| }, |
| { |
| "entropy": 1.1261334896087647, |
| "epoch": 2.9785431959345003, |
| "grad_norm": 74.02761840820312, |
| "learning_rate": 4.237565222951063e-06, |
| "loss": 0.5352, |
| "mean_token_accuracy": 0.8395409226417542, |
| "num_tokens": 42832932.0, |
| "step": 5275 |
| }, |
| { |
| "entropy": 1.153111171722412, |
| "epoch": 2.981366459627329, |
| "grad_norm": 66.55201721191406, |
| "learning_rate": 4.2362261530133294e-06, |
| "loss": 0.517, |
| "mean_token_accuracy": 0.8424826622009277, |
| "num_tokens": 42873706.0, |
| "step": 5280 |
| }, |
| { |
| "entropy": 1.159202551841736, |
| "epoch": 2.984189723320158, |
| "grad_norm": 74.84986877441406, |
| "learning_rate": 4.234886208887161e-06, |
| "loss": 0.5285, |
| "mean_token_accuracy": 0.8411277890205383, |
| "num_tokens": 42914346.0, |
| "step": 5285 |
| }, |
| { |
| "entropy": 1.1488636016845704, |
| "epoch": 2.987012987012987, |
| "grad_norm": 72.25592803955078, |
| "learning_rate": 4.233545391626674e-06, |
| "loss": 0.5471, |
| "mean_token_accuracy": 0.8350270032882691, |
| "num_tokens": 42954898.0, |
| "step": 5290 |
| }, |
| { |
| "entropy": 1.1714290857315064, |
| "epoch": 2.989836250705816, |
| "grad_norm": 70.09166717529297, |
| "learning_rate": 4.232203702286673e-06, |
| "loss": 0.5587, |
| "mean_token_accuracy": 0.8305088877677917, |
| "num_tokens": 42995672.0, |
| "step": 5295 |
| }, |
| { |
| "entropy": 1.0723572731018067, |
| "epoch": 2.9926595143986447, |
| "grad_norm": 68.43204498291016, |
| "learning_rate": 4.230861141922652e-06, |
| "loss": 0.4805, |
| "mean_token_accuracy": 0.8510453224182128, |
| "num_tokens": 43036032.0, |
| "step": 5300 |
| }, |
| { |
| "entropy": 1.042343759536743, |
| "epoch": 2.995482778091474, |
| "grad_norm": 72.96656036376953, |
| "learning_rate": 4.229517711590785e-06, |
| "loss": 0.4807, |
| "mean_token_accuracy": 0.8531039834022522, |
| "num_tokens": 43076947.0, |
| "step": 5305 |
| }, |
| { |
| "entropy": 1.172229290008545, |
| "epoch": 2.9983060417843026, |
| "grad_norm": 68.45772552490234, |
| "learning_rate": 4.228173412347932e-06, |
| "loss": 0.5296, |
| "mean_token_accuracy": 0.8403589129447937, |
| "num_tokens": 43117547.0, |
| "step": 5310 |
| }, |
| { |
| "entropy": 1.2335005044937133, |
| "epoch": 3.0011293054771317, |
| "grad_norm": 91.20881652832031, |
| "learning_rate": 4.226828245251639e-06, |
| "loss": 0.5007, |
| "mean_token_accuracy": 0.8487535119056702, |
| "num_tokens": 43152204.0, |
| "step": 5315 |
| }, |
| { |
| "entropy": 1.0431846261024476, |
| "epoch": 3.0039525691699605, |
| "grad_norm": 86.15254974365234, |
| "learning_rate": 4.2254822113601326e-06, |
| "loss": 0.3079, |
| "mean_token_accuracy": 0.9018093705177307, |
| "num_tokens": 43192950.0, |
| "step": 5320 |
| }, |
| { |
| "entropy": 0.8944714188575744, |
| "epoch": 3.0067758328627896, |
| "grad_norm": 83.4747314453125, |
| "learning_rate": 4.224135311732321e-06, |
| "loss": 0.3301, |
| "mean_token_accuracy": 0.8947166442871094, |
| "num_tokens": 43233859.0, |
| "step": 5325 |
| }, |
| { |
| "entropy": 0.9511932849884033, |
| "epoch": 3.0095990965556183, |
| "grad_norm": 98.4117660522461, |
| "learning_rate": 4.222787547427796e-06, |
| "loss": 0.3137, |
| "mean_token_accuracy": 0.8992406487464905, |
| "num_tokens": 43274528.0, |
| "step": 5330 |
| }, |
| { |
| "entropy": 1.0112687826156617, |
| "epoch": 3.012422360248447, |
| "grad_norm": 81.14041900634766, |
| "learning_rate": 4.221438919506825e-06, |
| "loss": 0.3306, |
| "mean_token_accuracy": 0.8945652127265931, |
| "num_tokens": 43315196.0, |
| "step": 5335 |
| }, |
| { |
| "entropy": 1.044990885257721, |
| "epoch": 3.015245623941276, |
| "grad_norm": 80.31888580322266, |
| "learning_rate": 4.2200894290303595e-06, |
| "loss": 0.3248, |
| "mean_token_accuracy": 0.8962285280227661, |
| "num_tokens": 43355979.0, |
| "step": 5340 |
| }, |
| { |
| "entropy": 1.019908332824707, |
| "epoch": 3.018068887634105, |
| "grad_norm": 89.42028045654297, |
| "learning_rate": 4.218739077060028e-06, |
| "loss": 0.3208, |
| "mean_token_accuracy": 0.8979032635688782, |
| "num_tokens": 43396122.0, |
| "step": 5345 |
| }, |
| { |
| "entropy": 0.9794248342514038, |
| "epoch": 3.020892151326934, |
| "grad_norm": 78.72344207763672, |
| "learning_rate": 4.217387864658135e-06, |
| "loss": 0.3468, |
| "mean_token_accuracy": 0.8871222257614135, |
| "num_tokens": 43436697.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 0.9811869382858276, |
| "epoch": 3.0237154150197627, |
| "grad_norm": 79.8103256225586, |
| "learning_rate": 4.216035792887664e-06, |
| "loss": 0.3315, |
| "mean_token_accuracy": 0.8930273175239563, |
| "num_tokens": 43477397.0, |
| "step": 5355 |
| }, |
| { |
| "entropy": 0.958527147769928, |
| "epoch": 3.026538678712592, |
| "grad_norm": 75.03720092773438, |
| "learning_rate": 4.214682862812274e-06, |
| "loss": 0.3301, |
| "mean_token_accuracy": 0.8914317131042481, |
| "num_tokens": 43517882.0, |
| "step": 5360 |
| }, |
| { |
| "entropy": 0.9316562294960022, |
| "epoch": 3.0293619424054206, |
| "grad_norm": 74.98342895507812, |
| "learning_rate": 4.213329075496298e-06, |
| "loss": 0.2988, |
| "mean_token_accuracy": 0.9033580541610717, |
| "num_tokens": 43558258.0, |
| "step": 5365 |
| }, |
| { |
| "entropy": 1.0227751016616822, |
| "epoch": 3.0321852060982497, |
| "grad_norm": 90.37271118164062, |
| "learning_rate": 4.211974432004745e-06, |
| "loss": 0.3243, |
| "mean_token_accuracy": 0.8961042642593384, |
| "num_tokens": 43598741.0, |
| "step": 5370 |
| }, |
| { |
| "entropy": 0.865773344039917, |
| "epoch": 3.0350084697910784, |
| "grad_norm": 77.36763763427734, |
| "learning_rate": 4.210618933403299e-06, |
| "loss": 0.3013, |
| "mean_token_accuracy": 0.9025385260581971, |
| "num_tokens": 43639066.0, |
| "step": 5375 |
| }, |
| { |
| "entropy": 0.8853742003440856, |
| "epoch": 3.0378317334839076, |
| "grad_norm": 70.10348510742188, |
| "learning_rate": 4.209262580758311e-06, |
| "loss": 0.274, |
| "mean_token_accuracy": 0.9107710480690002, |
| "num_tokens": 43679643.0, |
| "step": 5380 |
| }, |
| { |
| "entropy": 0.9764909148216248, |
| "epoch": 3.0406549971767363, |
| "grad_norm": 92.78551483154297, |
| "learning_rate": 4.207905375136811e-06, |
| "loss": 0.3209, |
| "mean_token_accuracy": 0.8971151232719421, |
| "num_tokens": 43720215.0, |
| "step": 5385 |
| }, |
| { |
| "entropy": 0.9666017770767212, |
| "epoch": 3.0434782608695654, |
| "grad_norm": 79.50763702392578, |
| "learning_rate": 4.206547317606493e-06, |
| "loss": 0.281, |
| "mean_token_accuracy": 0.9079766511917114, |
| "num_tokens": 43761009.0, |
| "step": 5390 |
| }, |
| { |
| "entropy": 0.9946901082992554, |
| "epoch": 3.046301524562394, |
| "grad_norm": 79.8683090209961, |
| "learning_rate": 4.205188409235728e-06, |
| "loss": 0.3093, |
| "mean_token_accuracy": 0.9002600073814392, |
| "num_tokens": 43801591.0, |
| "step": 5395 |
| }, |
| { |
| "entropy": 0.9930772542953491, |
| "epoch": 3.049124788255223, |
| "grad_norm": 77.45728302001953, |
| "learning_rate": 4.203828651093551e-06, |
| "loss": 0.3177, |
| "mean_token_accuracy": 0.8979378700256347, |
| "num_tokens": 43842173.0, |
| "step": 5400 |
| }, |
| { |
| "entropy": 0.9636677026748657, |
| "epoch": 3.051948051948052, |
| "grad_norm": 77.16109466552734, |
| "learning_rate": 4.2024680442496694e-06, |
| "loss": 0.3143, |
| "mean_token_accuracy": 0.8984180212020874, |
| "num_tokens": 43882801.0, |
| "step": 5405 |
| }, |
| { |
| "entropy": 0.9925666213035583, |
| "epoch": 3.0547713156408807, |
| "grad_norm": 75.40894317626953, |
| "learning_rate": 4.2011065897744545e-06, |
| "loss": 0.3234, |
| "mean_token_accuracy": 0.8965913653373718, |
| "num_tokens": 43923429.0, |
| "step": 5410 |
| }, |
| { |
| "entropy": 1.02843519449234, |
| "epoch": 3.05759457933371, |
| "grad_norm": 57.404842376708984, |
| "learning_rate": 4.199744288738948e-06, |
| "loss": 0.3181, |
| "mean_token_accuracy": 0.8978499293327331, |
| "num_tokens": 43963461.0, |
| "step": 5415 |
| }, |
| { |
| "entropy": 0.9605874657630921, |
| "epoch": 3.0604178430265385, |
| "grad_norm": 72.65766906738281, |
| "learning_rate": 4.198381142214856e-06, |
| "loss": 0.3283, |
| "mean_token_accuracy": 0.8947899460792541, |
| "num_tokens": 44004053.0, |
| "step": 5420 |
| }, |
| { |
| "entropy": 0.9381905317306518, |
| "epoch": 3.0632411067193677, |
| "grad_norm": 81.37139892578125, |
| "learning_rate": 4.197017151274547e-06, |
| "loss": 0.3082, |
| "mean_token_accuracy": 0.8991894960403443, |
| "num_tokens": 44044520.0, |
| "step": 5425 |
| }, |
| { |
| "entropy": 1.069422674179077, |
| "epoch": 3.0660643704121964, |
| "grad_norm": 98.56239318847656, |
| "learning_rate": 4.1956523169910605e-06, |
| "loss": 0.3471, |
| "mean_token_accuracy": 0.8895352363586426, |
| "num_tokens": 44085119.0, |
| "step": 5430 |
| }, |
| { |
| "entropy": 0.8803645491600036, |
| "epoch": 3.0688876341050255, |
| "grad_norm": 69.33189392089844, |
| "learning_rate": 4.194286640438092e-06, |
| "loss": 0.2933, |
| "mean_token_accuracy": 0.9065402269363403, |
| "num_tokens": 44125683.0, |
| "step": 5435 |
| }, |
| { |
| "entropy": 1.0519742131233216, |
| "epoch": 3.0717108977978542, |
| "grad_norm": 88.03885650634766, |
| "learning_rate": 4.192920122690005e-06, |
| "loss": 0.34, |
| "mean_token_accuracy": 0.8912493824958801, |
| "num_tokens": 44166348.0, |
| "step": 5440 |
| }, |
| { |
| "entropy": 0.9997652292251586, |
| "epoch": 3.0745341614906834, |
| "grad_norm": 85.6423568725586, |
| "learning_rate": 4.191552764821823e-06, |
| "loss": 0.3317, |
| "mean_token_accuracy": 0.8926982164382935, |
| "num_tokens": 44206892.0, |
| "step": 5445 |
| }, |
| { |
| "entropy": 1.0732812404632568, |
| "epoch": 3.077357425183512, |
| "grad_norm": 72.29247283935547, |
| "learning_rate": 4.190184567909229e-06, |
| "loss": 0.3411, |
| "mean_token_accuracy": 0.89251549243927, |
| "num_tokens": 44247388.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 0.9219252943992615, |
| "epoch": 3.080180688876341, |
| "grad_norm": 93.68437957763672, |
| "learning_rate": 4.188815533028569e-06, |
| "loss": 0.3186, |
| "mean_token_accuracy": 0.8986498594284058, |
| "num_tokens": 44287827.0, |
| "step": 5455 |
| }, |
| { |
| "entropy": 1.0143481492996216, |
| "epoch": 3.08300395256917, |
| "grad_norm": 80.94513702392578, |
| "learning_rate": 4.1874456612568435e-06, |
| "loss": 0.296, |
| "mean_token_accuracy": 0.9054910182952881, |
| "num_tokens": 44327894.0, |
| "step": 5460 |
| }, |
| { |
| "entropy": 1.0362035751342773, |
| "epoch": 3.085827216261999, |
| "grad_norm": 88.25469207763672, |
| "learning_rate": 4.186074953671717e-06, |
| "loss": 0.3444, |
| "mean_token_accuracy": 0.8891435742378235, |
| "num_tokens": 44368586.0, |
| "step": 5465 |
| }, |
| { |
| "entropy": 0.9385774374008179, |
| "epoch": 3.0886504799548278, |
| "grad_norm": 66.99293518066406, |
| "learning_rate": 4.184703411351508e-06, |
| "loss": 0.3234, |
| "mean_token_accuracy": 0.8954385399818421, |
| "num_tokens": 44409209.0, |
| "step": 5470 |
| }, |
| { |
| "entropy": 0.8999487400054932, |
| "epoch": 3.0914737436476565, |
| "grad_norm": 89.90604400634766, |
| "learning_rate": 4.1833310353751935e-06, |
| "loss": 0.3418, |
| "mean_token_accuracy": 0.8911889672279358, |
| "num_tokens": 44449767.0, |
| "step": 5475 |
| }, |
| { |
| "entropy": 0.9921982645988464, |
| "epoch": 3.0942970073404856, |
| "grad_norm": 81.72102355957031, |
| "learning_rate": 4.181957826822403e-06, |
| "loss": 0.3254, |
| "mean_token_accuracy": 0.8946803331375122, |
| "num_tokens": 44490423.0, |
| "step": 5480 |
| }, |
| { |
| "entropy": 0.9456246733665467, |
| "epoch": 3.0971202710333143, |
| "grad_norm": 85.97330474853516, |
| "learning_rate": 4.1805837867734255e-06, |
| "loss": 0.332, |
| "mean_token_accuracy": 0.8957900285720826, |
| "num_tokens": 44530989.0, |
| "step": 5485 |
| }, |
| { |
| "entropy": 0.9648449778556824, |
| "epoch": 3.0999435347261435, |
| "grad_norm": 74.89656066894531, |
| "learning_rate": 4.179208916309202e-06, |
| "loss": 0.2918, |
| "mean_token_accuracy": 0.9066829919815064, |
| "num_tokens": 44571777.0, |
| "step": 5490 |
| }, |
| { |
| "entropy": 0.9347667455673218, |
| "epoch": 3.102766798418972, |
| "grad_norm": 74.6075668334961, |
| "learning_rate": 4.177833216511326e-06, |
| "loss": 0.3189, |
| "mean_token_accuracy": 0.8962038516998291, |
| "num_tokens": 44612462.0, |
| "step": 5495 |
| }, |
| { |
| "entropy": 0.9365931272506713, |
| "epoch": 3.1055900621118013, |
| "grad_norm": 76.95111846923828, |
| "learning_rate": 4.176456688462045e-06, |
| "loss": 0.3247, |
| "mean_token_accuracy": 0.8957284688949585, |
| "num_tokens": 44652841.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.1055900621118013, |
| "eval_entropy": 1.2302906274795533, |
| "eval_loss": 0.2950093150138855, |
| "eval_mean_token_accuracy": 0.9086108207702637, |
| "eval_num_tokens": 44652841.0, |
| "eval_runtime": 2.453, |
| "eval_samples_per_second": 15.899, |
| "eval_steps_per_second": 2.038, |
| "step": 5500 |
| }, |
| { |
| "entropy": 1.0472152709960938, |
| "epoch": 3.10841332580463, |
| "grad_norm": 69.57112884521484, |
| "learning_rate": 4.175079333244257e-06, |
| "loss": 0.3251, |
| "mean_token_accuracy": 0.894036340713501, |
| "num_tokens": 44693407.0, |
| "step": 5505 |
| }, |
| { |
| "entropy": 1.0573584675788879, |
| "epoch": 3.111236589497459, |
| "grad_norm": 73.89783477783203, |
| "learning_rate": 4.17370115194151e-06, |
| "loss": 0.3342, |
| "mean_token_accuracy": 0.8917381048202515, |
| "num_tokens": 44734101.0, |
| "step": 5510 |
| }, |
| { |
| "entropy": 1.1182607412338257, |
| "epoch": 3.114059853190288, |
| "grad_norm": 77.34064483642578, |
| "learning_rate": 4.172322145638004e-06, |
| "loss": 0.3338, |
| "mean_token_accuracy": 0.8926242351531982, |
| "num_tokens": 44774581.0, |
| "step": 5515 |
| }, |
| { |
| "entropy": 0.9524365782737731, |
| "epoch": 3.116883116883117, |
| "grad_norm": 79.16650390625, |
| "learning_rate": 4.1709423154185855e-06, |
| "loss": 0.2924, |
| "mean_token_accuracy": 0.9028084397315979, |
| "num_tokens": 44815304.0, |
| "step": 5520 |
| }, |
| { |
| "entropy": 0.9392463088035583, |
| "epoch": 3.1197063805759457, |
| "grad_norm": 91.71111297607422, |
| "learning_rate": 4.169561662368753e-06, |
| "loss": 0.3366, |
| "mean_token_accuracy": 0.8923919320106506, |
| "num_tokens": 44856030.0, |
| "step": 5525 |
| }, |
| { |
| "entropy": 0.9322085857391358, |
| "epoch": 3.122529644268775, |
| "grad_norm": 73.85060119628906, |
| "learning_rate": 4.168180187574649e-06, |
| "loss": 0.3098, |
| "mean_token_accuracy": 0.8995050311088562, |
| "num_tokens": 44896751.0, |
| "step": 5530 |
| }, |
| { |
| "entropy": 1.0029746174812317, |
| "epoch": 3.1253529079616036, |
| "grad_norm": 84.3502426147461, |
| "learning_rate": 4.166797892123062e-06, |
| "loss": 0.3559, |
| "mean_token_accuracy": 0.8850750803947449, |
| "num_tokens": 44937348.0, |
| "step": 5535 |
| }, |
| { |
| "entropy": 0.9944419145584107, |
| "epoch": 3.1281761716544327, |
| "grad_norm": 76.57456970214844, |
| "learning_rate": 4.1654147771014285e-06, |
| "loss": 0.3343, |
| "mean_token_accuracy": 0.8934206604957581, |
| "num_tokens": 44977981.0, |
| "step": 5540 |
| }, |
| { |
| "entropy": 1.0101613402366638, |
| "epoch": 3.1309994353472614, |
| "grad_norm": 71.5161361694336, |
| "learning_rate": 4.164030843597829e-06, |
| "loss": 0.3135, |
| "mean_token_accuracy": 0.9011573910713195, |
| "num_tokens": 45018766.0, |
| "step": 5545 |
| }, |
| { |
| "entropy": 0.9782228708267212, |
| "epoch": 3.13382269904009, |
| "grad_norm": 92.19779968261719, |
| "learning_rate": 4.1626460927009855e-06, |
| "loss": 0.3547, |
| "mean_token_accuracy": 0.8878259062767029, |
| "num_tokens": 45059560.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 1.0341717839241027, |
| "epoch": 3.1366459627329193, |
| "grad_norm": 88.86711120605469, |
| "learning_rate": 4.161260525500268e-06, |
| "loss": 0.334, |
| "mean_token_accuracy": 0.8918552994728088, |
| "num_tokens": 45100197.0, |
| "step": 5555 |
| }, |
| { |
| "entropy": 0.9899235010147095, |
| "epoch": 3.139469226425748, |
| "grad_norm": 82.83612060546875, |
| "learning_rate": 4.159874143085685e-06, |
| "loss": 0.3116, |
| "mean_token_accuracy": 0.899685287475586, |
| "num_tokens": 45140887.0, |
| "step": 5560 |
| }, |
| { |
| "entropy": 1.0915488839149474, |
| "epoch": 3.142292490118577, |
| "grad_norm": 67.84982299804688, |
| "learning_rate": 4.1584869465478846e-06, |
| "loss": 0.3385, |
| "mean_token_accuracy": 0.8910330533981323, |
| "num_tokens": 45181297.0, |
| "step": 5565 |
| }, |
| { |
| "entropy": 1.012636399269104, |
| "epoch": 3.145115753811406, |
| "grad_norm": 91.24017333984375, |
| "learning_rate": 4.157098936978162e-06, |
| "loss": 0.338, |
| "mean_token_accuracy": 0.893407940864563, |
| "num_tokens": 45222015.0, |
| "step": 5570 |
| }, |
| { |
| "entropy": 0.9732036113739013, |
| "epoch": 3.147939017504235, |
| "grad_norm": 77.7698745727539, |
| "learning_rate": 4.155710115468444e-06, |
| "loss": 0.351, |
| "mean_token_accuracy": 0.887684988975525, |
| "num_tokens": 45262703.0, |
| "step": 5575 |
| }, |
| { |
| "entropy": 0.8633365988731384, |
| "epoch": 3.1507622811970637, |
| "grad_norm": 74.60352325439453, |
| "learning_rate": 4.154320483111303e-06, |
| "loss": 0.2891, |
| "mean_token_accuracy": 0.9053243041038513, |
| "num_tokens": 45303158.0, |
| "step": 5580 |
| }, |
| { |
| "entropy": 0.9112148880958557, |
| "epoch": 3.153585544889893, |
| "grad_norm": 80.281005859375, |
| "learning_rate": 4.152930040999944e-06, |
| "loss": 0.3474, |
| "mean_token_accuracy": 0.8875756382942199, |
| "num_tokens": 45343759.0, |
| "step": 5585 |
| }, |
| { |
| "entropy": 0.9924912214279175, |
| "epoch": 3.1564088085827215, |
| "grad_norm": 66.05514526367188, |
| "learning_rate": 4.151538790228213e-06, |
| "loss": 0.3419, |
| "mean_token_accuracy": 0.8918880581855774, |
| "num_tokens": 45383961.0, |
| "step": 5590 |
| }, |
| { |
| "entropy": 0.9940900683403016, |
| "epoch": 3.1592320722755507, |
| "grad_norm": 84.66516876220703, |
| "learning_rate": 4.15014673189059e-06, |
| "loss": 0.3045, |
| "mean_token_accuracy": 0.9007033705711365, |
| "num_tokens": 45424529.0, |
| "step": 5595 |
| }, |
| { |
| "entropy": 0.9432126879692078, |
| "epoch": 3.1620553359683794, |
| "grad_norm": 77.25856018066406, |
| "learning_rate": 4.14875386708219e-06, |
| "loss": 0.3152, |
| "mean_token_accuracy": 0.8971645593643188, |
| "num_tokens": 45465227.0, |
| "step": 5600 |
| }, |
| { |
| "entropy": 0.9745931267738343, |
| "epoch": 3.1648785996612085, |
| "grad_norm": 81.02576446533203, |
| "learning_rate": 4.147360196898763e-06, |
| "loss": 0.3568, |
| "mean_token_accuracy": 0.8874773502349853, |
| "num_tokens": 45505830.0, |
| "step": 5605 |
| }, |
| { |
| "entropy": 0.92342050075531, |
| "epoch": 3.1677018633540373, |
| "grad_norm": 85.53850555419922, |
| "learning_rate": 4.145965722436695e-06, |
| "loss": 0.3219, |
| "mean_token_accuracy": 0.8958787202835083, |
| "num_tokens": 45546417.0, |
| "step": 5610 |
| }, |
| { |
| "entropy": 0.9303161263465881, |
| "epoch": 3.170525127046866, |
| "grad_norm": 94.72078704833984, |
| "learning_rate": 4.144570444793002e-06, |
| "loss": 0.3571, |
| "mean_token_accuracy": 0.8850953698158264, |
| "num_tokens": 45587028.0, |
| "step": 5615 |
| }, |
| { |
| "entropy": 1.091172707080841, |
| "epoch": 3.173348390739695, |
| "grad_norm": 79.4405288696289, |
| "learning_rate": 4.14317436506533e-06, |
| "loss": 0.3378, |
| "mean_token_accuracy": 0.8912994861602783, |
| "num_tokens": 45627727.0, |
| "step": 5620 |
| }, |
| { |
| "entropy": 1.1407584190368651, |
| "epoch": 3.176171654432524, |
| "grad_norm": 83.2149658203125, |
| "learning_rate": 4.14177748435196e-06, |
| "loss": 0.3481, |
| "mean_token_accuracy": 0.8884922146797181, |
| "num_tokens": 45668433.0, |
| "step": 5625 |
| }, |
| { |
| "entropy": 1.024580430984497, |
| "epoch": 3.178994918125353, |
| "grad_norm": 79.87594604492188, |
| "learning_rate": 4.140379803751803e-06, |
| "loss": 0.3253, |
| "mean_token_accuracy": 0.8933813095092773, |
| "num_tokens": 45709184.0, |
| "step": 5630 |
| }, |
| { |
| "entropy": 1.0046086192131043, |
| "epoch": 3.1818181818181817, |
| "grad_norm": 96.62107849121094, |
| "learning_rate": 4.138981324364394e-06, |
| "loss": 0.3872, |
| "mean_token_accuracy": 0.8781454682350158, |
| "num_tokens": 45749907.0, |
| "step": 5635 |
| }, |
| { |
| "entropy": 0.9690511703491211, |
| "epoch": 3.184641445511011, |
| "grad_norm": 70.9329605102539, |
| "learning_rate": 4.137582047289903e-06, |
| "loss": 0.3215, |
| "mean_token_accuracy": 0.8958381295204163, |
| "num_tokens": 45790644.0, |
| "step": 5640 |
| }, |
| { |
| "entropy": 1.0030096292495727, |
| "epoch": 3.1874647092038395, |
| "grad_norm": 68.79771423339844, |
| "learning_rate": 4.1361819736291244e-06, |
| "loss": 0.3199, |
| "mean_token_accuracy": 0.8972474575042725, |
| "num_tokens": 45831271.0, |
| "step": 5645 |
| }, |
| { |
| "entropy": 0.9273014307022095, |
| "epoch": 3.1902879728966687, |
| "grad_norm": 87.60462951660156, |
| "learning_rate": 4.134781104483479e-06, |
| "loss": 0.3464, |
| "mean_token_accuracy": 0.8891156673431396, |
| "num_tokens": 45871883.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 0.9342420339584351, |
| "epoch": 3.1931112365894974, |
| "grad_norm": 78.78738403320312, |
| "learning_rate": 4.133379440955014e-06, |
| "loss": 0.3088, |
| "mean_token_accuracy": 0.8999145150184631, |
| "num_tokens": 45912620.0, |
| "step": 5655 |
| }, |
| { |
| "entropy": 0.9332594275474548, |
| "epoch": 3.1959345002823265, |
| "grad_norm": 70.32312774658203, |
| "learning_rate": 4.131976984146401e-06, |
| "loss": 0.3419, |
| "mean_token_accuracy": 0.8888086795806884, |
| "num_tokens": 45953260.0, |
| "step": 5660 |
| }, |
| { |
| "entropy": 0.9139317393302917, |
| "epoch": 3.198757763975155, |
| "grad_norm": 72.15721893310547, |
| "learning_rate": 4.130573735160937e-06, |
| "loss": 0.3126, |
| "mean_token_accuracy": 0.9001538991928101, |
| "num_tokens": 45993897.0, |
| "step": 5665 |
| }, |
| { |
| "entropy": 1.0630685806274414, |
| "epoch": 3.2015810276679844, |
| "grad_norm": 81.17543029785156, |
| "learning_rate": 4.129169695102541e-06, |
| "loss": 0.3618, |
| "mean_token_accuracy": 0.8834628224372864, |
| "num_tokens": 46034427.0, |
| "step": 5670 |
| }, |
| { |
| "entropy": 0.9357471466064453, |
| "epoch": 3.204404291360813, |
| "grad_norm": 88.41699981689453, |
| "learning_rate": 4.127764865075755e-06, |
| "loss": 0.3283, |
| "mean_token_accuracy": 0.8928990125656128, |
| "num_tokens": 46074757.0, |
| "step": 5675 |
| }, |
| { |
| "entropy": 0.9587173461914062, |
| "epoch": 3.207227555053642, |
| "grad_norm": 78.16218566894531, |
| "learning_rate": 4.126359246185741e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.888952374458313, |
| "num_tokens": 46115210.0, |
| "step": 5680 |
| }, |
| { |
| "entropy": 0.97600417137146, |
| "epoch": 3.210050818746471, |
| "grad_norm": 86.20440673828125, |
| "learning_rate": 4.124952839538284e-06, |
| "loss": 0.3607, |
| "mean_token_accuracy": 0.8835627913475037, |
| "num_tokens": 46155735.0, |
| "step": 5685 |
| }, |
| { |
| "entropy": 0.9152452230453492, |
| "epoch": 3.2128740824393, |
| "grad_norm": 67.98258209228516, |
| "learning_rate": 4.123545646239787e-06, |
| "loss": 0.3209, |
| "mean_token_accuracy": 0.8964808464050293, |
| "num_tokens": 46195983.0, |
| "step": 5690 |
| }, |
| { |
| "entropy": 0.960852038860321, |
| "epoch": 3.2156973461321288, |
| "grad_norm": 79.35540008544922, |
| "learning_rate": 4.122137667397272e-06, |
| "loss": 0.3052, |
| "mean_token_accuracy": 0.9016862750053406, |
| "num_tokens": 46236777.0, |
| "step": 5695 |
| }, |
| { |
| "entropy": 1.0134177327156066, |
| "epoch": 3.2185206098249575, |
| "grad_norm": 79.44649505615234, |
| "learning_rate": 4.1207289041183805e-06, |
| "loss": 0.311, |
| "mean_token_accuracy": 0.8982547640800476, |
| "num_tokens": 46277026.0, |
| "step": 5700 |
| }, |
| { |
| "entropy": 1.012182354927063, |
| "epoch": 3.2213438735177866, |
| "grad_norm": 86.40589904785156, |
| "learning_rate": 4.1193193575113685e-06, |
| "loss": 0.3175, |
| "mean_token_accuracy": 0.898042368888855, |
| "num_tokens": 46317567.0, |
| "step": 5705 |
| }, |
| { |
| "entropy": 0.9427968263626099, |
| "epoch": 3.2241671372106153, |
| "grad_norm": 76.3041000366211, |
| "learning_rate": 4.117909028685108e-06, |
| "loss": 0.3382, |
| "mean_token_accuracy": 0.8900476813316345, |
| "num_tokens": 46358373.0, |
| "step": 5710 |
| }, |
| { |
| "entropy": 0.9410587787628174, |
| "epoch": 3.2269904009034445, |
| "grad_norm": 72.00779724121094, |
| "learning_rate": 4.116497918749093e-06, |
| "loss": 0.3349, |
| "mean_token_accuracy": 0.8919649243354797, |
| "num_tokens": 46399093.0, |
| "step": 5715 |
| }, |
| { |
| "entropy": 0.8953568816184998, |
| "epoch": 3.229813664596273, |
| "grad_norm": 84.89020538330078, |
| "learning_rate": 4.115086028813422e-06, |
| "loss": 0.3361, |
| "mean_token_accuracy": 0.8915575385093689, |
| "num_tokens": 46439722.0, |
| "step": 5720 |
| }, |
| { |
| "entropy": 0.892897367477417, |
| "epoch": 3.2326369282891023, |
| "grad_norm": 68.52616119384766, |
| "learning_rate": 4.113673359988814e-06, |
| "loss": 0.3132, |
| "mean_token_accuracy": 0.8966469287872314, |
| "num_tokens": 46479801.0, |
| "step": 5725 |
| }, |
| { |
| "entropy": 0.9588320255279541, |
| "epoch": 3.235460191981931, |
| "grad_norm": 92.84066009521484, |
| "learning_rate": 4.112259913386599e-06, |
| "loss": 0.3385, |
| "mean_token_accuracy": 0.8905380010604859, |
| "num_tokens": 46520573.0, |
| "step": 5730 |
| }, |
| { |
| "entropy": 0.9230941891670227, |
| "epoch": 3.23828345567476, |
| "grad_norm": 75.7746810913086, |
| "learning_rate": 4.110845690118718e-06, |
| "loss": 0.3411, |
| "mean_token_accuracy": 0.8905932664871216, |
| "num_tokens": 46560697.0, |
| "step": 5735 |
| }, |
| { |
| "entropy": 0.975538969039917, |
| "epoch": 3.241106719367589, |
| "grad_norm": 97.85185241699219, |
| "learning_rate": 4.109430691297724e-06, |
| "loss": 0.3104, |
| "mean_token_accuracy": 0.8992527008056641, |
| "num_tokens": 46601246.0, |
| "step": 5740 |
| }, |
| { |
| "entropy": 1.0089300632476808, |
| "epoch": 3.243929983060418, |
| "grad_norm": 73.65741729736328, |
| "learning_rate": 4.10801491803678e-06, |
| "loss": 0.354, |
| "mean_token_accuracy": 0.887215518951416, |
| "num_tokens": 46641991.0, |
| "step": 5745 |
| }, |
| { |
| "entropy": 0.9730260252952576, |
| "epoch": 3.2467532467532467, |
| "grad_norm": 87.6568374633789, |
| "learning_rate": 4.106598371449659e-06, |
| "loss": 0.3361, |
| "mean_token_accuracy": 0.8924851059913635, |
| "num_tokens": 46682519.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 0.9061202526092529, |
| "epoch": 3.249576510446076, |
| "grad_norm": 76.36370849609375, |
| "learning_rate": 4.105181052650739e-06, |
| "loss": 0.3222, |
| "mean_token_accuracy": 0.8965223670005799, |
| "num_tokens": 46723263.0, |
| "step": 5755 |
| }, |
| { |
| "entropy": 0.9181980729103089, |
| "epoch": 3.2523997741389046, |
| "grad_norm": 78.70237731933594, |
| "learning_rate": 4.10376296275501e-06, |
| "loss": 0.303, |
| "mean_token_accuracy": 0.9024981141090394, |
| "num_tokens": 46764047.0, |
| "step": 5760 |
| }, |
| { |
| "entropy": 0.9052846193313598, |
| "epoch": 3.2552230378317333, |
| "grad_norm": 72.70726013183594, |
| "learning_rate": 4.1023441028780655e-06, |
| "loss": 0.336, |
| "mean_token_accuracy": 0.8925952315330505, |
| "num_tokens": 46804692.0, |
| "step": 5765 |
| }, |
| { |
| "entropy": 1.03635413646698, |
| "epoch": 3.2580463015245624, |
| "grad_norm": 77.33209228515625, |
| "learning_rate": 4.100924474136105e-06, |
| "loss": 0.357, |
| "mean_token_accuracy": 0.8842707514762879, |
| "num_tokens": 46845216.0, |
| "step": 5770 |
| }, |
| { |
| "entropy": 0.9469284415245056, |
| "epoch": 3.260869565217391, |
| "grad_norm": 81.36337280273438, |
| "learning_rate": 4.099504077645936e-06, |
| "loss": 0.3423, |
| "mean_token_accuracy": 0.8893475532531738, |
| "num_tokens": 46885838.0, |
| "step": 5775 |
| }, |
| { |
| "entropy": 0.92877277135849, |
| "epoch": 3.2636928289102203, |
| "grad_norm": 65.81452178955078, |
| "learning_rate": 4.098082914524966e-06, |
| "loss": 0.3517, |
| "mean_token_accuracy": 0.8867884755134583, |
| "num_tokens": 46926701.0, |
| "step": 5780 |
| }, |
| { |
| "entropy": 0.9620746612548828, |
| "epoch": 3.266516092603049, |
| "grad_norm": 86.10861206054688, |
| "learning_rate": 4.096660985891207e-06, |
| "loss": 0.3372, |
| "mean_token_accuracy": 0.8917975783348083, |
| "num_tokens": 46967294.0, |
| "step": 5785 |
| }, |
| { |
| "entropy": 0.8969259858131409, |
| "epoch": 3.269339356295878, |
| "grad_norm": 77.28433990478516, |
| "learning_rate": 4.095238292863273e-06, |
| "loss": 0.3342, |
| "mean_token_accuracy": 0.8923115849494934, |
| "num_tokens": 47007892.0, |
| "step": 5790 |
| }, |
| { |
| "entropy": 0.9607129454612732, |
| "epoch": 3.272162619988707, |
| "grad_norm": 80.7704086303711, |
| "learning_rate": 4.093814836560381e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8840513944625854, |
| "num_tokens": 47048615.0, |
| "step": 5795 |
| }, |
| { |
| "entropy": 0.9498526930809021, |
| "epoch": 3.274985883681536, |
| "grad_norm": 95.09829711914062, |
| "learning_rate": 4.092390618102346e-06, |
| "loss": 0.338, |
| "mean_token_accuracy": 0.8934663891792297, |
| "num_tokens": 47088694.0, |
| "step": 5800 |
| }, |
| { |
| "entropy": 1.060532569885254, |
| "epoch": 3.2778091473743647, |
| "grad_norm": 80.7383041381836, |
| "learning_rate": 4.0909656386095854e-06, |
| "loss": 0.3493, |
| "mean_token_accuracy": 0.8909806966781616, |
| "num_tokens": 47129331.0, |
| "step": 5805 |
| }, |
| { |
| "entropy": 1.0316852688789369, |
| "epoch": 3.280632411067194, |
| "grad_norm": 83.49606323242188, |
| "learning_rate": 4.089539899203111e-06, |
| "loss": 0.366, |
| "mean_token_accuracy": 0.8828284740447998, |
| "num_tokens": 47169941.0, |
| "step": 5810 |
| }, |
| { |
| "entropy": 0.868412721157074, |
| "epoch": 3.2834556747600225, |
| "grad_norm": 69.75527954101562, |
| "learning_rate": 4.088113401004539e-06, |
| "loss": 0.3143, |
| "mean_token_accuracy": 0.89917072057724, |
| "num_tokens": 47210747.0, |
| "step": 5815 |
| }, |
| { |
| "entropy": 0.8937187194824219, |
| "epoch": 3.2862789384528517, |
| "grad_norm": 96.84244537353516, |
| "learning_rate": 4.086686145136074e-06, |
| "loss": 0.3619, |
| "mean_token_accuracy": 0.8864521026611328, |
| "num_tokens": 47250964.0, |
| "step": 5820 |
| }, |
| { |
| "entropy": 0.8895792603492737, |
| "epoch": 3.2891022021456804, |
| "grad_norm": 69.03482818603516, |
| "learning_rate": 4.085258132720525e-06, |
| "loss": 0.3411, |
| "mean_token_accuracy": 0.8903668642044067, |
| "num_tokens": 47291582.0, |
| "step": 5825 |
| }, |
| { |
| "entropy": 0.9830967545509338, |
| "epoch": 3.291925465838509, |
| "grad_norm": 83.75342559814453, |
| "learning_rate": 4.083829364881291e-06, |
| "loss": 0.3095, |
| "mean_token_accuracy": 0.8991586565971375, |
| "num_tokens": 47332346.0, |
| "step": 5830 |
| }, |
| { |
| "entropy": 0.9528472185134887, |
| "epoch": 3.2947487295313382, |
| "grad_norm": 87.47962951660156, |
| "learning_rate": 4.082399842742366e-06, |
| "loss": 0.3639, |
| "mean_token_accuracy": 0.8823336362838745, |
| "num_tokens": 47372949.0, |
| "step": 5835 |
| }, |
| { |
| "entropy": 0.9320926308631897, |
| "epoch": 3.2975719932241674, |
| "grad_norm": 74.84449768066406, |
| "learning_rate": 4.08096956742834e-06, |
| "loss": 0.3073, |
| "mean_token_accuracy": 0.8998920917510986, |
| "num_tokens": 47413544.0, |
| "step": 5840 |
| }, |
| { |
| "entropy": 0.8553381204605103, |
| "epoch": 3.300395256916996, |
| "grad_norm": 76.03280639648438, |
| "learning_rate": 4.0795385400643916e-06, |
| "loss": 0.3284, |
| "mean_token_accuracy": 0.8952413558959961, |
| "num_tokens": 47454255.0, |
| "step": 5845 |
| }, |
| { |
| "entropy": 0.9816694140434266, |
| "epoch": 3.303218520609825, |
| "grad_norm": 86.18427276611328, |
| "learning_rate": 4.078106761776294e-06, |
| "loss": 0.3428, |
| "mean_token_accuracy": 0.8892660975456238, |
| "num_tokens": 47494914.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 0.9079144239425659, |
| "epoch": 3.306041784302654, |
| "grad_norm": 89.10929870605469, |
| "learning_rate": 4.076674233690411e-06, |
| "loss": 0.3589, |
| "mean_token_accuracy": 0.8852792263031006, |
| "num_tokens": 47535582.0, |
| "step": 5855 |
| }, |
| { |
| "entropy": 0.893176531791687, |
| "epoch": 3.3088650479954826, |
| "grad_norm": 80.25196075439453, |
| "learning_rate": 4.075240956933694e-06, |
| "loss": 0.3476, |
| "mean_token_accuracy": 0.8886914730072022, |
| "num_tokens": 47576458.0, |
| "step": 5860 |
| }, |
| { |
| "entropy": 0.988261365890503, |
| "epoch": 3.311688311688312, |
| "grad_norm": 85.58906555175781, |
| "learning_rate": 4.073806932633685e-06, |
| "loss": 0.3393, |
| "mean_token_accuracy": 0.8919550776481628, |
| "num_tokens": 47617186.0, |
| "step": 5865 |
| }, |
| { |
| "entropy": 0.9218690037727356, |
| "epoch": 3.3145115753811405, |
| "grad_norm": 76.2083740234375, |
| "learning_rate": 4.072372161918514e-06, |
| "loss": 0.3374, |
| "mean_token_accuracy": 0.8905277132987977, |
| "num_tokens": 47657963.0, |
| "step": 5870 |
| }, |
| { |
| "entropy": 0.9897948741912842, |
| "epoch": 3.3173348390739696, |
| "grad_norm": 72.44279479980469, |
| "learning_rate": 4.0709366459169e-06, |
| "loss": 0.3708, |
| "mean_token_accuracy": 0.8791788578033447, |
| "num_tokens": 47698582.0, |
| "step": 5875 |
| }, |
| { |
| "entropy": 0.9159058570861817, |
| "epoch": 3.3201581027667983, |
| "grad_norm": 75.1999282836914, |
| "learning_rate": 4.069500385758144e-06, |
| "loss": 0.3177, |
| "mean_token_accuracy": 0.8962335586547852, |
| "num_tokens": 47738986.0, |
| "step": 5880 |
| }, |
| { |
| "entropy": 0.9603091716766358, |
| "epoch": 3.3229813664596275, |
| "grad_norm": 70.61493682861328, |
| "learning_rate": 4.068063382572136e-06, |
| "loss": 0.3392, |
| "mean_token_accuracy": 0.8907322883605957, |
| "num_tokens": 47779675.0, |
| "step": 5885 |
| }, |
| { |
| "entropy": 0.985940134525299, |
| "epoch": 3.325804630152456, |
| "grad_norm": 73.92083740234375, |
| "learning_rate": 4.066625637489349e-06, |
| "loss": 0.3389, |
| "mean_token_accuracy": 0.8892409563064575, |
| "num_tokens": 47820199.0, |
| "step": 5890 |
| }, |
| { |
| "entropy": 1.023607885837555, |
| "epoch": 3.328627893845285, |
| "grad_norm": 75.44142150878906, |
| "learning_rate": 4.065187151640839e-06, |
| "loss": 0.3389, |
| "mean_token_accuracy": 0.8911162734031677, |
| "num_tokens": 47860723.0, |
| "step": 5895 |
| }, |
| { |
| "entropy": 1.027356994152069, |
| "epoch": 3.331451157538114, |
| "grad_norm": 78.74333190917969, |
| "learning_rate": 4.063747926158248e-06, |
| "loss": 0.3476, |
| "mean_token_accuracy": 0.8885314106941223, |
| "num_tokens": 47901477.0, |
| "step": 5900 |
| }, |
| { |
| "entropy": 0.9395134568214416, |
| "epoch": 3.334274421230943, |
| "grad_norm": 78.92701721191406, |
| "learning_rate": 4.062307962173796e-06, |
| "loss": 0.3548, |
| "mean_token_accuracy": 0.88640958070755, |
| "num_tokens": 47941620.0, |
| "step": 5905 |
| }, |
| { |
| "entropy": 0.8830117225646973, |
| "epoch": 3.337097684923772, |
| "grad_norm": 69.30189514160156, |
| "learning_rate": 4.060867260820287e-06, |
| "loss": 0.3458, |
| "mean_token_accuracy": 0.8891350269317627, |
| "num_tokens": 47982251.0, |
| "step": 5910 |
| }, |
| { |
| "entropy": 0.943314504623413, |
| "epoch": 3.3399209486166006, |
| "grad_norm": 58.88145446777344, |
| "learning_rate": 4.059425823231101e-06, |
| "loss": 0.3606, |
| "mean_token_accuracy": 0.8849634170532227, |
| "num_tokens": 48023080.0, |
| "step": 5915 |
| }, |
| { |
| "entropy": 0.9857373237609863, |
| "epoch": 3.3427442123094298, |
| "grad_norm": 78.76705169677734, |
| "learning_rate": 4.057983650540203e-06, |
| "loss": 0.3461, |
| "mean_token_accuracy": 0.8870670199394226, |
| "num_tokens": 48063945.0, |
| "step": 5920 |
| }, |
| { |
| "entropy": 1.0337161064147948, |
| "epoch": 3.3455674760022585, |
| "grad_norm": 81.34748840332031, |
| "learning_rate": 4.05654074388213e-06, |
| "loss": 0.3636, |
| "mean_token_accuracy": 0.8839574098587036, |
| "num_tokens": 48104362.0, |
| "step": 5925 |
| }, |
| { |
| "entropy": 0.9524666666984558, |
| "epoch": 3.3483907396950876, |
| "grad_norm": 70.2374267578125, |
| "learning_rate": 4.055097104392003e-06, |
| "loss": 0.3416, |
| "mean_token_accuracy": 0.8884192824363708, |
| "num_tokens": 48145068.0, |
| "step": 5930 |
| }, |
| { |
| "entropy": 0.994706106185913, |
| "epoch": 3.3512140033879163, |
| "grad_norm": 82.83272552490234, |
| "learning_rate": 4.053652733205513e-06, |
| "loss": 0.3345, |
| "mean_token_accuracy": 0.8928604006767273, |
| "num_tokens": 48185762.0, |
| "step": 5935 |
| }, |
| { |
| "entropy": 1.0792868852615356, |
| "epoch": 3.3540372670807455, |
| "grad_norm": 72.94239807128906, |
| "learning_rate": 4.052207631458933e-06, |
| "loss": 0.3692, |
| "mean_token_accuracy": 0.8833429217338562, |
| "num_tokens": 48226457.0, |
| "step": 5940 |
| }, |
| { |
| "entropy": 1.0501051068305969, |
| "epoch": 3.356860530773574, |
| "grad_norm": 70.4188232421875, |
| "learning_rate": 4.050761800289104e-06, |
| "loss": 0.3402, |
| "mean_token_accuracy": 0.8886079668998719, |
| "num_tokens": 48266456.0, |
| "step": 5945 |
| }, |
| { |
| "entropy": 0.9432408094406128, |
| "epoch": 3.3596837944664033, |
| "grad_norm": 64.5877456665039, |
| "learning_rate": 4.049315240833445e-06, |
| "loss": 0.3185, |
| "mean_token_accuracy": 0.8974138140678406, |
| "num_tokens": 48307115.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 0.987043297290802, |
| "epoch": 3.362507058159232, |
| "grad_norm": 94.49236297607422, |
| "learning_rate": 4.047867954229949e-06, |
| "loss": 0.3702, |
| "mean_token_accuracy": 0.8812648177146911, |
| "num_tokens": 48347890.0, |
| "step": 5955 |
| }, |
| { |
| "entropy": 0.9090808153152465, |
| "epoch": 3.365330321852061, |
| "grad_norm": 79.62647247314453, |
| "learning_rate": 4.046419941617177e-06, |
| "loss": 0.3327, |
| "mean_token_accuracy": 0.8922478914260864, |
| "num_tokens": 48387673.0, |
| "step": 5960 |
| }, |
| { |
| "entropy": 0.8052204251289368, |
| "epoch": 3.36815358554489, |
| "grad_norm": 75.73529815673828, |
| "learning_rate": 4.044971204134266e-06, |
| "loss": 0.3119, |
| "mean_token_accuracy": 0.9001648783683777, |
| "num_tokens": 48428374.0, |
| "step": 5965 |
| }, |
| { |
| "entropy": 1.0093549251556397, |
| "epoch": 3.370976849237719, |
| "grad_norm": 66.04109954833984, |
| "learning_rate": 4.043521742920918e-06, |
| "loss": 0.3552, |
| "mean_token_accuracy": 0.8854212641716004, |
| "num_tokens": 48469022.0, |
| "step": 5970 |
| }, |
| { |
| "entropy": 0.8731826066970825, |
| "epoch": 3.3738001129305477, |
| "grad_norm": 79.09840393066406, |
| "learning_rate": 4.042071559117408e-06, |
| "loss": 0.3009, |
| "mean_token_accuracy": 0.9022910356521606, |
| "num_tokens": 48509642.0, |
| "step": 5975 |
| }, |
| { |
| "entropy": 0.9268733263015747, |
| "epoch": 3.3766233766233764, |
| "grad_norm": 73.78678894042969, |
| "learning_rate": 4.040620653864578e-06, |
| "loss": 0.3718, |
| "mean_token_accuracy": 0.8803731322288513, |
| "num_tokens": 48550455.0, |
| "step": 5980 |
| }, |
| { |
| "entropy": 0.985349690914154, |
| "epoch": 3.3794466403162056, |
| "grad_norm": 74.12259674072266, |
| "learning_rate": 4.0391690283038384e-06, |
| "loss": 0.3467, |
| "mean_token_accuracy": 0.887589693069458, |
| "num_tokens": 48591115.0, |
| "step": 5985 |
| }, |
| { |
| "entropy": 1.0588406562805175, |
| "epoch": 3.3822699040090343, |
| "grad_norm": 85.06888580322266, |
| "learning_rate": 4.0377166835771665e-06, |
| "loss": 0.3629, |
| "mean_token_accuracy": 0.8824954628944397, |
| "num_tokens": 48631699.0, |
| "step": 5990 |
| }, |
| { |
| "entropy": 1.0037429928779602, |
| "epoch": 3.3850931677018634, |
| "grad_norm": 77.73078918457031, |
| "learning_rate": 4.036263620827103e-06, |
| "loss": 0.354, |
| "mean_token_accuracy": 0.8870412349700928, |
| "num_tokens": 48672449.0, |
| "step": 5995 |
| }, |
| { |
| "entropy": 0.9787355661392212, |
| "epoch": 3.387916431394692, |
| "grad_norm": 79.08088684082031, |
| "learning_rate": 4.034809841196756e-06, |
| "loss": 0.3292, |
| "mean_token_accuracy": 0.8948890924453735, |
| "num_tokens": 48713156.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.387916431394692, |
| "eval_entropy": 1.23082674741745, |
| "eval_loss": 0.2607056200504303, |
| "eval_mean_token_accuracy": 0.9218904733657837, |
| "eval_num_tokens": 48713156.0, |
| "eval_runtime": 2.454, |
| "eval_samples_per_second": 15.893, |
| "eval_steps_per_second": 2.038, |
| "step": 6000 |
| }, |
| { |
| "entropy": 1.0537591814994811, |
| "epoch": 3.3907396950875213, |
| "grad_norm": 86.71538543701172, |
| "learning_rate": 4.033355345829797e-06, |
| "loss": 0.3366, |
| "mean_token_accuracy": 0.8912654995918274, |
| "num_tokens": 48753696.0, |
| "step": 6005 |
| }, |
| { |
| "entropy": 1.0819080591201782, |
| "epoch": 3.39356295878035, |
| "grad_norm": 71.52223205566406, |
| "learning_rate": 4.03190013587046e-06, |
| "loss": 0.3612, |
| "mean_token_accuracy": 0.8849057555198669, |
| "num_tokens": 48794454.0, |
| "step": 6010 |
| }, |
| { |
| "entropy": 1.0233477354049683, |
| "epoch": 3.396386222473179, |
| "grad_norm": 85.10845947265625, |
| "learning_rate": 4.030444212463542e-06, |
| "loss": 0.3629, |
| "mean_token_accuracy": 0.8837889313697815, |
| "num_tokens": 48835161.0, |
| "step": 6015 |
| }, |
| { |
| "entropy": 1.0686038136482239, |
| "epoch": 3.399209486166008, |
| "grad_norm": 72.09638214111328, |
| "learning_rate": 4.028987576754398e-06, |
| "loss": 0.3841, |
| "mean_token_accuracy": 0.8787945985794068, |
| "num_tokens": 48875996.0, |
| "step": 6020 |
| }, |
| { |
| "entropy": 0.9429172515869141, |
| "epoch": 3.402032749858837, |
| "grad_norm": 91.68360137939453, |
| "learning_rate": 4.0275302298889495e-06, |
| "loss": 0.3576, |
| "mean_token_accuracy": 0.8851586222648621, |
| "num_tokens": 48916665.0, |
| "step": 6025 |
| }, |
| { |
| "entropy": 0.8698290228843689, |
| "epoch": 3.4048560135516657, |
| "grad_norm": 59.9684944152832, |
| "learning_rate": 4.026072173013673e-06, |
| "loss": 0.3334, |
| "mean_token_accuracy": 0.8916546106338501, |
| "num_tokens": 48957359.0, |
| "step": 6030 |
| }, |
| { |
| "entropy": 0.9015409588813782, |
| "epoch": 3.407679277244495, |
| "grad_norm": 87.83787536621094, |
| "learning_rate": 4.024613407275603e-06, |
| "loss": 0.3439, |
| "mean_token_accuracy": 0.8881507873535156, |
| "num_tokens": 48998238.0, |
| "step": 6035 |
| }, |
| { |
| "entropy": 0.9665756225585938, |
| "epoch": 3.4105025409373235, |
| "grad_norm": 75.79359436035156, |
| "learning_rate": 4.023153933822335e-06, |
| "loss": 0.3213, |
| "mean_token_accuracy": 0.8947265267372131, |
| "num_tokens": 49038813.0, |
| "step": 6040 |
| }, |
| { |
| "entropy": 1.0080926299095154, |
| "epoch": 3.4133258046301522, |
| "grad_norm": 76.97696685791016, |
| "learning_rate": 4.021693753802019e-06, |
| "loss": 0.3388, |
| "mean_token_accuracy": 0.8899381160736084, |
| "num_tokens": 49079529.0, |
| "step": 6045 |
| }, |
| { |
| "entropy": 0.9414087653160095, |
| "epoch": 3.4161490683229814, |
| "grad_norm": 78.20440673828125, |
| "learning_rate": 4.0202328683633605e-06, |
| "loss": 0.3069, |
| "mean_token_accuracy": 0.9005836963653564, |
| "num_tokens": 49120374.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 0.9534523367881775, |
| "epoch": 3.4189723320158105, |
| "grad_norm": 67.13829040527344, |
| "learning_rate": 4.018771278655622e-06, |
| "loss": 0.3221, |
| "mean_token_accuracy": 0.8969206929206848, |
| "num_tokens": 49161020.0, |
| "step": 6055 |
| }, |
| { |
| "entropy": 1.1293304204940795, |
| "epoch": 3.4217955957086392, |
| "grad_norm": 81.1747055053711, |
| "learning_rate": 4.017308985828617e-06, |
| "loss": 0.3829, |
| "mean_token_accuracy": 0.876332950592041, |
| "num_tokens": 49201345.0, |
| "step": 6060 |
| }, |
| { |
| "entropy": 0.9646120667457581, |
| "epoch": 3.424618859401468, |
| "grad_norm": 78.29570770263672, |
| "learning_rate": 4.015845991032716e-06, |
| "loss": 0.3259, |
| "mean_token_accuracy": 0.8951454758644104, |
| "num_tokens": 49241850.0, |
| "step": 6065 |
| }, |
| { |
| "entropy": 1.0473296403884889, |
| "epoch": 3.427442123094297, |
| "grad_norm": 76.8552474975586, |
| "learning_rate": 4.014382295418838e-06, |
| "loss": 0.3649, |
| "mean_token_accuracy": 0.8825054168701172, |
| "num_tokens": 49282433.0, |
| "step": 6070 |
| }, |
| { |
| "entropy": 0.9831879496574402, |
| "epoch": 3.430265386787126, |
| "grad_norm": 75.86959838867188, |
| "learning_rate": 4.012917900138457e-06, |
| "loss": 0.3619, |
| "mean_token_accuracy": 0.8826367855072021, |
| "num_tokens": 49323030.0, |
| "step": 6075 |
| }, |
| { |
| "entropy": 1.117681658267975, |
| "epoch": 3.433088650479955, |
| "grad_norm": 75.12294006347656, |
| "learning_rate": 4.011452806343593e-06, |
| "loss": 0.372, |
| "mean_token_accuracy": 0.88106929063797, |
| "num_tokens": 49363673.0, |
| "step": 6080 |
| }, |
| { |
| "entropy": 0.9735101342201233, |
| "epoch": 3.4359119141727836, |
| "grad_norm": 65.7935562133789, |
| "learning_rate": 4.00998701518682e-06, |
| "loss": 0.3439, |
| "mean_token_accuracy": 0.8863797664642334, |
| "num_tokens": 49404345.0, |
| "step": 6085 |
| }, |
| { |
| "entropy": 0.9507794737815857, |
| "epoch": 3.438735177865613, |
| "grad_norm": 90.31965637207031, |
| "learning_rate": 4.008520527821257e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8880294799804688, |
| "num_tokens": 49444980.0, |
| "step": 6090 |
| }, |
| { |
| "entropy": 1.0107581615447998, |
| "epoch": 3.4415584415584415, |
| "grad_norm": 74.97982025146484, |
| "learning_rate": 4.007053345400572e-06, |
| "loss": 0.3233, |
| "mean_token_accuracy": 0.8946360349655151, |
| "num_tokens": 49485573.0, |
| "step": 6095 |
| }, |
| { |
| "entropy": 0.9869904041290283, |
| "epoch": 3.4443817052512706, |
| "grad_norm": 73.46428680419922, |
| "learning_rate": 4.0055854690789815e-06, |
| "loss": 0.3513, |
| "mean_token_accuracy": 0.8861337780952454, |
| "num_tokens": 49526252.0, |
| "step": 6100 |
| }, |
| { |
| "entropy": 0.9334820985794068, |
| "epoch": 3.4472049689440993, |
| "grad_norm": 75.04248809814453, |
| "learning_rate": 4.0041169000112454e-06, |
| "loss": 0.3368, |
| "mean_token_accuracy": 0.8914402961730957, |
| "num_tokens": 49566738.0, |
| "step": 6105 |
| }, |
| { |
| "entropy": 1.0121949315071106, |
| "epoch": 3.4500282326369285, |
| "grad_norm": 86.6436767578125, |
| "learning_rate": 4.00264763935267e-06, |
| "loss": 0.3305, |
| "mean_token_accuracy": 0.8928271055221557, |
| "num_tokens": 49607487.0, |
| "step": 6110 |
| }, |
| { |
| "entropy": 1.1302125215530396, |
| "epoch": 3.452851496329757, |
| "grad_norm": 65.75306701660156, |
| "learning_rate": 4.001177688259105e-06, |
| "loss": 0.339, |
| "mean_token_accuracy": 0.8919134378433228, |
| "num_tokens": 49648166.0, |
| "step": 6115 |
| }, |
| { |
| "entropy": 1.0888399124145507, |
| "epoch": 3.4556747600225863, |
| "grad_norm": 69.06067657470703, |
| "learning_rate": 3.999707047886944e-06, |
| "loss": 0.3429, |
| "mean_token_accuracy": 0.8888397812843323, |
| "num_tokens": 49688823.0, |
| "step": 6120 |
| }, |
| { |
| "entropy": 1.025403904914856, |
| "epoch": 3.458498023715415, |
| "grad_norm": 85.14940643310547, |
| "learning_rate": 3.998235719393121e-06, |
| "loss": 0.3627, |
| "mean_token_accuracy": 0.8827525019645691, |
| "num_tokens": 49728661.0, |
| "step": 6125 |
| }, |
| { |
| "entropy": 0.9839401364326477, |
| "epoch": 3.4613212874082437, |
| "grad_norm": 71.55946350097656, |
| "learning_rate": 3.996763703935114e-06, |
| "loss": 0.3503, |
| "mean_token_accuracy": 0.8873334646224975, |
| "num_tokens": 49769275.0, |
| "step": 6130 |
| }, |
| { |
| "entropy": 1.0336027026176453, |
| "epoch": 3.464144551101073, |
| "grad_norm": 78.25820922851562, |
| "learning_rate": 3.995291002670941e-06, |
| "loss": 0.3675, |
| "mean_token_accuracy": 0.8819956421852112, |
| "num_tokens": 49810066.0, |
| "step": 6135 |
| }, |
| { |
| "entropy": 1.0026185154914855, |
| "epoch": 3.4669678147939016, |
| "grad_norm": 85.74049377441406, |
| "learning_rate": 3.993817616759155e-06, |
| "loss": 0.3653, |
| "mean_token_accuracy": 0.882231068611145, |
| "num_tokens": 49850531.0, |
| "step": 6140 |
| }, |
| { |
| "entropy": 1.05128253698349, |
| "epoch": 3.4697910784867307, |
| "grad_norm": 68.15999603271484, |
| "learning_rate": 3.992343547358854e-06, |
| "loss": 0.3311, |
| "mean_token_accuracy": 0.8922553777694702, |
| "num_tokens": 49891169.0, |
| "step": 6145 |
| }, |
| { |
| "entropy": 1.0605325102806091, |
| "epoch": 3.4726143421795594, |
| "grad_norm": 90.25965881347656, |
| "learning_rate": 3.990868795629671e-06, |
| "loss": 0.3493, |
| "mean_token_accuracy": 0.8885184526443481, |
| "num_tokens": 49931622.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 0.9792709231376648, |
| "epoch": 3.4754376058723886, |
| "grad_norm": 78.75196838378906, |
| "learning_rate": 3.989393362731775e-06, |
| "loss": 0.3647, |
| "mean_token_accuracy": 0.8825397968292237, |
| "num_tokens": 49972332.0, |
| "step": 6155 |
| }, |
| { |
| "entropy": 0.8609856843948365, |
| "epoch": 3.4782608695652173, |
| "grad_norm": 74.58406066894531, |
| "learning_rate": 3.987917249825872e-06, |
| "loss": 0.3152, |
| "mean_token_accuracy": 0.8976006627082824, |
| "num_tokens": 50012990.0, |
| "step": 6160 |
| }, |
| { |
| "entropy": 0.9564788579940796, |
| "epoch": 3.4810841332580464, |
| "grad_norm": 79.9758529663086, |
| "learning_rate": 3.986440458073202e-06, |
| "loss": 0.362, |
| "mean_token_accuracy": 0.8848256587982177, |
| "num_tokens": 50053801.0, |
| "step": 6165 |
| }, |
| { |
| "entropy": 0.9849433898925781, |
| "epoch": 3.483907396950875, |
| "grad_norm": 88.46871948242188, |
| "learning_rate": 3.98496298863554e-06, |
| "loss": 0.3297, |
| "mean_token_accuracy": 0.8946364402770997, |
| "num_tokens": 50094571.0, |
| "step": 6170 |
| }, |
| { |
| "entropy": 0.9942429542541504, |
| "epoch": 3.4867306606437043, |
| "grad_norm": 65.55095672607422, |
| "learning_rate": 3.983484842675194e-06, |
| "loss": 0.3493, |
| "mean_token_accuracy": 0.8884786128997803, |
| "num_tokens": 50135471.0, |
| "step": 6175 |
| }, |
| { |
| "entropy": 1.002113664150238, |
| "epoch": 3.489553924336533, |
| "grad_norm": 66.22470092773438, |
| "learning_rate": 3.982006021355002e-06, |
| "loss": 0.3722, |
| "mean_token_accuracy": 0.8818007349967957, |
| "num_tokens": 50176220.0, |
| "step": 6180 |
| }, |
| { |
| "entropy": 1.0745681047439575, |
| "epoch": 3.492377188029362, |
| "grad_norm": 94.96135711669922, |
| "learning_rate": 3.980526525838337e-06, |
| "loss": 0.3788, |
| "mean_token_accuracy": 0.8774041533470154, |
| "num_tokens": 50216826.0, |
| "step": 6185 |
| }, |
| { |
| "entropy": 1.0016021370887755, |
| "epoch": 3.495200451722191, |
| "grad_norm": 84.10227966308594, |
| "learning_rate": 3.979046357289101e-06, |
| "loss": 0.346, |
| "mean_token_accuracy": 0.888908052444458, |
| "num_tokens": 50257345.0, |
| "step": 6190 |
| }, |
| { |
| "entropy": 1.0220289945602417, |
| "epoch": 3.4980237154150196, |
| "grad_norm": 80.87281036376953, |
| "learning_rate": 3.977565516871723e-06, |
| "loss": 0.3422, |
| "mean_token_accuracy": 0.8883512258529663, |
| "num_tokens": 50298037.0, |
| "step": 6195 |
| }, |
| { |
| "entropy": 0.958783769607544, |
| "epoch": 3.5008469791078487, |
| "grad_norm": 88.97132110595703, |
| "learning_rate": 3.976084005751164e-06, |
| "loss": 0.3575, |
| "mean_token_accuracy": 0.8871893882751465, |
| "num_tokens": 50338663.0, |
| "step": 6200 |
| }, |
| { |
| "entropy": 1.02311053276062, |
| "epoch": 3.503670242800678, |
| "grad_norm": 76.60072326660156, |
| "learning_rate": 3.974601825092911e-06, |
| "loss": 0.3783, |
| "mean_token_accuracy": 0.8799753189086914, |
| "num_tokens": 50379319.0, |
| "step": 6205 |
| }, |
| { |
| "entropy": 1.044371199607849, |
| "epoch": 3.5064935064935066, |
| "grad_norm": 87.33512115478516, |
| "learning_rate": 3.973118976062978e-06, |
| "loss": 0.3701, |
| "mean_token_accuracy": 0.8824043035507202, |
| "num_tokens": 50420067.0, |
| "step": 6210 |
| }, |
| { |
| "entropy": 0.9192242622375488, |
| "epoch": 3.5093167701863353, |
| "grad_norm": 73.0194091796875, |
| "learning_rate": 3.971635459827905e-06, |
| "loss": 0.3236, |
| "mean_token_accuracy": 0.893168592453003, |
| "num_tokens": 50460623.0, |
| "step": 6215 |
| }, |
| { |
| "entropy": 0.9499261617660523, |
| "epoch": 3.5121400338791644, |
| "grad_norm": 89.47998809814453, |
| "learning_rate": 3.970151277554756e-06, |
| "loss": 0.3667, |
| "mean_token_accuracy": 0.8812254905700684, |
| "num_tokens": 50501429.0, |
| "step": 6220 |
| }, |
| { |
| "entropy": 0.9651218175888061, |
| "epoch": 3.514963297571993, |
| "grad_norm": 73.77303314208984, |
| "learning_rate": 3.96866643041112e-06, |
| "loss": 0.3355, |
| "mean_token_accuracy": 0.8922844529151917, |
| "num_tokens": 50542128.0, |
| "step": 6225 |
| }, |
| { |
| "entropy": 1.00862113237381, |
| "epoch": 3.5177865612648223, |
| "grad_norm": 82.71814727783203, |
| "learning_rate": 3.967180919565108e-06, |
| "loss": 0.3576, |
| "mean_token_accuracy": 0.8849758267402649, |
| "num_tokens": 50582754.0, |
| "step": 6230 |
| }, |
| { |
| "entropy": 1.0719767332077026, |
| "epoch": 3.520609824957651, |
| "grad_norm": 81.17707061767578, |
| "learning_rate": 3.965694746185355e-06, |
| "loss": 0.3651, |
| "mean_token_accuracy": 0.8814258694648742, |
| "num_tokens": 50623547.0, |
| "step": 6235 |
| }, |
| { |
| "entropy": 0.9911883711814881, |
| "epoch": 3.52343308865048, |
| "grad_norm": 83.75228881835938, |
| "learning_rate": 3.964207911441015e-06, |
| "loss": 0.3755, |
| "mean_token_accuracy": 0.8790446758270264, |
| "num_tokens": 50664345.0, |
| "step": 6240 |
| }, |
| { |
| "entropy": 0.9944733381271362, |
| "epoch": 3.526256352343309, |
| "grad_norm": 122.08747863769531, |
| "learning_rate": 3.962720416501763e-06, |
| "loss": 0.3689, |
| "mean_token_accuracy": 0.8826006412506103, |
| "num_tokens": 50704948.0, |
| "step": 6245 |
| }, |
| { |
| "entropy": 1.0450132608413696, |
| "epoch": 3.529079616036138, |
| "grad_norm": 84.4200668334961, |
| "learning_rate": 3.961232262537795e-06, |
| "loss": 0.3687, |
| "mean_token_accuracy": 0.8846492290496826, |
| "num_tokens": 50745604.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 0.9697313427925109, |
| "epoch": 3.5319028797289667, |
| "grad_norm": 73.62430572509766, |
| "learning_rate": 3.959743450719824e-06, |
| "loss": 0.3654, |
| "mean_token_accuracy": 0.8818155765533447, |
| "num_tokens": 50786209.0, |
| "step": 6255 |
| }, |
| { |
| "entropy": 0.973599910736084, |
| "epoch": 3.5347261434217954, |
| "grad_norm": 87.3067855834961, |
| "learning_rate": 3.958253982219079e-06, |
| "loss": 0.3534, |
| "mean_token_accuracy": 0.8868489623069763, |
| "num_tokens": 50826700.0, |
| "step": 6260 |
| }, |
| { |
| "entropy": 0.994738757610321, |
| "epoch": 3.5375494071146245, |
| "grad_norm": 71.6942138671875, |
| "learning_rate": 3.956763858207308e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8836980581283569, |
| "num_tokens": 50867329.0, |
| "step": 6265 |
| }, |
| { |
| "entropy": 1.0417511463165283, |
| "epoch": 3.5403726708074537, |
| "grad_norm": 76.70450592041016, |
| "learning_rate": 3.955273079856773e-06, |
| "loss": 0.3757, |
| "mean_token_accuracy": 0.8782451033592225, |
| "num_tokens": 50908046.0, |
| "step": 6270 |
| }, |
| { |
| "entropy": 0.98163822889328, |
| "epoch": 3.5431959345002824, |
| "grad_norm": 66.1574478149414, |
| "learning_rate": 3.953781648340254e-06, |
| "loss": 0.3648, |
| "mean_token_accuracy": 0.8820971846580505, |
| "num_tokens": 50948751.0, |
| "step": 6275 |
| }, |
| { |
| "entropy": 1.049662470817566, |
| "epoch": 3.546019198193111, |
| "grad_norm": 72.4782485961914, |
| "learning_rate": 3.95228956483104e-06, |
| "loss": 0.3379, |
| "mean_token_accuracy": 0.8903756499290466, |
| "num_tokens": 50989488.0, |
| "step": 6280 |
| }, |
| { |
| "entropy": 1.001114797592163, |
| "epoch": 3.54884246188594, |
| "grad_norm": 67.1705551147461, |
| "learning_rate": 3.950796830502938e-06, |
| "loss": 0.3485, |
| "mean_token_accuracy": 0.8888326168060303, |
| "num_tokens": 51030140.0, |
| "step": 6285 |
| }, |
| { |
| "entropy": 1.0564101696014405, |
| "epoch": 3.551665725578769, |
| "grad_norm": 71.83964538574219, |
| "learning_rate": 3.949303446530262e-06, |
| "loss": 0.3527, |
| "mean_token_accuracy": 0.8852935433387756, |
| "num_tokens": 51070831.0, |
| "step": 6290 |
| }, |
| { |
| "entropy": 1.024319851398468, |
| "epoch": 3.554488989271598, |
| "grad_norm": 80.63823699951172, |
| "learning_rate": 3.94780941408784e-06, |
| "loss": 0.3723, |
| "mean_token_accuracy": 0.8803096055984497, |
| "num_tokens": 51111402.0, |
| "step": 6295 |
| }, |
| { |
| "entropy": 1.0112637758255005, |
| "epoch": 3.5573122529644268, |
| "grad_norm": 65.74320983886719, |
| "learning_rate": 3.94631473435101e-06, |
| "loss": 0.3562, |
| "mean_token_accuracy": 0.885633933544159, |
| "num_tokens": 51151931.0, |
| "step": 6300 |
| }, |
| { |
| "entropy": 1.126786994934082, |
| "epoch": 3.560135516657256, |
| "grad_norm": 78.12380981445312, |
| "learning_rate": 3.9448194084956185e-06, |
| "loss": 0.3561, |
| "mean_token_accuracy": 0.8849801540374755, |
| "num_tokens": 51192618.0, |
| "step": 6305 |
| }, |
| { |
| "entropy": 0.9513525009155274, |
| "epoch": 3.5629587803500846, |
| "grad_norm": 75.2828598022461, |
| "learning_rate": 3.943323437698021e-06, |
| "loss": 0.3609, |
| "mean_token_accuracy": 0.8847909450531006, |
| "num_tokens": 51233084.0, |
| "step": 6310 |
| }, |
| { |
| "entropy": 1.0396467089653014, |
| "epoch": 3.5657820440429138, |
| "grad_norm": 77.07085418701172, |
| "learning_rate": 3.941826823135079e-06, |
| "loss": 0.3533, |
| "mean_token_accuracy": 0.8857412457466125, |
| "num_tokens": 51273692.0, |
| "step": 6315 |
| }, |
| { |
| "entropy": 1.0362180352211, |
| "epoch": 3.5686053077357425, |
| "grad_norm": 85.95170593261719, |
| "learning_rate": 3.940329565984165e-06, |
| "loss": 0.3769, |
| "mean_token_accuracy": 0.8745699167251587, |
| "num_tokens": 51314442.0, |
| "step": 6320 |
| }, |
| { |
| "entropy": 0.92795330286026, |
| "epoch": 3.571428571428571, |
| "grad_norm": 86.3418960571289, |
| "learning_rate": 3.938831667423149e-06, |
| "loss": 0.3307, |
| "mean_token_accuracy": 0.8915099501609802, |
| "num_tokens": 51355241.0, |
| "step": 6325 |
| }, |
| { |
| "entropy": 0.9906257629394531, |
| "epoch": 3.5742518351214003, |
| "grad_norm": 81.33384704589844, |
| "learning_rate": 3.937333128630411e-06, |
| "loss": 0.3599, |
| "mean_token_accuracy": 0.884676706790924, |
| "num_tokens": 51395216.0, |
| "step": 6330 |
| }, |
| { |
| "entropy": 1.071572208404541, |
| "epoch": 3.5770750988142295, |
| "grad_norm": 78.2898941040039, |
| "learning_rate": 3.9358339507848355e-06, |
| "loss": 0.4006, |
| "mean_token_accuracy": 0.8738093256950379, |
| "num_tokens": 51435914.0, |
| "step": 6335 |
| }, |
| { |
| "entropy": 0.9324329853057861, |
| "epoch": 3.579898362507058, |
| "grad_norm": 67.50933837890625, |
| "learning_rate": 3.934334135065807e-06, |
| "loss": 0.3637, |
| "mean_token_accuracy": 0.8848002195358277, |
| "num_tokens": 51476772.0, |
| "step": 6340 |
| }, |
| { |
| "entropy": 0.939254081249237, |
| "epoch": 3.582721626199887, |
| "grad_norm": 77.13788604736328, |
| "learning_rate": 3.932833682653212e-06, |
| "loss": 0.3532, |
| "mean_token_accuracy": 0.8868015527725219, |
| "num_tokens": 51517491.0, |
| "step": 6345 |
| }, |
| { |
| "entropy": 1.0642353653907777, |
| "epoch": 3.585544889892716, |
| "grad_norm": 82.49517822265625, |
| "learning_rate": 3.93133259472744e-06, |
| "loss": 0.3904, |
| "mean_token_accuracy": 0.8745889067649841, |
| "num_tokens": 51558219.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 0.9392816543579101, |
| "epoch": 3.5883681535855447, |
| "grad_norm": 80.28068542480469, |
| "learning_rate": 3.929830872469378e-06, |
| "loss": 0.3242, |
| "mean_token_accuracy": 0.8940078973770141, |
| "num_tokens": 51598791.0, |
| "step": 6355 |
| }, |
| { |
| "entropy": 0.9706831693649292, |
| "epoch": 3.591191417278374, |
| "grad_norm": 81.84841918945312, |
| "learning_rate": 3.928328517060412e-06, |
| "loss": 0.3714, |
| "mean_token_accuracy": 0.8811930298805237, |
| "num_tokens": 51639315.0, |
| "step": 6360 |
| }, |
| { |
| "entropy": 1.0625698804855346, |
| "epoch": 3.5940146809712026, |
| "grad_norm": 89.82206726074219, |
| "learning_rate": 3.926825529682431e-06, |
| "loss": 0.3758, |
| "mean_token_accuracy": 0.8791163086891174, |
| "num_tokens": 51679609.0, |
| "step": 6365 |
| }, |
| { |
| "entropy": 1.0034614324569702, |
| "epoch": 3.5968379446640317, |
| "grad_norm": 71.02519226074219, |
| "learning_rate": 3.925321911517814e-06, |
| "loss": 0.3403, |
| "mean_token_accuracy": 0.8907212376594543, |
| "num_tokens": 51720208.0, |
| "step": 6370 |
| }, |
| { |
| "entropy": 1.0046752095222473, |
| "epoch": 3.5996612083568604, |
| "grad_norm": 95.1485595703125, |
| "learning_rate": 3.92381766374944e-06, |
| "loss": 0.3617, |
| "mean_token_accuracy": 0.8833420157432557, |
| "num_tokens": 51760749.0, |
| "step": 6375 |
| }, |
| { |
| "entropy": 1.0107667088508605, |
| "epoch": 3.6024844720496896, |
| "grad_norm": 79.1152572631836, |
| "learning_rate": 3.922312787560684e-06, |
| "loss": 0.3315, |
| "mean_token_accuracy": 0.8931966900825501, |
| "num_tokens": 51801541.0, |
| "step": 6380 |
| }, |
| { |
| "entropy": 1.0665692925453185, |
| "epoch": 3.6053077357425183, |
| "grad_norm": 83.31388092041016, |
| "learning_rate": 3.920807284135413e-06, |
| "loss": 0.3908, |
| "mean_token_accuracy": 0.8754578113555909, |
| "num_tokens": 51842293.0, |
| "step": 6385 |
| }, |
| { |
| "entropy": 1.0210911154747009, |
| "epoch": 3.608130999435347, |
| "grad_norm": 79.14932250976562, |
| "learning_rate": 3.919301154657989e-06, |
| "loss": 0.359, |
| "mean_token_accuracy": 0.881727647781372, |
| "num_tokens": 51882745.0, |
| "step": 6390 |
| }, |
| { |
| "entropy": 1.031436562538147, |
| "epoch": 3.610954263128176, |
| "grad_norm": 81.52947998046875, |
| "learning_rate": 3.917794400313268e-06, |
| "loss": 0.3432, |
| "mean_token_accuracy": 0.8924546480178833, |
| "num_tokens": 51923109.0, |
| "step": 6395 |
| }, |
| { |
| "entropy": 1.0299935698509217, |
| "epoch": 3.6137775268210053, |
| "grad_norm": 77.39228820800781, |
| "learning_rate": 3.916287022286593e-06, |
| "loss": 0.3797, |
| "mean_token_accuracy": 0.8783790588378906, |
| "num_tokens": 51963770.0, |
| "step": 6400 |
| }, |
| { |
| "entropy": 1.0109672904014588, |
| "epoch": 3.616600790513834, |
| "grad_norm": 78.6328125, |
| "learning_rate": 3.914779021763803e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8864980340003967, |
| "num_tokens": 52004112.0, |
| "step": 6405 |
| }, |
| { |
| "entropy": 0.9662197709083558, |
| "epoch": 3.6194240542066627, |
| "grad_norm": 76.94004821777344, |
| "learning_rate": 3.913270399931223e-06, |
| "loss": 0.347, |
| "mean_token_accuracy": 0.8888106346130371, |
| "num_tokens": 52044761.0, |
| "step": 6410 |
| }, |
| { |
| "entropy": 0.8939793229103088, |
| "epoch": 3.622247317899492, |
| "grad_norm": 78.16267395019531, |
| "learning_rate": 3.911761157975667e-06, |
| "loss": 0.3457, |
| "mean_token_accuracy": 0.8896233439445496, |
| "num_tokens": 52085606.0, |
| "step": 6415 |
| }, |
| { |
| "entropy": 0.898774790763855, |
| "epoch": 3.625070581592321, |
| "grad_norm": 75.20874786376953, |
| "learning_rate": 3.910251297084438e-06, |
| "loss": 0.3809, |
| "mean_token_accuracy": 0.878812849521637, |
| "num_tokens": 52126164.0, |
| "step": 6420 |
| }, |
| { |
| "entropy": 0.9831461429595947, |
| "epoch": 3.6278938452851497, |
| "grad_norm": 89.09337615966797, |
| "learning_rate": 3.908740818445327e-06, |
| "loss": 0.3808, |
| "mean_token_accuracy": 0.8773584604263306, |
| "num_tokens": 52166919.0, |
| "step": 6425 |
| }, |
| { |
| "entropy": 1.0048586010932923, |
| "epoch": 3.6307171089779784, |
| "grad_norm": 79.22754669189453, |
| "learning_rate": 3.907229723246607e-06, |
| "loss": 0.3643, |
| "mean_token_accuracy": 0.8850665092468262, |
| "num_tokens": 52207627.0, |
| "step": 6430 |
| }, |
| { |
| "entropy": 1.0400293827056886, |
| "epoch": 3.6335403726708075, |
| "grad_norm": 88.93228912353516, |
| "learning_rate": 3.905718012677042e-06, |
| "loss": 0.349, |
| "mean_token_accuracy": 0.8870396375656128, |
| "num_tokens": 52248312.0, |
| "step": 6435 |
| }, |
| { |
| "entropy": 1.1385428428649902, |
| "epoch": 3.6363636363636362, |
| "grad_norm": 78.16153717041016, |
| "learning_rate": 3.9042056879258754e-06, |
| "loss": 0.3662, |
| "mean_token_accuracy": 0.8810193061828613, |
| "num_tokens": 52288933.0, |
| "step": 6440 |
| }, |
| { |
| "entropy": 1.0581382632255554, |
| "epoch": 3.6391869000564654, |
| "grad_norm": 94.95269775390625, |
| "learning_rate": 3.902692750182835e-06, |
| "loss": 0.3421, |
| "mean_token_accuracy": 0.8892552137374878, |
| "num_tokens": 52329843.0, |
| "step": 6445 |
| }, |
| { |
| "entropy": 0.9416268587112426, |
| "epoch": 3.642010163749294, |
| "grad_norm": 71.47803497314453, |
| "learning_rate": 3.901179200638131e-06, |
| "loss": 0.3358, |
| "mean_token_accuracy": 0.8925686478614807, |
| "num_tokens": 52370377.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 0.9823377370834351, |
| "epoch": 3.6448334274421232, |
| "grad_norm": 66.98043823242188, |
| "learning_rate": 3.899665040482453e-06, |
| "loss": 0.3524, |
| "mean_token_accuracy": 0.8869312644004822, |
| "num_tokens": 52411012.0, |
| "step": 6455 |
| }, |
| { |
| "entropy": 1.073030376434326, |
| "epoch": 3.647656691134952, |
| "grad_norm": 77.24723815917969, |
| "learning_rate": 3.898150270906977e-06, |
| "loss": 0.3877, |
| "mean_token_accuracy": 0.8758072853088379, |
| "num_tokens": 52451634.0, |
| "step": 6460 |
| }, |
| { |
| "entropy": 0.9715201020240783, |
| "epoch": 3.650479954827781, |
| "grad_norm": 88.40100860595703, |
| "learning_rate": 3.896634893103351e-06, |
| "loss": 0.3779, |
| "mean_token_accuracy": 0.8794341802597045, |
| "num_tokens": 52492298.0, |
| "step": 6465 |
| }, |
| { |
| "entropy": 0.9602640748023987, |
| "epoch": 3.65330321852061, |
| "grad_norm": 74.9549560546875, |
| "learning_rate": 3.895118908263706e-06, |
| "loss": 0.355, |
| "mean_token_accuracy": 0.8859039068222045, |
| "num_tokens": 52532916.0, |
| "step": 6470 |
| }, |
| { |
| "entropy": 0.9144500970840455, |
| "epoch": 3.6561264822134385, |
| "grad_norm": 83.213623046875, |
| "learning_rate": 3.893602317580649e-06, |
| "loss": 0.3401, |
| "mean_token_accuracy": 0.8914003133773803, |
| "num_tokens": 52573512.0, |
| "step": 6475 |
| }, |
| { |
| "entropy": 1.025104033946991, |
| "epoch": 3.6589497459062676, |
| "grad_norm": 89.1195068359375, |
| "learning_rate": 3.892085122247263e-06, |
| "loss": 0.376, |
| "mean_token_accuracy": 0.8797497034072876, |
| "num_tokens": 52614126.0, |
| "step": 6480 |
| }, |
| { |
| "entropy": 1.0915439128875732, |
| "epoch": 3.661773009599097, |
| "grad_norm": 89.44082641601562, |
| "learning_rate": 3.89056732345711e-06, |
| "loss": 0.3694, |
| "mean_token_accuracy": 0.8832651495933532, |
| "num_tokens": 52654968.0, |
| "step": 6485 |
| }, |
| { |
| "entropy": 0.9922754287719726, |
| "epoch": 3.6645962732919255, |
| "grad_norm": 84.6336669921875, |
| "learning_rate": 3.889048922404222e-06, |
| "loss": 0.3486, |
| "mean_token_accuracy": 0.8878530979156494, |
| "num_tokens": 52695576.0, |
| "step": 6490 |
| }, |
| { |
| "entropy": 0.9639823079109192, |
| "epoch": 3.667419536984754, |
| "grad_norm": 75.57162475585938, |
| "learning_rate": 3.887529920283108e-06, |
| "loss": 0.3459, |
| "mean_token_accuracy": 0.8889827013015748, |
| "num_tokens": 52736251.0, |
| "step": 6495 |
| }, |
| { |
| "entropy": 1.0114650011062623, |
| "epoch": 3.6702428006775834, |
| "grad_norm": 83.69818878173828, |
| "learning_rate": 3.886010318288748e-06, |
| "loss": 0.3659, |
| "mean_token_accuracy": 0.8834073305130005, |
| "num_tokens": 52777101.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 3.6702428006775834, |
| "eval_entropy": 1.2794333457946778, |
| "eval_loss": 0.2388431578874588, |
| "eval_mean_token_accuracy": 0.9266616106033325, |
| "eval_num_tokens": 52777101.0, |
| "eval_runtime": 2.4569, |
| "eval_samples_per_second": 15.873, |
| "eval_steps_per_second": 2.035, |
| "step": 6500 |
| }, |
| { |
| "entropy": 1.0842282056808472, |
| "epoch": 3.673066064370412, |
| "grad_norm": 72.50702667236328, |
| "learning_rate": 3.884490117616596e-06, |
| "loss": 0.3576, |
| "mean_token_accuracy": 0.8858759045600891, |
| "num_tokens": 52817788.0, |
| "step": 6505 |
| }, |
| { |
| "entropy": 1.0758524060249328, |
| "epoch": 3.675889328063241, |
| "grad_norm": 82.36662292480469, |
| "learning_rate": 3.882969319462576e-06, |
| "loss": 0.3628, |
| "mean_token_accuracy": 0.8835553526878357, |
| "num_tokens": 52858373.0, |
| "step": 6510 |
| }, |
| { |
| "entropy": 1.006167435646057, |
| "epoch": 3.67871259175607, |
| "grad_norm": 80.42717742919922, |
| "learning_rate": 3.8814479250230816e-06, |
| "loss": 0.3533, |
| "mean_token_accuracy": 0.8864710927009583, |
| "num_tokens": 52898795.0, |
| "step": 6515 |
| }, |
| { |
| "entropy": 0.9807133316993714, |
| "epoch": 3.681535855448899, |
| "grad_norm": 81.09001922607422, |
| "learning_rate": 3.879925935494974e-06, |
| "loss": 0.3699, |
| "mean_token_accuracy": 0.882834541797638, |
| "num_tokens": 52939434.0, |
| "step": 6520 |
| }, |
| { |
| "entropy": 0.9380959749221802, |
| "epoch": 3.6843591191417278, |
| "grad_norm": 71.36829376220703, |
| "learning_rate": 3.878403352075588e-06, |
| "loss": 0.3613, |
| "mean_token_accuracy": 0.8826104998588562, |
| "num_tokens": 52980014.0, |
| "step": 6525 |
| }, |
| { |
| "entropy": 0.9877384066581726, |
| "epoch": 3.687182382834557, |
| "grad_norm": 66.0042953491211, |
| "learning_rate": 3.87688017596272e-06, |
| "loss": 0.3426, |
| "mean_token_accuracy": 0.8897479772567749, |
| "num_tokens": 53020720.0, |
| "step": 6530 |
| }, |
| { |
| "entropy": 1.0412828922271729, |
| "epoch": 3.6900056465273856, |
| "grad_norm": 79.44898986816406, |
| "learning_rate": 3.875356408354633e-06, |
| "loss": 0.3771, |
| "mean_token_accuracy": 0.8798229336738587, |
| "num_tokens": 53061436.0, |
| "step": 6535 |
| }, |
| { |
| "entropy": 0.9982946991920472, |
| "epoch": 3.6928289102202143, |
| "grad_norm": 84.61109161376953, |
| "learning_rate": 3.873832050450058e-06, |
| "loss": 0.3562, |
| "mean_token_accuracy": 0.8826686978340149, |
| "num_tokens": 53102129.0, |
| "step": 6540 |
| }, |
| { |
| "entropy": 0.900448226928711, |
| "epoch": 3.6956521739130435, |
| "grad_norm": 67.6589584350586, |
| "learning_rate": 3.87230710344819e-06, |
| "loss": 0.3474, |
| "mean_token_accuracy": 0.8889101147651672, |
| "num_tokens": 53142665.0, |
| "step": 6545 |
| }, |
| { |
| "entropy": 0.9328884720802307, |
| "epoch": 3.6984754376058726, |
| "grad_norm": 69.66155242919922, |
| "learning_rate": 3.870781568548686e-06, |
| "loss": 0.3825, |
| "mean_token_accuracy": 0.879358434677124, |
| "num_tokens": 53183474.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 0.9914533376693726, |
| "epoch": 3.7012987012987013, |
| "grad_norm": 73.55729675292969, |
| "learning_rate": 3.869255446951668e-06, |
| "loss": 0.3702, |
| "mean_token_accuracy": 0.8832436203956604, |
| "num_tokens": 53224148.0, |
| "step": 6555 |
| }, |
| { |
| "entropy": 0.9467785477638244, |
| "epoch": 3.70412196499153, |
| "grad_norm": 73.56095886230469, |
| "learning_rate": 3.8677287398577145e-06, |
| "loss": 0.3574, |
| "mean_token_accuracy": 0.8833266258239746, |
| "num_tokens": 53264653.0, |
| "step": 6560 |
| }, |
| { |
| "entropy": 0.9588563442230225, |
| "epoch": 3.706945228684359, |
| "grad_norm": 68.61029815673828, |
| "learning_rate": 3.86620144846787e-06, |
| "loss": 0.3552, |
| "mean_token_accuracy": 0.8847254753112793, |
| "num_tokens": 53305136.0, |
| "step": 6565 |
| }, |
| { |
| "entropy": 1.008836305141449, |
| "epoch": 3.709768492377188, |
| "grad_norm": 85.92555236816406, |
| "learning_rate": 3.8646735739836375e-06, |
| "loss": 0.3376, |
| "mean_token_accuracy": 0.8911909461021423, |
| "num_tokens": 53345833.0, |
| "step": 6570 |
| }, |
| { |
| "entropy": 1.0201958298683167, |
| "epoch": 3.712591756070017, |
| "grad_norm": 71.72399139404297, |
| "learning_rate": 3.863145117606976e-06, |
| "loss": 0.3754, |
| "mean_token_accuracy": 0.8834819436073303, |
| "num_tokens": 53386385.0, |
| "step": 6575 |
| }, |
| { |
| "entropy": 1.042575967311859, |
| "epoch": 3.7154150197628457, |
| "grad_norm": 82.75652313232422, |
| "learning_rate": 3.861616080540303e-06, |
| "loss": 0.3696, |
| "mean_token_accuracy": 0.8814401984214782, |
| "num_tokens": 53427047.0, |
| "step": 6580 |
| }, |
| { |
| "entropy": 1.0726915359497071, |
| "epoch": 3.718238283455675, |
| "grad_norm": 70.38341522216797, |
| "learning_rate": 3.860086463986496e-06, |
| "loss": 0.3413, |
| "mean_token_accuracy": 0.8903828144073487, |
| "num_tokens": 53467848.0, |
| "step": 6585 |
| }, |
| { |
| "entropy": 1.0652672290802, |
| "epoch": 3.7210615471485036, |
| "grad_norm": 74.48110961914062, |
| "learning_rate": 3.858556269148885e-06, |
| "loss": 0.3531, |
| "mean_token_accuracy": 0.8858086943626404, |
| "num_tokens": 53508723.0, |
| "step": 6590 |
| }, |
| { |
| "entropy": 1.1102969884872436, |
| "epoch": 3.7238848108413327, |
| "grad_norm": 93.84978485107422, |
| "learning_rate": 3.857025497231258e-06, |
| "loss": 0.3513, |
| "mean_token_accuracy": 0.8857390761375428, |
| "num_tokens": 53549279.0, |
| "step": 6595 |
| }, |
| { |
| "entropy": 0.9921191811561585, |
| "epoch": 3.7267080745341614, |
| "grad_norm": 66.51078033447266, |
| "learning_rate": 3.855494149437853e-06, |
| "loss": 0.3423, |
| "mean_token_accuracy": 0.8900970578193664, |
| "num_tokens": 53589987.0, |
| "step": 6600 |
| }, |
| { |
| "entropy": 0.9519596576690674, |
| "epoch": 3.7295313382269906, |
| "grad_norm": 85.44746398925781, |
| "learning_rate": 3.853962226973364e-06, |
| "loss": 0.3494, |
| "mean_token_accuracy": 0.8887536764144898, |
| "num_tokens": 53630587.0, |
| "step": 6605 |
| }, |
| { |
| "entropy": 0.9617591619491577, |
| "epoch": 3.7323546019198193, |
| "grad_norm": 76.92646026611328, |
| "learning_rate": 3.852429731042936e-06, |
| "loss": 0.3437, |
| "mean_token_accuracy": 0.8902799129486084, |
| "num_tokens": 53671322.0, |
| "step": 6610 |
| }, |
| { |
| "entropy": 1.070969033241272, |
| "epoch": 3.7351778656126484, |
| "grad_norm": 100.68677520751953, |
| "learning_rate": 3.850896662852165e-06, |
| "loss": 0.3558, |
| "mean_token_accuracy": 0.8854732871055603, |
| "num_tokens": 53711800.0, |
| "step": 6615 |
| }, |
| { |
| "entropy": 1.0809642195701599, |
| "epoch": 3.738001129305477, |
| "grad_norm": 75.52942657470703, |
| "learning_rate": 3.8493630236070975e-06, |
| "loss": 0.366, |
| "mean_token_accuracy": 0.8825573921203613, |
| "num_tokens": 53752357.0, |
| "step": 6620 |
| }, |
| { |
| "entropy": 1.0825310945510864, |
| "epoch": 3.740824392998306, |
| "grad_norm": 79.78958892822266, |
| "learning_rate": 3.847828814514231e-06, |
| "loss": 0.3664, |
| "mean_token_accuracy": 0.8812973737716675, |
| "num_tokens": 53793065.0, |
| "step": 6625 |
| }, |
| { |
| "entropy": 0.9908598780632019, |
| "epoch": 3.743647656691135, |
| "grad_norm": 77.1722412109375, |
| "learning_rate": 3.846294036780508e-06, |
| "loss": 0.3412, |
| "mean_token_accuracy": 0.8887288570404053, |
| "num_tokens": 53833798.0, |
| "step": 6630 |
| }, |
| { |
| "entropy": 1.1215206027030944, |
| "epoch": 3.746470920383964, |
| "grad_norm": 78.24017333984375, |
| "learning_rate": 3.84475869161332e-06, |
| "loss": 0.3539, |
| "mean_token_accuracy": 0.8852620720863342, |
| "num_tokens": 53874604.0, |
| "step": 6635 |
| }, |
| { |
| "entropy": 1.0309231758117676, |
| "epoch": 3.749294184076793, |
| "grad_norm": 76.8600082397461, |
| "learning_rate": 3.8432227802205055e-06, |
| "loss": 0.34, |
| "mean_token_accuracy": 0.8884881615638733, |
| "num_tokens": 53915339.0, |
| "step": 6640 |
| }, |
| { |
| "entropy": 1.0157047390937806, |
| "epoch": 3.7521174477696215, |
| "grad_norm": 67.16350555419922, |
| "learning_rate": 3.841686303810347e-06, |
| "loss": 0.3666, |
| "mean_token_accuracy": 0.8807543754577637, |
| "num_tokens": 53956009.0, |
| "step": 6645 |
| }, |
| { |
| "entropy": 1.0168555617332458, |
| "epoch": 3.7549407114624507, |
| "grad_norm": 71.06246948242188, |
| "learning_rate": 3.840149263591573e-06, |
| "loss": 0.3339, |
| "mean_token_accuracy": 0.8932982087135315, |
| "num_tokens": 53996726.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 1.0992830872535706, |
| "epoch": 3.7577639751552794, |
| "grad_norm": 76.54785919189453, |
| "learning_rate": 3.838611660773355e-06, |
| "loss": 0.3729, |
| "mean_token_accuracy": 0.87835294008255, |
| "num_tokens": 54036856.0, |
| "step": 6655 |
| }, |
| { |
| "entropy": 1.0346566915512085, |
| "epoch": 3.7605872388481085, |
| "grad_norm": 82.71453857421875, |
| "learning_rate": 3.837073496565307e-06, |
| "loss": 0.3762, |
| "mean_token_accuracy": 0.8808693885803223, |
| "num_tokens": 54077601.0, |
| "step": 6660 |
| }, |
| { |
| "entropy": 1.0158017158508301, |
| "epoch": 3.7634105025409372, |
| "grad_norm": 72.65914916992188, |
| "learning_rate": 3.8355347721774825e-06, |
| "loss": 0.3466, |
| "mean_token_accuracy": 0.8884905695915222, |
| "num_tokens": 54117960.0, |
| "step": 6665 |
| }, |
| { |
| "entropy": 1.0506127595901489, |
| "epoch": 3.7662337662337664, |
| "grad_norm": 73.87583923339844, |
| "learning_rate": 3.83399548882038e-06, |
| "loss": 0.3773, |
| "mean_token_accuracy": 0.8785854339599609, |
| "num_tokens": 54158767.0, |
| "step": 6670 |
| }, |
| { |
| "entropy": 1.0176648139953612, |
| "epoch": 3.769057029926595, |
| "grad_norm": 83.29283905029297, |
| "learning_rate": 3.832455647704934e-06, |
| "loss": 0.3899, |
| "mean_token_accuracy": 0.8775892972946167, |
| "num_tokens": 54199377.0, |
| "step": 6675 |
| }, |
| { |
| "entropy": 0.9590134263038635, |
| "epoch": 3.7718802936194242, |
| "grad_norm": 68.5479507446289, |
| "learning_rate": 3.83091525004252e-06, |
| "loss": 0.3286, |
| "mean_token_accuracy": 0.892620575428009, |
| "num_tokens": 54239593.0, |
| "step": 6680 |
| }, |
| { |
| "entropy": 1.0638235330581665, |
| "epoch": 3.774703557312253, |
| "grad_norm": 69.84130859375, |
| "learning_rate": 3.8293742970449516e-06, |
| "loss": 0.3703, |
| "mean_token_accuracy": 0.8817868709564209, |
| "num_tokens": 54280310.0, |
| "step": 6685 |
| }, |
| { |
| "entropy": 1.046445870399475, |
| "epoch": 3.7775268210050816, |
| "grad_norm": 76.98202514648438, |
| "learning_rate": 3.827832789924476e-06, |
| "loss": 0.3619, |
| "mean_token_accuracy": 0.8842405676841736, |
| "num_tokens": 54320974.0, |
| "step": 6690 |
| }, |
| { |
| "entropy": 1.0114327549934388, |
| "epoch": 3.780350084697911, |
| "grad_norm": 73.97267150878906, |
| "learning_rate": 3.8262907298937805e-06, |
| "loss": 0.3374, |
| "mean_token_accuracy": 0.891106104850769, |
| "num_tokens": 54361592.0, |
| "step": 6695 |
| }, |
| { |
| "entropy": 0.9996057510375976, |
| "epoch": 3.78317334839074, |
| "grad_norm": 70.08989715576172, |
| "learning_rate": 3.824748118165984e-06, |
| "loss": 0.3558, |
| "mean_token_accuracy": 0.885913097858429, |
| "num_tokens": 54402267.0, |
| "step": 6700 |
| }, |
| { |
| "entropy": 0.9400599360466003, |
| "epoch": 3.7859966120835686, |
| "grad_norm": 75.78137969970703, |
| "learning_rate": 3.823204955954642e-06, |
| "loss": 0.363, |
| "mean_token_accuracy": 0.8856919765472412, |
| "num_tokens": 54442692.0, |
| "step": 6705 |
| }, |
| { |
| "entropy": 0.9907450199127197, |
| "epoch": 3.7888198757763973, |
| "grad_norm": 69.72572326660156, |
| "learning_rate": 3.821661244473741e-06, |
| "loss": 0.3784, |
| "mean_token_accuracy": 0.8794691681861877, |
| "num_tokens": 54483180.0, |
| "step": 6710 |
| }, |
| { |
| "entropy": 1.0102770328521729, |
| "epoch": 3.7916431394692265, |
| "grad_norm": 63.0318489074707, |
| "learning_rate": 3.820116984937702e-06, |
| "loss": 0.3661, |
| "mean_token_accuracy": 0.886728823184967, |
| "num_tokens": 54523904.0, |
| "step": 6715 |
| }, |
| { |
| "entropy": 0.9674598574638367, |
| "epoch": 3.794466403162055, |
| "grad_norm": 86.56275939941406, |
| "learning_rate": 3.8185721785613735e-06, |
| "loss": 0.3531, |
| "mean_token_accuracy": 0.8867643713951111, |
| "num_tokens": 54564607.0, |
| "step": 6720 |
| }, |
| { |
| "entropy": 1.0311317205429078, |
| "epoch": 3.7972896668548843, |
| "grad_norm": 84.28353118896484, |
| "learning_rate": 3.817026826560038e-06, |
| "loss": 0.3571, |
| "mean_token_accuracy": 0.8858803272247314, |
| "num_tokens": 54604553.0, |
| "step": 6725 |
| }, |
| { |
| "entropy": 1.0266733169555664, |
| "epoch": 3.800112930547713, |
| "grad_norm": 82.10366821289062, |
| "learning_rate": 3.815480930149404e-06, |
| "loss": 0.3257, |
| "mean_token_accuracy": 0.8958070278167725, |
| "num_tokens": 54644926.0, |
| "step": 6730 |
| }, |
| { |
| "entropy": 1.1459343433380127, |
| "epoch": 3.802936194240542, |
| "grad_norm": 71.1697006225586, |
| "learning_rate": 3.8139344905456116e-06, |
| "loss": 0.4013, |
| "mean_token_accuracy": 0.8718471527099609, |
| "num_tokens": 54685606.0, |
| "step": 6735 |
| }, |
| { |
| "entropy": 0.952794349193573, |
| "epoch": 3.805759457933371, |
| "grad_norm": 67.65644836425781, |
| "learning_rate": 3.8123875089652264e-06, |
| "loss": 0.3409, |
| "mean_token_accuracy": 0.8891062021255494, |
| "num_tokens": 54725844.0, |
| "step": 6740 |
| }, |
| { |
| "entropy": 1.0092663884162902, |
| "epoch": 3.8085827216262, |
| "grad_norm": 85.99578857421875, |
| "learning_rate": 3.8108399866252386e-06, |
| "loss": 0.3458, |
| "mean_token_accuracy": 0.8878180980682373, |
| "num_tokens": 54766418.0, |
| "step": 6745 |
| }, |
| { |
| "entropy": 1.0750229239463807, |
| "epoch": 3.8114059853190287, |
| "grad_norm": 70.52570343017578, |
| "learning_rate": 3.809291924743068e-06, |
| "loss": 0.3733, |
| "mean_token_accuracy": 0.8812821865081787, |
| "num_tokens": 54807117.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 1.0017638206481934, |
| "epoch": 3.8142292490118574, |
| "grad_norm": 86.5273666381836, |
| "learning_rate": 3.807743324536556e-06, |
| "loss": 0.3594, |
| "mean_token_accuracy": 0.8835506319999695, |
| "num_tokens": 54847916.0, |
| "step": 6755 |
| }, |
| { |
| "entropy": 1.0813376545906066, |
| "epoch": 3.8170525127046866, |
| "grad_norm": 83.56548309326172, |
| "learning_rate": 3.806194187223966e-06, |
| "loss": 0.3478, |
| "mean_token_accuracy": 0.8874451518058777, |
| "num_tokens": 54888714.0, |
| "step": 6760 |
| }, |
| { |
| "entropy": 1.1086195468902589, |
| "epoch": 3.8198757763975157, |
| "grad_norm": 83.7651596069336, |
| "learning_rate": 3.804644514023988e-06, |
| "loss": 0.3848, |
| "mean_token_accuracy": 0.8768295884132385, |
| "num_tokens": 54929298.0, |
| "step": 6765 |
| }, |
| { |
| "entropy": 1.1223967313766479, |
| "epoch": 3.8226990400903444, |
| "grad_norm": 78.98551940917969, |
| "learning_rate": 3.803094306155731e-06, |
| "loss": 0.359, |
| "mean_token_accuracy": 0.8833388090133667, |
| "num_tokens": 54969964.0, |
| "step": 6770 |
| }, |
| { |
| "entropy": 1.0287705659866333, |
| "epoch": 3.825522303783173, |
| "grad_norm": 77.01542663574219, |
| "learning_rate": 3.8015435648387257e-06, |
| "loss": 0.3612, |
| "mean_token_accuracy": 0.8854536652565003, |
| "num_tokens": 55010341.0, |
| "step": 6775 |
| }, |
| { |
| "entropy": 1.0934155106544494, |
| "epoch": 3.8283455674760023, |
| "grad_norm": 74.22586822509766, |
| "learning_rate": 3.7999922912929206e-06, |
| "loss": 0.3537, |
| "mean_token_accuracy": 0.8861262202262878, |
| "num_tokens": 55051089.0, |
| "step": 6780 |
| }, |
| { |
| "entropy": 1.0119221925735473, |
| "epoch": 3.8311688311688314, |
| "grad_norm": 81.14582824707031, |
| "learning_rate": 3.7984404867386848e-06, |
| "loss": 0.3752, |
| "mean_token_accuracy": 0.8784032344818116, |
| "num_tokens": 55091717.0, |
| "step": 6785 |
| }, |
| { |
| "entropy": 1.0168304681777953, |
| "epoch": 3.83399209486166, |
| "grad_norm": 63.245872497558594, |
| "learning_rate": 3.7968881523968047e-06, |
| "loss": 0.3558, |
| "mean_token_accuracy": 0.8880203247070313, |
| "num_tokens": 55132472.0, |
| "step": 6790 |
| }, |
| { |
| "entropy": 1.0747297167778016, |
| "epoch": 3.836815358554489, |
| "grad_norm": 89.36796569824219, |
| "learning_rate": 3.795335289488484e-06, |
| "loss": 0.3771, |
| "mean_token_accuracy": 0.8779423356056213, |
| "num_tokens": 55172730.0, |
| "step": 6795 |
| }, |
| { |
| "entropy": 1.177517795562744, |
| "epoch": 3.839638622247318, |
| "grad_norm": 85.73871612548828, |
| "learning_rate": 3.79378189923534e-06, |
| "loss": 0.3572, |
| "mean_token_accuracy": 0.8844528555870056, |
| "num_tokens": 55213564.0, |
| "step": 6800 |
| }, |
| { |
| "entropy": 1.1382277488708497, |
| "epoch": 3.8424618859401467, |
| "grad_norm": 78.42530822753906, |
| "learning_rate": 3.7922279828594076e-06, |
| "loss": 0.3779, |
| "mean_token_accuracy": 0.8786744832992553, |
| "num_tokens": 55253742.0, |
| "step": 6805 |
| }, |
| { |
| "entropy": 1.0296044111251832, |
| "epoch": 3.845285149632976, |
| "grad_norm": 81.2005386352539, |
| "learning_rate": 3.7906735415831344e-06, |
| "loss": 0.355, |
| "mean_token_accuracy": 0.8866505265235901, |
| "num_tokens": 55294435.0, |
| "step": 6810 |
| }, |
| { |
| "entropy": 0.9833127617835998, |
| "epoch": 3.8481084133258046, |
| "grad_norm": 77.91568756103516, |
| "learning_rate": 3.7891185766293797e-06, |
| "loss": 0.3565, |
| "mean_token_accuracy": 0.8863444924354553, |
| "num_tokens": 55335023.0, |
| "step": 6815 |
| }, |
| { |
| "entropy": 1.0741397500038148, |
| "epoch": 3.8509316770186337, |
| "grad_norm": 83.5990982055664, |
| "learning_rate": 3.7875630892214167e-06, |
| "loss": 0.3577, |
| "mean_token_accuracy": 0.883382785320282, |
| "num_tokens": 55375580.0, |
| "step": 6820 |
| }, |
| { |
| "entropy": 1.1074389219284058, |
| "epoch": 3.8537549407114624, |
| "grad_norm": 84.55143737792969, |
| "learning_rate": 3.7860070805829295e-06, |
| "loss": 0.3824, |
| "mean_token_accuracy": 0.8763737678527832, |
| "num_tokens": 55416275.0, |
| "step": 6825 |
| }, |
| { |
| "entropy": 1.0039127230644227, |
| "epoch": 3.8565782044042916, |
| "grad_norm": 73.82699584960938, |
| "learning_rate": 3.784450551938011e-06, |
| "loss": 0.3623, |
| "mean_token_accuracy": 0.8839982032775879, |
| "num_tokens": 55457007.0, |
| "step": 6830 |
| }, |
| { |
| "entropy": 1.0227969527244567, |
| "epoch": 3.8594014680971203, |
| "grad_norm": 73.1678466796875, |
| "learning_rate": 3.782893504511164e-06, |
| "loss": 0.3556, |
| "mean_token_accuracy": 0.8841433644294738, |
| "num_tokens": 55497737.0, |
| "step": 6835 |
| }, |
| { |
| "entropy": 1.0409348011016846, |
| "epoch": 3.862224731789949, |
| "grad_norm": 78.6789321899414, |
| "learning_rate": 3.7813359395272998e-06, |
| "loss": 0.3914, |
| "mean_token_accuracy": 0.8746375679969788, |
| "num_tokens": 55538169.0, |
| "step": 6840 |
| }, |
| { |
| "entropy": 1.0004459023475647, |
| "epoch": 3.865047995482778, |
| "grad_norm": 76.7342529296875, |
| "learning_rate": 3.779777858211735e-06, |
| "loss": 0.3844, |
| "mean_token_accuracy": 0.8788429498672485, |
| "num_tokens": 55578708.0, |
| "step": 6845 |
| }, |
| { |
| "entropy": 1.0666333556175231, |
| "epoch": 3.8678712591756073, |
| "grad_norm": 74.42332458496094, |
| "learning_rate": 3.778219261790194e-06, |
| "loss": 0.3577, |
| "mean_token_accuracy": 0.8858496069908142, |
| "num_tokens": 55619553.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 1.0770619630813598, |
| "epoch": 3.870694522868436, |
| "grad_norm": 78.50506591796875, |
| "learning_rate": 3.776660151488807e-06, |
| "loss": 0.3836, |
| "mean_token_accuracy": 0.8765832185745239, |
| "num_tokens": 55660426.0, |
| "step": 6855 |
| }, |
| { |
| "entropy": 1.1385752201080321, |
| "epoch": 3.8735177865612647, |
| "grad_norm": 74.58927917480469, |
| "learning_rate": 3.775100528534107e-06, |
| "loss": 0.3897, |
| "mean_token_accuracy": 0.877284836769104, |
| "num_tokens": 55700919.0, |
| "step": 6860 |
| }, |
| { |
| "entropy": 1.0381620287895204, |
| "epoch": 3.876341050254094, |
| "grad_norm": 76.63037872314453, |
| "learning_rate": 3.7735403941530306e-06, |
| "loss": 0.3586, |
| "mean_token_accuracy": 0.8844604849815368, |
| "num_tokens": 55741495.0, |
| "step": 6865 |
| }, |
| { |
| "entropy": 0.9769273281097413, |
| "epoch": 3.8791643139469225, |
| "grad_norm": 71.60755920410156, |
| "learning_rate": 3.7719797495729184e-06, |
| "loss": 0.3553, |
| "mean_token_accuracy": 0.8843575596809388, |
| "num_tokens": 55782200.0, |
| "step": 6870 |
| }, |
| { |
| "entropy": 1.0441035032272339, |
| "epoch": 3.8819875776397517, |
| "grad_norm": 75.18911743164062, |
| "learning_rate": 3.7704185960215096e-06, |
| "loss": 0.3899, |
| "mean_token_accuracy": 0.8751221537590027, |
| "num_tokens": 55822987.0, |
| "step": 6875 |
| }, |
| { |
| "entropy": 1.0410512208938598, |
| "epoch": 3.8848108413325804, |
| "grad_norm": 82.20500183105469, |
| "learning_rate": 3.7688569347269456e-06, |
| "loss": 0.3573, |
| "mean_token_accuracy": 0.8842342138290405, |
| "num_tokens": 55863486.0, |
| "step": 6880 |
| }, |
| { |
| "entropy": 1.07995103597641, |
| "epoch": 3.8876341050254095, |
| "grad_norm": 79.17044830322266, |
| "learning_rate": 3.7672947669177663e-06, |
| "loss": 0.3666, |
| "mean_token_accuracy": 0.8829922795295715, |
| "num_tokens": 55904162.0, |
| "step": 6885 |
| }, |
| { |
| "entropy": 1.071269142627716, |
| "epoch": 3.890457368718238, |
| "grad_norm": 80.15768432617188, |
| "learning_rate": 3.765732093822911e-06, |
| "loss": 0.3987, |
| "mean_token_accuracy": 0.8745517253875732, |
| "num_tokens": 55944636.0, |
| "step": 6890 |
| }, |
| { |
| "entropy": 1.0229296922683715, |
| "epoch": 3.8932806324110674, |
| "grad_norm": 75.17719268798828, |
| "learning_rate": 3.7641689166717164e-06, |
| "loss": 0.3728, |
| "mean_token_accuracy": 0.8799932956695556, |
| "num_tokens": 55985078.0, |
| "step": 6895 |
| }, |
| { |
| "entropy": 1.017443561553955, |
| "epoch": 3.896103896103896, |
| "grad_norm": 81.72650909423828, |
| "learning_rate": 3.7626052366939154e-06, |
| "loss": 0.3439, |
| "mean_token_accuracy": 0.8882031202316284, |
| "num_tokens": 56025710.0, |
| "step": 6900 |
| }, |
| { |
| "entropy": 0.9397255301475524, |
| "epoch": 3.8989271597967248, |
| "grad_norm": 80.24227142333984, |
| "learning_rate": 3.7610410551196362e-06, |
| "loss": 0.3506, |
| "mean_token_accuracy": 0.8859299421310425, |
| "num_tokens": 56066297.0, |
| "step": 6905 |
| }, |
| { |
| "entropy": 1.0021834969520569, |
| "epoch": 3.901750423489554, |
| "grad_norm": 68.56148529052734, |
| "learning_rate": 3.7594763731794015e-06, |
| "loss": 0.3871, |
| "mean_token_accuracy": 0.8764629125595093, |
| "num_tokens": 56106729.0, |
| "step": 6910 |
| }, |
| { |
| "entropy": 0.9779050350189209, |
| "epoch": 3.904573687182383, |
| "grad_norm": 72.72400665283203, |
| "learning_rate": 3.7579111921041287e-06, |
| "loss": 0.358, |
| "mean_token_accuracy": 0.8842010378837586, |
| "num_tokens": 56147418.0, |
| "step": 6915 |
| }, |
| { |
| "entropy": 1.0261463880538941, |
| "epoch": 3.9073969508752118, |
| "grad_norm": 84.04011535644531, |
| "learning_rate": 3.756345513125128e-06, |
| "loss": 0.3676, |
| "mean_token_accuracy": 0.8818102598190307, |
| "num_tokens": 56188071.0, |
| "step": 6920 |
| }, |
| { |
| "entropy": 1.0177555203437805, |
| "epoch": 3.9102202145680405, |
| "grad_norm": 86.4022216796875, |
| "learning_rate": 3.7547793374740987e-06, |
| "loss": 0.3871, |
| "mean_token_accuracy": 0.8769708275794983, |
| "num_tokens": 56228901.0, |
| "step": 6925 |
| }, |
| { |
| "entropy": 1.0417796850204468, |
| "epoch": 3.9130434782608696, |
| "grad_norm": 89.51969909667969, |
| "learning_rate": 3.7532126663831337e-06, |
| "loss": 0.364, |
| "mean_token_accuracy": 0.881694233417511, |
| "num_tokens": 56269695.0, |
| "step": 6930 |
| }, |
| { |
| "entropy": 0.9011264562606811, |
| "epoch": 3.9158667419536983, |
| "grad_norm": 70.96135711669922, |
| "learning_rate": 3.7516455010847135e-06, |
| "loss": 0.3469, |
| "mean_token_accuracy": 0.886352801322937, |
| "num_tokens": 56310472.0, |
| "step": 6935 |
| }, |
| { |
| "entropy": 0.9600812792778015, |
| "epoch": 3.9186900056465275, |
| "grad_norm": 72.96127319335938, |
| "learning_rate": 3.7500778428117097e-06, |
| "loss": 0.3695, |
| "mean_token_accuracy": 0.8812779784202576, |
| "num_tokens": 56351272.0, |
| "step": 6940 |
| }, |
| { |
| "entropy": 0.9371390104293823, |
| "epoch": 3.921513269339356, |
| "grad_norm": 61.435157775878906, |
| "learning_rate": 3.7485096927973797e-06, |
| "loss": 0.3528, |
| "mean_token_accuracy": 0.8869521021842957, |
| "num_tokens": 56391825.0, |
| "step": 6945 |
| }, |
| { |
| "entropy": 1.0107155442237854, |
| "epoch": 3.9243365330321853, |
| "grad_norm": 71.20367431640625, |
| "learning_rate": 3.746941052275369e-06, |
| "loss": 0.3762, |
| "mean_token_accuracy": 0.8818397402763367, |
| "num_tokens": 56432358.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 0.9440315723419189, |
| "epoch": 3.927159796725014, |
| "grad_norm": 90.81354522705078, |
| "learning_rate": 3.7453719224797084e-06, |
| "loss": 0.3582, |
| "mean_token_accuracy": 0.8828841805458069, |
| "num_tokens": 56472999.0, |
| "step": 6955 |
| }, |
| { |
| "entropy": 1.0316729187965392, |
| "epoch": 3.929983060417843, |
| "grad_norm": 81.88134002685547, |
| "learning_rate": 3.743802304644814e-06, |
| "loss": 0.3741, |
| "mean_token_accuracy": 0.8808148026466369, |
| "num_tokens": 56513580.0, |
| "step": 6960 |
| }, |
| { |
| "entropy": 1.0075525403022767, |
| "epoch": 3.932806324110672, |
| "grad_norm": 71.31327056884766, |
| "learning_rate": 3.7422322000054844e-06, |
| "loss": 0.3557, |
| "mean_token_accuracy": 0.8877139568328858, |
| "num_tokens": 56554082.0, |
| "step": 6965 |
| }, |
| { |
| "entropy": 1.0570897817611695, |
| "epoch": 3.9356295878035006, |
| "grad_norm": 77.34210205078125, |
| "learning_rate": 3.7406616097969034e-06, |
| "loss": 0.3757, |
| "mean_token_accuracy": 0.8809595227241516, |
| "num_tokens": 56594775.0, |
| "step": 6970 |
| }, |
| { |
| "entropy": 0.9789751529693603, |
| "epoch": 3.9384528514963297, |
| "grad_norm": 79.00511169433594, |
| "learning_rate": 3.7390905352546346e-06, |
| "loss": 0.3426, |
| "mean_token_accuracy": 0.8881513237953186, |
| "num_tokens": 56635303.0, |
| "step": 6975 |
| }, |
| { |
| "entropy": 1.0750985145568848, |
| "epoch": 3.941276115189159, |
| "grad_norm": 83.38872528076172, |
| "learning_rate": 3.7375189776146252e-06, |
| "loss": 0.3875, |
| "mean_token_accuracy": 0.8767378211021424, |
| "num_tokens": 56676033.0, |
| "step": 6980 |
| }, |
| { |
| "entropy": 1.1415322065353393, |
| "epoch": 3.9440993788819876, |
| "grad_norm": 79.72837829589844, |
| "learning_rate": 3.7359469381132008e-06, |
| "loss": 0.4027, |
| "mean_token_accuracy": 0.8735546827316284, |
| "num_tokens": 56716838.0, |
| "step": 6985 |
| }, |
| { |
| "entropy": 1.1199346899986267, |
| "epoch": 3.9469226425748163, |
| "grad_norm": 82.7437973022461, |
| "learning_rate": 3.734374417987065e-06, |
| "loss": 0.3539, |
| "mean_token_accuracy": 0.8858396887779236, |
| "num_tokens": 56757396.0, |
| "step": 6990 |
| }, |
| { |
| "entropy": 0.9379915833473206, |
| "epoch": 3.9497459062676454, |
| "grad_norm": 80.79170989990234, |
| "learning_rate": 3.7328014184733008e-06, |
| "loss": 0.3453, |
| "mean_token_accuracy": 0.8876606464385987, |
| "num_tokens": 56798125.0, |
| "step": 6995 |
| }, |
| { |
| "entropy": 1.0453456044197083, |
| "epoch": 3.9525691699604746, |
| "grad_norm": 80.65121459960938, |
| "learning_rate": 3.7312279408093693e-06, |
| "loss": 0.3801, |
| "mean_token_accuracy": 0.8763364791870117, |
| "num_tokens": 56838798.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.9525691699604746, |
| "eval_entropy": 1.234108328819275, |
| "eval_loss": 0.19375726580619812, |
| "eval_mean_token_accuracy": 0.9425987362861633, |
| "eval_num_tokens": 56838798.0, |
| "eval_runtime": 2.4515, |
| "eval_samples_per_second": 15.909, |
| "eval_steps_per_second": 2.04, |
| "step": 7000 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 17710, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2391467057653576e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|