finetune-phase1-7000 / trainer_state.json
vera6's picture
Upload folder using huggingface_hub
f6efee7 verified
{
"best_global_step": 7000,
"best_metric": 0.19375726580619812,
"best_model_checkpoint": "./sft_model/checkpoint-7000",
"epoch": 3.9525691699604746,
"eval_steps": 500,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.6964506268501283,
"epoch": 0.00282326369282891,
"grad_norm": 1879.6842041015625,
"learning_rate": 4.999999527987105e-06,
"loss": 3.1196,
"mean_token_accuracy": 0.5124428987503051,
"num_tokens": 40784.0,
"step": 5
},
{
"entropy": 1.8763649463653564,
"epoch": 0.00564652738565782,
"grad_norm": 538.1932983398438,
"learning_rate": 4.999997610435124e-06,
"loss": 1.8844,
"mean_token_accuracy": 0.5794065773487092,
"num_tokens": 81605.0,
"step": 10
},
{
"entropy": 1.5137803554534912,
"epoch": 0.00846979107848673,
"grad_norm": 298.7164611816406,
"learning_rate": 4.99999421784476e-06,
"loss": 1.3327,
"mean_token_accuracy": 0.6726739525794982,
"num_tokens": 122393.0,
"step": 15
},
{
"entropy": 1.8568676948547362,
"epoch": 0.01129305477131564,
"grad_norm": 846.4222412109375,
"learning_rate": 4.9999893502186794e-06,
"loss": 1.7569,
"mean_token_accuracy": 0.6106071174144745,
"num_tokens": 162845.0,
"step": 20
},
{
"entropy": 1.479284644126892,
"epoch": 0.014116318464144552,
"grad_norm": 303.95062255859375,
"learning_rate": 4.999983007560715e-06,
"loss": 1.2604,
"mean_token_accuracy": 0.6862282872200012,
"num_tokens": 203476.0,
"step": 25
},
{
"entropy": 1.3392301559448243,
"epoch": 0.01693958215697346,
"grad_norm": 218.93763732910156,
"learning_rate": 4.999975189875853e-06,
"loss": 1.1434,
"mean_token_accuracy": 0.711875069141388,
"num_tokens": 244047.0,
"step": 30
},
{
"entropy": 1.3543478012084962,
"epoch": 0.019762845849802372,
"grad_norm": 238.82598876953125,
"learning_rate": 4.999965897170247e-06,
"loss": 1.1883,
"mean_token_accuracy": 0.7010170936584472,
"num_tokens": 284628.0,
"step": 35
},
{
"entropy": 1.4084251165390014,
"epoch": 0.02258610954263128,
"grad_norm": 258.6508483886719,
"learning_rate": 4.999955129451204e-06,
"loss": 1.2144,
"mean_token_accuracy": 0.6951952338218689,
"num_tokens": 325111.0,
"step": 40
},
{
"entropy": 1.4644812107086183,
"epoch": 0.025409373235460192,
"grad_norm": 236.89532470703125,
"learning_rate": 4.999942886727197e-06,
"loss": 1.1666,
"mean_token_accuracy": 0.7105009198188782,
"num_tokens": 365621.0,
"step": 45
},
{
"entropy": 1.4601025104522705,
"epoch": 0.028232636928289104,
"grad_norm": 234.15811157226562,
"learning_rate": 4.999929169007857e-06,
"loss": 1.2064,
"mean_token_accuracy": 0.7012509942054749,
"num_tokens": 406422.0,
"step": 50
},
{
"entropy": 1.385688352584839,
"epoch": 0.031055900621118012,
"grad_norm": 202.43612670898438,
"learning_rate": 4.999913976303975e-06,
"loss": 1.168,
"mean_token_accuracy": 0.705574119091034,
"num_tokens": 447053.0,
"step": 55
},
{
"entropy": 1.329136037826538,
"epoch": 0.03387916431394692,
"grad_norm": 207.84481811523438,
"learning_rate": 4.9998973086275025e-06,
"loss": 1.0843,
"mean_token_accuracy": 0.7196493268013,
"num_tokens": 487480.0,
"step": 60
},
{
"entropy": 1.398281502723694,
"epoch": 0.03670242800677583,
"grad_norm": 174.87757873535156,
"learning_rate": 4.999879165991553e-06,
"loss": 1.1706,
"mean_token_accuracy": 0.7041667461395263,
"num_tokens": 528088.0,
"step": 65
},
{
"entropy": 1.43794903755188,
"epoch": 0.039525691699604744,
"grad_norm": 196.69613647460938,
"learning_rate": 4.999859548410398e-06,
"loss": 1.1689,
"mean_token_accuracy": 0.7039241075515748,
"num_tokens": 568747.0,
"step": 70
},
{
"entropy": 1.3418872117996217,
"epoch": 0.042348955392433656,
"grad_norm": 174.0625457763672,
"learning_rate": 4.999838455899471e-06,
"loss": 1.0773,
"mean_token_accuracy": 0.723293948173523,
"num_tokens": 609459.0,
"step": 75
},
{
"entropy": 1.4742592096328735,
"epoch": 0.04517221908526256,
"grad_norm": 193.97073364257812,
"learning_rate": 4.999815888475366e-06,
"loss": 1.1699,
"mean_token_accuracy": 0.7035363435745239,
"num_tokens": 650270.0,
"step": 80
},
{
"entropy": 1.397952675819397,
"epoch": 0.04799548277809147,
"grad_norm": 191.52386474609375,
"learning_rate": 4.999791846155835e-06,
"loss": 1.1596,
"mean_token_accuracy": 0.7045308589935303,
"num_tokens": 690754.0,
"step": 85
},
{
"entropy": 1.3352545976638794,
"epoch": 0.050818746470920384,
"grad_norm": 181.89520263671875,
"learning_rate": 4.999766328959792e-06,
"loss": 1.1273,
"mean_token_accuracy": 0.7095789313316345,
"num_tokens": 731281.0,
"step": 90
},
{
"entropy": 1.2845103740692139,
"epoch": 0.053642010163749296,
"grad_norm": 195.17857360839844,
"learning_rate": 4.999739336907312e-06,
"loss": 1.0823,
"mean_token_accuracy": 0.7215634346008301,
"num_tokens": 770817.0,
"step": 95
},
{
"entropy": 1.2769271850585937,
"epoch": 0.05646527385657821,
"grad_norm": 177.11135864257812,
"learning_rate": 4.999710870019629e-06,
"loss": 1.0288,
"mean_token_accuracy": 0.7328497529029846,
"num_tokens": 811654.0,
"step": 100
},
{
"entropy": 1.3198055982589723,
"epoch": 0.05928853754940711,
"grad_norm": 183.05325317382812,
"learning_rate": 4.9996809283191375e-06,
"loss": 1.1065,
"mean_token_accuracy": 0.7168498754501342,
"num_tokens": 852176.0,
"step": 105
},
{
"entropy": 1.2998111486434936,
"epoch": 0.062111801242236024,
"grad_norm": 173.55929565429688,
"learning_rate": 4.999649511829392e-06,
"loss": 1.0804,
"mean_token_accuracy": 0.7216156601905823,
"num_tokens": 892923.0,
"step": 110
},
{
"entropy": 1.3105514764785766,
"epoch": 0.06493506493506493,
"grad_norm": 231.638427734375,
"learning_rate": 4.9996166205751075e-06,
"loss": 1.091,
"mean_token_accuracy": 0.7160016298294067,
"num_tokens": 933390.0,
"step": 115
},
{
"entropy": 1.3287595510482788,
"epoch": 0.06775832862789384,
"grad_norm": 195.63812255859375,
"learning_rate": 4.9995822545821596e-06,
"loss": 1.113,
"mean_token_accuracy": 0.7149634003639221,
"num_tokens": 974070.0,
"step": 120
},
{
"entropy": 1.363778567314148,
"epoch": 0.07058159232072275,
"grad_norm": 162.81434631347656,
"learning_rate": 4.999546413877584e-06,
"loss": 1.0972,
"mean_token_accuracy": 0.7180424332618713,
"num_tokens": 1014869.0,
"step": 125
},
{
"entropy": 1.3109471559524537,
"epoch": 0.07340485601355166,
"grad_norm": 159.2827911376953,
"learning_rate": 4.999509098489574e-06,
"loss": 1.0542,
"mean_token_accuracy": 0.7276699781417847,
"num_tokens": 1055497.0,
"step": 130
},
{
"entropy": 1.3256555795669556,
"epoch": 0.07622811970638058,
"grad_norm": 165.77072143554688,
"learning_rate": 4.999470308447488e-06,
"loss": 1.1039,
"mean_token_accuracy": 0.7153335094451905,
"num_tokens": 1096257.0,
"step": 135
},
{
"entropy": 1.3324081659317017,
"epoch": 0.07905138339920949,
"grad_norm": 172.11192321777344,
"learning_rate": 4.99943004378184e-06,
"loss": 1.1212,
"mean_token_accuracy": 0.7077261924743652,
"num_tokens": 1136940.0,
"step": 140
},
{
"entropy": 1.1795243740081787,
"epoch": 0.0818746470920384,
"grad_norm": 158.4144287109375,
"learning_rate": 4.999388304524306e-06,
"loss": 0.9704,
"mean_token_accuracy": 0.7433805704116822,
"num_tokens": 1177564.0,
"step": 145
},
{
"entropy": 1.3331533193588256,
"epoch": 0.08469791078486731,
"grad_norm": 151.61849975585938,
"learning_rate": 4.999345090707721e-06,
"loss": 1.0556,
"mean_token_accuracy": 0.7271691083908081,
"num_tokens": 1218245.0,
"step": 150
},
{
"entropy": 1.3680028915405273,
"epoch": 0.08752117447769622,
"grad_norm": 155.4457244873047,
"learning_rate": 4.999300402366083e-06,
"loss": 1.0887,
"mean_token_accuracy": 0.7194394826889038,
"num_tokens": 1259004.0,
"step": 155
},
{
"entropy": 1.3031083345413208,
"epoch": 0.09034443817052512,
"grad_norm": 148.3616180419922,
"learning_rate": 4.999254239534546e-06,
"loss": 1.0329,
"mean_token_accuracy": 0.734139358997345,
"num_tokens": 1299704.0,
"step": 160
},
{
"entropy": 1.240130877494812,
"epoch": 0.09316770186335403,
"grad_norm": 158.9844970703125,
"learning_rate": 4.999206602249426e-06,
"loss": 0.9971,
"mean_token_accuracy": 0.7394698977470398,
"num_tokens": 1340262.0,
"step": 165
},
{
"entropy": 1.3012992858886718,
"epoch": 0.09599096555618294,
"grad_norm": 168.62123107910156,
"learning_rate": 4.999157490548199e-06,
"loss": 1.0457,
"mean_token_accuracy": 0.7285740613937378,
"num_tokens": 1380931.0,
"step": 170
},
{
"entropy": 1.380574369430542,
"epoch": 0.09881422924901186,
"grad_norm": 159.16758728027344,
"learning_rate": 4.999106904469501e-06,
"loss": 1.0219,
"mean_token_accuracy": 0.7346728801727295,
"num_tokens": 1421612.0,
"step": 175
},
{
"entropy": 1.352856206893921,
"epoch": 0.10163749294184077,
"grad_norm": 167.45785522460938,
"learning_rate": 4.999054844053126e-06,
"loss": 1.0948,
"mean_token_accuracy": 0.7178587436676025,
"num_tokens": 1462305.0,
"step": 180
},
{
"entropy": 1.2682753562927247,
"epoch": 0.10446075663466968,
"grad_norm": 141.07215881347656,
"learning_rate": 4.9990013093400315e-06,
"loss": 1.0059,
"mean_token_accuracy": 0.7368964791297913,
"num_tokens": 1502284.0,
"step": 185
},
{
"entropy": 1.2629669904708862,
"epoch": 0.10728402032749859,
"grad_norm": 175.1353302001953,
"learning_rate": 4.998946300372331e-06,
"loss": 0.9657,
"mean_token_accuracy": 0.7435346961021423,
"num_tokens": 1543011.0,
"step": 190
},
{
"entropy": 1.3042139530181884,
"epoch": 0.1101072840203275,
"grad_norm": 138.55645751953125,
"learning_rate": 4.998889817193298e-06,
"loss": 1.0102,
"mean_token_accuracy": 0.7376075625419617,
"num_tokens": 1583669.0,
"step": 195
},
{
"entropy": 1.3252872467041015,
"epoch": 0.11293054771315642,
"grad_norm": 328.7073669433594,
"learning_rate": 4.998831859847371e-06,
"loss": 1.0797,
"mean_token_accuracy": 0.718978488445282,
"num_tokens": 1624164.0,
"step": 200
},
{
"entropy": 1.3184748888015747,
"epoch": 0.11575381140598531,
"grad_norm": 140.2963104248047,
"learning_rate": 4.998772428380142e-06,
"loss": 1.0262,
"mean_token_accuracy": 0.7362207889556884,
"num_tokens": 1664679.0,
"step": 205
},
{
"entropy": 1.419853186607361,
"epoch": 0.11857707509881422,
"grad_norm": 146.3859100341797,
"learning_rate": 4.9987115228383654e-06,
"loss": 1.1075,
"mean_token_accuracy": 0.7169888734817504,
"num_tokens": 1705222.0,
"step": 210
},
{
"entropy": 1.3418590307235718,
"epoch": 0.12140033879164314,
"grad_norm": 165.43157958984375,
"learning_rate": 4.9986491432699544e-06,
"loss": 1.0783,
"mean_token_accuracy": 0.7195831775665283,
"num_tokens": 1745798.0,
"step": 215
},
{
"entropy": 1.4498067855834962,
"epoch": 0.12422360248447205,
"grad_norm": 168.71444702148438,
"learning_rate": 4.998585289723983e-06,
"loss": 1.0693,
"mean_token_accuracy": 0.728911018371582,
"num_tokens": 1786323.0,
"step": 220
},
{
"entropy": 1.3513702630996705,
"epoch": 0.12704686617730096,
"grad_norm": 141.62327575683594,
"learning_rate": 4.9985199622506835e-06,
"loss": 1.0801,
"mean_token_accuracy": 0.7193790435791015,
"num_tokens": 1827028.0,
"step": 225
},
{
"entropy": 1.1918700218200684,
"epoch": 0.12987012987012986,
"grad_norm": 147.93051147460938,
"learning_rate": 4.998453160901449e-06,
"loss": 0.9504,
"mean_token_accuracy": 0.7491826415061951,
"num_tokens": 1867817.0,
"step": 230
},
{
"entropy": 1.3269594192504883,
"epoch": 0.13269339356295878,
"grad_norm": 139.9858856201172,
"learning_rate": 4.99838488572883e-06,
"loss": 1.0753,
"mean_token_accuracy": 0.7226388096809387,
"num_tokens": 1908308.0,
"step": 235
},
{
"entropy": 1.3837014436721802,
"epoch": 0.13551665725578768,
"grad_norm": 139.34576416015625,
"learning_rate": 4.998315136786539e-06,
"loss": 1.0673,
"mean_token_accuracy": 0.7262170910835266,
"num_tokens": 1948948.0,
"step": 240
},
{
"entropy": 1.2679001569747925,
"epoch": 0.1383399209486166,
"grad_norm": 115.1655502319336,
"learning_rate": 4.998243914129446e-06,
"loss": 1.0151,
"mean_token_accuracy": 0.736802589893341,
"num_tokens": 1989686.0,
"step": 245
},
{
"entropy": 1.3858689785003662,
"epoch": 0.1411631846414455,
"grad_norm": 198.03237915039062,
"learning_rate": 4.99817121781358e-06,
"loss": 1.085,
"mean_token_accuracy": 0.7183008790016174,
"num_tokens": 2030103.0,
"step": 250
},
{
"entropy": 1.3685580015182495,
"epoch": 0.14398644833427443,
"grad_norm": 134.53562927246094,
"learning_rate": 4.998097047896133e-06,
"loss": 1.0412,
"mean_token_accuracy": 0.7265231609344482,
"num_tokens": 2070842.0,
"step": 255
},
{
"entropy": 1.3303281307220458,
"epoch": 0.14680971202710333,
"grad_norm": 144.44908142089844,
"learning_rate": 4.998021404435452e-06,
"loss": 1.0653,
"mean_token_accuracy": 0.7241496682167053,
"num_tokens": 2110742.0,
"step": 260
},
{
"entropy": 1.276755118370056,
"epoch": 0.14963297571993225,
"grad_norm": 139.25717163085938,
"learning_rate": 4.997944287491046e-06,
"loss": 0.9689,
"mean_token_accuracy": 0.7442183256149292,
"num_tokens": 2151458.0,
"step": 265
},
{
"entropy": 1.3310250043869019,
"epoch": 0.15245623941276115,
"grad_norm": 143.90011596679688,
"learning_rate": 4.997865697123579e-06,
"loss": 1.0313,
"mean_token_accuracy": 0.7334769487380981,
"num_tokens": 2192262.0,
"step": 270
},
{
"entropy": 1.3676940202713013,
"epoch": 0.15527950310559005,
"grad_norm": 154.62408447265625,
"learning_rate": 4.99778563339488e-06,
"loss": 1.0256,
"mean_token_accuracy": 0.7308215498924255,
"num_tokens": 2232853.0,
"step": 275
},
{
"entropy": 1.2920289278030395,
"epoch": 0.15810276679841898,
"grad_norm": 145.27914428710938,
"learning_rate": 4.997704096367933e-06,
"loss": 0.97,
"mean_token_accuracy": 0.748300063610077,
"num_tokens": 2273536.0,
"step": 280
},
{
"entropy": 1.3298648118972778,
"epoch": 0.16092603049124787,
"grad_norm": 127.40184783935547,
"learning_rate": 4.997621086106883e-06,
"loss": 1.0013,
"mean_token_accuracy": 0.7374109506607056,
"num_tokens": 2314211.0,
"step": 285
},
{
"entropy": 1.3977973222732545,
"epoch": 0.1637492941840768,
"grad_norm": 147.5937042236328,
"learning_rate": 4.997536602677031e-06,
"loss": 1.0307,
"mean_token_accuracy": 0.7304450035095215,
"num_tokens": 2355018.0,
"step": 290
},
{
"entropy": 1.5217002391815186,
"epoch": 0.1665725578769057,
"grad_norm": 148.3541259765625,
"learning_rate": 4.997450646144843e-06,
"loss": 1.1454,
"mean_token_accuracy": 0.7052842378616333,
"num_tokens": 2394833.0,
"step": 295
},
{
"entropy": 1.2209747552871704,
"epoch": 0.16939582156973462,
"grad_norm": 128.4432830810547,
"learning_rate": 4.997363216577937e-06,
"loss": 0.9285,
"mean_token_accuracy": 0.7535837411880493,
"num_tokens": 2435708.0,
"step": 300
},
{
"entropy": 1.3434046506881714,
"epoch": 0.17221908526256352,
"grad_norm": 120.20564270019531,
"learning_rate": 4.997274314045093e-06,
"loss": 1.0367,
"mean_token_accuracy": 0.7294707536697388,
"num_tokens": 2476300.0,
"step": 305
},
{
"entropy": 1.2988000869750977,
"epoch": 0.17504234895539245,
"grad_norm": 135.5797882080078,
"learning_rate": 4.9971839386162505e-06,
"loss": 1.008,
"mean_token_accuracy": 0.7358855485916138,
"num_tokens": 2517029.0,
"step": 310
},
{
"entropy": 1.2970911979675293,
"epoch": 0.17786561264822134,
"grad_norm": 127.42134094238281,
"learning_rate": 4.997092090362506e-06,
"loss": 0.9862,
"mean_token_accuracy": 0.7406257271766663,
"num_tokens": 2557648.0,
"step": 315
},
{
"entropy": 1.3222782850265502,
"epoch": 0.18068887634105024,
"grad_norm": 132.6872100830078,
"learning_rate": 4.996998769356116e-06,
"loss": 1.0062,
"mean_token_accuracy": 0.7342755556106567,
"num_tokens": 2598313.0,
"step": 320
},
{
"entropy": 1.3237468957901002,
"epoch": 0.18351214003387917,
"grad_norm": 115.4422378540039,
"learning_rate": 4.996903975670495e-06,
"loss": 1.0351,
"mean_token_accuracy": 0.7323682546615601,
"num_tokens": 2638879.0,
"step": 325
},
{
"entropy": 1.297943377494812,
"epoch": 0.18633540372670807,
"grad_norm": 142.4702911376953,
"learning_rate": 4.996807709380216e-06,
"loss": 0.9848,
"mean_token_accuracy": 0.7394223213195801,
"num_tokens": 2679280.0,
"step": 330
},
{
"entropy": 1.3547947645187377,
"epoch": 0.189158667419537,
"grad_norm": 142.83367919921875,
"learning_rate": 4.996709970561011e-06,
"loss": 1.0522,
"mean_token_accuracy": 0.7255360841751098,
"num_tokens": 2719742.0,
"step": 335
},
{
"entropy": 1.4270910263061523,
"epoch": 0.1919819311123659,
"grad_norm": 137.50009155273438,
"learning_rate": 4.996610759289769e-06,
"loss": 1.0668,
"mean_token_accuracy": 0.7272424578666687,
"num_tokens": 2760484.0,
"step": 340
},
{
"entropy": 1.3766878366470336,
"epoch": 0.19480519480519481,
"grad_norm": 130.84567260742188,
"learning_rate": 4.9965100756445385e-06,
"loss": 1.0055,
"mean_token_accuracy": 0.7384996533393859,
"num_tokens": 2801264.0,
"step": 345
},
{
"entropy": 1.306664514541626,
"epoch": 0.1976284584980237,
"grad_norm": 122.48930358886719,
"learning_rate": 4.996407919704527e-06,
"loss": 0.9605,
"mean_token_accuracy": 0.7464729905128479,
"num_tokens": 2842004.0,
"step": 350
},
{
"entropy": 1.4950592756271361,
"epoch": 0.20045172219085264,
"grad_norm": 145.43069458007812,
"learning_rate": 4.9963042915500966e-06,
"loss": 1.055,
"mean_token_accuracy": 0.7293013811111451,
"num_tokens": 2882520.0,
"step": 355
},
{
"entropy": 1.2889564514160157,
"epoch": 0.20327498588368154,
"grad_norm": 119.11095428466797,
"learning_rate": 4.996199191262775e-06,
"loss": 0.9282,
"mean_token_accuracy": 0.7534731745719909,
"num_tokens": 2923041.0,
"step": 360
},
{
"entropy": 1.2990325212478637,
"epoch": 0.20609824957651043,
"grad_norm": 128.93939208984375,
"learning_rate": 4.99609261892524e-06,
"loss": 0.9558,
"mean_token_accuracy": 0.7469653010368347,
"num_tokens": 2963601.0,
"step": 365
},
{
"entropy": 1.325394630432129,
"epoch": 0.20892151326933936,
"grad_norm": 127.29461669921875,
"learning_rate": 4.995984574621332e-06,
"loss": 0.9955,
"mean_token_accuracy": 0.7355307936668396,
"num_tokens": 3004286.0,
"step": 370
},
{
"entropy": 1.438259530067444,
"epoch": 0.21174477696216826,
"grad_norm": 152.51705932617188,
"learning_rate": 4.995875058436047e-06,
"loss": 1.0516,
"mean_token_accuracy": 0.729459798336029,
"num_tokens": 3045092.0,
"step": 375
},
{
"entropy": 1.2735749006271362,
"epoch": 0.21456804065499718,
"grad_norm": 167.75277709960938,
"learning_rate": 4.995764070455542e-06,
"loss": 0.9568,
"mean_token_accuracy": 0.7488296508789063,
"num_tokens": 3086004.0,
"step": 380
},
{
"entropy": 1.339252519607544,
"epoch": 0.21739130434782608,
"grad_norm": 137.4264678955078,
"learning_rate": 4.995651610767128e-06,
"loss": 1.0154,
"mean_token_accuracy": 0.7321458339691163,
"num_tokens": 3126547.0,
"step": 385
},
{
"entropy": 1.2893430948257447,
"epoch": 0.220214568040655,
"grad_norm": 126.54104614257812,
"learning_rate": 4.995537679459277e-06,
"loss": 0.9589,
"mean_token_accuracy": 0.7475377321243286,
"num_tokens": 3167244.0,
"step": 390
},
{
"entropy": 1.2430254459381103,
"epoch": 0.2230378317334839,
"grad_norm": 119.59004974365234,
"learning_rate": 4.995422276621617e-06,
"loss": 0.8992,
"mean_token_accuracy": 0.757983124256134,
"num_tokens": 3207925.0,
"step": 395
},
{
"entropy": 1.2922404289245606,
"epoch": 0.22586109542631283,
"grad_norm": 148.28298950195312,
"learning_rate": 4.995305402344933e-06,
"loss": 1.0058,
"mean_token_accuracy": 0.7375632762908936,
"num_tokens": 3248528.0,
"step": 400
},
{
"entropy": 1.3221865177154541,
"epoch": 0.22868435911914173,
"grad_norm": 126.97147369384766,
"learning_rate": 4.995187056721171e-06,
"loss": 0.9874,
"mean_token_accuracy": 0.7407760977745056,
"num_tokens": 3289340.0,
"step": 405
},
{
"entropy": 1.227053427696228,
"epoch": 0.23150762281197063,
"grad_norm": 113.13875579833984,
"learning_rate": 4.99506723984343e-06,
"loss": 0.8947,
"mean_token_accuracy": 0.760028600692749,
"num_tokens": 3329952.0,
"step": 410
},
{
"entropy": 1.2992219924926758,
"epoch": 0.23433088650479955,
"grad_norm": 101.6119155883789,
"learning_rate": 4.994945951805969e-06,
"loss": 0.9453,
"mean_token_accuracy": 0.7457529783248902,
"num_tokens": 3370598.0,
"step": 415
},
{
"entropy": 1.3179654836654664,
"epoch": 0.23715415019762845,
"grad_norm": 117.23041534423828,
"learning_rate": 4.994823192704205e-06,
"loss": 0.9681,
"mean_token_accuracy": 0.745925772190094,
"num_tokens": 3411148.0,
"step": 420
},
{
"entropy": 1.2884270429611206,
"epoch": 0.23997741389045738,
"grad_norm": 118.79703521728516,
"learning_rate": 4.994698962634709e-06,
"loss": 0.9963,
"mean_token_accuracy": 0.7392920970916748,
"num_tokens": 3451712.0,
"step": 425
},
{
"entropy": 1.289307141304016,
"epoch": 0.24280067758328627,
"grad_norm": 128.40139770507812,
"learning_rate": 4.994573261695213e-06,
"loss": 0.9873,
"mean_token_accuracy": 0.7425390005111694,
"num_tokens": 3492409.0,
"step": 430
},
{
"entropy": 1.3000454664230348,
"epoch": 0.2456239412761152,
"grad_norm": 115.0731430053711,
"learning_rate": 4.9944460899846044e-06,
"loss": 0.9402,
"mean_token_accuracy": 0.7487621665000915,
"num_tokens": 3533099.0,
"step": 435
},
{
"entropy": 1.3730944633483886,
"epoch": 0.2484472049689441,
"grad_norm": 123.5392837524414,
"learning_rate": 4.994317447602927e-06,
"loss": 1.0273,
"mean_token_accuracy": 0.7346350908279419,
"num_tokens": 3573865.0,
"step": 440
},
{
"entropy": 1.3912369728088378,
"epoch": 0.251270468661773,
"grad_norm": 136.06275939941406,
"learning_rate": 4.994187334651382e-06,
"loss": 1.0047,
"mean_token_accuracy": 0.7363410830497742,
"num_tokens": 3614426.0,
"step": 445
},
{
"entropy": 1.363344144821167,
"epoch": 0.2540937323546019,
"grad_norm": 120.8814468383789,
"learning_rate": 4.994055751232329e-06,
"loss": 0.9789,
"mean_token_accuracy": 0.7418818831443786,
"num_tokens": 3654639.0,
"step": 450
},
{
"entropy": 1.1486493587493896,
"epoch": 0.25691699604743085,
"grad_norm": 99.43315124511719,
"learning_rate": 4.993922697449282e-06,
"loss": 0.8197,
"mean_token_accuracy": 0.7789113402366639,
"num_tokens": 3695164.0,
"step": 455
},
{
"entropy": 1.262733268737793,
"epoch": 0.2597402597402597,
"grad_norm": 116.33341217041016,
"learning_rate": 4.993788173406913e-06,
"loss": 0.9351,
"mean_token_accuracy": 0.7525886178016663,
"num_tokens": 3736048.0,
"step": 460
},
{
"entropy": 1.3816607236862182,
"epoch": 0.26256352343308864,
"grad_norm": 110.50990295410156,
"learning_rate": 4.9936521792110505e-06,
"loss": 0.9687,
"mean_token_accuracy": 0.7439742684364319,
"num_tokens": 3776672.0,
"step": 465
},
{
"entropy": 1.2183295249938966,
"epoch": 0.26538678712591757,
"grad_norm": 123.17945861816406,
"learning_rate": 4.99351471496868e-06,
"loss": 0.8336,
"mean_token_accuracy": 0.7743356227874756,
"num_tokens": 3816553.0,
"step": 470
},
{
"entropy": 1.3230255365371704,
"epoch": 0.2682100508187465,
"grad_norm": 112.20722198486328,
"learning_rate": 4.993375780787942e-06,
"loss": 0.9225,
"mean_token_accuracy": 0.7531169533729554,
"num_tokens": 3857251.0,
"step": 475
},
{
"entropy": 1.280331516265869,
"epoch": 0.27103331451157536,
"grad_norm": 101.07537841796875,
"learning_rate": 4.993235376778135e-06,
"loss": 0.9109,
"mean_token_accuracy": 0.7576413989067078,
"num_tokens": 3897861.0,
"step": 480
},
{
"entropy": 1.2946168661117554,
"epoch": 0.2738565782044043,
"grad_norm": 117.32846069335938,
"learning_rate": 4.993093503049714e-06,
"loss": 0.9609,
"mean_token_accuracy": 0.7459181666374206,
"num_tokens": 3938460.0,
"step": 485
},
{
"entropy": 1.4364691019058227,
"epoch": 0.2766798418972332,
"grad_norm": 123.10442352294922,
"learning_rate": 4.992950159714288e-06,
"loss": 1.0288,
"mean_token_accuracy": 0.7280943512916564,
"num_tokens": 3978929.0,
"step": 490
},
{
"entropy": 1.4275231122970582,
"epoch": 0.2795031055900621,
"grad_norm": 140.68606567382812,
"learning_rate": 4.992805346884624e-06,
"loss": 1.0296,
"mean_token_accuracy": 0.7304377675056457,
"num_tokens": 4019194.0,
"step": 495
},
{
"entropy": 1.2917508125305175,
"epoch": 0.282326369282891,
"grad_norm": 104.26236724853516,
"learning_rate": 4.992659064674645e-06,
"loss": 0.9386,
"mean_token_accuracy": 0.7486904263496399,
"num_tokens": 4059701.0,
"step": 500
},
{
"epoch": 0.282326369282891,
"eval_entropy": 1.3246455669403077,
"eval_loss": 0.9719027876853943,
"eval_mean_token_accuracy": 0.7504606604576111,
"eval_num_tokens": 4059701.0,
"eval_runtime": 2.4542,
"eval_samples_per_second": 15.891,
"eval_steps_per_second": 2.037,
"step": 500
},
{
"entropy": 1.2074965000152589,
"epoch": 0.28514963297571994,
"grad_norm": 116.08982849121094,
"learning_rate": 4.992511313199429e-06,
"loss": 0.911,
"mean_token_accuracy": 0.7566827893257141,
"num_tokens": 4100589.0,
"step": 505
},
{
"entropy": 1.1781744480133056,
"epoch": 0.28797289666854886,
"grad_norm": 117.98184204101562,
"learning_rate": 4.99236209257521e-06,
"loss": 0.8898,
"mean_token_accuracy": 0.761328113079071,
"num_tokens": 4141263.0,
"step": 510
},
{
"entropy": 1.341305184364319,
"epoch": 0.29079616036137773,
"grad_norm": 130.23411560058594,
"learning_rate": 4.992211402919379e-06,
"loss": 0.9646,
"mean_token_accuracy": 0.748078465461731,
"num_tokens": 4181932.0,
"step": 515
},
{
"entropy": 1.2573811054229735,
"epoch": 0.29361942405420666,
"grad_norm": 108.79557800292969,
"learning_rate": 4.992059244350481e-06,
"loss": 0.9193,
"mean_token_accuracy": 0.7583318710327148,
"num_tokens": 4222684.0,
"step": 520
},
{
"entropy": 1.2942801475524903,
"epoch": 0.2964426877470356,
"grad_norm": 113.74977111816406,
"learning_rate": 4.991905616988217e-06,
"loss": 0.9397,
"mean_token_accuracy": 0.7517918109893799,
"num_tokens": 4263508.0,
"step": 525
},
{
"entropy": 1.2990548849105834,
"epoch": 0.2992659514398645,
"grad_norm": 105.88493347167969,
"learning_rate": 4.991750520953445e-06,
"loss": 0.9634,
"mean_token_accuracy": 0.7489338755607605,
"num_tokens": 4304250.0,
"step": 530
},
{
"entropy": 1.265447235107422,
"epoch": 0.3020892151326934,
"grad_norm": 117.12853240966797,
"learning_rate": 4.991593956368177e-06,
"loss": 0.9167,
"mean_token_accuracy": 0.7550532460212708,
"num_tokens": 4344911.0,
"step": 535
},
{
"entropy": 1.229043436050415,
"epoch": 0.3049124788255223,
"grad_norm": 116.68711853027344,
"learning_rate": 4.9914359233555795e-06,
"loss": 0.855,
"mean_token_accuracy": 0.7696425437927246,
"num_tokens": 4385532.0,
"step": 540
},
{
"entropy": 1.3290831565856933,
"epoch": 0.30773574251835123,
"grad_norm": 112.86345672607422,
"learning_rate": 4.991276422039976e-06,
"loss": 0.987,
"mean_token_accuracy": 0.7430547952651978,
"num_tokens": 4426255.0,
"step": 545
},
{
"entropy": 1.2172236442565918,
"epoch": 0.3105590062111801,
"grad_norm": 109.52008819580078,
"learning_rate": 4.9911154525468446e-06,
"loss": 0.8654,
"mean_token_accuracy": 0.7682685256004333,
"num_tokens": 4466692.0,
"step": 550
},
{
"entropy": 1.3038493156433106,
"epoch": 0.313382269904009,
"grad_norm": 106.18077087402344,
"learning_rate": 4.990953015002817e-06,
"loss": 0.9641,
"mean_token_accuracy": 0.7453461289405823,
"num_tokens": 4507436.0,
"step": 555
},
{
"entropy": 1.3330723762512207,
"epoch": 0.31620553359683795,
"grad_norm": 120.44703674316406,
"learning_rate": 4.990789109535681e-06,
"loss": 0.991,
"mean_token_accuracy": 0.7382645726203918,
"num_tokens": 4548162.0,
"step": 560
},
{
"entropy": 1.3030031204223633,
"epoch": 0.3190287972896669,
"grad_norm": 128.5010986328125,
"learning_rate": 4.99062373627438e-06,
"loss": 0.9184,
"mean_token_accuracy": 0.755344557762146,
"num_tokens": 4588651.0,
"step": 565
},
{
"entropy": 1.3902093887329101,
"epoch": 0.32185206098249575,
"grad_norm": 111.37005615234375,
"learning_rate": 4.990456895349011e-06,
"loss": 0.988,
"mean_token_accuracy": 0.74136883020401,
"num_tokens": 4629315.0,
"step": 570
},
{
"entropy": 1.3531365394592285,
"epoch": 0.3246753246753247,
"grad_norm": 110.8538589477539,
"learning_rate": 4.9902885868908264e-06,
"loss": 0.9974,
"mean_token_accuracy": 0.7447136282920838,
"num_tokens": 4670100.0,
"step": 575
},
{
"entropy": 1.3364722967147826,
"epoch": 0.3274985883681536,
"grad_norm": 119.34349822998047,
"learning_rate": 4.990118811032231e-06,
"loss": 0.971,
"mean_token_accuracy": 0.742649781703949,
"num_tokens": 4710465.0,
"step": 580
},
{
"entropy": 1.232515549659729,
"epoch": 0.33032185206098247,
"grad_norm": 120.2654800415039,
"learning_rate": 4.989947567906786e-06,
"loss": 0.9344,
"mean_token_accuracy": 0.7535375952720642,
"num_tokens": 4751125.0,
"step": 585
},
{
"entropy": 1.2825552940368652,
"epoch": 0.3331451157538114,
"grad_norm": 121.9236831665039,
"learning_rate": 4.9897748576492065e-06,
"loss": 0.9342,
"mean_token_accuracy": 0.7499042987823487,
"num_tokens": 4791855.0,
"step": 590
},
{
"entropy": 1.3502115964889527,
"epoch": 0.3359683794466403,
"grad_norm": 117.69750213623047,
"learning_rate": 4.9896006803953615e-06,
"loss": 0.9549,
"mean_token_accuracy": 0.748097813129425,
"num_tokens": 4831821.0,
"step": 595
},
{
"entropy": 1.3454943418502807,
"epoch": 0.33879164313946925,
"grad_norm": 110.03791046142578,
"learning_rate": 4.9894250362822735e-06,
"loss": 0.9892,
"mean_token_accuracy": 0.7411907076835632,
"num_tokens": 4872146.0,
"step": 600
},
{
"entropy": 1.338779044151306,
"epoch": 0.3416149068322981,
"grad_norm": 126.85265350341797,
"learning_rate": 4.989247925448122e-06,
"loss": 0.9397,
"mean_token_accuracy": 0.7515046119689941,
"num_tokens": 4912659.0,
"step": 605
},
{
"entropy": 1.3164394855499268,
"epoch": 0.34443817052512704,
"grad_norm": 128.91824340820312,
"learning_rate": 4.989069348032234e-06,
"loss": 0.8567,
"mean_token_accuracy": 0.7691178798675538,
"num_tokens": 4952825.0,
"step": 610
},
{
"entropy": 1.2950055837631225,
"epoch": 0.34726143421795597,
"grad_norm": 102.4195556640625,
"learning_rate": 4.988889304175099e-06,
"loss": 0.9147,
"mean_token_accuracy": 0.7522725224494934,
"num_tokens": 4993585.0,
"step": 615
},
{
"entropy": 1.3483319997787475,
"epoch": 0.3500846979107849,
"grad_norm": 108.66871643066406,
"learning_rate": 4.988707794018351e-06,
"loss": 0.9596,
"mean_token_accuracy": 0.7457158684730529,
"num_tokens": 5034279.0,
"step": 620
},
{
"entropy": 1.4173261642456054,
"epoch": 0.35290796160361376,
"grad_norm": 110.11769104003906,
"learning_rate": 4.988524817704784e-06,
"loss": 1.0058,
"mean_token_accuracy": 0.7339034914970398,
"num_tokens": 5074824.0,
"step": 625
},
{
"entropy": 1.3969521045684814,
"epoch": 0.3557312252964427,
"grad_norm": 111.39070129394531,
"learning_rate": 4.988340375378344e-06,
"loss": 0.9766,
"mean_token_accuracy": 0.7456167697906494,
"num_tokens": 5115452.0,
"step": 630
},
{
"entropy": 1.3778595924377441,
"epoch": 0.3585544889892716,
"grad_norm": 130.38401794433594,
"learning_rate": 4.988154467184129e-06,
"loss": 0.9717,
"mean_token_accuracy": 0.7431756496429444,
"num_tokens": 5156122.0,
"step": 635
},
{
"entropy": 1.2438819527626037,
"epoch": 0.3613777526821005,
"grad_norm": 91.94094848632812,
"learning_rate": 4.98796709326839e-06,
"loss": 0.8557,
"mean_token_accuracy": 0.7728610277175904,
"num_tokens": 5196962.0,
"step": 640
},
{
"entropy": 1.2364247560501098,
"epoch": 0.3642010163749294,
"grad_norm": 112.2281494140625,
"learning_rate": 4.987778253778532e-06,
"loss": 0.868,
"mean_token_accuracy": 0.7657705903053283,
"num_tokens": 5237644.0,
"step": 645
},
{
"entropy": 1.3769032955169678,
"epoch": 0.36702428006775834,
"grad_norm": 106.69970703125,
"learning_rate": 4.987587948863113e-06,
"loss": 0.9205,
"mean_token_accuracy": 0.7561796069145202,
"num_tokens": 5278368.0,
"step": 650
},
{
"entropy": 1.3944639682769775,
"epoch": 0.36984754376058726,
"grad_norm": 104.28118133544922,
"learning_rate": 4.987396178671845e-06,
"loss": 0.9843,
"mean_token_accuracy": 0.7391315698623657,
"num_tokens": 5318926.0,
"step": 655
},
{
"entropy": 1.3362845420837401,
"epoch": 0.37267080745341613,
"grad_norm": 106.64762115478516,
"learning_rate": 4.987202943355588e-06,
"loss": 0.9393,
"mean_token_accuracy": 0.7537869215011597,
"num_tokens": 5359676.0,
"step": 660
},
{
"entropy": 1.427641224861145,
"epoch": 0.37549407114624506,
"grad_norm": 114.0298080444336,
"learning_rate": 4.987008243066362e-06,
"loss": 1.0263,
"mean_token_accuracy": 0.7314809441566468,
"num_tokens": 5400488.0,
"step": 665
},
{
"entropy": 1.2906121969223023,
"epoch": 0.378317334839074,
"grad_norm": 121.60186004638672,
"learning_rate": 4.986812077957333e-06,
"loss": 0.8884,
"mean_token_accuracy": 0.761104142665863,
"num_tokens": 5440967.0,
"step": 670
},
{
"entropy": 1.322801113128662,
"epoch": 0.38114059853190285,
"grad_norm": 100.61329650878906,
"learning_rate": 4.986614448182821e-06,
"loss": 0.9344,
"mean_token_accuracy": 0.7525824546813965,
"num_tokens": 5481714.0,
"step": 675
},
{
"entropy": 1.2744395971298217,
"epoch": 0.3839638622247318,
"grad_norm": 114.0245132446289,
"learning_rate": 4.986415353898301e-06,
"loss": 0.9013,
"mean_token_accuracy": 0.7586809635162354,
"num_tokens": 5522307.0,
"step": 680
},
{
"entropy": 1.406579351425171,
"epoch": 0.3867871259175607,
"grad_norm": 110.2708740234375,
"learning_rate": 4.986214795260398e-06,
"loss": 0.9857,
"mean_token_accuracy": 0.7406947612762451,
"num_tokens": 5563030.0,
"step": 685
},
{
"entropy": 1.4012676239013673,
"epoch": 0.38961038961038963,
"grad_norm": 107.38058471679688,
"learning_rate": 4.986012772426887e-06,
"loss": 0.9898,
"mean_token_accuracy": 0.7373950242996216,
"num_tokens": 5603900.0,
"step": 690
},
{
"entropy": 1.276191735267639,
"epoch": 0.3924336533032185,
"grad_norm": 102.94742584228516,
"learning_rate": 4.985809285556698e-06,
"loss": 0.905,
"mean_token_accuracy": 0.7557743310928344,
"num_tokens": 5644556.0,
"step": 695
},
{
"entropy": 1.3815409660339355,
"epoch": 0.3952569169960474,
"grad_norm": 132.46441650390625,
"learning_rate": 4.9856043348099134e-06,
"loss": 0.945,
"mean_token_accuracy": 0.7497841715812683,
"num_tokens": 5685234.0,
"step": 700
},
{
"entropy": 1.2497349023818969,
"epoch": 0.39808018068887635,
"grad_norm": 105.15983581542969,
"learning_rate": 4.9853979203477644e-06,
"loss": 0.8568,
"mean_token_accuracy": 0.7651753425598145,
"num_tokens": 5725995.0,
"step": 705
},
{
"entropy": 1.2450405836105347,
"epoch": 0.4009034443817053,
"grad_norm": 101.96587371826172,
"learning_rate": 4.9851900423326335e-06,
"loss": 0.868,
"mean_token_accuracy": 0.7652771830558777,
"num_tokens": 5766777.0,
"step": 710
},
{
"entropy": 1.3319795370101928,
"epoch": 0.40372670807453415,
"grad_norm": 129.9007110595703,
"learning_rate": 4.984980700928057e-06,
"loss": 0.9547,
"mean_token_accuracy": 0.7460948824882507,
"num_tokens": 5807316.0,
"step": 715
},
{
"entropy": 1.266514039039612,
"epoch": 0.40654997176736307,
"grad_norm": 97.09778594970703,
"learning_rate": 4.9847698962987224e-06,
"loss": 0.8828,
"mean_token_accuracy": 0.7616210579872131,
"num_tokens": 5848130.0,
"step": 720
},
{
"entropy": 1.2651296854019165,
"epoch": 0.409373235460192,
"grad_norm": 119.74391174316406,
"learning_rate": 4.984557628610465e-06,
"loss": 0.912,
"mean_token_accuracy": 0.7551288604736328,
"num_tokens": 5888922.0,
"step": 725
},
{
"entropy": 1.2874022722244263,
"epoch": 0.41219649915302087,
"grad_norm": 110.32769012451172,
"learning_rate": 4.984343898030275e-06,
"loss": 0.887,
"mean_token_accuracy": 0.7651336789131165,
"num_tokens": 5929586.0,
"step": 730
},
{
"entropy": 1.181906795501709,
"epoch": 0.4150197628458498,
"grad_norm": 102.93072509765625,
"learning_rate": 4.98412870472629e-06,
"loss": 0.8324,
"mean_token_accuracy": 0.7739112615585327,
"num_tokens": 5970120.0,
"step": 735
},
{
"entropy": 1.2475993156433105,
"epoch": 0.4178430265386787,
"grad_norm": 96.60658264160156,
"learning_rate": 4.9839120488678025e-06,
"loss": 0.8659,
"mean_token_accuracy": 0.7675553560256958,
"num_tokens": 6010881.0,
"step": 740
},
{
"entropy": 1.277070164680481,
"epoch": 0.42066629023150764,
"grad_norm": 96.34147644042969,
"learning_rate": 4.983693930625251e-06,
"loss": 0.8726,
"mean_token_accuracy": 0.7630559086799622,
"num_tokens": 6051720.0,
"step": 745
},
{
"entropy": 1.2760138750076293,
"epoch": 0.4234895539243365,
"grad_norm": 100.36460876464844,
"learning_rate": 4.983474350170227e-06,
"loss": 0.8673,
"mean_token_accuracy": 0.7626476407051086,
"num_tokens": 6092424.0,
"step": 750
},
{
"entropy": 1.2910543203353881,
"epoch": 0.42631281761716544,
"grad_norm": 105.85248565673828,
"learning_rate": 4.983253307675473e-06,
"loss": 0.8937,
"mean_token_accuracy": 0.7598207592964172,
"num_tokens": 6133094.0,
"step": 755
},
{
"entropy": 1.4057719707489014,
"epoch": 0.42913608130999437,
"grad_norm": 107.28651428222656,
"learning_rate": 4.983030803314878e-06,
"loss": 0.9538,
"mean_token_accuracy": 0.7475337266921998,
"num_tokens": 6173931.0,
"step": 760
},
{
"entropy": 1.2181792259216309,
"epoch": 0.43195934500282324,
"grad_norm": 106.51260375976562,
"learning_rate": 4.982806837263486e-06,
"loss": 0.8524,
"mean_token_accuracy": 0.7689022898674012,
"num_tokens": 6214629.0,
"step": 765
},
{
"entropy": 1.2283953189849854,
"epoch": 0.43478260869565216,
"grad_norm": 100.91169738769531,
"learning_rate": 4.982581409697487e-06,
"loss": 0.8259,
"mean_token_accuracy": 0.7750494837760925,
"num_tokens": 6255482.0,
"step": 770
},
{
"entropy": 1.4342861652374268,
"epoch": 0.4376058723884811,
"grad_norm": 112.93717956542969,
"learning_rate": 4.982354520794224e-06,
"loss": 0.9777,
"mean_token_accuracy": 0.7420659542083741,
"num_tokens": 6296007.0,
"step": 775
},
{
"entropy": 1.2862244129180909,
"epoch": 0.44042913608131,
"grad_norm": 123.29638671875,
"learning_rate": 4.982126170732185e-06,
"loss": 0.8873,
"mean_token_accuracy": 0.7592768907546997,
"num_tokens": 6336419.0,
"step": 780
},
{
"entropy": 1.31422700881958,
"epoch": 0.4432523997741389,
"grad_norm": 98.10787963867188,
"learning_rate": 4.981896359691013e-06,
"loss": 0.9252,
"mean_token_accuracy": 0.7531617879867554,
"num_tokens": 6377083.0,
"step": 785
},
{
"entropy": 1.2349420070648194,
"epoch": 0.4460756634669678,
"grad_norm": 104.18644714355469,
"learning_rate": 4.981665087851495e-06,
"loss": 0.8817,
"mean_token_accuracy": 0.7612318158149719,
"num_tokens": 6417661.0,
"step": 790
},
{
"entropy": 1.2322286128997804,
"epoch": 0.44889892715979673,
"grad_norm": 102.12158966064453,
"learning_rate": 4.981432355395572e-06,
"loss": 0.921,
"mean_token_accuracy": 0.7553870797157287,
"num_tokens": 6458267.0,
"step": 795
},
{
"entropy": 1.353945779800415,
"epoch": 0.45172219085262566,
"grad_norm": 94.95413970947266,
"learning_rate": 4.9811981625063315e-06,
"loss": 0.9211,
"mean_token_accuracy": 0.7573221802711487,
"num_tokens": 6498460.0,
"step": 800
},
{
"entropy": 1.3203566551208497,
"epoch": 0.45454545454545453,
"grad_norm": 87.52528381347656,
"learning_rate": 4.980962509368009e-06,
"loss": 0.8826,
"mean_token_accuracy": 0.7646165013313293,
"num_tokens": 6539065.0,
"step": 805
},
{
"entropy": 1.2447147130966187,
"epoch": 0.45736871823828346,
"grad_norm": 85.17706298828125,
"learning_rate": 4.980725396165992e-06,
"loss": 0.8552,
"mean_token_accuracy": 0.7722663283348083,
"num_tokens": 6579730.0,
"step": 810
},
{
"entropy": 1.276610255241394,
"epoch": 0.4601919819311124,
"grad_norm": 102.33468627929688,
"learning_rate": 4.980486823086813e-06,
"loss": 0.8926,
"mean_token_accuracy": 0.7591757655143738,
"num_tokens": 6620403.0,
"step": 815
},
{
"entropy": 1.2437695741653443,
"epoch": 0.46301524562394125,
"grad_norm": 99.84664154052734,
"learning_rate": 4.980246790318156e-06,
"loss": 0.8763,
"mean_token_accuracy": 0.7644282937049866,
"num_tokens": 6661163.0,
"step": 820
},
{
"entropy": 1.3313609600067138,
"epoch": 0.4658385093167702,
"grad_norm": 97.01880645751953,
"learning_rate": 4.98000529804885e-06,
"loss": 0.8914,
"mean_token_accuracy": 0.7625803470611572,
"num_tokens": 6701784.0,
"step": 825
},
{
"entropy": 1.3444613218307495,
"epoch": 0.4686617730095991,
"grad_norm": 99.8399429321289,
"learning_rate": 4.979762346468876e-06,
"loss": 0.9474,
"mean_token_accuracy": 0.7516453862190247,
"num_tokens": 6742391.0,
"step": 830
},
{
"entropy": 1.2741617679595947,
"epoch": 0.47148503670242803,
"grad_norm": 89.9829330444336,
"learning_rate": 4.979517935769359e-06,
"loss": 0.8707,
"mean_token_accuracy": 0.7648941993713378,
"num_tokens": 6783130.0,
"step": 835
},
{
"entropy": 1.3345114469528199,
"epoch": 0.4743083003952569,
"grad_norm": 112.01448822021484,
"learning_rate": 4.979272066142576e-06,
"loss": 0.9033,
"mean_token_accuracy": 0.7620006084442139,
"num_tokens": 6823752.0,
"step": 840
},
{
"entropy": 1.2797606706619262,
"epoch": 0.4771315640880858,
"grad_norm": 105.47362518310547,
"learning_rate": 4.97902473778195e-06,
"loss": 0.8948,
"mean_token_accuracy": 0.7626261591911316,
"num_tokens": 6864415.0,
"step": 845
},
{
"entropy": 1.2193082809448241,
"epoch": 0.47995482778091475,
"grad_norm": 125.71626281738281,
"learning_rate": 4.978775950882049e-06,
"loss": 0.8311,
"mean_token_accuracy": 0.7712972402572632,
"num_tokens": 6905160.0,
"step": 850
},
{
"entropy": 1.1931323289871216,
"epoch": 0.4827780914737436,
"grad_norm": 109.39054107666016,
"learning_rate": 4.978525705638593e-06,
"loss": 0.8553,
"mean_token_accuracy": 0.7686418652534485,
"num_tokens": 6945864.0,
"step": 855
},
{
"entropy": 1.217218804359436,
"epoch": 0.48560135516657255,
"grad_norm": 93.509521484375,
"learning_rate": 4.9782740022484455e-06,
"loss": 0.9275,
"mean_token_accuracy": 0.7530880331993103,
"num_tokens": 6986552.0,
"step": 860
},
{
"entropy": 1.3541555404663086,
"epoch": 0.48842461885940147,
"grad_norm": 112.28630065917969,
"learning_rate": 4.978020840909619e-06,
"loss": 0.9492,
"mean_token_accuracy": 0.7504236459732055,
"num_tokens": 7026900.0,
"step": 865
},
{
"entropy": 1.150540018081665,
"epoch": 0.4912478825522304,
"grad_norm": 91.7456283569336,
"learning_rate": 4.977766221821272e-06,
"loss": 0.8155,
"mean_token_accuracy": 0.779057776927948,
"num_tokens": 7067710.0,
"step": 870
},
{
"entropy": 1.1883390903472901,
"epoch": 0.49407114624505927,
"grad_norm": 90.35485076904297,
"learning_rate": 4.977510145183713e-06,
"loss": 0.8517,
"mean_token_accuracy": 0.7698142886161804,
"num_tokens": 7108524.0,
"step": 875
},
{
"entropy": 1.2226394414901733,
"epoch": 0.4968944099378882,
"grad_norm": 94.40557861328125,
"learning_rate": 4.97725261119839e-06,
"loss": 0.8542,
"mean_token_accuracy": 0.7724049091339111,
"num_tokens": 7149141.0,
"step": 880
},
{
"entropy": 1.2256523609161376,
"epoch": 0.4997176736307171,
"grad_norm": 118.25550842285156,
"learning_rate": 4.976993620067906e-06,
"loss": 0.9467,
"mean_token_accuracy": 0.747167456150055,
"num_tokens": 7189872.0,
"step": 885
},
{
"entropy": 1.3428989887237548,
"epoch": 0.502540937323546,
"grad_norm": 96.30892181396484,
"learning_rate": 4.9767331719960035e-06,
"loss": 0.9079,
"mean_token_accuracy": 0.7582340478897095,
"num_tokens": 7230448.0,
"step": 890
},
{
"entropy": 1.184017586708069,
"epoch": 0.5053642010163749,
"grad_norm": 97.78132629394531,
"learning_rate": 4.976471267187575e-06,
"loss": 0.8507,
"mean_token_accuracy": 0.7695409536361695,
"num_tokens": 7271071.0,
"step": 895
},
{
"entropy": 1.371737289428711,
"epoch": 0.5081874647092038,
"grad_norm": 102.5118408203125,
"learning_rate": 4.976207905848659e-06,
"loss": 0.9116,
"mean_token_accuracy": 0.7551624774932861,
"num_tokens": 7311722.0,
"step": 900
},
{
"entropy": 1.23578884601593,
"epoch": 0.5110107284020328,
"grad_norm": 91.31993103027344,
"learning_rate": 4.975943088186437e-06,
"loss": 0.8693,
"mean_token_accuracy": 0.7666425943374634,
"num_tokens": 7352436.0,
"step": 905
},
{
"entropy": 1.1599351406097411,
"epoch": 0.5138339920948617,
"grad_norm": 91.41136932373047,
"learning_rate": 4.9756768144092385e-06,
"loss": 0.8766,
"mean_token_accuracy": 0.7643981575965881,
"num_tokens": 7392980.0,
"step": 910
},
{
"entropy": 1.2577017545700073,
"epoch": 0.5166572557876906,
"grad_norm": 108.91261291503906,
"learning_rate": 4.975409084726538e-06,
"loss": 0.8776,
"mean_token_accuracy": 0.7656487226486206,
"num_tokens": 7433717.0,
"step": 915
},
{
"entropy": 1.1488389253616333,
"epoch": 0.5194805194805194,
"grad_norm": 90.68296813964844,
"learning_rate": 4.975139899348954e-06,
"loss": 0.8093,
"mean_token_accuracy": 0.7807123541831971,
"num_tokens": 7474433.0,
"step": 920
},
{
"entropy": 1.1915344953536988,
"epoch": 0.5223037831733484,
"grad_norm": 96.44047546386719,
"learning_rate": 4.974869258488254e-06,
"loss": 0.8236,
"mean_token_accuracy": 0.7743138551712037,
"num_tokens": 7515077.0,
"step": 925
},
{
"entropy": 1.250411081314087,
"epoch": 0.5251270468661773,
"grad_norm": 96.81196594238281,
"learning_rate": 4.9745971623573465e-06,
"loss": 0.8852,
"mean_token_accuracy": 0.7619670391082763,
"num_tokens": 7555778.0,
"step": 930
},
{
"entropy": 1.1861608266830443,
"epoch": 0.5279503105590062,
"grad_norm": 88.66297912597656,
"learning_rate": 4.974323611170286e-06,
"loss": 0.8606,
"mean_token_accuracy": 0.7666665554046631,
"num_tokens": 7596538.0,
"step": 935
},
{
"entropy": 1.1959111571311951,
"epoch": 0.5307735742518351,
"grad_norm": 104.4039535522461,
"learning_rate": 4.974048605142273e-06,
"loss": 0.8686,
"mean_token_accuracy": 0.7655381083488464,
"num_tokens": 7637030.0,
"step": 940
},
{
"entropy": 1.1513007164001465,
"epoch": 0.5335968379446641,
"grad_norm": 102.5768051147461,
"learning_rate": 4.9737721444896506e-06,
"loss": 0.8039,
"mean_token_accuracy": 0.7838160276412964,
"num_tokens": 7677640.0,
"step": 945
},
{
"entropy": 1.3042636871337892,
"epoch": 0.536420101637493,
"grad_norm": 78.14281463623047,
"learning_rate": 4.973494229429907e-06,
"loss": 0.9158,
"mean_token_accuracy": 0.7570660233497619,
"num_tokens": 7718349.0,
"step": 950
},
{
"entropy": 1.266080904006958,
"epoch": 0.5392433653303218,
"grad_norm": 110.9036865234375,
"learning_rate": 4.973214860181675e-06,
"loss": 0.93,
"mean_token_accuracy": 0.7506848573684692,
"num_tokens": 7759004.0,
"step": 955
},
{
"entropy": 1.1759474754333497,
"epoch": 0.5420666290231507,
"grad_norm": 108.20323944091797,
"learning_rate": 4.972934036964732e-06,
"loss": 0.7997,
"mean_token_accuracy": 0.7808601498603821,
"num_tokens": 7799425.0,
"step": 960
},
{
"entropy": 1.2482519030570984,
"epoch": 0.5448898927159797,
"grad_norm": 100.69843292236328,
"learning_rate": 4.972651759999997e-06,
"loss": 0.8741,
"mean_token_accuracy": 0.7640202879905701,
"num_tokens": 7840200.0,
"step": 965
},
{
"entropy": 1.2493391752243042,
"epoch": 0.5477131564088086,
"grad_norm": 111.9268798828125,
"learning_rate": 4.9723680295095335e-06,
"loss": 0.8481,
"mean_token_accuracy": 0.7669167637825012,
"num_tokens": 7880885.0,
"step": 970
},
{
"entropy": 1.213208556175232,
"epoch": 0.5505364201016375,
"grad_norm": 89.9885482788086,
"learning_rate": 4.972082845716551e-06,
"loss": 0.8463,
"mean_token_accuracy": 0.7700656652450562,
"num_tokens": 7921579.0,
"step": 975
},
{
"entropy": 1.2416555166244507,
"epoch": 0.5533596837944664,
"grad_norm": 94.0711669921875,
"learning_rate": 4.971796208845398e-06,
"loss": 0.8986,
"mean_token_accuracy": 0.759855318069458,
"num_tokens": 7961884.0,
"step": 980
},
{
"entropy": 1.3858731746673585,
"epoch": 0.5561829474872954,
"grad_norm": 89.50563049316406,
"learning_rate": 4.9715081191215705e-06,
"loss": 0.9528,
"mean_token_accuracy": 0.7506495118141174,
"num_tokens": 8002593.0,
"step": 985
},
{
"entropy": 1.2900074481964112,
"epoch": 0.5590062111801242,
"grad_norm": 98.44612884521484,
"learning_rate": 4.971218576771703e-06,
"loss": 0.9108,
"mean_token_accuracy": 0.7587322473526001,
"num_tokens": 8043253.0,
"step": 990
},
{
"entropy": 1.2571572303771972,
"epoch": 0.5618294748729531,
"grad_norm": 105.00495147705078,
"learning_rate": 4.970927582023577e-06,
"loss": 0.9269,
"mean_token_accuracy": 0.7483970880508423,
"num_tokens": 8083612.0,
"step": 995
},
{
"entropy": 1.245552134513855,
"epoch": 0.564652738565782,
"grad_norm": 98.6438980102539,
"learning_rate": 4.970635135106113e-06,
"loss": 0.8576,
"mean_token_accuracy": 0.7674945831298828,
"num_tokens": 8124241.0,
"step": 1000
},
{
"epoch": 0.564652738565782,
"eval_entropy": 1.337150764465332,
"eval_loss": 0.7948279976844788,
"eval_mean_token_accuracy": 0.7877452731132507,
"eval_num_tokens": 8124241.0,
"eval_runtime": 2.455,
"eval_samples_per_second": 15.886,
"eval_steps_per_second": 2.037,
"step": 1000
},
{
"entropy": 1.1067086219787599,
"epoch": 0.567476002258611,
"grad_norm": 106.83331298828125,
"learning_rate": 4.970341236249376e-06,
"loss": 0.7424,
"mean_token_accuracy": 0.7967899441719055,
"num_tokens": 8165061.0,
"step": 1005
},
{
"entropy": 1.2741168975830077,
"epoch": 0.5702992659514399,
"grad_norm": 98.59375762939453,
"learning_rate": 4.970045885684575e-06,
"loss": 0.8614,
"mean_token_accuracy": 0.7667156815528869,
"num_tokens": 8205689.0,
"step": 1010
},
{
"entropy": 1.3391781091690063,
"epoch": 0.5731225296442688,
"grad_norm": 99.1837387084961,
"learning_rate": 4.969749083644055e-06,
"loss": 0.9235,
"mean_token_accuracy": 0.7538702368736268,
"num_tokens": 8246286.0,
"step": 1015
},
{
"entropy": 1.2175718069076538,
"epoch": 0.5759457933370977,
"grad_norm": 97.83293151855469,
"learning_rate": 4.969450830361309e-06,
"loss": 0.8251,
"mean_token_accuracy": 0.7760945677757263,
"num_tokens": 8286897.0,
"step": 1020
},
{
"entropy": 1.2220812559127807,
"epoch": 0.5787690570299266,
"grad_norm": 84.45748901367188,
"learning_rate": 4.969151126070968e-06,
"loss": 0.8428,
"mean_token_accuracy": 0.7701049447059631,
"num_tokens": 8327581.0,
"step": 1025
},
{
"entropy": 1.2730301141738891,
"epoch": 0.5815923207227555,
"grad_norm": 105.70362091064453,
"learning_rate": 4.968849971008808e-06,
"loss": 0.8441,
"mean_token_accuracy": 0.771563708782196,
"num_tokens": 8368186.0,
"step": 1030
},
{
"entropy": 1.2853108644485474,
"epoch": 0.5844155844155844,
"grad_norm": 84.74578857421875,
"learning_rate": 4.968547365411742e-06,
"loss": 0.8923,
"mean_token_accuracy": 0.7610676050186157,
"num_tokens": 8408672.0,
"step": 1035
},
{
"entropy": 1.112477993965149,
"epoch": 0.5872388481084133,
"grad_norm": 90.72158813476562,
"learning_rate": 4.968243309517826e-06,
"loss": 0.7923,
"mean_token_accuracy": 0.7860908746719361,
"num_tokens": 8449222.0,
"step": 1040
},
{
"entropy": 1.3268202066421508,
"epoch": 0.5900621118012422,
"grad_norm": 86.42573547363281,
"learning_rate": 4.967937803566259e-06,
"loss": 0.8718,
"mean_token_accuracy": 0.7626396536827087,
"num_tokens": 8489553.0,
"step": 1045
},
{
"entropy": 1.240961480140686,
"epoch": 0.5928853754940712,
"grad_norm": 87.08723449707031,
"learning_rate": 4.967630847797378e-06,
"loss": 0.8409,
"mean_token_accuracy": 0.7718519926071167,
"num_tokens": 8530434.0,
"step": 1050
},
{
"entropy": 1.1958368062973022,
"epoch": 0.5957086391869001,
"grad_norm": 97.55753326416016,
"learning_rate": 4.967322442452661e-06,
"loss": 0.8397,
"mean_token_accuracy": 0.7687074780464173,
"num_tokens": 8571028.0,
"step": 1055
},
{
"entropy": 1.2576555967330934,
"epoch": 0.598531902879729,
"grad_norm": 97.74880981445312,
"learning_rate": 4.967012587774729e-06,
"loss": 0.8667,
"mean_token_accuracy": 0.7679498672485352,
"num_tokens": 8611332.0,
"step": 1060
},
{
"entropy": 1.1161712408065796,
"epoch": 0.6013551665725578,
"grad_norm": 84.0584487915039,
"learning_rate": 4.966701284007337e-06,
"loss": 0.811,
"mean_token_accuracy": 0.7786412715911866,
"num_tokens": 8651862.0,
"step": 1065
},
{
"entropy": 1.3346630334854126,
"epoch": 0.6041784302653868,
"grad_norm": 88.71247863769531,
"learning_rate": 4.966388531395388e-06,
"loss": 0.869,
"mean_token_accuracy": 0.768392026424408,
"num_tokens": 8692361.0,
"step": 1070
},
{
"entropy": 1.195302712917328,
"epoch": 0.6070016939582157,
"grad_norm": 80.38240051269531,
"learning_rate": 4.966074330184917e-06,
"loss": 0.7627,
"mean_token_accuracy": 0.790677547454834,
"num_tokens": 8733114.0,
"step": 1075
},
{
"entropy": 1.3030426979064942,
"epoch": 0.6098249576510446,
"grad_norm": 94.00823974609375,
"learning_rate": 4.965758680623106e-06,
"loss": 0.9205,
"mean_token_accuracy": 0.7608320355415344,
"num_tokens": 8773273.0,
"step": 1080
},
{
"entropy": 1.2931268215179443,
"epoch": 0.6126482213438735,
"grad_norm": 107.884033203125,
"learning_rate": 4.9654415829582714e-06,
"loss": 0.8814,
"mean_token_accuracy": 0.7640480279922486,
"num_tokens": 8813046.0,
"step": 1085
},
{
"entropy": 1.2563365936279296,
"epoch": 0.6154714850367025,
"grad_norm": 105.78944396972656,
"learning_rate": 4.965123037439869e-06,
"loss": 0.8697,
"mean_token_accuracy": 0.7670489430427552,
"num_tokens": 8853962.0,
"step": 1090
},
{
"entropy": 1.134628915786743,
"epoch": 0.6182947487295314,
"grad_norm": 99.70325469970703,
"learning_rate": 4.964803044318496e-06,
"loss": 0.8172,
"mean_token_accuracy": 0.7771477937698364,
"num_tokens": 8894573.0,
"step": 1095
},
{
"entropy": 1.2724319458007813,
"epoch": 0.6211180124223602,
"grad_norm": 93.54045104980469,
"learning_rate": 4.964481603845887e-06,
"loss": 0.8051,
"mean_token_accuracy": 0.7794610857963562,
"num_tokens": 8935263.0,
"step": 1100
},
{
"entropy": 1.3056188106536866,
"epoch": 0.6239412761151891,
"grad_norm": 107.5456771850586,
"learning_rate": 4.964158716274915e-06,
"loss": 0.9563,
"mean_token_accuracy": 0.7459239959716797,
"num_tokens": 8975867.0,
"step": 1105
},
{
"entropy": 1.379978322982788,
"epoch": 0.626764539808018,
"grad_norm": 95.43455505371094,
"learning_rate": 4.963834381859593e-06,
"loss": 0.9058,
"mean_token_accuracy": 0.7583242535591126,
"num_tokens": 9016727.0,
"step": 1110
},
{
"entropy": 1.316626739501953,
"epoch": 0.629587803500847,
"grad_norm": 94.45218658447266,
"learning_rate": 4.9635086008550694e-06,
"loss": 0.888,
"mean_token_accuracy": 0.7589772820472718,
"num_tokens": 9057474.0,
"step": 1115
},
{
"entropy": 1.2168570041656495,
"epoch": 0.6324110671936759,
"grad_norm": 90.01762390136719,
"learning_rate": 4.963181373517634e-06,
"loss": 0.8126,
"mean_token_accuracy": 0.7784791946411133,
"num_tokens": 9098180.0,
"step": 1120
},
{
"entropy": 1.3705679416656493,
"epoch": 0.6352343308865048,
"grad_norm": 89.59708404541016,
"learning_rate": 4.9628527001047105e-06,
"loss": 0.8904,
"mean_token_accuracy": 0.7621181845664978,
"num_tokens": 9138506.0,
"step": 1125
},
{
"entropy": 1.2591190934181213,
"epoch": 0.6380575945793338,
"grad_norm": 100.79552459716797,
"learning_rate": 4.9625225808748636e-06,
"loss": 0.8763,
"mean_token_accuracy": 0.7636958003044129,
"num_tokens": 9179140.0,
"step": 1130
},
{
"entropy": 1.2524430751800537,
"epoch": 0.6408808582721626,
"grad_norm": 83.18380737304688,
"learning_rate": 4.962191016087795e-06,
"loss": 0.8607,
"mean_token_accuracy": 0.7672532081604004,
"num_tokens": 9219760.0,
"step": 1135
},
{
"entropy": 1.302207589149475,
"epoch": 0.6437041219649915,
"grad_norm": 84.2943344116211,
"learning_rate": 4.961858006004342e-06,
"loss": 0.9161,
"mean_token_accuracy": 0.7545537352561951,
"num_tokens": 9259918.0,
"step": 1140
},
{
"entropy": 1.2336005210876464,
"epoch": 0.6465273856578204,
"grad_norm": 103.37287139892578,
"learning_rate": 4.961523550886479e-06,
"loss": 0.8367,
"mean_token_accuracy": 0.7728558421134949,
"num_tokens": 9300420.0,
"step": 1145
},
{
"entropy": 1.3373284339904785,
"epoch": 0.6493506493506493,
"grad_norm": 80.52223205566406,
"learning_rate": 4.9611876509973185e-06,
"loss": 0.8708,
"mean_token_accuracy": 0.7663377165794373,
"num_tokens": 9341081.0,
"step": 1150
},
{
"entropy": 1.1970614194869995,
"epoch": 0.6521739130434783,
"grad_norm": 93.88483428955078,
"learning_rate": 4.96085030660111e-06,
"loss": 0.7878,
"mean_token_accuracy": 0.7849771738052368,
"num_tokens": 9381651.0,
"step": 1155
},
{
"entropy": 1.1667418718338012,
"epoch": 0.6549971767363072,
"grad_norm": 94.35794830322266,
"learning_rate": 4.960511517963236e-06,
"loss": 0.7963,
"mean_token_accuracy": 0.7851962924003602,
"num_tokens": 9422500.0,
"step": 1160
},
{
"entropy": 1.244291114807129,
"epoch": 0.6578204404291361,
"grad_norm": 89.03894805908203,
"learning_rate": 4.96017128535022e-06,
"loss": 0.8179,
"mean_token_accuracy": 0.779565978050232,
"num_tokens": 9463027.0,
"step": 1165
},
{
"entropy": 1.2647228240966797,
"epoch": 0.6606437041219649,
"grad_norm": 91.51069641113281,
"learning_rate": 4.959829609029717e-06,
"loss": 0.8038,
"mean_token_accuracy": 0.7766362309455872,
"num_tokens": 9503101.0,
"step": 1170
},
{
"entropy": 1.3284830093383788,
"epoch": 0.6634669678147939,
"grad_norm": 94.90776062011719,
"learning_rate": 4.9594864892705204e-06,
"loss": 0.8927,
"mean_token_accuracy": 0.7618573546409607,
"num_tokens": 9543661.0,
"step": 1175
},
{
"entropy": 1.3220321416854859,
"epoch": 0.6662902315076228,
"grad_norm": 111.29701232910156,
"learning_rate": 4.959141926342559e-06,
"loss": 0.9056,
"mean_token_accuracy": 0.7612986207008362,
"num_tokens": 9584316.0,
"step": 1180
},
{
"entropy": 1.3569005489349366,
"epoch": 0.6691134952004517,
"grad_norm": 99.6978530883789,
"learning_rate": 4.958795920516895e-06,
"loss": 0.9268,
"mean_token_accuracy": 0.7549396872520446,
"num_tokens": 9624978.0,
"step": 1185
},
{
"entropy": 1.266989517211914,
"epoch": 0.6719367588932806,
"grad_norm": 84.81405639648438,
"learning_rate": 4.958448472065729e-06,
"loss": 0.8148,
"mean_token_accuracy": 0.7786801934242249,
"num_tokens": 9665779.0,
"step": 1190
},
{
"entropy": 1.2129095792770386,
"epoch": 0.6747600225861096,
"grad_norm": 97.6620864868164,
"learning_rate": 4.958099581262393e-06,
"loss": 0.8645,
"mean_token_accuracy": 0.7641335129737854,
"num_tokens": 9705918.0,
"step": 1195
},
{
"entropy": 1.3208855628967284,
"epoch": 0.6775832862789385,
"grad_norm": 84.83293914794922,
"learning_rate": 4.957749248381356e-06,
"loss": 0.8521,
"mean_token_accuracy": 0.7699254035949707,
"num_tokens": 9746614.0,
"step": 1200
},
{
"entropy": 1.2614996910095215,
"epoch": 0.6804065499717674,
"grad_norm": 90.78227996826172,
"learning_rate": 4.957397473698221e-06,
"loss": 0.8468,
"mean_token_accuracy": 0.7698518514633179,
"num_tokens": 9787436.0,
"step": 1205
},
{
"entropy": 1.1796211361885072,
"epoch": 0.6832298136645962,
"grad_norm": 89.12525177001953,
"learning_rate": 4.957044257489724e-06,
"loss": 0.7778,
"mean_token_accuracy": 0.7862819194793701,
"num_tokens": 9828249.0,
"step": 1210
},
{
"entropy": 1.2314157962799073,
"epoch": 0.6860530773574252,
"grad_norm": 83.9203109741211,
"learning_rate": 4.956689600033736e-06,
"loss": 0.8364,
"mean_token_accuracy": 0.7717383742332459,
"num_tokens": 9868382.0,
"step": 1215
},
{
"entropy": 1.3469780206680297,
"epoch": 0.6888763410502541,
"grad_norm": 94.05850982666016,
"learning_rate": 4.956333501609263e-06,
"loss": 0.8967,
"mean_token_accuracy": 0.7598065853118896,
"num_tokens": 9909206.0,
"step": 1220
},
{
"entropy": 1.4139271020889281,
"epoch": 0.691699604743083,
"grad_norm": 100.73336791992188,
"learning_rate": 4.955975962496443e-06,
"loss": 0.9323,
"mean_token_accuracy": 0.7512782216072083,
"num_tokens": 9949369.0,
"step": 1225
},
{
"entropy": 1.4305032253265382,
"epoch": 0.6945228684359119,
"grad_norm": 105.20040130615234,
"learning_rate": 4.955616982976546e-06,
"loss": 0.8752,
"mean_token_accuracy": 0.7681516885757447,
"num_tokens": 9989740.0,
"step": 1230
},
{
"entropy": 1.3764580249786378,
"epoch": 0.6973461321287409,
"grad_norm": 90.64266967773438,
"learning_rate": 4.95525656333198e-06,
"loss": 0.906,
"mean_token_accuracy": 0.7587046384811401,
"num_tokens": 10030301.0,
"step": 1235
},
{
"entropy": 1.3211364984512328,
"epoch": 0.7001693958215698,
"grad_norm": 80.13589477539062,
"learning_rate": 4.954894703846281e-06,
"loss": 0.8725,
"mean_token_accuracy": 0.7638087749481202,
"num_tokens": 10071097.0,
"step": 1240
},
{
"entropy": 1.285390853881836,
"epoch": 0.7029926595143986,
"grad_norm": 81.82262420654297,
"learning_rate": 4.95453140480412e-06,
"loss": 0.8349,
"mean_token_accuracy": 0.7691360473632812,
"num_tokens": 10111646.0,
"step": 1245
},
{
"entropy": 1.2638070821762084,
"epoch": 0.7058159232072275,
"grad_norm": 80.83232879638672,
"learning_rate": 4.954166666491299e-06,
"loss": 0.8681,
"mean_token_accuracy": 0.766269302368164,
"num_tokens": 10152363.0,
"step": 1250
},
{
"entropy": 1.2698018312454225,
"epoch": 0.7086391869000565,
"grad_norm": 92.83601379394531,
"learning_rate": 4.953800489194755e-06,
"loss": 0.864,
"mean_token_accuracy": 0.7671181201934815,
"num_tokens": 10192739.0,
"step": 1255
},
{
"entropy": 1.1670500040054321,
"epoch": 0.7114624505928854,
"grad_norm": 92.00151824951172,
"learning_rate": 4.953432873202555e-06,
"loss": 0.8179,
"mean_token_accuracy": 0.7748780965805053,
"num_tokens": 10232940.0,
"step": 1260
},
{
"entropy": 1.1253260135650636,
"epoch": 0.7142857142857143,
"grad_norm": 75.87653350830078,
"learning_rate": 4.953063818803897e-06,
"loss": 0.7016,
"mean_token_accuracy": 0.8046749353408813,
"num_tokens": 10273648.0,
"step": 1265
},
{
"entropy": 1.1963707447052,
"epoch": 0.7171089779785432,
"grad_norm": 89.2532958984375,
"learning_rate": 4.952693326289112e-06,
"loss": 0.8017,
"mean_token_accuracy": 0.7819836020469666,
"num_tokens": 10314317.0,
"step": 1270
},
{
"entropy": 1.2884133338928223,
"epoch": 0.7199322416713722,
"grad_norm": 94.43043518066406,
"learning_rate": 4.9523213959496635e-06,
"loss": 0.8479,
"mean_token_accuracy": 0.7720703601837158,
"num_tokens": 10355042.0,
"step": 1275
},
{
"entropy": 1.2198226928710938,
"epoch": 0.722755505364201,
"grad_norm": 76.82041931152344,
"learning_rate": 4.951948028078143e-06,
"loss": 0.8174,
"mean_token_accuracy": 0.7741351842880249,
"num_tokens": 10395782.0,
"step": 1280
},
{
"entropy": 1.3100014686584474,
"epoch": 0.7255787690570299,
"grad_norm": 84.25589752197266,
"learning_rate": 4.951573222968275e-06,
"loss": 0.8215,
"mean_token_accuracy": 0.7780505061149597,
"num_tokens": 10436544.0,
"step": 1285
},
{
"entropy": 1.349497175216675,
"epoch": 0.7284020327498588,
"grad_norm": 93.43528747558594,
"learning_rate": 4.951196980914915e-06,
"loss": 0.8345,
"mean_token_accuracy": 0.7736409544944763,
"num_tokens": 10477168.0,
"step": 1290
},
{
"entropy": 1.2174330472946167,
"epoch": 0.7312252964426877,
"grad_norm": 88.33943176269531,
"learning_rate": 4.950819302214048e-06,
"loss": 0.7522,
"mean_token_accuracy": 0.7905766010284424,
"num_tokens": 10517860.0,
"step": 1295
},
{
"entropy": 1.318717932701111,
"epoch": 0.7340485601355167,
"grad_norm": 110.97924041748047,
"learning_rate": 4.950440187162788e-06,
"loss": 0.8235,
"mean_token_accuracy": 0.7764236092567444,
"num_tokens": 10558572.0,
"step": 1300
},
{
"entropy": 1.2312336444854737,
"epoch": 0.7368718238283456,
"grad_norm": 79.44239044189453,
"learning_rate": 4.950059636059382e-06,
"loss": 0.8554,
"mean_token_accuracy": 0.7687599420547485,
"num_tokens": 10599139.0,
"step": 1305
},
{
"entropy": 1.2320231914520263,
"epoch": 0.7396950875211745,
"grad_norm": 80.55126953125,
"learning_rate": 4.949677649203205e-06,
"loss": 0.8153,
"mean_token_accuracy": 0.7771285057067872,
"num_tokens": 10639780.0,
"step": 1310
},
{
"entropy": 1.259970498085022,
"epoch": 0.7425183512140033,
"grad_norm": 84.45753479003906,
"learning_rate": 4.949294226894759e-06,
"loss": 0.8074,
"mean_token_accuracy": 0.7815461993217468,
"num_tokens": 10680141.0,
"step": 1315
},
{
"entropy": 1.3869355916976929,
"epoch": 0.7453416149068323,
"grad_norm": 83.97404479980469,
"learning_rate": 4.948909369435681e-06,
"loss": 0.8837,
"mean_token_accuracy": 0.7602412819862365,
"num_tokens": 10720965.0,
"step": 1320
},
{
"entropy": 1.2961002826690673,
"epoch": 0.7481648785996612,
"grad_norm": 86.73141479492188,
"learning_rate": 4.948523077128732e-06,
"loss": 0.7997,
"mean_token_accuracy": 0.779944372177124,
"num_tokens": 10761782.0,
"step": 1325
},
{
"entropy": 1.3356840848922729,
"epoch": 0.7509881422924901,
"grad_norm": 99.98656463623047,
"learning_rate": 4.948135350277804e-06,
"loss": 0.8639,
"mean_token_accuracy": 0.7667491197586059,
"num_tokens": 10802436.0,
"step": 1330
},
{
"entropy": 1.3833496570587158,
"epoch": 0.753811405985319,
"grad_norm": 97.27751922607422,
"learning_rate": 4.9477461891879175e-06,
"loss": 0.8625,
"mean_token_accuracy": 0.7689119338989258,
"num_tokens": 10842892.0,
"step": 1335
},
{
"entropy": 1.3714573621749877,
"epoch": 0.756634669678148,
"grad_norm": 96.33169555664062,
"learning_rate": 4.9473555941652205e-06,
"loss": 0.8321,
"mean_token_accuracy": 0.7727259278297425,
"num_tokens": 10883719.0,
"step": 1340
},
{
"entropy": 1.1747460842132569,
"epoch": 0.7594579333709769,
"grad_norm": 87.69495391845703,
"learning_rate": 4.94696356551699e-06,
"loss": 0.7642,
"mean_token_accuracy": 0.7857497334480286,
"num_tokens": 10924460.0,
"step": 1345
},
{
"entropy": 1.302380084991455,
"epoch": 0.7622811970638057,
"grad_norm": 83.05201721191406,
"learning_rate": 4.946570103551629e-06,
"loss": 0.8617,
"mean_token_accuracy": 0.7670197010040283,
"num_tokens": 10964412.0,
"step": 1350
},
{
"entropy": 1.3350585222244262,
"epoch": 0.7651044607566346,
"grad_norm": 106.51172637939453,
"learning_rate": 4.946175208578671e-06,
"loss": 0.8771,
"mean_token_accuracy": 0.7620217323303222,
"num_tokens": 11004987.0,
"step": 1355
},
{
"entropy": 1.40216805934906,
"epoch": 0.7679277244494636,
"grad_norm": 95.75938415527344,
"learning_rate": 4.945778880908774e-06,
"loss": 0.8833,
"mean_token_accuracy": 0.7614566802978515,
"num_tokens": 11045870.0,
"step": 1360
},
{
"entropy": 1.266040849685669,
"epoch": 0.7707509881422925,
"grad_norm": 95.71835327148438,
"learning_rate": 4.945381120853725e-06,
"loss": 0.8284,
"mean_token_accuracy": 0.7741784572601318,
"num_tokens": 11086215.0,
"step": 1365
},
{
"entropy": 1.22134690284729,
"epoch": 0.7735742518351214,
"grad_norm": 83.96733093261719,
"learning_rate": 4.9449819287264355e-06,
"loss": 0.8055,
"mean_token_accuracy": 0.7777843117713928,
"num_tokens": 11126570.0,
"step": 1370
},
{
"entropy": 1.4879465103149414,
"epoch": 0.7763975155279503,
"grad_norm": 105.71060943603516,
"learning_rate": 4.944581304840948e-06,
"loss": 0.8712,
"mean_token_accuracy": 0.7661378502845764,
"num_tokens": 11167233.0,
"step": 1375
},
{
"entropy": 1.2257203221321107,
"epoch": 0.7792207792207793,
"grad_norm": 77.5268325805664,
"learning_rate": 4.944179249512425e-06,
"loss": 0.7715,
"mean_token_accuracy": 0.7869171380996705,
"num_tokens": 11207695.0,
"step": 1380
},
{
"entropy": 1.3683911561965942,
"epoch": 0.7820440429136082,
"grad_norm": 82.34163665771484,
"learning_rate": 4.943775763057162e-06,
"loss": 0.862,
"mean_token_accuracy": 0.7655371904373169,
"num_tokens": 11248219.0,
"step": 1385
},
{
"entropy": 1.1952388167381287,
"epoch": 0.784867306606437,
"grad_norm": 85.42621612548828,
"learning_rate": 4.943370845792576e-06,
"loss": 0.7864,
"mean_token_accuracy": 0.7840098381042481,
"num_tokens": 11288948.0,
"step": 1390
},
{
"entropy": 1.15340895652771,
"epoch": 0.7876905702992659,
"grad_norm": 76.76214599609375,
"learning_rate": 4.942964498037211e-06,
"loss": 0.7282,
"mean_token_accuracy": 0.7976190567016601,
"num_tokens": 11329661.0,
"step": 1395
},
{
"entropy": 1.2893347024917603,
"epoch": 0.7905138339920948,
"grad_norm": 92.17630004882812,
"learning_rate": 4.942556720110734e-06,
"loss": 0.8198,
"mean_token_accuracy": 0.7803659439086914,
"num_tokens": 11370231.0,
"step": 1400
},
{
"entropy": 1.2334280252456664,
"epoch": 0.7933370976849238,
"grad_norm": 75.12127685546875,
"learning_rate": 4.942147512333941e-06,
"loss": 0.7722,
"mean_token_accuracy": 0.787669575214386,
"num_tokens": 11411082.0,
"step": 1405
},
{
"entropy": 1.2461157083511352,
"epoch": 0.7961603613777527,
"grad_norm": 73.56942749023438,
"learning_rate": 4.9417368750287505e-06,
"loss": 0.7603,
"mean_token_accuracy": 0.789132559299469,
"num_tokens": 11451850.0,
"step": 1410
},
{
"entropy": 1.2484428882598877,
"epoch": 0.7989836250705816,
"grad_norm": 105.41342163085938,
"learning_rate": 4.941324808518204e-06,
"loss": 0.8172,
"mean_token_accuracy": 0.7764388442039489,
"num_tokens": 11492503.0,
"step": 1415
},
{
"entropy": 1.3737143993377685,
"epoch": 0.8018068887634106,
"grad_norm": 94.12542724609375,
"learning_rate": 4.940911313126473e-06,
"loss": 0.8495,
"mean_token_accuracy": 0.7677985429763794,
"num_tokens": 11533093.0,
"step": 1420
},
{
"entropy": 1.1714343309402466,
"epoch": 0.8046301524562394,
"grad_norm": 99.68680572509766,
"learning_rate": 4.9404963891788475e-06,
"loss": 0.7494,
"mean_token_accuracy": 0.7915635108947754,
"num_tokens": 11573883.0,
"step": 1425
},
{
"entropy": 1.3285836696624755,
"epoch": 0.8074534161490683,
"grad_norm": 90.7094497680664,
"learning_rate": 4.940080037001742e-06,
"loss": 0.8449,
"mean_token_accuracy": 0.7693374752998352,
"num_tokens": 11614518.0,
"step": 1430
},
{
"entropy": 1.169933295249939,
"epoch": 0.8102766798418972,
"grad_norm": 72.55992889404297,
"learning_rate": 4.939662256922698e-06,
"loss": 0.7506,
"mean_token_accuracy": 0.7904896259307861,
"num_tokens": 11655176.0,
"step": 1435
},
{
"entropy": 1.2623589515686036,
"epoch": 0.8130999435347261,
"grad_norm": 75.47972869873047,
"learning_rate": 4.939243049270377e-06,
"loss": 0.8011,
"mean_token_accuracy": 0.7814219117164611,
"num_tokens": 11695698.0,
"step": 1440
},
{
"entropy": 1.2184853553771973,
"epoch": 0.8159232072275551,
"grad_norm": 83.59069061279297,
"learning_rate": 4.938822414374564e-06,
"loss": 0.7559,
"mean_token_accuracy": 0.7949077248573303,
"num_tokens": 11736485.0,
"step": 1445
},
{
"entropy": 1.1922895908355713,
"epoch": 0.818746470920384,
"grad_norm": 81.77254486083984,
"learning_rate": 4.938400352566171e-06,
"loss": 0.7586,
"mean_token_accuracy": 0.7909193158149719,
"num_tokens": 11777142.0,
"step": 1450
},
{
"entropy": 1.2163417339324951,
"epoch": 0.8215697346132129,
"grad_norm": 82.5959701538086,
"learning_rate": 4.937976864177224e-06,
"loss": 0.7848,
"mean_token_accuracy": 0.7832719922065735,
"num_tokens": 11817664.0,
"step": 1455
},
{
"entropy": 1.3492765426635742,
"epoch": 0.8243929983060417,
"grad_norm": 90.08179473876953,
"learning_rate": 4.937551949540879e-06,
"loss": 0.8054,
"mean_token_accuracy": 0.7786729335784912,
"num_tokens": 11858278.0,
"step": 1460
},
{
"entropy": 1.1865896463394165,
"epoch": 0.8272162619988707,
"grad_norm": 83.99300384521484,
"learning_rate": 4.937125608991411e-06,
"loss": 0.7966,
"mean_token_accuracy": 0.7846225380897522,
"num_tokens": 11899097.0,
"step": 1465
},
{
"entropy": 1.29942147731781,
"epoch": 0.8300395256916996,
"grad_norm": 106.03931427001953,
"learning_rate": 4.936697842864218e-06,
"loss": 0.8261,
"mean_token_accuracy": 0.7756908416748047,
"num_tokens": 11939670.0,
"step": 1470
},
{
"entropy": 1.3267370939254761,
"epoch": 0.8328627893845285,
"grad_norm": 91.82074737548828,
"learning_rate": 4.936268651495817e-06,
"loss": 0.8495,
"mean_token_accuracy": 0.7673832535743713,
"num_tokens": 11980324.0,
"step": 1475
},
{
"entropy": 1.3590010166168214,
"epoch": 0.8356860530773574,
"grad_norm": 103.03872680664062,
"learning_rate": 4.935838035223848e-06,
"loss": 0.8074,
"mean_token_accuracy": 0.7801963925361634,
"num_tokens": 12020951.0,
"step": 1480
},
{
"entropy": 1.2534837961196899,
"epoch": 0.8385093167701864,
"grad_norm": 83.96383666992188,
"learning_rate": 4.935405994387073e-06,
"loss": 0.7585,
"mean_token_accuracy": 0.7928261280059814,
"num_tokens": 12061701.0,
"step": 1485
},
{
"entropy": 1.2540729999542237,
"epoch": 0.8413325804630153,
"grad_norm": 79.00284576416016,
"learning_rate": 4.9349725293253716e-06,
"loss": 0.7991,
"mean_token_accuracy": 0.778445029258728,
"num_tokens": 12102385.0,
"step": 1490
},
{
"entropy": 1.3115083932876588,
"epoch": 0.8441558441558441,
"grad_norm": 90.81023406982422,
"learning_rate": 4.934537640379746e-06,
"loss": 0.8204,
"mean_token_accuracy": 0.7764154672622681,
"num_tokens": 12142334.0,
"step": 1495
},
{
"entropy": 1.2142093896865844,
"epoch": 0.846979107848673,
"grad_norm": 85.30113220214844,
"learning_rate": 4.93410132789232e-06,
"loss": 0.7324,
"mean_token_accuracy": 0.7949307560920715,
"num_tokens": 12182674.0,
"step": 1500
},
{
"epoch": 0.846979107848673,
"eval_entropy": 1.402895951271057,
"eval_loss": 0.7068600058555603,
"eval_mean_token_accuracy": 0.8026972770690918,
"eval_num_tokens": 12182674.0,
"eval_runtime": 2.4508,
"eval_samples_per_second": 15.913,
"eval_steps_per_second": 2.04,
"step": 1500
},
{
"entropy": 1.3486925840377808,
"epoch": 0.849802371541502,
"grad_norm": 87.53974914550781,
"learning_rate": 4.933663592206334e-06,
"loss": 0.8008,
"mean_token_accuracy": 0.7834212064743042,
"num_tokens": 12223259.0,
"step": 1505
},
{
"entropy": 1.3488259077072144,
"epoch": 0.8526256352343309,
"grad_norm": 100.90879821777344,
"learning_rate": 4.933224433666149e-06,
"loss": 0.8067,
"mean_token_accuracy": 0.7783913731575012,
"num_tokens": 12263995.0,
"step": 1510
},
{
"entropy": 1.2334957122802734,
"epoch": 0.8554488989271598,
"grad_norm": 80.5767593383789,
"learning_rate": 4.932783852617246e-06,
"loss": 0.7855,
"mean_token_accuracy": 0.7817227125167847,
"num_tokens": 12304800.0,
"step": 1515
},
{
"entropy": 1.302841305732727,
"epoch": 0.8582721626199887,
"grad_norm": 86.97444915771484,
"learning_rate": 4.932341849406226e-06,
"loss": 0.8505,
"mean_token_accuracy": 0.7658728957176208,
"num_tokens": 12345368.0,
"step": 1520
},
{
"entropy": 1.3894311428070067,
"epoch": 0.8610954263128177,
"grad_norm": 89.71028900146484,
"learning_rate": 4.931898424380807e-06,
"loss": 0.8677,
"mean_token_accuracy": 0.7690507411956787,
"num_tokens": 12385525.0,
"step": 1525
},
{
"entropy": 1.4505614280700683,
"epoch": 0.8639186900056465,
"grad_norm": 77.6048355102539,
"learning_rate": 4.9314535778898265e-06,
"loss": 0.8238,
"mean_token_accuracy": 0.7735900044441223,
"num_tokens": 12426154.0,
"step": 1530
},
{
"entropy": 1.2836755752563476,
"epoch": 0.8667419536984754,
"grad_norm": 86.86837005615234,
"learning_rate": 4.931007310283239e-06,
"loss": 0.7734,
"mean_token_accuracy": 0.7858689308166504,
"num_tokens": 12466767.0,
"step": 1535
},
{
"entropy": 1.3301030158996583,
"epoch": 0.8695652173913043,
"grad_norm": 91.63949584960938,
"learning_rate": 4.930559621912119e-06,
"loss": 0.8515,
"mean_token_accuracy": 0.7686718940734864,
"num_tokens": 12507425.0,
"step": 1540
},
{
"entropy": 1.179738998413086,
"epoch": 0.8723884810841332,
"grad_norm": 84.38855743408203,
"learning_rate": 4.9301105131286575e-06,
"loss": 0.7372,
"mean_token_accuracy": 0.7955639958381653,
"num_tokens": 12548198.0,
"step": 1545
},
{
"entropy": 1.2709600925445557,
"epoch": 0.8752117447769622,
"grad_norm": 81.11436462402344,
"learning_rate": 4.929659984286162e-06,
"loss": 0.793,
"mean_token_accuracy": 0.77951899766922,
"num_tokens": 12589035.0,
"step": 1550
},
{
"entropy": 1.2885040760040283,
"epoch": 0.8780350084697911,
"grad_norm": 82.41911315917969,
"learning_rate": 4.929208035739059e-06,
"loss": 0.8032,
"mean_token_accuracy": 0.7800351619720459,
"num_tokens": 12629659.0,
"step": 1555
},
{
"entropy": 1.336904239654541,
"epoch": 0.88085827216262,
"grad_norm": 81.08649444580078,
"learning_rate": 4.928754667842891e-06,
"loss": 0.8051,
"mean_token_accuracy": 0.7788230299949646,
"num_tokens": 12670444.0,
"step": 1560
},
{
"entropy": 1.2858728647232056,
"epoch": 0.883681535855449,
"grad_norm": 73.97679138183594,
"learning_rate": 4.9282998809543184e-06,
"loss": 0.8002,
"mean_token_accuracy": 0.7809403300285339,
"num_tokens": 12710507.0,
"step": 1565
},
{
"entropy": 1.1876578330993652,
"epoch": 0.8865047995482778,
"grad_norm": 75.67999267578125,
"learning_rate": 4.927843675431114e-06,
"loss": 0.7803,
"mean_token_accuracy": 0.7848599433898926,
"num_tokens": 12751257.0,
"step": 1570
},
{
"entropy": 1.3983800411224365,
"epoch": 0.8893280632411067,
"grad_norm": 93.8427734375,
"learning_rate": 4.927386051632171e-06,
"loss": 0.8444,
"mean_token_accuracy": 0.7689159154891968,
"num_tokens": 12791859.0,
"step": 1575
},
{
"entropy": 1.3883586883544923,
"epoch": 0.8921513269339356,
"grad_norm": 72.4537353515625,
"learning_rate": 4.926927009917497e-06,
"loss": 0.8619,
"mean_token_accuracy": 0.7652193188667298,
"num_tokens": 12832587.0,
"step": 1580
},
{
"entropy": 1.2995959281921388,
"epoch": 0.8949745906267645,
"grad_norm": 76.91584014892578,
"learning_rate": 4.926466550648214e-06,
"loss": 0.7652,
"mean_token_accuracy": 0.7863158702850341,
"num_tokens": 12873265.0,
"step": 1585
},
{
"entropy": 1.3862105369567872,
"epoch": 0.8977978543195935,
"grad_norm": 83.39083862304688,
"learning_rate": 4.926004674186559e-06,
"loss": 0.8303,
"mean_token_accuracy": 0.7747887969017029,
"num_tokens": 12913987.0,
"step": 1590
},
{
"entropy": 1.411944603919983,
"epoch": 0.9006211180124224,
"grad_norm": 100.23735046386719,
"learning_rate": 4.925541380895887e-06,
"loss": 0.8915,
"mean_token_accuracy": 0.7623481512069702,
"num_tokens": 12954481.0,
"step": 1595
},
{
"entropy": 1.271065902709961,
"epoch": 0.9034443817052513,
"grad_norm": 82.51213073730469,
"learning_rate": 4.925076671140663e-06,
"loss": 0.7595,
"mean_token_accuracy": 0.7901308059692382,
"num_tokens": 12994972.0,
"step": 1600
},
{
"entropy": 1.3827197790145873,
"epoch": 0.9062676453980801,
"grad_norm": 96.4259033203125,
"learning_rate": 4.924610545286469e-06,
"loss": 0.8563,
"mean_token_accuracy": 0.7717739820480347,
"num_tokens": 13035763.0,
"step": 1605
},
{
"entropy": 1.336452317237854,
"epoch": 0.9090909090909091,
"grad_norm": 81.70915985107422,
"learning_rate": 4.924143003700002e-06,
"loss": 0.794,
"mean_token_accuracy": 0.7825908780097961,
"num_tokens": 13076206.0,
"step": 1610
},
{
"entropy": 1.3284541368484497,
"epoch": 0.911914172783738,
"grad_norm": 76.34697723388672,
"learning_rate": 4.92367404674907e-06,
"loss": 0.7913,
"mean_token_accuracy": 0.7860976576805114,
"num_tokens": 13116891.0,
"step": 1615
},
{
"entropy": 1.2646841764450074,
"epoch": 0.9147374364765669,
"grad_norm": 72.51228332519531,
"learning_rate": 4.923203674802598e-06,
"loss": 0.7325,
"mean_token_accuracy": 0.7984351754188538,
"num_tokens": 13157468.0,
"step": 1620
},
{
"entropy": 1.3084574460983276,
"epoch": 0.9175607001693958,
"grad_norm": 107.15888214111328,
"learning_rate": 4.922731888230618e-06,
"loss": 0.8181,
"mean_token_accuracy": 0.7769845604896546,
"num_tokens": 13198363.0,
"step": 1625
},
{
"entropy": 1.1560762643814086,
"epoch": 0.9203839638622248,
"grad_norm": 76.61054229736328,
"learning_rate": 4.922258687404285e-06,
"loss": 0.7701,
"mean_token_accuracy": 0.7877731561660767,
"num_tokens": 13239144.0,
"step": 1630
},
{
"entropy": 1.353186583518982,
"epoch": 0.9232072275550537,
"grad_norm": 89.5690689086914,
"learning_rate": 4.9217840726958535e-06,
"loss": 0.8118,
"mean_token_accuracy": 0.778467345237732,
"num_tokens": 13279908.0,
"step": 1635
},
{
"entropy": 1.3830771446228027,
"epoch": 0.9260304912478825,
"grad_norm": 104.98886108398438,
"learning_rate": 4.921308044478703e-06,
"loss": 0.8618,
"mean_token_accuracy": 0.7697537779808045,
"num_tokens": 13320629.0,
"step": 1640
},
{
"entropy": 1.1967981815338136,
"epoch": 0.9288537549407114,
"grad_norm": 73.26326751708984,
"learning_rate": 4.9208306031273155e-06,
"loss": 0.7582,
"mean_token_accuracy": 0.7919500350952149,
"num_tokens": 13361048.0,
"step": 1645
},
{
"entropy": 1.3969624519348145,
"epoch": 0.9316770186335404,
"grad_norm": 93.82074737548828,
"learning_rate": 4.920351749017291e-06,
"loss": 0.8547,
"mean_token_accuracy": 0.7672523736953736,
"num_tokens": 13401586.0,
"step": 1650
},
{
"entropy": 1.3160523414611816,
"epoch": 0.9345002823263693,
"grad_norm": 104.9430923461914,
"learning_rate": 4.919871482525337e-06,
"loss": 0.846,
"mean_token_accuracy": 0.7698705196380615,
"num_tokens": 13442383.0,
"step": 1655
},
{
"entropy": 1.2329437971115111,
"epoch": 0.9373235460191982,
"grad_norm": 100.20277404785156,
"learning_rate": 4.919389804029273e-06,
"loss": 0.751,
"mean_token_accuracy": 0.7936769366264343,
"num_tokens": 13482846.0,
"step": 1660
},
{
"entropy": 1.271721601486206,
"epoch": 0.9401468097120271,
"grad_norm": 85.16747283935547,
"learning_rate": 4.918906713908032e-06,
"loss": 0.7839,
"mean_token_accuracy": 0.7872077226638794,
"num_tokens": 13523418.0,
"step": 1665
},
{
"entropy": 1.2720824241638184,
"epoch": 0.9429700734048561,
"grad_norm": 87.69062805175781,
"learning_rate": 4.918422212541653e-06,
"loss": 0.8469,
"mean_token_accuracy": 0.7712218523025512,
"num_tokens": 13564069.0,
"step": 1670
},
{
"entropy": 1.4191228151321411,
"epoch": 0.9457933370976849,
"grad_norm": 98.06558990478516,
"learning_rate": 4.917936300311288e-06,
"loss": 0.8549,
"mean_token_accuracy": 0.7684029340744019,
"num_tokens": 13604748.0,
"step": 1675
},
{
"entropy": 1.38045175075531,
"epoch": 0.9486166007905138,
"grad_norm": 81.36489868164062,
"learning_rate": 4.9174489775991985e-06,
"loss": 0.841,
"mean_token_accuracy": 0.767688262462616,
"num_tokens": 13645366.0,
"step": 1680
},
{
"entropy": 1.307987141609192,
"epoch": 0.9514398644833427,
"grad_norm": 83.6993408203125,
"learning_rate": 4.916960244788755e-06,
"loss": 0.8074,
"mean_token_accuracy": 0.7801409244537354,
"num_tokens": 13686143.0,
"step": 1685
},
{
"entropy": 1.3370389699935914,
"epoch": 0.9542631281761716,
"grad_norm": 87.65348052978516,
"learning_rate": 4.91647010226444e-06,
"loss": 0.7783,
"mean_token_accuracy": 0.7867790102958679,
"num_tokens": 13726865.0,
"step": 1690
},
{
"entropy": 1.392472004890442,
"epoch": 0.9570863918690006,
"grad_norm": 89.71502685546875,
"learning_rate": 4.9159785504118405e-06,
"loss": 0.808,
"mean_token_accuracy": 0.7795929908752441,
"num_tokens": 13767384.0,
"step": 1695
},
{
"entropy": 1.4235849142074586,
"epoch": 0.9599096555618295,
"grad_norm": 82.83116149902344,
"learning_rate": 4.9154855896176555e-06,
"loss": 0.8453,
"mean_token_accuracy": 0.7748985409736633,
"num_tokens": 13808044.0,
"step": 1700
},
{
"entropy": 1.4003316402435302,
"epoch": 0.9627329192546584,
"grad_norm": 101.7616958618164,
"learning_rate": 4.9149912202696905e-06,
"loss": 0.8407,
"mean_token_accuracy": 0.7737895011901855,
"num_tokens": 13848853.0,
"step": 1705
},
{
"entropy": 1.2794543743133544,
"epoch": 0.9655561829474872,
"grad_norm": 88.42160034179688,
"learning_rate": 4.9144954427568615e-06,
"loss": 0.775,
"mean_token_accuracy": 0.7884101748466492,
"num_tokens": 13889396.0,
"step": 1710
},
{
"entropy": 1.3160929203033447,
"epoch": 0.9683794466403162,
"grad_norm": 89.33016204833984,
"learning_rate": 4.913998257469189e-06,
"loss": 0.7975,
"mean_token_accuracy": 0.7818446278572082,
"num_tokens": 13930171.0,
"step": 1715
},
{
"entropy": 1.4001628637313843,
"epoch": 0.9712027103331451,
"grad_norm": 84.03208923339844,
"learning_rate": 4.913499664797805e-06,
"loss": 0.8569,
"mean_token_accuracy": 0.7669258832931518,
"num_tokens": 13970784.0,
"step": 1720
},
{
"entropy": 1.3456218719482422,
"epoch": 0.974025974025974,
"grad_norm": 91.97527313232422,
"learning_rate": 4.912999665134944e-06,
"loss": 0.8883,
"mean_token_accuracy": 0.7597482323646545,
"num_tokens": 14011324.0,
"step": 1725
},
{
"entropy": 1.305102777481079,
"epoch": 0.9768492377188029,
"grad_norm": 78.9338150024414,
"learning_rate": 4.912498258873952e-06,
"loss": 0.7794,
"mean_token_accuracy": 0.7846640110015869,
"num_tokens": 14052054.0,
"step": 1730
},
{
"entropy": 1.3051819801330566,
"epoch": 0.9796725014116319,
"grad_norm": 78.96739196777344,
"learning_rate": 4.911995446409277e-06,
"loss": 0.7876,
"mean_token_accuracy": 0.7836643576622009,
"num_tokens": 14092687.0,
"step": 1735
},
{
"entropy": 1.3111673593521118,
"epoch": 0.9824957651044608,
"grad_norm": 70.50989532470703,
"learning_rate": 4.911491228136478e-06,
"loss": 0.7716,
"mean_token_accuracy": 0.7886781573295594,
"num_tokens": 14133371.0,
"step": 1740
},
{
"entropy": 1.206014060974121,
"epoch": 0.9853190287972897,
"grad_norm": 84.27452087402344,
"learning_rate": 4.9109856044522164e-06,
"loss": 0.7606,
"mean_token_accuracy": 0.7878808975219727,
"num_tokens": 14173882.0,
"step": 1745
},
{
"entropy": 1.2502301692962647,
"epoch": 0.9881422924901185,
"grad_norm": 78.59927368164062,
"learning_rate": 4.91047857575426e-06,
"loss": 0.7112,
"mean_token_accuracy": 0.8004097700119018,
"num_tokens": 14214526.0,
"step": 1750
},
{
"entropy": 1.3274275541305542,
"epoch": 0.9909655561829475,
"grad_norm": 81.658447265625,
"learning_rate": 4.909970142441483e-06,
"loss": 0.8183,
"mean_token_accuracy": 0.7792202591896057,
"num_tokens": 14255273.0,
"step": 1755
},
{
"entropy": 1.3543532848358155,
"epoch": 0.9937888198757764,
"grad_norm": 87.37543487548828,
"learning_rate": 4.909460304913863e-06,
"loss": 0.8334,
"mean_token_accuracy": 0.77262362241745,
"num_tokens": 14296045.0,
"step": 1760
},
{
"entropy": 1.2293973207473754,
"epoch": 0.9966120835686053,
"grad_norm": 78.08673095703125,
"learning_rate": 4.9089490635724845e-06,
"loss": 0.794,
"mean_token_accuracy": 0.7820210456848145,
"num_tokens": 14336723.0,
"step": 1765
},
{
"entropy": 1.3779594898223877,
"epoch": 0.9994353472614342,
"grad_norm": 90.49781036376953,
"learning_rate": 4.908436418819533e-06,
"loss": 0.8196,
"mean_token_accuracy": 0.7763382792472839,
"num_tokens": 14377368.0,
"step": 1770
},
{
"entropy": 1.2417575359344482,
"epoch": 1.002258610954263,
"grad_norm": 78.08610534667969,
"learning_rate": 4.907922371058302e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.7971941590309143,
"num_tokens": 14411228.0,
"step": 1775
},
{
"entropy": 0.9904776334762573,
"epoch": 1.005081874647092,
"grad_norm": 89.43754577636719,
"learning_rate": 4.907406920693187e-06,
"loss": 0.6408,
"mean_token_accuracy": 0.8144368052482605,
"num_tokens": 14451550.0,
"step": 1780
},
{
"entropy": 1.1428791284561157,
"epoch": 1.007905138339921,
"grad_norm": 81.54706573486328,
"learning_rate": 4.9068900681296845e-06,
"loss": 0.6205,
"mean_token_accuracy": 0.8160752058029175,
"num_tokens": 14492399.0,
"step": 1785
},
{
"entropy": 1.0305280327796935,
"epoch": 1.0107284020327498,
"grad_norm": 83.88504028320312,
"learning_rate": 4.906371813774398e-06,
"loss": 0.5993,
"mean_token_accuracy": 0.8244444727897644,
"num_tokens": 14533135.0,
"step": 1790
},
{
"entropy": 1.0244199752807617,
"epoch": 1.0135516657255788,
"grad_norm": 79.7117919921875,
"learning_rate": 4.90585215803503e-06,
"loss": 0.6408,
"mean_token_accuracy": 0.8139601945877075,
"num_tokens": 14573933.0,
"step": 1795
},
{
"entropy": 1.0104292154312133,
"epoch": 1.0163749294184077,
"grad_norm": 85.22904205322266,
"learning_rate": 4.9053311013203906e-06,
"loss": 0.6213,
"mean_token_accuracy": 0.8191802263259887,
"num_tokens": 14614666.0,
"step": 1800
},
{
"entropy": 1.0330065846443177,
"epoch": 1.0191981931112366,
"grad_norm": 79.9068832397461,
"learning_rate": 4.904808644040388e-06,
"loss": 0.585,
"mean_token_accuracy": 0.829715096950531,
"num_tokens": 14655486.0,
"step": 1805
},
{
"entropy": 1.1681673049926757,
"epoch": 1.0220214568040655,
"grad_norm": 96.07464599609375,
"learning_rate": 4.90428478660603e-06,
"loss": 0.7047,
"mean_token_accuracy": 0.7974439859390259,
"num_tokens": 14695925.0,
"step": 1810
},
{
"entropy": 1.026873731613159,
"epoch": 1.0248447204968945,
"grad_norm": 83.59033203125,
"learning_rate": 4.9037595294294334e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.817428719997406,
"num_tokens": 14736564.0,
"step": 1815
},
{
"entropy": 1.0269073605537415,
"epoch": 1.0276679841897234,
"grad_norm": 87.5685806274414,
"learning_rate": 4.90323287292381e-06,
"loss": 0.6265,
"mean_token_accuracy": 0.8175249814987182,
"num_tokens": 14777242.0,
"step": 1820
},
{
"entropy": 1.0845494270324707,
"epoch": 1.0304912478825523,
"grad_norm": 87.97651672363281,
"learning_rate": 4.902704817503474e-06,
"loss": 0.6521,
"mean_token_accuracy": 0.8123294234275817,
"num_tokens": 14817756.0,
"step": 1825
},
{
"entropy": 1.094653880596161,
"epoch": 1.0333145115753812,
"grad_norm": 73.90741729736328,
"learning_rate": 4.90217536358384e-06,
"loss": 0.6592,
"mean_token_accuracy": 0.8132083773612976,
"num_tokens": 14857979.0,
"step": 1830
},
{
"entropy": 1.0332510590553283,
"epoch": 1.0361377752682102,
"grad_norm": 80.54743957519531,
"learning_rate": 4.901644511581425e-06,
"loss": 0.6275,
"mean_token_accuracy": 0.817654836177826,
"num_tokens": 14898498.0,
"step": 1835
},
{
"entropy": 1.1335099458694458,
"epoch": 1.0389610389610389,
"grad_norm": 111.70860290527344,
"learning_rate": 4.901112261913841e-06,
"loss": 0.702,
"mean_token_accuracy": 0.7976698398590087,
"num_tokens": 14938994.0,
"step": 1840
},
{
"entropy": 1.0480782866477967,
"epoch": 1.0417843026538678,
"grad_norm": 65.32731628417969,
"learning_rate": 4.9005786149998045e-06,
"loss": 0.5995,
"mean_token_accuracy": 0.8246279358863831,
"num_tokens": 14979080.0,
"step": 1845
},
{
"entropy": 1.1315045833587647,
"epoch": 1.0446075663466967,
"grad_norm": 94.34065246582031,
"learning_rate": 4.90004357125913e-06,
"loss": 0.6734,
"mean_token_accuracy": 0.8067026734352112,
"num_tokens": 15019684.0,
"step": 1850
},
{
"entropy": 1.2490570545196533,
"epoch": 1.0474308300395256,
"grad_norm": 91.20641326904297,
"learning_rate": 4.899507131112727e-06,
"loss": 0.75,
"mean_token_accuracy": 0.7861335158348084,
"num_tokens": 15060297.0,
"step": 1855
},
{
"entropy": 1.2040926933288574,
"epoch": 1.0502540937323546,
"grad_norm": 77.99979400634766,
"learning_rate": 4.89896929498261e-06,
"loss": 0.6917,
"mean_token_accuracy": 0.8017707467079163,
"num_tokens": 15100764.0,
"step": 1860
},
{
"entropy": 1.2546609878540038,
"epoch": 1.0530773574251835,
"grad_norm": 84.84165954589844,
"learning_rate": 4.898430063291886e-06,
"loss": 0.6753,
"mean_token_accuracy": 0.8059080719947815,
"num_tokens": 15141502.0,
"step": 1865
},
{
"entropy": 1.1682037115097046,
"epoch": 1.0559006211180124,
"grad_norm": 77.86764526367188,
"learning_rate": 4.897889436464763e-06,
"loss": 0.6515,
"mean_token_accuracy": 0.8100051641464233,
"num_tokens": 15182082.0,
"step": 1870
},
{
"entropy": 1.063246726989746,
"epoch": 1.0587238848108413,
"grad_norm": 74.87948608398438,
"learning_rate": 4.8973474149265456e-06,
"loss": 0.6319,
"mean_token_accuracy": 0.8158110737800598,
"num_tokens": 15222792.0,
"step": 1875
},
{
"entropy": 1.170669722557068,
"epoch": 1.0615471485036703,
"grad_norm": 88.10620880126953,
"learning_rate": 4.896803999103636e-06,
"loss": 0.6891,
"mean_token_accuracy": 0.8011930465698243,
"num_tokens": 15263508.0,
"step": 1880
},
{
"entropy": 1.0335828423500062,
"epoch": 1.0643704121964992,
"grad_norm": 93.42755889892578,
"learning_rate": 4.896259189423533e-06,
"loss": 0.5967,
"mean_token_accuracy": 0.8240804553031922,
"num_tokens": 15304067.0,
"step": 1885
},
{
"entropy": 1.0640571594238282,
"epoch": 1.0671936758893281,
"grad_norm": 100.56329345703125,
"learning_rate": 4.895712986314831e-06,
"loss": 0.6666,
"mean_token_accuracy": 0.8102141976356506,
"num_tokens": 15344671.0,
"step": 1890
},
{
"entropy": 1.099123191833496,
"epoch": 1.070016939582157,
"grad_norm": 84.1073989868164,
"learning_rate": 4.8951653902072226e-06,
"loss": 0.6195,
"mean_token_accuracy": 0.8202435731887817,
"num_tokens": 15384831.0,
"step": 1895
},
{
"entropy": 1.1124640941619872,
"epoch": 1.072840203274986,
"grad_norm": 80.79557800292969,
"learning_rate": 4.894616401531495e-06,
"loss": 0.6671,
"mean_token_accuracy": 0.8074574589729309,
"num_tokens": 15425501.0,
"step": 1900
},
{
"entropy": 1.1609252452850343,
"epoch": 1.075663466967815,
"grad_norm": 73.7085189819336,
"learning_rate": 4.89406602071953e-06,
"loss": 0.7432,
"mean_token_accuracy": 0.7908249497413635,
"num_tokens": 15466052.0,
"step": 1905
},
{
"entropy": 1.2594902276992799,
"epoch": 1.0784867306606438,
"grad_norm": 90.67098236083984,
"learning_rate": 4.893514248204307e-06,
"loss": 0.7448,
"mean_token_accuracy": 0.7901643276214599,
"num_tokens": 15506509.0,
"step": 1910
},
{
"entropy": 1.089804220199585,
"epoch": 1.0813099943534725,
"grad_norm": 83.40077209472656,
"learning_rate": 4.892961084419899e-06,
"loss": 0.6137,
"mean_token_accuracy": 0.8197898030281067,
"num_tokens": 15547127.0,
"step": 1915
},
{
"entropy": 1.0303892850875855,
"epoch": 1.0841332580463015,
"grad_norm": 88.45729064941406,
"learning_rate": 4.892406529801472e-06,
"loss": 0.662,
"mean_token_accuracy": 0.8114518284797668,
"num_tokens": 15587691.0,
"step": 1920
},
{
"entropy": 1.036658251285553,
"epoch": 1.0869565217391304,
"grad_norm": 74.7301254272461,
"learning_rate": 4.8918505847852885e-06,
"loss": 0.6001,
"mean_token_accuracy": 0.8228102445602417,
"num_tokens": 15628447.0,
"step": 1925
},
{
"entropy": 1.0575623393058777,
"epoch": 1.0897797854319593,
"grad_norm": 84.69684600830078,
"learning_rate": 4.8912932498087035e-06,
"loss": 0.6356,
"mean_token_accuracy": 0.8159940361976623,
"num_tokens": 15668880.0,
"step": 1930
},
{
"entropy": 1.0984156847000122,
"epoch": 1.0926030491247882,
"grad_norm": 83.59989166259766,
"learning_rate": 4.890734525310166e-06,
"loss": 0.6254,
"mean_token_accuracy": 0.8188717722892761,
"num_tokens": 15709301.0,
"step": 1935
},
{
"entropy": 1.264682650566101,
"epoch": 1.0954263128176172,
"grad_norm": 111.08476257324219,
"learning_rate": 4.890174411729218e-06,
"loss": 0.7209,
"mean_token_accuracy": 0.7967895984649658,
"num_tokens": 15749861.0,
"step": 1940
},
{
"entropy": 1.0748285174369812,
"epoch": 1.098249576510446,
"grad_norm": 72.8355941772461,
"learning_rate": 4.889612909506495e-06,
"loss": 0.6334,
"mean_token_accuracy": 0.8148902177810669,
"num_tokens": 15790605.0,
"step": 1945
},
{
"entropy": 1.1956205368041992,
"epoch": 1.101072840203275,
"grad_norm": 76.77680206298828,
"learning_rate": 4.889050019083722e-06,
"loss": 0.6858,
"mean_token_accuracy": 0.8026182293891907,
"num_tokens": 15831241.0,
"step": 1950
},
{
"entropy": 1.1811149835586547,
"epoch": 1.103896103896104,
"grad_norm": 85.11073303222656,
"learning_rate": 4.88848574090372e-06,
"loss": 0.6532,
"mean_token_accuracy": 0.8134795784950256,
"num_tokens": 15871995.0,
"step": 1955
},
{
"entropy": 1.0546734929084778,
"epoch": 1.1067193675889329,
"grad_norm": 91.46461486816406,
"learning_rate": 4.8879200754104e-06,
"loss": 0.6257,
"mean_token_accuracy": 0.8180089235305786,
"num_tokens": 15912519.0,
"step": 1960
},
{
"entropy": 1.1119594335556031,
"epoch": 1.1095426312817618,
"grad_norm": 81.14483642578125,
"learning_rate": 4.887353023048762e-06,
"loss": 0.6163,
"mean_token_accuracy": 0.8205843210220337,
"num_tokens": 15953027.0,
"step": 1965
},
{
"entropy": 1.1416823863983154,
"epoch": 1.1123658949745907,
"grad_norm": 77.70703887939453,
"learning_rate": 4.886784584264903e-06,
"loss": 0.6972,
"mean_token_accuracy": 0.8091330289840698,
"num_tokens": 15993823.0,
"step": 1970
},
{
"entropy": 1.098362350463867,
"epoch": 1.1151891586674196,
"grad_norm": 70.67080688476562,
"learning_rate": 4.8862147595060045e-06,
"loss": 0.64,
"mean_token_accuracy": 0.8146771907806396,
"num_tokens": 16034476.0,
"step": 1975
},
{
"entropy": 1.0884705901145935,
"epoch": 1.1180124223602483,
"grad_norm": 82.03054809570312,
"learning_rate": 4.885643549220342e-06,
"loss": 0.5783,
"mean_token_accuracy": 0.8282935738563537,
"num_tokens": 16075193.0,
"step": 1980
},
{
"entropy": 1.2014532089233398,
"epoch": 1.1208356860530773,
"grad_norm": 75.95311737060547,
"learning_rate": 4.885070953857279e-06,
"loss": 0.7202,
"mean_token_accuracy": 0.7983322143554688,
"num_tokens": 16115418.0,
"step": 1985
},
{
"entropy": 1.1905818343162538,
"epoch": 1.1236589497459062,
"grad_norm": 88.39530181884766,
"learning_rate": 4.88449697386727e-06,
"loss": 0.658,
"mean_token_accuracy": 0.8100619435310363,
"num_tokens": 16156001.0,
"step": 1990
},
{
"entropy": 1.1695854306221007,
"epoch": 1.1264822134387351,
"grad_norm": 75.26634979248047,
"learning_rate": 4.883921609701858e-06,
"loss": 0.6497,
"mean_token_accuracy": 0.8133507966995239,
"num_tokens": 16196588.0,
"step": 1995
},
{
"entropy": 1.1279015779495238,
"epoch": 1.129305477131564,
"grad_norm": 79.23324584960938,
"learning_rate": 4.883344861813675e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.8204566597938537,
"num_tokens": 16237363.0,
"step": 2000
},
{
"epoch": 1.129305477131564,
"eval_entropy": 1.276236355304718,
"eval_loss": 0.6153639554977417,
"eval_mean_token_accuracy": 0.8272382497787476,
"eval_num_tokens": 16237363.0,
"eval_runtime": 2.4525,
"eval_samples_per_second": 15.902,
"eval_steps_per_second": 2.039,
"step": 2000
},
{
"entropy": 1.1289226531982421,
"epoch": 1.132128740824393,
"grad_norm": 93.11396789550781,
"learning_rate": 4.882766730656443e-06,
"loss": 0.6587,
"mean_token_accuracy": 0.8116627216339112,
"num_tokens": 16278012.0,
"step": 2005
},
{
"entropy": 1.1176002979278565,
"epoch": 1.134952004517222,
"grad_norm": 70.02957153320312,
"learning_rate": 4.882187216684969e-06,
"loss": 0.6425,
"mean_token_accuracy": 0.8133005023002624,
"num_tokens": 16318860.0,
"step": 2010
},
{
"entropy": 1.1600654363632201,
"epoch": 1.1377752682100508,
"grad_norm": 87.49974060058594,
"learning_rate": 4.881606320355152e-06,
"loss": 0.6819,
"mean_token_accuracy": 0.8076816439628601,
"num_tokens": 16359499.0,
"step": 2015
},
{
"entropy": 1.3200592517852783,
"epoch": 1.1405985319028797,
"grad_norm": 90.8593521118164,
"learning_rate": 4.881024042123974e-06,
"loss": 0.7624,
"mean_token_accuracy": 0.787363862991333,
"num_tokens": 16400128.0,
"step": 2020
},
{
"entropy": 1.1962620735168457,
"epoch": 1.1434217955957087,
"grad_norm": 73.66609954833984,
"learning_rate": 4.880440382449508e-06,
"loss": 0.6969,
"mean_token_accuracy": 0.8003793478012085,
"num_tokens": 16440829.0,
"step": 2025
},
{
"entropy": 1.1741215705871582,
"epoch": 1.1462450592885376,
"grad_norm": 93.21961212158203,
"learning_rate": 4.87985534179091e-06,
"loss": 0.6988,
"mean_token_accuracy": 0.8001942276954651,
"num_tokens": 16481536.0,
"step": 2030
},
{
"entropy": 1.1626744151115418,
"epoch": 1.1490683229813665,
"grad_norm": 78.82553100585938,
"learning_rate": 4.879268920608428e-06,
"loss": 0.684,
"mean_token_accuracy": 0.8021666407585144,
"num_tokens": 16522024.0,
"step": 2035
},
{
"entropy": 1.2014680624008178,
"epoch": 1.1518915866741954,
"grad_norm": 69.18099212646484,
"learning_rate": 4.87868111936339e-06,
"loss": 0.6776,
"mean_token_accuracy": 0.8064275741577148,
"num_tokens": 16562819.0,
"step": 2040
},
{
"entropy": 1.194064235687256,
"epoch": 1.1547148503670244,
"grad_norm": 100.71419525146484,
"learning_rate": 4.878091938518213e-06,
"loss": 0.6952,
"mean_token_accuracy": 0.8035964369773865,
"num_tokens": 16603097.0,
"step": 2045
},
{
"entropy": 1.0866102695465087,
"epoch": 1.1575381140598533,
"grad_norm": 86.4557876586914,
"learning_rate": 4.877501378536398e-06,
"loss": 0.6586,
"mean_token_accuracy": 0.8083908200263977,
"num_tokens": 16643682.0,
"step": 2050
},
{
"entropy": 1.177332592010498,
"epoch": 1.160361377752682,
"grad_norm": 79.74742126464844,
"learning_rate": 4.876909439882533e-06,
"loss": 0.7028,
"mean_token_accuracy": 0.8003194212913514,
"num_tokens": 16684377.0,
"step": 2055
},
{
"entropy": 1.1629464387893678,
"epoch": 1.163184641445511,
"grad_norm": 71.04415130615234,
"learning_rate": 4.8763161230222875e-06,
"loss": 0.6799,
"mean_token_accuracy": 0.8050651669502258,
"num_tokens": 16725224.0,
"step": 2060
},
{
"entropy": 1.272722601890564,
"epoch": 1.1660079051383399,
"grad_norm": 95.0716323852539,
"learning_rate": 4.875721428422418e-06,
"loss": 0.7034,
"mean_token_accuracy": 0.7992398619651795,
"num_tokens": 16765834.0,
"step": 2065
},
{
"entropy": 1.1523715257644653,
"epoch": 1.1688311688311688,
"grad_norm": 71.78240966796875,
"learning_rate": 4.875125356550762e-06,
"loss": 0.6712,
"mean_token_accuracy": 0.8078425049781799,
"num_tokens": 16806092.0,
"step": 2070
},
{
"entropy": 1.1174561262130738,
"epoch": 1.1716544325239977,
"grad_norm": 80.72341918945312,
"learning_rate": 4.874527907876244e-06,
"loss": 0.6329,
"mean_token_accuracy": 0.8179211616516113,
"num_tokens": 16846837.0,
"step": 2075
},
{
"entropy": 1.1514964818954467,
"epoch": 1.1744776962168266,
"grad_norm": 97.3735580444336,
"learning_rate": 4.87392908286887e-06,
"loss": 0.673,
"mean_token_accuracy": 0.8057490825653076,
"num_tokens": 16887539.0,
"step": 2080
},
{
"entropy": 1.1512172937393188,
"epoch": 1.1773009599096556,
"grad_norm": 90.2729263305664,
"learning_rate": 4.873328881999726e-06,
"loss": 0.6707,
"mean_token_accuracy": 0.8072671890258789,
"num_tokens": 16928198.0,
"step": 2085
},
{
"entropy": 1.1971535444259644,
"epoch": 1.1801242236024845,
"grad_norm": 84.18321990966797,
"learning_rate": 4.872727305740986e-06,
"loss": 0.6697,
"mean_token_accuracy": 0.8083752751350403,
"num_tokens": 16968681.0,
"step": 2090
},
{
"entropy": 1.1049229860305787,
"epoch": 1.1829474872953134,
"grad_norm": 83.14247131347656,
"learning_rate": 4.872124354565901e-06,
"loss": 0.658,
"mean_token_accuracy": 0.8073788404464721,
"num_tokens": 17009546.0,
"step": 2095
},
{
"entropy": 1.0792413353919983,
"epoch": 1.1857707509881423,
"grad_norm": 70.80720520019531,
"learning_rate": 4.871520028948807e-06,
"loss": 0.6335,
"mean_token_accuracy": 0.8118250489234924,
"num_tokens": 17050298.0,
"step": 2100
},
{
"entropy": 1.1736824989318848,
"epoch": 1.1885940146809713,
"grad_norm": 81.12787628173828,
"learning_rate": 4.870914329365117e-06,
"loss": 0.724,
"mean_token_accuracy": 0.7910011529922485,
"num_tokens": 17090911.0,
"step": 2105
},
{
"entropy": 1.2028324127197265,
"epoch": 1.1914172783738002,
"grad_norm": 89.57659912109375,
"learning_rate": 4.870307256291331e-06,
"loss": 0.7212,
"mean_token_accuracy": 0.7946621179580688,
"num_tokens": 17131732.0,
"step": 2110
},
{
"entropy": 1.023105502128601,
"epoch": 1.194240542066629,
"grad_norm": 77.13057708740234,
"learning_rate": 4.869698810205025e-06,
"loss": 0.6184,
"mean_token_accuracy": 0.8203395962715149,
"num_tokens": 17172456.0,
"step": 2115
},
{
"entropy": 1.155041766166687,
"epoch": 1.1970638057594578,
"grad_norm": 73.1446304321289,
"learning_rate": 4.869088991584854e-06,
"loss": 0.6336,
"mean_token_accuracy": 0.8165818333625794,
"num_tokens": 17212966.0,
"step": 2120
},
{
"entropy": 1.1935111284255981,
"epoch": 1.199887069452287,
"grad_norm": 81.32876586914062,
"learning_rate": 4.8684778009105596e-06,
"loss": 0.6916,
"mean_token_accuracy": 0.79750075340271,
"num_tokens": 17253526.0,
"step": 2125
},
{
"entropy": 1.0915156602859497,
"epoch": 1.2027103331451157,
"grad_norm": 91.79847717285156,
"learning_rate": 4.867865238662954e-06,
"loss": 0.6711,
"mean_token_accuracy": 0.8062323927879333,
"num_tokens": 17293948.0,
"step": 2130
},
{
"entropy": 1.118806290626526,
"epoch": 1.2055335968379446,
"grad_norm": 73.3872299194336,
"learning_rate": 4.867251305323935e-06,
"loss": 0.6266,
"mean_token_accuracy": 0.8176208019256592,
"num_tokens": 17334493.0,
"step": 2135
},
{
"entropy": 1.1635672569274902,
"epoch": 1.2083568605307735,
"grad_norm": 89.14440155029297,
"learning_rate": 4.866636001376475e-06,
"loss": 0.679,
"mean_token_accuracy": 0.8068656921386719,
"num_tokens": 17375343.0,
"step": 2140
},
{
"entropy": 1.1096044063568116,
"epoch": 1.2111801242236024,
"grad_norm": 77.38127136230469,
"learning_rate": 4.8660193273046295e-06,
"loss": 0.6625,
"mean_token_accuracy": 0.8062254667282105,
"num_tokens": 17415968.0,
"step": 2145
},
{
"entropy": 1.2033586382865906,
"epoch": 1.2140033879164314,
"grad_norm": 89.66434478759766,
"learning_rate": 4.865401283593525e-06,
"loss": 0.6891,
"mean_token_accuracy": 0.8035327792167664,
"num_tokens": 17456339.0,
"step": 2150
},
{
"entropy": 1.1876700401306153,
"epoch": 1.2168266516092603,
"grad_norm": 76.18563842773438,
"learning_rate": 4.864781870729371e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.8002577424049377,
"num_tokens": 17496415.0,
"step": 2155
},
{
"entropy": 1.2191560983657836,
"epoch": 1.2196499153020892,
"grad_norm": 70.3829574584961,
"learning_rate": 4.864161089199453e-06,
"loss": 0.7123,
"mean_token_accuracy": 0.7971852898597718,
"num_tokens": 17537171.0,
"step": 2160
},
{
"entropy": 1.0485622882843018,
"epoch": 1.2224731789949181,
"grad_norm": 72.504150390625,
"learning_rate": 4.86353893949213e-06,
"loss": 0.5931,
"mean_token_accuracy": 0.827324640750885,
"num_tokens": 17577545.0,
"step": 2165
},
{
"entropy": 1.0504500150680542,
"epoch": 1.225296442687747,
"grad_norm": 79.71308135986328,
"learning_rate": 4.862915422096842e-06,
"loss": 0.6625,
"mean_token_accuracy": 0.8063273549079895,
"num_tokens": 17618424.0,
"step": 2170
},
{
"entropy": 1.089062762260437,
"epoch": 1.228119706380576,
"grad_norm": 76.05992889404297,
"learning_rate": 4.862290537504102e-06,
"loss": 0.6293,
"mean_token_accuracy": 0.8189480900764465,
"num_tokens": 17659160.0,
"step": 2175
},
{
"entropy": 1.1212316513061524,
"epoch": 1.230942970073405,
"grad_norm": 84.9821548461914,
"learning_rate": 4.861664286205499e-06,
"loss": 0.6649,
"mean_token_accuracy": 0.8077459692955017,
"num_tokens": 17699632.0,
"step": 2180
},
{
"entropy": 1.063609516620636,
"epoch": 1.2337662337662338,
"grad_norm": 75.26050567626953,
"learning_rate": 4.861036668693698e-06,
"loss": 0.6225,
"mean_token_accuracy": 0.8181837797164917,
"num_tokens": 17740472.0,
"step": 2185
},
{
"entropy": 1.2392653703689576,
"epoch": 1.2365894974590628,
"grad_norm": 82.38720703125,
"learning_rate": 4.860407685462438e-06,
"loss": 0.7298,
"mean_token_accuracy": 0.7937769293785095,
"num_tokens": 17780447.0,
"step": 2190
},
{
"entropy": 1.1494086742401124,
"epoch": 1.2394127611518915,
"grad_norm": 84.39624786376953,
"learning_rate": 4.859777337006533e-06,
"loss": 0.6817,
"mean_token_accuracy": 0.8082091212272644,
"num_tokens": 17821131.0,
"step": 2195
},
{
"entropy": 1.169661521911621,
"epoch": 1.2422360248447206,
"grad_norm": 80.15399169921875,
"learning_rate": 4.85914562382187e-06,
"loss": 0.6588,
"mean_token_accuracy": 0.8093379259109497,
"num_tokens": 17861753.0,
"step": 2200
},
{
"entropy": 1.1408036470413208,
"epoch": 1.2450592885375493,
"grad_norm": 71.21530151367188,
"learning_rate": 4.858512546405411e-06,
"loss": 0.6404,
"mean_token_accuracy": 0.8124599695205689,
"num_tokens": 17902320.0,
"step": 2205
},
{
"entropy": 1.0396205544471742,
"epoch": 1.2478825522303783,
"grad_norm": 94.92953491210938,
"learning_rate": 4.857878105255189e-06,
"loss": 0.6568,
"mean_token_accuracy": 0.8109822630882263,
"num_tokens": 17943114.0,
"step": 2210
},
{
"entropy": 1.1258668422698974,
"epoch": 1.2507058159232072,
"grad_norm": 84.43074798583984,
"learning_rate": 4.857242300870313e-06,
"loss": 0.6687,
"mean_token_accuracy": 0.8072779536247253,
"num_tokens": 17983284.0,
"step": 2215
},
{
"entropy": 1.2762274265289306,
"epoch": 1.253529079616036,
"grad_norm": 80.23455810546875,
"learning_rate": 4.8566051337509626e-06,
"loss": 0.7305,
"mean_token_accuracy": 0.7920625567436218,
"num_tokens": 18023783.0,
"step": 2220
},
{
"entropy": 1.1090112805366517,
"epoch": 1.256352343308865,
"grad_norm": 83.74242401123047,
"learning_rate": 4.8559666043983886e-06,
"loss": 0.6528,
"mean_token_accuracy": 0.8134839653968811,
"num_tokens": 18064489.0,
"step": 2225
},
{
"entropy": 1.1557289123535157,
"epoch": 1.259175607001694,
"grad_norm": 76.00527954101562,
"learning_rate": 4.855326713314916e-06,
"loss": 0.6507,
"mean_token_accuracy": 0.8127437829971313,
"num_tokens": 18105144.0,
"step": 2230
},
{
"entropy": 1.1039715051651,
"epoch": 1.2619988706945229,
"grad_norm": 84.07186126708984,
"learning_rate": 4.854685461003939e-06,
"loss": 0.6655,
"mean_token_accuracy": 0.8063163757324219,
"num_tokens": 18145903.0,
"step": 2235
},
{
"entropy": 1.129080843925476,
"epoch": 1.2648221343873518,
"grad_norm": 75.82025909423828,
"learning_rate": 4.854042847969921e-06,
"loss": 0.6461,
"mean_token_accuracy": 0.8113402366638184,
"num_tokens": 18186731.0,
"step": 2240
},
{
"entropy": 1.0956727743148804,
"epoch": 1.2676453980801807,
"grad_norm": 74.54117584228516,
"learning_rate": 4.8533988747184e-06,
"loss": 0.6425,
"mean_token_accuracy": 0.8132710099220276,
"num_tokens": 18227163.0,
"step": 2245
},
{
"entropy": 1.207816481590271,
"epoch": 1.2704686617730097,
"grad_norm": 69.57730865478516,
"learning_rate": 4.852753541755983e-06,
"loss": 0.6468,
"mean_token_accuracy": 0.8123814582824707,
"num_tokens": 18267863.0,
"step": 2250
},
{
"entropy": 1.2019951105117799,
"epoch": 1.2732919254658386,
"grad_norm": 77.39385986328125,
"learning_rate": 4.852106849590344e-06,
"loss": 0.6667,
"mean_token_accuracy": 0.8075017094612121,
"num_tokens": 18308454.0,
"step": 2255
},
{
"entropy": 1.2332281589508056,
"epoch": 1.2761151891586673,
"grad_norm": 82.1375503540039,
"learning_rate": 4.8514587987302295e-06,
"loss": 0.7035,
"mean_token_accuracy": 0.8001717686653137,
"num_tokens": 18349213.0,
"step": 2260
},
{
"entropy": 1.103496754169464,
"epoch": 1.2789384528514964,
"grad_norm": 77.35172271728516,
"learning_rate": 4.850809389685452e-06,
"loss": 0.6464,
"mean_token_accuracy": 0.8140517354011536,
"num_tokens": 18390006.0,
"step": 2265
},
{
"entropy": 1.0481646060943604,
"epoch": 1.2817617165443251,
"grad_norm": 74.0533447265625,
"learning_rate": 4.8501586229668955e-06,
"loss": 0.5638,
"mean_token_accuracy": 0.8321304798126221,
"num_tokens": 18430668.0,
"step": 2270
},
{
"entropy": 1.1348963975906372,
"epoch": 1.2845849802371543,
"grad_norm": 74.89289093017578,
"learning_rate": 4.849506499086509e-06,
"loss": 0.6235,
"mean_token_accuracy": 0.8209902882575989,
"num_tokens": 18471333.0,
"step": 2275
},
{
"entropy": 1.181236457824707,
"epoch": 1.287408243929983,
"grad_norm": 77.86975860595703,
"learning_rate": 4.848853018557311e-06,
"loss": 0.6914,
"mean_token_accuracy": 0.7989266395568848,
"num_tokens": 18511771.0,
"step": 2280
},
{
"entropy": 1.1591632604598998,
"epoch": 1.290231507622812,
"grad_norm": 91.24014282226562,
"learning_rate": 4.848198181893388e-06,
"loss": 0.64,
"mean_token_accuracy": 0.8149643659591674,
"num_tokens": 18552129.0,
"step": 2285
},
{
"entropy": 1.2816951274871826,
"epoch": 1.2930547713156408,
"grad_norm": 86.56483459472656,
"learning_rate": 4.847541989609891e-06,
"loss": 0.7635,
"mean_token_accuracy": 0.7828594326972962,
"num_tokens": 18592730.0,
"step": 2290
},
{
"entropy": 1.1080434799194336,
"epoch": 1.2958780350084698,
"grad_norm": 82.74638366699219,
"learning_rate": 4.846884442223038e-06,
"loss": 0.6774,
"mean_token_accuracy": 0.8058300733566284,
"num_tokens": 18633524.0,
"step": 2295
},
{
"entropy": 1.1663718938827514,
"epoch": 1.2987012987012987,
"grad_norm": 75.71199035644531,
"learning_rate": 4.8462255402501155e-06,
"loss": 0.7107,
"mean_token_accuracy": 0.7965960144996643,
"num_tokens": 18674190.0,
"step": 2300
},
{
"entropy": 1.2041790723800658,
"epoch": 1.3015245623941276,
"grad_norm": 80.01366424560547,
"learning_rate": 4.8455652842094735e-06,
"loss": 0.655,
"mean_token_accuracy": 0.8082357287406922,
"num_tokens": 18714763.0,
"step": 2305
},
{
"entropy": 1.097007966041565,
"epoch": 1.3043478260869565,
"grad_norm": 77.5677261352539,
"learning_rate": 4.8449036746205266e-06,
"loss": 0.657,
"mean_token_accuracy": 0.8100764632225037,
"num_tokens": 18755405.0,
"step": 2310
},
{
"entropy": 1.173832392692566,
"epoch": 1.3071710897797855,
"grad_norm": 79.28545379638672,
"learning_rate": 4.844240712003756e-06,
"loss": 0.6431,
"mean_token_accuracy": 0.8142745494842529,
"num_tokens": 18796177.0,
"step": 2315
},
{
"entropy": 1.1786026000976562,
"epoch": 1.3099943534726144,
"grad_norm": 85.30624389648438,
"learning_rate": 4.843576396880707e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.8108649253845215,
"num_tokens": 18836972.0,
"step": 2320
},
{
"entropy": 1.0986263751983643,
"epoch": 1.3128176171654433,
"grad_norm": 84.06642150878906,
"learning_rate": 4.8429107297739875e-06,
"loss": 0.6553,
"mean_token_accuracy": 0.8121555089950562,
"num_tokens": 18877396.0,
"step": 2325
},
{
"entropy": 1.1171608209609984,
"epoch": 1.3156408808582722,
"grad_norm": 74.25096130371094,
"learning_rate": 4.84224371120727e-06,
"loss": 0.6276,
"mean_token_accuracy": 0.8181084632873535,
"num_tokens": 18918291.0,
"step": 2330
},
{
"entropy": 1.1926301956176757,
"epoch": 1.318464144551101,
"grad_norm": 68.66618347167969,
"learning_rate": 4.84157534170529e-06,
"loss": 0.6589,
"mean_token_accuracy": 0.8088850378990173,
"num_tokens": 18958467.0,
"step": 2335
},
{
"entropy": 1.2194635629653932,
"epoch": 1.32128740824393,
"grad_norm": 80.56359100341797,
"learning_rate": 4.8409056217938465e-06,
"loss": 0.6687,
"mean_token_accuracy": 0.8101451516151428,
"num_tokens": 18999140.0,
"step": 2340
},
{
"entropy": 1.1420178532600402,
"epoch": 1.3241106719367588,
"grad_norm": 89.24362182617188,
"learning_rate": 4.8402345519998e-06,
"loss": 0.657,
"mean_token_accuracy": 0.8115506768226624,
"num_tokens": 19039704.0,
"step": 2345
},
{
"entropy": 1.297519588470459,
"epoch": 1.3269339356295877,
"grad_norm": 69.71318054199219,
"learning_rate": 4.839562132851073e-06,
"loss": 0.7476,
"mean_token_accuracy": 0.788333511352539,
"num_tokens": 19080331.0,
"step": 2350
},
{
"entropy": 1.158388113975525,
"epoch": 1.3297571993224166,
"grad_norm": 93.63565826416016,
"learning_rate": 4.8388883648766495e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.8003996729850769,
"num_tokens": 19120683.0,
"step": 2355
},
{
"entropy": 1.1665226459503173,
"epoch": 1.3325804630152456,
"grad_norm": 71.25414276123047,
"learning_rate": 4.838213248606575e-06,
"loss": 0.7153,
"mean_token_accuracy": 0.7993328690528869,
"num_tokens": 19161294.0,
"step": 2360
},
{
"entropy": 1.2286199092864991,
"epoch": 1.3354037267080745,
"grad_norm": 85.82389068603516,
"learning_rate": 4.837536784571955e-06,
"loss": 0.7382,
"mean_token_accuracy": 0.7923282504081726,
"num_tokens": 19201996.0,
"step": 2365
},
{
"entropy": 1.2367011785507203,
"epoch": 1.3382269904009034,
"grad_norm": 78.86309051513672,
"learning_rate": 4.836858973304957e-06,
"loss": 0.7283,
"mean_token_accuracy": 0.7932451248168946,
"num_tokens": 19242804.0,
"step": 2370
},
{
"entropy": 1.1243971109390258,
"epoch": 1.3410502540937324,
"grad_norm": 80.71333312988281,
"learning_rate": 4.836179815338805e-06,
"loss": 0.6538,
"mean_token_accuracy": 0.8101875901222229,
"num_tokens": 19283424.0,
"step": 2375
},
{
"entropy": 1.1276620268821715,
"epoch": 1.3438735177865613,
"grad_norm": 67.6820068359375,
"learning_rate": 4.835499311207788e-06,
"loss": 0.6764,
"mean_token_accuracy": 0.8059876799583435,
"num_tokens": 19324228.0,
"step": 2380
},
{
"entropy": 1.085622775554657,
"epoch": 1.3466967814793902,
"grad_norm": 66.39501953125,
"learning_rate": 4.8348174614472465e-06,
"loss": 0.6318,
"mean_token_accuracy": 0.8171636939048768,
"num_tokens": 19364593.0,
"step": 2385
},
{
"entropy": 1.1541183471679688,
"epoch": 1.3495200451722191,
"grad_norm": 92.84614562988281,
"learning_rate": 4.834134266593586e-06,
"loss": 0.6552,
"mean_token_accuracy": 0.8120688199996948,
"num_tokens": 19405413.0,
"step": 2390
},
{
"entropy": 1.199662709236145,
"epoch": 1.352343308865048,
"grad_norm": 80.65885162353516,
"learning_rate": 4.833449727184267e-06,
"loss": 0.7151,
"mean_token_accuracy": 0.7959373831748963,
"num_tokens": 19446043.0,
"step": 2395
},
{
"entropy": 1.153480863571167,
"epoch": 1.355166572557877,
"grad_norm": 79.00860595703125,
"learning_rate": 4.832763843757809e-06,
"loss": 0.6588,
"mean_token_accuracy": 0.8133233785629272,
"num_tokens": 19486895.0,
"step": 2400
},
{
"entropy": 1.0923040628433227,
"epoch": 1.357989836250706,
"grad_norm": 73.47067260742188,
"learning_rate": 4.832076616853788e-06,
"loss": 0.6212,
"mean_token_accuracy": 0.8210492372512818,
"num_tokens": 19527502.0,
"step": 2405
},
{
"entropy": 1.1808124303817749,
"epoch": 1.3608130999435346,
"grad_norm": 86.0345687866211,
"learning_rate": 4.831388047012836e-06,
"loss": 0.7206,
"mean_token_accuracy": 0.7948956370353699,
"num_tokens": 19568274.0,
"step": 2410
},
{
"entropy": 1.201215958595276,
"epoch": 1.3636363636363638,
"grad_norm": 82.31108856201172,
"learning_rate": 4.830698134776647e-06,
"loss": 0.6994,
"mean_token_accuracy": 0.8000857710838318,
"num_tokens": 19608913.0,
"step": 2415
},
{
"entropy": 1.1461955547332763,
"epoch": 1.3664596273291925,
"grad_norm": 80.74882507324219,
"learning_rate": 4.830006880687961e-06,
"loss": 0.662,
"mean_token_accuracy": 0.8095049500465393,
"num_tokens": 19649662.0,
"step": 2420
},
{
"entropy": 1.2065489768981934,
"epoch": 1.3692828910220214,
"grad_norm": 85.66714477539062,
"learning_rate": 4.829314285290584e-06,
"loss": 0.7143,
"mean_token_accuracy": 0.7959868550300598,
"num_tokens": 19689993.0,
"step": 2425
},
{
"entropy": 1.148555874824524,
"epoch": 1.3721061547148503,
"grad_norm": 77.31533813476562,
"learning_rate": 4.8286203491293706e-06,
"loss": 0.6792,
"mean_token_accuracy": 0.805145263671875,
"num_tokens": 19730308.0,
"step": 2430
},
{
"entropy": 1.1263495683670044,
"epoch": 1.3749294184076792,
"grad_norm": 72.85306549072266,
"learning_rate": 4.827925072750232e-06,
"loss": 0.6206,
"mean_token_accuracy": 0.819097375869751,
"num_tokens": 19770873.0,
"step": 2435
},
{
"entropy": 1.1379968404769898,
"epoch": 1.3777526821005082,
"grad_norm": 74.16294860839844,
"learning_rate": 4.827228456700135e-06,
"loss": 0.672,
"mean_token_accuracy": 0.8076503992080688,
"num_tokens": 19811620.0,
"step": 2440
},
{
"entropy": 1.2445213794708252,
"epoch": 1.380575945793337,
"grad_norm": 96.99238586425781,
"learning_rate": 4.826530501527097e-06,
"loss": 0.7006,
"mean_token_accuracy": 0.8044170498847961,
"num_tokens": 19852266.0,
"step": 2445
},
{
"entropy": 1.3386611223220826,
"epoch": 1.383399209486166,
"grad_norm": 74.49800872802734,
"learning_rate": 4.825831207780193e-06,
"loss": 0.6981,
"mean_token_accuracy": 0.8017369747161865,
"num_tokens": 19892609.0,
"step": 2450
},
{
"entropy": 1.2711436033248902,
"epoch": 1.386222473178995,
"grad_norm": 90.28303527832031,
"learning_rate": 4.82513057600955e-06,
"loss": 0.6718,
"mean_token_accuracy": 0.8058541059494019,
"num_tokens": 19932916.0,
"step": 2455
},
{
"entropy": 1.1395226001739502,
"epoch": 1.3890457368718239,
"grad_norm": 81.18891143798828,
"learning_rate": 4.8244286067663435e-06,
"loss": 0.6496,
"mean_token_accuracy": 0.8136471390724183,
"num_tokens": 19973568.0,
"step": 2460
},
{
"entropy": 1.2806391954421996,
"epoch": 1.3918690005646528,
"grad_norm": 74.18885040283203,
"learning_rate": 4.823725300602807e-06,
"loss": 0.7242,
"mean_token_accuracy": 0.7983367204666137,
"num_tokens": 20014256.0,
"step": 2465
},
{
"entropy": 1.2575560808181763,
"epoch": 1.3946922642574817,
"grad_norm": 83.14079284667969,
"learning_rate": 4.823020658072222e-06,
"loss": 0.7266,
"mean_token_accuracy": 0.7935426712036133,
"num_tokens": 20054679.0,
"step": 2470
},
{
"entropy": 1.1245292901992798,
"epoch": 1.3975155279503104,
"grad_norm": 86.17759704589844,
"learning_rate": 4.8223146797289235e-06,
"loss": 0.6472,
"mean_token_accuracy": 0.8125198125839234,
"num_tokens": 20095540.0,
"step": 2475
},
{
"entropy": 1.2133588790893555,
"epoch": 1.4003387916431396,
"grad_norm": 67.61619567871094,
"learning_rate": 4.8216073661282945e-06,
"loss": 0.6501,
"mean_token_accuracy": 0.8147984504699707,
"num_tokens": 20136277.0,
"step": 2480
},
{
"entropy": 1.1730206727981567,
"epoch": 1.4031620553359683,
"grad_norm": 70.51668548583984,
"learning_rate": 4.820898717826772e-06,
"loss": 0.6244,
"mean_token_accuracy": 0.8181105494499207,
"num_tokens": 20177010.0,
"step": 2485
},
{
"entropy": 1.1870003461837768,
"epoch": 1.4059853190287974,
"grad_norm": 65.29812622070312,
"learning_rate": 4.82018873538184e-06,
"loss": 0.6537,
"mean_token_accuracy": 0.8110229969024658,
"num_tokens": 20217605.0,
"step": 2490
},
{
"entropy": 1.1333913803100586,
"epoch": 1.4088085827216261,
"grad_norm": 71.14463806152344,
"learning_rate": 4.819477419352034e-06,
"loss": 0.6183,
"mean_token_accuracy": 0.8201487541198731,
"num_tokens": 20258267.0,
"step": 2495
},
{
"entropy": 1.1184187650680542,
"epoch": 1.411631846414455,
"grad_norm": 68.24232482910156,
"learning_rate": 4.818764770296938e-06,
"loss": 0.6249,
"mean_token_accuracy": 0.8183603525161743,
"num_tokens": 20299023.0,
"step": 2500
},
{
"epoch": 1.411631846414455,
"eval_entropy": 1.3717174053192138,
"eval_loss": 0.6318865418434143,
"eval_mean_token_accuracy": 0.8233636379241943,
"eval_num_tokens": 20299023.0,
"eval_runtime": 2.457,
"eval_samples_per_second": 15.873,
"eval_steps_per_second": 2.035,
"step": 2500
},
{
"entropy": 1.2598673343658446,
"epoch": 1.414455110107284,
"grad_norm": 71.59434509277344,
"learning_rate": 4.8180507887771835e-06,
"loss": 0.6998,
"mean_token_accuracy": 0.8029447555541992,
"num_tokens": 20339637.0,
"step": 2505
},
{
"entropy": 1.2127498626708983,
"epoch": 1.417278373800113,
"grad_norm": 77.03498077392578,
"learning_rate": 4.8173354753544524e-06,
"loss": 0.6909,
"mean_token_accuracy": 0.804262387752533,
"num_tokens": 20380116.0,
"step": 2510
},
{
"entropy": 1.237819457054138,
"epoch": 1.4201016374929418,
"grad_norm": 76.66390991210938,
"learning_rate": 4.816618830591473e-06,
"loss": 0.689,
"mean_token_accuracy": 0.8024984955787658,
"num_tokens": 20420739.0,
"step": 2515
},
{
"entropy": 1.206669521331787,
"epoch": 1.4229249011857708,
"grad_norm": 82.84412384033203,
"learning_rate": 4.815900855052021e-06,
"loss": 0.684,
"mean_token_accuracy": 0.8014244318008423,
"num_tokens": 20461380.0,
"step": 2520
},
{
"entropy": 1.1622983455657958,
"epoch": 1.4257481648785997,
"grad_norm": 69.7125244140625,
"learning_rate": 4.8151815493009186e-06,
"loss": 0.7123,
"mean_token_accuracy": 0.7954689502716065,
"num_tokens": 20502242.0,
"step": 2525
},
{
"entropy": 1.2152176380157471,
"epoch": 1.4285714285714286,
"grad_norm": 71.72361755371094,
"learning_rate": 4.814460913904036e-06,
"loss": 0.689,
"mean_token_accuracy": 0.8025439500808715,
"num_tokens": 20543050.0,
"step": 2530
},
{
"entropy": 1.1802116870880126,
"epoch": 1.4313946922642575,
"grad_norm": 83.00234985351562,
"learning_rate": 4.813738949428289e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.8088852882385253,
"num_tokens": 20583787.0,
"step": 2535
},
{
"entropy": 1.1717280864715576,
"epoch": 1.4342179559570865,
"grad_norm": 74.20233154296875,
"learning_rate": 4.8130156564416374e-06,
"loss": 0.6143,
"mean_token_accuracy": 0.8246188163757324,
"num_tokens": 20624442.0,
"step": 2540
},
{
"entropy": 1.1795841693878173,
"epoch": 1.4370412196499154,
"grad_norm": 89.12123107910156,
"learning_rate": 4.812291035513088e-06,
"loss": 0.6578,
"mean_token_accuracy": 0.8118916988372803,
"num_tokens": 20664910.0,
"step": 2545
},
{
"entropy": 1.3075191736221314,
"epoch": 1.439864483342744,
"grad_norm": 76.54231262207031,
"learning_rate": 4.811565087212691e-06,
"loss": 0.7455,
"mean_token_accuracy": 0.7889067530632019,
"num_tokens": 20705591.0,
"step": 2550
},
{
"entropy": 1.1278613567352296,
"epoch": 1.4426877470355732,
"grad_norm": 66.26628112792969,
"learning_rate": 4.810837812111541e-06,
"loss": 0.5953,
"mean_token_accuracy": 0.8274539947509766,
"num_tokens": 20746404.0,
"step": 2555
},
{
"entropy": 1.1617911815643311,
"epoch": 1.445511010728402,
"grad_norm": 82.94552612304688,
"learning_rate": 4.810109210781778e-06,
"loss": 0.6439,
"mean_token_accuracy": 0.8134812235832214,
"num_tokens": 20787015.0,
"step": 2560
},
{
"entropy": 1.323712158203125,
"epoch": 1.4483342744212309,
"grad_norm": 67.05265808105469,
"learning_rate": 4.809379283796582e-06,
"loss": 0.6898,
"mean_token_accuracy": 0.8058351755142212,
"num_tokens": 20827604.0,
"step": 2565
},
{
"entropy": 1.1775886774063111,
"epoch": 1.4511575381140598,
"grad_norm": 98.45421600341797,
"learning_rate": 4.80864803173018e-06,
"loss": 0.6564,
"mean_token_accuracy": 0.8137942314147949,
"num_tokens": 20868096.0,
"step": 2570
},
{
"entropy": 1.272672414779663,
"epoch": 1.4539808018068887,
"grad_norm": 84.572509765625,
"learning_rate": 4.807915455157839e-06,
"loss": 0.643,
"mean_token_accuracy": 0.8139849543571472,
"num_tokens": 20907891.0,
"step": 2575
},
{
"entropy": 1.1502018213272094,
"epoch": 1.4568040654997176,
"grad_norm": 85.15162658691406,
"learning_rate": 4.807181554655866e-06,
"loss": 0.6461,
"mean_token_accuracy": 0.8113275051116944,
"num_tokens": 20948609.0,
"step": 2580
},
{
"entropy": 1.240171766281128,
"epoch": 1.4596273291925466,
"grad_norm": 73.13277435302734,
"learning_rate": 4.8064463308016154e-06,
"loss": 0.7099,
"mean_token_accuracy": 0.7971065878868103,
"num_tokens": 20989030.0,
"step": 2585
},
{
"entropy": 1.210387110710144,
"epoch": 1.4624505928853755,
"grad_norm": 80.91059875488281,
"learning_rate": 4.805709784173477e-06,
"loss": 0.6892,
"mean_token_accuracy": 0.8043756723403931,
"num_tokens": 21029630.0,
"step": 2590
},
{
"entropy": 1.1126335978507995,
"epoch": 1.4652738565782044,
"grad_norm": 76.07735443115234,
"learning_rate": 4.804971915350882e-06,
"loss": 0.593,
"mean_token_accuracy": 0.8269088268280029,
"num_tokens": 21069981.0,
"step": 2595
},
{
"entropy": 1.153779721260071,
"epoch": 1.4680971202710333,
"grad_norm": 72.62789154052734,
"learning_rate": 4.804232724914306e-06,
"loss": 0.6317,
"mean_token_accuracy": 0.8180077075958252,
"num_tokens": 21110649.0,
"step": 2600
},
{
"entropy": 1.2609638452529908,
"epoch": 1.4709203839638623,
"grad_norm": 82.38737487792969,
"learning_rate": 4.803492213445259e-06,
"loss": 0.6631,
"mean_token_accuracy": 0.8072648644447327,
"num_tokens": 21151346.0,
"step": 2605
},
{
"entropy": 1.251114010810852,
"epoch": 1.4737436476566912,
"grad_norm": 83.15131378173828,
"learning_rate": 4.802750381526294e-06,
"loss": 0.6537,
"mean_token_accuracy": 0.810612428188324,
"num_tokens": 21191808.0,
"step": 2610
},
{
"entropy": 1.2838777303695679,
"epoch": 1.4765669113495201,
"grad_norm": 72.97293090820312,
"learning_rate": 4.802007229741001e-06,
"loss": 0.6986,
"mean_token_accuracy": 0.8005695700645447,
"num_tokens": 21232422.0,
"step": 2615
},
{
"entropy": 1.2007624864578248,
"epoch": 1.479390175042349,
"grad_norm": 76.61715698242188,
"learning_rate": 4.801262758674009e-06,
"loss": 0.6429,
"mean_token_accuracy": 0.8100405931472778,
"num_tokens": 21273249.0,
"step": 2620
},
{
"entropy": 1.20713312625885,
"epoch": 1.4822134387351777,
"grad_norm": 66.92652130126953,
"learning_rate": 4.800516968910984e-06,
"loss": 0.6362,
"mean_token_accuracy": 0.8153573513031006,
"num_tokens": 21313887.0,
"step": 2625
},
{
"entropy": 1.246811294555664,
"epoch": 1.485036702428007,
"grad_norm": 75.15606689453125,
"learning_rate": 4.79976986103863e-06,
"loss": 0.7113,
"mean_token_accuracy": 0.7945464491844177,
"num_tokens": 21354393.0,
"step": 2630
},
{
"entropy": 1.2210959911346435,
"epoch": 1.4878599661208356,
"grad_norm": 72.59530639648438,
"learning_rate": 4.799021435644687e-06,
"loss": 0.6313,
"mean_token_accuracy": 0.813930869102478,
"num_tokens": 21395084.0,
"step": 2635
},
{
"entropy": 1.3068056583404541,
"epoch": 1.4906832298136645,
"grad_norm": 80.09661102294922,
"learning_rate": 4.798271693317935e-06,
"loss": 0.6833,
"mean_token_accuracy": 0.8025308609008789,
"num_tokens": 21435779.0,
"step": 2640
},
{
"entropy": 1.1782183170318603,
"epoch": 1.4935064935064934,
"grad_norm": 67.90818786621094,
"learning_rate": 4.797520634648185e-06,
"loss": 0.656,
"mean_token_accuracy": 0.811971914768219,
"num_tokens": 21476448.0,
"step": 2645
},
{
"entropy": 1.2379778146743774,
"epoch": 1.4963297571993224,
"grad_norm": 82.61946868896484,
"learning_rate": 4.7967682602262866e-06,
"loss": 0.7022,
"mean_token_accuracy": 0.8024298191070557,
"num_tokens": 21517170.0,
"step": 2650
},
{
"entropy": 1.2261282444000243,
"epoch": 1.4991530208921513,
"grad_norm": 82.52174377441406,
"learning_rate": 4.796014570644123e-06,
"loss": 0.6638,
"mean_token_accuracy": 0.8089081048965454,
"num_tokens": 21558007.0,
"step": 2655
},
{
"entropy": 1.1248329162597657,
"epoch": 1.5019762845849802,
"grad_norm": 70.8136978149414,
"learning_rate": 4.795259566494615e-06,
"loss": 0.6235,
"mean_token_accuracy": 0.8185378551483155,
"num_tokens": 21598768.0,
"step": 2660
},
{
"entropy": 1.1765445470809937,
"epoch": 1.5047995482778092,
"grad_norm": 85.58080291748047,
"learning_rate": 4.794503248371715e-06,
"loss": 0.6518,
"mean_token_accuracy": 0.8130599617958069,
"num_tokens": 21639426.0,
"step": 2665
},
{
"entropy": 1.2815346717834473,
"epoch": 1.507622811970638,
"grad_norm": 92.36046600341797,
"learning_rate": 4.7937456168704075e-06,
"loss": 0.7405,
"mean_token_accuracy": 0.7926273345947266,
"num_tokens": 21680145.0,
"step": 2670
},
{
"entropy": 1.3271814823150634,
"epoch": 1.510446075663467,
"grad_norm": 63.72608184814453,
"learning_rate": 4.792986672586715e-06,
"loss": 0.6925,
"mean_token_accuracy": 0.8034780144691467,
"num_tokens": 21720157.0,
"step": 2675
},
{
"entropy": 1.1895153999328614,
"epoch": 1.513269339356296,
"grad_norm": 66.45240783691406,
"learning_rate": 4.7922264161176865e-06,
"loss": 0.6386,
"mean_token_accuracy": 0.8131694197654724,
"num_tokens": 21760810.0,
"step": 2680
},
{
"entropy": 1.2537084341049194,
"epoch": 1.5160926030491249,
"grad_norm": 99.64230346679688,
"learning_rate": 4.79146484806141e-06,
"loss": 0.6932,
"mean_token_accuracy": 0.8014739036560059,
"num_tokens": 21801436.0,
"step": 2685
},
{
"entropy": 1.243509340286255,
"epoch": 1.5189158667419536,
"grad_norm": 76.32308197021484,
"learning_rate": 4.7907019690169995e-06,
"loss": 0.6595,
"mean_token_accuracy": 0.8102675080299377,
"num_tokens": 21842068.0,
"step": 2690
},
{
"entropy": 1.1510627508163451,
"epoch": 1.5217391304347827,
"grad_norm": 65.6766128540039,
"learning_rate": 4.789937779584606e-06,
"loss": 0.5932,
"mean_token_accuracy": 0.8254171371459961,
"num_tokens": 21882781.0,
"step": 2695
},
{
"entropy": 1.3232401371002198,
"epoch": 1.5245623941276114,
"grad_norm": 82.84342956542969,
"learning_rate": 4.789172280365405e-06,
"loss": 0.7381,
"mean_token_accuracy": 0.7899990320205689,
"num_tokens": 21923429.0,
"step": 2700
},
{
"entropy": 1.1786120891571046,
"epoch": 1.5273856578204406,
"grad_norm": 75.24930572509766,
"learning_rate": 4.788405471961607e-06,
"loss": 0.6657,
"mean_token_accuracy": 0.8080781698226929,
"num_tokens": 21964070.0,
"step": 2705
},
{
"entropy": 1.3117302417755128,
"epoch": 1.5302089215132693,
"grad_norm": 82.70320129394531,
"learning_rate": 4.787637354976451e-06,
"loss": 0.7032,
"mean_token_accuracy": 0.7976889133453369,
"num_tokens": 22004553.0,
"step": 2710
},
{
"entropy": 1.159567379951477,
"epoch": 1.5330321852060984,
"grad_norm": 77.04248809814453,
"learning_rate": 4.7868679300142075e-06,
"loss": 0.6286,
"mean_token_accuracy": 0.8175629734992981,
"num_tokens": 22045280.0,
"step": 2715
},
{
"entropy": 1.240707230567932,
"epoch": 1.5358554488989271,
"grad_norm": 63.95314025878906,
"learning_rate": 4.7860971976801705e-06,
"loss": 0.6701,
"mean_token_accuracy": 0.8070308327674866,
"num_tokens": 22085964.0,
"step": 2720
},
{
"entropy": 1.1840417623519897,
"epoch": 1.538678712591756,
"grad_norm": 92.01813507080078,
"learning_rate": 4.785325158580667e-06,
"loss": 0.6363,
"mean_token_accuracy": 0.8143904328346252,
"num_tokens": 22126478.0,
"step": 2725
},
{
"entropy": 1.1806485176086425,
"epoch": 1.541501976284585,
"grad_norm": 78.07498931884766,
"learning_rate": 4.784551813323053e-06,
"loss": 0.623,
"mean_token_accuracy": 0.8173678278923034,
"num_tokens": 22167400.0,
"step": 2730
},
{
"entropy": 1.2515090465545655,
"epoch": 1.5443252399774139,
"grad_norm": 81.79811096191406,
"learning_rate": 4.783777162515708e-06,
"loss": 0.6986,
"mean_token_accuracy": 0.7993507623672486,
"num_tokens": 22208168.0,
"step": 2735
},
{
"entropy": 1.2478914260864258,
"epoch": 1.5471485036702428,
"grad_norm": 78.49143981933594,
"learning_rate": 4.783001206768042e-06,
"loss": 0.6894,
"mean_token_accuracy": 0.8029946327209473,
"num_tokens": 22248225.0,
"step": 2740
},
{
"entropy": 1.2676709175109864,
"epoch": 1.5499717673630717,
"grad_norm": 79.54261016845703,
"learning_rate": 4.7822239466904885e-06,
"loss": 0.6677,
"mean_token_accuracy": 0.8095703125,
"num_tokens": 22288552.0,
"step": 2745
},
{
"entropy": 1.2544928550720216,
"epoch": 1.5527950310559007,
"grad_norm": 80.72503662109375,
"learning_rate": 4.781445382894511e-06,
"loss": 0.6714,
"mean_token_accuracy": 0.8068174123764038,
"num_tokens": 22329056.0,
"step": 2750
},
{
"entropy": 1.1922768592834472,
"epoch": 1.5556182947487294,
"grad_norm": 70.30680084228516,
"learning_rate": 4.780665515992594e-06,
"loss": 0.6414,
"mean_token_accuracy": 0.8172025442123413,
"num_tokens": 22369641.0,
"step": 2755
},
{
"entropy": 1.1814116358757019,
"epoch": 1.5584415584415585,
"grad_norm": 58.10313415527344,
"learning_rate": 4.779884346598251e-06,
"loss": 0.619,
"mean_token_accuracy": 0.8186899900436402,
"num_tokens": 22410436.0,
"step": 2760
},
{
"entropy": 1.187838339805603,
"epoch": 1.5612648221343872,
"grad_norm": 80.88594818115234,
"learning_rate": 4.7791018753260186e-06,
"loss": 0.68,
"mean_token_accuracy": 0.8037778973579407,
"num_tokens": 22451071.0,
"step": 2765
},
{
"entropy": 1.1621285438537599,
"epoch": 1.5640880858272164,
"grad_norm": 74.58294677734375,
"learning_rate": 4.778318102791458e-06,
"loss": 0.663,
"mean_token_accuracy": 0.8116282105445862,
"num_tokens": 22491576.0,
"step": 2770
},
{
"entropy": 1.262639617919922,
"epoch": 1.566911349520045,
"grad_norm": 85.41896057128906,
"learning_rate": 4.777533029611152e-06,
"loss": 0.692,
"mean_token_accuracy": 0.8043246865272522,
"num_tokens": 22532330.0,
"step": 2775
},
{
"entropy": 1.1400727033615112,
"epoch": 1.5697346132128742,
"grad_norm": 73.55253601074219,
"learning_rate": 4.77674665640271e-06,
"loss": 0.6136,
"mean_token_accuracy": 0.8233813762664794,
"num_tokens": 22573071.0,
"step": 2780
},
{
"entropy": 1.1831058979034423,
"epoch": 1.572557876905703,
"grad_norm": 71.34568786621094,
"learning_rate": 4.775958983784762e-06,
"loss": 0.6038,
"mean_token_accuracy": 0.8231364607810974,
"num_tokens": 22613586.0,
"step": 2785
},
{
"entropy": 1.2381763458251953,
"epoch": 1.5753811405985318,
"grad_norm": 89.24989318847656,
"learning_rate": 4.7751700123769615e-06,
"loss": 0.7051,
"mean_token_accuracy": 0.7986539959907532,
"num_tokens": 22654078.0,
"step": 2790
},
{
"entropy": 1.2196001529693603,
"epoch": 1.5782044042913608,
"grad_norm": 75.8842544555664,
"learning_rate": 4.774379742799982e-06,
"loss": 0.6416,
"mean_token_accuracy": 0.8148429989814758,
"num_tokens": 22694488.0,
"step": 2795
},
{
"entropy": 1.1762643814086915,
"epoch": 1.5810276679841897,
"grad_norm": 70.9981460571289,
"learning_rate": 4.773588175675519e-06,
"loss": 0.6995,
"mean_token_accuracy": 0.801743483543396,
"num_tokens": 22735246.0,
"step": 2800
},
{
"entropy": 1.3188316106796265,
"epoch": 1.5838509316770186,
"grad_norm": 77.98265075683594,
"learning_rate": 4.77279531162629e-06,
"loss": 0.7117,
"mean_token_accuracy": 0.7977353572845459,
"num_tokens": 22776009.0,
"step": 2805
},
{
"entropy": 1.1862847805023193,
"epoch": 1.5866741953698476,
"grad_norm": 73.0948486328125,
"learning_rate": 4.772001151276031e-06,
"loss": 0.6288,
"mean_token_accuracy": 0.8189311504364014,
"num_tokens": 22816752.0,
"step": 2810
},
{
"entropy": 1.2885456085205078,
"epoch": 1.5894974590626765,
"grad_norm": 96.79470825195312,
"learning_rate": 4.771205695249498e-06,
"loss": 0.6912,
"mean_token_accuracy": 0.804690134525299,
"num_tokens": 22857463.0,
"step": 2815
},
{
"entropy": 1.2456789016723633,
"epoch": 1.5923207227555054,
"grad_norm": 98.59661865234375,
"learning_rate": 4.770408944172468e-06,
"loss": 0.7048,
"mean_token_accuracy": 0.7998390793800354,
"num_tokens": 22898192.0,
"step": 2820
},
{
"entropy": 1.263387417793274,
"epoch": 1.5951439864483343,
"grad_norm": 86.11984252929688,
"learning_rate": 4.769610898671735e-06,
"loss": 0.6847,
"mean_token_accuracy": 0.8015385270118713,
"num_tokens": 22938960.0,
"step": 2825
},
{
"entropy": 1.1750084161758423,
"epoch": 1.597967250141163,
"grad_norm": 68.83621215820312,
"learning_rate": 4.768811559375112e-06,
"loss": 0.644,
"mean_token_accuracy": 0.8148982763290405,
"num_tokens": 22979853.0,
"step": 2830
},
{
"entropy": 1.2434749603271484,
"epoch": 1.6007905138339922,
"grad_norm": 63.998146057128906,
"learning_rate": 4.76801092691143e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.8153539896011353,
"num_tokens": 23020582.0,
"step": 2835
},
{
"entropy": 1.180994987487793,
"epoch": 1.6036137775268209,
"grad_norm": 75.83289337158203,
"learning_rate": 4.7672090019105365e-06,
"loss": 0.6582,
"mean_token_accuracy": 0.8138703823089599,
"num_tokens": 23061192.0,
"step": 2840
},
{
"entropy": 1.41214861869812,
"epoch": 1.60643704121965,
"grad_norm": 80.3683853149414,
"learning_rate": 4.7664057850032974e-06,
"loss": 0.7149,
"mean_token_accuracy": 0.7960960865020752,
"num_tokens": 23101697.0,
"step": 2845
},
{
"entropy": 1.2189382076263429,
"epoch": 1.6092603049124787,
"grad_norm": 85.0681381225586,
"learning_rate": 4.765601276821593e-06,
"loss": 0.6258,
"mean_token_accuracy": 0.8162513375282288,
"num_tokens": 23142380.0,
"step": 2850
},
{
"entropy": 1.2289648056030273,
"epoch": 1.6120835686053079,
"grad_norm": 72.83939361572266,
"learning_rate": 4.76479547799832e-06,
"loss": 0.6472,
"mean_token_accuracy": 0.8116074323654174,
"num_tokens": 23183087.0,
"step": 2855
},
{
"entropy": 1.1334474802017211,
"epoch": 1.6149068322981366,
"grad_norm": 76.32488250732422,
"learning_rate": 4.763988389167392e-06,
"loss": 0.6349,
"mean_token_accuracy": 0.8176516652107239,
"num_tokens": 23223891.0,
"step": 2860
},
{
"entropy": 1.2608877182006837,
"epoch": 1.6177300959909655,
"grad_norm": 80.27566528320312,
"learning_rate": 4.763180010963735e-06,
"loss": 0.6582,
"mean_token_accuracy": 0.8102393507957458,
"num_tokens": 23264505.0,
"step": 2865
},
{
"entropy": 1.3712619066238403,
"epoch": 1.6205533596837944,
"grad_norm": 75.8729476928711,
"learning_rate": 4.762370344023291e-06,
"loss": 0.7595,
"mean_token_accuracy": 0.7833655118942261,
"num_tokens": 23304834.0,
"step": 2870
},
{
"entropy": 1.1733553886413575,
"epoch": 1.6233766233766234,
"grad_norm": 80.17864990234375,
"learning_rate": 4.761559388983017e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.8149887919425964,
"num_tokens": 23344930.0,
"step": 2875
},
{
"entropy": 1.2769611835479737,
"epoch": 1.6261998870694523,
"grad_norm": 69.23902130126953,
"learning_rate": 4.760747146480879e-06,
"loss": 0.6904,
"mean_token_accuracy": 0.8037740349769592,
"num_tokens": 23385656.0,
"step": 2880
},
{
"entropy": 1.2944100141525268,
"epoch": 1.6290231507622812,
"grad_norm": 74.3133544921875,
"learning_rate": 4.75993361715586e-06,
"loss": 0.7092,
"mean_token_accuracy": 0.7989227890968322,
"num_tokens": 23425661.0,
"step": 2885
},
{
"entropy": 1.1579317331314087,
"epoch": 1.6318464144551101,
"grad_norm": 60.908485412597656,
"learning_rate": 4.759118801647955e-06,
"loss": 0.5889,
"mean_token_accuracy": 0.8264386534690857,
"num_tokens": 23466543.0,
"step": 2890
},
{
"entropy": 1.2280898094177246,
"epoch": 1.634669678147939,
"grad_norm": 78.14270782470703,
"learning_rate": 4.758302700598166e-06,
"loss": 0.6932,
"mean_token_accuracy": 0.802795660495758,
"num_tokens": 23506666.0,
"step": 2895
},
{
"entropy": 1.2708383798599243,
"epoch": 1.637492941840768,
"grad_norm": 63.11326217651367,
"learning_rate": 4.757485314648514e-06,
"loss": 0.6657,
"mean_token_accuracy": 0.8090845704078674,
"num_tokens": 23547306.0,
"step": 2900
},
{
"entropy": 1.1924805402755738,
"epoch": 1.6403162055335967,
"grad_norm": 86.47279357910156,
"learning_rate": 4.756666644442024e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.8108976602554321,
"num_tokens": 23588034.0,
"step": 2905
},
{
"entropy": 1.230809736251831,
"epoch": 1.6431394692264258,
"grad_norm": 73.81243896484375,
"learning_rate": 4.755846690622736e-06,
"loss": 0.6891,
"mean_token_accuracy": 0.8049345135688781,
"num_tokens": 23628731.0,
"step": 2910
},
{
"entropy": 1.129303252696991,
"epoch": 1.6459627329192545,
"grad_norm": 79.32695770263672,
"learning_rate": 4.755025453835698e-06,
"loss": 0.6375,
"mean_token_accuracy": 0.8155039429664612,
"num_tokens": 23669377.0,
"step": 2915
},
{
"entropy": 1.1930960178375245,
"epoch": 1.6487859966120837,
"grad_norm": 79.17987823486328,
"learning_rate": 4.754202934726965e-06,
"loss": 0.6437,
"mean_token_accuracy": 0.8137888431549072,
"num_tokens": 23710215.0,
"step": 2920
},
{
"entropy": 1.1872772932052613,
"epoch": 1.6516092603049124,
"grad_norm": 73.08032989501953,
"learning_rate": 4.753379133943606e-06,
"loss": 0.6098,
"mean_token_accuracy": 0.8207122087478638,
"num_tokens": 23750886.0,
"step": 2925
},
{
"entropy": 1.2139918804168701,
"epoch": 1.6544325239977415,
"grad_norm": 85.57366180419922,
"learning_rate": 4.752554052133693e-06,
"loss": 0.6708,
"mean_token_accuracy": 0.8065659999847412,
"num_tokens": 23791517.0,
"step": 2930
},
{
"entropy": 1.2267068386077882,
"epoch": 1.6572557876905702,
"grad_norm": 82.90193176269531,
"learning_rate": 4.751727689946309e-06,
"loss": 0.6589,
"mean_token_accuracy": 0.8087624788284302,
"num_tokens": 23832183.0,
"step": 2935
},
{
"entropy": 1.1001429557800293,
"epoch": 1.6600790513833992,
"grad_norm": 74.51642608642578,
"learning_rate": 4.750900048031543e-06,
"loss": 0.6045,
"mean_token_accuracy": 0.8216747164726257,
"num_tokens": 23873086.0,
"step": 2940
},
{
"entropy": 1.2163544416427612,
"epoch": 1.662902315076228,
"grad_norm": 70.33753967285156,
"learning_rate": 4.750071127040493e-06,
"loss": 0.644,
"mean_token_accuracy": 0.8124571919441224,
"num_tokens": 23913676.0,
"step": 2945
},
{
"entropy": 1.2160048484802246,
"epoch": 1.665725578769057,
"grad_norm": 63.329654693603516,
"learning_rate": 4.749240927625258e-06,
"loss": 0.6378,
"mean_token_accuracy": 0.8146205306053161,
"num_tokens": 23954446.0,
"step": 2950
},
{
"entropy": 1.3222097873687744,
"epoch": 1.668548842461886,
"grad_norm": 78.38276672363281,
"learning_rate": 4.748409450438948e-06,
"loss": 0.6993,
"mean_token_accuracy": 0.8010660886764527,
"num_tokens": 23994980.0,
"step": 2955
},
{
"entropy": 1.3176050662994385,
"epoch": 1.6713721061547149,
"grad_norm": 73.32920837402344,
"learning_rate": 4.747576696135676e-06,
"loss": 0.7089,
"mean_token_accuracy": 0.7956379055976868,
"num_tokens": 24035743.0,
"step": 2960
},
{
"entropy": 1.1523848533630372,
"epoch": 1.6741953698475438,
"grad_norm": 74.85565185546875,
"learning_rate": 4.746742665370561e-06,
"loss": 0.6256,
"mean_token_accuracy": 0.8195020914077759,
"num_tokens": 24076284.0,
"step": 2965
},
{
"entropy": 1.1982210397720336,
"epoch": 1.6770186335403725,
"grad_norm": 66.0778579711914,
"learning_rate": 4.7459073587997215e-06,
"loss": 0.6389,
"mean_token_accuracy": 0.8123613357543945,
"num_tokens": 24117006.0,
"step": 2970
},
{
"entropy": 1.3042704105377196,
"epoch": 1.6798418972332017,
"grad_norm": 73.61102294921875,
"learning_rate": 4.745070777080288e-06,
"loss": 0.7255,
"mean_token_accuracy": 0.7976557850837708,
"num_tokens": 24157561.0,
"step": 2975
},
{
"entropy": 1.222891139984131,
"epoch": 1.6826651609260304,
"grad_norm": 72.3819580078125,
"learning_rate": 4.744232920870387e-06,
"loss": 0.6421,
"mean_token_accuracy": 0.8140509843826294,
"num_tokens": 24198058.0,
"step": 2980
},
{
"entropy": 1.3084670066833497,
"epoch": 1.6854884246188595,
"grad_norm": 114.97191619873047,
"learning_rate": 4.743393790829149e-06,
"loss": 0.676,
"mean_token_accuracy": 0.8067424893379211,
"num_tokens": 24238634.0,
"step": 2985
},
{
"entropy": 1.318178415298462,
"epoch": 1.6883116883116882,
"grad_norm": 75.34236907958984,
"learning_rate": 4.742553387616709e-06,
"loss": 0.7423,
"mean_token_accuracy": 0.7923501610755921,
"num_tokens": 24279411.0,
"step": 2990
},
{
"entropy": 1.2532196283340453,
"epoch": 1.6911349520045174,
"grad_norm": 86.58775329589844,
"learning_rate": 4.741711711894203e-06,
"loss": 0.7164,
"mean_token_accuracy": 0.7993324875831604,
"num_tokens": 24319998.0,
"step": 2995
},
{
"entropy": 1.1841705083847045,
"epoch": 1.693958215697346,
"grad_norm": 86.2491683959961,
"learning_rate": 4.740868764323765e-06,
"loss": 0.6665,
"mean_token_accuracy": 0.8101414680480957,
"num_tokens": 24360590.0,
"step": 3000
},
{
"epoch": 1.693958215697346,
"eval_entropy": 1.3606500387191773,
"eval_loss": 0.5422791838645935,
"eval_mean_token_accuracy": 0.8444921255111695,
"eval_num_tokens": 24360590.0,
"eval_runtime": 2.4503,
"eval_samples_per_second": 15.916,
"eval_steps_per_second": 2.041,
"step": 3000
},
{
"entropy": 1.3304717063903808,
"epoch": 1.6967814793901752,
"grad_norm": 78.38925170898438,
"learning_rate": 4.740024545568535e-06,
"loss": 0.726,
"mean_token_accuracy": 0.7918658256530762,
"num_tokens": 24401025.0,
"step": 3005
},
{
"entropy": 1.2585272312164306,
"epoch": 1.699604743083004,
"grad_norm": 69.97944641113281,
"learning_rate": 4.739179056292647e-06,
"loss": 0.6643,
"mean_token_accuracy": 0.8107133388519288,
"num_tokens": 24441667.0,
"step": 3010
},
{
"entropy": 1.2609277486801147,
"epoch": 1.7024280067758328,
"grad_norm": 77.39826202392578,
"learning_rate": 4.738332297161239e-06,
"loss": 0.6728,
"mean_token_accuracy": 0.8045721054077148,
"num_tokens": 24482605.0,
"step": 3015
},
{
"entropy": 1.425962710380554,
"epoch": 1.7052512704686618,
"grad_norm": 83.03507995605469,
"learning_rate": 4.737484268840446e-06,
"loss": 0.7411,
"mean_token_accuracy": 0.7898955702781677,
"num_tokens": 24523306.0,
"step": 3020
},
{
"entropy": 1.1658933877944946,
"epoch": 1.7080745341614907,
"grad_norm": 67.21765899658203,
"learning_rate": 4.736634971997401e-06,
"loss": 0.6395,
"mean_token_accuracy": 0.8146594166755676,
"num_tokens": 24563887.0,
"step": 3025
},
{
"entropy": 1.106322956085205,
"epoch": 1.7108977978543196,
"grad_norm": 86.37903594970703,
"learning_rate": 4.735784407300238e-06,
"loss": 0.6227,
"mean_token_accuracy": 0.8204713344573975,
"num_tokens": 24604633.0,
"step": 3030
},
{
"entropy": 1.141917634010315,
"epoch": 1.7137210615471485,
"grad_norm": 85.14436340332031,
"learning_rate": 4.734932575418084e-06,
"loss": 0.6376,
"mean_token_accuracy": 0.8149391174316406,
"num_tokens": 24645098.0,
"step": 3035
},
{
"entropy": 1.1664824724197387,
"epoch": 1.7165443252399775,
"grad_norm": 61.968544006347656,
"learning_rate": 4.734079477021065e-06,
"loss": 0.6516,
"mean_token_accuracy": 0.8103834867477417,
"num_tokens": 24685871.0,
"step": 3040
},
{
"entropy": 1.171910786628723,
"epoch": 1.7193675889328062,
"grad_norm": 75.73184967041016,
"learning_rate": 4.733225112780305e-06,
"loss": 0.6435,
"mean_token_accuracy": 0.8168030619621277,
"num_tokens": 24726477.0,
"step": 3045
},
{
"entropy": 1.3494631767272949,
"epoch": 1.7221908526256353,
"grad_norm": 89.85262298583984,
"learning_rate": 4.73236948336792e-06,
"loss": 0.6898,
"mean_token_accuracy": 0.8040600776672363,
"num_tokens": 24767322.0,
"step": 3050
},
{
"entropy": 1.3355523347854614,
"epoch": 1.725014116318464,
"grad_norm": 76.07821655273438,
"learning_rate": 4.731512589457026e-06,
"loss": 0.7062,
"mean_token_accuracy": 0.801560926437378,
"num_tokens": 24807997.0,
"step": 3055
},
{
"entropy": 1.3271613121032715,
"epoch": 1.7278373800112932,
"grad_norm": 69.96381378173828,
"learning_rate": 4.7306544317217295e-06,
"loss": 0.6768,
"mean_token_accuracy": 0.8040451407432556,
"num_tokens": 24848529.0,
"step": 3060
},
{
"entropy": 1.173279881477356,
"epoch": 1.7306606437041219,
"grad_norm": 67.66761779785156,
"learning_rate": 4.729795010837134e-06,
"loss": 0.6449,
"mean_token_accuracy": 0.8127846121788025,
"num_tokens": 24889192.0,
"step": 3065
},
{
"entropy": 1.2362797021865846,
"epoch": 1.733483907396951,
"grad_norm": 71.30282592773438,
"learning_rate": 4.728934327479335e-06,
"loss": 0.6755,
"mean_token_accuracy": 0.8028325915336609,
"num_tokens": 24929839.0,
"step": 3070
},
{
"entropy": 1.2514007568359375,
"epoch": 1.7363071710897797,
"grad_norm": 62.189842224121094,
"learning_rate": 4.728072382325423e-06,
"loss": 0.6584,
"mean_token_accuracy": 0.8121908187866211,
"num_tokens": 24970329.0,
"step": 3075
},
{
"entropy": 1.3371544361114502,
"epoch": 1.7391304347826086,
"grad_norm": 77.2507553100586,
"learning_rate": 4.727209176053478e-06,
"loss": 0.7225,
"mean_token_accuracy": 0.7927798509597779,
"num_tokens": 25011018.0,
"step": 3080
},
{
"entropy": 1.2675202369689942,
"epoch": 1.7419536984754376,
"grad_norm": 79.65899658203125,
"learning_rate": 4.726344709342576e-06,
"loss": 0.6745,
"mean_token_accuracy": 0.8066824793815612,
"num_tokens": 25051589.0,
"step": 3085
},
{
"entropy": 1.3616349935531615,
"epoch": 1.7447769621682665,
"grad_norm": 76.38388061523438,
"learning_rate": 4.725478982872782e-06,
"loss": 0.6926,
"mean_token_accuracy": 0.8023685574531555,
"num_tokens": 25092165.0,
"step": 3090
},
{
"entropy": 1.2535524129867555,
"epoch": 1.7476002258610954,
"grad_norm": 85.11703491210938,
"learning_rate": 4.724611997325153e-06,
"loss": 0.7213,
"mean_token_accuracy": 0.7959471702575683,
"num_tokens": 25132923.0,
"step": 3095
},
{
"entropy": 1.2510310173034669,
"epoch": 1.7504234895539243,
"grad_norm": 78.4201889038086,
"learning_rate": 4.723743753381736e-06,
"loss": 0.6994,
"mean_token_accuracy": 0.799690055847168,
"num_tokens": 25173566.0,
"step": 3100
},
{
"entropy": 1.2111499547958373,
"epoch": 1.7532467532467533,
"grad_norm": 68.65093994140625,
"learning_rate": 4.7228742517255684e-06,
"loss": 0.6093,
"mean_token_accuracy": 0.8221887230873108,
"num_tokens": 25214439.0,
"step": 3105
},
{
"entropy": 1.2008164405822754,
"epoch": 1.7560700169395822,
"grad_norm": 69.03833770751953,
"learning_rate": 4.722003493040676e-06,
"loss": 0.6276,
"mean_token_accuracy": 0.8147233247756958,
"num_tokens": 25254743.0,
"step": 3110
},
{
"entropy": 1.2858773469924927,
"epoch": 1.7588932806324111,
"grad_norm": 83.34830474853516,
"learning_rate": 4.721131478012076e-06,
"loss": 0.7253,
"mean_token_accuracy": 0.7950618982315063,
"num_tokens": 25295419.0,
"step": 3115
},
{
"entropy": 1.3381348848342896,
"epoch": 1.7617165443252398,
"grad_norm": 78.60142517089844,
"learning_rate": 4.720258207325771e-06,
"loss": 0.6849,
"mean_token_accuracy": 0.8035218834877014,
"num_tokens": 25336300.0,
"step": 3120
},
{
"entropy": 1.1491690158843995,
"epoch": 1.764539808018069,
"grad_norm": 72.92134094238281,
"learning_rate": 4.7193836816687525e-06,
"loss": 0.6543,
"mean_token_accuracy": 0.8121623754501343,
"num_tokens": 25376953.0,
"step": 3125
},
{
"entropy": 1.2614521503448486,
"epoch": 1.7673630717108977,
"grad_norm": 82.32136535644531,
"learning_rate": 4.718507901729001e-06,
"loss": 0.675,
"mean_token_accuracy": 0.8051144242286682,
"num_tokens": 25417751.0,
"step": 3130
},
{
"entropy": 1.2652166366577149,
"epoch": 1.7701863354037268,
"grad_norm": 61.305721282958984,
"learning_rate": 4.717630868195481e-06,
"loss": 0.6639,
"mean_token_accuracy": 0.8103567004203797,
"num_tokens": 25458579.0,
"step": 3135
},
{
"entropy": 1.2356239557266235,
"epoch": 1.7730095990965555,
"grad_norm": 79.6635513305664,
"learning_rate": 4.716752581758144e-06,
"loss": 0.6785,
"mean_token_accuracy": 0.8023215413093567,
"num_tokens": 25499394.0,
"step": 3140
},
{
"entropy": 1.2099516153335572,
"epoch": 1.7758328627893847,
"grad_norm": 81.50817108154297,
"learning_rate": 4.715873043107928e-06,
"loss": 0.6561,
"mean_token_accuracy": 0.8066185235977172,
"num_tokens": 25539969.0,
"step": 3145
},
{
"entropy": 1.218087124824524,
"epoch": 1.7786561264822134,
"grad_norm": 89.5859146118164,
"learning_rate": 4.714992252936757e-06,
"loss": 0.7023,
"mean_token_accuracy": 0.7991890192031861,
"num_tokens": 25580664.0,
"step": 3150
},
{
"entropy": 1.205983853340149,
"epoch": 1.7814793901750423,
"grad_norm": 77.29006958007812,
"learning_rate": 4.714110211937536e-06,
"loss": 0.6169,
"mean_token_accuracy": 0.8210288166999817,
"num_tokens": 25621449.0,
"step": 3155
},
{
"entropy": 1.2510562658309936,
"epoch": 1.7843026538678712,
"grad_norm": 76.20855712890625,
"learning_rate": 4.713226920804157e-06,
"loss": 0.6714,
"mean_token_accuracy": 0.8030560255050659,
"num_tokens": 25662023.0,
"step": 3160
},
{
"entropy": 1.3263770818710328,
"epoch": 1.7871259175607002,
"grad_norm": 78.84805297851562,
"learning_rate": 4.712342380231494e-06,
"loss": 0.6848,
"mean_token_accuracy": 0.8034481525421142,
"num_tokens": 25702546.0,
"step": 3165
},
{
"entropy": 1.2178564548492432,
"epoch": 1.789949181253529,
"grad_norm": 67.33372497558594,
"learning_rate": 4.711456590915406e-06,
"loss": 0.6608,
"mean_token_accuracy": 0.8104692459106445,
"num_tokens": 25743265.0,
"step": 3170
},
{
"entropy": 1.2496137619018555,
"epoch": 1.792772444946358,
"grad_norm": 74.43345642089844,
"learning_rate": 4.710569553552733e-06,
"loss": 0.714,
"mean_token_accuracy": 0.7992898225784302,
"num_tokens": 25784091.0,
"step": 3175
},
{
"entropy": 1.2508557558059692,
"epoch": 1.795595708639187,
"grad_norm": 79.79424285888672,
"learning_rate": 4.709681268841295e-06,
"loss": 0.6524,
"mean_token_accuracy": 0.81306471824646,
"num_tokens": 25824786.0,
"step": 3180
},
{
"entropy": 1.3131820201873778,
"epoch": 1.7984189723320159,
"grad_norm": 77.71157836914062,
"learning_rate": 4.708791737479897e-06,
"loss": 0.6898,
"mean_token_accuracy": 0.8016238570213318,
"num_tokens": 25865374.0,
"step": 3185
},
{
"entropy": 1.3496314764022828,
"epoch": 1.8012422360248448,
"grad_norm": 78.83152770996094,
"learning_rate": 4.707900960168322e-06,
"loss": 0.6688,
"mean_token_accuracy": 0.805900776386261,
"num_tokens": 25906110.0,
"step": 3190
},
{
"entropy": 1.3347065925598145,
"epoch": 1.8040654997176735,
"grad_norm": 72.0447769165039,
"learning_rate": 4.707008937607333e-06,
"loss": 0.6904,
"mean_token_accuracy": 0.8011209011077881,
"num_tokens": 25946487.0,
"step": 3195
},
{
"entropy": 1.1633864879608153,
"epoch": 1.8068887634105026,
"grad_norm": 83.76029205322266,
"learning_rate": 4.7061156704986746e-06,
"loss": 0.6679,
"mean_token_accuracy": 0.8113258719444275,
"num_tokens": 25987147.0,
"step": 3200
},
{
"entropy": 1.259621810913086,
"epoch": 1.8097120271033313,
"grad_norm": 69.44400787353516,
"learning_rate": 4.70522115954507e-06,
"loss": 0.6571,
"mean_token_accuracy": 0.8144752264022828,
"num_tokens": 26027930.0,
"step": 3205
},
{
"entropy": 1.330192494392395,
"epoch": 1.8125352907961605,
"grad_norm": 76.76569366455078,
"learning_rate": 4.704325405450219e-06,
"loss": 0.6999,
"mean_token_accuracy": 0.8003972887992858,
"num_tokens": 26068498.0,
"step": 3210
},
{
"entropy": 1.2917050123214722,
"epoch": 1.8153585544889892,
"grad_norm": 77.54668426513672,
"learning_rate": 4.703428408918801e-06,
"loss": 0.6613,
"mean_token_accuracy": 0.8059993982315063,
"num_tokens": 26109014.0,
"step": 3215
},
{
"entropy": 1.247635555267334,
"epoch": 1.8181818181818183,
"grad_norm": 74.58939361572266,
"learning_rate": 4.702530170656473e-06,
"loss": 0.6191,
"mean_token_accuracy": 0.8206661820411683,
"num_tokens": 26149710.0,
"step": 3220
},
{
"entropy": 1.3052506685256957,
"epoch": 1.821005081874647,
"grad_norm": 78.04215240478516,
"learning_rate": 4.7016306913698684e-06,
"loss": 0.668,
"mean_token_accuracy": 0.8105997920036316,
"num_tokens": 26190345.0,
"step": 3225
},
{
"entropy": 1.4105382680892944,
"epoch": 1.823828345567476,
"grad_norm": 66.63508605957031,
"learning_rate": 4.700729971766597e-06,
"loss": 0.6894,
"mean_token_accuracy": 0.8041513562202454,
"num_tokens": 26230630.0,
"step": 3230
},
{
"entropy": 1.2475630044937134,
"epoch": 1.826651609260305,
"grad_norm": 71.62044525146484,
"learning_rate": 4.6998280125552435e-06,
"loss": 0.6067,
"mean_token_accuracy": 0.8237750291824341,
"num_tokens": 26271035.0,
"step": 3235
},
{
"entropy": 1.4181864261627197,
"epoch": 1.8294748729531338,
"grad_norm": 70.9604263305664,
"learning_rate": 4.6989248144453695e-06,
"loss": 0.7388,
"mean_token_accuracy": 0.791705870628357,
"num_tokens": 26311785.0,
"step": 3240
},
{
"entropy": 1.4545698642730713,
"epoch": 1.8322981366459627,
"grad_norm": 82.96871185302734,
"learning_rate": 4.698020378147509e-06,
"loss": 0.7193,
"mean_token_accuracy": 0.7931790471076965,
"num_tokens": 26352651.0,
"step": 3245
},
{
"entropy": 1.288289976119995,
"epoch": 1.8351214003387917,
"grad_norm": 76.78516387939453,
"learning_rate": 4.6971147043731725e-06,
"loss": 0.6374,
"mean_token_accuracy": 0.8140319347381592,
"num_tokens": 26393263.0,
"step": 3250
},
{
"entropy": 1.287929368019104,
"epoch": 1.8379446640316206,
"grad_norm": 73.65550231933594,
"learning_rate": 4.696207793834843e-06,
"loss": 0.6379,
"mean_token_accuracy": 0.811824631690979,
"num_tokens": 26434015.0,
"step": 3255
},
{
"entropy": 1.2089492917060851,
"epoch": 1.8407679277244493,
"grad_norm": 83.56748962402344,
"learning_rate": 4.695299647245975e-06,
"loss": 0.6485,
"mean_token_accuracy": 0.8146473169326782,
"num_tokens": 26474409.0,
"step": 3260
},
{
"entropy": 1.2106802225112916,
"epoch": 1.8435911914172785,
"grad_norm": 73.27950286865234,
"learning_rate": 4.694390265320997e-06,
"loss": 0.6976,
"mean_token_accuracy": 0.8010765910148621,
"num_tokens": 26515206.0,
"step": 3265
},
{
"entropy": 1.2146219491958619,
"epoch": 1.8464144551101072,
"grad_norm": 70.8027114868164,
"learning_rate": 4.6934796487753095e-06,
"loss": 0.6656,
"mean_token_accuracy": 0.8075640559196472,
"num_tokens": 26555638.0,
"step": 3270
},
{
"entropy": 1.2627388954162597,
"epoch": 1.8492377188029363,
"grad_norm": 82.94084167480469,
"learning_rate": 4.6925677983252836e-06,
"loss": 0.677,
"mean_token_accuracy": 0.8059833526611329,
"num_tokens": 26596276.0,
"step": 3275
},
{
"entropy": 1.2478217363357544,
"epoch": 1.852060982495765,
"grad_norm": 69.82963562011719,
"learning_rate": 4.69165471468826e-06,
"loss": 0.6692,
"mean_token_accuracy": 0.8066045045852661,
"num_tokens": 26637010.0,
"step": 3280
},
{
"entropy": 1.3499293088912965,
"epoch": 1.8548842461885942,
"grad_norm": 92.67427062988281,
"learning_rate": 4.690740398582554e-06,
"loss": 0.686,
"mean_token_accuracy": 0.8015271663665772,
"num_tokens": 26677722.0,
"step": 3285
},
{
"entropy": 1.312140154838562,
"epoch": 1.8577075098814229,
"grad_norm": 80.48558044433594,
"learning_rate": 4.689824850727443e-06,
"loss": 0.644,
"mean_token_accuracy": 0.8171864151954651,
"num_tokens": 26718495.0,
"step": 3290
},
{
"entropy": 1.1620086908340455,
"epoch": 1.8605307735742518,
"grad_norm": 64.68397521972656,
"learning_rate": 4.68890807184318e-06,
"loss": 0.6173,
"mean_token_accuracy": 0.8194415926933288,
"num_tokens": 26759062.0,
"step": 3295
},
{
"entropy": 1.183970856666565,
"epoch": 1.8633540372670807,
"grad_norm": 78.46247100830078,
"learning_rate": 4.687990062650986e-06,
"loss": 0.6762,
"mean_token_accuracy": 0.8039267778396606,
"num_tokens": 26799736.0,
"step": 3300
},
{
"entropy": 1.2066641569137573,
"epoch": 1.8661773009599096,
"grad_norm": 78.06271362304688,
"learning_rate": 4.687070823873044e-06,
"loss": 0.6518,
"mean_token_accuracy": 0.8096506357192993,
"num_tokens": 26840464.0,
"step": 3305
},
{
"entropy": 1.2104302644729614,
"epoch": 1.8690005646527386,
"grad_norm": 60.512386322021484,
"learning_rate": 4.68615035623251e-06,
"loss": 0.6688,
"mean_token_accuracy": 0.8049030423164367,
"num_tokens": 26881179.0,
"step": 3310
},
{
"entropy": 1.309780240058899,
"epoch": 1.8718238283455675,
"grad_norm": 89.17145538330078,
"learning_rate": 4.685228660453505e-06,
"loss": 0.7278,
"mean_token_accuracy": 0.7917387366294861,
"num_tokens": 26921890.0,
"step": 3315
},
{
"entropy": 1.2725276470184326,
"epoch": 1.8746470920383964,
"grad_norm": 73.05801391601562,
"learning_rate": 4.684305737261116e-06,
"loss": 0.6943,
"mean_token_accuracy": 0.7972886681556701,
"num_tokens": 26962473.0,
"step": 3320
},
{
"entropy": 1.2042244672775269,
"epoch": 1.8774703557312253,
"grad_norm": 75.5986328125,
"learning_rate": 4.683381587381396e-06,
"loss": 0.6685,
"mean_token_accuracy": 0.8059680342674256,
"num_tokens": 27002893.0,
"step": 3325
},
{
"entropy": 1.297372031211853,
"epoch": 1.8802936194240543,
"grad_norm": 60.600040435791016,
"learning_rate": 4.682456211541363e-06,
"loss": 0.7187,
"mean_token_accuracy": 0.7960585474967956,
"num_tokens": 27043613.0,
"step": 3330
},
{
"entropy": 1.2372675895690919,
"epoch": 1.883116883116883,
"grad_norm": 77.02068328857422,
"learning_rate": 4.681529610468999e-06,
"loss": 0.6627,
"mean_token_accuracy": 0.8110163450241089,
"num_tokens": 27084223.0,
"step": 3335
},
{
"entropy": 1.2115526437759399,
"epoch": 1.8859401468097121,
"grad_norm": 71.88720703125,
"learning_rate": 4.68060178489325e-06,
"loss": 0.6416,
"mean_token_accuracy": 0.8154277205467224,
"num_tokens": 27124888.0,
"step": 3340
},
{
"entropy": 1.1731568574905396,
"epoch": 1.8887634105025408,
"grad_norm": 75.11717224121094,
"learning_rate": 4.679672735544024e-06,
"loss": 0.6238,
"mean_token_accuracy": 0.817597496509552,
"num_tokens": 27165493.0,
"step": 3345
},
{
"entropy": 1.2224584341049194,
"epoch": 1.89158667419537,
"grad_norm": 79.28688049316406,
"learning_rate": 4.678742463152196e-06,
"loss": 0.6627,
"mean_token_accuracy": 0.807835042476654,
"num_tokens": 27205997.0,
"step": 3350
},
{
"entropy": 1.1851969718933106,
"epoch": 1.8944099378881987,
"grad_norm": 77.38661193847656,
"learning_rate": 4.677810968449598e-06,
"loss": 0.6489,
"mean_token_accuracy": 0.8135580539703369,
"num_tokens": 27246679.0,
"step": 3355
},
{
"entropy": 1.2727444410324096,
"epoch": 1.8972332015810278,
"grad_norm": 64.15800476074219,
"learning_rate": 4.676878252169025e-06,
"loss": 0.6672,
"mean_token_accuracy": 0.8050463199615479,
"num_tokens": 27287296.0,
"step": 3360
},
{
"entropy": 1.3533100366592408,
"epoch": 1.9000564652738565,
"grad_norm": 74.59213256835938,
"learning_rate": 4.6759443150442375e-06,
"loss": 0.7046,
"mean_token_accuracy": 0.7972202181816102,
"num_tokens": 27327899.0,
"step": 3365
},
{
"entropy": 1.2930695295333863,
"epoch": 1.9028797289666854,
"grad_norm": 74.31491088867188,
"learning_rate": 4.675009157809949e-06,
"loss": 0.6536,
"mean_token_accuracy": 0.810335111618042,
"num_tokens": 27368677.0,
"step": 3370
},
{
"entropy": 1.267875075340271,
"epoch": 1.9057029926595144,
"grad_norm": 91.21916198730469,
"learning_rate": 4.67407278120184e-06,
"loss": 0.6716,
"mean_token_accuracy": 0.8077351331710816,
"num_tokens": 27408549.0,
"step": 3375
},
{
"entropy": 1.2228721261024476,
"epoch": 1.9085262563523433,
"grad_norm": 76.48754119873047,
"learning_rate": 4.6731351859565435e-06,
"loss": 0.6306,
"mean_token_accuracy": 0.8189470052719117,
"num_tokens": 27449132.0,
"step": 3380
},
{
"entropy": 1.3247106790542602,
"epoch": 1.9113495200451722,
"grad_norm": 70.40728759765625,
"learning_rate": 4.672196372811656e-06,
"loss": 0.7196,
"mean_token_accuracy": 0.7973332285881043,
"num_tokens": 27489492.0,
"step": 3385
},
{
"entropy": 1.1619846105575562,
"epoch": 1.9141727837380011,
"grad_norm": 69.71743774414062,
"learning_rate": 4.671256342505731e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.8190138101577759,
"num_tokens": 27530158.0,
"step": 3390
},
{
"entropy": 1.2606486558914185,
"epoch": 1.91699604743083,
"grad_norm": 71.09142303466797,
"learning_rate": 4.6703150957782795e-06,
"loss": 0.7106,
"mean_token_accuracy": 0.7955459117889404,
"num_tokens": 27570628.0,
"step": 3395
},
{
"entropy": 1.2582112312316895,
"epoch": 1.919819311123659,
"grad_norm": 71.53373718261719,
"learning_rate": 4.669372633369769e-06,
"loss": 0.6667,
"mean_token_accuracy": 0.8072426319122314,
"num_tokens": 27611277.0,
"step": 3400
},
{
"entropy": 1.3039365530014038,
"epoch": 1.922642574816488,
"grad_norm": 81.98397827148438,
"learning_rate": 4.668428956021622e-06,
"loss": 0.703,
"mean_token_accuracy": 0.8007499933242798,
"num_tokens": 27651697.0,
"step": 3405
},
{
"entropy": 1.3271862268447876,
"epoch": 1.9254658385093166,
"grad_norm": 73.20526885986328,
"learning_rate": 4.667484064476219e-06,
"loss": 0.6535,
"mean_token_accuracy": 0.8104861855506897,
"num_tokens": 27692305.0,
"step": 3410
},
{
"entropy": 1.3667465209960938,
"epoch": 1.9282891022021458,
"grad_norm": 79.74845123291016,
"learning_rate": 4.666537959476897e-06,
"loss": 0.6962,
"mean_token_accuracy": 0.7966463446617127,
"num_tokens": 27732690.0,
"step": 3415
},
{
"entropy": 1.2887543439865112,
"epoch": 1.9311123658949745,
"grad_norm": 66.48158264160156,
"learning_rate": 4.665590641767943e-06,
"loss": 0.621,
"mean_token_accuracy": 0.8193240642547608,
"num_tokens": 27773352.0,
"step": 3420
},
{
"entropy": 1.1709359884262085,
"epoch": 1.9339356295878036,
"grad_norm": 72.53215026855469,
"learning_rate": 4.664642112094601e-06,
"loss": 0.5945,
"mean_token_accuracy": 0.8274150848388672,
"num_tokens": 27813931.0,
"step": 3425
},
{
"entropy": 1.322715401649475,
"epoch": 1.9367588932806323,
"grad_norm": 75.37449645996094,
"learning_rate": 4.66369237120307e-06,
"loss": 0.7161,
"mean_token_accuracy": 0.7968868970870971,
"num_tokens": 27854738.0,
"step": 3430
},
{
"entropy": 1.4540338277816773,
"epoch": 1.9395821569734615,
"grad_norm": 84.57624053955078,
"learning_rate": 4.662741419840497e-06,
"loss": 0.7683,
"mean_token_accuracy": 0.7849366426467895,
"num_tokens": 27895401.0,
"step": 3435
},
{
"entropy": 1.200544047355652,
"epoch": 1.9424054206662902,
"grad_norm": 61.72697067260742,
"learning_rate": 4.6617892587549865e-06,
"loss": 0.611,
"mean_token_accuracy": 0.8247037768363953,
"num_tokens": 27935721.0,
"step": 3440
},
{
"entropy": 1.2894334077835083,
"epoch": 1.945228684359119,
"grad_norm": 70.79447937011719,
"learning_rate": 4.66083588869559e-06,
"loss": 0.6648,
"mean_token_accuracy": 0.8107550263404846,
"num_tokens": 27976426.0,
"step": 3445
},
{
"entropy": 1.2915485858917237,
"epoch": 1.948051948051948,
"grad_norm": 77.85231018066406,
"learning_rate": 4.659881310412316e-06,
"loss": 0.6462,
"mean_token_accuracy": 0.8118519902229309,
"num_tokens": 28017120.0,
"step": 3450
},
{
"entropy": 1.2960253000259399,
"epoch": 1.950875211744777,
"grad_norm": 91.22349548339844,
"learning_rate": 4.658925524656117e-06,
"loss": 0.7125,
"mean_token_accuracy": 0.7955609560012817,
"num_tokens": 28056932.0,
"step": 3455
},
{
"entropy": 1.2702165603637696,
"epoch": 1.9536984754376059,
"grad_norm": 60.86642837524414,
"learning_rate": 4.657968532178899e-06,
"loss": 0.615,
"mean_token_accuracy": 0.8187849164009094,
"num_tokens": 28097762.0,
"step": 3460
},
{
"entropy": 1.2905339479446412,
"epoch": 1.9565217391304348,
"grad_norm": 64.9665756225586,
"learning_rate": 4.657010333733517e-06,
"loss": 0.6653,
"mean_token_accuracy": 0.8115843653678894,
"num_tokens": 28138448.0,
"step": 3465
},
{
"entropy": 1.3494179725646973,
"epoch": 1.9593450028232637,
"grad_norm": 81.62896728515625,
"learning_rate": 4.656050930073775e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.7997290372848511,
"num_tokens": 28179033.0,
"step": 3470
},
{
"entropy": 1.2266453981399537,
"epoch": 1.9621682665160924,
"grad_norm": 71.46363830566406,
"learning_rate": 4.655090321954422e-06,
"loss": 0.6402,
"mean_token_accuracy": 0.8146908640861511,
"num_tokens": 28219678.0,
"step": 3475
},
{
"entropy": 1.2505339860916138,
"epoch": 1.9649915302089216,
"grad_norm": 62.542510986328125,
"learning_rate": 4.654128510131159e-06,
"loss": 0.6569,
"mean_token_accuracy": 0.8119516491889953,
"num_tokens": 28260389.0,
"step": 3480
},
{
"entropy": 1.2989205360412597,
"epoch": 1.9678147939017503,
"grad_norm": 73.96013641357422,
"learning_rate": 4.653165495360632e-06,
"loss": 0.6984,
"mean_token_accuracy": 0.8002416968345643,
"num_tokens": 28301129.0,
"step": 3485
},
{
"entropy": 1.2366380214691162,
"epoch": 1.9706380575945794,
"grad_norm": 92.0043716430664,
"learning_rate": 4.652201278400432e-06,
"loss": 0.7118,
"mean_token_accuracy": 0.7954460859298706,
"num_tokens": 28341239.0,
"step": 3490
},
{
"entropy": 1.2122334718704224,
"epoch": 1.9734613212874081,
"grad_norm": 68.99957275390625,
"learning_rate": 4.651235860009099e-06,
"loss": 0.6665,
"mean_token_accuracy": 0.810545551776886,
"num_tokens": 28381589.0,
"step": 3495
},
{
"entropy": 1.3472963571548462,
"epoch": 1.9762845849802373,
"grad_norm": 69.63176727294922,
"learning_rate": 4.650269240946115e-06,
"loss": 0.6928,
"mean_token_accuracy": 0.8018389105796814,
"num_tokens": 28422151.0,
"step": 3500
},
{
"epoch": 1.9762845849802373,
"eval_entropy": 1.3732946634292602,
"eval_loss": 0.4887203872203827,
"eval_mean_token_accuracy": 0.8582573294639587,
"eval_num_tokens": 28422151.0,
"eval_runtime": 2.4536,
"eval_samples_per_second": 15.895,
"eval_steps_per_second": 2.038,
"step": 3500
},
{
"entropy": 1.1886157512664794,
"epoch": 1.979107848673066,
"grad_norm": 80.48574829101562,
"learning_rate": 4.6493014219719064e-06,
"loss": 0.6651,
"mean_token_accuracy": 0.8061770439147949,
"num_tokens": 28462742.0,
"step": 3505
},
{
"entropy": 1.2385428190231322,
"epoch": 1.981931112365895,
"grad_norm": 68.21446990966797,
"learning_rate": 4.648332403847849e-06,
"loss": 0.6922,
"mean_token_accuracy": 0.8008740901947021,
"num_tokens": 28503283.0,
"step": 3510
},
{
"entropy": 1.185453176498413,
"epoch": 1.9847543760587238,
"grad_norm": 58.8889274597168,
"learning_rate": 4.6473621873362525e-06,
"loss": 0.5914,
"mean_token_accuracy": 0.8272942423820495,
"num_tokens": 28544089.0,
"step": 3515
},
{
"entropy": 1.3709485530853271,
"epoch": 1.9875776397515528,
"grad_norm": 72.34013366699219,
"learning_rate": 4.64639077320038e-06,
"loss": 0.7202,
"mean_token_accuracy": 0.7945428967475892,
"num_tokens": 28584668.0,
"step": 3520
},
{
"entropy": 1.3020499706268311,
"epoch": 1.9904009034443817,
"grad_norm": 77.33309173583984,
"learning_rate": 4.645418162204427e-06,
"loss": 0.6731,
"mean_token_accuracy": 0.8073466181755066,
"num_tokens": 28625384.0,
"step": 3525
},
{
"entropy": 1.331102156639099,
"epoch": 1.9932241671372106,
"grad_norm": 62.944297790527344,
"learning_rate": 4.644444355113538e-06,
"loss": 0.6745,
"mean_token_accuracy": 0.8051925539970398,
"num_tokens": 28665929.0,
"step": 3530
},
{
"entropy": 1.3355406045913696,
"epoch": 1.9960474308300395,
"grad_norm": 82.5616683959961,
"learning_rate": 4.643469352693793e-06,
"loss": 0.7338,
"mean_token_accuracy": 0.7874128460884094,
"num_tokens": 28706452.0,
"step": 3535
},
{
"entropy": 1.2611999750137328,
"epoch": 1.9988706945228685,
"grad_norm": 70.33882141113281,
"learning_rate": 4.642493155712218e-06,
"loss": 0.6555,
"mean_token_accuracy": 0.8140320301055908,
"num_tokens": 28747167.0,
"step": 3540
},
{
"entropy": 1.2613008499145508,
"epoch": 2.0016939582156974,
"grad_norm": 62.02672576904297,
"learning_rate": 4.641515764936774e-06,
"loss": 0.5603,
"mean_token_accuracy": 0.8346445918083191,
"num_tokens": 28781356.0,
"step": 3545
},
{
"entropy": 1.1375208854675294,
"epoch": 2.004517221908526,
"grad_norm": 90.73487091064453,
"learning_rate": 4.640537181136361e-06,
"loss": 0.5311,
"mean_token_accuracy": 0.8399060368537903,
"num_tokens": 28822142.0,
"step": 3550
},
{
"entropy": 1.1094812870025634,
"epoch": 2.0073404856013553,
"grad_norm": 72.12708282470703,
"learning_rate": 4.639557405080822e-06,
"loss": 0.4815,
"mean_token_accuracy": 0.8520576357841492,
"num_tokens": 28862874.0,
"step": 3555
},
{
"entropy": 1.0384588479995727,
"epoch": 2.010163749294184,
"grad_norm": 88.40074157714844,
"learning_rate": 4.638576437540935e-06,
"loss": 0.4597,
"mean_token_accuracy": 0.8593630313873291,
"num_tokens": 28903529.0,
"step": 3560
},
{
"entropy": 0.9648947477340698,
"epoch": 2.012987012987013,
"grad_norm": 84.00352478027344,
"learning_rate": 4.637594279288412e-06,
"loss": 0.4597,
"mean_token_accuracy": 0.8570084929466247,
"num_tokens": 28944097.0,
"step": 3565
},
{
"entropy": 1.0692477583885194,
"epoch": 2.015810276679842,
"grad_norm": 79.34220886230469,
"learning_rate": 4.63661093109591e-06,
"loss": 0.4578,
"mean_token_accuracy": 0.8585654139518738,
"num_tokens": 28984715.0,
"step": 3570
},
{
"entropy": 0.9981315493583679,
"epoch": 2.018633540372671,
"grad_norm": 95.38790130615234,
"learning_rate": 4.635626393737015e-06,
"loss": 0.4884,
"mean_token_accuracy": 0.8481999158859252,
"num_tokens": 29025080.0,
"step": 3575
},
{
"entropy": 1.0125423192977905,
"epoch": 2.0214568040654997,
"grad_norm": 73.60315704345703,
"learning_rate": 4.634640667986251e-06,
"loss": 0.4606,
"mean_token_accuracy": 0.8575116157531738,
"num_tokens": 29065999.0,
"step": 3580
},
{
"entropy": 1.0129147291183471,
"epoch": 2.024280067758329,
"grad_norm": 70.65982818603516,
"learning_rate": 4.633653754619076e-06,
"loss": 0.4568,
"mean_token_accuracy": 0.8609514951705932,
"num_tokens": 29106693.0,
"step": 3585
},
{
"entropy": 1.0722288131713866,
"epoch": 2.0271033314511575,
"grad_norm": 80.4547348022461,
"learning_rate": 4.632665654411885e-06,
"loss": 0.4978,
"mean_token_accuracy": 0.8478749513626098,
"num_tokens": 29147326.0,
"step": 3590
},
{
"entropy": 0.9768878698349,
"epoch": 2.0299265951439867,
"grad_norm": 72.7870864868164,
"learning_rate": 4.631676368142003e-06,
"loss": 0.4441,
"mean_token_accuracy": 0.862525200843811,
"num_tokens": 29188095.0,
"step": 3595
},
{
"entropy": 0.9430813789367676,
"epoch": 2.0327498588368154,
"grad_norm": 67.3983383178711,
"learning_rate": 4.630685896587691e-06,
"loss": 0.4408,
"mean_token_accuracy": 0.8643229246139527,
"num_tokens": 29228722.0,
"step": 3600
},
{
"entropy": 0.9772986054420472,
"epoch": 2.035573122529644,
"grad_norm": 73.83796691894531,
"learning_rate": 4.6296942405281405e-06,
"loss": 0.443,
"mean_token_accuracy": 0.8627371668815613,
"num_tokens": 29269343.0,
"step": 3605
},
{
"entropy": 0.9941539525985718,
"epoch": 2.038396386222473,
"grad_norm": 69.57147979736328,
"learning_rate": 4.628701400743475e-06,
"loss": 0.4446,
"mean_token_accuracy": 0.8636790156364441,
"num_tokens": 29310171.0,
"step": 3610
},
{
"entropy": 1.0042089581489564,
"epoch": 2.041219649915302,
"grad_norm": 74.8926010131836,
"learning_rate": 4.627707378014751e-06,
"loss": 0.4682,
"mean_token_accuracy": 0.8554182171821594,
"num_tokens": 29350746.0,
"step": 3615
},
{
"entropy": 1.0139081001281738,
"epoch": 2.044042913608131,
"grad_norm": 71.61470031738281,
"learning_rate": 4.626712173123953e-06,
"loss": 0.4986,
"mean_token_accuracy": 0.8487652540206909,
"num_tokens": 29391411.0,
"step": 3620
},
{
"entropy": 1.0470230221748351,
"epoch": 2.0468661773009598,
"grad_norm": 83.09550476074219,
"learning_rate": 4.625715786853999e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8379368543624878,
"num_tokens": 29432182.0,
"step": 3625
},
{
"entropy": 1.1465112209320067,
"epoch": 2.049689440993789,
"grad_norm": 80.86943817138672,
"learning_rate": 4.624718219988732e-06,
"loss": 0.4994,
"mean_token_accuracy": 0.8471787929534912,
"num_tokens": 29472990.0,
"step": 3630
},
{
"entropy": 0.9782078504562378,
"epoch": 2.0525127046866176,
"grad_norm": 84.73113250732422,
"learning_rate": 4.623719473312928e-06,
"loss": 0.4619,
"mean_token_accuracy": 0.8561048865318298,
"num_tokens": 29513219.0,
"step": 3635
},
{
"entropy": 1.056278944015503,
"epoch": 2.0553359683794468,
"grad_norm": 73.62364959716797,
"learning_rate": 4.622719547612288e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.8500555634498597,
"num_tokens": 29553939.0,
"step": 3640
},
{
"entropy": 1.126291823387146,
"epoch": 2.0581592320722755,
"grad_norm": 73.61604309082031,
"learning_rate": 4.621718443673442e-06,
"loss": 0.5254,
"mean_token_accuracy": 0.8411227583885192,
"num_tokens": 29594657.0,
"step": 3645
},
{
"entropy": 0.9695918083190918,
"epoch": 2.0609824957651046,
"grad_norm": 67.4301528930664,
"learning_rate": 4.620716162283945e-06,
"loss": 0.4758,
"mean_token_accuracy": 0.8537283182144165,
"num_tokens": 29635355.0,
"step": 3650
},
{
"entropy": 0.9435236334800721,
"epoch": 2.0638057594579333,
"grad_norm": 80.18263244628906,
"learning_rate": 4.619712704232283e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.8651264548301697,
"num_tokens": 29676040.0,
"step": 3655
},
{
"entropy": 1.0257262706756591,
"epoch": 2.0666290231507625,
"grad_norm": 83.10700225830078,
"learning_rate": 4.618708070307863e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.8504552364349365,
"num_tokens": 29716831.0,
"step": 3660
},
{
"entropy": 1.0027997612953186,
"epoch": 2.069452286843591,
"grad_norm": 68.62332153320312,
"learning_rate": 4.617702261301018e-06,
"loss": 0.4656,
"mean_token_accuracy": 0.8585775852203369,
"num_tokens": 29757384.0,
"step": 3665
},
{
"entropy": 1.088618552684784,
"epoch": 2.0722755505364203,
"grad_norm": 74.5041275024414,
"learning_rate": 4.616695278003006e-06,
"loss": 0.4726,
"mean_token_accuracy": 0.8526723146438598,
"num_tokens": 29797784.0,
"step": 3670
},
{
"entropy": 1.1112963557243347,
"epoch": 2.075098814229249,
"grad_norm": 79.9881362915039,
"learning_rate": 4.61568712120601e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.8534006595611572,
"num_tokens": 29838386.0,
"step": 3675
},
{
"entropy": 0.9924672245979309,
"epoch": 2.0779220779220777,
"grad_norm": 75.17504119873047,
"learning_rate": 4.614677791703134e-06,
"loss": 0.4477,
"mean_token_accuracy": 0.8596629619598388,
"num_tokens": 29879193.0,
"step": 3680
},
{
"entropy": 0.996102774143219,
"epoch": 2.080745341614907,
"grad_norm": 75.02816009521484,
"learning_rate": 4.613667290288406e-06,
"loss": 0.489,
"mean_token_accuracy": 0.8493238091468811,
"num_tokens": 29919999.0,
"step": 3685
},
{
"entropy": 1.0089183807373048,
"epoch": 2.0835686053077356,
"grad_norm": 68.33311462402344,
"learning_rate": 4.612655617756776e-06,
"loss": 0.4584,
"mean_token_accuracy": 0.8591845631599426,
"num_tokens": 29960014.0,
"step": 3690
},
{
"entropy": 1.080802845954895,
"epoch": 2.0863918690005647,
"grad_norm": 74.33163452148438,
"learning_rate": 4.611642774904113e-06,
"loss": 0.4702,
"mean_token_accuracy": 0.8538957357406616,
"num_tokens": 30000495.0,
"step": 3695
},
{
"entropy": 1.1036657571792603,
"epoch": 2.0892151326933934,
"grad_norm": 77.43889617919922,
"learning_rate": 4.6106287625272106e-06,
"loss": 0.5021,
"mean_token_accuracy": 0.8491263151168823,
"num_tokens": 30041011.0,
"step": 3700
},
{
"entropy": 1.0965331435203551,
"epoch": 2.0920383963862226,
"grad_norm": 102.71356964111328,
"learning_rate": 4.609613581423779e-06,
"loss": 0.5103,
"mean_token_accuracy": 0.8435364127159118,
"num_tokens": 30081604.0,
"step": 3705
},
{
"entropy": 1.1061514854431151,
"epoch": 2.0948616600790513,
"grad_norm": 75.31636047363281,
"learning_rate": 4.6085972323924485e-06,
"loss": 0.5042,
"mean_token_accuracy": 0.8477765440940856,
"num_tokens": 30121410.0,
"step": 3710
},
{
"entropy": 1.0610831737518311,
"epoch": 2.0976849237718804,
"grad_norm": 76.53677368164062,
"learning_rate": 4.607579716232771e-06,
"loss": 0.4706,
"mean_token_accuracy": 0.8552014946937561,
"num_tokens": 30162093.0,
"step": 3715
},
{
"entropy": 1.0802299499511718,
"epoch": 2.100508187464709,
"grad_norm": 70.81407928466797,
"learning_rate": 4.606561033745213e-06,
"loss": 0.5047,
"mean_token_accuracy": 0.847411835193634,
"num_tokens": 30202390.0,
"step": 3720
},
{
"entropy": 1.0219064354896545,
"epoch": 2.1033314511575383,
"grad_norm": 68.52303314208984,
"learning_rate": 4.6055411857311605e-06,
"loss": 0.5117,
"mean_token_accuracy": 0.8439998030662537,
"num_tokens": 30243104.0,
"step": 3725
},
{
"entropy": 1.0835317373275757,
"epoch": 2.106154714850367,
"grad_norm": 80.77493286132812,
"learning_rate": 4.6045201729929145e-06,
"loss": 0.4638,
"mean_token_accuracy": 0.8567952990531922,
"num_tokens": 30283599.0,
"step": 3730
},
{
"entropy": 0.975482976436615,
"epoch": 2.108977978543196,
"grad_norm": 72.1488265991211,
"learning_rate": 4.603497996333695e-06,
"loss": 0.457,
"mean_token_accuracy": 0.8579200267791748,
"num_tokens": 30324432.0,
"step": 3735
},
{
"entropy": 0.9810956478118896,
"epoch": 2.111801242236025,
"grad_norm": 69.67237091064453,
"learning_rate": 4.602474656557636e-06,
"loss": 0.5009,
"mean_token_accuracy": 0.846557891368866,
"num_tokens": 30365264.0,
"step": 3740
},
{
"entropy": 1.076551580429077,
"epoch": 2.1146245059288535,
"grad_norm": 78.01778411865234,
"learning_rate": 4.601450154469786e-06,
"loss": 0.4761,
"mean_token_accuracy": 0.8549446344375611,
"num_tokens": 30405939.0,
"step": 3745
},
{
"entropy": 1.0331781387329102,
"epoch": 2.1174477696216827,
"grad_norm": 78.21675109863281,
"learning_rate": 4.60042449087611e-06,
"loss": 0.5095,
"mean_token_accuracy": 0.8455055952072144,
"num_tokens": 30446514.0,
"step": 3750
},
{
"entropy": 1.047632908821106,
"epoch": 2.1202710333145114,
"grad_norm": 62.341732025146484,
"learning_rate": 4.599397666583484e-06,
"loss": 0.4864,
"mean_token_accuracy": 0.850367295742035,
"num_tokens": 30487017.0,
"step": 3755
},
{
"entropy": 1.0679221272468566,
"epoch": 2.1230942970073405,
"grad_norm": 74.56029510498047,
"learning_rate": 4.598369682399699e-06,
"loss": 0.4839,
"mean_token_accuracy": 0.8506836295127869,
"num_tokens": 30527623.0,
"step": 3760
},
{
"entropy": 1.043572473526001,
"epoch": 2.1259175607001692,
"grad_norm": 70.33014678955078,
"learning_rate": 4.597340539133459e-06,
"loss": 0.4852,
"mean_token_accuracy": 0.8513245105743408,
"num_tokens": 30568412.0,
"step": 3765
},
{
"entropy": 1.1451680421829225,
"epoch": 2.1287408243929984,
"grad_norm": 79.97354888916016,
"learning_rate": 4.5963102375943775e-06,
"loss": 0.5012,
"mean_token_accuracy": 0.8455309748649598,
"num_tokens": 30609183.0,
"step": 3770
},
{
"entropy": 1.1353947281837464,
"epoch": 2.131564088085827,
"grad_norm": 76.01318359375,
"learning_rate": 4.59527877859298e-06,
"loss": 0.4989,
"mean_token_accuracy": 0.847420847415924,
"num_tokens": 30649821.0,
"step": 3775
},
{
"entropy": 1.1371727347373963,
"epoch": 2.1343873517786562,
"grad_norm": 66.74515533447266,
"learning_rate": 4.594246162940705e-06,
"loss": 0.4459,
"mean_token_accuracy": 0.8620406270027161,
"num_tokens": 30690300.0,
"step": 3780
},
{
"entropy": 1.032710576057434,
"epoch": 2.137210615471485,
"grad_norm": 68.04978942871094,
"learning_rate": 4.593212391449897e-06,
"loss": 0.5073,
"mean_token_accuracy": 0.8437552571296691,
"num_tokens": 30731044.0,
"step": 3785
},
{
"entropy": 1.0063154816627502,
"epoch": 2.140033879164314,
"grad_norm": 72.64938354492188,
"learning_rate": 4.592177464933814e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8506463766098022,
"num_tokens": 30771802.0,
"step": 3790
},
{
"entropy": 1.0658278465270996,
"epoch": 2.142857142857143,
"grad_norm": 64.7853012084961,
"learning_rate": 4.591141384206619e-06,
"loss": 0.4997,
"mean_token_accuracy": 0.8490427970886231,
"num_tokens": 30812323.0,
"step": 3795
},
{
"entropy": 1.0591883182525634,
"epoch": 2.145680406549972,
"grad_norm": 60.86104202270508,
"learning_rate": 4.590104150083383e-06,
"loss": 0.4585,
"mean_token_accuracy": 0.8590384602546692,
"num_tokens": 30852654.0,
"step": 3800
},
{
"entropy": 1.0125213027000428,
"epoch": 2.1485036702428006,
"grad_norm": 72.03414916992188,
"learning_rate": 4.5890657633800885e-06,
"loss": 0.4658,
"mean_token_accuracy": 0.8582098603248596,
"num_tokens": 30893491.0,
"step": 3805
},
{
"entropy": 1.1036086440086366,
"epoch": 2.15132693393563,
"grad_norm": 79.08403778076172,
"learning_rate": 4.588026224913621e-06,
"loss": 0.4996,
"mean_token_accuracy": 0.8465798616409301,
"num_tokens": 30934006.0,
"step": 3810
},
{
"entropy": 1.000651216506958,
"epoch": 2.1541501976284585,
"grad_norm": 76.18484497070312,
"learning_rate": 4.586985535501772e-06,
"loss": 0.4644,
"mean_token_accuracy": 0.8542026162147522,
"num_tokens": 30974555.0,
"step": 3815
},
{
"entropy": 1.0279491305351258,
"epoch": 2.1569734613212876,
"grad_norm": 82.19313049316406,
"learning_rate": 4.585943695963241e-06,
"loss": 0.4713,
"mean_token_accuracy": 0.8553337931632996,
"num_tokens": 31015251.0,
"step": 3820
},
{
"entropy": 1.0351431369781494,
"epoch": 2.1597967250141163,
"grad_norm": 74.23955535888672,
"learning_rate": 4.584900707117631e-06,
"loss": 0.4949,
"mean_token_accuracy": 0.8505377411842346,
"num_tokens": 31055845.0,
"step": 3825
},
{
"entropy": 1.0322145819664001,
"epoch": 2.162619988706945,
"grad_norm": 88.02880859375,
"learning_rate": 4.583856569785447e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8474498748779297,
"num_tokens": 31096141.0,
"step": 3830
},
{
"entropy": 0.9805418610572815,
"epoch": 2.165443252399774,
"grad_norm": 64.54179382324219,
"learning_rate": 4.582811284788101e-06,
"loss": 0.4624,
"mean_token_accuracy": 0.8562976837158203,
"num_tokens": 31136904.0,
"step": 3835
},
{
"entropy": 1.0411486148834228,
"epoch": 2.168266516092603,
"grad_norm": 71.99884033203125,
"learning_rate": 4.581764852947906e-06,
"loss": 0.4996,
"mean_token_accuracy": 0.8462500929832458,
"num_tokens": 31177433.0,
"step": 3840
},
{
"entropy": 1.0053257465362548,
"epoch": 2.171089779785432,
"grad_norm": 90.74437713623047,
"learning_rate": 4.580717275088077e-06,
"loss": 0.4753,
"mean_token_accuracy": 0.8529630064964294,
"num_tokens": 31218132.0,
"step": 3845
},
{
"entropy": 1.1293895483016967,
"epoch": 2.1739130434782608,
"grad_norm": 75.32796478271484,
"learning_rate": 4.5796685520327326e-06,
"loss": 0.5193,
"mean_token_accuracy": 0.8408478140830994,
"num_tokens": 31258670.0,
"step": 3850
},
{
"entropy": 1.0781844973564148,
"epoch": 2.17673630717109,
"grad_norm": 77.02909851074219,
"learning_rate": 4.578618684606889e-06,
"loss": 0.4716,
"mean_token_accuracy": 0.853575599193573,
"num_tokens": 31299497.0,
"step": 3855
},
{
"entropy": 1.1342519521713257,
"epoch": 2.1795595708639186,
"grad_norm": 82.19872283935547,
"learning_rate": 4.5775676736364664e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.8480924963951111,
"num_tokens": 31340229.0,
"step": 3860
},
{
"entropy": 1.0584368586540223,
"epoch": 2.1823828345567478,
"grad_norm": 87.4374008178711,
"learning_rate": 4.57651551994828e-06,
"loss": 0.5106,
"mean_token_accuracy": 0.8445464015007019,
"num_tokens": 31380786.0,
"step": 3865
},
{
"entropy": 1.0729819297790528,
"epoch": 2.1852060982495765,
"grad_norm": 71.60746765136719,
"learning_rate": 4.575462224370048e-06,
"loss": 0.4664,
"mean_token_accuracy": 0.8555550336837768,
"num_tokens": 31421341.0,
"step": 3870
},
{
"entropy": 1.0335511565208435,
"epoch": 2.1880293619424056,
"grad_norm": 85.08560943603516,
"learning_rate": 4.574407787730387e-06,
"loss": 0.4892,
"mean_token_accuracy": 0.8508115291595459,
"num_tokens": 31461914.0,
"step": 3875
},
{
"entropy": 1.0181528091430665,
"epoch": 2.1908526256352343,
"grad_norm": 76.76180267333984,
"learning_rate": 4.573352210858808e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.8531227588653565,
"num_tokens": 31502338.0,
"step": 3880
},
{
"entropy": 1.0412158966064453,
"epoch": 2.1936758893280635,
"grad_norm": 69.1751480102539,
"learning_rate": 4.57229549458572e-06,
"loss": 0.474,
"mean_token_accuracy": 0.8575033068656921,
"num_tokens": 31543167.0,
"step": 3885
},
{
"entropy": 1.0609189033508302,
"epoch": 2.196499153020892,
"grad_norm": 81.97909545898438,
"learning_rate": 4.571237639742432e-06,
"loss": 0.4969,
"mean_token_accuracy": 0.8498437285423279,
"num_tokens": 31583456.0,
"step": 3890
},
{
"entropy": 1.244982409477234,
"epoch": 2.199322416713721,
"grad_norm": 74.12896728515625,
"learning_rate": 4.570178647161144e-06,
"loss": 0.5426,
"mean_token_accuracy": 0.8337196350097656,
"num_tokens": 31624063.0,
"step": 3895
},
{
"entropy": 1.0766704201698303,
"epoch": 2.20214568040655,
"grad_norm": 73.96541595458984,
"learning_rate": 4.5691185176749524e-06,
"loss": 0.5046,
"mean_token_accuracy": 0.8464478969573974,
"num_tokens": 31664720.0,
"step": 3900
},
{
"entropy": 1.108527374267578,
"epoch": 2.2049689440993787,
"grad_norm": 84.28549194335938,
"learning_rate": 4.568057252117849e-06,
"loss": 0.5143,
"mean_token_accuracy": 0.8434839248657227,
"num_tokens": 31705454.0,
"step": 3905
},
{
"entropy": 1.099939227104187,
"epoch": 2.207792207792208,
"grad_norm": 75.40135192871094,
"learning_rate": 4.56699485132472e-06,
"loss": 0.4983,
"mean_token_accuracy": 0.8485525488853455,
"num_tokens": 31745936.0,
"step": 3910
},
{
"entropy": 1.175764226913452,
"epoch": 2.2106154714850366,
"grad_norm": 77.33585357666016,
"learning_rate": 4.565931316131344e-06,
"loss": 0.4914,
"mean_token_accuracy": 0.848324978351593,
"num_tokens": 31786678.0,
"step": 3915
},
{
"entropy": 1.0288933634757995,
"epoch": 2.2134387351778657,
"grad_norm": 87.8160400390625,
"learning_rate": 4.564866647374388e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.8544167518615723,
"num_tokens": 31827383.0,
"step": 3920
},
{
"entropy": 1.101294755935669,
"epoch": 2.2162619988706944,
"grad_norm": 70.5611343383789,
"learning_rate": 4.5638008458914164e-06,
"loss": 0.4902,
"mean_token_accuracy": 0.8476052165031434,
"num_tokens": 31867913.0,
"step": 3925
},
{
"entropy": 1.151436412334442,
"epoch": 2.2190852625635236,
"grad_norm": 66.06202697753906,
"learning_rate": 4.562733912520883e-06,
"loss": 0.5328,
"mean_token_accuracy": 0.8389103293418885,
"num_tokens": 31908537.0,
"step": 3930
},
{
"entropy": 1.0838784456253052,
"epoch": 2.2219085262563523,
"grad_norm": 71.66177368164062,
"learning_rate": 4.5616658481021315e-06,
"loss": 0.4674,
"mean_token_accuracy": 0.8557949662208557,
"num_tokens": 31949054.0,
"step": 3935
},
{
"entropy": 1.0479053497314452,
"epoch": 2.2247317899491814,
"grad_norm": 67.57572174072266,
"learning_rate": 4.560596653475394e-06,
"loss": 0.478,
"mean_token_accuracy": 0.8529563784599304,
"num_tokens": 31989703.0,
"step": 3940
},
{
"entropy": 1.0893271923065186,
"epoch": 2.22755505364201,
"grad_norm": 77.84688568115234,
"learning_rate": 4.559526329481796e-06,
"loss": 0.4716,
"mean_token_accuracy": 0.8553184747695923,
"num_tokens": 32030477.0,
"step": 3945
},
{
"entropy": 1.123781180381775,
"epoch": 2.2303783173348393,
"grad_norm": 79.98429107666016,
"learning_rate": 4.5584548769633465e-06,
"loss": 0.5369,
"mean_token_accuracy": 0.8361510276794434,
"num_tokens": 32071224.0,
"step": 3950
},
{
"entropy": 1.1011250257492065,
"epoch": 2.233201581027668,
"grad_norm": 74.83203125,
"learning_rate": 4.557382296762946e-06,
"loss": 0.4905,
"mean_token_accuracy": 0.8503419160842896,
"num_tokens": 32111863.0,
"step": 3955
},
{
"entropy": 1.028159761428833,
"epoch": 2.2360248447204967,
"grad_norm": 81.0724868774414,
"learning_rate": 4.556308589724379e-06,
"loss": 0.4352,
"mean_token_accuracy": 0.8625659227371216,
"num_tokens": 32152470.0,
"step": 3960
},
{
"entropy": 1.0775118827819825,
"epoch": 2.238848108413326,
"grad_norm": 59.63459396362305,
"learning_rate": 4.555233756692319e-06,
"loss": 0.4762,
"mean_token_accuracy": 0.8518986225128173,
"num_tokens": 32193281.0,
"step": 3965
},
{
"entropy": 1.057163155078888,
"epoch": 2.2416713721061545,
"grad_norm": 95.08134460449219,
"learning_rate": 4.5541577985123245e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.847400176525116,
"num_tokens": 32233871.0,
"step": 3970
},
{
"entropy": 1.021597111225128,
"epoch": 2.2444946357989837,
"grad_norm": 91.9679183959961,
"learning_rate": 4.553080716030838e-06,
"loss": 0.474,
"mean_token_accuracy": 0.8553819775581359,
"num_tokens": 32274313.0,
"step": 3975
},
{
"entropy": 1.0437519192695617,
"epoch": 2.2473178994918124,
"grad_norm": 66.00928497314453,
"learning_rate": 4.552002510095189e-06,
"loss": 0.5062,
"mean_token_accuracy": 0.8446116924285889,
"num_tokens": 32315012.0,
"step": 3980
},
{
"entropy": 1.0313972234725952,
"epoch": 2.2501411631846415,
"grad_norm": 85.3726577758789,
"learning_rate": 4.550923181553588e-06,
"loss": 0.5009,
"mean_token_accuracy": 0.8493191599845886,
"num_tokens": 32355634.0,
"step": 3985
},
{
"entropy": 1.1947339057922364,
"epoch": 2.2529644268774702,
"grad_norm": 83.1342544555664,
"learning_rate": 4.5498427312551316e-06,
"loss": 0.5782,
"mean_token_accuracy": 0.8298458218574524,
"num_tokens": 32396388.0,
"step": 3990
},
{
"entropy": 1.0725741267204285,
"epoch": 2.2557876905702994,
"grad_norm": 82.63168334960938,
"learning_rate": 4.548761160049796e-06,
"loss": 0.5244,
"mean_token_accuracy": 0.8417415022850037,
"num_tokens": 32437109.0,
"step": 3995
},
{
"entropy": 1.0708600401878356,
"epoch": 2.258610954263128,
"grad_norm": 82.65215301513672,
"learning_rate": 4.54767846878844e-06,
"loss": 0.5044,
"mean_token_accuracy": 0.8447011232376098,
"num_tokens": 32477860.0,
"step": 4000
},
{
"epoch": 2.258610954263128,
"eval_entropy": 1.2540549159049987,
"eval_loss": 0.44106096029281616,
"eval_mean_token_accuracy": 0.8703569531440735,
"eval_num_tokens": 32477860.0,
"eval_runtime": 2.4511,
"eval_samples_per_second": 15.911,
"eval_steps_per_second": 2.04,
"step": 4000
},
{
"entropy": 1.1180259704589843,
"epoch": 2.2614342179559572,
"grad_norm": 75.52294158935547,
"learning_rate": 4.546594658322806e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.849030327796936,
"num_tokens": 32518346.0,
"step": 4005
},
{
"entropy": 1.1107259273529053,
"epoch": 2.264257481648786,
"grad_norm": 77.14009857177734,
"learning_rate": 4.545509729505513e-06,
"loss": 0.5014,
"mean_token_accuracy": 0.8447972536087036,
"num_tokens": 32558964.0,
"step": 4010
},
{
"entropy": 1.1611681938171388,
"epoch": 2.267080745341615,
"grad_norm": 69.12362670898438,
"learning_rate": 4.544423683190061e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8494725942611694,
"num_tokens": 32599726.0,
"step": 4015
},
{
"entropy": 1.1129327774047852,
"epoch": 2.269904009034444,
"grad_norm": 85.86442565917969,
"learning_rate": 4.543336520230831e-06,
"loss": 0.484,
"mean_token_accuracy": 0.8541438221931458,
"num_tokens": 32640450.0,
"step": 4020
},
{
"entropy": 1.1703894138336182,
"epoch": 2.2727272727272725,
"grad_norm": 73.08917236328125,
"learning_rate": 4.542248241483083e-06,
"loss": 0.4916,
"mean_token_accuracy": 0.8498480677604675,
"num_tokens": 32680488.0,
"step": 4025
},
{
"entropy": 1.0600883603096007,
"epoch": 2.2755505364201016,
"grad_norm": 77.29441833496094,
"learning_rate": 4.541158847802949e-06,
"loss": 0.4882,
"mean_token_accuracy": 0.8504791259765625,
"num_tokens": 32721056.0,
"step": 4030
},
{
"entropy": 1.041144812107086,
"epoch": 2.278373800112931,
"grad_norm": 76.53392028808594,
"learning_rate": 4.540068340047446e-06,
"loss": 0.4705,
"mean_token_accuracy": 0.8558522939682007,
"num_tokens": 32761727.0,
"step": 4035
},
{
"entropy": 1.0352703213691712,
"epoch": 2.2811970638057595,
"grad_norm": 62.13428497314453,
"learning_rate": 4.53897671907446e-06,
"loss": 0.4904,
"mean_token_accuracy": 0.8505735397338867,
"num_tokens": 32802000.0,
"step": 4040
},
{
"entropy": 1.0567790031433106,
"epoch": 2.284020327498588,
"grad_norm": 81.95452117919922,
"learning_rate": 4.537883985742759e-06,
"loss": 0.5078,
"mean_token_accuracy": 0.8440911769866943,
"num_tokens": 32842578.0,
"step": 4045
},
{
"entropy": 1.1024208545684815,
"epoch": 2.2868435911914173,
"grad_norm": 67.22496032714844,
"learning_rate": 4.536790140911982e-06,
"loss": 0.5114,
"mean_token_accuracy": 0.8444732785224914,
"num_tokens": 32882686.0,
"step": 4050
},
{
"entropy": 1.0241257309913636,
"epoch": 2.289666854884246,
"grad_norm": 66.01680755615234,
"learning_rate": 4.535695185442644e-06,
"loss": 0.5025,
"mean_token_accuracy": 0.8479160904884339,
"num_tokens": 32923226.0,
"step": 4055
},
{
"entropy": 1.0995964646339416,
"epoch": 2.292490118577075,
"grad_norm": 83.64592742919922,
"learning_rate": 4.534599120196134e-06,
"loss": 0.5606,
"mean_token_accuracy": 0.8310135126113891,
"num_tokens": 32963815.0,
"step": 4060
},
{
"entropy": 1.085460638999939,
"epoch": 2.295313382269904,
"grad_norm": 77.6670150756836,
"learning_rate": 4.533501946034712e-06,
"loss": 0.4963,
"mean_token_accuracy": 0.8473443269729615,
"num_tokens": 33004619.0,
"step": 4065
},
{
"entropy": 1.0653952002525329,
"epoch": 2.298136645962733,
"grad_norm": 83.31126403808594,
"learning_rate": 4.532403663821513e-06,
"loss": 0.4863,
"mean_token_accuracy": 0.8500688552856446,
"num_tokens": 33045463.0,
"step": 4070
},
{
"entropy": 1.245663857460022,
"epoch": 2.3009599096555617,
"grad_norm": 86.89364624023438,
"learning_rate": 4.5313042744205436e-06,
"loss": 0.5591,
"mean_token_accuracy": 0.8307137608528137,
"num_tokens": 33086131.0,
"step": 4075
},
{
"entropy": 1.157560443878174,
"epoch": 2.303783173348391,
"grad_norm": 74.83079528808594,
"learning_rate": 4.530203778696679e-06,
"loss": 0.517,
"mean_token_accuracy": 0.8417824983596802,
"num_tokens": 33126676.0,
"step": 4080
},
{
"entropy": 1.0899484157562256,
"epoch": 2.3066064370412196,
"grad_norm": 70.4035415649414,
"learning_rate": 4.529102177515666e-06,
"loss": 0.4936,
"mean_token_accuracy": 0.8485661268234252,
"num_tokens": 33167352.0,
"step": 4085
},
{
"entropy": 1.0630066752433778,
"epoch": 2.3094297007340487,
"grad_norm": 74.47498321533203,
"learning_rate": 4.5279994717441235e-06,
"loss": 0.4873,
"mean_token_accuracy": 0.8489712834358215,
"num_tokens": 33208163.0,
"step": 4090
},
{
"entropy": 1.1108952403068542,
"epoch": 2.3122529644268774,
"grad_norm": 62.73479080200195,
"learning_rate": 4.526895662249534e-06,
"loss": 0.509,
"mean_token_accuracy": 0.8456613183021545,
"num_tokens": 33248736.0,
"step": 4095
},
{
"entropy": 1.0792955875396728,
"epoch": 2.3150762281197066,
"grad_norm": 70.62771606445312,
"learning_rate": 4.525790749900252e-06,
"loss": 0.5138,
"mean_token_accuracy": 0.8433370232582093,
"num_tokens": 33289452.0,
"step": 4100
},
{
"entropy": 1.1269087076187134,
"epoch": 2.3178994918125353,
"grad_norm": 78.53477478027344,
"learning_rate": 4.5246847355655e-06,
"loss": 0.497,
"mean_token_accuracy": 0.8441904664039612,
"num_tokens": 33330181.0,
"step": 4105
},
{
"entropy": 1.0207838654518127,
"epoch": 2.320722755505364,
"grad_norm": 76.28987121582031,
"learning_rate": 4.523577620115367e-06,
"loss": 0.4814,
"mean_token_accuracy": 0.8532437562942505,
"num_tokens": 33370966.0,
"step": 4110
},
{
"entropy": 1.0918394804000855,
"epoch": 2.323546019198193,
"grad_norm": 89.29618835449219,
"learning_rate": 4.522469404420805e-06,
"loss": 0.45,
"mean_token_accuracy": 0.8594119310379028,
"num_tokens": 33411762.0,
"step": 4115
},
{
"entropy": 1.079656708240509,
"epoch": 2.326369282891022,
"grad_norm": 76.26483917236328,
"learning_rate": 4.521360089353635e-06,
"loss": 0.4774,
"mean_token_accuracy": 0.8538398027420044,
"num_tokens": 33452518.0,
"step": 4120
},
{
"entropy": 1.1360549926757812,
"epoch": 2.329192546583851,
"grad_norm": 74.6915283203125,
"learning_rate": 4.520249675786544e-06,
"loss": 0.5148,
"mean_token_accuracy": 0.8441103577613831,
"num_tokens": 33493150.0,
"step": 4125
},
{
"entropy": 1.0715775847434998,
"epoch": 2.3320158102766797,
"grad_norm": 80.18423461914062,
"learning_rate": 4.519138164593081e-06,
"loss": 0.4907,
"mean_token_accuracy": 0.851858401298523,
"num_tokens": 33533504.0,
"step": 4130
},
{
"entropy": 1.1397679209709168,
"epoch": 2.334839073969509,
"grad_norm": 68.14742279052734,
"learning_rate": 4.518025556647656e-06,
"loss": 0.5158,
"mean_token_accuracy": 0.8431363701820374,
"num_tokens": 33573938.0,
"step": 4135
},
{
"entropy": 1.106320285797119,
"epoch": 2.3376623376623376,
"grad_norm": 74.76013946533203,
"learning_rate": 4.5169118528255455e-06,
"loss": 0.5011,
"mean_token_accuracy": 0.845686161518097,
"num_tokens": 33614513.0,
"step": 4140
},
{
"entropy": 1.10515398979187,
"epoch": 2.3404856013551667,
"grad_norm": 81.91299438476562,
"learning_rate": 4.515797054002888e-06,
"loss": 0.4993,
"mean_token_accuracy": 0.8454554438591003,
"num_tokens": 33655239.0,
"step": 4145
},
{
"entropy": 1.0563299179077148,
"epoch": 2.3433088650479954,
"grad_norm": 76.27633666992188,
"learning_rate": 4.5146811610566825e-06,
"loss": 0.5093,
"mean_token_accuracy": 0.8462665915489197,
"num_tokens": 33695649.0,
"step": 4150
},
{
"entropy": 1.1209071040153504,
"epoch": 2.3461321287408246,
"grad_norm": 74.98118591308594,
"learning_rate": 4.513564174864789e-06,
"loss": 0.4801,
"mean_token_accuracy": 0.854029405117035,
"num_tokens": 33735838.0,
"step": 4155
},
{
"entropy": 1.158321738243103,
"epoch": 2.3489553924336533,
"grad_norm": 80.96183013916016,
"learning_rate": 4.512446096305924e-06,
"loss": 0.5152,
"mean_token_accuracy": 0.8451248645782471,
"num_tokens": 33776577.0,
"step": 4160
},
{
"entropy": 1.3016461849212646,
"epoch": 2.3517786561264824,
"grad_norm": 82.59611511230469,
"learning_rate": 4.511326926259672e-06,
"loss": 0.5629,
"mean_token_accuracy": 0.8303740382194519,
"num_tokens": 33817458.0,
"step": 4165
},
{
"entropy": 1.1589308381080627,
"epoch": 2.354601919819311,
"grad_norm": 73.16849517822266,
"learning_rate": 4.510206665606467e-06,
"loss": 0.5079,
"mean_token_accuracy": 0.8437391996383667,
"num_tokens": 33858157.0,
"step": 4170
},
{
"entropy": 1.0495543360710144,
"epoch": 2.35742518351214,
"grad_norm": 74.79431915283203,
"learning_rate": 4.509085315227606e-06,
"loss": 0.5483,
"mean_token_accuracy": 0.8367022156715394,
"num_tokens": 33898868.0,
"step": 4175
},
{
"entropy": 1.1106764674186707,
"epoch": 2.360248447204969,
"grad_norm": 85.37079620361328,
"learning_rate": 4.507962876005241e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8417307376861572,
"num_tokens": 33939206.0,
"step": 4180
},
{
"entropy": 1.135885977745056,
"epoch": 2.3630717108977977,
"grad_norm": 93.99333190917969,
"learning_rate": 4.506839348822384e-06,
"loss": 0.5443,
"mean_token_accuracy": 0.8377628087997436,
"num_tokens": 33979854.0,
"step": 4185
},
{
"entropy": 1.1180438637733459,
"epoch": 2.365894974590627,
"grad_norm": 60.93057632446289,
"learning_rate": 4.5057147345628985e-06,
"loss": 0.5308,
"mean_token_accuracy": 0.8384897470474243,
"num_tokens": 34020341.0,
"step": 4190
},
{
"entropy": 1.1248787641525269,
"epoch": 2.3687182382834555,
"grad_norm": 72.28102111816406,
"learning_rate": 4.504589034111505e-06,
"loss": 0.4869,
"mean_token_accuracy": 0.8506036043167114,
"num_tokens": 34061022.0,
"step": 4195
},
{
"entropy": 1.140468454360962,
"epoch": 2.3715415019762847,
"grad_norm": 78.96949005126953,
"learning_rate": 4.503462248353781e-06,
"loss": 0.5261,
"mean_token_accuracy": 0.8412546992301941,
"num_tokens": 34101705.0,
"step": 4200
},
{
"entropy": 1.1987942218780518,
"epoch": 2.3743647656691134,
"grad_norm": 77.41453552246094,
"learning_rate": 4.5023343781761516e-06,
"loss": 0.5068,
"mean_token_accuracy": 0.8431342601776123,
"num_tokens": 34142280.0,
"step": 4205
},
{
"entropy": 1.128602349758148,
"epoch": 2.3771880293619425,
"grad_norm": 75.12047576904297,
"learning_rate": 4.501205424465902e-06,
"loss": 0.5026,
"mean_token_accuracy": 0.8446843266487122,
"num_tokens": 34181976.0,
"step": 4210
},
{
"entropy": 1.1833925724029541,
"epoch": 2.380011293054771,
"grad_norm": 75.71656799316406,
"learning_rate": 4.500075388111167e-06,
"loss": 0.5242,
"mean_token_accuracy": 0.8402136087417602,
"num_tokens": 34222876.0,
"step": 4215
},
{
"entropy": 1.1418941140174865,
"epoch": 2.3828345567476004,
"grad_norm": 73.26214599609375,
"learning_rate": 4.498944270000931e-06,
"loss": 0.4817,
"mean_token_accuracy": 0.8517927408218384,
"num_tokens": 34263315.0,
"step": 4220
},
{
"entropy": 1.1809669256210327,
"epoch": 2.385657820440429,
"grad_norm": 86.13017272949219,
"learning_rate": 4.497812071025031e-06,
"loss": 0.5071,
"mean_token_accuracy": 0.8443432688713074,
"num_tokens": 34303434.0,
"step": 4225
},
{
"entropy": 1.175835919380188,
"epoch": 2.388481084133258,
"grad_norm": 64.92132568359375,
"learning_rate": 4.496678792074157e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8411461234092712,
"num_tokens": 34344189.0,
"step": 4230
},
{
"entropy": 1.0863433599472045,
"epoch": 2.391304347826087,
"grad_norm": 76.76641845703125,
"learning_rate": 4.495544434039843e-06,
"loss": 0.4991,
"mean_token_accuracy": 0.8467756628990173,
"num_tokens": 34384662.0,
"step": 4235
},
{
"entropy": 1.1002304792404174,
"epoch": 2.3941276115189156,
"grad_norm": 66.72636413574219,
"learning_rate": 4.494408997814478e-06,
"loss": 0.4921,
"mean_token_accuracy": 0.8480172276496887,
"num_tokens": 34425282.0,
"step": 4240
},
{
"entropy": 1.0731504201889037,
"epoch": 2.3969508752117448,
"grad_norm": 72.19840240478516,
"learning_rate": 4.493272484291293e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8504210710525513,
"num_tokens": 34465824.0,
"step": 4245
},
{
"entropy": 1.068647289276123,
"epoch": 2.399774138904574,
"grad_norm": 74.20718383789062,
"learning_rate": 4.4921348943643736e-06,
"loss": 0.4766,
"mean_token_accuracy": 0.8515720963478088,
"num_tokens": 34506561.0,
"step": 4250
},
{
"entropy": 1.1240432024002076,
"epoch": 2.4025974025974026,
"grad_norm": 78.96634674072266,
"learning_rate": 4.490996228928645e-06,
"loss": 0.5126,
"mean_token_accuracy": 0.8429255843162536,
"num_tokens": 34547280.0,
"step": 4255
},
{
"entropy": 1.026216447353363,
"epoch": 2.4054206662902313,
"grad_norm": 69.10794067382812,
"learning_rate": 4.489856488879882e-06,
"loss": 0.4543,
"mean_token_accuracy": 0.8586987733840943,
"num_tokens": 34587754.0,
"step": 4260
},
{
"entropy": 1.0782729625701903,
"epoch": 2.4082439299830605,
"grad_norm": 74.8205337524414,
"learning_rate": 4.488715675114706e-06,
"loss": 0.5149,
"mean_token_accuracy": 0.8438615202903748,
"num_tokens": 34628435.0,
"step": 4265
},
{
"entropy": 1.1428865432739257,
"epoch": 2.411067193675889,
"grad_norm": 74.1771011352539,
"learning_rate": 4.4875737885305825e-06,
"loss": 0.5406,
"mean_token_accuracy": 0.8364495754241943,
"num_tokens": 34669206.0,
"step": 4270
},
{
"entropy": 1.106465208530426,
"epoch": 2.4138904573687183,
"grad_norm": 65.03327178955078,
"learning_rate": 4.486430830025818e-06,
"loss": 0.5122,
"mean_token_accuracy": 0.8434538841247559,
"num_tokens": 34709829.0,
"step": 4275
},
{
"entropy": 1.099475383758545,
"epoch": 2.416713721061547,
"grad_norm": 75.65325164794922,
"learning_rate": 4.485286800499564e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8493196606636048,
"num_tokens": 34750478.0,
"step": 4280
},
{
"entropy": 1.1192373633384705,
"epoch": 2.419536984754376,
"grad_norm": 91.78784942626953,
"learning_rate": 4.484141700851819e-06,
"loss": 0.5286,
"mean_token_accuracy": 0.841631555557251,
"num_tokens": 34791351.0,
"step": 4285
},
{
"entropy": 1.1912834763526916,
"epoch": 2.422360248447205,
"grad_norm": 60.360713958740234,
"learning_rate": 4.482995531983414e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8427740693092346,
"num_tokens": 34831811.0,
"step": 4290
},
{
"entropy": 1.1366937160491943,
"epoch": 2.425183512140034,
"grad_norm": 73.9558334350586,
"learning_rate": 4.48184829479603e-06,
"loss": 0.5532,
"mean_token_accuracy": 0.8314054369926452,
"num_tokens": 34872689.0,
"step": 4295
},
{
"entropy": 1.0390868544578553,
"epoch": 2.4280067758328627,
"grad_norm": 74.2223892211914,
"learning_rate": 4.480699990192184e-06,
"loss": 0.4752,
"mean_token_accuracy": 0.8538849472999572,
"num_tokens": 34913355.0,
"step": 4300
},
{
"entropy": 1.1691401720046997,
"epoch": 2.430830039525692,
"grad_norm": 70.82209777832031,
"learning_rate": 4.479550619075233e-06,
"loss": 0.5103,
"mean_token_accuracy": 0.843414044380188,
"num_tokens": 34953912.0,
"step": 4305
},
{
"entropy": 1.032659935951233,
"epoch": 2.4336533032185206,
"grad_norm": 75.93842315673828,
"learning_rate": 4.478400182349374e-06,
"loss": 0.4923,
"mean_token_accuracy": 0.848256242275238,
"num_tokens": 34994417.0,
"step": 4310
},
{
"entropy": 1.110754704475403,
"epoch": 2.4364765669113497,
"grad_norm": 84.19135284423828,
"learning_rate": 4.477248680919643e-06,
"loss": 0.5161,
"mean_token_accuracy": 0.8396423697471619,
"num_tokens": 35034735.0,
"step": 4315
},
{
"entropy": 1.1798797607421876,
"epoch": 2.4392998306041784,
"grad_norm": 83.10535430908203,
"learning_rate": 4.476096115691909e-06,
"loss": 0.5277,
"mean_token_accuracy": 0.8388562560081482,
"num_tokens": 35075100.0,
"step": 4320
},
{
"entropy": 1.1197029232978821,
"epoch": 2.442123094297007,
"grad_norm": 74.39781188964844,
"learning_rate": 4.474942487572886e-06,
"loss": 0.478,
"mean_token_accuracy": 0.852222740650177,
"num_tokens": 35115732.0,
"step": 4325
},
{
"entropy": 1.1389746904373168,
"epoch": 2.4449463579898363,
"grad_norm": 77.96908569335938,
"learning_rate": 4.473787797470117e-06,
"loss": 0.5159,
"mean_token_accuracy": 0.8402628898620605,
"num_tokens": 35156421.0,
"step": 4330
},
{
"entropy": 1.121785569190979,
"epoch": 2.447769621682665,
"grad_norm": 77.48990631103516,
"learning_rate": 4.472632046291984e-06,
"loss": 0.5153,
"mean_token_accuracy": 0.8415657639503479,
"num_tokens": 35196916.0,
"step": 4335
},
{
"entropy": 1.0282376170158387,
"epoch": 2.450592885375494,
"grad_norm": 85.62969970703125,
"learning_rate": 4.471475234947701e-06,
"loss": 0.4711,
"mean_token_accuracy": 0.8563955187797546,
"num_tokens": 35237302.0,
"step": 4340
},
{
"entropy": 1.0324091553688048,
"epoch": 2.453416149068323,
"grad_norm": 73.95626831054688,
"learning_rate": 4.470317364347321e-06,
"loss": 0.4628,
"mean_token_accuracy": 0.856763768196106,
"num_tokens": 35278024.0,
"step": 4345
},
{
"entropy": 1.2146705389022827,
"epoch": 2.456239412761152,
"grad_norm": 90.1251220703125,
"learning_rate": 4.469158435401723e-06,
"loss": 0.5288,
"mean_token_accuracy": 0.8385349392890931,
"num_tokens": 35317872.0,
"step": 4350
},
{
"entropy": 1.168654775619507,
"epoch": 2.4590626764539807,
"grad_norm": 66.60228729248047,
"learning_rate": 4.467998449022626e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.847309160232544,
"num_tokens": 35358677.0,
"step": 4355
},
{
"entropy": 1.1382317781448363,
"epoch": 2.46188594014681,
"grad_norm": 74.64222717285156,
"learning_rate": 4.466837406122576e-06,
"loss": 0.512,
"mean_token_accuracy": 0.8408301472663879,
"num_tokens": 35399398.0,
"step": 4360
},
{
"entropy": 1.1394990921020507,
"epoch": 2.4647092038396385,
"grad_norm": 77.15666198730469,
"learning_rate": 4.465675307614952e-06,
"loss": 0.5254,
"mean_token_accuracy": 0.840006697177887,
"num_tokens": 35440245.0,
"step": 4365
},
{
"entropy": 1.1206412315368652,
"epoch": 2.4675324675324677,
"grad_norm": 66.12518310546875,
"learning_rate": 4.464512154413963e-06,
"loss": 0.4988,
"mean_token_accuracy": 0.8488071084022522,
"num_tokens": 35480513.0,
"step": 4370
},
{
"entropy": 1.2848740220069885,
"epoch": 2.4703557312252964,
"grad_norm": 75.88241577148438,
"learning_rate": 4.463347947434647e-06,
"loss": 0.5434,
"mean_token_accuracy": 0.8377517104148865,
"num_tokens": 35521108.0,
"step": 4375
},
{
"entropy": 1.170305109024048,
"epoch": 2.4731789949181255,
"grad_norm": 76.86499786376953,
"learning_rate": 4.462182687592875e-06,
"loss": 0.5246,
"mean_token_accuracy": 0.8381713032722473,
"num_tokens": 35561905.0,
"step": 4380
},
{
"entropy": 1.0816607832908631,
"epoch": 2.4760022586109542,
"grad_norm": 69.17800903320312,
"learning_rate": 4.4610163758053385e-06,
"loss": 0.4845,
"mean_token_accuracy": 0.8541519999504089,
"num_tokens": 35602692.0,
"step": 4385
},
{
"entropy": 1.1398390531539917,
"epoch": 2.478825522303783,
"grad_norm": 85.82377624511719,
"learning_rate": 4.459849012989564e-06,
"loss": 0.5134,
"mean_token_accuracy": 0.8435853004455567,
"num_tokens": 35643253.0,
"step": 4390
},
{
"entropy": 1.1162615537643432,
"epoch": 2.481648785996612,
"grad_norm": 72.97199249267578,
"learning_rate": 4.458680600063902e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8486401915550232,
"num_tokens": 35684015.0,
"step": 4395
},
{
"entropy": 1.1746024131774901,
"epoch": 2.4844720496894412,
"grad_norm": 78.95182800292969,
"learning_rate": 4.457511137947528e-06,
"loss": 0.5611,
"mean_token_accuracy": 0.8314670324325562,
"num_tokens": 35724408.0,
"step": 4400
},
{
"entropy": 1.1452197551727294,
"epoch": 2.48729531338227,
"grad_norm": 75.60063171386719,
"learning_rate": 4.456340627560444e-06,
"loss": 0.5306,
"mean_token_accuracy": 0.8401831388473511,
"num_tokens": 35764983.0,
"step": 4405
},
{
"entropy": 1.1957187175750732,
"epoch": 2.4901185770750986,
"grad_norm": 72.1159439086914,
"learning_rate": 4.4551690698234774e-06,
"loss": 0.5057,
"mean_token_accuracy": 0.8457194209098816,
"num_tokens": 35805660.0,
"step": 4410
},
{
"entropy": 1.1455501317977905,
"epoch": 2.492941840767928,
"grad_norm": 62.46710205078125,
"learning_rate": 4.4539964656582795e-06,
"loss": 0.5132,
"mean_token_accuracy": 0.8429795026779174,
"num_tokens": 35846379.0,
"step": 4415
},
{
"entropy": 1.155365228652954,
"epoch": 2.4957651044607565,
"grad_norm": 74.61618041992188,
"learning_rate": 4.452822815987322e-06,
"loss": 0.5407,
"mean_token_accuracy": 0.8346656799316406,
"num_tokens": 35886996.0,
"step": 4420
},
{
"entropy": 1.139185333251953,
"epoch": 2.4985883681535856,
"grad_norm": 73.73220825195312,
"learning_rate": 4.4516481217339035e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8383039832115173,
"num_tokens": 35927734.0,
"step": 4425
},
{
"entropy": 1.0487828493118285,
"epoch": 2.5014116318464144,
"grad_norm": 82.3303451538086,
"learning_rate": 4.45047238382214e-06,
"loss": 0.5166,
"mean_token_accuracy": 0.84206303358078,
"num_tokens": 35968230.0,
"step": 4430
},
{
"entropy": 1.1584802865982056,
"epoch": 2.5042348955392435,
"grad_norm": 84.05821990966797,
"learning_rate": 4.449295603176972e-06,
"loss": 0.4825,
"mean_token_accuracy": 0.8500637769699096,
"num_tokens": 36008859.0,
"step": 4435
},
{
"entropy": 1.091279971599579,
"epoch": 2.507058159232072,
"grad_norm": 70.52880859375,
"learning_rate": 4.448117780724157e-06,
"loss": 0.4574,
"mean_token_accuracy": 0.8563731193542481,
"num_tokens": 36049479.0,
"step": 4440
},
{
"entropy": 1.172715425491333,
"epoch": 2.5098814229249014,
"grad_norm": 76.75308227539062,
"learning_rate": 4.446938917390276e-06,
"loss": 0.505,
"mean_token_accuracy": 0.8474242210388183,
"num_tokens": 36090217.0,
"step": 4445
},
{
"entropy": 1.1646355390548706,
"epoch": 2.51270468661773,
"grad_norm": 74.46258544921875,
"learning_rate": 4.445759014102726e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.8501151442527771,
"num_tokens": 36130610.0,
"step": 4450
},
{
"entropy": 1.1875864028930665,
"epoch": 2.5155279503105588,
"grad_norm": 80.89794158935547,
"learning_rate": 4.444578071789724e-06,
"loss": 0.507,
"mean_token_accuracy": 0.8434122800827026,
"num_tokens": 36171102.0,
"step": 4455
},
{
"entropy": 1.1913317441940308,
"epoch": 2.518351214003388,
"grad_norm": 70.1068344116211,
"learning_rate": 4.443396091380301e-06,
"loss": 0.532,
"mean_token_accuracy": 0.8388489723205567,
"num_tokens": 36211713.0,
"step": 4460
},
{
"entropy": 1.0263278841972352,
"epoch": 2.521174477696217,
"grad_norm": 68.75521850585938,
"learning_rate": 4.4422130738043085e-06,
"loss": 0.4724,
"mean_token_accuracy": 0.8537720799446106,
"num_tokens": 36252422.0,
"step": 4465
},
{
"entropy": 1.1719030618667603,
"epoch": 2.5239977413890458,
"grad_norm": 74.74054718017578,
"learning_rate": 4.4410290199924124e-06,
"loss": 0.5065,
"mean_token_accuracy": 0.8453158020973206,
"num_tokens": 36292740.0,
"step": 4470
},
{
"entropy": 1.153631329536438,
"epoch": 2.5268210050818745,
"grad_norm": 69.42818450927734,
"learning_rate": 4.439843930876093e-06,
"loss": 0.4845,
"mean_token_accuracy": 0.8510221719741822,
"num_tokens": 36333469.0,
"step": 4475
},
{
"entropy": 1.1073312044143677,
"epoch": 2.5296442687747036,
"grad_norm": 73.57260131835938,
"learning_rate": 4.4386578073876475e-06,
"loss": 0.4914,
"mean_token_accuracy": 0.850137197971344,
"num_tokens": 36374242.0,
"step": 4480
},
{
"entropy": 1.1316781520843506,
"epoch": 2.5324675324675323,
"grad_norm": 72.3252944946289,
"learning_rate": 4.437470650460183e-06,
"loss": 0.5267,
"mean_token_accuracy": 0.8415279865264893,
"num_tokens": 36414827.0,
"step": 4485
},
{
"entropy": 1.1887487888336181,
"epoch": 2.5352907961603615,
"grad_norm": 74.52176666259766,
"learning_rate": 4.4362824610276234e-06,
"loss": 0.5024,
"mean_token_accuracy": 0.8490213751792908,
"num_tokens": 36455421.0,
"step": 4490
},
{
"entropy": 1.1459205865859985,
"epoch": 2.53811405985319,
"grad_norm": 71.28825378417969,
"learning_rate": 4.435093240024702e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8379881143569946,
"num_tokens": 36496056.0,
"step": 4495
},
{
"entropy": 0.9882930994033814,
"epoch": 2.5409373235460193,
"grad_norm": 66.19135284423828,
"learning_rate": 4.433902988386966e-06,
"loss": 0.4861,
"mean_token_accuracy": 0.8508020997047424,
"num_tokens": 36536748.0,
"step": 4500
},
{
"epoch": 2.5409373235460193,
"eval_entropy": 1.29656081199646,
"eval_loss": 0.3836705982685089,
"eval_mean_token_accuracy": 0.8863630533218384,
"eval_num_tokens": 36536748.0,
"eval_runtime": 2.456,
"eval_samples_per_second": 15.879,
"eval_steps_per_second": 2.036,
"step": 4500
},
{
"entropy": 1.126239013671875,
"epoch": 2.543760587238848,
"grad_norm": 78.8668441772461,
"learning_rate": 4.432711707050772e-06,
"loss": 0.499,
"mean_token_accuracy": 0.8451325535774231,
"num_tokens": 36577256.0,
"step": 4505
},
{
"entropy": 1.1737711906433106,
"epoch": 2.546583850931677,
"grad_norm": 77.37610626220703,
"learning_rate": 4.431519396953287e-06,
"loss": 0.506,
"mean_token_accuracy": 0.8457542061805725,
"num_tokens": 36617975.0,
"step": 4510
},
{
"entropy": 1.240240716934204,
"epoch": 2.549407114624506,
"grad_norm": 97.81411743164062,
"learning_rate": 4.430326059032486e-06,
"loss": 0.5605,
"mean_token_accuracy": 0.8328823566436767,
"num_tokens": 36658764.0,
"step": 4515
},
{
"entropy": 1.1290704488754273,
"epoch": 2.5522303783173346,
"grad_norm": 67.75629425048828,
"learning_rate": 4.429131694227155e-06,
"loss": 0.4866,
"mean_token_accuracy": 0.8508628010749817,
"num_tokens": 36699563.0,
"step": 4520
},
{
"entropy": 1.0336142778396606,
"epoch": 2.5550536420101637,
"grad_norm": 64.22755432128906,
"learning_rate": 4.427936303476886e-06,
"loss": 0.4984,
"mean_token_accuracy": 0.8513097763061523,
"num_tokens": 36740143.0,
"step": 4525
},
{
"entropy": 1.1085405230522156,
"epoch": 2.557876905702993,
"grad_norm": 81.93550872802734,
"learning_rate": 4.426739887722079e-06,
"loss": 0.5394,
"mean_token_accuracy": 0.83584965467453,
"num_tokens": 36780717.0,
"step": 4530
},
{
"entropy": 1.146517848968506,
"epoch": 2.5607001693958216,
"grad_norm": 68.24474334716797,
"learning_rate": 4.4255424479039414e-06,
"loss": 0.5027,
"mean_token_accuracy": 0.8451008915901184,
"num_tokens": 36821365.0,
"step": 4535
},
{
"entropy": 1.1648792028427124,
"epoch": 2.5635234330886503,
"grad_norm": 71.85755157470703,
"learning_rate": 4.424343984964483e-06,
"loss": 0.4687,
"mean_token_accuracy": 0.8573217153549194,
"num_tokens": 36862029.0,
"step": 4540
},
{
"entropy": 1.3023412704467774,
"epoch": 2.5663466967814794,
"grad_norm": 77.11023712158203,
"learning_rate": 4.42314449984652e-06,
"loss": 0.5796,
"mean_token_accuracy": 0.8275646448135376,
"num_tokens": 36902879.0,
"step": 4545
},
{
"entropy": 1.1683407545089721,
"epoch": 2.5691699604743086,
"grad_norm": 83.67930603027344,
"learning_rate": 4.421943993493676e-06,
"loss": 0.5168,
"mean_token_accuracy": 0.8405487775802613,
"num_tokens": 36943275.0,
"step": 4550
},
{
"entropy": 1.1550110578536987,
"epoch": 2.5719932241671373,
"grad_norm": 82.8023452758789,
"learning_rate": 4.4207424668503715e-06,
"loss": 0.535,
"mean_token_accuracy": 0.8371553182601928,
"num_tokens": 36984050.0,
"step": 4555
},
{
"entropy": 1.1033604502677918,
"epoch": 2.574816487859966,
"grad_norm": 66.57779693603516,
"learning_rate": 4.4195399208618354e-06,
"loss": 0.5156,
"mean_token_accuracy": 0.8456992626190185,
"num_tokens": 37024707.0,
"step": 4560
},
{
"entropy": 1.1740542650222778,
"epoch": 2.577639751552795,
"grad_norm": 78.03885650634766,
"learning_rate": 4.418336356474097e-06,
"loss": 0.5059,
"mean_token_accuracy": 0.8445262908935547,
"num_tokens": 37065406.0,
"step": 4565
},
{
"entropy": 1.096956205368042,
"epoch": 2.580463015245624,
"grad_norm": 75.66728973388672,
"learning_rate": 4.4171317746339846e-06,
"loss": 0.5026,
"mean_token_accuracy": 0.844276738166809,
"num_tokens": 37105960.0,
"step": 4570
},
{
"entropy": 0.9889927744865418,
"epoch": 2.583286278938453,
"grad_norm": 59.227474212646484,
"learning_rate": 4.415926176289128e-06,
"loss": 0.4506,
"mean_token_accuracy": 0.8623033881187439,
"num_tokens": 37146789.0,
"step": 4575
},
{
"entropy": 1.1664816856384277,
"epoch": 2.5861095426312817,
"grad_norm": 70.5433120727539,
"learning_rate": 4.414719562387959e-06,
"loss": 0.5092,
"mean_token_accuracy": 0.8431604027748107,
"num_tokens": 37187240.0,
"step": 4580
},
{
"entropy": 1.1500779867172242,
"epoch": 2.588932806324111,
"grad_norm": 76.07451629638672,
"learning_rate": 4.413511933879705e-06,
"loss": 0.54,
"mean_token_accuracy": 0.8370203971862793,
"num_tokens": 37227801.0,
"step": 4585
},
{
"entropy": 1.150613832473755,
"epoch": 2.5917560700169395,
"grad_norm": 65.93510437011719,
"learning_rate": 4.412303291714394e-06,
"loss": 0.518,
"mean_token_accuracy": 0.8397469758987427,
"num_tokens": 37268477.0,
"step": 4590
},
{
"entropy": 1.0623825907707214,
"epoch": 2.5945793337097687,
"grad_norm": 59.88905334472656,
"learning_rate": 4.4110936368428505e-06,
"loss": 0.4683,
"mean_token_accuracy": 0.8573758602142334,
"num_tokens": 37309299.0,
"step": 4595
},
{
"entropy": 1.1400325775146485,
"epoch": 2.5974025974025974,
"grad_norm": 81.33668518066406,
"learning_rate": 4.4098829702166945e-06,
"loss": 0.5115,
"mean_token_accuracy": 0.8455300331115723,
"num_tokens": 37350031.0,
"step": 4600
},
{
"entropy": 1.1422762393951416,
"epoch": 2.600225861095426,
"grad_norm": 72.15546417236328,
"learning_rate": 4.408671292788343e-06,
"loss": 0.5096,
"mean_token_accuracy": 0.8442867636680603,
"num_tokens": 37390863.0,
"step": 4605
},
{
"entropy": 1.0977005004882812,
"epoch": 2.6030491247882552,
"grad_norm": 72.4885482788086,
"learning_rate": 4.40745860551101e-06,
"loss": 0.5053,
"mean_token_accuracy": 0.8472009539604187,
"num_tokens": 37431621.0,
"step": 4610
},
{
"entropy": 1.159875512123108,
"epoch": 2.6058723884810844,
"grad_norm": 78.24819946289062,
"learning_rate": 4.4062449093387e-06,
"loss": 0.5253,
"mean_token_accuracy": 0.8395026326179504,
"num_tokens": 37472515.0,
"step": 4615
},
{
"entropy": 1.0633836030960082,
"epoch": 2.608695652173913,
"grad_norm": 68.64786529541016,
"learning_rate": 4.405030205226217e-06,
"loss": 0.504,
"mean_token_accuracy": 0.8456165671348572,
"num_tokens": 37513418.0,
"step": 4620
},
{
"entropy": 1.1453440189361572,
"epoch": 2.611518915866742,
"grad_norm": 72.81915283203125,
"learning_rate": 4.40381449412915e-06,
"loss": 0.5209,
"mean_token_accuracy": 0.8431556344032287,
"num_tokens": 37554168.0,
"step": 4625
},
{
"entropy": 1.1775591611862182,
"epoch": 2.614342179559571,
"grad_norm": 67.98880004882812,
"learning_rate": 4.402597777003886e-06,
"loss": 0.4841,
"mean_token_accuracy": 0.8511347770690918,
"num_tokens": 37594939.0,
"step": 4630
},
{
"entropy": 1.1267094373703004,
"epoch": 2.6171654432523996,
"grad_norm": 72.29106140136719,
"learning_rate": 4.401380054807603e-06,
"loss": 0.528,
"mean_token_accuracy": 0.8410808086395264,
"num_tokens": 37635516.0,
"step": 4635
},
{
"entropy": 1.1401848554611207,
"epoch": 2.619988706945229,
"grad_norm": 84.17682647705078,
"learning_rate": 4.400161328498269e-06,
"loss": 0.5462,
"mean_token_accuracy": 0.8344476222991943,
"num_tokens": 37676273.0,
"step": 4640
},
{
"entropy": 1.138647985458374,
"epoch": 2.6228119706380575,
"grad_norm": 82.6788330078125,
"learning_rate": 4.398941599034639e-06,
"loss": 0.5338,
"mean_token_accuracy": 0.8394053339958191,
"num_tokens": 37716664.0,
"step": 4645
},
{
"entropy": 1.2400930404663086,
"epoch": 2.6256352343308866,
"grad_norm": 75.30484008789062,
"learning_rate": 4.397720867376262e-06,
"loss": 0.4985,
"mean_token_accuracy": 0.8481818795204162,
"num_tokens": 37757396.0,
"step": 4650
},
{
"entropy": 1.2581698894500732,
"epoch": 2.6284584980237153,
"grad_norm": 84.51012420654297,
"learning_rate": 4.396499134483472e-06,
"loss": 0.5403,
"mean_token_accuracy": 0.8340572237968444,
"num_tokens": 37797918.0,
"step": 4655
},
{
"entropy": 1.1882836818695068,
"epoch": 2.6312817617165445,
"grad_norm": 70.45535278320312,
"learning_rate": 4.395276401317392e-06,
"loss": 0.4958,
"mean_token_accuracy": 0.8483729481697082,
"num_tokens": 37838614.0,
"step": 4660
},
{
"entropy": 1.1779079675674438,
"epoch": 2.634105025409373,
"grad_norm": 75.23043060302734,
"learning_rate": 4.394052668839931e-06,
"loss": 0.504,
"mean_token_accuracy": 0.8456653475761413,
"num_tokens": 37879171.0,
"step": 4665
},
{
"entropy": 1.1830149650573731,
"epoch": 2.636928289102202,
"grad_norm": 75.51518249511719,
"learning_rate": 4.392827938013786e-06,
"loss": 0.5183,
"mean_token_accuracy": 0.8426408410072327,
"num_tokens": 37919636.0,
"step": 4670
},
{
"entropy": 1.1361161470413208,
"epoch": 2.639751552795031,
"grad_norm": 81.5411376953125,
"learning_rate": 4.3916022098024395e-06,
"loss": 0.4845,
"mean_token_accuracy": 0.8498412609100342,
"num_tokens": 37960560.0,
"step": 4675
},
{
"entropy": 1.1308854579925538,
"epoch": 2.64257481648786,
"grad_norm": 77.25973510742188,
"learning_rate": 4.390375485170154e-06,
"loss": 0.5472,
"mean_token_accuracy": 0.8356089353561401,
"num_tokens": 38001197.0,
"step": 4680
},
{
"entropy": 1.222540831565857,
"epoch": 2.645398080180689,
"grad_norm": 76.51905059814453,
"learning_rate": 4.3891477650819805e-06,
"loss": 0.5798,
"mean_token_accuracy": 0.8243926763534546,
"num_tokens": 38041972.0,
"step": 4685
},
{
"entropy": 1.131513738632202,
"epoch": 2.6482213438735176,
"grad_norm": 81.08806610107422,
"learning_rate": 4.387919050503754e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8490037560462952,
"num_tokens": 38082599.0,
"step": 4690
},
{
"entropy": 1.0864753246307373,
"epoch": 2.6510446075663467,
"grad_norm": 76.36005401611328,
"learning_rate": 4.386689342402086e-06,
"loss": 0.4748,
"mean_token_accuracy": 0.8526203989982605,
"num_tokens": 38123268.0,
"step": 4695
},
{
"entropy": 1.1534552812576293,
"epoch": 2.6538678712591754,
"grad_norm": 82.96961975097656,
"learning_rate": 4.385458641744376e-06,
"loss": 0.5053,
"mean_token_accuracy": 0.8451831459999084,
"num_tokens": 38164005.0,
"step": 4700
},
{
"entropy": 1.2800405502319336,
"epoch": 2.6566911349520046,
"grad_norm": 89.06993865966797,
"learning_rate": 4.3842269494988005e-06,
"loss": 0.564,
"mean_token_accuracy": 0.8315343141555787,
"num_tokens": 38204063.0,
"step": 4705
},
{
"entropy": 1.168971014022827,
"epoch": 2.6595143986448333,
"grad_norm": 72.02811431884766,
"learning_rate": 4.382994266634317e-06,
"loss": 0.5092,
"mean_token_accuracy": 0.8427629709243775,
"num_tokens": 38244627.0,
"step": 4710
},
{
"entropy": 1.1421557903289794,
"epoch": 2.6623376623376624,
"grad_norm": 76.09336853027344,
"learning_rate": 4.381760594120664e-06,
"loss": 0.4969,
"mean_token_accuracy": 0.8478555679321289,
"num_tokens": 38285225.0,
"step": 4715
},
{
"entropy": 1.2609493494033814,
"epoch": 2.665160926030491,
"grad_norm": 80.14908599853516,
"learning_rate": 4.380525932928355e-06,
"loss": 0.5985,
"mean_token_accuracy": 0.8232489943504333,
"num_tokens": 38325900.0,
"step": 4720
},
{
"entropy": 1.1968196153640747,
"epoch": 2.6679841897233203,
"grad_norm": 80.70729064941406,
"learning_rate": 4.379290284028685e-06,
"loss": 0.5013,
"mean_token_accuracy": 0.847633171081543,
"num_tokens": 38366526.0,
"step": 4725
},
{
"entropy": 1.087080430984497,
"epoch": 2.670807453416149,
"grad_norm": 74.0415267944336,
"learning_rate": 4.378053648393724e-06,
"loss": 0.4941,
"mean_token_accuracy": 0.8482417821884155,
"num_tokens": 38407344.0,
"step": 4730
},
{
"entropy": 1.0690407276153564,
"epoch": 2.6736307171089777,
"grad_norm": 89.44586181640625,
"learning_rate": 4.376816026996317e-06,
"loss": 0.5069,
"mean_token_accuracy": 0.8457733273506165,
"num_tokens": 38448103.0,
"step": 4735
},
{
"entropy": 1.196146512031555,
"epoch": 2.676453980801807,
"grad_norm": 67.38997650146484,
"learning_rate": 4.375577420810089e-06,
"loss": 0.5479,
"mean_token_accuracy": 0.8355863332748413,
"num_tokens": 38488784.0,
"step": 4740
},
{
"entropy": 1.1895936965942382,
"epoch": 2.679277244494636,
"grad_norm": 80.26905059814453,
"learning_rate": 4.374337830809434e-06,
"loss": 0.5418,
"mean_token_accuracy": 0.8347990155220032,
"num_tokens": 38529528.0,
"step": 4745
},
{
"entropy": 1.1369389533996581,
"epoch": 2.6821005081874647,
"grad_norm": 81.48678588867188,
"learning_rate": 4.373097257969523e-06,
"loss": 0.5226,
"mean_token_accuracy": 0.8407035112380982,
"num_tokens": 38570284.0,
"step": 4750
},
{
"entropy": 1.1514408111572265,
"epoch": 2.6849237718802934,
"grad_norm": 70.67035675048828,
"learning_rate": 4.3718557032663025e-06,
"loss": 0.5391,
"mean_token_accuracy": 0.8364932179450989,
"num_tokens": 38611100.0,
"step": 4755
},
{
"entropy": 1.2531227827072144,
"epoch": 2.6877470355731226,
"grad_norm": 72.68204498291016,
"learning_rate": 4.370613167676486e-06,
"loss": 0.5643,
"mean_token_accuracy": 0.8283812403678894,
"num_tokens": 38651605.0,
"step": 4760
},
{
"entropy": 1.1133297443389893,
"epoch": 2.6905702992659517,
"grad_norm": 82.70066833496094,
"learning_rate": 4.369369652177563e-06,
"loss": 0.559,
"mean_token_accuracy": 0.8325229167938233,
"num_tokens": 38692228.0,
"step": 4765
},
{
"entropy": 1.1255622148513793,
"epoch": 2.6933935629587804,
"grad_norm": 74.11168670654297,
"learning_rate": 4.368125157747792e-06,
"loss": 0.5061,
"mean_token_accuracy": 0.8454763174057007,
"num_tokens": 38732867.0,
"step": 4770
},
{
"entropy": 1.0742725372314452,
"epoch": 2.696216826651609,
"grad_norm": 68.80258178710938,
"learning_rate": 4.366879685366202e-06,
"loss": 0.4704,
"mean_token_accuracy": 0.8558419108390808,
"num_tokens": 38773697.0,
"step": 4775
},
{
"entropy": 1.0947855234146118,
"epoch": 2.6990400903444383,
"grad_norm": 85.78025817871094,
"learning_rate": 4.365633236012592e-06,
"loss": 0.4765,
"mean_token_accuracy": 0.8518133997917176,
"num_tokens": 38814311.0,
"step": 4780
},
{
"entropy": 1.1548483610153197,
"epoch": 2.701863354037267,
"grad_norm": 70.87236785888672,
"learning_rate": 4.364385810667528e-06,
"loss": 0.5273,
"mean_token_accuracy": 0.8392892003059387,
"num_tokens": 38854978.0,
"step": 4785
},
{
"entropy": 1.131790852546692,
"epoch": 2.704686617730096,
"grad_norm": 74.22408294677734,
"learning_rate": 4.363137410312345e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8489778161048889,
"num_tokens": 38895851.0,
"step": 4790
},
{
"entropy": 1.1404380559921266,
"epoch": 2.707509881422925,
"grad_norm": 78.29373931884766,
"learning_rate": 4.361888035929144e-06,
"loss": 0.4853,
"mean_token_accuracy": 0.8509568572044373,
"num_tokens": 38936622.0,
"step": 4795
},
{
"entropy": 1.1203672647476197,
"epoch": 2.710333145115754,
"grad_norm": 81.51094818115234,
"learning_rate": 4.360637688500795e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.8473028421401978,
"num_tokens": 38977231.0,
"step": 4800
},
{
"entropy": 1.191361141204834,
"epoch": 2.7131564088085827,
"grad_norm": 82.80384063720703,
"learning_rate": 4.35938636901093e-06,
"loss": 0.5602,
"mean_token_accuracy": 0.8294562697410583,
"num_tokens": 39017864.0,
"step": 4805
},
{
"entropy": 1.1114810943603515,
"epoch": 2.715979672501412,
"grad_norm": 67.35503387451172,
"learning_rate": 4.358134078443948e-06,
"loss": 0.5224,
"mean_token_accuracy": 0.8406103849411011,
"num_tokens": 39058519.0,
"step": 4810
},
{
"entropy": 1.1259804010391234,
"epoch": 2.7188029361942405,
"grad_norm": 73.83805084228516,
"learning_rate": 4.35688081778501e-06,
"loss": 0.5203,
"mean_token_accuracy": 0.8408207297325134,
"num_tokens": 39099066.0,
"step": 4815
},
{
"entropy": 1.1642281055450439,
"epoch": 2.721626199887069,
"grad_norm": 69.04283905029297,
"learning_rate": 4.355626588020042e-06,
"loss": 0.5417,
"mean_token_accuracy": 0.8353393197059631,
"num_tokens": 39139740.0,
"step": 4820
},
{
"entropy": 1.144355297088623,
"epoch": 2.7244494635798984,
"grad_norm": 80.45765686035156,
"learning_rate": 4.354371390135732e-06,
"loss": 0.4747,
"mean_token_accuracy": 0.850922417640686,
"num_tokens": 39180427.0,
"step": 4825
},
{
"entropy": 1.097638154029846,
"epoch": 2.7272727272727275,
"grad_norm": 65.31901550292969,
"learning_rate": 4.3531152251195286e-06,
"loss": 0.5098,
"mean_token_accuracy": 0.8428983092308044,
"num_tokens": 39220839.0,
"step": 4830
},
{
"entropy": 1.0980995416641235,
"epoch": 2.730095990965556,
"grad_norm": 83.73595428466797,
"learning_rate": 4.351858093959642e-06,
"loss": 0.5119,
"mean_token_accuracy": 0.8436989903450012,
"num_tokens": 39261561.0,
"step": 4835
},
{
"entropy": 1.2633298873901366,
"epoch": 2.732919254658385,
"grad_norm": 88.29671478271484,
"learning_rate": 4.350599997645044e-06,
"loss": 0.5693,
"mean_token_accuracy": 0.828402316570282,
"num_tokens": 39302258.0,
"step": 4840
},
{
"entropy": 1.2692286968231201,
"epoch": 2.735742518351214,
"grad_norm": 71.21768188476562,
"learning_rate": 4.349340937165462e-06,
"loss": 0.5245,
"mean_token_accuracy": 0.8410576820373535,
"num_tokens": 39342662.0,
"step": 4845
},
{
"entropy": 1.2515573263168336,
"epoch": 2.7385657820440428,
"grad_norm": 81.95990753173828,
"learning_rate": 4.348080913511383e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8408760786056518,
"num_tokens": 39383426.0,
"step": 4850
},
{
"entropy": 1.178449559211731,
"epoch": 2.741389045736872,
"grad_norm": 71.170166015625,
"learning_rate": 4.3468199276740565e-06,
"loss": 0.5232,
"mean_token_accuracy": 0.8398305416107178,
"num_tokens": 39424010.0,
"step": 4855
},
{
"entropy": 1.1536587238311768,
"epoch": 2.7442123094297006,
"grad_norm": 76.049072265625,
"learning_rate": 4.3455579806454814e-06,
"loss": 0.5285,
"mean_token_accuracy": 0.8364124417304992,
"num_tokens": 39464190.0,
"step": 4860
},
{
"entropy": 1.1046669960021973,
"epoch": 2.7470355731225298,
"grad_norm": 76.04798126220703,
"learning_rate": 4.344295073418419e-06,
"loss": 0.5186,
"mean_token_accuracy": 0.8435781955718994,
"num_tokens": 39504820.0,
"step": 4865
},
{
"entropy": 1.1494717836380004,
"epoch": 2.7498588368153585,
"grad_norm": 78.44160461425781,
"learning_rate": 4.343031206986382e-06,
"loss": 0.535,
"mean_token_accuracy": 0.8405123353004456,
"num_tokens": 39545451.0,
"step": 4870
},
{
"entropy": 1.2063377141952514,
"epoch": 2.7526821005081876,
"grad_norm": 64.78626251220703,
"learning_rate": 4.3417663823436395e-06,
"loss": 0.5318,
"mean_token_accuracy": 0.8373534440994262,
"num_tokens": 39586024.0,
"step": 4875
},
{
"entropy": 1.2055204868316651,
"epoch": 2.7555053642010163,
"grad_norm": 79.77772521972656,
"learning_rate": 4.340500600485213e-06,
"loss": 0.5334,
"mean_token_accuracy": 0.8385087370872497,
"num_tokens": 39626675.0,
"step": 4880
},
{
"entropy": 1.2107580423355102,
"epoch": 2.758328627893845,
"grad_norm": 82.31134033203125,
"learning_rate": 4.339233862406882e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8484996676445007,
"num_tokens": 39667244.0,
"step": 4885
},
{
"entropy": 1.0674038529396057,
"epoch": 2.761151891586674,
"grad_norm": 79.40963745117188,
"learning_rate": 4.3379661691051695e-06,
"loss": 0.5143,
"mean_token_accuracy": 0.8448100090026855,
"num_tokens": 39707813.0,
"step": 4890
},
{
"entropy": 1.0879161715507508,
"epoch": 2.7639751552795033,
"grad_norm": 71.71478271484375,
"learning_rate": 4.3366975215773564e-06,
"loss": 0.4987,
"mean_token_accuracy": 0.8481094598770141,
"num_tokens": 39748415.0,
"step": 4895
},
{
"entropy": 1.1267621994018555,
"epoch": 2.766798418972332,
"grad_norm": 87.01432037353516,
"learning_rate": 4.335427920821474e-06,
"loss": 0.5392,
"mean_token_accuracy": 0.837181282043457,
"num_tokens": 39789148.0,
"step": 4900
},
{
"entropy": 1.1177945613861084,
"epoch": 2.7696216826651607,
"grad_norm": 76.5605239868164,
"learning_rate": 4.334157367836301e-06,
"loss": 0.5153,
"mean_token_accuracy": 0.845607578754425,
"num_tokens": 39829885.0,
"step": 4905
},
{
"entropy": 1.1427584886550903,
"epoch": 2.77244494635799,
"grad_norm": 68.80554962158203,
"learning_rate": 4.332885863621367e-06,
"loss": 0.5072,
"mean_token_accuracy": 0.8441779494285584,
"num_tokens": 39870465.0,
"step": 4910
},
{
"entropy": 1.111513876914978,
"epoch": 2.7752682100508186,
"grad_norm": 78.51546478271484,
"learning_rate": 4.331613409176948e-06,
"loss": 0.5141,
"mean_token_accuracy": 0.8416643738746643,
"num_tokens": 39911218.0,
"step": 4915
},
{
"entropy": 1.125947070121765,
"epoch": 2.7780914737436477,
"grad_norm": 100.48128509521484,
"learning_rate": 4.330340005504069e-06,
"loss": 0.5525,
"mean_token_accuracy": 0.8351372361183167,
"num_tokens": 39951838.0,
"step": 4920
},
{
"entropy": 1.1707452058792114,
"epoch": 2.7809147374364764,
"grad_norm": 76.68048858642578,
"learning_rate": 4.329065653604504e-06,
"loss": 0.5287,
"mean_token_accuracy": 0.8395331859588623,
"num_tokens": 39992459.0,
"step": 4925
},
{
"entropy": 1.0487358808517455,
"epoch": 2.7837380011293056,
"grad_norm": 82.47752380371094,
"learning_rate": 4.327790354480767e-06,
"loss": 0.4733,
"mean_token_accuracy": 0.8563852667808532,
"num_tokens": 40033259.0,
"step": 4930
},
{
"entropy": 1.1915284037590026,
"epoch": 2.7865612648221343,
"grad_norm": 69.0811767578125,
"learning_rate": 4.326514109136124e-06,
"loss": 0.5118,
"mean_token_accuracy": 0.8428994297981263,
"num_tokens": 40073922.0,
"step": 4935
},
{
"entropy": 1.2053372740745545,
"epoch": 2.7893845285149634,
"grad_norm": 84.73222351074219,
"learning_rate": 4.325236918574579e-06,
"loss": 0.5432,
"mean_token_accuracy": 0.8382733345031739,
"num_tokens": 40114429.0,
"step": 4940
},
{
"entropy": 1.251980471611023,
"epoch": 2.792207792207792,
"grad_norm": 79.34503173828125,
"learning_rate": 4.3239587838008854e-06,
"loss": 0.5302,
"mean_token_accuracy": 0.8375539660453797,
"num_tokens": 40155089.0,
"step": 4945
},
{
"entropy": 1.2567893505096435,
"epoch": 2.795031055900621,
"grad_norm": 73.0334701538086,
"learning_rate": 4.322679705820536e-06,
"loss": 0.575,
"mean_token_accuracy": 0.8274977684020997,
"num_tokens": 40195605.0,
"step": 4950
},
{
"entropy": 1.2066476345062256,
"epoch": 2.79785431959345,
"grad_norm": 75.87010192871094,
"learning_rate": 4.321399685639764e-06,
"loss": 0.525,
"mean_token_accuracy": 0.8371191024780273,
"num_tokens": 40236108.0,
"step": 4955
},
{
"entropy": 1.1763107776641846,
"epoch": 2.800677583286279,
"grad_norm": 74.75856018066406,
"learning_rate": 4.320118724265549e-06,
"loss": 0.5465,
"mean_token_accuracy": 0.8359721064567566,
"num_tokens": 40276738.0,
"step": 4960
},
{
"entropy": 1.0980966806411743,
"epoch": 2.803500846979108,
"grad_norm": 78.28801727294922,
"learning_rate": 4.318836822705607e-06,
"loss": 0.5079,
"mean_token_accuracy": 0.8445510983467102,
"num_tokens": 40317366.0,
"step": 4965
},
{
"entropy": 1.031636393070221,
"epoch": 2.8063241106719365,
"grad_norm": 73.09954071044922,
"learning_rate": 4.317553981968394e-06,
"loss": 0.4553,
"mean_token_accuracy": 0.8609274625778198,
"num_tokens": 40358076.0,
"step": 4970
},
{
"entropy": 1.161788272857666,
"epoch": 2.8091473743647657,
"grad_norm": 71.20985412597656,
"learning_rate": 4.3162702030631074e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8453732013702393,
"num_tokens": 40398739.0,
"step": 4975
},
{
"entropy": 1.1623048067092896,
"epoch": 2.811970638057595,
"grad_norm": 75.88655090332031,
"learning_rate": 4.314985486999679e-06,
"loss": 0.5438,
"mean_token_accuracy": 0.8361968874931336,
"num_tokens": 40439544.0,
"step": 4980
},
{
"entropy": 1.1252004146575927,
"epoch": 2.8147939017504235,
"grad_norm": 64.87108612060547,
"learning_rate": 4.313699834788781e-06,
"loss": 0.5267,
"mean_token_accuracy": 0.8420312166213989,
"num_tokens": 40480154.0,
"step": 4985
},
{
"entropy": 1.1458193778991699,
"epoch": 2.8176171654432522,
"grad_norm": 81.05680847167969,
"learning_rate": 4.312413247441819e-06,
"loss": 0.5012,
"mean_token_accuracy": 0.8471116781234741,
"num_tokens": 40519961.0,
"step": 4990
},
{
"entropy": 1.1213873147964477,
"epoch": 2.8204404291360814,
"grad_norm": 71.18330383300781,
"learning_rate": 4.311125725970938e-06,
"loss": 0.5273,
"mean_token_accuracy": 0.8355283141136169,
"num_tokens": 40560630.0,
"step": 4995
},
{
"entropy": 1.1988248109817505,
"epoch": 2.82326369282891,
"grad_norm": 94.21017456054688,
"learning_rate": 4.309837271389015e-06,
"loss": 0.5374,
"mean_token_accuracy": 0.8381911158561707,
"num_tokens": 40601177.0,
"step": 5000
},
{
"epoch": 2.82326369282891,
"eval_entropy": 1.349253487586975,
"eval_loss": 0.35211896896362305,
"eval_mean_token_accuracy": 0.8927880883216858,
"eval_num_tokens": 40601177.0,
"eval_runtime": 2.4557,
"eval_samples_per_second": 15.882,
"eval_steps_per_second": 2.036,
"step": 5000
},
{
"entropy": 1.1538643002510072,
"epoch": 2.8260869565217392,
"grad_norm": 74.06842803955078,
"learning_rate": 4.308547884709662e-06,
"loss": 0.501,
"mean_token_accuracy": 0.848921275138855,
"num_tokens": 40641757.0,
"step": 5005
},
{
"entropy": 1.1794851779937745,
"epoch": 2.828910220214568,
"grad_norm": 76.33892822265625,
"learning_rate": 4.307257566947225e-06,
"loss": 0.5416,
"mean_token_accuracy": 0.836717689037323,
"num_tokens": 40682470.0,
"step": 5010
},
{
"entropy": 1.1910365343093872,
"epoch": 2.831733483907397,
"grad_norm": 68.9273452758789,
"learning_rate": 4.30596631911678e-06,
"loss": 0.5393,
"mean_token_accuracy": 0.8378082513809204,
"num_tokens": 40722298.0,
"step": 5015
},
{
"entropy": 1.0947319865226746,
"epoch": 2.834556747600226,
"grad_norm": 77.48676300048828,
"learning_rate": 4.304674142234137e-06,
"loss": 0.4782,
"mean_token_accuracy": 0.853443443775177,
"num_tokens": 40762940.0,
"step": 5020
},
{
"entropy": 1.157103681564331,
"epoch": 2.837380011293055,
"grad_norm": 74.99523162841797,
"learning_rate": 4.303381037315837e-06,
"loss": 0.5632,
"mean_token_accuracy": 0.8314770102500916,
"num_tokens": 40803373.0,
"step": 5025
},
{
"entropy": 1.0990593075752257,
"epoch": 2.8402032749858837,
"grad_norm": 58.09937286376953,
"learning_rate": 4.30208700537915e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8482685565948487,
"num_tokens": 40844047.0,
"step": 5030
},
{
"entropy": 1.1404879331588744,
"epoch": 2.8430265386787124,
"grad_norm": 72.91802215576172,
"learning_rate": 4.300792047442074e-06,
"loss": 0.495,
"mean_token_accuracy": 0.8491962909698486,
"num_tokens": 40884825.0,
"step": 5035
},
{
"entropy": 1.1204862236976623,
"epoch": 2.8458498023715415,
"grad_norm": 61.5856819152832,
"learning_rate": 4.29949616452334e-06,
"loss": 0.5091,
"mean_token_accuracy": 0.8451560616493226,
"num_tokens": 40925345.0,
"step": 5040
},
{
"entropy": 1.1306064128875732,
"epoch": 2.8486730660643707,
"grad_norm": 72.08314514160156,
"learning_rate": 4.2981993576424015e-06,
"loss": 0.5112,
"mean_token_accuracy": 0.845184576511383,
"num_tokens": 40966061.0,
"step": 5045
},
{
"entropy": 1.1932409048080443,
"epoch": 2.8514963297571994,
"grad_norm": 66.057861328125,
"learning_rate": 4.296901627819442e-06,
"loss": 0.5432,
"mean_token_accuracy": 0.8331049919128418,
"num_tokens": 41005966.0,
"step": 5050
},
{
"entropy": 1.1901793003082275,
"epoch": 2.854319593450028,
"grad_norm": 78.77214050292969,
"learning_rate": 4.29560297607537e-06,
"loss": 0.5656,
"mean_token_accuracy": 0.8264202117919922,
"num_tokens": 41046552.0,
"step": 5055
},
{
"entropy": 1.1799126148223877,
"epoch": 2.857142857142857,
"grad_norm": 78.53099822998047,
"learning_rate": 4.2943034034318185e-06,
"loss": 0.5488,
"mean_token_accuracy": 0.8338027834892273,
"num_tokens": 41087072.0,
"step": 5060
},
{
"entropy": 1.099338126182556,
"epoch": 2.859966120835686,
"grad_norm": 68.20406341552734,
"learning_rate": 4.293002910911147e-06,
"loss": 0.4488,
"mean_token_accuracy": 0.8597991704940796,
"num_tokens": 41127628.0,
"step": 5065
},
{
"entropy": 1.1174504041671753,
"epoch": 2.862789384528515,
"grad_norm": 69.75657653808594,
"learning_rate": 4.291701499536438e-06,
"loss": 0.4756,
"mean_token_accuracy": 0.8555531144142151,
"num_tokens": 41168327.0,
"step": 5070
},
{
"entropy": 1.155361032485962,
"epoch": 2.8656126482213438,
"grad_norm": 72.0925521850586,
"learning_rate": 4.2903991703314954e-06,
"loss": 0.5072,
"mean_token_accuracy": 0.8479303359985352,
"num_tokens": 41208824.0,
"step": 5075
},
{
"entropy": 1.1240201950073243,
"epoch": 2.868435911914173,
"grad_norm": 80.29562377929688,
"learning_rate": 4.289095924320846e-06,
"loss": 0.5227,
"mean_token_accuracy": 0.8412837505340576,
"num_tokens": 41249209.0,
"step": 5080
},
{
"entropy": 1.1458587884902953,
"epoch": 2.8712591756070016,
"grad_norm": 84.07976531982422,
"learning_rate": 4.287791762529738e-06,
"loss": 0.567,
"mean_token_accuracy": 0.8279375076293946,
"num_tokens": 41289873.0,
"step": 5085
},
{
"entropy": 1.1406369805335999,
"epoch": 2.8740824392998308,
"grad_norm": 75.96171569824219,
"learning_rate": 4.286486685984142e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.8363900184631348,
"num_tokens": 41330717.0,
"step": 5090
},
{
"entropy": 1.117619562149048,
"epoch": 2.8769057029926595,
"grad_norm": 73.16439056396484,
"learning_rate": 4.285180695710742e-06,
"loss": 0.5419,
"mean_token_accuracy": 0.8377798557281494,
"num_tokens": 41371121.0,
"step": 5095
},
{
"entropy": 1.2503567218780518,
"epoch": 2.879728966685488,
"grad_norm": 80.75735473632812,
"learning_rate": 4.28387379273695e-06,
"loss": 0.5512,
"mean_token_accuracy": 0.83378164768219,
"num_tokens": 41411741.0,
"step": 5100
},
{
"entropy": 1.1296322226524353,
"epoch": 2.8825522303783173,
"grad_norm": 72.27934265136719,
"learning_rate": 4.282565978090888e-06,
"loss": 0.52,
"mean_token_accuracy": 0.8386531591415405,
"num_tokens": 41452180.0,
"step": 5105
},
{
"entropy": 1.1246942281723022,
"epoch": 2.8853754940711465,
"grad_norm": 72.93498992919922,
"learning_rate": 4.281257252801399e-06,
"loss": 0.5144,
"mean_token_accuracy": 0.8428778409957886,
"num_tokens": 41492779.0,
"step": 5110
},
{
"entropy": 1.1266536712646484,
"epoch": 2.888198757763975,
"grad_norm": 67.28563690185547,
"learning_rate": 4.279947617898042e-06,
"loss": 0.4951,
"mean_token_accuracy": 0.8494343519210815,
"num_tokens": 41533409.0,
"step": 5115
},
{
"entropy": 1.1207608938217164,
"epoch": 2.891022021456804,
"grad_norm": 76.63964080810547,
"learning_rate": 4.27863707441109e-06,
"loss": 0.5156,
"mean_token_accuracy": 0.8436846733093262,
"num_tokens": 41573962.0,
"step": 5120
},
{
"entropy": 1.1186698794364929,
"epoch": 2.893845285149633,
"grad_norm": 69.15577697753906,
"learning_rate": 4.277325623371534e-06,
"loss": 0.4647,
"mean_token_accuracy": 0.8576582551002503,
"num_tokens": 41614674.0,
"step": 5125
},
{
"entropy": 1.2201684951782226,
"epoch": 2.8966685488424617,
"grad_norm": 65.40320587158203,
"learning_rate": 4.276013265811075e-06,
"loss": 0.5036,
"mean_token_accuracy": 0.8451929330825806,
"num_tokens": 41655535.0,
"step": 5130
},
{
"entropy": 1.1049484729766845,
"epoch": 2.899491812535291,
"grad_norm": 69.20492553710938,
"learning_rate": 4.274700002762131e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.848007595539093,
"num_tokens": 41696289.0,
"step": 5135
},
{
"entropy": 1.1586164236068726,
"epoch": 2.9023150762281196,
"grad_norm": 79.74908447265625,
"learning_rate": 4.273385835257829e-06,
"loss": 0.5178,
"mean_token_accuracy": 0.8397310614585877,
"num_tokens": 41736100.0,
"step": 5140
},
{
"entropy": 1.2396832942962646,
"epoch": 2.9051383399209487,
"grad_norm": 82.225341796875,
"learning_rate": 4.272070764332009e-06,
"loss": 0.5261,
"mean_token_accuracy": 0.8408646941184997,
"num_tokens": 41776894.0,
"step": 5145
},
{
"entropy": 1.249162983894348,
"epoch": 2.9079616036137774,
"grad_norm": 76.64154052734375,
"learning_rate": 4.270754791019224e-06,
"loss": 0.5298,
"mean_token_accuracy": 0.8383278131484986,
"num_tokens": 41817499.0,
"step": 5150
},
{
"entropy": 1.270682430267334,
"epoch": 2.9107848673066066,
"grad_norm": 81.33303833007812,
"learning_rate": 4.2694379163547315e-06,
"loss": 0.5679,
"mean_token_accuracy": 0.8281817555427551,
"num_tokens": 41858247.0,
"step": 5155
},
{
"entropy": 1.2870656728744507,
"epoch": 2.9136081309994353,
"grad_norm": 78.17098999023438,
"learning_rate": 4.268120141374503e-06,
"loss": 0.5402,
"mean_token_accuracy": 0.8346587181091308,
"num_tokens": 41898836.0,
"step": 5160
},
{
"entropy": 1.2478781938552856,
"epoch": 2.9164313946922644,
"grad_norm": 78.640869140625,
"learning_rate": 4.266801467115215e-06,
"loss": 0.5031,
"mean_token_accuracy": 0.8475107431411744,
"num_tokens": 41939549.0,
"step": 5165
},
{
"entropy": 1.2414112091064453,
"epoch": 2.919254658385093,
"grad_norm": 75.24764251708984,
"learning_rate": 4.265481894614255e-06,
"loss": 0.4988,
"mean_token_accuracy": 0.8469999432563782,
"num_tokens": 41980269.0,
"step": 5170
},
{
"entropy": 1.2622466325759887,
"epoch": 2.9220779220779223,
"grad_norm": 82.20746612548828,
"learning_rate": 4.264161424909713e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8388458490371704,
"num_tokens": 42020982.0,
"step": 5175
},
{
"entropy": 1.1029718399047852,
"epoch": 2.924901185770751,
"grad_norm": 69.88716888427734,
"learning_rate": 4.262840059040388e-06,
"loss": 0.4763,
"mean_token_accuracy": 0.8518707990646363,
"num_tokens": 42061751.0,
"step": 5180
},
{
"entropy": 1.12943195104599,
"epoch": 2.9277244494635797,
"grad_norm": 78.41839599609375,
"learning_rate": 4.261517798045783e-06,
"loss": 0.5058,
"mean_token_accuracy": 0.84534991979599,
"num_tokens": 42102450.0,
"step": 5185
},
{
"entropy": 1.0798253417015076,
"epoch": 2.930547713156409,
"grad_norm": 67.44549560546875,
"learning_rate": 4.260194642966105e-06,
"loss": 0.46,
"mean_token_accuracy": 0.8567140102386475,
"num_tokens": 42142939.0,
"step": 5190
},
{
"entropy": 1.199964690208435,
"epoch": 2.933370976849238,
"grad_norm": 78.28135681152344,
"learning_rate": 4.258870594842262e-06,
"loss": 0.5382,
"mean_token_accuracy": 0.8380404353141785,
"num_tokens": 42183415.0,
"step": 5195
},
{
"entropy": 1.2671225309371947,
"epoch": 2.9361942405420667,
"grad_norm": 74.52027893066406,
"learning_rate": 4.257545654715872e-06,
"loss": 0.5074,
"mean_token_accuracy": 0.8436587452888489,
"num_tokens": 42224156.0,
"step": 5200
},
{
"entropy": 1.1756804943084718,
"epoch": 2.9390175042348954,
"grad_norm": 66.07868194580078,
"learning_rate": 4.256219823629244e-06,
"loss": 0.5174,
"mean_token_accuracy": 0.841261339187622,
"num_tokens": 42264867.0,
"step": 5205
},
{
"entropy": 1.3112341165542603,
"epoch": 2.9418407679277245,
"grad_norm": 79.60833740234375,
"learning_rate": 4.254893102625398e-06,
"loss": 0.5779,
"mean_token_accuracy": 0.825363290309906,
"num_tokens": 42305366.0,
"step": 5210
},
{
"entropy": 1.2153887033462525,
"epoch": 2.9446640316205532,
"grad_norm": 74.00979614257812,
"learning_rate": 4.253565492748048e-06,
"loss": 0.5067,
"mean_token_accuracy": 0.8458914637565613,
"num_tokens": 42345890.0,
"step": 5215
},
{
"entropy": 1.1424734592437744,
"epoch": 2.9474872953133824,
"grad_norm": 59.49595642089844,
"learning_rate": 4.252236995041609e-06,
"loss": 0.5383,
"mean_token_accuracy": 0.8360925078392029,
"num_tokens": 42386578.0,
"step": 5220
},
{
"entropy": 1.1367671012878418,
"epoch": 2.950310559006211,
"grad_norm": 69.0216064453125,
"learning_rate": 4.250907610551193e-06,
"loss": 0.5681,
"mean_token_accuracy": 0.8281669616699219,
"num_tokens": 42426996.0,
"step": 5225
},
{
"entropy": 1.160671877861023,
"epoch": 2.9531338226990402,
"grad_norm": 69.35001373291016,
"learning_rate": 4.249577340322612e-06,
"loss": 0.5345,
"mean_token_accuracy": 0.8406946778297424,
"num_tokens": 42467518.0,
"step": 5230
},
{
"entropy": 1.21805260181427,
"epoch": 2.955957086391869,
"grad_norm": 76.58956909179688,
"learning_rate": 4.248246185402376e-06,
"loss": 0.5615,
"mean_token_accuracy": 0.827870512008667,
"num_tokens": 42508111.0,
"step": 5235
},
{
"entropy": 1.1750145196914672,
"epoch": 2.958780350084698,
"grad_norm": 69.74154663085938,
"learning_rate": 4.246914146837686e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.8525615811347962,
"num_tokens": 42548839.0,
"step": 5240
},
{
"entropy": 1.1301527976989747,
"epoch": 2.961603613777527,
"grad_norm": 70.07862091064453,
"learning_rate": 4.245581225676443e-06,
"loss": 0.4933,
"mean_token_accuracy": 0.8495101571083069,
"num_tokens": 42589430.0,
"step": 5245
},
{
"entropy": 1.2715572595596314,
"epoch": 2.9644268774703555,
"grad_norm": 66.82747650146484,
"learning_rate": 4.244247422967237e-06,
"loss": 0.5695,
"mean_token_accuracy": 0.8271044254302978,
"num_tokens": 42630131.0,
"step": 5250
},
{
"entropy": 1.203652834892273,
"epoch": 2.9672501411631846,
"grad_norm": 79.75418090820312,
"learning_rate": 4.2429127397593585e-06,
"loss": 0.5481,
"mean_token_accuracy": 0.8342503547668457,
"num_tokens": 42670817.0,
"step": 5255
},
{
"entropy": 1.2326645374298095,
"epoch": 2.970073404856014,
"grad_norm": 70.31961059570312,
"learning_rate": 4.241577177102785e-06,
"loss": 0.5606,
"mean_token_accuracy": 0.830009937286377,
"num_tokens": 42711537.0,
"step": 5260
},
{
"entropy": 1.2372756958007813,
"epoch": 2.9728966685488425,
"grad_norm": 77.57453918457031,
"learning_rate": 4.240240736048188e-06,
"loss": 0.5462,
"mean_token_accuracy": 0.8349868655204773,
"num_tokens": 42751836.0,
"step": 5265
},
{
"entropy": 1.0865499377250671,
"epoch": 2.975719932241671,
"grad_norm": 66.26081085205078,
"learning_rate": 4.23890341764693e-06,
"loss": 0.5199,
"mean_token_accuracy": 0.8439796090126037,
"num_tokens": 42792490.0,
"step": 5270
},
{
"entropy": 1.1261334896087647,
"epoch": 2.9785431959345003,
"grad_norm": 74.02761840820312,
"learning_rate": 4.237565222951063e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8395409226417542,
"num_tokens": 42832932.0,
"step": 5275
},
{
"entropy": 1.153111171722412,
"epoch": 2.981366459627329,
"grad_norm": 66.55201721191406,
"learning_rate": 4.2362261530133294e-06,
"loss": 0.517,
"mean_token_accuracy": 0.8424826622009277,
"num_tokens": 42873706.0,
"step": 5280
},
{
"entropy": 1.159202551841736,
"epoch": 2.984189723320158,
"grad_norm": 74.84986877441406,
"learning_rate": 4.234886208887161e-06,
"loss": 0.5285,
"mean_token_accuracy": 0.8411277890205383,
"num_tokens": 42914346.0,
"step": 5285
},
{
"entropy": 1.1488636016845704,
"epoch": 2.987012987012987,
"grad_norm": 72.25592803955078,
"learning_rate": 4.233545391626674e-06,
"loss": 0.5471,
"mean_token_accuracy": 0.8350270032882691,
"num_tokens": 42954898.0,
"step": 5290
},
{
"entropy": 1.1714290857315064,
"epoch": 2.989836250705816,
"grad_norm": 70.09166717529297,
"learning_rate": 4.232203702286673e-06,
"loss": 0.5587,
"mean_token_accuracy": 0.8305088877677917,
"num_tokens": 42995672.0,
"step": 5295
},
{
"entropy": 1.0723572731018067,
"epoch": 2.9926595143986447,
"grad_norm": 68.43204498291016,
"learning_rate": 4.230861141922652e-06,
"loss": 0.4805,
"mean_token_accuracy": 0.8510453224182128,
"num_tokens": 43036032.0,
"step": 5300
},
{
"entropy": 1.042343759536743,
"epoch": 2.995482778091474,
"grad_norm": 72.96656036376953,
"learning_rate": 4.229517711590785e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.8531039834022522,
"num_tokens": 43076947.0,
"step": 5305
},
{
"entropy": 1.172229290008545,
"epoch": 2.9983060417843026,
"grad_norm": 68.45772552490234,
"learning_rate": 4.228173412347932e-06,
"loss": 0.5296,
"mean_token_accuracy": 0.8403589129447937,
"num_tokens": 43117547.0,
"step": 5310
},
{
"entropy": 1.2335005044937133,
"epoch": 3.0011293054771317,
"grad_norm": 91.20881652832031,
"learning_rate": 4.226828245251639e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.8487535119056702,
"num_tokens": 43152204.0,
"step": 5315
},
{
"entropy": 1.0431846261024476,
"epoch": 3.0039525691699605,
"grad_norm": 86.15254974365234,
"learning_rate": 4.2254822113601326e-06,
"loss": 0.3079,
"mean_token_accuracy": 0.9018093705177307,
"num_tokens": 43192950.0,
"step": 5320
},
{
"entropy": 0.8944714188575744,
"epoch": 3.0067758328627896,
"grad_norm": 83.4747314453125,
"learning_rate": 4.224135311732321e-06,
"loss": 0.3301,
"mean_token_accuracy": 0.8947166442871094,
"num_tokens": 43233859.0,
"step": 5325
},
{
"entropy": 0.9511932849884033,
"epoch": 3.0095990965556183,
"grad_norm": 98.4117660522461,
"learning_rate": 4.222787547427796e-06,
"loss": 0.3137,
"mean_token_accuracy": 0.8992406487464905,
"num_tokens": 43274528.0,
"step": 5330
},
{
"entropy": 1.0112687826156617,
"epoch": 3.012422360248447,
"grad_norm": 81.14041900634766,
"learning_rate": 4.221438919506825e-06,
"loss": 0.3306,
"mean_token_accuracy": 0.8945652127265931,
"num_tokens": 43315196.0,
"step": 5335
},
{
"entropy": 1.044990885257721,
"epoch": 3.015245623941276,
"grad_norm": 80.31888580322266,
"learning_rate": 4.2200894290303595e-06,
"loss": 0.3248,
"mean_token_accuracy": 0.8962285280227661,
"num_tokens": 43355979.0,
"step": 5340
},
{
"entropy": 1.019908332824707,
"epoch": 3.018068887634105,
"grad_norm": 89.42028045654297,
"learning_rate": 4.218739077060028e-06,
"loss": 0.3208,
"mean_token_accuracy": 0.8979032635688782,
"num_tokens": 43396122.0,
"step": 5345
},
{
"entropy": 0.9794248342514038,
"epoch": 3.020892151326934,
"grad_norm": 78.72344207763672,
"learning_rate": 4.217387864658135e-06,
"loss": 0.3468,
"mean_token_accuracy": 0.8871222257614135,
"num_tokens": 43436697.0,
"step": 5350
},
{
"entropy": 0.9811869382858276,
"epoch": 3.0237154150197627,
"grad_norm": 79.8103256225586,
"learning_rate": 4.216035792887664e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8930273175239563,
"num_tokens": 43477397.0,
"step": 5355
},
{
"entropy": 0.958527147769928,
"epoch": 3.026538678712592,
"grad_norm": 75.03720092773438,
"learning_rate": 4.214682862812274e-06,
"loss": 0.3301,
"mean_token_accuracy": 0.8914317131042481,
"num_tokens": 43517882.0,
"step": 5360
},
{
"entropy": 0.9316562294960022,
"epoch": 3.0293619424054206,
"grad_norm": 74.98342895507812,
"learning_rate": 4.213329075496298e-06,
"loss": 0.2988,
"mean_token_accuracy": 0.9033580541610717,
"num_tokens": 43558258.0,
"step": 5365
},
{
"entropy": 1.0227751016616822,
"epoch": 3.0321852060982497,
"grad_norm": 90.37271118164062,
"learning_rate": 4.211974432004745e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.8961042642593384,
"num_tokens": 43598741.0,
"step": 5370
},
{
"entropy": 0.865773344039917,
"epoch": 3.0350084697910784,
"grad_norm": 77.36763763427734,
"learning_rate": 4.210618933403299e-06,
"loss": 0.3013,
"mean_token_accuracy": 0.9025385260581971,
"num_tokens": 43639066.0,
"step": 5375
},
{
"entropy": 0.8853742003440856,
"epoch": 3.0378317334839076,
"grad_norm": 70.10348510742188,
"learning_rate": 4.209262580758311e-06,
"loss": 0.274,
"mean_token_accuracy": 0.9107710480690002,
"num_tokens": 43679643.0,
"step": 5380
},
{
"entropy": 0.9764909148216248,
"epoch": 3.0406549971767363,
"grad_norm": 92.78551483154297,
"learning_rate": 4.207905375136811e-06,
"loss": 0.3209,
"mean_token_accuracy": 0.8971151232719421,
"num_tokens": 43720215.0,
"step": 5385
},
{
"entropy": 0.9666017770767212,
"epoch": 3.0434782608695654,
"grad_norm": 79.50763702392578,
"learning_rate": 4.206547317606493e-06,
"loss": 0.281,
"mean_token_accuracy": 0.9079766511917114,
"num_tokens": 43761009.0,
"step": 5390
},
{
"entropy": 0.9946901082992554,
"epoch": 3.046301524562394,
"grad_norm": 79.8683090209961,
"learning_rate": 4.205188409235728e-06,
"loss": 0.3093,
"mean_token_accuracy": 0.9002600073814392,
"num_tokens": 43801591.0,
"step": 5395
},
{
"entropy": 0.9930772542953491,
"epoch": 3.049124788255223,
"grad_norm": 77.45728302001953,
"learning_rate": 4.203828651093551e-06,
"loss": 0.3177,
"mean_token_accuracy": 0.8979378700256347,
"num_tokens": 43842173.0,
"step": 5400
},
{
"entropy": 0.9636677026748657,
"epoch": 3.051948051948052,
"grad_norm": 77.16109466552734,
"learning_rate": 4.2024680442496694e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.8984180212020874,
"num_tokens": 43882801.0,
"step": 5405
},
{
"entropy": 0.9925666213035583,
"epoch": 3.0547713156408807,
"grad_norm": 75.40894317626953,
"learning_rate": 4.2011065897744545e-06,
"loss": 0.3234,
"mean_token_accuracy": 0.8965913653373718,
"num_tokens": 43923429.0,
"step": 5410
},
{
"entropy": 1.02843519449234,
"epoch": 3.05759457933371,
"grad_norm": 57.404842376708984,
"learning_rate": 4.199744288738948e-06,
"loss": 0.3181,
"mean_token_accuracy": 0.8978499293327331,
"num_tokens": 43963461.0,
"step": 5415
},
{
"entropy": 0.9605874657630921,
"epoch": 3.0604178430265385,
"grad_norm": 72.65766906738281,
"learning_rate": 4.198381142214856e-06,
"loss": 0.3283,
"mean_token_accuracy": 0.8947899460792541,
"num_tokens": 44004053.0,
"step": 5420
},
{
"entropy": 0.9381905317306518,
"epoch": 3.0632411067193677,
"grad_norm": 81.37139892578125,
"learning_rate": 4.197017151274547e-06,
"loss": 0.3082,
"mean_token_accuracy": 0.8991894960403443,
"num_tokens": 44044520.0,
"step": 5425
},
{
"entropy": 1.069422674179077,
"epoch": 3.0660643704121964,
"grad_norm": 98.56239318847656,
"learning_rate": 4.1956523169910605e-06,
"loss": 0.3471,
"mean_token_accuracy": 0.8895352363586426,
"num_tokens": 44085119.0,
"step": 5430
},
{
"entropy": 0.8803645491600036,
"epoch": 3.0688876341050255,
"grad_norm": 69.33189392089844,
"learning_rate": 4.194286640438092e-06,
"loss": 0.2933,
"mean_token_accuracy": 0.9065402269363403,
"num_tokens": 44125683.0,
"step": 5435
},
{
"entropy": 1.0519742131233216,
"epoch": 3.0717108977978542,
"grad_norm": 88.03885650634766,
"learning_rate": 4.192920122690005e-06,
"loss": 0.34,
"mean_token_accuracy": 0.8912493824958801,
"num_tokens": 44166348.0,
"step": 5440
},
{
"entropy": 0.9997652292251586,
"epoch": 3.0745341614906834,
"grad_norm": 85.6423568725586,
"learning_rate": 4.191552764821823e-06,
"loss": 0.3317,
"mean_token_accuracy": 0.8926982164382935,
"num_tokens": 44206892.0,
"step": 5445
},
{
"entropy": 1.0732812404632568,
"epoch": 3.077357425183512,
"grad_norm": 72.29247283935547,
"learning_rate": 4.190184567909229e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.89251549243927,
"num_tokens": 44247388.0,
"step": 5450
},
{
"entropy": 0.9219252943992615,
"epoch": 3.080180688876341,
"grad_norm": 93.68437957763672,
"learning_rate": 4.188815533028569e-06,
"loss": 0.3186,
"mean_token_accuracy": 0.8986498594284058,
"num_tokens": 44287827.0,
"step": 5455
},
{
"entropy": 1.0143481492996216,
"epoch": 3.08300395256917,
"grad_norm": 80.94513702392578,
"learning_rate": 4.1874456612568435e-06,
"loss": 0.296,
"mean_token_accuracy": 0.9054910182952881,
"num_tokens": 44327894.0,
"step": 5460
},
{
"entropy": 1.0362035751342773,
"epoch": 3.085827216261999,
"grad_norm": 88.25469207763672,
"learning_rate": 4.186074953671717e-06,
"loss": 0.3444,
"mean_token_accuracy": 0.8891435742378235,
"num_tokens": 44368586.0,
"step": 5465
},
{
"entropy": 0.9385774374008179,
"epoch": 3.0886504799548278,
"grad_norm": 66.99293518066406,
"learning_rate": 4.184703411351508e-06,
"loss": 0.3234,
"mean_token_accuracy": 0.8954385399818421,
"num_tokens": 44409209.0,
"step": 5470
},
{
"entropy": 0.8999487400054932,
"epoch": 3.0914737436476565,
"grad_norm": 89.90604400634766,
"learning_rate": 4.1833310353751935e-06,
"loss": 0.3418,
"mean_token_accuracy": 0.8911889672279358,
"num_tokens": 44449767.0,
"step": 5475
},
{
"entropy": 0.9921982645988464,
"epoch": 3.0942970073404856,
"grad_norm": 81.72102355957031,
"learning_rate": 4.181957826822403e-06,
"loss": 0.3254,
"mean_token_accuracy": 0.8946803331375122,
"num_tokens": 44490423.0,
"step": 5480
},
{
"entropy": 0.9456246733665467,
"epoch": 3.0971202710333143,
"grad_norm": 85.97330474853516,
"learning_rate": 4.1805837867734255e-06,
"loss": 0.332,
"mean_token_accuracy": 0.8957900285720826,
"num_tokens": 44530989.0,
"step": 5485
},
{
"entropy": 0.9648449778556824,
"epoch": 3.0999435347261435,
"grad_norm": 74.89656066894531,
"learning_rate": 4.179208916309202e-06,
"loss": 0.2918,
"mean_token_accuracy": 0.9066829919815064,
"num_tokens": 44571777.0,
"step": 5490
},
{
"entropy": 0.9347667455673218,
"epoch": 3.102766798418972,
"grad_norm": 74.6075668334961,
"learning_rate": 4.177833216511326e-06,
"loss": 0.3189,
"mean_token_accuracy": 0.8962038516998291,
"num_tokens": 44612462.0,
"step": 5495
},
{
"entropy": 0.9365931272506713,
"epoch": 3.1055900621118013,
"grad_norm": 76.95111846923828,
"learning_rate": 4.176456688462045e-06,
"loss": 0.3247,
"mean_token_accuracy": 0.8957284688949585,
"num_tokens": 44652841.0,
"step": 5500
},
{
"epoch": 3.1055900621118013,
"eval_entropy": 1.2302906274795533,
"eval_loss": 0.2950093150138855,
"eval_mean_token_accuracy": 0.9086108207702637,
"eval_num_tokens": 44652841.0,
"eval_runtime": 2.453,
"eval_samples_per_second": 15.899,
"eval_steps_per_second": 2.038,
"step": 5500
},
{
"entropy": 1.0472152709960938,
"epoch": 3.10841332580463,
"grad_norm": 69.57112884521484,
"learning_rate": 4.175079333244257e-06,
"loss": 0.3251,
"mean_token_accuracy": 0.894036340713501,
"num_tokens": 44693407.0,
"step": 5505
},
{
"entropy": 1.0573584675788879,
"epoch": 3.111236589497459,
"grad_norm": 73.89783477783203,
"learning_rate": 4.17370115194151e-06,
"loss": 0.3342,
"mean_token_accuracy": 0.8917381048202515,
"num_tokens": 44734101.0,
"step": 5510
},
{
"entropy": 1.1182607412338257,
"epoch": 3.114059853190288,
"grad_norm": 77.34064483642578,
"learning_rate": 4.172322145638004e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.8926242351531982,
"num_tokens": 44774581.0,
"step": 5515
},
{
"entropy": 0.9524365782737731,
"epoch": 3.116883116883117,
"grad_norm": 79.16650390625,
"learning_rate": 4.1709423154185855e-06,
"loss": 0.2924,
"mean_token_accuracy": 0.9028084397315979,
"num_tokens": 44815304.0,
"step": 5520
},
{
"entropy": 0.9392463088035583,
"epoch": 3.1197063805759457,
"grad_norm": 91.71111297607422,
"learning_rate": 4.169561662368753e-06,
"loss": 0.3366,
"mean_token_accuracy": 0.8923919320106506,
"num_tokens": 44856030.0,
"step": 5525
},
{
"entropy": 0.9322085857391358,
"epoch": 3.122529644268775,
"grad_norm": 73.85060119628906,
"learning_rate": 4.168180187574649e-06,
"loss": 0.3098,
"mean_token_accuracy": 0.8995050311088562,
"num_tokens": 44896751.0,
"step": 5530
},
{
"entropy": 1.0029746174812317,
"epoch": 3.1253529079616036,
"grad_norm": 84.3502426147461,
"learning_rate": 4.166797892123062e-06,
"loss": 0.3559,
"mean_token_accuracy": 0.8850750803947449,
"num_tokens": 44937348.0,
"step": 5535
},
{
"entropy": 0.9944419145584107,
"epoch": 3.1281761716544327,
"grad_norm": 76.57456970214844,
"learning_rate": 4.1654147771014285e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.8934206604957581,
"num_tokens": 44977981.0,
"step": 5540
},
{
"entropy": 1.0101613402366638,
"epoch": 3.1309994353472614,
"grad_norm": 71.5161361694336,
"learning_rate": 4.164030843597829e-06,
"loss": 0.3135,
"mean_token_accuracy": 0.9011573910713195,
"num_tokens": 45018766.0,
"step": 5545
},
{
"entropy": 0.9782228708267212,
"epoch": 3.13382269904009,
"grad_norm": 92.19779968261719,
"learning_rate": 4.1626460927009855e-06,
"loss": 0.3547,
"mean_token_accuracy": 0.8878259062767029,
"num_tokens": 45059560.0,
"step": 5550
},
{
"entropy": 1.0341717839241027,
"epoch": 3.1366459627329193,
"grad_norm": 88.86711120605469,
"learning_rate": 4.161260525500268e-06,
"loss": 0.334,
"mean_token_accuracy": 0.8918552994728088,
"num_tokens": 45100197.0,
"step": 5555
},
{
"entropy": 0.9899235010147095,
"epoch": 3.139469226425748,
"grad_norm": 82.83612060546875,
"learning_rate": 4.159874143085685e-06,
"loss": 0.3116,
"mean_token_accuracy": 0.899685287475586,
"num_tokens": 45140887.0,
"step": 5560
},
{
"entropy": 1.0915488839149474,
"epoch": 3.142292490118577,
"grad_norm": 67.84982299804688,
"learning_rate": 4.1584869465478846e-06,
"loss": 0.3385,
"mean_token_accuracy": 0.8910330533981323,
"num_tokens": 45181297.0,
"step": 5565
},
{
"entropy": 1.012636399269104,
"epoch": 3.145115753811406,
"grad_norm": 91.24017333984375,
"learning_rate": 4.157098936978162e-06,
"loss": 0.338,
"mean_token_accuracy": 0.893407940864563,
"num_tokens": 45222015.0,
"step": 5570
},
{
"entropy": 0.9732036113739013,
"epoch": 3.147939017504235,
"grad_norm": 77.7698745727539,
"learning_rate": 4.155710115468444e-06,
"loss": 0.351,
"mean_token_accuracy": 0.887684988975525,
"num_tokens": 45262703.0,
"step": 5575
},
{
"entropy": 0.8633365988731384,
"epoch": 3.1507622811970637,
"grad_norm": 74.60352325439453,
"learning_rate": 4.154320483111303e-06,
"loss": 0.2891,
"mean_token_accuracy": 0.9053243041038513,
"num_tokens": 45303158.0,
"step": 5580
},
{
"entropy": 0.9112148880958557,
"epoch": 3.153585544889893,
"grad_norm": 80.281005859375,
"learning_rate": 4.152930040999944e-06,
"loss": 0.3474,
"mean_token_accuracy": 0.8875756382942199,
"num_tokens": 45343759.0,
"step": 5585
},
{
"entropy": 0.9924912214279175,
"epoch": 3.1564088085827215,
"grad_norm": 66.05514526367188,
"learning_rate": 4.151538790228213e-06,
"loss": 0.3419,
"mean_token_accuracy": 0.8918880581855774,
"num_tokens": 45383961.0,
"step": 5590
},
{
"entropy": 0.9940900683403016,
"epoch": 3.1592320722755507,
"grad_norm": 84.66516876220703,
"learning_rate": 4.15014673189059e-06,
"loss": 0.3045,
"mean_token_accuracy": 0.9007033705711365,
"num_tokens": 45424529.0,
"step": 5595
},
{
"entropy": 0.9432126879692078,
"epoch": 3.1620553359683794,
"grad_norm": 77.25856018066406,
"learning_rate": 4.14875386708219e-06,
"loss": 0.3152,
"mean_token_accuracy": 0.8971645593643188,
"num_tokens": 45465227.0,
"step": 5600
},
{
"entropy": 0.9745931267738343,
"epoch": 3.1648785996612085,
"grad_norm": 81.02576446533203,
"learning_rate": 4.147360196898763e-06,
"loss": 0.3568,
"mean_token_accuracy": 0.8874773502349853,
"num_tokens": 45505830.0,
"step": 5605
},
{
"entropy": 0.92342050075531,
"epoch": 3.1677018633540373,
"grad_norm": 85.53850555419922,
"learning_rate": 4.145965722436695e-06,
"loss": 0.3219,
"mean_token_accuracy": 0.8958787202835083,
"num_tokens": 45546417.0,
"step": 5610
},
{
"entropy": 0.9303161263465881,
"epoch": 3.170525127046866,
"grad_norm": 94.72078704833984,
"learning_rate": 4.144570444793002e-06,
"loss": 0.3571,
"mean_token_accuracy": 0.8850953698158264,
"num_tokens": 45587028.0,
"step": 5615
},
{
"entropy": 1.091172707080841,
"epoch": 3.173348390739695,
"grad_norm": 79.4405288696289,
"learning_rate": 4.14317436506533e-06,
"loss": 0.3378,
"mean_token_accuracy": 0.8912994861602783,
"num_tokens": 45627727.0,
"step": 5620
},
{
"entropy": 1.1407584190368651,
"epoch": 3.176171654432524,
"grad_norm": 83.2149658203125,
"learning_rate": 4.14177748435196e-06,
"loss": 0.3481,
"mean_token_accuracy": 0.8884922146797181,
"num_tokens": 45668433.0,
"step": 5625
},
{
"entropy": 1.024580430984497,
"epoch": 3.178994918125353,
"grad_norm": 79.87594604492188,
"learning_rate": 4.140379803751803e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8933813095092773,
"num_tokens": 45709184.0,
"step": 5630
},
{
"entropy": 1.0046086192131043,
"epoch": 3.1818181818181817,
"grad_norm": 96.62107849121094,
"learning_rate": 4.138981324364394e-06,
"loss": 0.3872,
"mean_token_accuracy": 0.8781454682350158,
"num_tokens": 45749907.0,
"step": 5635
},
{
"entropy": 0.9690511703491211,
"epoch": 3.184641445511011,
"grad_norm": 70.9329605102539,
"learning_rate": 4.137582047289903e-06,
"loss": 0.3215,
"mean_token_accuracy": 0.8958381295204163,
"num_tokens": 45790644.0,
"step": 5640
},
{
"entropy": 1.0030096292495727,
"epoch": 3.1874647092038395,
"grad_norm": 68.79771423339844,
"learning_rate": 4.1361819736291244e-06,
"loss": 0.3199,
"mean_token_accuracy": 0.8972474575042725,
"num_tokens": 45831271.0,
"step": 5645
},
{
"entropy": 0.9273014307022095,
"epoch": 3.1902879728966687,
"grad_norm": 87.60462951660156,
"learning_rate": 4.134781104483479e-06,
"loss": 0.3464,
"mean_token_accuracy": 0.8891156673431396,
"num_tokens": 45871883.0,
"step": 5650
},
{
"entropy": 0.9342420339584351,
"epoch": 3.1931112365894974,
"grad_norm": 78.78738403320312,
"learning_rate": 4.133379440955014e-06,
"loss": 0.3088,
"mean_token_accuracy": 0.8999145150184631,
"num_tokens": 45912620.0,
"step": 5655
},
{
"entropy": 0.9332594275474548,
"epoch": 3.1959345002823265,
"grad_norm": 70.32312774658203,
"learning_rate": 4.131976984146401e-06,
"loss": 0.3419,
"mean_token_accuracy": 0.8888086795806884,
"num_tokens": 45953260.0,
"step": 5660
},
{
"entropy": 0.9139317393302917,
"epoch": 3.198757763975155,
"grad_norm": 72.15721893310547,
"learning_rate": 4.130573735160937e-06,
"loss": 0.3126,
"mean_token_accuracy": 0.9001538991928101,
"num_tokens": 45993897.0,
"step": 5665
},
{
"entropy": 1.0630685806274414,
"epoch": 3.2015810276679844,
"grad_norm": 81.17543029785156,
"learning_rate": 4.129169695102541e-06,
"loss": 0.3618,
"mean_token_accuracy": 0.8834628224372864,
"num_tokens": 46034427.0,
"step": 5670
},
{
"entropy": 0.9357471466064453,
"epoch": 3.204404291360813,
"grad_norm": 88.41699981689453,
"learning_rate": 4.127764865075755e-06,
"loss": 0.3283,
"mean_token_accuracy": 0.8928990125656128,
"num_tokens": 46074757.0,
"step": 5675
},
{
"entropy": 0.9587173461914062,
"epoch": 3.207227555053642,
"grad_norm": 78.16218566894531,
"learning_rate": 4.126359246185741e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.888952374458313,
"num_tokens": 46115210.0,
"step": 5680
},
{
"entropy": 0.97600417137146,
"epoch": 3.210050818746471,
"grad_norm": 86.20440673828125,
"learning_rate": 4.124952839538284e-06,
"loss": 0.3607,
"mean_token_accuracy": 0.8835627913475037,
"num_tokens": 46155735.0,
"step": 5685
},
{
"entropy": 0.9152452230453492,
"epoch": 3.2128740824393,
"grad_norm": 67.98258209228516,
"learning_rate": 4.123545646239787e-06,
"loss": 0.3209,
"mean_token_accuracy": 0.8964808464050293,
"num_tokens": 46195983.0,
"step": 5690
},
{
"entropy": 0.960852038860321,
"epoch": 3.2156973461321288,
"grad_norm": 79.35540008544922,
"learning_rate": 4.122137667397272e-06,
"loss": 0.3052,
"mean_token_accuracy": 0.9016862750053406,
"num_tokens": 46236777.0,
"step": 5695
},
{
"entropy": 1.0134177327156066,
"epoch": 3.2185206098249575,
"grad_norm": 79.44649505615234,
"learning_rate": 4.1207289041183805e-06,
"loss": 0.311,
"mean_token_accuracy": 0.8982547640800476,
"num_tokens": 46277026.0,
"step": 5700
},
{
"entropy": 1.012182354927063,
"epoch": 3.2213438735177866,
"grad_norm": 86.40589904785156,
"learning_rate": 4.1193193575113685e-06,
"loss": 0.3175,
"mean_token_accuracy": 0.898042368888855,
"num_tokens": 46317567.0,
"step": 5705
},
{
"entropy": 0.9427968263626099,
"epoch": 3.2241671372106153,
"grad_norm": 76.3041000366211,
"learning_rate": 4.117909028685108e-06,
"loss": 0.3382,
"mean_token_accuracy": 0.8900476813316345,
"num_tokens": 46358373.0,
"step": 5710
},
{
"entropy": 0.9410587787628174,
"epoch": 3.2269904009034445,
"grad_norm": 72.00779724121094,
"learning_rate": 4.116497918749093e-06,
"loss": 0.3349,
"mean_token_accuracy": 0.8919649243354797,
"num_tokens": 46399093.0,
"step": 5715
},
{
"entropy": 0.8953568816184998,
"epoch": 3.229813664596273,
"grad_norm": 84.89020538330078,
"learning_rate": 4.115086028813422e-06,
"loss": 0.3361,
"mean_token_accuracy": 0.8915575385093689,
"num_tokens": 46439722.0,
"step": 5720
},
{
"entropy": 0.892897367477417,
"epoch": 3.2326369282891023,
"grad_norm": 68.52616119384766,
"learning_rate": 4.113673359988814e-06,
"loss": 0.3132,
"mean_token_accuracy": 0.8966469287872314,
"num_tokens": 46479801.0,
"step": 5725
},
{
"entropy": 0.9588320255279541,
"epoch": 3.235460191981931,
"grad_norm": 92.84066009521484,
"learning_rate": 4.112259913386599e-06,
"loss": 0.3385,
"mean_token_accuracy": 0.8905380010604859,
"num_tokens": 46520573.0,
"step": 5730
},
{
"entropy": 0.9230941891670227,
"epoch": 3.23828345567476,
"grad_norm": 75.7746810913086,
"learning_rate": 4.110845690118718e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.8905932664871216,
"num_tokens": 46560697.0,
"step": 5735
},
{
"entropy": 0.975538969039917,
"epoch": 3.241106719367589,
"grad_norm": 97.85185241699219,
"learning_rate": 4.109430691297724e-06,
"loss": 0.3104,
"mean_token_accuracy": 0.8992527008056641,
"num_tokens": 46601246.0,
"step": 5740
},
{
"entropy": 1.0089300632476808,
"epoch": 3.243929983060418,
"grad_norm": 73.65741729736328,
"learning_rate": 4.10801491803678e-06,
"loss": 0.354,
"mean_token_accuracy": 0.887215518951416,
"num_tokens": 46641991.0,
"step": 5745
},
{
"entropy": 0.9730260252952576,
"epoch": 3.2467532467532467,
"grad_norm": 87.6568374633789,
"learning_rate": 4.106598371449659e-06,
"loss": 0.3361,
"mean_token_accuracy": 0.8924851059913635,
"num_tokens": 46682519.0,
"step": 5750
},
{
"entropy": 0.9061202526092529,
"epoch": 3.249576510446076,
"grad_norm": 76.36370849609375,
"learning_rate": 4.105181052650739e-06,
"loss": 0.3222,
"mean_token_accuracy": 0.8965223670005799,
"num_tokens": 46723263.0,
"step": 5755
},
{
"entropy": 0.9181980729103089,
"epoch": 3.2523997741389046,
"grad_norm": 78.70237731933594,
"learning_rate": 4.10376296275501e-06,
"loss": 0.303,
"mean_token_accuracy": 0.9024981141090394,
"num_tokens": 46764047.0,
"step": 5760
},
{
"entropy": 0.9052846193313598,
"epoch": 3.2552230378317333,
"grad_norm": 72.70726013183594,
"learning_rate": 4.1023441028780655e-06,
"loss": 0.336,
"mean_token_accuracy": 0.8925952315330505,
"num_tokens": 46804692.0,
"step": 5765
},
{
"entropy": 1.03635413646698,
"epoch": 3.2580463015245624,
"grad_norm": 77.33209228515625,
"learning_rate": 4.100924474136105e-06,
"loss": 0.357,
"mean_token_accuracy": 0.8842707514762879,
"num_tokens": 46845216.0,
"step": 5770
},
{
"entropy": 0.9469284415245056,
"epoch": 3.260869565217391,
"grad_norm": 81.36337280273438,
"learning_rate": 4.099504077645936e-06,
"loss": 0.3423,
"mean_token_accuracy": 0.8893475532531738,
"num_tokens": 46885838.0,
"step": 5775
},
{
"entropy": 0.92877277135849,
"epoch": 3.2636928289102203,
"grad_norm": 65.81452178955078,
"learning_rate": 4.098082914524966e-06,
"loss": 0.3517,
"mean_token_accuracy": 0.8867884755134583,
"num_tokens": 46926701.0,
"step": 5780
},
{
"entropy": 0.9620746612548828,
"epoch": 3.266516092603049,
"grad_norm": 86.10861206054688,
"learning_rate": 4.096660985891207e-06,
"loss": 0.3372,
"mean_token_accuracy": 0.8917975783348083,
"num_tokens": 46967294.0,
"step": 5785
},
{
"entropy": 0.8969259858131409,
"epoch": 3.269339356295878,
"grad_norm": 77.28433990478516,
"learning_rate": 4.095238292863273e-06,
"loss": 0.3342,
"mean_token_accuracy": 0.8923115849494934,
"num_tokens": 47007892.0,
"step": 5790
},
{
"entropy": 0.9607129454612732,
"epoch": 3.272162619988707,
"grad_norm": 80.7704086303711,
"learning_rate": 4.093814836560381e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8840513944625854,
"num_tokens": 47048615.0,
"step": 5795
},
{
"entropy": 0.9498526930809021,
"epoch": 3.274985883681536,
"grad_norm": 95.09829711914062,
"learning_rate": 4.092390618102346e-06,
"loss": 0.338,
"mean_token_accuracy": 0.8934663891792297,
"num_tokens": 47088694.0,
"step": 5800
},
{
"entropy": 1.060532569885254,
"epoch": 3.2778091473743647,
"grad_norm": 80.7383041381836,
"learning_rate": 4.0909656386095854e-06,
"loss": 0.3493,
"mean_token_accuracy": 0.8909806966781616,
"num_tokens": 47129331.0,
"step": 5805
},
{
"entropy": 1.0316852688789369,
"epoch": 3.280632411067194,
"grad_norm": 83.49606323242188,
"learning_rate": 4.089539899203111e-06,
"loss": 0.366,
"mean_token_accuracy": 0.8828284740447998,
"num_tokens": 47169941.0,
"step": 5810
},
{
"entropy": 0.868412721157074,
"epoch": 3.2834556747600225,
"grad_norm": 69.75527954101562,
"learning_rate": 4.088113401004539e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.89917072057724,
"num_tokens": 47210747.0,
"step": 5815
},
{
"entropy": 0.8937187194824219,
"epoch": 3.2862789384528517,
"grad_norm": 96.84244537353516,
"learning_rate": 4.086686145136074e-06,
"loss": 0.3619,
"mean_token_accuracy": 0.8864521026611328,
"num_tokens": 47250964.0,
"step": 5820
},
{
"entropy": 0.8895792603492737,
"epoch": 3.2891022021456804,
"grad_norm": 69.03482818603516,
"learning_rate": 4.085258132720525e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.8903668642044067,
"num_tokens": 47291582.0,
"step": 5825
},
{
"entropy": 0.9830967545509338,
"epoch": 3.291925465838509,
"grad_norm": 83.75342559814453,
"learning_rate": 4.083829364881291e-06,
"loss": 0.3095,
"mean_token_accuracy": 0.8991586565971375,
"num_tokens": 47332346.0,
"step": 5830
},
{
"entropy": 0.9528472185134887,
"epoch": 3.2947487295313382,
"grad_norm": 87.47962951660156,
"learning_rate": 4.082399842742366e-06,
"loss": 0.3639,
"mean_token_accuracy": 0.8823336362838745,
"num_tokens": 47372949.0,
"step": 5835
},
{
"entropy": 0.9320926308631897,
"epoch": 3.2975719932241674,
"grad_norm": 74.84449768066406,
"learning_rate": 4.08096956742834e-06,
"loss": 0.3073,
"mean_token_accuracy": 0.8998920917510986,
"num_tokens": 47413544.0,
"step": 5840
},
{
"entropy": 0.8553381204605103,
"epoch": 3.300395256916996,
"grad_norm": 76.03280639648438,
"learning_rate": 4.0795385400643916e-06,
"loss": 0.3284,
"mean_token_accuracy": 0.8952413558959961,
"num_tokens": 47454255.0,
"step": 5845
},
{
"entropy": 0.9816694140434266,
"epoch": 3.303218520609825,
"grad_norm": 86.18427276611328,
"learning_rate": 4.078106761776294e-06,
"loss": 0.3428,
"mean_token_accuracy": 0.8892660975456238,
"num_tokens": 47494914.0,
"step": 5850
},
{
"entropy": 0.9079144239425659,
"epoch": 3.306041784302654,
"grad_norm": 89.10929870605469,
"learning_rate": 4.076674233690411e-06,
"loss": 0.3589,
"mean_token_accuracy": 0.8852792263031006,
"num_tokens": 47535582.0,
"step": 5855
},
{
"entropy": 0.893176531791687,
"epoch": 3.3088650479954826,
"grad_norm": 80.25196075439453,
"learning_rate": 4.075240956933694e-06,
"loss": 0.3476,
"mean_token_accuracy": 0.8886914730072022,
"num_tokens": 47576458.0,
"step": 5860
},
{
"entropy": 0.988261365890503,
"epoch": 3.311688311688312,
"grad_norm": 85.58906555175781,
"learning_rate": 4.073806932633685e-06,
"loss": 0.3393,
"mean_token_accuracy": 0.8919550776481628,
"num_tokens": 47617186.0,
"step": 5865
},
{
"entropy": 0.9218690037727356,
"epoch": 3.3145115753811405,
"grad_norm": 76.2083740234375,
"learning_rate": 4.072372161918514e-06,
"loss": 0.3374,
"mean_token_accuracy": 0.8905277132987977,
"num_tokens": 47657963.0,
"step": 5870
},
{
"entropy": 0.9897948741912842,
"epoch": 3.3173348390739696,
"grad_norm": 72.44279479980469,
"learning_rate": 4.0709366459169e-06,
"loss": 0.3708,
"mean_token_accuracy": 0.8791788578033447,
"num_tokens": 47698582.0,
"step": 5875
},
{
"entropy": 0.9159058570861817,
"epoch": 3.3201581027667983,
"grad_norm": 75.1999282836914,
"learning_rate": 4.069500385758144e-06,
"loss": 0.3177,
"mean_token_accuracy": 0.8962335586547852,
"num_tokens": 47738986.0,
"step": 5880
},
{
"entropy": 0.9603091716766358,
"epoch": 3.3229813664596275,
"grad_norm": 70.61493682861328,
"learning_rate": 4.068063382572136e-06,
"loss": 0.3392,
"mean_token_accuracy": 0.8907322883605957,
"num_tokens": 47779675.0,
"step": 5885
},
{
"entropy": 0.985940134525299,
"epoch": 3.325804630152456,
"grad_norm": 73.92083740234375,
"learning_rate": 4.066625637489349e-06,
"loss": 0.3389,
"mean_token_accuracy": 0.8892409563064575,
"num_tokens": 47820199.0,
"step": 5890
},
{
"entropy": 1.023607885837555,
"epoch": 3.328627893845285,
"grad_norm": 75.44142150878906,
"learning_rate": 4.065187151640839e-06,
"loss": 0.3389,
"mean_token_accuracy": 0.8911162734031677,
"num_tokens": 47860723.0,
"step": 5895
},
{
"entropy": 1.027356994152069,
"epoch": 3.331451157538114,
"grad_norm": 78.74333190917969,
"learning_rate": 4.063747926158248e-06,
"loss": 0.3476,
"mean_token_accuracy": 0.8885314106941223,
"num_tokens": 47901477.0,
"step": 5900
},
{
"entropy": 0.9395134568214416,
"epoch": 3.334274421230943,
"grad_norm": 78.92701721191406,
"learning_rate": 4.062307962173796e-06,
"loss": 0.3548,
"mean_token_accuracy": 0.88640958070755,
"num_tokens": 47941620.0,
"step": 5905
},
{
"entropy": 0.8830117225646973,
"epoch": 3.337097684923772,
"grad_norm": 69.30189514160156,
"learning_rate": 4.060867260820287e-06,
"loss": 0.3458,
"mean_token_accuracy": 0.8891350269317627,
"num_tokens": 47982251.0,
"step": 5910
},
{
"entropy": 0.943314504623413,
"epoch": 3.3399209486166006,
"grad_norm": 58.88145446777344,
"learning_rate": 4.059425823231101e-06,
"loss": 0.3606,
"mean_token_accuracy": 0.8849634170532227,
"num_tokens": 48023080.0,
"step": 5915
},
{
"entropy": 0.9857373237609863,
"epoch": 3.3427442123094298,
"grad_norm": 78.76705169677734,
"learning_rate": 4.057983650540203e-06,
"loss": 0.3461,
"mean_token_accuracy": 0.8870670199394226,
"num_tokens": 48063945.0,
"step": 5920
},
{
"entropy": 1.0337161064147948,
"epoch": 3.3455674760022585,
"grad_norm": 81.34748840332031,
"learning_rate": 4.05654074388213e-06,
"loss": 0.3636,
"mean_token_accuracy": 0.8839574098587036,
"num_tokens": 48104362.0,
"step": 5925
},
{
"entropy": 0.9524666666984558,
"epoch": 3.3483907396950876,
"grad_norm": 70.2374267578125,
"learning_rate": 4.055097104392003e-06,
"loss": 0.3416,
"mean_token_accuracy": 0.8884192824363708,
"num_tokens": 48145068.0,
"step": 5930
},
{
"entropy": 0.994706106185913,
"epoch": 3.3512140033879163,
"grad_norm": 82.83272552490234,
"learning_rate": 4.053652733205513e-06,
"loss": 0.3345,
"mean_token_accuracy": 0.8928604006767273,
"num_tokens": 48185762.0,
"step": 5935
},
{
"entropy": 1.0792868852615356,
"epoch": 3.3540372670807455,
"grad_norm": 72.94239807128906,
"learning_rate": 4.052207631458933e-06,
"loss": 0.3692,
"mean_token_accuracy": 0.8833429217338562,
"num_tokens": 48226457.0,
"step": 5940
},
{
"entropy": 1.0501051068305969,
"epoch": 3.356860530773574,
"grad_norm": 70.4188232421875,
"learning_rate": 4.050761800289104e-06,
"loss": 0.3402,
"mean_token_accuracy": 0.8886079668998719,
"num_tokens": 48266456.0,
"step": 5945
},
{
"entropy": 0.9432408094406128,
"epoch": 3.3596837944664033,
"grad_norm": 64.5877456665039,
"learning_rate": 4.049315240833445e-06,
"loss": 0.3185,
"mean_token_accuracy": 0.8974138140678406,
"num_tokens": 48307115.0,
"step": 5950
},
{
"entropy": 0.987043297290802,
"epoch": 3.362507058159232,
"grad_norm": 94.49236297607422,
"learning_rate": 4.047867954229949e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.8812648177146911,
"num_tokens": 48347890.0,
"step": 5955
},
{
"entropy": 0.9090808153152465,
"epoch": 3.365330321852061,
"grad_norm": 79.62647247314453,
"learning_rate": 4.046419941617177e-06,
"loss": 0.3327,
"mean_token_accuracy": 0.8922478914260864,
"num_tokens": 48387673.0,
"step": 5960
},
{
"entropy": 0.8052204251289368,
"epoch": 3.36815358554489,
"grad_norm": 75.73529815673828,
"learning_rate": 4.044971204134266e-06,
"loss": 0.3119,
"mean_token_accuracy": 0.9001648783683777,
"num_tokens": 48428374.0,
"step": 5965
},
{
"entropy": 1.0093549251556397,
"epoch": 3.370976849237719,
"grad_norm": 66.04109954833984,
"learning_rate": 4.043521742920918e-06,
"loss": 0.3552,
"mean_token_accuracy": 0.8854212641716004,
"num_tokens": 48469022.0,
"step": 5970
},
{
"entropy": 0.8731826066970825,
"epoch": 3.3738001129305477,
"grad_norm": 79.09840393066406,
"learning_rate": 4.042071559117408e-06,
"loss": 0.3009,
"mean_token_accuracy": 0.9022910356521606,
"num_tokens": 48509642.0,
"step": 5975
},
{
"entropy": 0.9268733263015747,
"epoch": 3.3766233766233764,
"grad_norm": 73.78678894042969,
"learning_rate": 4.040620653864578e-06,
"loss": 0.3718,
"mean_token_accuracy": 0.8803731322288513,
"num_tokens": 48550455.0,
"step": 5980
},
{
"entropy": 0.985349690914154,
"epoch": 3.3794466403162056,
"grad_norm": 74.12259674072266,
"learning_rate": 4.0391690283038384e-06,
"loss": 0.3467,
"mean_token_accuracy": 0.887589693069458,
"num_tokens": 48591115.0,
"step": 5985
},
{
"entropy": 1.0588406562805175,
"epoch": 3.3822699040090343,
"grad_norm": 85.06888580322266,
"learning_rate": 4.0377166835771665e-06,
"loss": 0.3629,
"mean_token_accuracy": 0.8824954628944397,
"num_tokens": 48631699.0,
"step": 5990
},
{
"entropy": 1.0037429928779602,
"epoch": 3.3850931677018634,
"grad_norm": 77.73078918457031,
"learning_rate": 4.036263620827103e-06,
"loss": 0.354,
"mean_token_accuracy": 0.8870412349700928,
"num_tokens": 48672449.0,
"step": 5995
},
{
"entropy": 0.9787355661392212,
"epoch": 3.387916431394692,
"grad_norm": 79.08088684082031,
"learning_rate": 4.034809841196756e-06,
"loss": 0.3292,
"mean_token_accuracy": 0.8948890924453735,
"num_tokens": 48713156.0,
"step": 6000
},
{
"epoch": 3.387916431394692,
"eval_entropy": 1.23082674741745,
"eval_loss": 0.2607056200504303,
"eval_mean_token_accuracy": 0.9218904733657837,
"eval_num_tokens": 48713156.0,
"eval_runtime": 2.454,
"eval_samples_per_second": 15.893,
"eval_steps_per_second": 2.038,
"step": 6000
},
{
"entropy": 1.0537591814994811,
"epoch": 3.3907396950875213,
"grad_norm": 86.71538543701172,
"learning_rate": 4.033355345829797e-06,
"loss": 0.3366,
"mean_token_accuracy": 0.8912654995918274,
"num_tokens": 48753696.0,
"step": 6005
},
{
"entropy": 1.0819080591201782,
"epoch": 3.39356295878035,
"grad_norm": 71.52223205566406,
"learning_rate": 4.03190013587046e-06,
"loss": 0.3612,
"mean_token_accuracy": 0.8849057555198669,
"num_tokens": 48794454.0,
"step": 6010
},
{
"entropy": 1.0233477354049683,
"epoch": 3.396386222473179,
"grad_norm": 85.10845947265625,
"learning_rate": 4.030444212463542e-06,
"loss": 0.3629,
"mean_token_accuracy": 0.8837889313697815,
"num_tokens": 48835161.0,
"step": 6015
},
{
"entropy": 1.0686038136482239,
"epoch": 3.399209486166008,
"grad_norm": 72.09638214111328,
"learning_rate": 4.028987576754398e-06,
"loss": 0.3841,
"mean_token_accuracy": 0.8787945985794068,
"num_tokens": 48875996.0,
"step": 6020
},
{
"entropy": 0.9429172515869141,
"epoch": 3.402032749858837,
"grad_norm": 91.68360137939453,
"learning_rate": 4.0275302298889495e-06,
"loss": 0.3576,
"mean_token_accuracy": 0.8851586222648621,
"num_tokens": 48916665.0,
"step": 6025
},
{
"entropy": 0.8698290228843689,
"epoch": 3.4048560135516657,
"grad_norm": 59.9684944152832,
"learning_rate": 4.026072173013673e-06,
"loss": 0.3334,
"mean_token_accuracy": 0.8916546106338501,
"num_tokens": 48957359.0,
"step": 6030
},
{
"entropy": 0.9015409588813782,
"epoch": 3.407679277244495,
"grad_norm": 87.83787536621094,
"learning_rate": 4.024613407275603e-06,
"loss": 0.3439,
"mean_token_accuracy": 0.8881507873535156,
"num_tokens": 48998238.0,
"step": 6035
},
{
"entropy": 0.9665756225585938,
"epoch": 3.4105025409373235,
"grad_norm": 75.79359436035156,
"learning_rate": 4.023153933822335e-06,
"loss": 0.3213,
"mean_token_accuracy": 0.8947265267372131,
"num_tokens": 49038813.0,
"step": 6040
},
{
"entropy": 1.0080926299095154,
"epoch": 3.4133258046301522,
"grad_norm": 76.97696685791016,
"learning_rate": 4.021693753802019e-06,
"loss": 0.3388,
"mean_token_accuracy": 0.8899381160736084,
"num_tokens": 49079529.0,
"step": 6045
},
{
"entropy": 0.9414087653160095,
"epoch": 3.4161490683229814,
"grad_norm": 78.20440673828125,
"learning_rate": 4.0202328683633605e-06,
"loss": 0.3069,
"mean_token_accuracy": 0.9005836963653564,
"num_tokens": 49120374.0,
"step": 6050
},
{
"entropy": 0.9534523367881775,
"epoch": 3.4189723320158105,
"grad_norm": 67.13829040527344,
"learning_rate": 4.018771278655622e-06,
"loss": 0.3221,
"mean_token_accuracy": 0.8969206929206848,
"num_tokens": 49161020.0,
"step": 6055
},
{
"entropy": 1.1293304204940795,
"epoch": 3.4217955957086392,
"grad_norm": 81.1747055053711,
"learning_rate": 4.017308985828617e-06,
"loss": 0.3829,
"mean_token_accuracy": 0.876332950592041,
"num_tokens": 49201345.0,
"step": 6060
},
{
"entropy": 0.9646120667457581,
"epoch": 3.424618859401468,
"grad_norm": 78.29570770263672,
"learning_rate": 4.015845991032716e-06,
"loss": 0.3259,
"mean_token_accuracy": 0.8951454758644104,
"num_tokens": 49241850.0,
"step": 6065
},
{
"entropy": 1.0473296403884889,
"epoch": 3.427442123094297,
"grad_norm": 76.8552474975586,
"learning_rate": 4.014382295418838e-06,
"loss": 0.3649,
"mean_token_accuracy": 0.8825054168701172,
"num_tokens": 49282433.0,
"step": 6070
},
{
"entropy": 0.9831879496574402,
"epoch": 3.430265386787126,
"grad_norm": 75.86959838867188,
"learning_rate": 4.012917900138457e-06,
"loss": 0.3619,
"mean_token_accuracy": 0.8826367855072021,
"num_tokens": 49323030.0,
"step": 6075
},
{
"entropy": 1.117681658267975,
"epoch": 3.433088650479955,
"grad_norm": 75.12294006347656,
"learning_rate": 4.011452806343593e-06,
"loss": 0.372,
"mean_token_accuracy": 0.88106929063797,
"num_tokens": 49363673.0,
"step": 6080
},
{
"entropy": 0.9735101342201233,
"epoch": 3.4359119141727836,
"grad_norm": 65.7935562133789,
"learning_rate": 4.00998701518682e-06,
"loss": 0.3439,
"mean_token_accuracy": 0.8863797664642334,
"num_tokens": 49404345.0,
"step": 6085
},
{
"entropy": 0.9507794737815857,
"epoch": 3.438735177865613,
"grad_norm": 90.31965637207031,
"learning_rate": 4.008520527821257e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8880294799804688,
"num_tokens": 49444980.0,
"step": 6090
},
{
"entropy": 1.0107581615447998,
"epoch": 3.4415584415584415,
"grad_norm": 74.97982025146484,
"learning_rate": 4.007053345400572e-06,
"loss": 0.3233,
"mean_token_accuracy": 0.8946360349655151,
"num_tokens": 49485573.0,
"step": 6095
},
{
"entropy": 0.9869904041290283,
"epoch": 3.4443817052512706,
"grad_norm": 73.46428680419922,
"learning_rate": 4.0055854690789815e-06,
"loss": 0.3513,
"mean_token_accuracy": 0.8861337780952454,
"num_tokens": 49526252.0,
"step": 6100
},
{
"entropy": 0.9334820985794068,
"epoch": 3.4472049689440993,
"grad_norm": 75.04248809814453,
"learning_rate": 4.0041169000112454e-06,
"loss": 0.3368,
"mean_token_accuracy": 0.8914402961730957,
"num_tokens": 49566738.0,
"step": 6105
},
{
"entropy": 1.0121949315071106,
"epoch": 3.4500282326369285,
"grad_norm": 86.6436767578125,
"learning_rate": 4.00264763935267e-06,
"loss": 0.3305,
"mean_token_accuracy": 0.8928271055221557,
"num_tokens": 49607487.0,
"step": 6110
},
{
"entropy": 1.1302125215530396,
"epoch": 3.452851496329757,
"grad_norm": 65.75306701660156,
"learning_rate": 4.001177688259105e-06,
"loss": 0.339,
"mean_token_accuracy": 0.8919134378433228,
"num_tokens": 49648166.0,
"step": 6115
},
{
"entropy": 1.0888399124145507,
"epoch": 3.4556747600225863,
"grad_norm": 69.06067657470703,
"learning_rate": 3.999707047886944e-06,
"loss": 0.3429,
"mean_token_accuracy": 0.8888397812843323,
"num_tokens": 49688823.0,
"step": 6120
},
{
"entropy": 1.025403904914856,
"epoch": 3.458498023715415,
"grad_norm": 85.14940643310547,
"learning_rate": 3.998235719393121e-06,
"loss": 0.3627,
"mean_token_accuracy": 0.8827525019645691,
"num_tokens": 49728661.0,
"step": 6125
},
{
"entropy": 0.9839401364326477,
"epoch": 3.4613212874082437,
"grad_norm": 71.55946350097656,
"learning_rate": 3.996763703935114e-06,
"loss": 0.3503,
"mean_token_accuracy": 0.8873334646224975,
"num_tokens": 49769275.0,
"step": 6130
},
{
"entropy": 1.0336027026176453,
"epoch": 3.464144551101073,
"grad_norm": 78.25820922851562,
"learning_rate": 3.995291002670941e-06,
"loss": 0.3675,
"mean_token_accuracy": 0.8819956421852112,
"num_tokens": 49810066.0,
"step": 6135
},
{
"entropy": 1.0026185154914855,
"epoch": 3.4669678147939016,
"grad_norm": 85.74049377441406,
"learning_rate": 3.993817616759155e-06,
"loss": 0.3653,
"mean_token_accuracy": 0.882231068611145,
"num_tokens": 49850531.0,
"step": 6140
},
{
"entropy": 1.05128253698349,
"epoch": 3.4697910784867307,
"grad_norm": 68.15999603271484,
"learning_rate": 3.992343547358854e-06,
"loss": 0.3311,
"mean_token_accuracy": 0.8922553777694702,
"num_tokens": 49891169.0,
"step": 6145
},
{
"entropy": 1.0605325102806091,
"epoch": 3.4726143421795594,
"grad_norm": 90.25965881347656,
"learning_rate": 3.990868795629671e-06,
"loss": 0.3493,
"mean_token_accuracy": 0.8885184526443481,
"num_tokens": 49931622.0,
"step": 6150
},
{
"entropy": 0.9792709231376648,
"epoch": 3.4754376058723886,
"grad_norm": 78.75196838378906,
"learning_rate": 3.989393362731775e-06,
"loss": 0.3647,
"mean_token_accuracy": 0.8825397968292237,
"num_tokens": 49972332.0,
"step": 6155
},
{
"entropy": 0.8609856843948365,
"epoch": 3.4782608695652173,
"grad_norm": 74.58406066894531,
"learning_rate": 3.987917249825872e-06,
"loss": 0.3152,
"mean_token_accuracy": 0.8976006627082824,
"num_tokens": 50012990.0,
"step": 6160
},
{
"entropy": 0.9564788579940796,
"epoch": 3.4810841332580464,
"grad_norm": 79.9758529663086,
"learning_rate": 3.986440458073202e-06,
"loss": 0.362,
"mean_token_accuracy": 0.8848256587982177,
"num_tokens": 50053801.0,
"step": 6165
},
{
"entropy": 0.9849433898925781,
"epoch": 3.483907396950875,
"grad_norm": 88.46871948242188,
"learning_rate": 3.98496298863554e-06,
"loss": 0.3297,
"mean_token_accuracy": 0.8946364402770997,
"num_tokens": 50094571.0,
"step": 6170
},
{
"entropy": 0.9942429542541504,
"epoch": 3.4867306606437043,
"grad_norm": 65.55095672607422,
"learning_rate": 3.983484842675194e-06,
"loss": 0.3493,
"mean_token_accuracy": 0.8884786128997803,
"num_tokens": 50135471.0,
"step": 6175
},
{
"entropy": 1.002113664150238,
"epoch": 3.489553924336533,
"grad_norm": 66.22470092773438,
"learning_rate": 3.982006021355002e-06,
"loss": 0.3722,
"mean_token_accuracy": 0.8818007349967957,
"num_tokens": 50176220.0,
"step": 6180
},
{
"entropy": 1.0745681047439575,
"epoch": 3.492377188029362,
"grad_norm": 94.96135711669922,
"learning_rate": 3.980526525838337e-06,
"loss": 0.3788,
"mean_token_accuracy": 0.8774041533470154,
"num_tokens": 50216826.0,
"step": 6185
},
{
"entropy": 1.0016021370887755,
"epoch": 3.495200451722191,
"grad_norm": 84.10227966308594,
"learning_rate": 3.979046357289101e-06,
"loss": 0.346,
"mean_token_accuracy": 0.888908052444458,
"num_tokens": 50257345.0,
"step": 6190
},
{
"entropy": 1.0220289945602417,
"epoch": 3.4980237154150196,
"grad_norm": 80.87281036376953,
"learning_rate": 3.977565516871723e-06,
"loss": 0.3422,
"mean_token_accuracy": 0.8883512258529663,
"num_tokens": 50298037.0,
"step": 6195
},
{
"entropy": 0.958783769607544,
"epoch": 3.5008469791078487,
"grad_norm": 88.97132110595703,
"learning_rate": 3.976084005751164e-06,
"loss": 0.3575,
"mean_token_accuracy": 0.8871893882751465,
"num_tokens": 50338663.0,
"step": 6200
},
{
"entropy": 1.02311053276062,
"epoch": 3.503670242800678,
"grad_norm": 76.60072326660156,
"learning_rate": 3.974601825092911e-06,
"loss": 0.3783,
"mean_token_accuracy": 0.8799753189086914,
"num_tokens": 50379319.0,
"step": 6205
},
{
"entropy": 1.044371199607849,
"epoch": 3.5064935064935066,
"grad_norm": 87.33512115478516,
"learning_rate": 3.973118976062978e-06,
"loss": 0.3701,
"mean_token_accuracy": 0.8824043035507202,
"num_tokens": 50420067.0,
"step": 6210
},
{
"entropy": 0.9192242622375488,
"epoch": 3.5093167701863353,
"grad_norm": 73.0194091796875,
"learning_rate": 3.971635459827905e-06,
"loss": 0.3236,
"mean_token_accuracy": 0.893168592453003,
"num_tokens": 50460623.0,
"step": 6215
},
{
"entropy": 0.9499261617660523,
"epoch": 3.5121400338791644,
"grad_norm": 89.47998809814453,
"learning_rate": 3.970151277554756e-06,
"loss": 0.3667,
"mean_token_accuracy": 0.8812254905700684,
"num_tokens": 50501429.0,
"step": 6220
},
{
"entropy": 0.9651218175888061,
"epoch": 3.514963297571993,
"grad_norm": 73.77303314208984,
"learning_rate": 3.96866643041112e-06,
"loss": 0.3355,
"mean_token_accuracy": 0.8922844529151917,
"num_tokens": 50542128.0,
"step": 6225
},
{
"entropy": 1.00862113237381,
"epoch": 3.5177865612648223,
"grad_norm": 82.71814727783203,
"learning_rate": 3.967180919565108e-06,
"loss": 0.3576,
"mean_token_accuracy": 0.8849758267402649,
"num_tokens": 50582754.0,
"step": 6230
},
{
"entropy": 1.0719767332077026,
"epoch": 3.520609824957651,
"grad_norm": 81.17707061767578,
"learning_rate": 3.965694746185355e-06,
"loss": 0.3651,
"mean_token_accuracy": 0.8814258694648742,
"num_tokens": 50623547.0,
"step": 6235
},
{
"entropy": 0.9911883711814881,
"epoch": 3.52343308865048,
"grad_norm": 83.75228881835938,
"learning_rate": 3.964207911441015e-06,
"loss": 0.3755,
"mean_token_accuracy": 0.8790446758270264,
"num_tokens": 50664345.0,
"step": 6240
},
{
"entropy": 0.9944733381271362,
"epoch": 3.526256352343309,
"grad_norm": 122.08747863769531,
"learning_rate": 3.962720416501763e-06,
"loss": 0.3689,
"mean_token_accuracy": 0.8826006412506103,
"num_tokens": 50704948.0,
"step": 6245
},
{
"entropy": 1.0450132608413696,
"epoch": 3.529079616036138,
"grad_norm": 84.4200668334961,
"learning_rate": 3.961232262537795e-06,
"loss": 0.3687,
"mean_token_accuracy": 0.8846492290496826,
"num_tokens": 50745604.0,
"step": 6250
},
{
"entropy": 0.9697313427925109,
"epoch": 3.5319028797289667,
"grad_norm": 73.62430572509766,
"learning_rate": 3.959743450719824e-06,
"loss": 0.3654,
"mean_token_accuracy": 0.8818155765533447,
"num_tokens": 50786209.0,
"step": 6255
},
{
"entropy": 0.973599910736084,
"epoch": 3.5347261434217954,
"grad_norm": 87.3067855834961,
"learning_rate": 3.958253982219079e-06,
"loss": 0.3534,
"mean_token_accuracy": 0.8868489623069763,
"num_tokens": 50826700.0,
"step": 6260
},
{
"entropy": 0.994738757610321,
"epoch": 3.5375494071146245,
"grad_norm": 71.6942138671875,
"learning_rate": 3.956763858207308e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8836980581283569,
"num_tokens": 50867329.0,
"step": 6265
},
{
"entropy": 1.0417511463165283,
"epoch": 3.5403726708074537,
"grad_norm": 76.70450592041016,
"learning_rate": 3.955273079856773e-06,
"loss": 0.3757,
"mean_token_accuracy": 0.8782451033592225,
"num_tokens": 50908046.0,
"step": 6270
},
{
"entropy": 0.98163822889328,
"epoch": 3.5431959345002824,
"grad_norm": 66.1574478149414,
"learning_rate": 3.953781648340254e-06,
"loss": 0.3648,
"mean_token_accuracy": 0.8820971846580505,
"num_tokens": 50948751.0,
"step": 6275
},
{
"entropy": 1.049662470817566,
"epoch": 3.546019198193111,
"grad_norm": 72.4782485961914,
"learning_rate": 3.95228956483104e-06,
"loss": 0.3379,
"mean_token_accuracy": 0.8903756499290466,
"num_tokens": 50989488.0,
"step": 6280
},
{
"entropy": 1.001114797592163,
"epoch": 3.54884246188594,
"grad_norm": 67.1705551147461,
"learning_rate": 3.950796830502938e-06,
"loss": 0.3485,
"mean_token_accuracy": 0.8888326168060303,
"num_tokens": 51030140.0,
"step": 6285
},
{
"entropy": 1.0564101696014405,
"epoch": 3.551665725578769,
"grad_norm": 71.83964538574219,
"learning_rate": 3.949303446530262e-06,
"loss": 0.3527,
"mean_token_accuracy": 0.8852935433387756,
"num_tokens": 51070831.0,
"step": 6290
},
{
"entropy": 1.024319851398468,
"epoch": 3.554488989271598,
"grad_norm": 80.63823699951172,
"learning_rate": 3.94780941408784e-06,
"loss": 0.3723,
"mean_token_accuracy": 0.8803096055984497,
"num_tokens": 51111402.0,
"step": 6295
},
{
"entropy": 1.0112637758255005,
"epoch": 3.5573122529644268,
"grad_norm": 65.74320983886719,
"learning_rate": 3.94631473435101e-06,
"loss": 0.3562,
"mean_token_accuracy": 0.885633933544159,
"num_tokens": 51151931.0,
"step": 6300
},
{
"entropy": 1.126786994934082,
"epoch": 3.560135516657256,
"grad_norm": 78.12380981445312,
"learning_rate": 3.9448194084956185e-06,
"loss": 0.3561,
"mean_token_accuracy": 0.8849801540374755,
"num_tokens": 51192618.0,
"step": 6305
},
{
"entropy": 0.9513525009155274,
"epoch": 3.5629587803500846,
"grad_norm": 75.2828598022461,
"learning_rate": 3.943323437698021e-06,
"loss": 0.3609,
"mean_token_accuracy": 0.8847909450531006,
"num_tokens": 51233084.0,
"step": 6310
},
{
"entropy": 1.0396467089653014,
"epoch": 3.5657820440429138,
"grad_norm": 77.07085418701172,
"learning_rate": 3.941826823135079e-06,
"loss": 0.3533,
"mean_token_accuracy": 0.8857412457466125,
"num_tokens": 51273692.0,
"step": 6315
},
{
"entropy": 1.0362180352211,
"epoch": 3.5686053077357425,
"grad_norm": 85.95170593261719,
"learning_rate": 3.940329565984165e-06,
"loss": 0.3769,
"mean_token_accuracy": 0.8745699167251587,
"num_tokens": 51314442.0,
"step": 6320
},
{
"entropy": 0.92795330286026,
"epoch": 3.571428571428571,
"grad_norm": 86.3418960571289,
"learning_rate": 3.938831667423149e-06,
"loss": 0.3307,
"mean_token_accuracy": 0.8915099501609802,
"num_tokens": 51355241.0,
"step": 6325
},
{
"entropy": 0.9906257629394531,
"epoch": 3.5742518351214003,
"grad_norm": 81.33384704589844,
"learning_rate": 3.937333128630411e-06,
"loss": 0.3599,
"mean_token_accuracy": 0.884676706790924,
"num_tokens": 51395216.0,
"step": 6330
},
{
"entropy": 1.071572208404541,
"epoch": 3.5770750988142295,
"grad_norm": 78.2898941040039,
"learning_rate": 3.9358339507848355e-06,
"loss": 0.4006,
"mean_token_accuracy": 0.8738093256950379,
"num_tokens": 51435914.0,
"step": 6335
},
{
"entropy": 0.9324329853057861,
"epoch": 3.579898362507058,
"grad_norm": 67.50933837890625,
"learning_rate": 3.934334135065807e-06,
"loss": 0.3637,
"mean_token_accuracy": 0.8848002195358277,
"num_tokens": 51476772.0,
"step": 6340
},
{
"entropy": 0.939254081249237,
"epoch": 3.582721626199887,
"grad_norm": 77.13788604736328,
"learning_rate": 3.932833682653212e-06,
"loss": 0.3532,
"mean_token_accuracy": 0.8868015527725219,
"num_tokens": 51517491.0,
"step": 6345
},
{
"entropy": 1.0642353653907777,
"epoch": 3.585544889892716,
"grad_norm": 82.49517822265625,
"learning_rate": 3.93133259472744e-06,
"loss": 0.3904,
"mean_token_accuracy": 0.8745889067649841,
"num_tokens": 51558219.0,
"step": 6350
},
{
"entropy": 0.9392816543579101,
"epoch": 3.5883681535855447,
"grad_norm": 80.28068542480469,
"learning_rate": 3.929830872469378e-06,
"loss": 0.3242,
"mean_token_accuracy": 0.8940078973770141,
"num_tokens": 51598791.0,
"step": 6355
},
{
"entropy": 0.9706831693649292,
"epoch": 3.591191417278374,
"grad_norm": 81.84841918945312,
"learning_rate": 3.928328517060412e-06,
"loss": 0.3714,
"mean_token_accuracy": 0.8811930298805237,
"num_tokens": 51639315.0,
"step": 6360
},
{
"entropy": 1.0625698804855346,
"epoch": 3.5940146809712026,
"grad_norm": 89.82206726074219,
"learning_rate": 3.926825529682431e-06,
"loss": 0.3758,
"mean_token_accuracy": 0.8791163086891174,
"num_tokens": 51679609.0,
"step": 6365
},
{
"entropy": 1.0034614324569702,
"epoch": 3.5968379446640317,
"grad_norm": 71.02519226074219,
"learning_rate": 3.925321911517814e-06,
"loss": 0.3403,
"mean_token_accuracy": 0.8907212376594543,
"num_tokens": 51720208.0,
"step": 6370
},
{
"entropy": 1.0046752095222473,
"epoch": 3.5996612083568604,
"grad_norm": 95.1485595703125,
"learning_rate": 3.92381766374944e-06,
"loss": 0.3617,
"mean_token_accuracy": 0.8833420157432557,
"num_tokens": 51760749.0,
"step": 6375
},
{
"entropy": 1.0107667088508605,
"epoch": 3.6024844720496896,
"grad_norm": 79.1152572631836,
"learning_rate": 3.922312787560684e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8931966900825501,
"num_tokens": 51801541.0,
"step": 6380
},
{
"entropy": 1.0665692925453185,
"epoch": 3.6053077357425183,
"grad_norm": 83.31388092041016,
"learning_rate": 3.920807284135413e-06,
"loss": 0.3908,
"mean_token_accuracy": 0.8754578113555909,
"num_tokens": 51842293.0,
"step": 6385
},
{
"entropy": 1.0210911154747009,
"epoch": 3.608130999435347,
"grad_norm": 79.14932250976562,
"learning_rate": 3.919301154657989e-06,
"loss": 0.359,
"mean_token_accuracy": 0.881727647781372,
"num_tokens": 51882745.0,
"step": 6390
},
{
"entropy": 1.031436562538147,
"epoch": 3.610954263128176,
"grad_norm": 81.52947998046875,
"learning_rate": 3.917794400313268e-06,
"loss": 0.3432,
"mean_token_accuracy": 0.8924546480178833,
"num_tokens": 51923109.0,
"step": 6395
},
{
"entropy": 1.0299935698509217,
"epoch": 3.6137775268210053,
"grad_norm": 77.39228820800781,
"learning_rate": 3.916287022286593e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.8783790588378906,
"num_tokens": 51963770.0,
"step": 6400
},
{
"entropy": 1.0109672904014588,
"epoch": 3.616600790513834,
"grad_norm": 78.6328125,
"learning_rate": 3.914779021763803e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8864980340003967,
"num_tokens": 52004112.0,
"step": 6405
},
{
"entropy": 0.9662197709083558,
"epoch": 3.6194240542066627,
"grad_norm": 76.94004821777344,
"learning_rate": 3.913270399931223e-06,
"loss": 0.347,
"mean_token_accuracy": 0.8888106346130371,
"num_tokens": 52044761.0,
"step": 6410
},
{
"entropy": 0.8939793229103088,
"epoch": 3.622247317899492,
"grad_norm": 78.16267395019531,
"learning_rate": 3.911761157975667e-06,
"loss": 0.3457,
"mean_token_accuracy": 0.8896233439445496,
"num_tokens": 52085606.0,
"step": 6415
},
{
"entropy": 0.898774790763855,
"epoch": 3.625070581592321,
"grad_norm": 75.20874786376953,
"learning_rate": 3.910251297084438e-06,
"loss": 0.3809,
"mean_token_accuracy": 0.878812849521637,
"num_tokens": 52126164.0,
"step": 6420
},
{
"entropy": 0.9831461429595947,
"epoch": 3.6278938452851497,
"grad_norm": 89.09337615966797,
"learning_rate": 3.908740818445327e-06,
"loss": 0.3808,
"mean_token_accuracy": 0.8773584604263306,
"num_tokens": 52166919.0,
"step": 6425
},
{
"entropy": 1.0048586010932923,
"epoch": 3.6307171089779784,
"grad_norm": 79.22754669189453,
"learning_rate": 3.907229723246607e-06,
"loss": 0.3643,
"mean_token_accuracy": 0.8850665092468262,
"num_tokens": 52207627.0,
"step": 6430
},
{
"entropy": 1.0400293827056886,
"epoch": 3.6335403726708075,
"grad_norm": 88.93228912353516,
"learning_rate": 3.905718012677042e-06,
"loss": 0.349,
"mean_token_accuracy": 0.8870396375656128,
"num_tokens": 52248312.0,
"step": 6435
},
{
"entropy": 1.1385428428649902,
"epoch": 3.6363636363636362,
"grad_norm": 78.16153717041016,
"learning_rate": 3.9042056879258754e-06,
"loss": 0.3662,
"mean_token_accuracy": 0.8810193061828613,
"num_tokens": 52288933.0,
"step": 6440
},
{
"entropy": 1.0581382632255554,
"epoch": 3.6391869000564654,
"grad_norm": 94.95269775390625,
"learning_rate": 3.902692750182835e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.8892552137374878,
"num_tokens": 52329843.0,
"step": 6445
},
{
"entropy": 0.9416268587112426,
"epoch": 3.642010163749294,
"grad_norm": 71.47803497314453,
"learning_rate": 3.901179200638131e-06,
"loss": 0.3358,
"mean_token_accuracy": 0.8925686478614807,
"num_tokens": 52370377.0,
"step": 6450
},
{
"entropy": 0.9823377370834351,
"epoch": 3.6448334274421232,
"grad_norm": 66.98043823242188,
"learning_rate": 3.899665040482453e-06,
"loss": 0.3524,
"mean_token_accuracy": 0.8869312644004822,
"num_tokens": 52411012.0,
"step": 6455
},
{
"entropy": 1.073030376434326,
"epoch": 3.647656691134952,
"grad_norm": 77.24723815917969,
"learning_rate": 3.898150270906977e-06,
"loss": 0.3877,
"mean_token_accuracy": 0.8758072853088379,
"num_tokens": 52451634.0,
"step": 6460
},
{
"entropy": 0.9715201020240783,
"epoch": 3.650479954827781,
"grad_norm": 88.40100860595703,
"learning_rate": 3.896634893103351e-06,
"loss": 0.3779,
"mean_token_accuracy": 0.8794341802597045,
"num_tokens": 52492298.0,
"step": 6465
},
{
"entropy": 0.9602640748023987,
"epoch": 3.65330321852061,
"grad_norm": 74.9549560546875,
"learning_rate": 3.895118908263706e-06,
"loss": 0.355,
"mean_token_accuracy": 0.8859039068222045,
"num_tokens": 52532916.0,
"step": 6470
},
{
"entropy": 0.9144500970840455,
"epoch": 3.6561264822134385,
"grad_norm": 83.213623046875,
"learning_rate": 3.893602317580649e-06,
"loss": 0.3401,
"mean_token_accuracy": 0.8914003133773803,
"num_tokens": 52573512.0,
"step": 6475
},
{
"entropy": 1.025104033946991,
"epoch": 3.6589497459062676,
"grad_norm": 89.1195068359375,
"learning_rate": 3.892085122247263e-06,
"loss": 0.376,
"mean_token_accuracy": 0.8797497034072876,
"num_tokens": 52614126.0,
"step": 6480
},
{
"entropy": 1.0915439128875732,
"epoch": 3.661773009599097,
"grad_norm": 89.44082641601562,
"learning_rate": 3.89056732345711e-06,
"loss": 0.3694,
"mean_token_accuracy": 0.8832651495933532,
"num_tokens": 52654968.0,
"step": 6485
},
{
"entropy": 0.9922754287719726,
"epoch": 3.6645962732919255,
"grad_norm": 84.6336669921875,
"learning_rate": 3.889048922404222e-06,
"loss": 0.3486,
"mean_token_accuracy": 0.8878530979156494,
"num_tokens": 52695576.0,
"step": 6490
},
{
"entropy": 0.9639823079109192,
"epoch": 3.667419536984754,
"grad_norm": 75.57162475585938,
"learning_rate": 3.887529920283108e-06,
"loss": 0.3459,
"mean_token_accuracy": 0.8889827013015748,
"num_tokens": 52736251.0,
"step": 6495
},
{
"entropy": 1.0114650011062623,
"epoch": 3.6702428006775834,
"grad_norm": 83.69818878173828,
"learning_rate": 3.886010318288748e-06,
"loss": 0.3659,
"mean_token_accuracy": 0.8834073305130005,
"num_tokens": 52777101.0,
"step": 6500
},
{
"epoch": 3.6702428006775834,
"eval_entropy": 1.2794333457946778,
"eval_loss": 0.2388431578874588,
"eval_mean_token_accuracy": 0.9266616106033325,
"eval_num_tokens": 52777101.0,
"eval_runtime": 2.4569,
"eval_samples_per_second": 15.873,
"eval_steps_per_second": 2.035,
"step": 6500
},
{
"entropy": 1.0842282056808472,
"epoch": 3.673066064370412,
"grad_norm": 72.50702667236328,
"learning_rate": 3.884490117616596e-06,
"loss": 0.3576,
"mean_token_accuracy": 0.8858759045600891,
"num_tokens": 52817788.0,
"step": 6505
},
{
"entropy": 1.0758524060249328,
"epoch": 3.675889328063241,
"grad_norm": 82.36662292480469,
"learning_rate": 3.882969319462576e-06,
"loss": 0.3628,
"mean_token_accuracy": 0.8835553526878357,
"num_tokens": 52858373.0,
"step": 6510
},
{
"entropy": 1.006167435646057,
"epoch": 3.67871259175607,
"grad_norm": 80.42717742919922,
"learning_rate": 3.8814479250230816e-06,
"loss": 0.3533,
"mean_token_accuracy": 0.8864710927009583,
"num_tokens": 52898795.0,
"step": 6515
},
{
"entropy": 0.9807133316993714,
"epoch": 3.681535855448899,
"grad_norm": 81.09001922607422,
"learning_rate": 3.879925935494974e-06,
"loss": 0.3699,
"mean_token_accuracy": 0.882834541797638,
"num_tokens": 52939434.0,
"step": 6520
},
{
"entropy": 0.9380959749221802,
"epoch": 3.6843591191417278,
"grad_norm": 71.36829376220703,
"learning_rate": 3.878403352075588e-06,
"loss": 0.3613,
"mean_token_accuracy": 0.8826104998588562,
"num_tokens": 52980014.0,
"step": 6525
},
{
"entropy": 0.9877384066581726,
"epoch": 3.687182382834557,
"grad_norm": 66.0042953491211,
"learning_rate": 3.87688017596272e-06,
"loss": 0.3426,
"mean_token_accuracy": 0.8897479772567749,
"num_tokens": 53020720.0,
"step": 6530
},
{
"entropy": 1.0412828922271729,
"epoch": 3.6900056465273856,
"grad_norm": 79.44898986816406,
"learning_rate": 3.875356408354633e-06,
"loss": 0.3771,
"mean_token_accuracy": 0.8798229336738587,
"num_tokens": 53061436.0,
"step": 6535
},
{
"entropy": 0.9982946991920472,
"epoch": 3.6928289102202143,
"grad_norm": 84.61109161376953,
"learning_rate": 3.873832050450058e-06,
"loss": 0.3562,
"mean_token_accuracy": 0.8826686978340149,
"num_tokens": 53102129.0,
"step": 6540
},
{
"entropy": 0.900448226928711,
"epoch": 3.6956521739130435,
"grad_norm": 67.6589584350586,
"learning_rate": 3.87230710344819e-06,
"loss": 0.3474,
"mean_token_accuracy": 0.8889101147651672,
"num_tokens": 53142665.0,
"step": 6545
},
{
"entropy": 0.9328884720802307,
"epoch": 3.6984754376058726,
"grad_norm": 69.66155242919922,
"learning_rate": 3.870781568548686e-06,
"loss": 0.3825,
"mean_token_accuracy": 0.879358434677124,
"num_tokens": 53183474.0,
"step": 6550
},
{
"entropy": 0.9914533376693726,
"epoch": 3.7012987012987013,
"grad_norm": 73.55729675292969,
"learning_rate": 3.869255446951668e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.8832436203956604,
"num_tokens": 53224148.0,
"step": 6555
},
{
"entropy": 0.9467785477638244,
"epoch": 3.70412196499153,
"grad_norm": 73.56095886230469,
"learning_rate": 3.8677287398577145e-06,
"loss": 0.3574,
"mean_token_accuracy": 0.8833266258239746,
"num_tokens": 53264653.0,
"step": 6560
},
{
"entropy": 0.9588563442230225,
"epoch": 3.706945228684359,
"grad_norm": 68.61029815673828,
"learning_rate": 3.86620144846787e-06,
"loss": 0.3552,
"mean_token_accuracy": 0.8847254753112793,
"num_tokens": 53305136.0,
"step": 6565
},
{
"entropy": 1.008836305141449,
"epoch": 3.709768492377188,
"grad_norm": 85.92555236816406,
"learning_rate": 3.8646735739836375e-06,
"loss": 0.3376,
"mean_token_accuracy": 0.8911909461021423,
"num_tokens": 53345833.0,
"step": 6570
},
{
"entropy": 1.0201958298683167,
"epoch": 3.712591756070017,
"grad_norm": 71.72399139404297,
"learning_rate": 3.863145117606976e-06,
"loss": 0.3754,
"mean_token_accuracy": 0.8834819436073303,
"num_tokens": 53386385.0,
"step": 6575
},
{
"entropy": 1.042575967311859,
"epoch": 3.7154150197628457,
"grad_norm": 82.75652313232422,
"learning_rate": 3.861616080540303e-06,
"loss": 0.3696,
"mean_token_accuracy": 0.8814401984214782,
"num_tokens": 53427047.0,
"step": 6580
},
{
"entropy": 1.0726915359497071,
"epoch": 3.718238283455675,
"grad_norm": 70.38341522216797,
"learning_rate": 3.860086463986496e-06,
"loss": 0.3413,
"mean_token_accuracy": 0.8903828144073487,
"num_tokens": 53467848.0,
"step": 6585
},
{
"entropy": 1.0652672290802,
"epoch": 3.7210615471485036,
"grad_norm": 74.48110961914062,
"learning_rate": 3.858556269148885e-06,
"loss": 0.3531,
"mean_token_accuracy": 0.8858086943626404,
"num_tokens": 53508723.0,
"step": 6590
},
{
"entropy": 1.1102969884872436,
"epoch": 3.7238848108413327,
"grad_norm": 93.84978485107422,
"learning_rate": 3.857025497231258e-06,
"loss": 0.3513,
"mean_token_accuracy": 0.8857390761375428,
"num_tokens": 53549279.0,
"step": 6595
},
{
"entropy": 0.9921191811561585,
"epoch": 3.7267080745341614,
"grad_norm": 66.51078033447266,
"learning_rate": 3.855494149437853e-06,
"loss": 0.3423,
"mean_token_accuracy": 0.8900970578193664,
"num_tokens": 53589987.0,
"step": 6600
},
{
"entropy": 0.9519596576690674,
"epoch": 3.7295313382269906,
"grad_norm": 85.44746398925781,
"learning_rate": 3.853962226973364e-06,
"loss": 0.3494,
"mean_token_accuracy": 0.8887536764144898,
"num_tokens": 53630587.0,
"step": 6605
},
{
"entropy": 0.9617591619491577,
"epoch": 3.7323546019198193,
"grad_norm": 76.92646026611328,
"learning_rate": 3.852429731042936e-06,
"loss": 0.3437,
"mean_token_accuracy": 0.8902799129486084,
"num_tokens": 53671322.0,
"step": 6610
},
{
"entropy": 1.070969033241272,
"epoch": 3.7351778656126484,
"grad_norm": 100.68677520751953,
"learning_rate": 3.850896662852165e-06,
"loss": 0.3558,
"mean_token_accuracy": 0.8854732871055603,
"num_tokens": 53711800.0,
"step": 6615
},
{
"entropy": 1.0809642195701599,
"epoch": 3.738001129305477,
"grad_norm": 75.52942657470703,
"learning_rate": 3.8493630236070975e-06,
"loss": 0.366,
"mean_token_accuracy": 0.8825573921203613,
"num_tokens": 53752357.0,
"step": 6620
},
{
"entropy": 1.0825310945510864,
"epoch": 3.740824392998306,
"grad_norm": 79.78958892822266,
"learning_rate": 3.847828814514231e-06,
"loss": 0.3664,
"mean_token_accuracy": 0.8812973737716675,
"num_tokens": 53793065.0,
"step": 6625
},
{
"entropy": 0.9908598780632019,
"epoch": 3.743647656691135,
"grad_norm": 77.1722412109375,
"learning_rate": 3.846294036780508e-06,
"loss": 0.3412,
"mean_token_accuracy": 0.8887288570404053,
"num_tokens": 53833798.0,
"step": 6630
},
{
"entropy": 1.1215206027030944,
"epoch": 3.746470920383964,
"grad_norm": 78.24017333984375,
"learning_rate": 3.84475869161332e-06,
"loss": 0.3539,
"mean_token_accuracy": 0.8852620720863342,
"num_tokens": 53874604.0,
"step": 6635
},
{
"entropy": 1.0309231758117676,
"epoch": 3.749294184076793,
"grad_norm": 76.8600082397461,
"learning_rate": 3.8432227802205055e-06,
"loss": 0.34,
"mean_token_accuracy": 0.8884881615638733,
"num_tokens": 53915339.0,
"step": 6640
},
{
"entropy": 1.0157047390937806,
"epoch": 3.7521174477696215,
"grad_norm": 67.16350555419922,
"learning_rate": 3.841686303810347e-06,
"loss": 0.3666,
"mean_token_accuracy": 0.8807543754577637,
"num_tokens": 53956009.0,
"step": 6645
},
{
"entropy": 1.0168555617332458,
"epoch": 3.7549407114624507,
"grad_norm": 71.06246948242188,
"learning_rate": 3.840149263591573e-06,
"loss": 0.3339,
"mean_token_accuracy": 0.8932982087135315,
"num_tokens": 53996726.0,
"step": 6650
},
{
"entropy": 1.0992830872535706,
"epoch": 3.7577639751552794,
"grad_norm": 76.54785919189453,
"learning_rate": 3.838611660773355e-06,
"loss": 0.3729,
"mean_token_accuracy": 0.87835294008255,
"num_tokens": 54036856.0,
"step": 6655
},
{
"entropy": 1.0346566915512085,
"epoch": 3.7605872388481085,
"grad_norm": 82.71453857421875,
"learning_rate": 3.837073496565307e-06,
"loss": 0.3762,
"mean_token_accuracy": 0.8808693885803223,
"num_tokens": 54077601.0,
"step": 6660
},
{
"entropy": 1.0158017158508301,
"epoch": 3.7634105025409372,
"grad_norm": 72.65914916992188,
"learning_rate": 3.8355347721774825e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.8884905695915222,
"num_tokens": 54117960.0,
"step": 6665
},
{
"entropy": 1.0506127595901489,
"epoch": 3.7662337662337664,
"grad_norm": 73.87583923339844,
"learning_rate": 3.83399548882038e-06,
"loss": 0.3773,
"mean_token_accuracy": 0.8785854339599609,
"num_tokens": 54158767.0,
"step": 6670
},
{
"entropy": 1.0176648139953612,
"epoch": 3.769057029926595,
"grad_norm": 83.29283905029297,
"learning_rate": 3.832455647704934e-06,
"loss": 0.3899,
"mean_token_accuracy": 0.8775892972946167,
"num_tokens": 54199377.0,
"step": 6675
},
{
"entropy": 0.9590134263038635,
"epoch": 3.7718802936194242,
"grad_norm": 68.5479507446289,
"learning_rate": 3.83091525004252e-06,
"loss": 0.3286,
"mean_token_accuracy": 0.892620575428009,
"num_tokens": 54239593.0,
"step": 6680
},
{
"entropy": 1.0638235330581665,
"epoch": 3.774703557312253,
"grad_norm": 69.84130859375,
"learning_rate": 3.8293742970449516e-06,
"loss": 0.3703,
"mean_token_accuracy": 0.8817868709564209,
"num_tokens": 54280310.0,
"step": 6685
},
{
"entropy": 1.046445870399475,
"epoch": 3.7775268210050816,
"grad_norm": 76.98202514648438,
"learning_rate": 3.827832789924476e-06,
"loss": 0.3619,
"mean_token_accuracy": 0.8842405676841736,
"num_tokens": 54320974.0,
"step": 6690
},
{
"entropy": 1.0114327549934388,
"epoch": 3.780350084697911,
"grad_norm": 73.97267150878906,
"learning_rate": 3.8262907298937805e-06,
"loss": 0.3374,
"mean_token_accuracy": 0.891106104850769,
"num_tokens": 54361592.0,
"step": 6695
},
{
"entropy": 0.9996057510375976,
"epoch": 3.78317334839074,
"grad_norm": 70.08989715576172,
"learning_rate": 3.824748118165984e-06,
"loss": 0.3558,
"mean_token_accuracy": 0.885913097858429,
"num_tokens": 54402267.0,
"step": 6700
},
{
"entropy": 0.9400599360466003,
"epoch": 3.7859966120835686,
"grad_norm": 75.78137969970703,
"learning_rate": 3.823204955954642e-06,
"loss": 0.363,
"mean_token_accuracy": 0.8856919765472412,
"num_tokens": 54442692.0,
"step": 6705
},
{
"entropy": 0.9907450199127197,
"epoch": 3.7888198757763973,
"grad_norm": 69.72572326660156,
"learning_rate": 3.821661244473741e-06,
"loss": 0.3784,
"mean_token_accuracy": 0.8794691681861877,
"num_tokens": 54483180.0,
"step": 6710
},
{
"entropy": 1.0102770328521729,
"epoch": 3.7916431394692265,
"grad_norm": 63.0318489074707,
"learning_rate": 3.820116984937702e-06,
"loss": 0.3661,
"mean_token_accuracy": 0.886728823184967,
"num_tokens": 54523904.0,
"step": 6715
},
{
"entropy": 0.9674598574638367,
"epoch": 3.794466403162055,
"grad_norm": 86.56275939941406,
"learning_rate": 3.8185721785613735e-06,
"loss": 0.3531,
"mean_token_accuracy": 0.8867643713951111,
"num_tokens": 54564607.0,
"step": 6720
},
{
"entropy": 1.0311317205429078,
"epoch": 3.7972896668548843,
"grad_norm": 84.28353118896484,
"learning_rate": 3.817026826560038e-06,
"loss": 0.3571,
"mean_token_accuracy": 0.8858803272247314,
"num_tokens": 54604553.0,
"step": 6725
},
{
"entropy": 1.0266733169555664,
"epoch": 3.800112930547713,
"grad_norm": 82.10366821289062,
"learning_rate": 3.815480930149404e-06,
"loss": 0.3257,
"mean_token_accuracy": 0.8958070278167725,
"num_tokens": 54644926.0,
"step": 6730
},
{
"entropy": 1.1459343433380127,
"epoch": 3.802936194240542,
"grad_norm": 71.1697006225586,
"learning_rate": 3.8139344905456116e-06,
"loss": 0.4013,
"mean_token_accuracy": 0.8718471527099609,
"num_tokens": 54685606.0,
"step": 6735
},
{
"entropy": 0.952794349193573,
"epoch": 3.805759457933371,
"grad_norm": 67.65644836425781,
"learning_rate": 3.8123875089652264e-06,
"loss": 0.3409,
"mean_token_accuracy": 0.8891062021255494,
"num_tokens": 54725844.0,
"step": 6740
},
{
"entropy": 1.0092663884162902,
"epoch": 3.8085827216262,
"grad_norm": 85.99578857421875,
"learning_rate": 3.8108399866252386e-06,
"loss": 0.3458,
"mean_token_accuracy": 0.8878180980682373,
"num_tokens": 54766418.0,
"step": 6745
},
{
"entropy": 1.0750229239463807,
"epoch": 3.8114059853190287,
"grad_norm": 70.52570343017578,
"learning_rate": 3.809291924743068e-06,
"loss": 0.3733,
"mean_token_accuracy": 0.8812821865081787,
"num_tokens": 54807117.0,
"step": 6750
},
{
"entropy": 1.0017638206481934,
"epoch": 3.8142292490118574,
"grad_norm": 86.5273666381836,
"learning_rate": 3.807743324536556e-06,
"loss": 0.3594,
"mean_token_accuracy": 0.8835506319999695,
"num_tokens": 54847916.0,
"step": 6755
},
{
"entropy": 1.0813376545906066,
"epoch": 3.8170525127046866,
"grad_norm": 83.56548309326172,
"learning_rate": 3.806194187223966e-06,
"loss": 0.3478,
"mean_token_accuracy": 0.8874451518058777,
"num_tokens": 54888714.0,
"step": 6760
},
{
"entropy": 1.1086195468902589,
"epoch": 3.8198757763975157,
"grad_norm": 83.7651596069336,
"learning_rate": 3.804644514023988e-06,
"loss": 0.3848,
"mean_token_accuracy": 0.8768295884132385,
"num_tokens": 54929298.0,
"step": 6765
},
{
"entropy": 1.1223967313766479,
"epoch": 3.8226990400903444,
"grad_norm": 78.98551940917969,
"learning_rate": 3.803094306155731e-06,
"loss": 0.359,
"mean_token_accuracy": 0.8833388090133667,
"num_tokens": 54969964.0,
"step": 6770
},
{
"entropy": 1.0287705659866333,
"epoch": 3.825522303783173,
"grad_norm": 77.01542663574219,
"learning_rate": 3.8015435648387257e-06,
"loss": 0.3612,
"mean_token_accuracy": 0.8854536652565003,
"num_tokens": 55010341.0,
"step": 6775
},
{
"entropy": 1.0934155106544494,
"epoch": 3.8283455674760023,
"grad_norm": 74.22586822509766,
"learning_rate": 3.7999922912929206e-06,
"loss": 0.3537,
"mean_token_accuracy": 0.8861262202262878,
"num_tokens": 55051089.0,
"step": 6780
},
{
"entropy": 1.0119221925735473,
"epoch": 3.8311688311688314,
"grad_norm": 81.14582824707031,
"learning_rate": 3.7984404867386848e-06,
"loss": 0.3752,
"mean_token_accuracy": 0.8784032344818116,
"num_tokens": 55091717.0,
"step": 6785
},
{
"entropy": 1.0168304681777953,
"epoch": 3.83399209486166,
"grad_norm": 63.245872497558594,
"learning_rate": 3.7968881523968047e-06,
"loss": 0.3558,
"mean_token_accuracy": 0.8880203247070313,
"num_tokens": 55132472.0,
"step": 6790
},
{
"entropy": 1.0747297167778016,
"epoch": 3.836815358554489,
"grad_norm": 89.36796569824219,
"learning_rate": 3.795335289488484e-06,
"loss": 0.3771,
"mean_token_accuracy": 0.8779423356056213,
"num_tokens": 55172730.0,
"step": 6795
},
{
"entropy": 1.177517795562744,
"epoch": 3.839638622247318,
"grad_norm": 85.73871612548828,
"learning_rate": 3.79378189923534e-06,
"loss": 0.3572,
"mean_token_accuracy": 0.8844528555870056,
"num_tokens": 55213564.0,
"step": 6800
},
{
"entropy": 1.1382277488708497,
"epoch": 3.8424618859401467,
"grad_norm": 78.42530822753906,
"learning_rate": 3.7922279828594076e-06,
"loss": 0.3779,
"mean_token_accuracy": 0.8786744832992553,
"num_tokens": 55253742.0,
"step": 6805
},
{
"entropy": 1.0296044111251832,
"epoch": 3.845285149632976,
"grad_norm": 81.2005386352539,
"learning_rate": 3.7906735415831344e-06,
"loss": 0.355,
"mean_token_accuracy": 0.8866505265235901,
"num_tokens": 55294435.0,
"step": 6810
},
{
"entropy": 0.9833127617835998,
"epoch": 3.8481084133258046,
"grad_norm": 77.91568756103516,
"learning_rate": 3.7891185766293797e-06,
"loss": 0.3565,
"mean_token_accuracy": 0.8863444924354553,
"num_tokens": 55335023.0,
"step": 6815
},
{
"entropy": 1.0741397500038148,
"epoch": 3.8509316770186337,
"grad_norm": 83.5990982055664,
"learning_rate": 3.7875630892214167e-06,
"loss": 0.3577,
"mean_token_accuracy": 0.883382785320282,
"num_tokens": 55375580.0,
"step": 6820
},
{
"entropy": 1.1074389219284058,
"epoch": 3.8537549407114624,
"grad_norm": 84.55143737792969,
"learning_rate": 3.7860070805829295e-06,
"loss": 0.3824,
"mean_token_accuracy": 0.8763737678527832,
"num_tokens": 55416275.0,
"step": 6825
},
{
"entropy": 1.0039127230644227,
"epoch": 3.8565782044042916,
"grad_norm": 73.82699584960938,
"learning_rate": 3.784450551938011e-06,
"loss": 0.3623,
"mean_token_accuracy": 0.8839982032775879,
"num_tokens": 55457007.0,
"step": 6830
},
{
"entropy": 1.0227969527244567,
"epoch": 3.8594014680971203,
"grad_norm": 73.1678466796875,
"learning_rate": 3.782893504511164e-06,
"loss": 0.3556,
"mean_token_accuracy": 0.8841433644294738,
"num_tokens": 55497737.0,
"step": 6835
},
{
"entropy": 1.0409348011016846,
"epoch": 3.862224731789949,
"grad_norm": 78.6789321899414,
"learning_rate": 3.7813359395272998e-06,
"loss": 0.3914,
"mean_token_accuracy": 0.8746375679969788,
"num_tokens": 55538169.0,
"step": 6840
},
{
"entropy": 1.0004459023475647,
"epoch": 3.865047995482778,
"grad_norm": 76.7342529296875,
"learning_rate": 3.779777858211735e-06,
"loss": 0.3844,
"mean_token_accuracy": 0.8788429498672485,
"num_tokens": 55578708.0,
"step": 6845
},
{
"entropy": 1.0666333556175231,
"epoch": 3.8678712591756073,
"grad_norm": 74.42332458496094,
"learning_rate": 3.778219261790194e-06,
"loss": 0.3577,
"mean_token_accuracy": 0.8858496069908142,
"num_tokens": 55619553.0,
"step": 6850
},
{
"entropy": 1.0770619630813598,
"epoch": 3.870694522868436,
"grad_norm": 78.50506591796875,
"learning_rate": 3.776660151488807e-06,
"loss": 0.3836,
"mean_token_accuracy": 0.8765832185745239,
"num_tokens": 55660426.0,
"step": 6855
},
{
"entropy": 1.1385752201080321,
"epoch": 3.8735177865612647,
"grad_norm": 74.58927917480469,
"learning_rate": 3.775100528534107e-06,
"loss": 0.3897,
"mean_token_accuracy": 0.877284836769104,
"num_tokens": 55700919.0,
"step": 6860
},
{
"entropy": 1.0381620287895204,
"epoch": 3.876341050254094,
"grad_norm": 76.63037872314453,
"learning_rate": 3.7735403941530306e-06,
"loss": 0.3586,
"mean_token_accuracy": 0.8844604849815368,
"num_tokens": 55741495.0,
"step": 6865
},
{
"entropy": 0.9769273281097413,
"epoch": 3.8791643139469225,
"grad_norm": 71.60755920410156,
"learning_rate": 3.7719797495729184e-06,
"loss": 0.3553,
"mean_token_accuracy": 0.8843575596809388,
"num_tokens": 55782200.0,
"step": 6870
},
{
"entropy": 1.0441035032272339,
"epoch": 3.8819875776397517,
"grad_norm": 75.18911743164062,
"learning_rate": 3.7704185960215096e-06,
"loss": 0.3899,
"mean_token_accuracy": 0.8751221537590027,
"num_tokens": 55822987.0,
"step": 6875
},
{
"entropy": 1.0410512208938598,
"epoch": 3.8848108413325804,
"grad_norm": 82.20500183105469,
"learning_rate": 3.7688569347269456e-06,
"loss": 0.3573,
"mean_token_accuracy": 0.8842342138290405,
"num_tokens": 55863486.0,
"step": 6880
},
{
"entropy": 1.07995103597641,
"epoch": 3.8876341050254095,
"grad_norm": 79.17044830322266,
"learning_rate": 3.7672947669177663e-06,
"loss": 0.3666,
"mean_token_accuracy": 0.8829922795295715,
"num_tokens": 55904162.0,
"step": 6885
},
{
"entropy": 1.071269142627716,
"epoch": 3.890457368718238,
"grad_norm": 80.15768432617188,
"learning_rate": 3.765732093822911e-06,
"loss": 0.3987,
"mean_token_accuracy": 0.8745517253875732,
"num_tokens": 55944636.0,
"step": 6890
},
{
"entropy": 1.0229296922683715,
"epoch": 3.8932806324110674,
"grad_norm": 75.17719268798828,
"learning_rate": 3.7641689166717164e-06,
"loss": 0.3728,
"mean_token_accuracy": 0.8799932956695556,
"num_tokens": 55985078.0,
"step": 6895
},
{
"entropy": 1.017443561553955,
"epoch": 3.896103896103896,
"grad_norm": 81.72650909423828,
"learning_rate": 3.7626052366939154e-06,
"loss": 0.3439,
"mean_token_accuracy": 0.8882031202316284,
"num_tokens": 56025710.0,
"step": 6900
},
{
"entropy": 0.9397255301475524,
"epoch": 3.8989271597967248,
"grad_norm": 80.24227142333984,
"learning_rate": 3.7610410551196362e-06,
"loss": 0.3506,
"mean_token_accuracy": 0.8859299421310425,
"num_tokens": 56066297.0,
"step": 6905
},
{
"entropy": 1.0021834969520569,
"epoch": 3.901750423489554,
"grad_norm": 68.56148529052734,
"learning_rate": 3.7594763731794015e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8764629125595093,
"num_tokens": 56106729.0,
"step": 6910
},
{
"entropy": 0.9779050350189209,
"epoch": 3.904573687182383,
"grad_norm": 72.72400665283203,
"learning_rate": 3.7579111921041287e-06,
"loss": 0.358,
"mean_token_accuracy": 0.8842010378837586,
"num_tokens": 56147418.0,
"step": 6915
},
{
"entropy": 1.0261463880538941,
"epoch": 3.9073969508752118,
"grad_norm": 84.04011535644531,
"learning_rate": 3.756345513125128e-06,
"loss": 0.3676,
"mean_token_accuracy": 0.8818102598190307,
"num_tokens": 56188071.0,
"step": 6920
},
{
"entropy": 1.0177555203437805,
"epoch": 3.9102202145680405,
"grad_norm": 86.4022216796875,
"learning_rate": 3.7547793374740987e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8769708275794983,
"num_tokens": 56228901.0,
"step": 6925
},
{
"entropy": 1.0417796850204468,
"epoch": 3.9130434782608696,
"grad_norm": 89.51969909667969,
"learning_rate": 3.7532126663831337e-06,
"loss": 0.364,
"mean_token_accuracy": 0.881694233417511,
"num_tokens": 56269695.0,
"step": 6930
},
{
"entropy": 0.9011264562606811,
"epoch": 3.9158667419536983,
"grad_norm": 70.96135711669922,
"learning_rate": 3.7516455010847135e-06,
"loss": 0.3469,
"mean_token_accuracy": 0.886352801322937,
"num_tokens": 56310472.0,
"step": 6935
},
{
"entropy": 0.9600812792778015,
"epoch": 3.9186900056465275,
"grad_norm": 72.96127319335938,
"learning_rate": 3.7500778428117097e-06,
"loss": 0.3695,
"mean_token_accuracy": 0.8812779784202576,
"num_tokens": 56351272.0,
"step": 6940
},
{
"entropy": 0.9371390104293823,
"epoch": 3.921513269339356,
"grad_norm": 61.435157775878906,
"learning_rate": 3.7485096927973797e-06,
"loss": 0.3528,
"mean_token_accuracy": 0.8869521021842957,
"num_tokens": 56391825.0,
"step": 6945
},
{
"entropy": 1.0107155442237854,
"epoch": 3.9243365330321853,
"grad_norm": 71.20367431640625,
"learning_rate": 3.746941052275369e-06,
"loss": 0.3762,
"mean_token_accuracy": 0.8818397402763367,
"num_tokens": 56432358.0,
"step": 6950
},
{
"entropy": 0.9440315723419189,
"epoch": 3.927159796725014,
"grad_norm": 90.81354522705078,
"learning_rate": 3.7453719224797084e-06,
"loss": 0.3582,
"mean_token_accuracy": 0.8828841805458069,
"num_tokens": 56472999.0,
"step": 6955
},
{
"entropy": 1.0316729187965392,
"epoch": 3.929983060417843,
"grad_norm": 81.88134002685547,
"learning_rate": 3.743802304644814e-06,
"loss": 0.3741,
"mean_token_accuracy": 0.8808148026466369,
"num_tokens": 56513580.0,
"step": 6960
},
{
"entropy": 1.0075525403022767,
"epoch": 3.932806324110672,
"grad_norm": 71.31327056884766,
"learning_rate": 3.7422322000054844e-06,
"loss": 0.3557,
"mean_token_accuracy": 0.8877139568328858,
"num_tokens": 56554082.0,
"step": 6965
},
{
"entropy": 1.0570897817611695,
"epoch": 3.9356295878035006,
"grad_norm": 77.34210205078125,
"learning_rate": 3.7406616097969034e-06,
"loss": 0.3757,
"mean_token_accuracy": 0.8809595227241516,
"num_tokens": 56594775.0,
"step": 6970
},
{
"entropy": 0.9789751529693603,
"epoch": 3.9384528514963297,
"grad_norm": 79.00511169433594,
"learning_rate": 3.7390905352546346e-06,
"loss": 0.3426,
"mean_token_accuracy": 0.8881513237953186,
"num_tokens": 56635303.0,
"step": 6975
},
{
"entropy": 1.0750985145568848,
"epoch": 3.941276115189159,
"grad_norm": 83.38872528076172,
"learning_rate": 3.7375189776146252e-06,
"loss": 0.3875,
"mean_token_accuracy": 0.8767378211021424,
"num_tokens": 56676033.0,
"step": 6980
},
{
"entropy": 1.1415322065353393,
"epoch": 3.9440993788819876,
"grad_norm": 79.72837829589844,
"learning_rate": 3.7359469381132008e-06,
"loss": 0.4027,
"mean_token_accuracy": 0.8735546827316284,
"num_tokens": 56716838.0,
"step": 6985
},
{
"entropy": 1.1199346899986267,
"epoch": 3.9469226425748163,
"grad_norm": 82.7437973022461,
"learning_rate": 3.734374417987065e-06,
"loss": 0.3539,
"mean_token_accuracy": 0.8858396887779236,
"num_tokens": 56757396.0,
"step": 6990
},
{
"entropy": 0.9379915833473206,
"epoch": 3.9497459062676454,
"grad_norm": 80.79170989990234,
"learning_rate": 3.7328014184733008e-06,
"loss": 0.3453,
"mean_token_accuracy": 0.8876606464385987,
"num_tokens": 56798125.0,
"step": 6995
},
{
"entropy": 1.0453456044197083,
"epoch": 3.9525691699604746,
"grad_norm": 80.65121459960938,
"learning_rate": 3.7312279408093693e-06,
"loss": 0.3801,
"mean_token_accuracy": 0.8763364791870117,
"num_tokens": 56838798.0,
"step": 7000
},
{
"epoch": 3.9525691699604746,
"eval_entropy": 1.234108328819275,
"eval_loss": 0.19375726580619812,
"eval_mean_token_accuracy": 0.9425987362861633,
"eval_num_tokens": 56838798.0,
"eval_runtime": 2.4515,
"eval_samples_per_second": 15.909,
"eval_steps_per_second": 2.04,
"step": 7000
}
],
"logging_steps": 5,
"max_steps": 17710,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2391467057653576e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}