eac123's picture
Upload final checkpoint (checkpoint-804)
01e41d8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1184664368629456,
"epoch": 0.003734827264239029,
"grad_norm": 0.411286324262619,
"learning_rate": 0.0002,
"loss": 2.457291841506958,
"mean_token_accuracy": 0.5408388376235962,
"num_tokens": 16491.0,
"step": 1
},
{
"entropy": 1.2453091144561768,
"epoch": 0.007469654528478058,
"grad_norm": 0.37089085578918457,
"learning_rate": 0.0002,
"loss": 2.1685681343078613,
"mean_token_accuracy": 0.5649923086166382,
"num_tokens": 32759.0,
"step": 2
},
{
"entropy": 1.4064331948757172,
"epoch": 0.011204481792717087,
"grad_norm": 0.2906820774078369,
"learning_rate": 0.0002,
"loss": 1.710010051727295,
"mean_token_accuracy": 0.5920955091714859,
"num_tokens": 49020.0,
"step": 3
},
{
"entropy": 1.364386886358261,
"epoch": 0.014939309056956116,
"grad_norm": 0.22797873616218567,
"learning_rate": 0.0002,
"loss": 1.3888747692108154,
"mean_token_accuracy": 0.6421842128038406,
"num_tokens": 65604.0,
"step": 4
},
{
"entropy": 1.3538264036178589,
"epoch": 0.018674136321195144,
"grad_norm": 0.2804432809352875,
"learning_rate": 0.0002,
"loss": 1.29875648021698,
"mean_token_accuracy": 0.6417761594057083,
"num_tokens": 81941.0,
"step": 5
},
{
"entropy": 1.2739673852920532,
"epoch": 0.022408963585434174,
"grad_norm": 0.15289267897605896,
"learning_rate": 0.0002,
"loss": 1.1843445301055908,
"mean_token_accuracy": 0.6661720275878906,
"num_tokens": 98022.0,
"step": 6
},
{
"entropy": 1.1963406801223755,
"epoch": 0.026143790849673203,
"grad_norm": 0.1057928279042244,
"learning_rate": 0.0002,
"loss": 1.089585304260254,
"mean_token_accuracy": 0.6709173172712326,
"num_tokens": 114552.0,
"step": 7
},
{
"entropy": 1.1228278279304504,
"epoch": 0.029878618113912233,
"grad_norm": 0.10864286869764328,
"learning_rate": 0.0002,
"loss": 1.028782844543457,
"mean_token_accuracy": 0.6796794384717941,
"num_tokens": 130943.0,
"step": 8
},
{
"entropy": 1.0330480933189392,
"epoch": 0.03361344537815126,
"grad_norm": 0.1194700375199318,
"learning_rate": 0.0002,
"loss": 0.978877067565918,
"mean_token_accuracy": 0.6896098554134369,
"num_tokens": 147432.0,
"step": 9
},
{
"entropy": 0.9659490436315536,
"epoch": 0.03734827264239029,
"grad_norm": 0.13075368106365204,
"learning_rate": 0.0002,
"loss": 0.93321692943573,
"mean_token_accuracy": 0.6966541409492493,
"num_tokens": 163753.0,
"step": 10
},
{
"entropy": 0.9611389189958572,
"epoch": 0.04108309990662932,
"grad_norm": 0.10369610041379929,
"learning_rate": 0.0002,
"loss": 0.8770816922187805,
"mean_token_accuracy": 0.7034913301467896,
"num_tokens": 180090.0,
"step": 11
},
{
"entropy": 0.9063249081373215,
"epoch": 0.04481792717086835,
"grad_norm": 0.10426584631204605,
"learning_rate": 0.0002,
"loss": 0.8171504139900208,
"mean_token_accuracy": 0.7150022834539413,
"num_tokens": 196381.0,
"step": 12
},
{
"entropy": 0.8290252089500427,
"epoch": 0.04855275443510738,
"grad_norm": 0.10911860316991806,
"learning_rate": 0.0002,
"loss": 0.7891132831573486,
"mean_token_accuracy": 0.7208491563796997,
"num_tokens": 212795.0,
"step": 13
},
{
"entropy": 0.7808938026428223,
"epoch": 0.05228758169934641,
"grad_norm": 0.10144662111997604,
"learning_rate": 0.0002,
"loss": 0.7427304983139038,
"mean_token_accuracy": 0.7313003540039062,
"num_tokens": 228936.0,
"step": 14
},
{
"entropy": 0.7421854734420776,
"epoch": 0.056022408963585436,
"grad_norm": 0.6942080855369568,
"learning_rate": 0.0002,
"loss": 0.7379668354988098,
"mean_token_accuracy": 0.7287779599428177,
"num_tokens": 245241.0,
"step": 15
},
{
"entropy": 0.7045212388038635,
"epoch": 0.059757236227824466,
"grad_norm": 0.16263937950134277,
"learning_rate": 0.0002,
"loss": 0.7117007374763489,
"mean_token_accuracy": 0.7335064858198166,
"num_tokens": 261386.0,
"step": 16
},
{
"entropy": 0.6911872327327728,
"epoch": 0.06349206349206349,
"grad_norm": 0.08423176407814026,
"learning_rate": 0.0002,
"loss": 0.6914121508598328,
"mean_token_accuracy": 0.7408997714519501,
"num_tokens": 278017.0,
"step": 17
},
{
"entropy": 0.6928284466266632,
"epoch": 0.06722689075630252,
"grad_norm": 0.08306165784597397,
"learning_rate": 0.0002,
"loss": 0.679314911365509,
"mean_token_accuracy": 0.7417374551296234,
"num_tokens": 294613.0,
"step": 18
},
{
"entropy": 0.6805895417928696,
"epoch": 0.07096171802054155,
"grad_norm": 0.7392253279685974,
"learning_rate": 0.0002,
"loss": 0.6667531728744507,
"mean_token_accuracy": 0.7472580522298813,
"num_tokens": 311040.0,
"step": 19
},
{
"entropy": 0.6846933215856552,
"epoch": 0.07469654528478058,
"grad_norm": 0.08478110283613205,
"learning_rate": 0.0002,
"loss": 0.6531012654304504,
"mean_token_accuracy": 0.7482306957244873,
"num_tokens": 327255.0,
"step": 20
},
{
"entropy": 0.6761725544929504,
"epoch": 0.0784313725490196,
"grad_norm": 0.07354654371738434,
"learning_rate": 0.0002,
"loss": 0.6507971882820129,
"mean_token_accuracy": 0.7495593726634979,
"num_tokens": 343726.0,
"step": 21
},
{
"entropy": 0.6475691944360733,
"epoch": 0.08216619981325864,
"grad_norm": 0.0701100155711174,
"learning_rate": 0.0002,
"loss": 0.6324924826622009,
"mean_token_accuracy": 0.7519394010305405,
"num_tokens": 360032.0,
"step": 22
},
{
"entropy": 0.6286474466323853,
"epoch": 0.08590102707749767,
"grad_norm": 0.07334811985492706,
"learning_rate": 0.0002,
"loss": 0.6221117377281189,
"mean_token_accuracy": 0.7562299370765686,
"num_tokens": 376211.0,
"step": 23
},
{
"entropy": 0.6444061696529388,
"epoch": 0.0896358543417367,
"grad_norm": 0.10214248299598694,
"learning_rate": 0.0002,
"loss": 0.6270927786827087,
"mean_token_accuracy": 0.7587939649820328,
"num_tokens": 392746.0,
"step": 24
},
{
"entropy": 0.6239012628793716,
"epoch": 0.09337068160597572,
"grad_norm": 0.07120268046855927,
"learning_rate": 0.0002,
"loss": 0.6152804493904114,
"mean_token_accuracy": 0.7588517516851425,
"num_tokens": 409085.0,
"step": 25
},
{
"entropy": 0.6276111602783203,
"epoch": 0.09710550887021475,
"grad_norm": 0.05954922363162041,
"learning_rate": 0.0002,
"loss": 0.6084893345832825,
"mean_token_accuracy": 0.7613021731376648,
"num_tokens": 425336.0,
"step": 26
},
{
"entropy": 0.6411866247653961,
"epoch": 0.10084033613445378,
"grad_norm": 0.05856655165553093,
"learning_rate": 0.0002,
"loss": 0.6222058534622192,
"mean_token_accuracy": 0.7564119845628738,
"num_tokens": 441729.0,
"step": 27
},
{
"entropy": 0.6264622807502747,
"epoch": 0.10457516339869281,
"grad_norm": 0.06027727574110031,
"learning_rate": 0.0002,
"loss": 0.6105791330337524,
"mean_token_accuracy": 0.7609841376543045,
"num_tokens": 457957.0,
"step": 28
},
{
"entropy": 0.6167244166135788,
"epoch": 0.10830999066293184,
"grad_norm": 0.07074937224388123,
"learning_rate": 0.0002,
"loss": 0.6111780405044556,
"mean_token_accuracy": 0.7601886689662933,
"num_tokens": 474399.0,
"step": 29
},
{
"entropy": 0.6115850210189819,
"epoch": 0.11204481792717087,
"grad_norm": 0.07707173377275467,
"learning_rate": 0.0002,
"loss": 0.6015152335166931,
"mean_token_accuracy": 0.7627497315406799,
"num_tokens": 490919.0,
"step": 30
},
{
"entropy": 0.6094368547201157,
"epoch": 0.1157796451914099,
"grad_norm": 0.059265896677970886,
"learning_rate": 0.0002,
"loss": 0.6023207902908325,
"mean_token_accuracy": 0.758778989315033,
"num_tokens": 507283.0,
"step": 31
},
{
"entropy": 0.6125481128692627,
"epoch": 0.11951447245564893,
"grad_norm": 0.07099295407533646,
"learning_rate": 0.0002,
"loss": 0.603573203086853,
"mean_token_accuracy": 0.7601557075977325,
"num_tokens": 523521.0,
"step": 32
},
{
"entropy": 0.6020256727933884,
"epoch": 0.12324929971988796,
"grad_norm": 0.05661124736070633,
"learning_rate": 0.0002,
"loss": 0.5916649103164673,
"mean_token_accuracy": 0.7667604386806488,
"num_tokens": 540024.0,
"step": 33
},
{
"entropy": 0.5748983919620514,
"epoch": 0.12698412698412698,
"grad_norm": 0.05405418947339058,
"learning_rate": 0.0002,
"loss": 0.5715272426605225,
"mean_token_accuracy": 0.7717257738113403,
"num_tokens": 555993.0,
"step": 34
},
{
"entropy": 0.5811779201030731,
"epoch": 0.13071895424836602,
"grad_norm": 0.04870233312249184,
"learning_rate": 0.0002,
"loss": 0.5783013701438904,
"mean_token_accuracy": 0.7701490372419357,
"num_tokens": 572358.0,
"step": 35
},
{
"entropy": 0.574293926358223,
"epoch": 0.13445378151260504,
"grad_norm": 0.05332570523023605,
"learning_rate": 0.0002,
"loss": 0.5724313259124756,
"mean_token_accuracy": 0.7740762829780579,
"num_tokens": 588766.0,
"step": 36
},
{
"entropy": 0.5665481090545654,
"epoch": 0.13818860877684408,
"grad_norm": 0.0575035996735096,
"learning_rate": 0.0002,
"loss": 0.5736980438232422,
"mean_token_accuracy": 0.7706244140863419,
"num_tokens": 604968.0,
"step": 37
},
{
"entropy": 0.5721801668405533,
"epoch": 0.1419234360410831,
"grad_norm": 0.07653734087944031,
"learning_rate": 0.0002,
"loss": 0.5833261013031006,
"mean_token_accuracy": 0.7672377377748489,
"num_tokens": 621192.0,
"step": 38
},
{
"entropy": 0.5661971271038055,
"epoch": 0.14565826330532214,
"grad_norm": 0.052845459431409836,
"learning_rate": 0.0002,
"loss": 0.5691311955451965,
"mean_token_accuracy": 0.7725834846496582,
"num_tokens": 637384.0,
"step": 39
},
{
"entropy": 0.5870122313499451,
"epoch": 0.14939309056956115,
"grad_norm": 0.05704643577337265,
"learning_rate": 0.0002,
"loss": 0.5838981866836548,
"mean_token_accuracy": 0.7632379680871964,
"num_tokens": 653697.0,
"step": 40
},
{
"entropy": 0.5808418691158295,
"epoch": 0.1531279178338002,
"grad_norm": 0.05715522915124893,
"learning_rate": 0.0002,
"loss": 0.5737625360488892,
"mean_token_accuracy": 0.7728984951972961,
"num_tokens": 670046.0,
"step": 41
},
{
"entropy": 0.5726363211870193,
"epoch": 0.1568627450980392,
"grad_norm": 0.053971655666828156,
"learning_rate": 0.0002,
"loss": 0.5629984736442566,
"mean_token_accuracy": 0.7752888798713684,
"num_tokens": 686076.0,
"step": 42
},
{
"entropy": 0.5652015507221222,
"epoch": 0.16059757236227826,
"grad_norm": 0.04180985689163208,
"learning_rate": 0.0002,
"loss": 0.5623056292533875,
"mean_token_accuracy": 0.7748470306396484,
"num_tokens": 702484.0,
"step": 43
},
{
"entropy": 0.5733779072761536,
"epoch": 0.16433239962651727,
"grad_norm": 0.050310708582401276,
"learning_rate": 0.0002,
"loss": 0.5759532451629639,
"mean_token_accuracy": 0.7717497199773788,
"num_tokens": 718709.0,
"step": 44
},
{
"entropy": 0.5682821422815323,
"epoch": 0.16806722689075632,
"grad_norm": 0.049945104867219925,
"learning_rate": 0.0002,
"loss": 0.5656522512435913,
"mean_token_accuracy": 0.7735471576452255,
"num_tokens": 735195.0,
"step": 45
},
{
"entropy": 0.5685591697692871,
"epoch": 0.17180205415499533,
"grad_norm": 0.044939614832401276,
"learning_rate": 0.0002,
"loss": 0.5674217939376831,
"mean_token_accuracy": 0.7736205905675888,
"num_tokens": 751212.0,
"step": 46
},
{
"entropy": 0.5851640552282333,
"epoch": 0.17553688141923435,
"grad_norm": 0.0478069968521595,
"learning_rate": 0.0002,
"loss": 0.5874634981155396,
"mean_token_accuracy": 0.7659626305103302,
"num_tokens": 767689.0,
"step": 47
},
{
"entropy": 0.5731439292430878,
"epoch": 0.1792717086834734,
"grad_norm": 0.046887464821338654,
"learning_rate": 0.0002,
"loss": 0.571601152420044,
"mean_token_accuracy": 0.7696335017681122,
"num_tokens": 784074.0,
"step": 48
},
{
"entropy": 0.5621766149997711,
"epoch": 0.1830065359477124,
"grad_norm": 0.04711559787392616,
"learning_rate": 0.0002,
"loss": 0.5606247782707214,
"mean_token_accuracy": 0.7760322690010071,
"num_tokens": 800292.0,
"step": 49
},
{
"entropy": 0.5671460330486298,
"epoch": 0.18674136321195145,
"grad_norm": 0.04404276981949806,
"learning_rate": 0.0002,
"loss": 0.5589705109596252,
"mean_token_accuracy": 0.7788618206977844,
"num_tokens": 816651.0,
"step": 50
},
{
"entropy": 0.5850909501314163,
"epoch": 0.19047619047619047,
"grad_norm": 0.04509448632597923,
"learning_rate": 0.0002,
"loss": 0.5727294683456421,
"mean_token_accuracy": 0.7689620703458786,
"num_tokens": 833150.0,
"step": 51
},
{
"entropy": 0.585056334733963,
"epoch": 0.1942110177404295,
"grad_norm": 0.04984965920448303,
"learning_rate": 0.0002,
"loss": 0.5666245818138123,
"mean_token_accuracy": 0.771300658583641,
"num_tokens": 849637.0,
"step": 52
},
{
"entropy": 0.5864798873662949,
"epoch": 0.19794584500466852,
"grad_norm": 0.03626571223139763,
"learning_rate": 0.0002,
"loss": 0.5745272636413574,
"mean_token_accuracy": 0.7683106511831284,
"num_tokens": 865989.0,
"step": 53
},
{
"entropy": 0.5780556201934814,
"epoch": 0.20168067226890757,
"grad_norm": 0.043707672506570816,
"learning_rate": 0.0002,
"loss": 0.5741198062896729,
"mean_token_accuracy": 0.7700863778591156,
"num_tokens": 882298.0,
"step": 54
},
{
"entropy": 0.5698854774236679,
"epoch": 0.20541549953314658,
"grad_norm": 0.04839429631829262,
"learning_rate": 0.0002,
"loss": 0.5747280120849609,
"mean_token_accuracy": 0.7678831219673157,
"num_tokens": 898608.0,
"step": 55
},
{
"entropy": 0.5627169758081436,
"epoch": 0.20915032679738563,
"grad_norm": 0.04472200199961662,
"learning_rate": 0.0002,
"loss": 0.5670843124389648,
"mean_token_accuracy": 0.7717523276805878,
"num_tokens": 914851.0,
"step": 56
},
{
"entropy": 0.5779636800289154,
"epoch": 0.21288515406162464,
"grad_norm": 0.040940672159194946,
"learning_rate": 0.0002,
"loss": 0.5778319239616394,
"mean_token_accuracy": 0.7675311863422394,
"num_tokens": 931487.0,
"step": 57
},
{
"entropy": 0.563320592045784,
"epoch": 0.2166199813258637,
"grad_norm": 0.0448877178132534,
"learning_rate": 0.0002,
"loss": 0.5575067400932312,
"mean_token_accuracy": 0.7765846252441406,
"num_tokens": 947878.0,
"step": 58
},
{
"entropy": 0.6058623939752579,
"epoch": 0.2203548085901027,
"grad_norm": 0.04985905811190605,
"learning_rate": 0.0002,
"loss": 0.6082996726036072,
"mean_token_accuracy": 0.7539926767349243,
"num_tokens": 964324.0,
"step": 59
},
{
"entropy": 0.5625719428062439,
"epoch": 0.22408963585434175,
"grad_norm": 0.038407351821660995,
"learning_rate": 0.0002,
"loss": 0.5598542094230652,
"mean_token_accuracy": 0.7735666781663895,
"num_tokens": 980437.0,
"step": 60
},
{
"entropy": 0.5738561451435089,
"epoch": 0.22782446311858076,
"grad_norm": 0.04555477574467659,
"learning_rate": 0.0002,
"loss": 0.5709559917449951,
"mean_token_accuracy": 0.7690570503473282,
"num_tokens": 996568.0,
"step": 61
},
{
"entropy": 0.5673829317092896,
"epoch": 0.2315592903828198,
"grad_norm": 0.04602229222655296,
"learning_rate": 0.0002,
"loss": 0.5713279843330383,
"mean_token_accuracy": 0.7713401615619659,
"num_tokens": 1012870.0,
"step": 62
},
{
"entropy": 0.5517095476388931,
"epoch": 0.23529411764705882,
"grad_norm": 0.043136853724718094,
"learning_rate": 0.0002,
"loss": 0.5557603240013123,
"mean_token_accuracy": 0.7759266495704651,
"num_tokens": 1029066.0,
"step": 63
},
{
"entropy": 0.5658771097660065,
"epoch": 0.23902894491129786,
"grad_norm": 0.04121146723628044,
"learning_rate": 0.0002,
"loss": 0.5609080791473389,
"mean_token_accuracy": 0.7747898399829865,
"num_tokens": 1045590.0,
"step": 64
},
{
"entropy": 0.549357607960701,
"epoch": 0.24276377217553688,
"grad_norm": 0.044083524495363235,
"learning_rate": 0.0002,
"loss": 0.5459793210029602,
"mean_token_accuracy": 0.7811493426561356,
"num_tokens": 1061874.0,
"step": 65
},
{
"entropy": 0.5573842078447342,
"epoch": 0.24649859943977592,
"grad_norm": 0.04087769240140915,
"learning_rate": 0.0002,
"loss": 0.5592548847198486,
"mean_token_accuracy": 0.775547593832016,
"num_tokens": 1078103.0,
"step": 66
},
{
"entropy": 0.5658538043498993,
"epoch": 0.25023342670401494,
"grad_norm": 0.03777799755334854,
"learning_rate": 0.0002,
"loss": 0.5519559979438782,
"mean_token_accuracy": 0.776710718870163,
"num_tokens": 1094650.0,
"step": 67
},
{
"entropy": 0.583881214261055,
"epoch": 0.25396825396825395,
"grad_norm": 0.044072795659303665,
"learning_rate": 0.0002,
"loss": 0.5742916464805603,
"mean_token_accuracy": 0.7709541469812393,
"num_tokens": 1110961.0,
"step": 68
},
{
"entropy": 0.5731556266546249,
"epoch": 0.25770308123249297,
"grad_norm": 0.045354213565588,
"learning_rate": 0.0002,
"loss": 0.5748150944709778,
"mean_token_accuracy": 0.7677215486764908,
"num_tokens": 1127571.0,
"step": 69
},
{
"entropy": 0.5605138093233109,
"epoch": 0.26143790849673204,
"grad_norm": 0.03672546148300171,
"learning_rate": 0.0002,
"loss": 0.5605238080024719,
"mean_token_accuracy": 0.7723149508237839,
"num_tokens": 1143932.0,
"step": 70
},
{
"entropy": 0.5381516218185425,
"epoch": 0.26517273576097106,
"grad_norm": 0.04045504331588745,
"learning_rate": 0.0002,
"loss": 0.5391750335693359,
"mean_token_accuracy": 0.7822330445051193,
"num_tokens": 1159972.0,
"step": 71
},
{
"entropy": 0.5469133257865906,
"epoch": 0.2689075630252101,
"grad_norm": 0.03917838633060455,
"learning_rate": 0.0002,
"loss": 0.552070140838623,
"mean_token_accuracy": 0.776424303650856,
"num_tokens": 1176122.0,
"step": 72
},
{
"entropy": 0.5674256831407547,
"epoch": 0.2726423902894491,
"grad_norm": 0.0378127247095108,
"learning_rate": 0.0002,
"loss": 0.5667495727539062,
"mean_token_accuracy": 0.7705131769180298,
"num_tokens": 1192483.0,
"step": 73
},
{
"entropy": 0.568048432469368,
"epoch": 0.27637721755368816,
"grad_norm": 0.035798948258161545,
"learning_rate": 0.0002,
"loss": 0.5668107867240906,
"mean_token_accuracy": 0.7710251212120056,
"num_tokens": 1209110.0,
"step": 74
},
{
"entropy": 0.5850978642702103,
"epoch": 0.2801120448179272,
"grad_norm": 0.03812864422798157,
"learning_rate": 0.0002,
"loss": 0.5801389217376709,
"mean_token_accuracy": 0.7685801237821579,
"num_tokens": 1225656.0,
"step": 75
},
{
"entropy": 0.5744365155696869,
"epoch": 0.2838468720821662,
"grad_norm": 0.03252263367176056,
"learning_rate": 0.0002,
"loss": 0.5715938806533813,
"mean_token_accuracy": 0.7678718268871307,
"num_tokens": 1241986.0,
"step": 76
},
{
"entropy": 0.5737413763999939,
"epoch": 0.2875816993464052,
"grad_norm": 0.03566081449389458,
"learning_rate": 0.0002,
"loss": 0.5768669843673706,
"mean_token_accuracy": 0.768094465136528,
"num_tokens": 1258437.0,
"step": 77
},
{
"entropy": 0.5403539538383484,
"epoch": 0.2913165266106443,
"grad_norm": 0.03335001692175865,
"learning_rate": 0.0002,
"loss": 0.5388357639312744,
"mean_token_accuracy": 0.7831095159053802,
"num_tokens": 1274706.0,
"step": 78
},
{
"entropy": 0.5797998905181885,
"epoch": 0.2950513538748833,
"grad_norm": 0.036791976541280746,
"learning_rate": 0.0002,
"loss": 0.5749024152755737,
"mean_token_accuracy": 0.7673221081495285,
"num_tokens": 1291375.0,
"step": 79
},
{
"entropy": 0.5663541257381439,
"epoch": 0.2987861811391223,
"grad_norm": 0.04374934732913971,
"learning_rate": 0.0002,
"loss": 0.5602323412895203,
"mean_token_accuracy": 0.7732456177473068,
"num_tokens": 1307621.0,
"step": 80
},
{
"entropy": 0.5841106921434402,
"epoch": 0.3025210084033613,
"grad_norm": 0.03585761412978172,
"learning_rate": 0.0002,
"loss": 0.5774515271186829,
"mean_token_accuracy": 0.7695471197366714,
"num_tokens": 1324292.0,
"step": 81
},
{
"entropy": 0.5769794583320618,
"epoch": 0.3062558356676004,
"grad_norm": 0.032680612057447433,
"learning_rate": 0.0002,
"loss": 0.5758101940155029,
"mean_token_accuracy": 0.7648481875658035,
"num_tokens": 1340714.0,
"step": 82
},
{
"entropy": 0.557876318693161,
"epoch": 0.3099906629318394,
"grad_norm": 0.036271534860134125,
"learning_rate": 0.0002,
"loss": 0.5576061010360718,
"mean_token_accuracy": 0.7769448161125183,
"num_tokens": 1357063.0,
"step": 83
},
{
"entropy": 0.5480719208717346,
"epoch": 0.3137254901960784,
"grad_norm": 0.04093662649393082,
"learning_rate": 0.0002,
"loss": 0.5554815530776978,
"mean_token_accuracy": 0.7730589210987091,
"num_tokens": 1373048.0,
"step": 84
},
{
"entropy": 0.5651550590991974,
"epoch": 0.31746031746031744,
"grad_norm": 0.03605310246348381,
"learning_rate": 0.0002,
"loss": 0.5752359628677368,
"mean_token_accuracy": 0.767627626657486,
"num_tokens": 1389533.0,
"step": 85
},
{
"entropy": 0.5644277483224869,
"epoch": 0.3211951447245565,
"grad_norm": 0.03757842630147934,
"learning_rate": 0.0002,
"loss": 0.5678563117980957,
"mean_token_accuracy": 0.7691835910081863,
"num_tokens": 1406026.0,
"step": 86
},
{
"entropy": 0.5682397186756134,
"epoch": 0.32492997198879553,
"grad_norm": 0.033709567040205,
"learning_rate": 0.0002,
"loss": 0.5628086924552917,
"mean_token_accuracy": 0.7722707390785217,
"num_tokens": 1422562.0,
"step": 87
},
{
"entropy": 0.5635691732168198,
"epoch": 0.32866479925303455,
"grad_norm": 0.03606971353292465,
"learning_rate": 0.0002,
"loss": 0.5536225438117981,
"mean_token_accuracy": 0.7781998217105865,
"num_tokens": 1438929.0,
"step": 88
},
{
"entropy": 0.5673100650310516,
"epoch": 0.33239962651727356,
"grad_norm": 0.03673219308257103,
"learning_rate": 0.0002,
"loss": 0.5621542930603027,
"mean_token_accuracy": 0.7736853212118149,
"num_tokens": 1455379.0,
"step": 89
},
{
"entropy": 0.5614307522773743,
"epoch": 0.33613445378151263,
"grad_norm": 0.037591755390167236,
"learning_rate": 0.0002,
"loss": 0.5566410422325134,
"mean_token_accuracy": 0.7733979523181915,
"num_tokens": 1471484.0,
"step": 90
},
{
"entropy": 0.5533501952886581,
"epoch": 0.33986928104575165,
"grad_norm": 0.03392329066991806,
"learning_rate": 0.0002,
"loss": 0.5534408092498779,
"mean_token_accuracy": 0.7756673395633698,
"num_tokens": 1487940.0,
"step": 91
},
{
"entropy": 0.5670682638883591,
"epoch": 0.34360410830999066,
"grad_norm": 0.038744084537029266,
"learning_rate": 0.0002,
"loss": 0.5757073760032654,
"mean_token_accuracy": 0.7674537003040314,
"num_tokens": 1504516.0,
"step": 92
},
{
"entropy": 0.5437405109405518,
"epoch": 0.3473389355742297,
"grad_norm": 0.03382673114538193,
"learning_rate": 0.0002,
"loss": 0.5484196543693542,
"mean_token_accuracy": 0.7756420075893402,
"num_tokens": 1520914.0,
"step": 93
},
{
"entropy": 0.5495916306972504,
"epoch": 0.3510737628384687,
"grad_norm": 0.03743721917271614,
"learning_rate": 0.0002,
"loss": 0.5565813183784485,
"mean_token_accuracy": 0.7735388725996017,
"num_tokens": 1537124.0,
"step": 94
},
{
"entropy": 0.568208858370781,
"epoch": 0.35480859010270777,
"grad_norm": 0.03229435160756111,
"learning_rate": 0.0002,
"loss": 0.5690167546272278,
"mean_token_accuracy": 0.7696976512670517,
"num_tokens": 1553562.0,
"step": 95
},
{
"entropy": 0.5612770318984985,
"epoch": 0.3585434173669468,
"grad_norm": 0.03424388915300369,
"learning_rate": 0.0002,
"loss": 0.5587109923362732,
"mean_token_accuracy": 0.774835541844368,
"num_tokens": 1569896.0,
"step": 96
},
{
"entropy": 0.5718783587217331,
"epoch": 0.3622782446311858,
"grad_norm": 0.033101778477430344,
"learning_rate": 0.0002,
"loss": 0.5643482208251953,
"mean_token_accuracy": 0.7721461206674576,
"num_tokens": 1586284.0,
"step": 97
},
{
"entropy": 0.5654337555170059,
"epoch": 0.3660130718954248,
"grad_norm": 0.035547658801078796,
"learning_rate": 0.0002,
"loss": 0.5555263757705688,
"mean_token_accuracy": 0.7783078551292419,
"num_tokens": 1602584.0,
"step": 98
},
{
"entropy": 0.5639571994543076,
"epoch": 0.3697478991596639,
"grad_norm": 0.03868361935019493,
"learning_rate": 0.0002,
"loss": 0.5630732178688049,
"mean_token_accuracy": 0.773595780134201,
"num_tokens": 1618810.0,
"step": 99
},
{
"entropy": 0.568704292178154,
"epoch": 0.3734827264239029,
"grad_norm": 0.03236787021160126,
"learning_rate": 0.0002,
"loss": 0.5669816732406616,
"mean_token_accuracy": 0.7704071253538132,
"num_tokens": 1635290.0,
"step": 100
},
{
"entropy": 0.551744356751442,
"epoch": 0.3772175536881419,
"grad_norm": 0.03913586586713791,
"learning_rate": 0.0002,
"loss": 0.5576678514480591,
"mean_token_accuracy": 0.7771230936050415,
"num_tokens": 1651818.0,
"step": 101
},
{
"entropy": 0.5260472893714905,
"epoch": 0.38095238095238093,
"grad_norm": 0.035290028899908066,
"learning_rate": 0.0002,
"loss": 0.5295023918151855,
"mean_token_accuracy": 0.7862183749675751,
"num_tokens": 1668252.0,
"step": 102
},
{
"entropy": 0.5585302114486694,
"epoch": 0.38468720821662,
"grad_norm": 0.03497280925512314,
"learning_rate": 0.0002,
"loss": 0.5631093978881836,
"mean_token_accuracy": 0.7744487524032593,
"num_tokens": 1684730.0,
"step": 103
},
{
"entropy": 0.5317506641149521,
"epoch": 0.388422035480859,
"grad_norm": 0.038267575204372406,
"learning_rate": 0.0002,
"loss": 0.5366777777671814,
"mean_token_accuracy": 0.7837612628936768,
"num_tokens": 1700724.0,
"step": 104
},
{
"entropy": 0.5369188189506531,
"epoch": 0.39215686274509803,
"grad_norm": 0.03429935500025749,
"learning_rate": 0.0002,
"loss": 0.5283028483390808,
"mean_token_accuracy": 0.7885325402021408,
"num_tokens": 1717105.0,
"step": 105
},
{
"entropy": 0.5693536698818207,
"epoch": 0.39589169000933705,
"grad_norm": 0.038153599947690964,
"learning_rate": 0.0002,
"loss": 0.5606598258018494,
"mean_token_accuracy": 0.7737682908773422,
"num_tokens": 1733363.0,
"step": 106
},
{
"entropy": 0.5737781524658203,
"epoch": 0.3996265172735761,
"grad_norm": 0.034137699753046036,
"learning_rate": 0.0002,
"loss": 0.5676036477088928,
"mean_token_accuracy": 0.7725923210382462,
"num_tokens": 1749928.0,
"step": 107
},
{
"entropy": 0.5680664926767349,
"epoch": 0.40336134453781514,
"grad_norm": 0.035801518708467484,
"learning_rate": 0.0002,
"loss": 0.5669195055961609,
"mean_token_accuracy": 0.7720014601945877,
"num_tokens": 1766520.0,
"step": 108
},
{
"entropy": 0.5640780180692673,
"epoch": 0.40709617180205415,
"grad_norm": 0.036836352199316025,
"learning_rate": 0.0002,
"loss": 0.5703918933868408,
"mean_token_accuracy": 0.7716377079486847,
"num_tokens": 1783002.0,
"step": 109
},
{
"entropy": 0.554967850446701,
"epoch": 0.41083099906629317,
"grad_norm": 0.03882612660527229,
"learning_rate": 0.0002,
"loss": 0.5642282962799072,
"mean_token_accuracy": 0.7699488997459412,
"num_tokens": 1799237.0,
"step": 110
},
{
"entropy": 0.5514571368694305,
"epoch": 0.41456582633053224,
"grad_norm": 0.03324515372514725,
"learning_rate": 0.0002,
"loss": 0.5484537482261658,
"mean_token_accuracy": 0.7782372832298279,
"num_tokens": 1815769.0,
"step": 111
},
{
"entropy": 0.573599174618721,
"epoch": 0.41830065359477125,
"grad_norm": 0.03034473955631256,
"learning_rate": 0.0002,
"loss": 0.5679251551628113,
"mean_token_accuracy": 0.7719407975673676,
"num_tokens": 1831989.0,
"step": 112
},
{
"entropy": 0.5896201282739639,
"epoch": 0.42203548085901027,
"grad_norm": 0.03557023033499718,
"learning_rate": 0.0002,
"loss": 0.5836873054504395,
"mean_token_accuracy": 0.7634387165307999,
"num_tokens": 1848590.0,
"step": 113
},
{
"entropy": 0.5535563677549362,
"epoch": 0.4257703081232493,
"grad_norm": 0.032203588634729385,
"learning_rate": 0.0002,
"loss": 0.5510682463645935,
"mean_token_accuracy": 0.7764001041650772,
"num_tokens": 1864862.0,
"step": 114
},
{
"entropy": 0.5557997226715088,
"epoch": 0.4295051353874883,
"grad_norm": 0.033370040357112885,
"learning_rate": 0.0002,
"loss": 0.5584062933921814,
"mean_token_accuracy": 0.7749063074588776,
"num_tokens": 1881168.0,
"step": 115
},
{
"entropy": 0.5543448776006699,
"epoch": 0.4332399626517274,
"grad_norm": 0.030230488628149033,
"learning_rate": 0.0002,
"loss": 0.5530171990394592,
"mean_token_accuracy": 0.7758816778659821,
"num_tokens": 1897482.0,
"step": 116
},
{
"entropy": 0.5602561086416245,
"epoch": 0.4369747899159664,
"grad_norm": 0.03355773538351059,
"learning_rate": 0.0002,
"loss": 0.5631951093673706,
"mean_token_accuracy": 0.7723173201084137,
"num_tokens": 1913520.0,
"step": 117
},
{
"entropy": 0.5448198318481445,
"epoch": 0.4407096171802054,
"grad_norm": 0.03538920357823372,
"learning_rate": 0.0002,
"loss": 0.5498956441879272,
"mean_token_accuracy": 0.7779627591371536,
"num_tokens": 1929827.0,
"step": 118
},
{
"entropy": 0.5492925643920898,
"epoch": 0.4444444444444444,
"grad_norm": 0.03334996476769447,
"learning_rate": 0.0002,
"loss": 0.5524949431419373,
"mean_token_accuracy": 0.7753683775663376,
"num_tokens": 1946145.0,
"step": 119
},
{
"entropy": 0.5578335374593735,
"epoch": 0.4481792717086835,
"grad_norm": 0.029814472422003746,
"learning_rate": 0.0002,
"loss": 0.5506975650787354,
"mean_token_accuracy": 0.7767714560031891,
"num_tokens": 1962460.0,
"step": 120
},
{
"entropy": 0.5471834242343903,
"epoch": 0.4519140989729225,
"grad_norm": 0.030702516436576843,
"learning_rate": 0.0002,
"loss": 0.5459597110748291,
"mean_token_accuracy": 0.7779918015003204,
"num_tokens": 1978468.0,
"step": 121
},
{
"entropy": 0.5746940076351166,
"epoch": 0.4556489262371615,
"grad_norm": 0.028086913749575615,
"learning_rate": 0.0002,
"loss": 0.5758755207061768,
"mean_token_accuracy": 0.766986295580864,
"num_tokens": 1994816.0,
"step": 122
},
{
"entropy": 0.5609753727912903,
"epoch": 0.45938375350140054,
"grad_norm": 0.027476167306303978,
"learning_rate": 0.0002,
"loss": 0.5596047639846802,
"mean_token_accuracy": 0.7727872580289841,
"num_tokens": 2011498.0,
"step": 123
},
{
"entropy": 0.5600833296775818,
"epoch": 0.4631185807656396,
"grad_norm": 0.03369581326842308,
"learning_rate": 0.0002,
"loss": 0.5641721487045288,
"mean_token_accuracy": 0.7693867385387421,
"num_tokens": 2027843.0,
"step": 124
},
{
"entropy": 0.5480703115463257,
"epoch": 0.4668534080298786,
"grad_norm": 0.029643159359693527,
"learning_rate": 0.0002,
"loss": 0.554192841053009,
"mean_token_accuracy": 0.7775781005620956,
"num_tokens": 2044099.0,
"step": 125
},
{
"entropy": 0.5571865439414978,
"epoch": 0.47058823529411764,
"grad_norm": 0.032963886857032776,
"learning_rate": 0.0002,
"loss": 0.5603472590446472,
"mean_token_accuracy": 0.7727210968732834,
"num_tokens": 2060417.0,
"step": 126
},
{
"entropy": 0.5587971061468124,
"epoch": 0.47432306255835666,
"grad_norm": 0.028774971142411232,
"learning_rate": 0.0002,
"loss": 0.5552476644515991,
"mean_token_accuracy": 0.7738739997148514,
"num_tokens": 2076710.0,
"step": 127
},
{
"entropy": 0.5658144652843475,
"epoch": 0.4780578898225957,
"grad_norm": 0.03230098634958267,
"learning_rate": 0.0002,
"loss": 0.557459831237793,
"mean_token_accuracy": 0.7754161208868027,
"num_tokens": 2093196.0,
"step": 128
},
{
"entropy": 0.5515187084674835,
"epoch": 0.48179271708683474,
"grad_norm": 0.03461001068353653,
"learning_rate": 0.0002,
"loss": 0.547848641872406,
"mean_token_accuracy": 0.7798665314912796,
"num_tokens": 2109091.0,
"step": 129
},
{
"entropy": 0.5527725219726562,
"epoch": 0.48552754435107376,
"grad_norm": 0.03391197323799133,
"learning_rate": 0.0002,
"loss": 0.5531637072563171,
"mean_token_accuracy": 0.7753576338291168,
"num_tokens": 2125292.0,
"step": 130
},
{
"entropy": 0.5310224145650864,
"epoch": 0.4892623716153128,
"grad_norm": 0.037288419902324677,
"learning_rate": 0.0002,
"loss": 0.5368673801422119,
"mean_token_accuracy": 0.7833587974309921,
"num_tokens": 2141768.0,
"step": 131
},
{
"entropy": 0.5471584349870682,
"epoch": 0.49299719887955185,
"grad_norm": 0.03433871641755104,
"learning_rate": 0.0002,
"loss": 0.5525721907615662,
"mean_token_accuracy": 0.776105523109436,
"num_tokens": 2158143.0,
"step": 132
},
{
"entropy": 0.5587402433156967,
"epoch": 0.49673202614379086,
"grad_norm": 0.03347739949822426,
"learning_rate": 0.0002,
"loss": 0.5661599636077881,
"mean_token_accuracy": 0.7718635648488998,
"num_tokens": 2174416.0,
"step": 133
},
{
"entropy": 0.5683765709400177,
"epoch": 0.5004668534080299,
"grad_norm": 0.03381507471203804,
"learning_rate": 0.0002,
"loss": 0.5622847080230713,
"mean_token_accuracy": 0.7744656354188919,
"num_tokens": 2190880.0,
"step": 134
},
{
"entropy": 0.5644540786743164,
"epoch": 0.5042016806722689,
"grad_norm": 0.03272015228867531,
"learning_rate": 0.0002,
"loss": 0.5552080869674683,
"mean_token_accuracy": 0.7752301692962646,
"num_tokens": 2207174.0,
"step": 135
},
{
"entropy": 0.5678849667310715,
"epoch": 0.5079365079365079,
"grad_norm": 0.031616441905498505,
"learning_rate": 0.0002,
"loss": 0.5582877993583679,
"mean_token_accuracy": 0.7729764580726624,
"num_tokens": 2223657.0,
"step": 136
},
{
"entropy": 0.560051366686821,
"epoch": 0.5116713352007469,
"grad_norm": 0.03558259457349777,
"learning_rate": 0.0002,
"loss": 0.5536358952522278,
"mean_token_accuracy": 0.7764490097761154,
"num_tokens": 2239931.0,
"step": 137
},
{
"entropy": 0.5550469309091568,
"epoch": 0.5154061624649859,
"grad_norm": 0.034295059740543365,
"learning_rate": 0.0002,
"loss": 0.5614034533500671,
"mean_token_accuracy": 0.7718400210142136,
"num_tokens": 2256301.0,
"step": 138
},
{
"entropy": 0.5675243437290192,
"epoch": 0.5191409897292251,
"grad_norm": 0.03538001328706741,
"learning_rate": 0.0002,
"loss": 0.5784004926681519,
"mean_token_accuracy": 0.7684118300676346,
"num_tokens": 2272718.0,
"step": 139
},
{
"entropy": 0.5533763766288757,
"epoch": 0.5228758169934641,
"grad_norm": 0.034997887909412384,
"learning_rate": 0.0002,
"loss": 0.563084602355957,
"mean_token_accuracy": 0.7709241509437561,
"num_tokens": 2289039.0,
"step": 140
},
{
"entropy": 0.5602118372917175,
"epoch": 0.5266106442577031,
"grad_norm": 0.033439598977565765,
"learning_rate": 0.0002,
"loss": 0.5643538236618042,
"mean_token_accuracy": 0.7725736945867538,
"num_tokens": 2305409.0,
"step": 141
},
{
"entropy": 0.572220578789711,
"epoch": 0.5303454715219421,
"grad_norm": 0.02899010293185711,
"learning_rate": 0.0002,
"loss": 0.56317138671875,
"mean_token_accuracy": 0.7727230340242386,
"num_tokens": 2321812.0,
"step": 142
},
{
"entropy": 0.5518327206373215,
"epoch": 0.5340802987861811,
"grad_norm": 0.03380458429455757,
"learning_rate": 0.0002,
"loss": 0.5400616526603699,
"mean_token_accuracy": 0.7813573479652405,
"num_tokens": 2338293.0,
"step": 143
},
{
"entropy": 0.59617879986763,
"epoch": 0.5378151260504201,
"grad_norm": 0.03466860204935074,
"learning_rate": 0.0002,
"loss": 0.58748859167099,
"mean_token_accuracy": 0.7642232924699783,
"num_tokens": 2354694.0,
"step": 144
},
{
"entropy": 0.5574633181095123,
"epoch": 0.5415499533146592,
"grad_norm": 0.030799690634012222,
"learning_rate": 0.0002,
"loss": 0.5586976408958435,
"mean_token_accuracy": 0.774814635515213,
"num_tokens": 2370998.0,
"step": 145
},
{
"entropy": 0.5298123508691788,
"epoch": 0.5452847805788982,
"grad_norm": 0.032734956592321396,
"learning_rate": 0.0002,
"loss": 0.5359174609184265,
"mean_token_accuracy": 0.782838299870491,
"num_tokens": 2387173.0,
"step": 146
},
{
"entropy": 0.5436026155948639,
"epoch": 0.5490196078431373,
"grad_norm": 0.03734711930155754,
"learning_rate": 0.0002,
"loss": 0.5544965267181396,
"mean_token_accuracy": 0.7772063612937927,
"num_tokens": 2403457.0,
"step": 147
},
{
"entropy": 0.5453614443540573,
"epoch": 0.5527544351073763,
"grad_norm": 0.030067089945077896,
"learning_rate": 0.0002,
"loss": 0.5510781407356262,
"mean_token_accuracy": 0.7755871117115021,
"num_tokens": 2419735.0,
"step": 148
},
{
"entropy": 0.55818210542202,
"epoch": 0.5564892623716153,
"grad_norm": 0.02786589413881302,
"learning_rate": 0.0002,
"loss": 0.5563390851020813,
"mean_token_accuracy": 0.7738417237997055,
"num_tokens": 2436098.0,
"step": 149
},
{
"entropy": 0.5619741082191467,
"epoch": 0.5602240896358543,
"grad_norm": 0.030777357518672943,
"learning_rate": 0.0002,
"loss": 0.5554664134979248,
"mean_token_accuracy": 0.7789015769958496,
"num_tokens": 2452471.0,
"step": 150
},
{
"entropy": 0.5570534616708755,
"epoch": 0.5639589169000934,
"grad_norm": 0.03233370929956436,
"learning_rate": 0.0002,
"loss": 0.5482333898544312,
"mean_token_accuracy": 0.7772232443094254,
"num_tokens": 2468628.0,
"step": 151
},
{
"entropy": 0.5588962733745575,
"epoch": 0.5676937441643324,
"grad_norm": 0.03047763742506504,
"learning_rate": 0.0002,
"loss": 0.5532917380332947,
"mean_token_accuracy": 0.7753781825304031,
"num_tokens": 2485072.0,
"step": 152
},
{
"entropy": 0.549691841006279,
"epoch": 0.5714285714285714,
"grad_norm": 0.02944052591919899,
"learning_rate": 0.0002,
"loss": 0.5515119433403015,
"mean_token_accuracy": 0.7769780606031418,
"num_tokens": 2501327.0,
"step": 153
},
{
"entropy": 0.5404879450798035,
"epoch": 0.5751633986928104,
"grad_norm": 0.032262854278087616,
"learning_rate": 0.0002,
"loss": 0.5476431846618652,
"mean_token_accuracy": 0.7793239504098892,
"num_tokens": 2517799.0,
"step": 154
},
{
"entropy": 0.5289865881204605,
"epoch": 0.5788982259570495,
"grad_norm": 0.03042609617114067,
"learning_rate": 0.0002,
"loss": 0.531823992729187,
"mean_token_accuracy": 0.7862056195735931,
"num_tokens": 2534300.0,
"step": 155
},
{
"entropy": 0.5359181612730026,
"epoch": 0.5826330532212886,
"grad_norm": 0.030735395848751068,
"learning_rate": 0.0002,
"loss": 0.5355162024497986,
"mean_token_accuracy": 0.7830311506986618,
"num_tokens": 2550561.0,
"step": 156
},
{
"entropy": 0.555221676826477,
"epoch": 0.5863678804855276,
"grad_norm": 0.03072836995124817,
"learning_rate": 0.0002,
"loss": 0.5626713037490845,
"mean_token_accuracy": 0.7714420855045319,
"num_tokens": 2566961.0,
"step": 157
},
{
"entropy": 0.553142175078392,
"epoch": 0.5901027077497666,
"grad_norm": 0.030098870396614075,
"learning_rate": 0.0002,
"loss": 0.5467352867126465,
"mean_token_accuracy": 0.7787252068519592,
"num_tokens": 2583507.0,
"step": 158
},
{
"entropy": 0.5665386617183685,
"epoch": 0.5938375350140056,
"grad_norm": 0.03258649259805679,
"learning_rate": 0.0002,
"loss": 0.5577669143676758,
"mean_token_accuracy": 0.7736402750015259,
"num_tokens": 2599944.0,
"step": 159
},
{
"entropy": 0.5569501370191574,
"epoch": 0.5975723622782446,
"grad_norm": 0.03186054900288582,
"learning_rate": 0.0002,
"loss": 0.5573895573616028,
"mean_token_accuracy": 0.776360809803009,
"num_tokens": 2616293.0,
"step": 160
},
{
"entropy": 0.5284514650702477,
"epoch": 0.6013071895424836,
"grad_norm": 0.029392873868346214,
"learning_rate": 0.0002,
"loss": 0.53591388463974,
"mean_token_accuracy": 0.7802938669919968,
"num_tokens": 2632542.0,
"step": 161
},
{
"entropy": 0.5517806857824326,
"epoch": 0.6050420168067226,
"grad_norm": 0.03547659516334534,
"learning_rate": 0.0002,
"loss": 0.5624344348907471,
"mean_token_accuracy": 0.7713066786527634,
"num_tokens": 2648855.0,
"step": 162
},
{
"entropy": 0.5444875061511993,
"epoch": 0.6087768440709617,
"grad_norm": 0.032323673367500305,
"learning_rate": 0.0002,
"loss": 0.5506078004837036,
"mean_token_accuracy": 0.7763939499855042,
"num_tokens": 2665389.0,
"step": 163
},
{
"entropy": 0.552508682012558,
"epoch": 0.6125116713352008,
"grad_norm": 0.029938260093331337,
"learning_rate": 0.0002,
"loss": 0.5556696653366089,
"mean_token_accuracy": 0.774255782365799,
"num_tokens": 2681574.0,
"step": 164
},
{
"entropy": 0.5732054561376572,
"epoch": 0.6162464985994398,
"grad_norm": 0.027899837121367455,
"learning_rate": 0.0002,
"loss": 0.5643041133880615,
"mean_token_accuracy": 0.7738403379917145,
"num_tokens": 2697956.0,
"step": 165
},
{
"entropy": 0.5575381070375443,
"epoch": 0.6199813258636788,
"grad_norm": 0.03164415806531906,
"learning_rate": 0.0002,
"loss": 0.5456005930900574,
"mean_token_accuracy": 0.7768769711256027,
"num_tokens": 2714390.0,
"step": 166
},
{
"entropy": 0.5516810864210129,
"epoch": 0.6237161531279178,
"grad_norm": 0.02569694072008133,
"learning_rate": 0.0002,
"loss": 0.5495009422302246,
"mean_token_accuracy": 0.774631917476654,
"num_tokens": 2730912.0,
"step": 167
},
{
"entropy": 0.5496233999729156,
"epoch": 0.6274509803921569,
"grad_norm": 0.03019907884299755,
"learning_rate": 0.0002,
"loss": 0.5496887564659119,
"mean_token_accuracy": 0.7817335277795792,
"num_tokens": 2747282.0,
"step": 168
},
{
"entropy": 0.5489860326051712,
"epoch": 0.6311858076563959,
"grad_norm": 0.03389516472816467,
"learning_rate": 0.0002,
"loss": 0.5572369694709778,
"mean_token_accuracy": 0.7735096365213394,
"num_tokens": 2763708.0,
"step": 169
},
{
"entropy": 0.5558005720376968,
"epoch": 0.6349206349206349,
"grad_norm": 0.02765459194779396,
"learning_rate": 0.0002,
"loss": 0.5571833848953247,
"mean_token_accuracy": 0.7726074606180191,
"num_tokens": 2780084.0,
"step": 170
},
{
"entropy": 0.5543476939201355,
"epoch": 0.6386554621848739,
"grad_norm": 0.0267086960375309,
"learning_rate": 0.0002,
"loss": 0.5579585433006287,
"mean_token_accuracy": 0.7720465064048767,
"num_tokens": 2796592.0,
"step": 171
},
{
"entropy": 0.5531720370054245,
"epoch": 0.642390289449113,
"grad_norm": 0.03003924898803234,
"learning_rate": 0.0002,
"loss": 0.5539361238479614,
"mean_token_accuracy": 0.7745767682790756,
"num_tokens": 2813004.0,
"step": 172
},
{
"entropy": 0.5696417987346649,
"epoch": 0.646125116713352,
"grad_norm": 0.030649833381175995,
"learning_rate": 0.0002,
"loss": 0.5720299482345581,
"mean_token_accuracy": 0.7685467600822449,
"num_tokens": 2829346.0,
"step": 173
},
{
"entropy": 0.5682009905576706,
"epoch": 0.6498599439775911,
"grad_norm": 0.028095850721001625,
"learning_rate": 0.0002,
"loss": 0.5576902627944946,
"mean_token_accuracy": 0.7762027978897095,
"num_tokens": 2845908.0,
"step": 174
},
{
"entropy": 0.5714679658412933,
"epoch": 0.6535947712418301,
"grad_norm": 0.028559835627675056,
"learning_rate": 0.0002,
"loss": 0.5658706426620483,
"mean_token_accuracy": 0.7675664275884628,
"num_tokens": 2862417.0,
"step": 175
},
{
"entropy": 0.5519525855779648,
"epoch": 0.6573295985060691,
"grad_norm": 0.034554384648799896,
"learning_rate": 0.0002,
"loss": 0.5615457892417908,
"mean_token_accuracy": 0.7730480134487152,
"num_tokens": 2878691.0,
"step": 176
},
{
"entropy": 0.5469972342252731,
"epoch": 0.6610644257703081,
"grad_norm": 0.038470808416604996,
"learning_rate": 0.0002,
"loss": 0.5615893006324768,
"mean_token_accuracy": 0.7721795290708542,
"num_tokens": 2894997.0,
"step": 177
},
{
"entropy": 0.5659243762493134,
"epoch": 0.6647992530345471,
"grad_norm": 0.028726449236273766,
"learning_rate": 0.0002,
"loss": 0.5627461671829224,
"mean_token_accuracy": 0.7720647305250168,
"num_tokens": 2911504.0,
"step": 178
},
{
"entropy": 0.5529140681028366,
"epoch": 0.6685340802987861,
"grad_norm": 0.02865666151046753,
"learning_rate": 0.0002,
"loss": 0.551171064376831,
"mean_token_accuracy": 0.7765299677848816,
"num_tokens": 2927890.0,
"step": 179
},
{
"entropy": 0.5597221851348877,
"epoch": 0.6722689075630253,
"grad_norm": 0.030919602140784264,
"learning_rate": 0.0002,
"loss": 0.5537790656089783,
"mean_token_accuracy": 0.7759328931570053,
"num_tokens": 2944242.0,
"step": 180
},
{
"entropy": 0.562122106552124,
"epoch": 0.6760037348272643,
"grad_norm": 0.03044375404715538,
"learning_rate": 0.0002,
"loss": 0.5568514466285706,
"mean_token_accuracy": 0.7706819474697113,
"num_tokens": 2960500.0,
"step": 181
},
{
"entropy": 0.5697348713874817,
"epoch": 0.6797385620915033,
"grad_norm": 0.031796056777238846,
"learning_rate": 0.0002,
"loss": 0.5688814520835876,
"mean_token_accuracy": 0.7685033828020096,
"num_tokens": 2976732.0,
"step": 182
},
{
"entropy": 0.5696271657943726,
"epoch": 0.6834733893557423,
"grad_norm": 0.034152235835790634,
"learning_rate": 0.0002,
"loss": 0.570652186870575,
"mean_token_accuracy": 0.7676333039999008,
"num_tokens": 2993011.0,
"step": 183
},
{
"entropy": 0.5509230494499207,
"epoch": 0.6872082166199813,
"grad_norm": 0.030170850455760956,
"learning_rate": 0.0002,
"loss": 0.5528304576873779,
"mean_token_accuracy": 0.7786384671926498,
"num_tokens": 3009475.0,
"step": 184
},
{
"entropy": 0.549485370516777,
"epoch": 0.6909430438842203,
"grad_norm": 0.03623858466744423,
"learning_rate": 0.0002,
"loss": 0.5553773045539856,
"mean_token_accuracy": 0.7744152545928955,
"num_tokens": 3025920.0,
"step": 185
},
{
"entropy": 0.5484632700681686,
"epoch": 0.6946778711484594,
"grad_norm": 0.033118441700935364,
"learning_rate": 0.0002,
"loss": 0.5544424653053284,
"mean_token_accuracy": 0.7758429795503616,
"num_tokens": 3042293.0,
"step": 186
},
{
"entropy": 0.5471510142087936,
"epoch": 0.6984126984126984,
"grad_norm": 0.027027102187275887,
"learning_rate": 0.0002,
"loss": 0.5416866540908813,
"mean_token_accuracy": 0.7816910296678543,
"num_tokens": 3058771.0,
"step": 187
},
{
"entropy": 0.5579911917448044,
"epoch": 0.7021475256769374,
"grad_norm": 0.03291584923863411,
"learning_rate": 0.0002,
"loss": 0.5471009016036987,
"mean_token_accuracy": 0.7790512144565582,
"num_tokens": 3075134.0,
"step": 188
},
{
"entropy": 0.5525984019041061,
"epoch": 0.7058823529411765,
"grad_norm": 0.029011745005846024,
"learning_rate": 0.0002,
"loss": 0.5483554005622864,
"mean_token_accuracy": 0.7763502299785614,
"num_tokens": 3091306.0,
"step": 189
},
{
"entropy": 0.5610422939062119,
"epoch": 0.7096171802054155,
"grad_norm": 0.02904326282441616,
"learning_rate": 0.0002,
"loss": 0.5617838501930237,
"mean_token_accuracy": 0.7707021087408066,
"num_tokens": 3107639.0,
"step": 190
},
{
"entropy": 0.5382349342107773,
"epoch": 0.7133520074696545,
"grad_norm": 0.027915941551327705,
"learning_rate": 0.0002,
"loss": 0.5406217575073242,
"mean_token_accuracy": 0.7792213708162308,
"num_tokens": 3123888.0,
"step": 191
},
{
"entropy": 0.5334387570619583,
"epoch": 0.7170868347338936,
"grad_norm": 0.024687422439455986,
"learning_rate": 0.0002,
"loss": 0.5337969660758972,
"mean_token_accuracy": 0.7827744781970978,
"num_tokens": 3140136.0,
"step": 192
},
{
"entropy": 0.5519388318061829,
"epoch": 0.7208216619981326,
"grad_norm": 0.03399450331926346,
"learning_rate": 0.0002,
"loss": 0.5664753317832947,
"mean_token_accuracy": 0.7712263911962509,
"num_tokens": 3156560.0,
"step": 193
},
{
"entropy": 0.5329768806695938,
"epoch": 0.7245564892623716,
"grad_norm": 0.03143489733338356,
"learning_rate": 0.0002,
"loss": 0.5424296259880066,
"mean_token_accuracy": 0.7808002233505249,
"num_tokens": 3172868.0,
"step": 194
},
{
"entropy": 0.5407986044883728,
"epoch": 0.7282913165266106,
"grad_norm": 0.02865898422896862,
"learning_rate": 0.0002,
"loss": 0.5426485538482666,
"mean_token_accuracy": 0.7797252386808395,
"num_tokens": 3188845.0,
"step": 195
},
{
"entropy": 0.5540356040000916,
"epoch": 0.7320261437908496,
"grad_norm": 0.031195135787129402,
"learning_rate": 0.0002,
"loss": 0.5537624359130859,
"mean_token_accuracy": 0.772818997502327,
"num_tokens": 3205059.0,
"step": 196
},
{
"entropy": 0.547016367316246,
"epoch": 0.7357609710550888,
"grad_norm": 0.026600942015647888,
"learning_rate": 0.0002,
"loss": 0.5409566164016724,
"mean_token_accuracy": 0.7801954299211502,
"num_tokens": 3221339.0,
"step": 197
},
{
"entropy": 0.5571199655532837,
"epoch": 0.7394957983193278,
"grad_norm": 0.027464795857667923,
"learning_rate": 0.0002,
"loss": 0.5505565404891968,
"mean_token_accuracy": 0.7758535593748093,
"num_tokens": 3237556.0,
"step": 198
},
{
"entropy": 0.5562743991613388,
"epoch": 0.7432306255835668,
"grad_norm": 0.029805589467287064,
"learning_rate": 0.0002,
"loss": 0.5526044368743896,
"mean_token_accuracy": 0.7738559246063232,
"num_tokens": 3253871.0,
"step": 199
},
{
"entropy": 0.5585610568523407,
"epoch": 0.7469654528478058,
"grad_norm": 0.03004448115825653,
"learning_rate": 0.0002,
"loss": 0.5598405599594116,
"mean_token_accuracy": 0.7726627141237259,
"num_tokens": 3269973.0,
"step": 200
},
{
"entropy": 0.5488641411066055,
"epoch": 0.7507002801120448,
"grad_norm": 0.027654554694890976,
"learning_rate": 0.0002,
"loss": 0.5513002276420593,
"mean_token_accuracy": 0.7737944573163986,
"num_tokens": 3286201.0,
"step": 201
},
{
"entropy": 0.5287523940205574,
"epoch": 0.7544351073762838,
"grad_norm": 0.03466613590717316,
"learning_rate": 0.0002,
"loss": 0.5311362743377686,
"mean_token_accuracy": 0.7847718745470047,
"num_tokens": 3302467.0,
"step": 202
},
{
"entropy": 0.5560965240001678,
"epoch": 0.7581699346405228,
"grad_norm": 0.034095581620931625,
"learning_rate": 0.0002,
"loss": 0.5613946914672852,
"mean_token_accuracy": 0.7737453281879425,
"num_tokens": 3318768.0,
"step": 203
},
{
"entropy": 0.5630687177181244,
"epoch": 0.7619047619047619,
"grad_norm": 0.03233996778726578,
"learning_rate": 0.0002,
"loss": 0.564468264579773,
"mean_token_accuracy": 0.7691166549921036,
"num_tokens": 3335233.0,
"step": 204
},
{
"entropy": 0.5651765614748001,
"epoch": 0.765639589169001,
"grad_norm": 0.030395060777664185,
"learning_rate": 0.0002,
"loss": 0.5597318410873413,
"mean_token_accuracy": 0.7716515213251114,
"num_tokens": 3351439.0,
"step": 205
},
{
"entropy": 0.5476003587245941,
"epoch": 0.76937441643324,
"grad_norm": 0.03382452204823494,
"learning_rate": 0.0002,
"loss": 0.5447301864624023,
"mean_token_accuracy": 0.7816700637340546,
"num_tokens": 3367520.0,
"step": 206
},
{
"entropy": 0.5484471321105957,
"epoch": 0.773109243697479,
"grad_norm": 0.02830951102077961,
"learning_rate": 0.0002,
"loss": 0.5454609394073486,
"mean_token_accuracy": 0.7790801376104355,
"num_tokens": 3383667.0,
"step": 207
},
{
"entropy": 0.5659755617380142,
"epoch": 0.776844070961718,
"grad_norm": 0.02530798688530922,
"learning_rate": 0.0002,
"loss": 0.5655397772789001,
"mean_token_accuracy": 0.770569920539856,
"num_tokens": 3400150.0,
"step": 208
},
{
"entropy": 0.5427214205265045,
"epoch": 0.780578898225957,
"grad_norm": 0.03361448645591736,
"learning_rate": 0.0002,
"loss": 0.5476981401443481,
"mean_token_accuracy": 0.7780336290597916,
"num_tokens": 3416165.0,
"step": 209
},
{
"entropy": 0.5494136065244675,
"epoch": 0.7843137254901961,
"grad_norm": 0.029303058981895447,
"learning_rate": 0.0002,
"loss": 0.555971086025238,
"mean_token_accuracy": 0.7742915004491806,
"num_tokens": 3432668.0,
"step": 210
},
{
"entropy": 0.5408063977956772,
"epoch": 0.7880485527544351,
"grad_norm": 0.024706227704882622,
"learning_rate": 0.0002,
"loss": 0.5423460006713867,
"mean_token_accuracy": 0.7791419923305511,
"num_tokens": 3449230.0,
"step": 211
},
{
"entropy": 0.5585084557533264,
"epoch": 0.7917833800186741,
"grad_norm": 0.031753819435834885,
"learning_rate": 0.0002,
"loss": 0.5534642934799194,
"mean_token_accuracy": 0.7761369943618774,
"num_tokens": 3465888.0,
"step": 212
},
{
"entropy": 0.5470991730690002,
"epoch": 0.7955182072829131,
"grad_norm": 0.02627946063876152,
"learning_rate": 0.0002,
"loss": 0.543735921382904,
"mean_token_accuracy": 0.7773504257202148,
"num_tokens": 3482200.0,
"step": 213
},
{
"entropy": 0.5522027462720871,
"epoch": 0.7992530345471522,
"grad_norm": 0.02693161368370056,
"learning_rate": 0.0002,
"loss": 0.5497567057609558,
"mean_token_accuracy": 0.7760942578315735,
"num_tokens": 3498472.0,
"step": 214
},
{
"entropy": 0.5438102185726166,
"epoch": 0.8029878618113913,
"grad_norm": 0.029677148908376694,
"learning_rate": 0.0002,
"loss": 0.5449556112289429,
"mean_token_accuracy": 0.7757529467344284,
"num_tokens": 3514748.0,
"step": 215
},
{
"entropy": 0.5440456867218018,
"epoch": 0.8067226890756303,
"grad_norm": 0.028825437650084496,
"learning_rate": 0.0002,
"loss": 0.5460283160209656,
"mean_token_accuracy": 0.7805955857038498,
"num_tokens": 3530910.0,
"step": 216
},
{
"entropy": 0.5444321483373642,
"epoch": 0.8104575163398693,
"grad_norm": 0.023829322308301926,
"learning_rate": 0.0002,
"loss": 0.5420593023300171,
"mean_token_accuracy": 0.7787522822618484,
"num_tokens": 3547036.0,
"step": 217
},
{
"entropy": 0.5498476177453995,
"epoch": 0.8141923436041083,
"grad_norm": 0.025729795917868614,
"learning_rate": 0.0002,
"loss": 0.5429421067237854,
"mean_token_accuracy": 0.7785259187221527,
"num_tokens": 3563357.0,
"step": 218
},
{
"entropy": 0.544920951128006,
"epoch": 0.8179271708683473,
"grad_norm": 0.027102749794721603,
"learning_rate": 0.0002,
"loss": 0.5383168458938599,
"mean_token_accuracy": 0.7817831486463547,
"num_tokens": 3579822.0,
"step": 219
},
{
"entropy": 0.5497463345527649,
"epoch": 0.8216619981325863,
"grad_norm": 0.0323423407971859,
"learning_rate": 0.0002,
"loss": 0.5521490573883057,
"mean_token_accuracy": 0.7747017741203308,
"num_tokens": 3596053.0,
"step": 220
},
{
"entropy": 0.5389717519283295,
"epoch": 0.8253968253968254,
"grad_norm": 0.027372388169169426,
"learning_rate": 0.0002,
"loss": 0.540256679058075,
"mean_token_accuracy": 0.7825071215629578,
"num_tokens": 3612271.0,
"step": 221
},
{
"entropy": 0.5472569465637207,
"epoch": 0.8291316526610645,
"grad_norm": 0.028159258887171745,
"learning_rate": 0.0002,
"loss": 0.5517306327819824,
"mean_token_accuracy": 0.7758912444114685,
"num_tokens": 3628658.0,
"step": 222
},
{
"entropy": 0.5441670119762421,
"epoch": 0.8328664799253035,
"grad_norm": 0.0357636883854866,
"learning_rate": 0.0002,
"loss": 0.5485156178474426,
"mean_token_accuracy": 0.7771351188421249,
"num_tokens": 3645179.0,
"step": 223
},
{
"entropy": 0.5535278022289276,
"epoch": 0.8366013071895425,
"grad_norm": 0.032881151884794235,
"learning_rate": 0.0002,
"loss": 0.5619751811027527,
"mean_token_accuracy": 0.7715311944484711,
"num_tokens": 3661296.0,
"step": 224
},
{
"entropy": 0.5683074444532394,
"epoch": 0.8403361344537815,
"grad_norm": 0.03166094422340393,
"learning_rate": 0.0002,
"loss": 0.5676220059394836,
"mean_token_accuracy": 0.7721768617630005,
"num_tokens": 3677506.0,
"step": 225
},
{
"entropy": 0.5672677755355835,
"epoch": 0.8440709617180205,
"grad_norm": 0.029754942283034325,
"learning_rate": 0.0002,
"loss": 0.5636724233627319,
"mean_token_accuracy": 0.7715145349502563,
"num_tokens": 3693949.0,
"step": 226
},
{
"entropy": 0.5544100105762482,
"epoch": 0.8478057889822596,
"grad_norm": 0.027808941900730133,
"learning_rate": 0.0002,
"loss": 0.5551161170005798,
"mean_token_accuracy": 0.7762546241283417,
"num_tokens": 3710403.0,
"step": 227
},
{
"entropy": 0.5428061038255692,
"epoch": 0.8515406162464986,
"grad_norm": 0.032082680612802505,
"learning_rate": 0.0002,
"loss": 0.5452495813369751,
"mean_token_accuracy": 0.7784813046455383,
"num_tokens": 3726407.0,
"step": 228
},
{
"entropy": 0.5473134368658066,
"epoch": 0.8552754435107376,
"grad_norm": 0.030095776543021202,
"learning_rate": 0.0002,
"loss": 0.5461854934692383,
"mean_token_accuracy": 0.7758107632398605,
"num_tokens": 3742861.0,
"step": 229
},
{
"entropy": 0.5752474963665009,
"epoch": 0.8590102707749766,
"grad_norm": 0.030156588181853294,
"learning_rate": 0.0002,
"loss": 0.5713618397712708,
"mean_token_accuracy": 0.7695687711238861,
"num_tokens": 3759464.0,
"step": 230
},
{
"entropy": 0.5410983264446259,
"epoch": 0.8627450980392157,
"grad_norm": 0.026288261637091637,
"learning_rate": 0.0002,
"loss": 0.5398176908493042,
"mean_token_accuracy": 0.7807286381721497,
"num_tokens": 3775673.0,
"step": 231
},
{
"entropy": 0.5493600815534592,
"epoch": 0.8664799253034547,
"grad_norm": 0.03065655194222927,
"learning_rate": 0.0002,
"loss": 0.5482011437416077,
"mean_token_accuracy": 0.7772542536258698,
"num_tokens": 3791787.0,
"step": 232
},
{
"entropy": 0.5542360842227936,
"epoch": 0.8702147525676938,
"grad_norm": 0.032031431794166565,
"learning_rate": 0.0002,
"loss": 0.5554019212722778,
"mean_token_accuracy": 0.7739447802305222,
"num_tokens": 3808316.0,
"step": 233
},
{
"entropy": 0.5599103569984436,
"epoch": 0.8739495798319328,
"grad_norm": 0.027463702484965324,
"learning_rate": 0.0002,
"loss": 0.5579502582550049,
"mean_token_accuracy": 0.771759495139122,
"num_tokens": 3824701.0,
"step": 234
},
{
"entropy": 0.5677217245101929,
"epoch": 0.8776844070961718,
"grad_norm": 0.03142165020108223,
"learning_rate": 0.0002,
"loss": 0.5663169622421265,
"mean_token_accuracy": 0.7691013365983963,
"num_tokens": 3841435.0,
"step": 235
},
{
"entropy": 0.5482347160577774,
"epoch": 0.8814192343604108,
"grad_norm": 0.031262561678886414,
"learning_rate": 0.0002,
"loss": 0.552081823348999,
"mean_token_accuracy": 0.7783354371786118,
"num_tokens": 3857866.0,
"step": 236
},
{
"entropy": 0.5391282737255096,
"epoch": 0.8851540616246498,
"grad_norm": 0.030781790614128113,
"learning_rate": 0.0002,
"loss": 0.5469714403152466,
"mean_token_accuracy": 0.7780267000198364,
"num_tokens": 3874216.0,
"step": 237
},
{
"entropy": 0.5443921983242035,
"epoch": 0.8888888888888888,
"grad_norm": 0.032567523419857025,
"learning_rate": 0.0002,
"loss": 0.549781084060669,
"mean_token_accuracy": 0.7772793620824814,
"num_tokens": 3890382.0,
"step": 238
},
{
"entropy": 0.5604461878538132,
"epoch": 0.892623716153128,
"grad_norm": 0.02667226456105709,
"learning_rate": 0.0002,
"loss": 0.5538907051086426,
"mean_token_accuracy": 0.7770420461893082,
"num_tokens": 3906697.0,
"step": 239
},
{
"entropy": 0.5541103631258011,
"epoch": 0.896358543417367,
"grad_norm": 0.027397198602557182,
"learning_rate": 0.0002,
"loss": 0.5516767501831055,
"mean_token_accuracy": 0.7767754942178726,
"num_tokens": 3922978.0,
"step": 240
},
{
"entropy": 0.5521068722009659,
"epoch": 0.900093370681606,
"grad_norm": 0.032886214554309845,
"learning_rate": 0.0002,
"loss": 0.5538557171821594,
"mean_token_accuracy": 0.7769301533699036,
"num_tokens": 3939282.0,
"step": 241
},
{
"entropy": 0.5449024885892868,
"epoch": 0.903828197945845,
"grad_norm": 0.026176048442721367,
"learning_rate": 0.0002,
"loss": 0.5478168725967407,
"mean_token_accuracy": 0.7779200524091721,
"num_tokens": 3955520.0,
"step": 242
},
{
"entropy": 0.5615669041872025,
"epoch": 0.907563025210084,
"grad_norm": 0.02917352132499218,
"learning_rate": 0.0002,
"loss": 0.5631118416786194,
"mean_token_accuracy": 0.769850417971611,
"num_tokens": 3971679.0,
"step": 243
},
{
"entropy": 0.5360025763511658,
"epoch": 0.911297852474323,
"grad_norm": 0.028804168105125427,
"learning_rate": 0.0002,
"loss": 0.5399969816207886,
"mean_token_accuracy": 0.7786188125610352,
"num_tokens": 3987832.0,
"step": 244
},
{
"entropy": 0.5670223534107208,
"epoch": 0.9150326797385621,
"grad_norm": 0.032523807138204575,
"learning_rate": 0.0002,
"loss": 0.568830668926239,
"mean_token_accuracy": 0.7703544050455093,
"num_tokens": 4004046.0,
"step": 245
},
{
"entropy": 0.5482122004032135,
"epoch": 0.9187675070028011,
"grad_norm": 0.024507107213139534,
"learning_rate": 0.0002,
"loss": 0.5461756587028503,
"mean_token_accuracy": 0.7785715907812119,
"num_tokens": 4020396.0,
"step": 246
},
{
"entropy": 0.5435233414173126,
"epoch": 0.9225023342670402,
"grad_norm": 0.026535481214523315,
"learning_rate": 0.0002,
"loss": 0.5347612500190735,
"mean_token_accuracy": 0.7819430381059647,
"num_tokens": 4036657.0,
"step": 247
},
{
"entropy": 0.5606936663389206,
"epoch": 0.9262371615312792,
"grad_norm": 0.03222998231649399,
"learning_rate": 0.0002,
"loss": 0.5588559508323669,
"mean_token_accuracy": 0.7731847912073135,
"num_tokens": 4052932.0,
"step": 248
},
{
"entropy": 0.5559582114219666,
"epoch": 0.9299719887955182,
"grad_norm": 0.027079764753580093,
"learning_rate": 0.0002,
"loss": 0.5551950931549072,
"mean_token_accuracy": 0.7739483118057251,
"num_tokens": 4069465.0,
"step": 249
},
{
"entropy": 0.5464590489864349,
"epoch": 0.9337068160597572,
"grad_norm": 0.025224287062883377,
"learning_rate": 0.0002,
"loss": 0.548494815826416,
"mean_token_accuracy": 0.7777067720890045,
"num_tokens": 4085793.0,
"step": 250
},
{
"entropy": 0.5697829127311707,
"epoch": 0.9374416433239963,
"grad_norm": 0.03149845451116562,
"learning_rate": 0.0002,
"loss": 0.5725698471069336,
"mean_token_accuracy": 0.7667296230792999,
"num_tokens": 4102389.0,
"step": 251
},
{
"entropy": 0.5524837523698807,
"epoch": 0.9411764705882353,
"grad_norm": 0.027573609724640846,
"learning_rate": 0.0002,
"loss": 0.5497711896896362,
"mean_token_accuracy": 0.7749225348234177,
"num_tokens": 4118604.0,
"step": 252
},
{
"entropy": 0.5428849905729294,
"epoch": 0.9449112978524743,
"grad_norm": 0.025667617097496986,
"learning_rate": 0.0002,
"loss": 0.5428351163864136,
"mean_token_accuracy": 0.7771738916635513,
"num_tokens": 4135001.0,
"step": 253
},
{
"entropy": 0.5520694851875305,
"epoch": 0.9486461251167133,
"grad_norm": 0.035842686891555786,
"learning_rate": 0.0002,
"loss": 0.550408661365509,
"mean_token_accuracy": 0.7740647196769714,
"num_tokens": 4151260.0,
"step": 254
},
{
"entropy": 0.5418593287467957,
"epoch": 0.9523809523809523,
"grad_norm": 0.0381033793091774,
"learning_rate": 0.0002,
"loss": 0.5492621660232544,
"mean_token_accuracy": 0.7769514173269272,
"num_tokens": 4167360.0,
"step": 255
},
{
"entropy": 0.5375488549470901,
"epoch": 0.9561157796451915,
"grad_norm": 0.029893534258008003,
"learning_rate": 0.0002,
"loss": 0.5434277057647705,
"mean_token_accuracy": 0.7754911035299301,
"num_tokens": 4183517.0,
"step": 256
},
{
"entropy": 0.5487121939659119,
"epoch": 0.9598506069094305,
"grad_norm": 0.03323543071746826,
"learning_rate": 0.0002,
"loss": 0.549543559551239,
"mean_token_accuracy": 0.7791514545679092,
"num_tokens": 4200020.0,
"step": 257
},
{
"entropy": 0.5533169955015182,
"epoch": 0.9635854341736695,
"grad_norm": 0.1564125418663025,
"learning_rate": 0.0002,
"loss": 0.5513023138046265,
"mean_token_accuracy": 0.7750032246112823,
"num_tokens": 4216280.0,
"step": 258
},
{
"entropy": 0.5475684553384781,
"epoch": 0.9673202614379085,
"grad_norm": 0.05765023082494736,
"learning_rate": 0.0002,
"loss": 0.5540170073509216,
"mean_token_accuracy": 0.778236523270607,
"num_tokens": 4232501.0,
"step": 259
},
{
"entropy": 0.5620233714580536,
"epoch": 0.9710550887021475,
"grad_norm": 0.046510934829711914,
"learning_rate": 0.0002,
"loss": 0.5589131712913513,
"mean_token_accuracy": 0.7736849784851074,
"num_tokens": 4248855.0,
"step": 260
},
{
"entropy": 0.565828487277031,
"epoch": 0.9747899159663865,
"grad_norm": 0.0395890548825264,
"learning_rate": 0.0002,
"loss": 0.5624877214431763,
"mean_token_accuracy": 0.7722225338220596,
"num_tokens": 4265077.0,
"step": 261
},
{
"entropy": 0.5551140010356903,
"epoch": 0.9785247432306255,
"grad_norm": 0.03330749273300171,
"learning_rate": 0.0002,
"loss": 0.5576150417327881,
"mean_token_accuracy": 0.7741483747959137,
"num_tokens": 4281357.0,
"step": 262
},
{
"entropy": 0.5746229141950607,
"epoch": 0.9822595704948646,
"grad_norm": 0.03519619628787041,
"learning_rate": 0.0002,
"loss": 0.582584023475647,
"mean_token_accuracy": 0.7654829919338226,
"num_tokens": 4297699.0,
"step": 263
},
{
"entropy": 0.5782353579998016,
"epoch": 0.9859943977591037,
"grad_norm": 0.03913693502545357,
"learning_rate": 0.0002,
"loss": 0.5755780339241028,
"mean_token_accuracy": 0.7660959511995316,
"num_tokens": 4314249.0,
"step": 264
},
{
"entropy": 0.5513299107551575,
"epoch": 0.9897292250233427,
"grad_norm": 0.030444784089922905,
"learning_rate": 0.0002,
"loss": 0.5514294505119324,
"mean_token_accuracy": 0.7750695049762726,
"num_tokens": 4330437.0,
"step": 265
},
{
"entropy": 0.5386128276586533,
"epoch": 0.9934640522875817,
"grad_norm": 0.03275322541594505,
"learning_rate": 0.0002,
"loss": 0.540998637676239,
"mean_token_accuracy": 0.7796358019113541,
"num_tokens": 4346677.0,
"step": 266
},
{
"entropy": 0.5513150691986084,
"epoch": 0.9971988795518207,
"grad_norm": 0.03458503261208534,
"learning_rate": 0.0002,
"loss": 0.5484628677368164,
"mean_token_accuracy": 0.779531255364418,
"num_tokens": 4363004.0,
"step": 267
},
{
"entropy": 0.5694002906481425,
"epoch": 1.0,
"grad_norm": 0.033372946083545685,
"learning_rate": 0.0002,
"loss": 0.5757001638412476,
"mean_token_accuracy": 0.7725784182548523,
"num_tokens": 4364721.0,
"step": 268
},
{
"entropy": 0.5490456074476242,
"epoch": 1.003734827264239,
"grad_norm": 0.030816873535513878,
"learning_rate": 0.0002,
"loss": 0.5466992855072021,
"mean_token_accuracy": 0.7772593349218369,
"num_tokens": 4380959.0,
"step": 269
},
{
"entropy": 0.5297957360744476,
"epoch": 1.007469654528478,
"grad_norm": 0.0300835482776165,
"learning_rate": 0.0002,
"loss": 0.5296781063079834,
"mean_token_accuracy": 0.7851966172456741,
"num_tokens": 4397319.0,
"step": 270
},
{
"entropy": 0.5426550507545471,
"epoch": 1.011204481792717,
"grad_norm": 0.0309379193931818,
"learning_rate": 0.0002,
"loss": 0.5401790142059326,
"mean_token_accuracy": 0.7784202843904495,
"num_tokens": 4413503.0,
"step": 271
},
{
"entropy": 0.536088228225708,
"epoch": 1.014939309056956,
"grad_norm": 0.030822666361927986,
"learning_rate": 0.0002,
"loss": 0.533880352973938,
"mean_token_accuracy": 0.7821955978870392,
"num_tokens": 4429731.0,
"step": 272
},
{
"entropy": 0.5376520156860352,
"epoch": 1.018674136321195,
"grad_norm": 0.03910338878631592,
"learning_rate": 0.0002,
"loss": 0.5515881776809692,
"mean_token_accuracy": 0.7752164155244827,
"num_tokens": 4445975.0,
"step": 273
},
{
"entropy": 0.5337154120206833,
"epoch": 1.022408963585434,
"grad_norm": 0.030765611678361893,
"learning_rate": 0.0002,
"loss": 0.5412506461143494,
"mean_token_accuracy": 0.7780167758464813,
"num_tokens": 4462105.0,
"step": 274
},
{
"entropy": 0.5487084090709686,
"epoch": 1.026143790849673,
"grad_norm": 0.03003527596592903,
"learning_rate": 0.0002,
"loss": 0.540929913520813,
"mean_token_accuracy": 0.7784045934677124,
"num_tokens": 4478591.0,
"step": 275
},
{
"entropy": 0.5385126918554306,
"epoch": 1.0298786181139121,
"grad_norm": 0.027475042268633842,
"learning_rate": 0.0002,
"loss": 0.5318593978881836,
"mean_token_accuracy": 0.7862093448638916,
"num_tokens": 4495044.0,
"step": 276
},
{
"entropy": 0.5600587129592896,
"epoch": 1.0336134453781514,
"grad_norm": 0.029431000351905823,
"learning_rate": 0.0002,
"loss": 0.5559869408607483,
"mean_token_accuracy": 0.7744521498680115,
"num_tokens": 4511459.0,
"step": 277
},
{
"entropy": 0.5381200164556503,
"epoch": 1.0373482726423904,
"grad_norm": 0.02848048508167267,
"learning_rate": 0.0002,
"loss": 0.5395113229751587,
"mean_token_accuracy": 0.7798527628183365,
"num_tokens": 4527903.0,
"step": 278
},
{
"entropy": 0.5346540361642838,
"epoch": 1.0410830999066294,
"grad_norm": 0.033454034477472305,
"learning_rate": 0.0002,
"loss": 0.5404960513114929,
"mean_token_accuracy": 0.7793795019388199,
"num_tokens": 4544182.0,
"step": 279
},
{
"entropy": 0.544955238699913,
"epoch": 1.0448179271708684,
"grad_norm": 0.02894734963774681,
"learning_rate": 0.0002,
"loss": 0.5436176061630249,
"mean_token_accuracy": 0.7777452617883682,
"num_tokens": 4560880.0,
"step": 280
},
{
"entropy": 0.5431416183710098,
"epoch": 1.0485527544351074,
"grad_norm": 0.02903336100280285,
"learning_rate": 0.0002,
"loss": 0.5436229109764099,
"mean_token_accuracy": 0.7780826389789581,
"num_tokens": 4577183.0,
"step": 281
},
{
"entropy": 0.5408187806606293,
"epoch": 1.0522875816993464,
"grad_norm": 0.029271787032485008,
"learning_rate": 0.0002,
"loss": 0.5370380282402039,
"mean_token_accuracy": 0.7815099805593491,
"num_tokens": 4593864.0,
"step": 282
},
{
"entropy": 0.5497590750455856,
"epoch": 1.0560224089635855,
"grad_norm": 0.028807660564780235,
"learning_rate": 0.0002,
"loss": 0.5504873991012573,
"mean_token_accuracy": 0.777531310915947,
"num_tokens": 4610349.0,
"step": 283
},
{
"entropy": 0.5368742346763611,
"epoch": 1.0597572362278245,
"grad_norm": 0.031959034502506256,
"learning_rate": 0.0002,
"loss": 0.5419926643371582,
"mean_token_accuracy": 0.7784341871738434,
"num_tokens": 4626437.0,
"step": 284
},
{
"entropy": 0.5532872825860977,
"epoch": 1.0634920634920635,
"grad_norm": 0.028826460242271423,
"learning_rate": 0.0002,
"loss": 0.5571930408477783,
"mean_token_accuracy": 0.7746778875589371,
"num_tokens": 4642633.0,
"step": 285
},
{
"entropy": 0.5407113283872604,
"epoch": 1.0672268907563025,
"grad_norm": 0.03065388835966587,
"learning_rate": 0.0002,
"loss": 0.5436424612998962,
"mean_token_accuracy": 0.779659166932106,
"num_tokens": 4658940.0,
"step": 286
},
{
"entropy": 0.5552934855222702,
"epoch": 1.0709617180205415,
"grad_norm": 0.03264114633202553,
"learning_rate": 0.0002,
"loss": 0.5482615232467651,
"mean_token_accuracy": 0.7754945755004883,
"num_tokens": 4675263.0,
"step": 287
},
{
"entropy": 0.5442743301391602,
"epoch": 1.0746965452847805,
"grad_norm": 0.031116079539060593,
"learning_rate": 0.0002,
"loss": 0.538812518119812,
"mean_token_accuracy": 0.7806833982467651,
"num_tokens": 4691415.0,
"step": 288
},
{
"entropy": 0.5530855804681778,
"epoch": 1.0784313725490196,
"grad_norm": 0.03077593445777893,
"learning_rate": 0.0002,
"loss": 0.548968493938446,
"mean_token_accuracy": 0.7756039202213287,
"num_tokens": 4707736.0,
"step": 289
},
{
"entropy": 0.5455960035324097,
"epoch": 1.0821661998132586,
"grad_norm": 0.028605274856090546,
"learning_rate": 0.0002,
"loss": 0.5435131788253784,
"mean_token_accuracy": 0.7795460671186447,
"num_tokens": 4724095.0,
"step": 290
},
{
"entropy": 0.5397526025772095,
"epoch": 1.0859010270774976,
"grad_norm": 0.03644070401787758,
"learning_rate": 0.0002,
"loss": 0.5488567352294922,
"mean_token_accuracy": 0.7778657674789429,
"num_tokens": 4740602.0,
"step": 291
},
{
"entropy": 0.5470818132162094,
"epoch": 1.0896358543417366,
"grad_norm": 0.033212918788194656,
"learning_rate": 0.0002,
"loss": 0.555572509765625,
"mean_token_accuracy": 0.7734686136245728,
"num_tokens": 4756842.0,
"step": 292
},
{
"entropy": 0.5398264974355698,
"epoch": 1.0933706816059758,
"grad_norm": 0.027302522212266922,
"learning_rate": 0.0002,
"loss": 0.5371235013008118,
"mean_token_accuracy": 0.7826644480228424,
"num_tokens": 4773499.0,
"step": 293
},
{
"entropy": 0.564954400062561,
"epoch": 1.0971055088702149,
"grad_norm": 0.02829107642173767,
"learning_rate": 0.0002,
"loss": 0.5558594465255737,
"mean_token_accuracy": 0.7749541401863098,
"num_tokens": 4790183.0,
"step": 294
},
{
"entropy": 0.5593573749065399,
"epoch": 1.1008403361344539,
"grad_norm": 0.027547527104616165,
"learning_rate": 0.0002,
"loss": 0.5560394525527954,
"mean_token_accuracy": 0.7725719660520554,
"num_tokens": 4806455.0,
"step": 295
},
{
"entropy": 0.5377779453992844,
"epoch": 1.1045751633986929,
"grad_norm": 0.03161724656820297,
"learning_rate": 0.0002,
"loss": 0.5370453596115112,
"mean_token_accuracy": 0.782875582575798,
"num_tokens": 4822731.0,
"step": 296
},
{
"entropy": 0.5386165231466293,
"epoch": 1.108309990662932,
"grad_norm": 0.03147651255130768,
"learning_rate": 0.0002,
"loss": 0.5423634648323059,
"mean_token_accuracy": 0.7768422961235046,
"num_tokens": 4839112.0,
"step": 297
},
{
"entropy": 0.5279396325349808,
"epoch": 1.112044817927171,
"grad_norm": 0.031283456832170486,
"learning_rate": 0.0002,
"loss": 0.5321308970451355,
"mean_token_accuracy": 0.7849069982767105,
"num_tokens": 4855229.0,
"step": 298
},
{
"entropy": 0.5327593311667442,
"epoch": 1.11577964519141,
"grad_norm": 0.03042989782989025,
"learning_rate": 0.0002,
"loss": 0.5393236875534058,
"mean_token_accuracy": 0.7804521471261978,
"num_tokens": 4871644.0,
"step": 299
},
{
"entropy": 0.560793936252594,
"epoch": 1.119514472455649,
"grad_norm": 0.029397251084446907,
"learning_rate": 0.0002,
"loss": 0.5557554960250854,
"mean_token_accuracy": 0.7728655338287354,
"num_tokens": 4887992.0,
"step": 300
},
{
"entropy": 0.5604539066553116,
"epoch": 1.123249299719888,
"grad_norm": 0.02948898635804653,
"learning_rate": 0.0002,
"loss": 0.5545894503593445,
"mean_token_accuracy": 0.7743670493364334,
"num_tokens": 4904384.0,
"step": 301
},
{
"entropy": 0.5394376814365387,
"epoch": 1.126984126984127,
"grad_norm": 0.029182471334934235,
"learning_rate": 0.0002,
"loss": 0.5341510772705078,
"mean_token_accuracy": 0.7823253571987152,
"num_tokens": 4920587.0,
"step": 302
},
{
"entropy": 0.5301040560007095,
"epoch": 1.130718954248366,
"grad_norm": 0.03680079057812691,
"learning_rate": 0.0002,
"loss": 0.5372604131698608,
"mean_token_accuracy": 0.7793124318122864,
"num_tokens": 4937055.0,
"step": 303
},
{
"entropy": 0.5290943831205368,
"epoch": 1.134453781512605,
"grad_norm": 0.03931280970573425,
"learning_rate": 0.0002,
"loss": 0.5391898155212402,
"mean_token_accuracy": 0.7829029709100723,
"num_tokens": 4953281.0,
"step": 304
},
{
"entropy": 0.5609545707702637,
"epoch": 1.138188608776844,
"grad_norm": 0.030014565214514732,
"learning_rate": 0.0002,
"loss": 0.5609763264656067,
"mean_token_accuracy": 0.7726535797119141,
"num_tokens": 4969665.0,
"step": 305
},
{
"entropy": 0.5205260962247849,
"epoch": 1.141923436041083,
"grad_norm": 0.03301642835140228,
"learning_rate": 0.0002,
"loss": 0.5286065340042114,
"mean_token_accuracy": 0.7840328961610794,
"num_tokens": 4985863.0,
"step": 306
},
{
"entropy": 0.5605068057775497,
"epoch": 1.145658263305322,
"grad_norm": 0.029299437999725342,
"learning_rate": 0.0002,
"loss": 0.5569101572036743,
"mean_token_accuracy": 0.7721403539180756,
"num_tokens": 5002543.0,
"step": 307
},
{
"entropy": 0.552753359079361,
"epoch": 1.149393090569561,
"grad_norm": 0.027307430282235146,
"learning_rate": 0.0002,
"loss": 0.5464527606964111,
"mean_token_accuracy": 0.7777755260467529,
"num_tokens": 5019035.0,
"step": 308
},
{
"entropy": 0.5565258711576462,
"epoch": 1.1531279178338,
"grad_norm": 0.028590641915798187,
"learning_rate": 0.0002,
"loss": 0.551773726940155,
"mean_token_accuracy": 0.7753841280937195,
"num_tokens": 5035778.0,
"step": 309
},
{
"entropy": 0.5335747301578522,
"epoch": 1.156862745098039,
"grad_norm": 0.02846100926399231,
"learning_rate": 0.0002,
"loss": 0.5332034826278687,
"mean_token_accuracy": 0.7849084585905075,
"num_tokens": 5052106.0,
"step": 310
},
{
"entropy": 0.5462342649698257,
"epoch": 1.1605975723622783,
"grad_norm": 0.03037341870367527,
"learning_rate": 0.0002,
"loss": 0.5533976554870605,
"mean_token_accuracy": 0.7761731296777725,
"num_tokens": 5068494.0,
"step": 311
},
{
"entropy": 0.5365739315748215,
"epoch": 1.1643323996265174,
"grad_norm": 0.0328284353017807,
"learning_rate": 0.0002,
"loss": 0.5443044900894165,
"mean_token_accuracy": 0.7775984853506088,
"num_tokens": 5084698.0,
"step": 312
},
{
"entropy": 0.5469802767038345,
"epoch": 1.1680672268907564,
"grad_norm": 0.029220817610621452,
"learning_rate": 0.0002,
"loss": 0.5449838638305664,
"mean_token_accuracy": 0.7794362902641296,
"num_tokens": 5101231.0,
"step": 313
},
{
"entropy": 0.5534107983112335,
"epoch": 1.1718020541549954,
"grad_norm": 0.03240218386054039,
"learning_rate": 0.0002,
"loss": 0.5596653819084167,
"mean_token_accuracy": 0.7733468264341354,
"num_tokens": 5117669.0,
"step": 314
},
{
"entropy": 0.5505286902189255,
"epoch": 1.1755368814192344,
"grad_norm": 0.030088460072875023,
"learning_rate": 0.0002,
"loss": 0.5460378527641296,
"mean_token_accuracy": 0.7785163521766663,
"num_tokens": 5134044.0,
"step": 315
},
{
"entropy": 0.5583444237709045,
"epoch": 1.1792717086834734,
"grad_norm": 0.03908608481287956,
"learning_rate": 0.0002,
"loss": 0.5499372482299805,
"mean_token_accuracy": 0.7741111516952515,
"num_tokens": 5150155.0,
"step": 316
},
{
"entropy": 0.5583514273166656,
"epoch": 1.1830065359477124,
"grad_norm": 0.03262948617339134,
"learning_rate": 0.0002,
"loss": 0.5514504909515381,
"mean_token_accuracy": 0.7749726176261902,
"num_tokens": 5166653.0,
"step": 317
},
{
"entropy": 0.54158616065979,
"epoch": 1.1867413632119514,
"grad_norm": 0.030375484377145767,
"learning_rate": 0.0002,
"loss": 0.535007119178772,
"mean_token_accuracy": 0.78143410384655,
"num_tokens": 5182849.0,
"step": 318
},
{
"entropy": 0.5355552136898041,
"epoch": 1.1904761904761905,
"grad_norm": 0.034217700362205505,
"learning_rate": 0.0002,
"loss": 0.5416175723075867,
"mean_token_accuracy": 0.7821937054395676,
"num_tokens": 5199310.0,
"step": 319
},
{
"entropy": 0.5375736951828003,
"epoch": 1.1942110177404295,
"grad_norm": 0.03742173686623573,
"learning_rate": 0.0002,
"loss": 0.5497441291809082,
"mean_token_accuracy": 0.779162734746933,
"num_tokens": 5215628.0,
"step": 320
},
{
"entropy": 0.5327057242393494,
"epoch": 1.1979458450046685,
"grad_norm": 0.03143603354692459,
"learning_rate": 0.0002,
"loss": 0.5377879738807678,
"mean_token_accuracy": 0.7819731533527374,
"num_tokens": 5232104.0,
"step": 321
},
{
"entropy": 0.5589822083711624,
"epoch": 1.2016806722689075,
"grad_norm": 0.030957849696278572,
"learning_rate": 0.0002,
"loss": 0.5600837469100952,
"mean_token_accuracy": 0.772526428103447,
"num_tokens": 5248228.0,
"step": 322
},
{
"entropy": 0.5267817825078964,
"epoch": 1.2054154995331465,
"grad_norm": 0.028181420639157295,
"learning_rate": 0.0002,
"loss": 0.5258863568305969,
"mean_token_accuracy": 0.7852722406387329,
"num_tokens": 5264722.0,
"step": 323
},
{
"entropy": 0.5596602708101273,
"epoch": 1.2091503267973855,
"grad_norm": 0.0294583011418581,
"learning_rate": 0.0002,
"loss": 0.5542659163475037,
"mean_token_accuracy": 0.7757792323827744,
"num_tokens": 5281102.0,
"step": 324
},
{
"entropy": 0.5631477683782578,
"epoch": 1.2128851540616246,
"grad_norm": 0.028790894895792007,
"learning_rate": 0.0002,
"loss": 0.5568723678588867,
"mean_token_accuracy": 0.771973267197609,
"num_tokens": 5297684.0,
"step": 325
},
{
"entropy": 0.5380028486251831,
"epoch": 1.2166199813258638,
"grad_norm": 0.031924713402986526,
"learning_rate": 0.0002,
"loss": 0.5376958847045898,
"mean_token_accuracy": 0.7829422205686569,
"num_tokens": 5313908.0,
"step": 326
},
{
"entropy": 0.5375301241874695,
"epoch": 1.2203548085901028,
"grad_norm": 0.03397483006119728,
"learning_rate": 0.0002,
"loss": 0.5478475093841553,
"mean_token_accuracy": 0.7765705734491348,
"num_tokens": 5329966.0,
"step": 327
},
{
"entropy": 0.5427165776491165,
"epoch": 1.2240896358543418,
"grad_norm": 0.035384900867938995,
"learning_rate": 0.0002,
"loss": 0.5524033308029175,
"mean_token_accuracy": 0.7745779901742935,
"num_tokens": 5346453.0,
"step": 328
},
{
"entropy": 0.5400120764970779,
"epoch": 1.2278244631185808,
"grad_norm": 0.030376868322491646,
"learning_rate": 0.0002,
"loss": 0.5346859097480774,
"mean_token_accuracy": 0.7804136276245117,
"num_tokens": 5362598.0,
"step": 329
},
{
"entropy": 0.5525883883237839,
"epoch": 1.2315592903828199,
"grad_norm": 0.029532834887504578,
"learning_rate": 0.0002,
"loss": 0.5460601449012756,
"mean_token_accuracy": 0.7782909572124481,
"num_tokens": 5378809.0,
"step": 330
},
{
"entropy": 0.5435810536146164,
"epoch": 1.2352941176470589,
"grad_norm": 0.02912810444831848,
"learning_rate": 0.0002,
"loss": 0.5412687659263611,
"mean_token_accuracy": 0.7805328518152237,
"num_tokens": 5394964.0,
"step": 331
},
{
"entropy": 0.5558127015829086,
"epoch": 1.239028944911298,
"grad_norm": 0.03399093821644783,
"learning_rate": 0.0002,
"loss": 0.5503210425376892,
"mean_token_accuracy": 0.7771144658327103,
"num_tokens": 5411296.0,
"step": 332
},
{
"entropy": 0.5612344145774841,
"epoch": 1.242763772175537,
"grad_norm": 0.028297265991568565,
"learning_rate": 0.0002,
"loss": 0.561404824256897,
"mean_token_accuracy": 0.7735303044319153,
"num_tokens": 5427522.0,
"step": 333
},
{
"entropy": 0.5317913144826889,
"epoch": 1.246498599439776,
"grad_norm": 0.03494315594434738,
"learning_rate": 0.0002,
"loss": 0.5433036684989929,
"mean_token_accuracy": 0.7796971648931503,
"num_tokens": 5443757.0,
"step": 334
},
{
"entropy": 0.542137622833252,
"epoch": 1.250233426704015,
"grad_norm": 0.02819279581308365,
"learning_rate": 0.0002,
"loss": 0.5451513528823853,
"mean_token_accuracy": 0.7785246819257736,
"num_tokens": 5460219.0,
"step": 335
},
{
"entropy": 0.5389015078544617,
"epoch": 1.253968253968254,
"grad_norm": 0.029153091832995415,
"learning_rate": 0.0002,
"loss": 0.5426021218299866,
"mean_token_accuracy": 0.7783170789480209,
"num_tokens": 5476465.0,
"step": 336
},
{
"entropy": 0.5529672205448151,
"epoch": 1.257703081232493,
"grad_norm": 0.03458336368203163,
"learning_rate": 0.0002,
"loss": 0.540812611579895,
"mean_token_accuracy": 0.7807324081659317,
"num_tokens": 5492565.0,
"step": 337
},
{
"entropy": 0.581393301486969,
"epoch": 1.261437908496732,
"grad_norm": 0.031111041083931923,
"learning_rate": 0.0002,
"loss": 0.5751311779022217,
"mean_token_accuracy": 0.7666933685541153,
"num_tokens": 5509003.0,
"step": 338
},
{
"entropy": 0.5588483065366745,
"epoch": 1.265172735760971,
"grad_norm": 0.030144309625029564,
"learning_rate": 0.0002,
"loss": 0.5589640140533447,
"mean_token_accuracy": 0.7755171656608582,
"num_tokens": 5525262.0,
"step": 339
},
{
"entropy": 0.5336481779813766,
"epoch": 1.26890756302521,
"grad_norm": 0.03417432680726051,
"learning_rate": 0.0002,
"loss": 0.5390788316726685,
"mean_token_accuracy": 0.780031830072403,
"num_tokens": 5541654.0,
"step": 340
},
{
"entropy": 0.5282999128103256,
"epoch": 1.272642390289449,
"grad_norm": 0.03498517721891403,
"learning_rate": 0.0002,
"loss": 0.5387616157531738,
"mean_token_accuracy": 0.7800437808036804,
"num_tokens": 5557983.0,
"step": 341
},
{
"entropy": 0.5369831025600433,
"epoch": 1.276377217553688,
"grad_norm": 0.029845617711544037,
"learning_rate": 0.0002,
"loss": 0.535378634929657,
"mean_token_accuracy": 0.7823457568883896,
"num_tokens": 5574311.0,
"step": 342
},
{
"entropy": 0.5538373440504074,
"epoch": 1.280112044817927,
"grad_norm": 0.027923226356506348,
"learning_rate": 0.0002,
"loss": 0.5500721335411072,
"mean_token_accuracy": 0.7771336436271667,
"num_tokens": 5590547.0,
"step": 343
},
{
"entropy": 0.5545977205038071,
"epoch": 1.283846872082166,
"grad_norm": 0.0305513683706522,
"learning_rate": 0.0002,
"loss": 0.5511223077774048,
"mean_token_accuracy": 0.7757980972528458,
"num_tokens": 5606717.0,
"step": 344
},
{
"entropy": 0.560431718826294,
"epoch": 1.287581699346405,
"grad_norm": 0.029267068952322006,
"learning_rate": 0.0002,
"loss": 0.5540031790733337,
"mean_token_accuracy": 0.7738614529371262,
"num_tokens": 5623238.0,
"step": 345
},
{
"entropy": 0.5598475635051727,
"epoch": 1.2913165266106443,
"grad_norm": 0.032441407442092896,
"learning_rate": 0.0002,
"loss": 0.5511676669120789,
"mean_token_accuracy": 0.775727853178978,
"num_tokens": 5639482.0,
"step": 346
},
{
"entropy": 0.532151535153389,
"epoch": 1.2950513538748833,
"grad_norm": 0.03496084734797478,
"learning_rate": 0.0002,
"loss": 0.5387351512908936,
"mean_token_accuracy": 0.7811897695064545,
"num_tokens": 5655745.0,
"step": 347
},
{
"entropy": 0.5362464487552643,
"epoch": 1.2987861811391224,
"grad_norm": 0.03774246945977211,
"learning_rate": 0.0002,
"loss": 0.5451931953430176,
"mean_token_accuracy": 0.7775505632162094,
"num_tokens": 5672305.0,
"step": 348
},
{
"entropy": 0.5285972878336906,
"epoch": 1.3025210084033614,
"grad_norm": 0.0332336500287056,
"learning_rate": 0.0002,
"loss": 0.5353838801383972,
"mean_token_accuracy": 0.7838114500045776,
"num_tokens": 5688630.0,
"step": 349
},
{
"entropy": 0.5421172678470612,
"epoch": 1.3062558356676004,
"grad_norm": 0.03457598015666008,
"learning_rate": 0.0002,
"loss": 0.5392417311668396,
"mean_token_accuracy": 0.7807410657405853,
"num_tokens": 5705054.0,
"step": 350
},
{
"entropy": 0.5382883250713348,
"epoch": 1.3099906629318394,
"grad_norm": 0.031050430610775948,
"learning_rate": 0.0002,
"loss": 0.5347834825515747,
"mean_token_accuracy": 0.7828159481287003,
"num_tokens": 5721382.0,
"step": 351
},
{
"entropy": 0.550368145108223,
"epoch": 1.3137254901960784,
"grad_norm": 0.03463875129818916,
"learning_rate": 0.0002,
"loss": 0.5514199137687683,
"mean_token_accuracy": 0.7735539227724075,
"num_tokens": 5737730.0,
"step": 352
},
{
"entropy": 0.538982629776001,
"epoch": 1.3174603174603174,
"grad_norm": 0.03956155851483345,
"learning_rate": 0.0002,
"loss": 0.5469655990600586,
"mean_token_accuracy": 0.7747407406568527,
"num_tokens": 5753795.0,
"step": 353
},
{
"entropy": 0.5339585244655609,
"epoch": 1.3211951447245565,
"grad_norm": 0.029367057606577873,
"learning_rate": 0.0002,
"loss": 0.536923348903656,
"mean_token_accuracy": 0.7791249603033066,
"num_tokens": 5770100.0,
"step": 354
},
{
"entropy": 0.5469655245542526,
"epoch": 1.3249299719887955,
"grad_norm": 0.044070687144994736,
"learning_rate": 0.0002,
"loss": 0.5485926270484924,
"mean_token_accuracy": 0.7760020345449448,
"num_tokens": 5786242.0,
"step": 355
},
{
"entropy": 0.5686767846345901,
"epoch": 1.3286647992530345,
"grad_norm": 0.0298174861818552,
"learning_rate": 0.0002,
"loss": 0.5646032691001892,
"mean_token_accuracy": 0.7700935900211334,
"num_tokens": 5802594.0,
"step": 356
},
{
"entropy": 0.5524211078882217,
"epoch": 1.3323996265172735,
"grad_norm": 0.03443749621510506,
"learning_rate": 0.0002,
"loss": 0.5538625717163086,
"mean_token_accuracy": 0.7730942517518997,
"num_tokens": 5818733.0,
"step": 357
},
{
"entropy": 0.5450694710016251,
"epoch": 1.3361344537815127,
"grad_norm": 0.042639389634132385,
"learning_rate": 0.0002,
"loss": 0.5457915663719177,
"mean_token_accuracy": 0.7793462425470352,
"num_tokens": 5834966.0,
"step": 358
},
{
"entropy": 0.5628755837678909,
"epoch": 1.3398692810457518,
"grad_norm": 0.031939953565597534,
"learning_rate": 0.0002,
"loss": 0.5615131855010986,
"mean_token_accuracy": 0.7720433920621872,
"num_tokens": 5851352.0,
"step": 359
},
{
"entropy": 0.5299947410821915,
"epoch": 1.3436041083099908,
"grad_norm": 0.03047833777964115,
"learning_rate": 0.0002,
"loss": 0.5295021533966064,
"mean_token_accuracy": 0.7874699085950851,
"num_tokens": 5867820.0,
"step": 360
},
{
"entropy": 0.5308109223842621,
"epoch": 1.3473389355742298,
"grad_norm": 0.032848697155714035,
"learning_rate": 0.0002,
"loss": 0.5431129336357117,
"mean_token_accuracy": 0.7857107818126678,
"num_tokens": 5883984.0,
"step": 361
},
{
"entropy": 0.5426601469516754,
"epoch": 1.3510737628384688,
"grad_norm": 0.033830493688583374,
"learning_rate": 0.0002,
"loss": 0.5514194965362549,
"mean_token_accuracy": 0.77635657787323,
"num_tokens": 5900290.0,
"step": 362
},
{
"entropy": 0.5411643236875534,
"epoch": 1.3548085901027078,
"grad_norm": 0.029694274067878723,
"learning_rate": 0.0002,
"loss": 0.5333205461502075,
"mean_token_accuracy": 0.7832283675670624,
"num_tokens": 5916469.0,
"step": 363
},
{
"entropy": 0.5501731634140015,
"epoch": 1.3585434173669468,
"grad_norm": 0.03007029928267002,
"learning_rate": 0.0002,
"loss": 0.5431393980979919,
"mean_token_accuracy": 0.7804041355848312,
"num_tokens": 5932693.0,
"step": 364
},
{
"entropy": 0.5419217795133591,
"epoch": 1.3622782446311859,
"grad_norm": 0.030986929312348366,
"learning_rate": 0.0002,
"loss": 0.5391764044761658,
"mean_token_accuracy": 0.7810684144496918,
"num_tokens": 5949053.0,
"step": 365
},
{
"entropy": 0.529257670044899,
"epoch": 1.3660130718954249,
"grad_norm": 0.0282028466463089,
"learning_rate": 0.0002,
"loss": 0.5282759666442871,
"mean_token_accuracy": 0.7846860438585281,
"num_tokens": 5965428.0,
"step": 366
},
{
"entropy": 0.5425796508789062,
"epoch": 1.3697478991596639,
"grad_norm": 0.03842358663678169,
"learning_rate": 0.0002,
"loss": 0.5492331981658936,
"mean_token_accuracy": 0.7747556120157242,
"num_tokens": 5981730.0,
"step": 367
},
{
"entropy": 0.5349410325288773,
"epoch": 1.373482726423903,
"grad_norm": 0.033598389476537704,
"learning_rate": 0.0002,
"loss": 0.5436474084854126,
"mean_token_accuracy": 0.7797878831624985,
"num_tokens": 5997949.0,
"step": 368
},
{
"entropy": 0.552407756447792,
"epoch": 1.377217553688142,
"grad_norm": 0.03342469781637192,
"learning_rate": 0.0002,
"loss": 0.5567049980163574,
"mean_token_accuracy": 0.7723858207464218,
"num_tokens": 6014178.0,
"step": 369
},
{
"entropy": 0.5454883426427841,
"epoch": 1.380952380952381,
"grad_norm": 0.03550714999437332,
"learning_rate": 0.0002,
"loss": 0.5418342351913452,
"mean_token_accuracy": 0.7798961699008942,
"num_tokens": 6030806.0,
"step": 370
},
{
"entropy": 0.552109032869339,
"epoch": 1.38468720821662,
"grad_norm": 0.03026903234422207,
"learning_rate": 0.0002,
"loss": 0.5456339120864868,
"mean_token_accuracy": 0.7773927599191666,
"num_tokens": 6046782.0,
"step": 371
},
{
"entropy": 0.5603116452693939,
"epoch": 1.388422035480859,
"grad_norm": 0.03449714556336403,
"learning_rate": 0.0002,
"loss": 0.5605192184448242,
"mean_token_accuracy": 0.7709443271160126,
"num_tokens": 6063178.0,
"step": 372
},
{
"entropy": 0.5442145317792892,
"epoch": 1.392156862745098,
"grad_norm": 0.03407449275255203,
"learning_rate": 0.0002,
"loss": 0.5482808947563171,
"mean_token_accuracy": 0.7804455161094666,
"num_tokens": 6079813.0,
"step": 373
},
{
"entropy": 0.5443685501813889,
"epoch": 1.395891690009337,
"grad_norm": 0.03118809685111046,
"learning_rate": 0.0002,
"loss": 0.5504392385482788,
"mean_token_accuracy": 0.7759056687355042,
"num_tokens": 6096208.0,
"step": 374
},
{
"entropy": 0.5544550269842148,
"epoch": 1.399626517273576,
"grad_norm": 0.03532007709145546,
"learning_rate": 0.0002,
"loss": 0.5569352507591248,
"mean_token_accuracy": 0.7748352587223053,
"num_tokens": 6112356.0,
"step": 375
},
{
"entropy": 0.5439307242631912,
"epoch": 1.403361344537815,
"grad_norm": 0.0334586501121521,
"learning_rate": 0.0002,
"loss": 0.542488694190979,
"mean_token_accuracy": 0.777744397521019,
"num_tokens": 6128800.0,
"step": 376
},
{
"entropy": 0.5407049357891083,
"epoch": 1.407096171802054,
"grad_norm": 0.029349738731980324,
"learning_rate": 0.0002,
"loss": 0.5370444655418396,
"mean_token_accuracy": 0.7816447019577026,
"num_tokens": 6145053.0,
"step": 377
},
{
"entropy": 0.5527060329914093,
"epoch": 1.410830999066293,
"grad_norm": 0.030373841524124146,
"learning_rate": 0.0002,
"loss": 0.5530543327331543,
"mean_token_accuracy": 0.775768980383873,
"num_tokens": 6161518.0,
"step": 378
},
{
"entropy": 0.5383686721324921,
"epoch": 1.4145658263305323,
"grad_norm": 0.033442895859479904,
"learning_rate": 0.0002,
"loss": 0.539923369884491,
"mean_token_accuracy": 0.7825078517198563,
"num_tokens": 6177817.0,
"step": 379
},
{
"entropy": 0.5557737052440643,
"epoch": 1.4183006535947713,
"grad_norm": 0.03396908566355705,
"learning_rate": 0.0002,
"loss": 0.5632482767105103,
"mean_token_accuracy": 0.7692397683858871,
"num_tokens": 6194312.0,
"step": 380
},
{
"entropy": 0.5457819253206253,
"epoch": 1.4220354808590103,
"grad_norm": 0.02866293303668499,
"learning_rate": 0.0002,
"loss": 0.5467988848686218,
"mean_token_accuracy": 0.7775601893663406,
"num_tokens": 6210818.0,
"step": 381
},
{
"entropy": 0.5640534311532974,
"epoch": 1.4257703081232493,
"grad_norm": 0.027476362884044647,
"learning_rate": 0.0002,
"loss": 0.5636141896247864,
"mean_token_accuracy": 0.7717417329549789,
"num_tokens": 6227080.0,
"step": 382
},
{
"entropy": 0.560546487569809,
"epoch": 1.4295051353874884,
"grad_norm": 0.030654683709144592,
"learning_rate": 0.0002,
"loss": 0.5566866397857666,
"mean_token_accuracy": 0.7725766897201538,
"num_tokens": 6243654.0,
"step": 383
},
{
"entropy": 0.5566196143627167,
"epoch": 1.4332399626517274,
"grad_norm": 0.03377790376543999,
"learning_rate": 0.0002,
"loss": 0.5511550903320312,
"mean_token_accuracy": 0.7775295376777649,
"num_tokens": 6259998.0,
"step": 384
},
{
"entropy": 0.5302538275718689,
"epoch": 1.4369747899159664,
"grad_norm": 0.028172362595796585,
"learning_rate": 0.0002,
"loss": 0.5359051823616028,
"mean_token_accuracy": 0.7816868871450424,
"num_tokens": 6276398.0,
"step": 385
},
{
"entropy": 0.543848991394043,
"epoch": 1.4407096171802054,
"grad_norm": 0.03123684599995613,
"learning_rate": 0.0002,
"loss": 0.5530490875244141,
"mean_token_accuracy": 0.7756175249814987,
"num_tokens": 6292623.0,
"step": 386
},
{
"entropy": 0.5351638197898865,
"epoch": 1.4444444444444444,
"grad_norm": 0.032041870057582855,
"learning_rate": 0.0002,
"loss": 0.5453383326530457,
"mean_token_accuracy": 0.7787481844425201,
"num_tokens": 6308980.0,
"step": 387
},
{
"entropy": 0.5499856919050217,
"epoch": 1.4481792717086834,
"grad_norm": 0.03275283798575401,
"learning_rate": 0.0002,
"loss": 0.5510199666023254,
"mean_token_accuracy": 0.7770793437957764,
"num_tokens": 6325352.0,
"step": 388
},
{
"entropy": 0.5473773032426834,
"epoch": 1.4519140989729225,
"grad_norm": 0.02793571725487709,
"learning_rate": 0.0002,
"loss": 0.540398120880127,
"mean_token_accuracy": 0.7805086821317673,
"num_tokens": 6341686.0,
"step": 389
},
{
"entropy": 0.553907573223114,
"epoch": 1.4556489262371615,
"grad_norm": 0.02763449028134346,
"learning_rate": 0.0002,
"loss": 0.5470324754714966,
"mean_token_accuracy": 0.7763955593109131,
"num_tokens": 6358367.0,
"step": 390
},
{
"entropy": 0.54300856590271,
"epoch": 1.4593837535014005,
"grad_norm": 0.0320272259414196,
"learning_rate": 0.0002,
"loss": 0.5394243001937866,
"mean_token_accuracy": 0.7796929031610489,
"num_tokens": 6374332.0,
"step": 391
},
{
"entropy": 0.5419201552867889,
"epoch": 1.4631185807656397,
"grad_norm": 0.029694141820073128,
"learning_rate": 0.0002,
"loss": 0.5459417104721069,
"mean_token_accuracy": 0.7794879227876663,
"num_tokens": 6390817.0,
"step": 392
},
{
"entropy": 0.533346489071846,
"epoch": 1.4668534080298787,
"grad_norm": 0.031921736896038055,
"learning_rate": 0.0002,
"loss": 0.5339134335517883,
"mean_token_accuracy": 0.7845402210950851,
"num_tokens": 6407105.0,
"step": 393
},
{
"entropy": 0.5490029752254486,
"epoch": 1.4705882352941178,
"grad_norm": 0.031292662024497986,
"learning_rate": 0.0002,
"loss": 0.5461300611495972,
"mean_token_accuracy": 0.7792785912752151,
"num_tokens": 6423432.0,
"step": 394
},
{
"entropy": 0.5407290160655975,
"epoch": 1.4743230625583568,
"grad_norm": 0.029509229585528374,
"learning_rate": 0.0002,
"loss": 0.5409979224205017,
"mean_token_accuracy": 0.7798801958560944,
"num_tokens": 6440111.0,
"step": 395
},
{
"entropy": 0.5352925509214401,
"epoch": 1.4780578898225958,
"grad_norm": 0.03132627159357071,
"learning_rate": 0.0002,
"loss": 0.5360226035118103,
"mean_token_accuracy": 0.7835162281990051,
"num_tokens": 6456553.0,
"step": 396
},
{
"entropy": 0.5409245789051056,
"epoch": 1.4817927170868348,
"grad_norm": 0.032262932509183884,
"learning_rate": 0.0002,
"loss": 0.5367339253425598,
"mean_token_accuracy": 0.779682844877243,
"num_tokens": 6472831.0,
"step": 397
},
{
"entropy": 0.5202168971300125,
"epoch": 1.4855275443510738,
"grad_norm": 0.033896930515766144,
"learning_rate": 0.0002,
"loss": 0.5268123149871826,
"mean_token_accuracy": 0.7819826900959015,
"num_tokens": 6488931.0,
"step": 398
},
{
"entropy": 0.5325956791639328,
"epoch": 1.4892623716153128,
"grad_norm": 0.03540036827325821,
"learning_rate": 0.0002,
"loss": 0.5433887839317322,
"mean_token_accuracy": 0.778034120798111,
"num_tokens": 6505354.0,
"step": 399
},
{
"entropy": 0.5327711254358292,
"epoch": 1.4929971988795518,
"grad_norm": 0.02958959899842739,
"learning_rate": 0.0002,
"loss": 0.5335476398468018,
"mean_token_accuracy": 0.7828179448843002,
"num_tokens": 6521544.0,
"step": 400
},
{
"entropy": 0.5357908606529236,
"epoch": 1.4967320261437909,
"grad_norm": 0.027617521584033966,
"learning_rate": 0.0002,
"loss": 0.5293720364570618,
"mean_token_accuracy": 0.7868403792381287,
"num_tokens": 6537889.0,
"step": 401
},
{
"entropy": 0.5473283380270004,
"epoch": 1.5004668534080299,
"grad_norm": 0.028360038995742798,
"learning_rate": 0.0002,
"loss": 0.5436528325080872,
"mean_token_accuracy": 0.7810066491365433,
"num_tokens": 6554149.0,
"step": 402
},
{
"entropy": 0.5518513321876526,
"epoch": 1.504201680672269,
"grad_norm": 0.031041931360960007,
"learning_rate": 0.0002,
"loss": 0.545119047164917,
"mean_token_accuracy": 0.7779288738965988,
"num_tokens": 6570521.0,
"step": 403
},
{
"entropy": 0.5428237915039062,
"epoch": 1.507936507936508,
"grad_norm": 0.032197825610637665,
"learning_rate": 0.0002,
"loss": 0.5472823977470398,
"mean_token_accuracy": 0.7758528888225555,
"num_tokens": 6587086.0,
"step": 404
},
{
"entropy": 0.5483403950929642,
"epoch": 1.511671335200747,
"grad_norm": 0.03174825757741928,
"learning_rate": 0.0002,
"loss": 0.5524789094924927,
"mean_token_accuracy": 0.7772649824619293,
"num_tokens": 6603513.0,
"step": 405
},
{
"entropy": 0.5337469726800919,
"epoch": 1.515406162464986,
"grad_norm": 0.03365413472056389,
"learning_rate": 0.0002,
"loss": 0.5418713688850403,
"mean_token_accuracy": 0.7772432267665863,
"num_tokens": 6619737.0,
"step": 406
},
{
"entropy": 0.5614880919456482,
"epoch": 1.519140989729225,
"grad_norm": 0.030781377106904984,
"learning_rate": 0.0002,
"loss": 0.5604795217514038,
"mean_token_accuracy": 0.7718411535024643,
"num_tokens": 6636097.0,
"step": 407
},
{
"entropy": 0.5390657633543015,
"epoch": 1.522875816993464,
"grad_norm": 0.02782733179628849,
"learning_rate": 0.0002,
"loss": 0.5329728126525879,
"mean_token_accuracy": 0.7839234322309494,
"num_tokens": 6652406.0,
"step": 408
},
{
"entropy": 0.5573919266462326,
"epoch": 1.526610644257703,
"grad_norm": 0.027401108294725418,
"learning_rate": 0.0002,
"loss": 0.5554807186126709,
"mean_token_accuracy": 0.7726366519927979,
"num_tokens": 6668812.0,
"step": 409
},
{
"entropy": 0.5391197204589844,
"epoch": 1.530345471521942,
"grad_norm": 0.03163023665547371,
"learning_rate": 0.0002,
"loss": 0.5407525897026062,
"mean_token_accuracy": 0.7810121178627014,
"num_tokens": 6685040.0,
"step": 410
},
{
"entropy": 0.5353195369243622,
"epoch": 1.534080298786181,
"grad_norm": 0.026917260140180588,
"learning_rate": 0.0002,
"loss": 0.5328407883644104,
"mean_token_accuracy": 0.7829948961734772,
"num_tokens": 6701433.0,
"step": 411
},
{
"entropy": 0.5223068818449974,
"epoch": 1.53781512605042,
"grad_norm": 0.03261617571115494,
"learning_rate": 0.0002,
"loss": 0.5255942344665527,
"mean_token_accuracy": 0.785964623093605,
"num_tokens": 6717710.0,
"step": 412
},
{
"entropy": 0.5453132838010788,
"epoch": 1.541549953314659,
"grad_norm": 0.03235824778676033,
"learning_rate": 0.0002,
"loss": 0.5518944263458252,
"mean_token_accuracy": 0.7770064026117325,
"num_tokens": 6733942.0,
"step": 413
},
{
"entropy": 0.5489854216575623,
"epoch": 1.545284780578898,
"grad_norm": 0.02913379855453968,
"learning_rate": 0.0002,
"loss": 0.5539657473564148,
"mean_token_accuracy": 0.7730102986097336,
"num_tokens": 6749978.0,
"step": 414
},
{
"entropy": 0.5504709929227829,
"epoch": 1.5490196078431373,
"grad_norm": 0.03497619554400444,
"learning_rate": 0.0002,
"loss": 0.5534422397613525,
"mean_token_accuracy": 0.7738368958234787,
"num_tokens": 6766386.0,
"step": 415
},
{
"entropy": 0.5360163599252701,
"epoch": 1.5527544351073763,
"grad_norm": 0.03147003799676895,
"learning_rate": 0.0002,
"loss": 0.5354920625686646,
"mean_token_accuracy": 0.7844124883413315,
"num_tokens": 6782497.0,
"step": 416
},
{
"entropy": 0.5680203884840012,
"epoch": 1.5564892623716153,
"grad_norm": 0.030537011101841927,
"learning_rate": 0.0002,
"loss": 0.5605371594429016,
"mean_token_accuracy": 0.772536501288414,
"num_tokens": 6799059.0,
"step": 417
},
{
"entropy": 0.5505528301000595,
"epoch": 1.5602240896358543,
"grad_norm": 0.028710143640637398,
"learning_rate": 0.0002,
"loss": 0.5522081255912781,
"mean_token_accuracy": 0.7738733440637589,
"num_tokens": 6815363.0,
"step": 418
},
{
"entropy": 0.5502945929765701,
"epoch": 1.5639589169000934,
"grad_norm": 0.0320894755423069,
"learning_rate": 0.0002,
"loss": 0.5519194006919861,
"mean_token_accuracy": 0.775145635008812,
"num_tokens": 6831823.0,
"step": 419
},
{
"entropy": 0.5572039783000946,
"epoch": 1.5676937441643324,
"grad_norm": 0.028658481314778328,
"learning_rate": 0.0002,
"loss": 0.5568941831588745,
"mean_token_accuracy": 0.7728902250528336,
"num_tokens": 6848346.0,
"step": 420
},
{
"entropy": 0.5431763082742691,
"epoch": 1.5714285714285714,
"grad_norm": 0.027273258194327354,
"learning_rate": 0.0002,
"loss": 0.5424181818962097,
"mean_token_accuracy": 0.7814328521490097,
"num_tokens": 6864537.0,
"step": 421
},
{
"entropy": 0.5466543883085251,
"epoch": 1.5751633986928104,
"grad_norm": 0.02875494956970215,
"learning_rate": 0.0002,
"loss": 0.5450119972229004,
"mean_token_accuracy": 0.7765506953001022,
"num_tokens": 6881053.0,
"step": 422
},
{
"entropy": 0.5499023944139481,
"epoch": 1.5788982259570497,
"grad_norm": 0.02958599291741848,
"learning_rate": 0.0002,
"loss": 0.5486996173858643,
"mean_token_accuracy": 0.778396338224411,
"num_tokens": 6897409.0,
"step": 423
},
{
"entropy": 0.5387710481882095,
"epoch": 1.5826330532212887,
"grad_norm": 0.030644621700048447,
"learning_rate": 0.0002,
"loss": 0.5404931306838989,
"mean_token_accuracy": 0.7786550223827362,
"num_tokens": 6913681.0,
"step": 424
},
{
"entropy": 0.5346106290817261,
"epoch": 1.5863678804855277,
"grad_norm": 0.028904983773827553,
"learning_rate": 0.0002,
"loss": 0.5413768887519836,
"mean_token_accuracy": 0.7797856479883194,
"num_tokens": 6930096.0,
"step": 425
},
{
"entropy": 0.5166824460029602,
"epoch": 1.5901027077497667,
"grad_norm": 0.03321892023086548,
"learning_rate": 0.0002,
"loss": 0.5238149166107178,
"mean_token_accuracy": 0.7857634872198105,
"num_tokens": 6946449.0,
"step": 426
},
{
"entropy": 0.5426425486803055,
"epoch": 1.5938375350140057,
"grad_norm": 0.030873097479343414,
"learning_rate": 0.0002,
"loss": 0.5491586923599243,
"mean_token_accuracy": 0.7750476896762848,
"num_tokens": 6962805.0,
"step": 427
},
{
"entropy": 0.555439367890358,
"epoch": 1.5975723622782447,
"grad_norm": 0.030430428683757782,
"learning_rate": 0.0002,
"loss": 0.5504173040390015,
"mean_token_accuracy": 0.7780658453702927,
"num_tokens": 6979378.0,
"step": 428
},
{
"entropy": 0.5425661355257034,
"epoch": 1.6013071895424837,
"grad_norm": 0.033183399587869644,
"learning_rate": 0.0002,
"loss": 0.5338144302368164,
"mean_token_accuracy": 0.7815939337015152,
"num_tokens": 6995576.0,
"step": 429
},
{
"entropy": 0.5580693334341049,
"epoch": 1.6050420168067228,
"grad_norm": 0.02936139702796936,
"learning_rate": 0.0002,
"loss": 0.5471250414848328,
"mean_token_accuracy": 0.7805830985307693,
"num_tokens": 7011887.0,
"step": 430
},
{
"entropy": 0.5445709973573685,
"epoch": 1.6087768440709618,
"grad_norm": 0.029686426743865013,
"learning_rate": 0.0002,
"loss": 0.5449705719947815,
"mean_token_accuracy": 0.7791666090488434,
"num_tokens": 7028245.0,
"step": 431
},
{
"entropy": 0.5352734625339508,
"epoch": 1.6125116713352008,
"grad_norm": 0.0335598923265934,
"learning_rate": 0.0002,
"loss": 0.5456029772758484,
"mean_token_accuracy": 0.7778525203466415,
"num_tokens": 7044490.0,
"step": 432
},
{
"entropy": 0.548936665058136,
"epoch": 1.6162464985994398,
"grad_norm": 0.03590673953294754,
"learning_rate": 0.0002,
"loss": 0.5520269870758057,
"mean_token_accuracy": 0.7742140144109726,
"num_tokens": 7060917.0,
"step": 433
},
{
"entropy": 0.5434507131576538,
"epoch": 1.6199813258636788,
"grad_norm": 0.028407955542206764,
"learning_rate": 0.0002,
"loss": 0.5414606332778931,
"mean_token_accuracy": 0.778694823384285,
"num_tokens": 7077100.0,
"step": 434
},
{
"entropy": 0.5490714907646179,
"epoch": 1.6237161531279178,
"grad_norm": 0.0324469618499279,
"learning_rate": 0.0002,
"loss": 0.5481012463569641,
"mean_token_accuracy": 0.7763958275318146,
"num_tokens": 7093665.0,
"step": 435
},
{
"entropy": 0.5379714071750641,
"epoch": 1.6274509803921569,
"grad_norm": 0.030424365773797035,
"learning_rate": 0.0002,
"loss": 0.5396856665611267,
"mean_token_accuracy": 0.7815098166465759,
"num_tokens": 7110174.0,
"step": 436
},
{
"entropy": 0.5480812042951584,
"epoch": 1.6311858076563959,
"grad_norm": 0.029105886816978455,
"learning_rate": 0.0002,
"loss": 0.5511510372161865,
"mean_token_accuracy": 0.7754542678594589,
"num_tokens": 7126486.0,
"step": 437
},
{
"entropy": 0.5540740191936493,
"epoch": 1.6349206349206349,
"grad_norm": 0.027599727734923363,
"learning_rate": 0.0002,
"loss": 0.5574399828910828,
"mean_token_accuracy": 0.7723194360733032,
"num_tokens": 7143064.0,
"step": 438
},
{
"entropy": 0.5382533967494965,
"epoch": 1.638655462184874,
"grad_norm": 0.02985025756061077,
"learning_rate": 0.0002,
"loss": 0.542414665222168,
"mean_token_accuracy": 0.7797781080007553,
"num_tokens": 7159194.0,
"step": 439
},
{
"entropy": 0.545093446969986,
"epoch": 1.642390289449113,
"grad_norm": 0.033221568912267685,
"learning_rate": 0.0002,
"loss": 0.5397443771362305,
"mean_token_accuracy": 0.781465008854866,
"num_tokens": 7175448.0,
"step": 440
},
{
"entropy": 0.547942727804184,
"epoch": 1.646125116713352,
"grad_norm": 0.030130675062537193,
"learning_rate": 0.0002,
"loss": 0.5471298098564148,
"mean_token_accuracy": 0.7778923958539963,
"num_tokens": 7191951.0,
"step": 441
},
{
"entropy": 0.5388812720775604,
"epoch": 1.649859943977591,
"grad_norm": 0.03608401492238045,
"learning_rate": 0.0002,
"loss": 0.5405545234680176,
"mean_token_accuracy": 0.7795072197914124,
"num_tokens": 7208082.0,
"step": 442
},
{
"entropy": 0.5480445921421051,
"epoch": 1.65359477124183,
"grad_norm": 0.03251367062330246,
"learning_rate": 0.0002,
"loss": 0.5486726760864258,
"mean_token_accuracy": 0.7771764397621155,
"num_tokens": 7224432.0,
"step": 443
},
{
"entropy": 0.5502856224775314,
"epoch": 1.657329598506069,
"grad_norm": 0.03557496517896652,
"learning_rate": 0.0002,
"loss": 0.5455541014671326,
"mean_token_accuracy": 0.7788678556680679,
"num_tokens": 7241112.0,
"step": 444
},
{
"entropy": 0.5650181323289871,
"epoch": 1.661064425770308,
"grad_norm": 0.036821287125349045,
"learning_rate": 0.0002,
"loss": 0.5659928321838379,
"mean_token_accuracy": 0.7705142349004745,
"num_tokens": 7257646.0,
"step": 445
},
{
"entropy": 0.5301887840032578,
"epoch": 1.664799253034547,
"grad_norm": 0.028849398717284203,
"learning_rate": 0.0002,
"loss": 0.5311304926872253,
"mean_token_accuracy": 0.7853154540061951,
"num_tokens": 7273883.0,
"step": 446
},
{
"entropy": 0.5287686139345169,
"epoch": 1.668534080298786,
"grad_norm": 0.027796290814876556,
"learning_rate": 0.0002,
"loss": 0.5300359129905701,
"mean_token_accuracy": 0.7818829715251923,
"num_tokens": 7290094.0,
"step": 447
},
{
"entropy": 0.5384389162063599,
"epoch": 1.6722689075630253,
"grad_norm": 0.03137550130486488,
"learning_rate": 0.0002,
"loss": 0.5358840227127075,
"mean_token_accuracy": 0.7822984606027603,
"num_tokens": 7306318.0,
"step": 448
},
{
"entropy": 0.5409219712018967,
"epoch": 1.6760037348272643,
"grad_norm": 0.03238392993807793,
"learning_rate": 0.0002,
"loss": 0.5490888357162476,
"mean_token_accuracy": 0.7757006883621216,
"num_tokens": 7322518.0,
"step": 449
},
{
"entropy": 0.5399473458528519,
"epoch": 1.6797385620915033,
"grad_norm": 0.03108685463666916,
"learning_rate": 0.0002,
"loss": 0.5397608876228333,
"mean_token_accuracy": 0.7774724215269089,
"num_tokens": 7338931.0,
"step": 450
},
{
"entropy": 0.5551822930574417,
"epoch": 1.6834733893557423,
"grad_norm": 0.02780800126492977,
"learning_rate": 0.0002,
"loss": 0.5481570959091187,
"mean_token_accuracy": 0.7780963182449341,
"num_tokens": 7355336.0,
"step": 451
},
{
"entropy": 0.54237399995327,
"epoch": 1.6872082166199813,
"grad_norm": 0.04012434557080269,
"learning_rate": 0.0002,
"loss": 0.5462750792503357,
"mean_token_accuracy": 0.7741427570581436,
"num_tokens": 7371655.0,
"step": 452
},
{
"entropy": 0.5476243197917938,
"epoch": 1.6909430438842203,
"grad_norm": 0.031238745898008347,
"learning_rate": 0.0002,
"loss": 0.5490629076957703,
"mean_token_accuracy": 0.7778069078922272,
"num_tokens": 7387779.0,
"step": 453
},
{
"entropy": 0.5370198786258698,
"epoch": 1.6946778711484594,
"grad_norm": 0.0672907754778862,
"learning_rate": 0.0002,
"loss": 0.5387383699417114,
"mean_token_accuracy": 0.7835952490568161,
"num_tokens": 7404160.0,
"step": 454
},
{
"entropy": 0.5476315915584564,
"epoch": 1.6984126984126984,
"grad_norm": 0.029196592047810555,
"learning_rate": 0.0002,
"loss": 0.5511754751205444,
"mean_token_accuracy": 0.7767634838819504,
"num_tokens": 7420779.0,
"step": 455
},
{
"entropy": 0.5495481044054031,
"epoch": 1.7021475256769374,
"grad_norm": 0.03591341897845268,
"learning_rate": 0.0002,
"loss": 0.5475634336471558,
"mean_token_accuracy": 0.7761732786893845,
"num_tokens": 7437268.0,
"step": 456
},
{
"entropy": 0.5471929609775543,
"epoch": 1.7058823529411766,
"grad_norm": 0.07272505015134811,
"learning_rate": 0.0002,
"loss": 0.5460875630378723,
"mean_token_accuracy": 0.7771887481212616,
"num_tokens": 7453407.0,
"step": 457
},
{
"entropy": 0.5470087379217148,
"epoch": 1.7096171802054156,
"grad_norm": 0.027592960745096207,
"learning_rate": 0.0002,
"loss": 0.544583797454834,
"mean_token_accuracy": 0.7774143517017365,
"num_tokens": 7469641.0,
"step": 458
},
{
"entropy": 0.5607744753360748,
"epoch": 1.7133520074696547,
"grad_norm": 0.031071651726961136,
"learning_rate": 0.0002,
"loss": 0.5542961955070496,
"mean_token_accuracy": 0.7748319655656815,
"num_tokens": 7486190.0,
"step": 459
},
{
"entropy": 0.5514983385801315,
"epoch": 1.7170868347338937,
"grad_norm": 0.03477690741419792,
"learning_rate": 0.0002,
"loss": 0.5511950254440308,
"mean_token_accuracy": 0.7754039019346237,
"num_tokens": 7502685.0,
"step": 460
},
{
"entropy": 0.5462844371795654,
"epoch": 1.7208216619981327,
"grad_norm": 0.02956387773156166,
"learning_rate": 0.0002,
"loss": 0.5578323602676392,
"mean_token_accuracy": 0.7759933173656464,
"num_tokens": 7518976.0,
"step": 461
},
{
"entropy": 0.5413178950548172,
"epoch": 1.7245564892623717,
"grad_norm": 0.03515993058681488,
"learning_rate": 0.0002,
"loss": 0.5494832992553711,
"mean_token_accuracy": 0.7766997069120407,
"num_tokens": 7535230.0,
"step": 462
},
{
"entropy": 0.5519613027572632,
"epoch": 1.7282913165266107,
"grad_norm": 0.03921071067452431,
"learning_rate": 0.0002,
"loss": 0.5593541860580444,
"mean_token_accuracy": 0.7729771286249161,
"num_tokens": 7551766.0,
"step": 463
},
{
"entropy": 0.5483202934265137,
"epoch": 1.7320261437908497,
"grad_norm": 0.02950095944106579,
"learning_rate": 0.0002,
"loss": 0.5464847683906555,
"mean_token_accuracy": 0.7769839763641357,
"num_tokens": 7568028.0,
"step": 464
},
{
"entropy": 0.5524065643548965,
"epoch": 1.7357609710550888,
"grad_norm": 0.038918618112802505,
"learning_rate": 0.0002,
"loss": 0.5422624945640564,
"mean_token_accuracy": 0.7797468602657318,
"num_tokens": 7584397.0,
"step": 465
},
{
"entropy": 0.546732097864151,
"epoch": 1.7394957983193278,
"grad_norm": 0.03082694672048092,
"learning_rate": 0.0002,
"loss": 0.5352342128753662,
"mean_token_accuracy": 0.78376704454422,
"num_tokens": 7600719.0,
"step": 466
},
{
"entropy": 0.557578444480896,
"epoch": 1.7432306255835668,
"grad_norm": 0.031017586588859558,
"learning_rate": 0.0002,
"loss": 0.54631108045578,
"mean_token_accuracy": 0.7787049263715744,
"num_tokens": 7617277.0,
"step": 467
},
{
"entropy": 0.5322857201099396,
"epoch": 1.7469654528478058,
"grad_norm": 0.0356813408434391,
"learning_rate": 0.0002,
"loss": 0.5350920557975769,
"mean_token_accuracy": 0.7820670753717422,
"num_tokens": 7633468.0,
"step": 468
},
{
"entropy": 0.5373670607805252,
"epoch": 1.7507002801120448,
"grad_norm": 0.0339689627289772,
"learning_rate": 0.0002,
"loss": 0.5516907572746277,
"mean_token_accuracy": 0.7766174525022507,
"num_tokens": 7649778.0,
"step": 469
},
{
"entropy": 0.522003561258316,
"epoch": 1.7544351073762838,
"grad_norm": 0.034353625029325485,
"learning_rate": 0.0002,
"loss": 0.533075749874115,
"mean_token_accuracy": 0.7833420485258102,
"num_tokens": 7666182.0,
"step": 470
},
{
"entropy": 0.5592000931501389,
"epoch": 1.7581699346405228,
"grad_norm": 0.029966510832309723,
"learning_rate": 0.0002,
"loss": 0.5585059523582458,
"mean_token_accuracy": 0.7732168585062027,
"num_tokens": 7682572.0,
"step": 471
},
{
"entropy": 0.5302631109952927,
"epoch": 1.7619047619047619,
"grad_norm": 0.030881982296705246,
"learning_rate": 0.0002,
"loss": 0.5330703854560852,
"mean_token_accuracy": 0.7825835943222046,
"num_tokens": 7698564.0,
"step": 472
},
{
"entropy": 0.5615632385015488,
"epoch": 1.7656395891690009,
"grad_norm": 0.03000018559396267,
"learning_rate": 0.0002,
"loss": 0.5536789298057556,
"mean_token_accuracy": 0.7739888280630112,
"num_tokens": 7714922.0,
"step": 473
},
{
"entropy": 0.5522587448358536,
"epoch": 1.76937441643324,
"grad_norm": 0.031349826604127884,
"learning_rate": 0.0002,
"loss": 0.551250696182251,
"mean_token_accuracy": 0.7755384594202042,
"num_tokens": 7731301.0,
"step": 474
},
{
"entropy": 0.5275092422962189,
"epoch": 1.773109243697479,
"grad_norm": 0.026553746312856674,
"learning_rate": 0.0002,
"loss": 0.5240329504013062,
"mean_token_accuracy": 0.7870848327875137,
"num_tokens": 7747693.0,
"step": 475
},
{
"entropy": 0.5298073589801788,
"epoch": 1.776844070961718,
"grad_norm": 0.03024754300713539,
"learning_rate": 0.0002,
"loss": 0.5267937183380127,
"mean_token_accuracy": 0.7867465615272522,
"num_tokens": 7763990.0,
"step": 476
},
{
"entropy": 0.5466170459985733,
"epoch": 1.780578898225957,
"grad_norm": 0.03677600622177124,
"learning_rate": 0.0002,
"loss": 0.5455999374389648,
"mean_token_accuracy": 0.7789721339941025,
"num_tokens": 7780428.0,
"step": 477
},
{
"entropy": 0.5342886596918106,
"epoch": 1.784313725490196,
"grad_norm": 0.03470218554139137,
"learning_rate": 0.0002,
"loss": 0.5434668660163879,
"mean_token_accuracy": 0.7787842005491257,
"num_tokens": 7796524.0,
"step": 478
},
{
"entropy": 0.5427644997835159,
"epoch": 1.788048552754435,
"grad_norm": 0.026957696303725243,
"learning_rate": 0.0002,
"loss": 0.5418925285339355,
"mean_token_accuracy": 0.7785145193338394,
"num_tokens": 7813105.0,
"step": 479
},
{
"entropy": 0.528566911816597,
"epoch": 1.791783380018674,
"grad_norm": 0.037975575774908066,
"learning_rate": 0.0002,
"loss": 0.5284658074378967,
"mean_token_accuracy": 0.7871547490358353,
"num_tokens": 7829398.0,
"step": 480
},
{
"entropy": 0.5551463812589645,
"epoch": 1.795518207282913,
"grad_norm": 0.028514336794614792,
"learning_rate": 0.0002,
"loss": 0.556096076965332,
"mean_token_accuracy": 0.7756756544113159,
"num_tokens": 7845626.0,
"step": 481
},
{
"entropy": 0.5317743271589279,
"epoch": 1.7992530345471522,
"grad_norm": 0.03154602646827698,
"learning_rate": 0.0002,
"loss": 0.5321435332298279,
"mean_token_accuracy": 0.7815971374511719,
"num_tokens": 7861817.0,
"step": 482
},
{
"entropy": 0.547456681728363,
"epoch": 1.8029878618113913,
"grad_norm": 0.03746788948774338,
"learning_rate": 0.0002,
"loss": 0.5512088537216187,
"mean_token_accuracy": 0.7785567492246628,
"num_tokens": 7878075.0,
"step": 483
},
{
"entropy": 0.5500114560127258,
"epoch": 1.8067226890756303,
"grad_norm": 0.030493978410959244,
"learning_rate": 0.0002,
"loss": 0.5513818264007568,
"mean_token_accuracy": 0.773513063788414,
"num_tokens": 7894502.0,
"step": 484
},
{
"entropy": 0.543748289346695,
"epoch": 1.8104575163398693,
"grad_norm": 0.036304932087659836,
"learning_rate": 0.0002,
"loss": 0.5411792993545532,
"mean_token_accuracy": 0.7785163670778275,
"num_tokens": 7910890.0,
"step": 485
},
{
"entropy": 0.5393827706575394,
"epoch": 1.8141923436041083,
"grad_norm": 0.03712041303515434,
"learning_rate": 0.0002,
"loss": 0.540428876876831,
"mean_token_accuracy": 0.7790835350751877,
"num_tokens": 7927094.0,
"step": 486
},
{
"entropy": 0.5430537164211273,
"epoch": 1.8179271708683473,
"grad_norm": 0.03853759169578552,
"learning_rate": 0.0002,
"loss": 0.5471268892288208,
"mean_token_accuracy": 0.7801193594932556,
"num_tokens": 7943326.0,
"step": 487
},
{
"entropy": 0.5636092722415924,
"epoch": 1.8216619981325863,
"grad_norm": 0.0457291305065155,
"learning_rate": 0.0002,
"loss": 0.5627254843711853,
"mean_token_accuracy": 0.7725824415683746,
"num_tokens": 7959760.0,
"step": 488
},
{
"entropy": 0.543666809797287,
"epoch": 1.8253968253968254,
"grad_norm": 0.02919071726500988,
"learning_rate": 0.0002,
"loss": 0.5421757102012634,
"mean_token_accuracy": 0.7801477611064911,
"num_tokens": 7975860.0,
"step": 489
},
{
"entropy": 0.5545783638954163,
"epoch": 1.8291316526610646,
"grad_norm": 0.03340514004230499,
"learning_rate": 0.0002,
"loss": 0.5518795251846313,
"mean_token_accuracy": 0.7791309058666229,
"num_tokens": 7992340.0,
"step": 490
},
{
"entropy": 0.5608565956354141,
"epoch": 1.8328664799253036,
"grad_norm": 0.03725928068161011,
"learning_rate": 0.0002,
"loss": 0.5564695596694946,
"mean_token_accuracy": 0.7749106585979462,
"num_tokens": 8008783.0,
"step": 491
},
{
"entropy": 0.5600428581237793,
"epoch": 1.8366013071895426,
"grad_norm": 0.030761808156967163,
"learning_rate": 0.0002,
"loss": 0.5595075488090515,
"mean_token_accuracy": 0.7733119577169418,
"num_tokens": 8025204.0,
"step": 492
},
{
"entropy": 0.5233868211507797,
"epoch": 1.8403361344537816,
"grad_norm": 0.030873069539666176,
"learning_rate": 0.0002,
"loss": 0.5303994417190552,
"mean_token_accuracy": 0.784132570028305,
"num_tokens": 8041524.0,
"step": 493
},
{
"entropy": 0.5531543642282486,
"epoch": 1.8440709617180207,
"grad_norm": 0.037785280495882034,
"learning_rate": 0.0002,
"loss": 0.5541731119155884,
"mean_token_accuracy": 0.7754590958356857,
"num_tokens": 8057944.0,
"step": 494
},
{
"entropy": 0.542868971824646,
"epoch": 1.8478057889822597,
"grad_norm": 0.03054802305996418,
"learning_rate": 0.0002,
"loss": 0.5407766699790955,
"mean_token_accuracy": 0.7781128138303757,
"num_tokens": 8074585.0,
"step": 495
},
{
"entropy": 0.5384076237678528,
"epoch": 1.8515406162464987,
"grad_norm": 0.024639198556542397,
"learning_rate": 0.0002,
"loss": 0.5381752848625183,
"mean_token_accuracy": 0.7817619889974594,
"num_tokens": 8091097.0,
"step": 496
},
{
"entropy": 0.5398432165384293,
"epoch": 1.8552754435107377,
"grad_norm": 0.04202251881361008,
"learning_rate": 0.0002,
"loss": 0.5468040704727173,
"mean_token_accuracy": 0.7771125733852386,
"num_tokens": 8107370.0,
"step": 497
},
{
"entropy": 0.5353098064661026,
"epoch": 1.8590102707749767,
"grad_norm": 0.03730052337050438,
"learning_rate": 0.0002,
"loss": 0.5450741052627563,
"mean_token_accuracy": 0.7791319042444229,
"num_tokens": 8123388.0,
"step": 498
},
{
"entropy": 0.537789598107338,
"epoch": 1.8627450980392157,
"grad_norm": 0.02861681580543518,
"learning_rate": 0.0002,
"loss": 0.5363599061965942,
"mean_token_accuracy": 0.7793796509504318,
"num_tokens": 8139491.0,
"step": 499
},
{
"entropy": 0.5609306395053864,
"epoch": 1.8664799253034547,
"grad_norm": 0.04193006083369255,
"learning_rate": 0.0002,
"loss": 0.5556061267852783,
"mean_token_accuracy": 0.7729236781597137,
"num_tokens": 8155893.0,
"step": 500
},
{
"entropy": 0.5393400639295578,
"epoch": 1.8702147525676938,
"grad_norm": 0.030415907502174377,
"learning_rate": 0.0002,
"loss": 0.5372475385665894,
"mean_token_accuracy": 0.7827758193016052,
"num_tokens": 8172021.0,
"step": 501
},
{
"entropy": 0.5631109476089478,
"epoch": 1.8739495798319328,
"grad_norm": 0.030597561970353127,
"learning_rate": 0.0002,
"loss": 0.56128990650177,
"mean_token_accuracy": 0.7722143828868866,
"num_tokens": 8188761.0,
"step": 502
},
{
"entropy": 0.540698915719986,
"epoch": 1.8776844070961718,
"grad_norm": 0.03197801113128662,
"learning_rate": 0.0002,
"loss": 0.5419467687606812,
"mean_token_accuracy": 0.7789603024721146,
"num_tokens": 8205080.0,
"step": 503
},
{
"entropy": 0.5343400835990906,
"epoch": 1.8814192343604108,
"grad_norm": 0.03577344864606857,
"learning_rate": 0.0002,
"loss": 0.5340043306350708,
"mean_token_accuracy": 0.7837951481342316,
"num_tokens": 8221164.0,
"step": 504
},
{
"entropy": 0.5417536497116089,
"epoch": 1.8851540616246498,
"grad_norm": 0.029083728790283203,
"learning_rate": 0.0002,
"loss": 0.5438728332519531,
"mean_token_accuracy": 0.7775007486343384,
"num_tokens": 8237535.0,
"step": 505
},
{
"entropy": 0.5649835765361786,
"epoch": 1.8888888888888888,
"grad_norm": 0.03408566117286682,
"learning_rate": 0.0002,
"loss": 0.5633872151374817,
"mean_token_accuracy": 0.7726111114025116,
"num_tokens": 8253827.0,
"step": 506
},
{
"entropy": 0.5582909733057022,
"epoch": 1.8926237161531279,
"grad_norm": 0.028437087312340736,
"learning_rate": 0.0002,
"loss": 0.556007981300354,
"mean_token_accuracy": 0.7727185785770416,
"num_tokens": 8270404.0,
"step": 507
},
{
"entropy": 0.5577380061149597,
"epoch": 1.8963585434173669,
"grad_norm": 0.029986968263983727,
"learning_rate": 0.0002,
"loss": 0.5514963865280151,
"mean_token_accuracy": 0.7755957692861557,
"num_tokens": 8286963.0,
"step": 508
},
{
"entropy": 0.5398396402597427,
"epoch": 1.9000933706816059,
"grad_norm": 0.030943697318434715,
"learning_rate": 0.0002,
"loss": 0.5466131567955017,
"mean_token_accuracy": 0.7787002176046371,
"num_tokens": 8303122.0,
"step": 509
},
{
"entropy": 0.536215141415596,
"epoch": 1.903828197945845,
"grad_norm": 0.03370903804898262,
"learning_rate": 0.0002,
"loss": 0.5468170046806335,
"mean_token_accuracy": 0.7753093987703323,
"num_tokens": 8319505.0,
"step": 510
},
{
"entropy": 0.5411160290241241,
"epoch": 1.907563025210084,
"grad_norm": 0.028430534526705742,
"learning_rate": 0.0002,
"loss": 0.5434973835945129,
"mean_token_accuracy": 0.7790606617927551,
"num_tokens": 8335861.0,
"step": 511
},
{
"entropy": 0.5555713921785355,
"epoch": 1.911297852474323,
"grad_norm": 0.029101036489009857,
"learning_rate": 0.0002,
"loss": 0.5541608929634094,
"mean_token_accuracy": 0.7740498781204224,
"num_tokens": 8352413.0,
"step": 512
},
{
"entropy": 0.5440339744091034,
"epoch": 1.915032679738562,
"grad_norm": 0.029705537483096123,
"learning_rate": 0.0002,
"loss": 0.5449399352073669,
"mean_token_accuracy": 0.7799241691827774,
"num_tokens": 8368524.0,
"step": 513
},
{
"entropy": 0.5385466068983078,
"epoch": 1.918767507002801,
"grad_norm": 0.02762160450220108,
"learning_rate": 0.0002,
"loss": 0.5408512353897095,
"mean_token_accuracy": 0.7800593823194504,
"num_tokens": 8384881.0,
"step": 514
},
{
"entropy": 0.5469230860471725,
"epoch": 1.9225023342670402,
"grad_norm": 0.02923613414168358,
"learning_rate": 0.0002,
"loss": 0.5409518480300903,
"mean_token_accuracy": 0.7801833301782608,
"num_tokens": 8401135.0,
"step": 515
},
{
"entropy": 0.5446400791406631,
"epoch": 1.9262371615312792,
"grad_norm": 0.031235719099640846,
"learning_rate": 0.0002,
"loss": 0.5424818992614746,
"mean_token_accuracy": 0.7767819166183472,
"num_tokens": 8417485.0,
"step": 516
},
{
"entropy": 0.5608149170875549,
"epoch": 1.9299719887955182,
"grad_norm": 0.027529114857316017,
"learning_rate": 0.0002,
"loss": 0.5587472915649414,
"mean_token_accuracy": 0.7713829576969147,
"num_tokens": 8433808.0,
"step": 517
},
{
"entropy": 0.560915470123291,
"epoch": 1.9337068160597572,
"grad_norm": 0.03099709376692772,
"learning_rate": 0.0002,
"loss": 0.5625618100166321,
"mean_token_accuracy": 0.7697023302316666,
"num_tokens": 8450212.0,
"step": 518
},
{
"entropy": 0.5411669313907623,
"epoch": 1.9374416433239963,
"grad_norm": 0.03581510856747627,
"learning_rate": 0.0002,
"loss": 0.5449709892272949,
"mean_token_accuracy": 0.779253363609314,
"num_tokens": 8466650.0,
"step": 519
},
{
"entropy": 0.5495533496141434,
"epoch": 1.9411764705882353,
"grad_norm": 0.02863345853984356,
"learning_rate": 0.0002,
"loss": 0.5461183786392212,
"mean_token_accuracy": 0.7790695428848267,
"num_tokens": 8482819.0,
"step": 520
},
{
"entropy": 0.5496646910905838,
"epoch": 1.9449112978524743,
"grad_norm": 0.028455862775444984,
"learning_rate": 0.0002,
"loss": 0.5562914609909058,
"mean_token_accuracy": 0.7747850865125656,
"num_tokens": 8499201.0,
"step": 521
},
{
"entropy": 0.5566077679395676,
"epoch": 1.9486461251167133,
"grad_norm": 0.030010810121893883,
"learning_rate": 0.0002,
"loss": 0.551722526550293,
"mean_token_accuracy": 0.7771954238414764,
"num_tokens": 8515798.0,
"step": 522
},
{
"entropy": 0.5467117130756378,
"epoch": 1.9523809523809523,
"grad_norm": 0.027012262493371964,
"learning_rate": 0.0002,
"loss": 0.5425857305526733,
"mean_token_accuracy": 0.7798562794923782,
"num_tokens": 8531958.0,
"step": 523
},
{
"entropy": 0.5346378833055496,
"epoch": 1.9561157796451916,
"grad_norm": 0.028377590700984,
"learning_rate": 0.0002,
"loss": 0.5295640230178833,
"mean_token_accuracy": 0.7838203459978104,
"num_tokens": 8548384.0,
"step": 524
},
{
"entropy": 0.5571393668651581,
"epoch": 1.9598506069094306,
"grad_norm": 0.02818567305803299,
"learning_rate": 0.0002,
"loss": 0.5521214008331299,
"mean_token_accuracy": 0.7728232592344284,
"num_tokens": 8564872.0,
"step": 525
},
{
"entropy": 0.5285107642412186,
"epoch": 1.9635854341736696,
"grad_norm": 0.03457087650895119,
"learning_rate": 0.0002,
"loss": 0.5370362401008606,
"mean_token_accuracy": 0.7813837081193924,
"num_tokens": 8581245.0,
"step": 526
},
{
"entropy": 0.5266488045454025,
"epoch": 1.9673202614379086,
"grad_norm": 0.030525686219334602,
"learning_rate": 0.0002,
"loss": 0.5345274806022644,
"mean_token_accuracy": 0.7815807163715363,
"num_tokens": 8597625.0,
"step": 527
},
{
"entropy": 0.5280887708067894,
"epoch": 1.9710550887021476,
"grad_norm": 0.03248651325702667,
"learning_rate": 0.0002,
"loss": 0.536238431930542,
"mean_token_accuracy": 0.781073585152626,
"num_tokens": 8613792.0,
"step": 528
},
{
"entropy": 0.5472559034824371,
"epoch": 1.9747899159663866,
"grad_norm": 0.029427766799926758,
"learning_rate": 0.0002,
"loss": 0.5451797842979431,
"mean_token_accuracy": 0.7770361602306366,
"num_tokens": 8629870.0,
"step": 529
},
{
"entropy": 0.5381799042224884,
"epoch": 1.9785247432306257,
"grad_norm": 0.028413154184818268,
"learning_rate": 0.0002,
"loss": 0.5342366695404053,
"mean_token_accuracy": 0.7803498655557632,
"num_tokens": 8646077.0,
"step": 530
},
{
"entropy": 0.5565104633569717,
"epoch": 1.9822595704948647,
"grad_norm": 0.031074564903974533,
"learning_rate": 0.0002,
"loss": 0.5515958666801453,
"mean_token_accuracy": 0.7759076505899429,
"num_tokens": 8662535.0,
"step": 531
},
{
"entropy": 0.5381414890289307,
"epoch": 1.9859943977591037,
"grad_norm": 0.027250438928604126,
"learning_rate": 0.0002,
"loss": 0.534949004650116,
"mean_token_accuracy": 0.7819445878267288,
"num_tokens": 8679064.0,
"step": 532
},
{
"entropy": 0.550770565867424,
"epoch": 1.9897292250233427,
"grad_norm": 0.03366328775882721,
"learning_rate": 0.0002,
"loss": 0.560295045375824,
"mean_token_accuracy": 0.7720893323421478,
"num_tokens": 8695198.0,
"step": 533
},
{
"entropy": 0.5551019459962845,
"epoch": 1.9934640522875817,
"grad_norm": 0.03133872151374817,
"learning_rate": 0.0002,
"loss": 0.5596403479576111,
"mean_token_accuracy": 0.7717682421207428,
"num_tokens": 8711690.0,
"step": 534
},
{
"entropy": 0.5346082448959351,
"epoch": 1.9971988795518207,
"grad_norm": 0.027525832876563072,
"learning_rate": 0.0002,
"loss": 0.5321208834648132,
"mean_token_accuracy": 0.7810429036617279,
"num_tokens": 8727828.0,
"step": 535
},
{
"entropy": 0.5438209176063538,
"epoch": 2.0,
"grad_norm": 0.03134825825691223,
"learning_rate": 0.0002,
"loss": 0.5398504734039307,
"mean_token_accuracy": 0.7799929777781168,
"num_tokens": 8729600.0,
"step": 536
},
{
"entropy": 0.5402208417654037,
"epoch": 2.003734827264239,
"grad_norm": 0.03922782838344574,
"learning_rate": 0.0002,
"loss": 0.5171674489974976,
"mean_token_accuracy": 0.7900938540697098,
"num_tokens": 8745809.0,
"step": 537
},
{
"entropy": 0.5227422267198563,
"epoch": 2.007469654528478,
"grad_norm": 0.032982293516397476,
"learning_rate": 0.0002,
"loss": 0.5183929204940796,
"mean_token_accuracy": 0.7890477329492569,
"num_tokens": 8762197.0,
"step": 538
},
{
"entropy": 0.5411823242902756,
"epoch": 2.011204481792717,
"grad_norm": 0.043377745896577835,
"learning_rate": 0.0002,
"loss": 0.5554962158203125,
"mean_token_accuracy": 0.7736512869596481,
"num_tokens": 8778400.0,
"step": 539
},
{
"entropy": 0.5156290903687477,
"epoch": 2.014939309056956,
"grad_norm": 0.05257771536707878,
"learning_rate": 0.0002,
"loss": 0.5335375666618347,
"mean_token_accuracy": 0.7833946198225021,
"num_tokens": 8794851.0,
"step": 540
},
{
"entropy": 0.5122585743665695,
"epoch": 2.018674136321195,
"grad_norm": 0.03504469618201256,
"learning_rate": 0.0002,
"loss": 0.5155843496322632,
"mean_token_accuracy": 0.7894317060709,
"num_tokens": 8811019.0,
"step": 541
},
{
"entropy": 0.5448772013187408,
"epoch": 2.022408963585434,
"grad_norm": 0.0317138209939003,
"learning_rate": 0.0002,
"loss": 0.5368859171867371,
"mean_token_accuracy": 0.7842776328325272,
"num_tokens": 8827258.0,
"step": 542
},
{
"entropy": 0.5527419149875641,
"epoch": 2.026143790849673,
"grad_norm": 0.03476279601454735,
"learning_rate": 0.0002,
"loss": 0.5388182997703552,
"mean_token_accuracy": 0.7820452451705933,
"num_tokens": 8843634.0,
"step": 543
},
{
"entropy": 0.5452789962291718,
"epoch": 2.029878618113912,
"grad_norm": 0.036034028977155685,
"learning_rate": 0.0002,
"loss": 0.5357140302658081,
"mean_token_accuracy": 0.7825983464717865,
"num_tokens": 8859977.0,
"step": 544
},
{
"entropy": 0.5304267108440399,
"epoch": 2.033613445378151,
"grad_norm": 0.02969290129840374,
"learning_rate": 0.0002,
"loss": 0.5306066870689392,
"mean_token_accuracy": 0.7856841534376144,
"num_tokens": 8876314.0,
"step": 545
},
{
"entropy": 0.5096816495060921,
"epoch": 2.03734827264239,
"grad_norm": 0.043957311660051346,
"learning_rate": 0.0002,
"loss": 0.5185045599937439,
"mean_token_accuracy": 0.7923233062028885,
"num_tokens": 8892568.0,
"step": 546
},
{
"entropy": 0.5319265872240067,
"epoch": 2.041083099906629,
"grad_norm": 0.035869866609573364,
"learning_rate": 0.0002,
"loss": 0.5334051847457886,
"mean_token_accuracy": 0.7822142988443375,
"num_tokens": 8909094.0,
"step": 547
},
{
"entropy": 0.527954563498497,
"epoch": 2.044817927170868,
"grad_norm": 0.034570369869470596,
"learning_rate": 0.0002,
"loss": 0.5274232029914856,
"mean_token_accuracy": 0.7842770516872406,
"num_tokens": 8925492.0,
"step": 548
},
{
"entropy": 0.5419287383556366,
"epoch": 2.048552754435107,
"grad_norm": 0.03259408101439476,
"learning_rate": 0.0002,
"loss": 0.5387848615646362,
"mean_token_accuracy": 0.7804249227046967,
"num_tokens": 8941717.0,
"step": 549
},
{
"entropy": 0.5271529629826546,
"epoch": 2.052287581699346,
"grad_norm": 0.03245944157242775,
"learning_rate": 0.0002,
"loss": 0.5230631828308105,
"mean_token_accuracy": 0.785658523440361,
"num_tokens": 8957856.0,
"step": 550
},
{
"entropy": 0.5363311916589737,
"epoch": 2.0560224089635852,
"grad_norm": 0.035185229033231735,
"learning_rate": 0.0002,
"loss": 0.5378749370574951,
"mean_token_accuracy": 0.781972661614418,
"num_tokens": 8974161.0,
"step": 551
},
{
"entropy": 0.513224758207798,
"epoch": 2.0597572362278243,
"grad_norm": 0.032956283539533615,
"learning_rate": 0.0002,
"loss": 0.5172683000564575,
"mean_token_accuracy": 0.7906839847564697,
"num_tokens": 8990304.0,
"step": 552
},
{
"entropy": 0.5387901067733765,
"epoch": 2.0634920634920633,
"grad_norm": 0.03281653672456741,
"learning_rate": 0.0002,
"loss": 0.5435392260551453,
"mean_token_accuracy": 0.7788385599851608,
"num_tokens": 9006661.0,
"step": 553
},
{
"entropy": 0.5324967205524445,
"epoch": 2.0672268907563027,
"grad_norm": 0.03808191418647766,
"learning_rate": 0.0002,
"loss": 0.5265247225761414,
"mean_token_accuracy": 0.7839124202728271,
"num_tokens": 9022887.0,
"step": 554
},
{
"entropy": 0.513438269495964,
"epoch": 2.0709617180205417,
"grad_norm": 0.033963609486818314,
"learning_rate": 0.0002,
"loss": 0.5105268955230713,
"mean_token_accuracy": 0.7911703735589981,
"num_tokens": 9039478.0,
"step": 555
},
{
"entropy": 0.526850014925003,
"epoch": 2.0746965452847808,
"grad_norm": 0.03211839497089386,
"learning_rate": 0.0002,
"loss": 0.5205508470535278,
"mean_token_accuracy": 0.7859012186527252,
"num_tokens": 9055612.0,
"step": 556
},
{
"entropy": 0.5130272284150124,
"epoch": 2.0784313725490198,
"grad_norm": 0.03543682396411896,
"learning_rate": 0.0002,
"loss": 0.5140283703804016,
"mean_token_accuracy": 0.7925766706466675,
"num_tokens": 9072145.0,
"step": 557
},
{
"entropy": 0.540324792265892,
"epoch": 2.082166199813259,
"grad_norm": 0.037342023104429245,
"learning_rate": 0.0002,
"loss": 0.5462511777877808,
"mean_token_accuracy": 0.7783039510250092,
"num_tokens": 9088571.0,
"step": 558
},
{
"entropy": 0.5168541818857193,
"epoch": 2.085901027077498,
"grad_norm": 0.03552469611167908,
"learning_rate": 0.0002,
"loss": 0.5188402533531189,
"mean_token_accuracy": 0.7880326211452484,
"num_tokens": 9104869.0,
"step": 559
},
{
"entropy": 0.5319818705320358,
"epoch": 2.089635854341737,
"grad_norm": 0.03719151020050049,
"learning_rate": 0.0002,
"loss": 0.5254620313644409,
"mean_token_accuracy": 0.7848033308982849,
"num_tokens": 9121231.0,
"step": 560
},
{
"entropy": 0.5197737812995911,
"epoch": 2.093370681605976,
"grad_norm": 0.03636628016829491,
"learning_rate": 0.0002,
"loss": 0.5256960988044739,
"mean_token_accuracy": 0.7841715961694717,
"num_tokens": 9137375.0,
"step": 561
},
{
"entropy": 0.5284384936094284,
"epoch": 2.097105508870215,
"grad_norm": 0.04217526316642761,
"learning_rate": 0.0002,
"loss": 0.5343865752220154,
"mean_token_accuracy": 0.7833265513181686,
"num_tokens": 9153783.0,
"step": 562
},
{
"entropy": 0.541428878903389,
"epoch": 2.100840336134454,
"grad_norm": 0.035067781805992126,
"learning_rate": 0.0002,
"loss": 0.532885730266571,
"mean_token_accuracy": 0.7817672491073608,
"num_tokens": 9170090.0,
"step": 563
},
{
"entropy": 0.5429966300725937,
"epoch": 2.104575163398693,
"grad_norm": 0.0392267219722271,
"learning_rate": 0.0002,
"loss": 0.541841447353363,
"mean_token_accuracy": 0.7772143185138702,
"num_tokens": 9186453.0,
"step": 564
},
{
"entropy": 0.5249236822128296,
"epoch": 2.108309990662932,
"grad_norm": 0.036935608834028244,
"learning_rate": 0.0002,
"loss": 0.5244463086128235,
"mean_token_accuracy": 0.7873810976743698,
"num_tokens": 9202852.0,
"step": 565
},
{
"entropy": 0.5256126970052719,
"epoch": 2.112044817927171,
"grad_norm": 0.03337714821100235,
"learning_rate": 0.0002,
"loss": 0.5264843702316284,
"mean_token_accuracy": 0.7856813371181488,
"num_tokens": 9219197.0,
"step": 566
},
{
"entropy": 0.5338774845004082,
"epoch": 2.11577964519141,
"grad_norm": 0.03405802696943283,
"learning_rate": 0.0002,
"loss": 0.5289718508720398,
"mean_token_accuracy": 0.7856559157371521,
"num_tokens": 9235376.0,
"step": 567
},
{
"entropy": 0.5203371495008469,
"epoch": 2.119514472455649,
"grad_norm": 0.0316944345831871,
"learning_rate": 0.0002,
"loss": 0.5218056440353394,
"mean_token_accuracy": 0.7897576838731766,
"num_tokens": 9251814.0,
"step": 568
},
{
"entropy": 0.5288322418928146,
"epoch": 2.123249299719888,
"grad_norm": 0.03991817682981491,
"learning_rate": 0.0002,
"loss": 0.538847029209137,
"mean_token_accuracy": 0.7831858396530151,
"num_tokens": 9268205.0,
"step": 569
},
{
"entropy": 0.5342643857002258,
"epoch": 2.126984126984127,
"grad_norm": 0.030493905767798424,
"learning_rate": 0.0002,
"loss": 0.5326664447784424,
"mean_token_accuracy": 0.7838881760835648,
"num_tokens": 9284834.0,
"step": 570
},
{
"entropy": 0.5280385613441467,
"epoch": 2.130718954248366,
"grad_norm": 0.033054206520318985,
"learning_rate": 0.0002,
"loss": 0.5298633575439453,
"mean_token_accuracy": 0.7841726392507553,
"num_tokens": 9301172.0,
"step": 571
},
{
"entropy": 0.5260151326656342,
"epoch": 2.134453781512605,
"grad_norm": 0.03629712015390396,
"learning_rate": 0.0002,
"loss": 0.5276213884353638,
"mean_token_accuracy": 0.7840316295623779,
"num_tokens": 9317580.0,
"step": 572
},
{
"entropy": 0.5471695214509964,
"epoch": 2.138188608776844,
"grad_norm": 0.036552250385284424,
"learning_rate": 0.0002,
"loss": 0.5418487787246704,
"mean_token_accuracy": 0.7832918912172318,
"num_tokens": 9333967.0,
"step": 573
},
{
"entropy": 0.5421722680330276,
"epoch": 2.141923436041083,
"grad_norm": 0.03261527791619301,
"learning_rate": 0.0002,
"loss": 0.535561203956604,
"mean_token_accuracy": 0.7817313969135284,
"num_tokens": 9350202.0,
"step": 574
},
{
"entropy": 0.5255165547132492,
"epoch": 2.145658263305322,
"grad_norm": 0.04084421694278717,
"learning_rate": 0.0002,
"loss": 0.531385064125061,
"mean_token_accuracy": 0.7830993682146072,
"num_tokens": 9366633.0,
"step": 575
},
{
"entropy": 0.5387750118970871,
"epoch": 2.149393090569561,
"grad_norm": 0.03884339705109596,
"learning_rate": 0.0002,
"loss": 0.5440813302993774,
"mean_token_accuracy": 0.7815608382225037,
"num_tokens": 9382903.0,
"step": 576
},
{
"entropy": 0.5333912819623947,
"epoch": 2.1531279178338,
"grad_norm": 0.03951586037874222,
"learning_rate": 0.0002,
"loss": 0.5371235013008118,
"mean_token_accuracy": 0.7820965349674225,
"num_tokens": 9399354.0,
"step": 577
},
{
"entropy": 0.5337669998407364,
"epoch": 2.156862745098039,
"grad_norm": 0.03831348195672035,
"learning_rate": 0.0002,
"loss": 0.5345415472984314,
"mean_token_accuracy": 0.7816608846187592,
"num_tokens": 9415670.0,
"step": 578
},
{
"entropy": 0.5471907705068588,
"epoch": 2.160597572362278,
"grad_norm": 0.04574183374643326,
"learning_rate": 0.0002,
"loss": 0.5423465967178345,
"mean_token_accuracy": 0.7815522998571396,
"num_tokens": 9432026.0,
"step": 579
},
{
"entropy": 0.5296851545572281,
"epoch": 2.164332399626517,
"grad_norm": 0.036245960742235184,
"learning_rate": 0.0002,
"loss": 0.5283267498016357,
"mean_token_accuracy": 0.782764196395874,
"num_tokens": 9448243.0,
"step": 580
},
{
"entropy": 0.5230330973863602,
"epoch": 2.168067226890756,
"grad_norm": 0.042745113372802734,
"learning_rate": 0.0002,
"loss": 0.5258357524871826,
"mean_token_accuracy": 0.7882087379693985,
"num_tokens": 9464651.0,
"step": 581
},
{
"entropy": 0.527550533413887,
"epoch": 2.171802054154995,
"grad_norm": 0.037547484040260315,
"learning_rate": 0.0002,
"loss": 0.5317714214324951,
"mean_token_accuracy": 0.7830660939216614,
"num_tokens": 9480840.0,
"step": 582
},
{
"entropy": 0.5365846008062363,
"epoch": 2.175536881419234,
"grad_norm": 0.031849246472120285,
"learning_rate": 0.0002,
"loss": 0.5385018587112427,
"mean_token_accuracy": 0.7798628509044647,
"num_tokens": 9497134.0,
"step": 583
},
{
"entropy": 0.5381672978401184,
"epoch": 2.179271708683473,
"grad_norm": 0.03450456261634827,
"learning_rate": 0.0002,
"loss": 0.534706711769104,
"mean_token_accuracy": 0.7828627675771713,
"num_tokens": 9513638.0,
"step": 584
},
{
"entropy": 0.5432828962802887,
"epoch": 2.183006535947712,
"grad_norm": 0.03337936848402023,
"learning_rate": 0.0002,
"loss": 0.537329375743866,
"mean_token_accuracy": 0.7817478477954865,
"num_tokens": 9530154.0,
"step": 585
},
{
"entropy": 0.5273857861757278,
"epoch": 2.1867413632119517,
"grad_norm": 0.03686324506998062,
"learning_rate": 0.0002,
"loss": 0.5241349339485168,
"mean_token_accuracy": 0.7857643216848373,
"num_tokens": 9546371.0,
"step": 586
},
{
"entropy": 0.5210235714912415,
"epoch": 2.1904761904761907,
"grad_norm": 0.036837268620729446,
"learning_rate": 0.0002,
"loss": 0.52490234375,
"mean_token_accuracy": 0.7855679392814636,
"num_tokens": 9562751.0,
"step": 587
},
{
"entropy": 0.5189210176467896,
"epoch": 2.1942110177404297,
"grad_norm": 0.034773845225572586,
"learning_rate": 0.0002,
"loss": 0.5269665718078613,
"mean_token_accuracy": 0.7840563803911209,
"num_tokens": 9579070.0,
"step": 588
},
{
"entropy": 0.5213501304388046,
"epoch": 2.1979458450046687,
"grad_norm": 0.03593657165765762,
"learning_rate": 0.0002,
"loss": 0.5271462798118591,
"mean_token_accuracy": 0.7872640639543533,
"num_tokens": 9595317.0,
"step": 589
},
{
"entropy": 0.5206883400678635,
"epoch": 2.2016806722689077,
"grad_norm": 0.04565085843205452,
"learning_rate": 0.0002,
"loss": 0.5203741192817688,
"mean_token_accuracy": 0.7881180793046951,
"num_tokens": 9611799.0,
"step": 590
},
{
"entropy": 0.5511275231838226,
"epoch": 2.2054154995331468,
"grad_norm": 0.03642827644944191,
"learning_rate": 0.0002,
"loss": 0.543250322341919,
"mean_token_accuracy": 0.7804217487573624,
"num_tokens": 9628251.0,
"step": 591
},
{
"entropy": 0.5495842546224594,
"epoch": 2.2091503267973858,
"grad_norm": 0.03284912183880806,
"learning_rate": 0.0002,
"loss": 0.5446897745132446,
"mean_token_accuracy": 0.7783705443143845,
"num_tokens": 9644703.0,
"step": 592
},
{
"entropy": 0.5297017693519592,
"epoch": 2.212885154061625,
"grad_norm": 0.04696131870150566,
"learning_rate": 0.0002,
"loss": 0.5325087308883667,
"mean_token_accuracy": 0.7825257629156113,
"num_tokens": 9660837.0,
"step": 593
},
{
"entropy": 0.5143487304449081,
"epoch": 2.216619981325864,
"grad_norm": 0.03802449256181717,
"learning_rate": 0.0002,
"loss": 0.5171544551849365,
"mean_token_accuracy": 0.7884373366832733,
"num_tokens": 9676767.0,
"step": 594
},
{
"entropy": 0.5370919853448868,
"epoch": 2.220354808590103,
"grad_norm": 0.0338297039270401,
"learning_rate": 0.0002,
"loss": 0.5430178046226501,
"mean_token_accuracy": 0.7808142453432083,
"num_tokens": 9693155.0,
"step": 595
},
{
"entropy": 0.5210085138678551,
"epoch": 2.224089635854342,
"grad_norm": 0.04106014966964722,
"learning_rate": 0.0002,
"loss": 0.5282027125358582,
"mean_token_accuracy": 0.7855826020240784,
"num_tokens": 9709374.0,
"step": 596
},
{
"entropy": 0.516735278069973,
"epoch": 2.227824463118581,
"grad_norm": 0.03890896216034889,
"learning_rate": 0.0002,
"loss": 0.5173130035400391,
"mean_token_accuracy": 0.7897009253501892,
"num_tokens": 9725684.0,
"step": 597
},
{
"entropy": 0.5427378565073013,
"epoch": 2.23155929038282,
"grad_norm": 0.038357146084308624,
"learning_rate": 0.0002,
"loss": 0.5421530604362488,
"mean_token_accuracy": 0.7789111882448196,
"num_tokens": 9742139.0,
"step": 598
},
{
"entropy": 0.5454076677560806,
"epoch": 2.235294117647059,
"grad_norm": 0.037645429372787476,
"learning_rate": 0.0002,
"loss": 0.5428951978683472,
"mean_token_accuracy": 0.7789873778820038,
"num_tokens": 9758607.0,
"step": 599
},
{
"entropy": 0.5404632985591888,
"epoch": 2.239028944911298,
"grad_norm": 0.039499301463365555,
"learning_rate": 0.0002,
"loss": 0.5404936075210571,
"mean_token_accuracy": 0.7793222069740295,
"num_tokens": 9775018.0,
"step": 600
},
{
"entropy": 0.530501589179039,
"epoch": 2.242763772175537,
"grad_norm": 0.040064238011837006,
"learning_rate": 0.0002,
"loss": 0.5247729420661926,
"mean_token_accuracy": 0.7874402105808258,
"num_tokens": 9791324.0,
"step": 601
},
{
"entropy": 0.5369330644607544,
"epoch": 2.246498599439776,
"grad_norm": 0.037321336567401886,
"learning_rate": 0.0002,
"loss": 0.5377377271652222,
"mean_token_accuracy": 0.782483384013176,
"num_tokens": 9807623.0,
"step": 602
},
{
"entropy": 0.5428077727556229,
"epoch": 2.250233426704015,
"grad_norm": 0.03844759240746498,
"learning_rate": 0.0002,
"loss": 0.5476452112197876,
"mean_token_accuracy": 0.776596188545227,
"num_tokens": 9824163.0,
"step": 603
},
{
"entropy": 0.5409123748540878,
"epoch": 2.253968253968254,
"grad_norm": 0.03608860820531845,
"learning_rate": 0.0002,
"loss": 0.5434892177581787,
"mean_token_accuracy": 0.7794551849365234,
"num_tokens": 9840911.0,
"step": 604
},
{
"entropy": 0.5327287763357162,
"epoch": 2.257703081232493,
"grad_norm": 0.037285350263118744,
"learning_rate": 0.0002,
"loss": 0.5284507274627686,
"mean_token_accuracy": 0.7831137478351593,
"num_tokens": 9857296.0,
"step": 605
},
{
"entropy": 0.5373975485563278,
"epoch": 2.261437908496732,
"grad_norm": 0.03957006335258484,
"learning_rate": 0.0002,
"loss": 0.5341996550559998,
"mean_token_accuracy": 0.7810620963573456,
"num_tokens": 9873850.0,
"step": 606
},
{
"entropy": 0.5290782749652863,
"epoch": 2.265172735760971,
"grad_norm": 0.040026333183050156,
"learning_rate": 0.0002,
"loss": 0.5341078042984009,
"mean_token_accuracy": 0.780807763338089,
"num_tokens": 9890140.0,
"step": 607
},
{
"entropy": 0.5333269834518433,
"epoch": 2.26890756302521,
"grad_norm": 0.03453996032476425,
"learning_rate": 0.0002,
"loss": 0.5351240634918213,
"mean_token_accuracy": 0.7807765603065491,
"num_tokens": 9906713.0,
"step": 608
},
{
"entropy": 0.5285785049200058,
"epoch": 2.272642390289449,
"grad_norm": 0.04334354028105736,
"learning_rate": 0.0002,
"loss": 0.5339541435241699,
"mean_token_accuracy": 0.7852969169616699,
"num_tokens": 9922956.0,
"step": 609
},
{
"entropy": 0.5360069870948792,
"epoch": 2.276377217553688,
"grad_norm": 0.03924287483096123,
"learning_rate": 0.0002,
"loss": 0.5359759330749512,
"mean_token_accuracy": 0.7819220721721649,
"num_tokens": 9939069.0,
"step": 610
},
{
"entropy": 0.5435689836740494,
"epoch": 2.280112044817927,
"grad_norm": 0.037971340119838715,
"learning_rate": 0.0002,
"loss": 0.5404746532440186,
"mean_token_accuracy": 0.779410719871521,
"num_tokens": 9955662.0,
"step": 611
},
{
"entropy": 0.5445673018693924,
"epoch": 2.283846872082166,
"grad_norm": 0.03730984404683113,
"learning_rate": 0.0002,
"loss": 0.5418494939804077,
"mean_token_accuracy": 0.7813813239336014,
"num_tokens": 9971957.0,
"step": 612
},
{
"entropy": 0.5419572293758392,
"epoch": 2.287581699346405,
"grad_norm": 0.041550587862730026,
"learning_rate": 0.0002,
"loss": 0.5388907194137573,
"mean_token_accuracy": 0.7824303805828094,
"num_tokens": 9988368.0,
"step": 613
},
{
"entropy": 0.5267663449048996,
"epoch": 2.291316526610644,
"grad_norm": 0.03576701879501343,
"learning_rate": 0.0002,
"loss": 0.5305144786834717,
"mean_token_accuracy": 0.784039780497551,
"num_tokens": 10004679.0,
"step": 614
},
{
"entropy": 0.5274334847927094,
"epoch": 2.295051353874883,
"grad_norm": 0.03758349269628525,
"learning_rate": 0.0002,
"loss": 0.5307276844978333,
"mean_token_accuracy": 0.7834599912166595,
"num_tokens": 10021146.0,
"step": 615
},
{
"entropy": 0.5317062586545944,
"epoch": 2.298786181139122,
"grad_norm": 0.04096253216266632,
"learning_rate": 0.0002,
"loss": 0.5370841026306152,
"mean_token_accuracy": 0.7823186218738556,
"num_tokens": 10037433.0,
"step": 616
},
{
"entropy": 0.5429483950138092,
"epoch": 2.302521008403361,
"grad_norm": 0.04739284887909889,
"learning_rate": 0.0002,
"loss": 0.5423600673675537,
"mean_token_accuracy": 0.7782929539680481,
"num_tokens": 10053809.0,
"step": 617
},
{
"entropy": 0.5375621318817139,
"epoch": 2.3062558356676,
"grad_norm": 0.03586879000067711,
"learning_rate": 0.0002,
"loss": 0.5351011157035828,
"mean_token_accuracy": 0.7827189415693283,
"num_tokens": 10070403.0,
"step": 618
},
{
"entropy": 0.5332229882478714,
"epoch": 2.309990662931839,
"grad_norm": 0.039749447256326675,
"learning_rate": 0.0002,
"loss": 0.5316674113273621,
"mean_token_accuracy": 0.7804222106933594,
"num_tokens": 10086520.0,
"step": 619
},
{
"entropy": 0.5367073863744736,
"epoch": 2.313725490196078,
"grad_norm": 0.03144790232181549,
"learning_rate": 0.0002,
"loss": 0.5297679305076599,
"mean_token_accuracy": 0.7887496650218964,
"num_tokens": 10102990.0,
"step": 620
},
{
"entropy": 0.5359086692333221,
"epoch": 2.317460317460317,
"grad_norm": 0.03297298401594162,
"learning_rate": 0.0002,
"loss": 0.5307391881942749,
"mean_token_accuracy": 0.7840029001235962,
"num_tokens": 10119527.0,
"step": 621
},
{
"entropy": 0.5245398730039597,
"epoch": 2.3211951447245567,
"grad_norm": 0.04077174887061119,
"learning_rate": 0.0002,
"loss": 0.5315594673156738,
"mean_token_accuracy": 0.7830156534910202,
"num_tokens": 10135668.0,
"step": 622
},
{
"entropy": 0.5435569882392883,
"epoch": 2.3249299719887957,
"grad_norm": 0.037014640867710114,
"learning_rate": 0.0002,
"loss": 0.5420113801956177,
"mean_token_accuracy": 0.7790663093328476,
"num_tokens": 10152032.0,
"step": 623
},
{
"entropy": 0.5408807992935181,
"epoch": 2.3286647992530347,
"grad_norm": 0.040016841143369675,
"learning_rate": 0.0002,
"loss": 0.5407621264457703,
"mean_token_accuracy": 0.7807969450950623,
"num_tokens": 10168548.0,
"step": 624
},
{
"entropy": 0.5394706726074219,
"epoch": 2.3323996265172737,
"grad_norm": 0.038603588938713074,
"learning_rate": 0.0002,
"loss": 0.5371181964874268,
"mean_token_accuracy": 0.7830179631710052,
"num_tokens": 10185087.0,
"step": 625
},
{
"entropy": 0.5331766307353973,
"epoch": 2.3361344537815127,
"grad_norm": 0.03732473403215408,
"learning_rate": 0.0002,
"loss": 0.5403282046318054,
"mean_token_accuracy": 0.7811668664216995,
"num_tokens": 10201643.0,
"step": 626
},
{
"entropy": 0.5270423144102097,
"epoch": 2.3398692810457518,
"grad_norm": 0.039125435054302216,
"learning_rate": 0.0002,
"loss": 0.5314643383026123,
"mean_token_accuracy": 0.7847079634666443,
"num_tokens": 10218028.0,
"step": 627
},
{
"entropy": 0.5217768847942352,
"epoch": 2.3436041083099908,
"grad_norm": 0.031856924295425415,
"learning_rate": 0.0002,
"loss": 0.5211607813835144,
"mean_token_accuracy": 0.787055104970932,
"num_tokens": 10234302.0,
"step": 628
},
{
"entropy": 0.5297789797186852,
"epoch": 2.34733893557423,
"grad_norm": 0.044731732457876205,
"learning_rate": 0.0002,
"loss": 0.5366175174713135,
"mean_token_accuracy": 0.7815698832273483,
"num_tokens": 10250527.0,
"step": 629
},
{
"entropy": 0.5372533053159714,
"epoch": 2.351073762838469,
"grad_norm": 0.03578559309244156,
"learning_rate": 0.0002,
"loss": 0.5398249626159668,
"mean_token_accuracy": 0.782914400100708,
"num_tokens": 10266845.0,
"step": 630
},
{
"entropy": 0.5397268682718277,
"epoch": 2.354808590102708,
"grad_norm": 0.04053846001625061,
"learning_rate": 0.0002,
"loss": 0.5417327880859375,
"mean_token_accuracy": 0.7793487906455994,
"num_tokens": 10283134.0,
"step": 631
},
{
"entropy": 0.5457513332366943,
"epoch": 2.358543417366947,
"grad_norm": 0.039855144917964935,
"learning_rate": 0.0002,
"loss": 0.5377854704856873,
"mean_token_accuracy": 0.7803535759449005,
"num_tokens": 10299673.0,
"step": 632
},
{
"entropy": 0.5374201238155365,
"epoch": 2.362278244631186,
"grad_norm": 0.03583669289946556,
"learning_rate": 0.0002,
"loss": 0.5346733331680298,
"mean_token_accuracy": 0.7818557769060135,
"num_tokens": 10316146.0,
"step": 633
},
{
"entropy": 0.5228708907961845,
"epoch": 2.366013071895425,
"grad_norm": 0.0356278158724308,
"learning_rate": 0.0002,
"loss": 0.5220701694488525,
"mean_token_accuracy": 0.7894868850708008,
"num_tokens": 10332482.0,
"step": 634
},
{
"entropy": 0.5448856949806213,
"epoch": 2.369747899159664,
"grad_norm": 0.045307550579309464,
"learning_rate": 0.0002,
"loss": 0.555870771408081,
"mean_token_accuracy": 0.7770739197731018,
"num_tokens": 10348970.0,
"step": 635
},
{
"entropy": 0.5384282767772675,
"epoch": 2.373482726423903,
"grad_norm": 0.03949993476271629,
"learning_rate": 0.0002,
"loss": 0.5424531102180481,
"mean_token_accuracy": 0.7787595987319946,
"num_tokens": 10365074.0,
"step": 636
},
{
"entropy": 0.532962828874588,
"epoch": 2.377217553688142,
"grad_norm": 0.0345122404396534,
"learning_rate": 0.0002,
"loss": 0.5286644697189331,
"mean_token_accuracy": 0.7851764559745789,
"num_tokens": 10381036.0,
"step": 637
},
{
"entropy": 0.5396641790866852,
"epoch": 2.380952380952381,
"grad_norm": 0.038070570677518845,
"learning_rate": 0.0002,
"loss": 0.5350325703620911,
"mean_token_accuracy": 0.783138781785965,
"num_tokens": 10397441.0,
"step": 638
},
{
"entropy": 0.5453281551599503,
"epoch": 2.38468720821662,
"grad_norm": 0.03477659448981285,
"learning_rate": 0.0002,
"loss": 0.5431845188140869,
"mean_token_accuracy": 0.7779907435178757,
"num_tokens": 10413843.0,
"step": 639
},
{
"entropy": 0.5235247910022736,
"epoch": 2.388422035480859,
"grad_norm": 0.04054819047451019,
"learning_rate": 0.0002,
"loss": 0.5272566080093384,
"mean_token_accuracy": 0.7897930145263672,
"num_tokens": 10430041.0,
"step": 640
},
{
"entropy": 0.5263708084821701,
"epoch": 2.392156862745098,
"grad_norm": 0.042338334023952484,
"learning_rate": 0.0002,
"loss": 0.5340385437011719,
"mean_token_accuracy": 0.7824059575796127,
"num_tokens": 10446100.0,
"step": 641
},
{
"entropy": 0.543594166636467,
"epoch": 2.395891690009337,
"grad_norm": 0.04357817769050598,
"learning_rate": 0.0002,
"loss": 0.5377992391586304,
"mean_token_accuracy": 0.781853511929512,
"num_tokens": 10462519.0,
"step": 642
},
{
"entropy": 0.5444612801074982,
"epoch": 2.399626517273576,
"grad_norm": 0.03883645310997963,
"learning_rate": 0.0002,
"loss": 0.5423793196678162,
"mean_token_accuracy": 0.7786720097064972,
"num_tokens": 10478807.0,
"step": 643
},
{
"entropy": 0.5298498719930649,
"epoch": 2.403361344537815,
"grad_norm": 0.03690332546830177,
"learning_rate": 0.0002,
"loss": 0.5272641181945801,
"mean_token_accuracy": 0.782812237739563,
"num_tokens": 10494864.0,
"step": 644
},
{
"entropy": 0.528311550617218,
"epoch": 2.407096171802054,
"grad_norm": 0.04098167642951012,
"learning_rate": 0.0002,
"loss": 0.5349369049072266,
"mean_token_accuracy": 0.7804581671953201,
"num_tokens": 10511211.0,
"step": 645
},
{
"entropy": 0.5355981737375259,
"epoch": 2.410830999066293,
"grad_norm": 0.040713947266340256,
"learning_rate": 0.0002,
"loss": 0.5427882075309753,
"mean_token_accuracy": 0.7789115309715271,
"num_tokens": 10527252.0,
"step": 646
},
{
"entropy": 0.5335679203271866,
"epoch": 2.414565826330532,
"grad_norm": 0.03578624129295349,
"learning_rate": 0.0002,
"loss": 0.5237961411476135,
"mean_token_accuracy": 0.7891885042190552,
"num_tokens": 10543508.0,
"step": 647
},
{
"entropy": 0.550647184252739,
"epoch": 2.418300653594771,
"grad_norm": 0.041548412293195724,
"learning_rate": 0.0002,
"loss": 0.5482417941093445,
"mean_token_accuracy": 0.7743094116449356,
"num_tokens": 10559883.0,
"step": 648
},
{
"entropy": 0.5099608227610588,
"epoch": 2.42203548085901,
"grad_norm": 0.035532381385564804,
"learning_rate": 0.0002,
"loss": 0.5146307349205017,
"mean_token_accuracy": 0.7876965999603271,
"num_tokens": 10576102.0,
"step": 649
},
{
"entropy": 0.5289439111948013,
"epoch": 2.425770308123249,
"grad_norm": 0.03995847702026367,
"learning_rate": 0.0002,
"loss": 0.529523491859436,
"mean_token_accuracy": 0.7849718630313873,
"num_tokens": 10592278.0,
"step": 650
},
{
"entropy": 0.5271874070167542,
"epoch": 2.429505135387488,
"grad_norm": 0.038978736847639084,
"learning_rate": 0.0002,
"loss": 0.5379216074943542,
"mean_token_accuracy": 0.7814365327358246,
"num_tokens": 10608707.0,
"step": 651
},
{
"entropy": 0.5211434736847878,
"epoch": 2.4332399626517276,
"grad_norm": 0.04277133196592331,
"learning_rate": 0.0002,
"loss": 0.5224626660346985,
"mean_token_accuracy": 0.7893835753202438,
"num_tokens": 10625209.0,
"step": 652
},
{
"entropy": 0.5353395342826843,
"epoch": 2.4369747899159666,
"grad_norm": 0.03804321959614754,
"learning_rate": 0.0002,
"loss": 0.5317578911781311,
"mean_token_accuracy": 0.7827101796865463,
"num_tokens": 10641678.0,
"step": 653
},
{
"entropy": 0.5419681817293167,
"epoch": 2.4407096171802056,
"grad_norm": 0.03237481042742729,
"learning_rate": 0.0002,
"loss": 0.5347220301628113,
"mean_token_accuracy": 0.7828710377216339,
"num_tokens": 10658020.0,
"step": 654
},
{
"entropy": 0.54988232254982,
"epoch": 2.4444444444444446,
"grad_norm": 0.0367792509496212,
"learning_rate": 0.0002,
"loss": 0.548277735710144,
"mean_token_accuracy": 0.7793003767728806,
"num_tokens": 10674273.0,
"step": 655
},
{
"entropy": 0.5270714908838272,
"epoch": 2.4481792717086837,
"grad_norm": 0.04078115150332451,
"learning_rate": 0.0002,
"loss": 0.5275436639785767,
"mean_token_accuracy": 0.7857778370380402,
"num_tokens": 10690682.0,
"step": 656
},
{
"entropy": 0.5229745805263519,
"epoch": 2.4519140989729227,
"grad_norm": 0.03635413572192192,
"learning_rate": 0.0002,
"loss": 0.5259315967559814,
"mean_token_accuracy": 0.7876160591840744,
"num_tokens": 10706935.0,
"step": 657
},
{
"entropy": 0.520149365067482,
"epoch": 2.4556489262371617,
"grad_norm": 0.04523176699876785,
"learning_rate": 0.0002,
"loss": 0.5284128189086914,
"mean_token_accuracy": 0.7826286852359772,
"num_tokens": 10723130.0,
"step": 658
},
{
"entropy": 0.5226980745792389,
"epoch": 2.4593837535014007,
"grad_norm": 0.04385685920715332,
"learning_rate": 0.0002,
"loss": 0.5277330279350281,
"mean_token_accuracy": 0.7842638790607452,
"num_tokens": 10739706.0,
"step": 659
},
{
"entropy": 0.5327855497598648,
"epoch": 2.4631185807656397,
"grad_norm": 0.03833289071917534,
"learning_rate": 0.0002,
"loss": 0.529242753982544,
"mean_token_accuracy": 0.784681499004364,
"num_tokens": 10756135.0,
"step": 660
},
{
"entropy": 0.5270693749189377,
"epoch": 2.4668534080298787,
"grad_norm": 0.04420669376850128,
"learning_rate": 0.0002,
"loss": 0.5234766602516174,
"mean_token_accuracy": 0.7881586104631424,
"num_tokens": 10772473.0,
"step": 661
},
{
"entropy": 0.5432615429162979,
"epoch": 2.4705882352941178,
"grad_norm": 0.03388570621609688,
"learning_rate": 0.0002,
"loss": 0.5459257364273071,
"mean_token_accuracy": 0.7780051380395889,
"num_tokens": 10788831.0,
"step": 662
},
{
"entropy": 0.5488771200180054,
"epoch": 2.4743230625583568,
"grad_norm": 0.04762876406311989,
"learning_rate": 0.0002,
"loss": 0.5534912943840027,
"mean_token_accuracy": 0.7749715596437454,
"num_tokens": 10805527.0,
"step": 663
},
{
"entropy": 0.5422950983047485,
"epoch": 2.478057889822596,
"grad_norm": 0.03591262549161911,
"learning_rate": 0.0002,
"loss": 0.5398073792457581,
"mean_token_accuracy": 0.779995933175087,
"num_tokens": 10821915.0,
"step": 664
},
{
"entropy": 0.567908450961113,
"epoch": 2.481792717086835,
"grad_norm": 0.04293651878833771,
"learning_rate": 0.0002,
"loss": 0.5645220875740051,
"mean_token_accuracy": 0.771768257021904,
"num_tokens": 10838601.0,
"step": 665
},
{
"entropy": 0.534419909119606,
"epoch": 2.485527544351074,
"grad_norm": 0.036424651741981506,
"learning_rate": 0.0002,
"loss": 0.5309603214263916,
"mean_token_accuracy": 0.786093220114708,
"num_tokens": 10854981.0,
"step": 666
},
{
"entropy": 0.5380399525165558,
"epoch": 2.489262371615313,
"grad_norm": 0.04585183784365654,
"learning_rate": 0.0002,
"loss": 0.5384916067123413,
"mean_token_accuracy": 0.7809207290410995,
"num_tokens": 10871328.0,
"step": 667
},
{
"entropy": 0.5118337720632553,
"epoch": 2.492997198879552,
"grad_norm": 0.03870607912540436,
"learning_rate": 0.0002,
"loss": 0.5148553252220154,
"mean_token_accuracy": 0.7906211614608765,
"num_tokens": 10887543.0,
"step": 668
},
{
"entropy": 0.539421871304512,
"epoch": 2.496732026143791,
"grad_norm": 0.04092569276690483,
"learning_rate": 0.0002,
"loss": 0.5474343299865723,
"mean_token_accuracy": 0.7810823172330856,
"num_tokens": 10904063.0,
"step": 669
},
{
"entropy": 0.5357869118452072,
"epoch": 2.50046685340803,
"grad_norm": 0.03857175633311272,
"learning_rate": 0.0002,
"loss": 0.5365599393844604,
"mean_token_accuracy": 0.7816625684499741,
"num_tokens": 10920474.0,
"step": 670
},
{
"entropy": 0.5330220460891724,
"epoch": 2.504201680672269,
"grad_norm": 0.03685252368450165,
"learning_rate": 0.0002,
"loss": 0.5331542491912842,
"mean_token_accuracy": 0.7820776700973511,
"num_tokens": 10936663.0,
"step": 671
},
{
"entropy": 0.524094969034195,
"epoch": 2.507936507936508,
"grad_norm": 0.03893151134252548,
"learning_rate": 0.0002,
"loss": 0.5277613997459412,
"mean_token_accuracy": 0.7860450148582458,
"num_tokens": 10952950.0,
"step": 672
},
{
"entropy": 0.5463172346353531,
"epoch": 2.511671335200747,
"grad_norm": 0.039967626333236694,
"learning_rate": 0.0002,
"loss": 0.5425282716751099,
"mean_token_accuracy": 0.7801816016435623,
"num_tokens": 10969412.0,
"step": 673
},
{
"entropy": 0.5239230394363403,
"epoch": 2.515406162464986,
"grad_norm": 0.046231936663389206,
"learning_rate": 0.0002,
"loss": 0.5241309404373169,
"mean_token_accuracy": 0.787441685795784,
"num_tokens": 10985869.0,
"step": 674
},
{
"entropy": 0.5359321981668472,
"epoch": 2.519140989729225,
"grad_norm": 0.040779855102300644,
"learning_rate": 0.0002,
"loss": 0.536766767501831,
"mean_token_accuracy": 0.7817385196685791,
"num_tokens": 11002074.0,
"step": 675
},
{
"entropy": 0.5319357812404633,
"epoch": 2.522875816993464,
"grad_norm": 0.03476366400718689,
"learning_rate": 0.0002,
"loss": 0.5311717391014099,
"mean_token_accuracy": 0.7856648862361908,
"num_tokens": 11018648.0,
"step": 676
},
{
"entropy": 0.5231706351041794,
"epoch": 2.526610644257703,
"grad_norm": 0.03785642236471176,
"learning_rate": 0.0002,
"loss": 0.5269960165023804,
"mean_token_accuracy": 0.7866117358207703,
"num_tokens": 11034686.0,
"step": 677
},
{
"entropy": 0.5381273478269577,
"epoch": 2.530345471521942,
"grad_norm": 0.03976747393608093,
"learning_rate": 0.0002,
"loss": 0.5381407141685486,
"mean_token_accuracy": 0.7849810570478439,
"num_tokens": 11050922.0,
"step": 678
},
{
"entropy": 0.5456480979919434,
"epoch": 2.534080298786181,
"grad_norm": 0.039225250482559204,
"learning_rate": 0.0002,
"loss": 0.5425232648849487,
"mean_token_accuracy": 0.7785615175962448,
"num_tokens": 11067148.0,
"step": 679
},
{
"entropy": 0.5407412797212601,
"epoch": 2.53781512605042,
"grad_norm": 0.03705086559057236,
"learning_rate": 0.0002,
"loss": 0.536932110786438,
"mean_token_accuracy": 0.7821008861064911,
"num_tokens": 11083363.0,
"step": 680
},
{
"entropy": 0.5263440012931824,
"epoch": 2.541549953314659,
"grad_norm": 0.0353594608604908,
"learning_rate": 0.0002,
"loss": 0.5256474018096924,
"mean_token_accuracy": 0.7849348187446594,
"num_tokens": 11099785.0,
"step": 681
},
{
"entropy": 0.5354757159948349,
"epoch": 2.545284780578898,
"grad_norm": 0.04532964155077934,
"learning_rate": 0.0002,
"loss": 0.5450004935264587,
"mean_token_accuracy": 0.7807293385267258,
"num_tokens": 11115892.0,
"step": 682
},
{
"entropy": 0.5281579941511154,
"epoch": 2.549019607843137,
"grad_norm": 0.03604253754019737,
"learning_rate": 0.0002,
"loss": 0.5311046838760376,
"mean_token_accuracy": 0.7845126688480377,
"num_tokens": 11132189.0,
"step": 683
},
{
"entropy": 0.5354526489973068,
"epoch": 2.552754435107376,
"grad_norm": 0.03747657313942909,
"learning_rate": 0.0002,
"loss": 0.5361717343330383,
"mean_token_accuracy": 0.7801252007484436,
"num_tokens": 11148681.0,
"step": 684
},
{
"entropy": 0.5386267453432083,
"epoch": 2.556489262371615,
"grad_norm": 0.037825409322977066,
"learning_rate": 0.0002,
"loss": 0.5390512347221375,
"mean_token_accuracy": 0.7815877050161362,
"num_tokens": 11165013.0,
"step": 685
},
{
"entropy": 0.530585527420044,
"epoch": 2.560224089635854,
"grad_norm": 0.03970746695995331,
"learning_rate": 0.0002,
"loss": 0.5291422009468079,
"mean_token_accuracy": 0.7839111536741257,
"num_tokens": 11181301.0,
"step": 686
},
{
"entropy": 0.5292850136756897,
"epoch": 2.563958916900093,
"grad_norm": 0.03387298434972763,
"learning_rate": 0.0002,
"loss": 0.5319269299507141,
"mean_token_accuracy": 0.7840193659067154,
"num_tokens": 11197537.0,
"step": 687
},
{
"entropy": 0.5399095267057419,
"epoch": 2.567693744164332,
"grad_norm": 0.038681600242853165,
"learning_rate": 0.0002,
"loss": 0.5435532331466675,
"mean_token_accuracy": 0.7806709408760071,
"num_tokens": 11213896.0,
"step": 688
},
{
"entropy": 0.5498056858778,
"epoch": 2.571428571428571,
"grad_norm": 0.03758297860622406,
"learning_rate": 0.0002,
"loss": 0.5467256307601929,
"mean_token_accuracy": 0.7782751470804214,
"num_tokens": 11230383.0,
"step": 689
},
{
"entropy": 0.5476771891117096,
"epoch": 2.57516339869281,
"grad_norm": 0.03605665639042854,
"learning_rate": 0.0002,
"loss": 0.541588544845581,
"mean_token_accuracy": 0.7791445404291153,
"num_tokens": 11246749.0,
"step": 690
},
{
"entropy": 0.542407214641571,
"epoch": 2.5788982259570497,
"grad_norm": 0.04616822302341461,
"learning_rate": 0.0002,
"loss": 0.535969614982605,
"mean_token_accuracy": 0.7812883108854294,
"num_tokens": 11263093.0,
"step": 691
},
{
"entropy": 0.5215721130371094,
"epoch": 2.5826330532212887,
"grad_norm": 0.040278688073158264,
"learning_rate": 0.0002,
"loss": 0.5306443572044373,
"mean_token_accuracy": 0.783096119761467,
"num_tokens": 11279295.0,
"step": 692
},
{
"entropy": 0.5300876200199127,
"epoch": 2.5863678804855277,
"grad_norm": 0.04465034604072571,
"learning_rate": 0.0002,
"loss": 0.5408331751823425,
"mean_token_accuracy": 0.781616821885109,
"num_tokens": 11295488.0,
"step": 693
},
{
"entropy": 0.529060423374176,
"epoch": 2.5901027077497667,
"grad_norm": 0.03697149083018303,
"learning_rate": 0.0002,
"loss": 0.5315713286399841,
"mean_token_accuracy": 0.784434586763382,
"num_tokens": 11311910.0,
"step": 694
},
{
"entropy": 0.5421274900436401,
"epoch": 2.5938375350140057,
"grad_norm": 0.03769063949584961,
"learning_rate": 0.0002,
"loss": 0.5342295169830322,
"mean_token_accuracy": 0.7821343541145325,
"num_tokens": 11328227.0,
"step": 695
},
{
"entropy": 0.5521349608898163,
"epoch": 2.5975723622782447,
"grad_norm": 0.037369053810834885,
"learning_rate": 0.0002,
"loss": 0.5406404733657837,
"mean_token_accuracy": 0.7816728502511978,
"num_tokens": 11344754.0,
"step": 696
},
{
"entropy": 0.5276040434837341,
"epoch": 2.6013071895424837,
"grad_norm": 0.04295807331800461,
"learning_rate": 0.0002,
"loss": 0.531209409236908,
"mean_token_accuracy": 0.781694307923317,
"num_tokens": 11360846.0,
"step": 697
},
{
"entropy": 0.5329545885324478,
"epoch": 2.6050420168067228,
"grad_norm": 0.04680144414305687,
"learning_rate": 0.0002,
"loss": 0.5448673963546753,
"mean_token_accuracy": 0.7803032696247101,
"num_tokens": 11376984.0,
"step": 698
},
{
"entropy": 0.5330372750759125,
"epoch": 2.6087768440709618,
"grad_norm": 0.038128506392240524,
"learning_rate": 0.0002,
"loss": 0.5345317125320435,
"mean_token_accuracy": 0.7848279774188995,
"num_tokens": 11393192.0,
"step": 699
},
{
"entropy": 0.5620173513889313,
"epoch": 2.612511671335201,
"grad_norm": 0.0405871607363224,
"learning_rate": 0.0002,
"loss": 0.5558884143829346,
"mean_token_accuracy": 0.7717028856277466,
"num_tokens": 11409571.0,
"step": 700
},
{
"entropy": 0.5401062965393066,
"epoch": 2.61624649859944,
"grad_norm": 0.033952489495277405,
"learning_rate": 0.0002,
"loss": 0.5324668884277344,
"mean_token_accuracy": 0.7836252152919769,
"num_tokens": 11426157.0,
"step": 701
},
{
"entropy": 0.5401272624731064,
"epoch": 2.619981325863679,
"grad_norm": 0.03486888110637665,
"learning_rate": 0.0002,
"loss": 0.5405600666999817,
"mean_token_accuracy": 0.780670240521431,
"num_tokens": 11442706.0,
"step": 702
},
{
"entropy": 0.5286990851163864,
"epoch": 2.623716153127918,
"grad_norm": 0.03971569985151291,
"learning_rate": 0.0002,
"loss": 0.5301419496536255,
"mean_token_accuracy": 0.7845329642295837,
"num_tokens": 11459059.0,
"step": 703
},
{
"entropy": 0.5408699810504913,
"epoch": 2.627450980392157,
"grad_norm": 0.03566860780119896,
"learning_rate": 0.0002,
"loss": 0.5422340631484985,
"mean_token_accuracy": 0.7786179780960083,
"num_tokens": 11475473.0,
"step": 704
},
{
"entropy": 0.5306770950555801,
"epoch": 2.631185807656396,
"grad_norm": 0.038531865924596786,
"learning_rate": 0.0002,
"loss": 0.5311087965965271,
"mean_token_accuracy": 0.784186452627182,
"num_tokens": 11491765.0,
"step": 705
},
{
"entropy": 0.5391299277544022,
"epoch": 2.634920634920635,
"grad_norm": 0.036147549748420715,
"learning_rate": 0.0002,
"loss": 0.5403758883476257,
"mean_token_accuracy": 0.7817845791578293,
"num_tokens": 11508291.0,
"step": 706
},
{
"entropy": 0.5316940769553185,
"epoch": 2.638655462184874,
"grad_norm": 0.036513980478048325,
"learning_rate": 0.0002,
"loss": 0.5340716242790222,
"mean_token_accuracy": 0.7847382575273514,
"num_tokens": 11524811.0,
"step": 707
},
{
"entropy": 0.5237598121166229,
"epoch": 2.642390289449113,
"grad_norm": 0.03360476344823837,
"learning_rate": 0.0002,
"loss": 0.5258880257606506,
"mean_token_accuracy": 0.7865117788314819,
"num_tokens": 11541335.0,
"step": 708
},
{
"entropy": 0.5325336754322052,
"epoch": 2.646125116713352,
"grad_norm": 0.03501066192984581,
"learning_rate": 0.0002,
"loss": 0.5358341336250305,
"mean_token_accuracy": 0.7841547876596451,
"num_tokens": 11557859.0,
"step": 709
},
{
"entropy": 0.5220260694622993,
"epoch": 2.649859943977591,
"grad_norm": 0.038072340190410614,
"learning_rate": 0.0002,
"loss": 0.5222914814949036,
"mean_token_accuracy": 0.7870404571294785,
"num_tokens": 11574116.0,
"step": 710
},
{
"entropy": 0.5257419422268867,
"epoch": 2.65359477124183,
"grad_norm": 0.03713792935013771,
"learning_rate": 0.0002,
"loss": 0.5267120599746704,
"mean_token_accuracy": 0.7852788418531418,
"num_tokens": 11590295.0,
"step": 711
},
{
"entropy": 0.5383759438991547,
"epoch": 2.657329598506069,
"grad_norm": 0.04603256285190582,
"learning_rate": 0.0002,
"loss": 0.5421494841575623,
"mean_token_accuracy": 0.781136080622673,
"num_tokens": 11606581.0,
"step": 712
},
{
"entropy": 0.5336297005414963,
"epoch": 2.661064425770308,
"grad_norm": 0.03931435942649841,
"learning_rate": 0.0002,
"loss": 0.5313882231712341,
"mean_token_accuracy": 0.7825400978326797,
"num_tokens": 11622793.0,
"step": 713
},
{
"entropy": 0.5316190719604492,
"epoch": 2.664799253034547,
"grad_norm": 0.03564710542559624,
"learning_rate": 0.0002,
"loss": 0.530137836933136,
"mean_token_accuracy": 0.7861842215061188,
"num_tokens": 11638909.0,
"step": 714
},
{
"entropy": 0.529007188975811,
"epoch": 2.668534080298786,
"grad_norm": 0.03671964257955551,
"learning_rate": 0.0002,
"loss": 0.5294506549835205,
"mean_token_accuracy": 0.7843856066465378,
"num_tokens": 11655048.0,
"step": 715
},
{
"entropy": 0.5391807407140732,
"epoch": 2.6722689075630255,
"grad_norm": 0.043020427227020264,
"learning_rate": 0.0002,
"loss": 0.5463923215866089,
"mean_token_accuracy": 0.7798423320055008,
"num_tokens": 11671616.0,
"step": 716
},
{
"entropy": 0.5369218289852142,
"epoch": 2.6760037348272645,
"grad_norm": 0.04039768502116203,
"learning_rate": 0.0002,
"loss": 0.5428373217582703,
"mean_token_accuracy": 0.7810155898332596,
"num_tokens": 11687981.0,
"step": 717
},
{
"entropy": 0.5410373359918594,
"epoch": 2.6797385620915035,
"grad_norm": 0.032212115824222565,
"learning_rate": 0.0002,
"loss": 0.538726806640625,
"mean_token_accuracy": 0.7823937833309174,
"num_tokens": 11704497.0,
"step": 718
},
{
"entropy": 0.5408433228731155,
"epoch": 2.6834733893557425,
"grad_norm": 0.04190416634082794,
"learning_rate": 0.0002,
"loss": 0.5312804579734802,
"mean_token_accuracy": 0.7858607023954391,
"num_tokens": 11720759.0,
"step": 719
},
{
"entropy": 0.52065759152174,
"epoch": 2.6872082166199815,
"grad_norm": 0.03749416023492813,
"learning_rate": 0.0002,
"loss": 0.5172442197799683,
"mean_token_accuracy": 0.7908001989126205,
"num_tokens": 11736897.0,
"step": 720
},
{
"entropy": 0.5223864614963531,
"epoch": 2.6909430438842206,
"grad_norm": 0.03889421746134758,
"learning_rate": 0.0002,
"loss": 0.5262103080749512,
"mean_token_accuracy": 0.7863954603672028,
"num_tokens": 11753026.0,
"step": 721
},
{
"entropy": 0.5417105704545975,
"epoch": 2.6946778711484596,
"grad_norm": 0.03900585323572159,
"learning_rate": 0.0002,
"loss": 0.548478901386261,
"mean_token_accuracy": 0.7769544124603271,
"num_tokens": 11769364.0,
"step": 722
},
{
"entropy": 0.5348965376615524,
"epoch": 2.6984126984126986,
"grad_norm": 0.040531598031520844,
"learning_rate": 0.0002,
"loss": 0.5366338491439819,
"mean_token_accuracy": 0.7824574261903763,
"num_tokens": 11785662.0,
"step": 723
},
{
"entropy": 0.5472202748060226,
"epoch": 2.7021475256769376,
"grad_norm": 0.03544607013463974,
"learning_rate": 0.0002,
"loss": 0.546108067035675,
"mean_token_accuracy": 0.7778937071561813,
"num_tokens": 11802091.0,
"step": 724
},
{
"entropy": 0.5445298254489899,
"epoch": 2.7058823529411766,
"grad_norm": 0.045996710658073425,
"learning_rate": 0.0002,
"loss": 0.5458025336265564,
"mean_token_accuracy": 0.7784214168787003,
"num_tokens": 11818307.0,
"step": 725
},
{
"entropy": 0.5437731146812439,
"epoch": 2.7096171802054156,
"grad_norm": 0.040692199021577835,
"learning_rate": 0.0002,
"loss": 0.5425392389297485,
"mean_token_accuracy": 0.7800037413835526,
"num_tokens": 11834733.0,
"step": 726
},
{
"entropy": 0.5586313903331757,
"epoch": 2.7133520074696547,
"grad_norm": 0.05102645978331566,
"learning_rate": 0.0002,
"loss": 0.5646232962608337,
"mean_token_accuracy": 0.7713905870914459,
"num_tokens": 11851346.0,
"step": 727
},
{
"entropy": 0.5276175439357758,
"epoch": 2.7170868347338937,
"grad_norm": 0.04199473559856415,
"learning_rate": 0.0002,
"loss": 0.5330867767333984,
"mean_token_accuracy": 0.7841922342777252,
"num_tokens": 11867709.0,
"step": 728
},
{
"entropy": 0.5365078300237656,
"epoch": 2.7208216619981327,
"grad_norm": 0.038084954023361206,
"learning_rate": 0.0002,
"loss": 0.5328811407089233,
"mean_token_accuracy": 0.7830130755901337,
"num_tokens": 11884172.0,
"step": 729
},
{
"entropy": 0.5306914746761322,
"epoch": 2.7245564892623717,
"grad_norm": 0.04009576886892319,
"learning_rate": 0.0002,
"loss": 0.5335056185722351,
"mean_token_accuracy": 0.784161165356636,
"num_tokens": 11900524.0,
"step": 730
},
{
"entropy": 0.5325679033994675,
"epoch": 2.7282913165266107,
"grad_norm": 0.0398661270737648,
"learning_rate": 0.0002,
"loss": 0.5311678051948547,
"mean_token_accuracy": 0.7866542786359787,
"num_tokens": 11916696.0,
"step": 731
},
{
"entropy": 0.5234319120645523,
"epoch": 2.7320261437908497,
"grad_norm": 0.03887765109539032,
"learning_rate": 0.0002,
"loss": 0.5243536233901978,
"mean_token_accuracy": 0.786685973405838,
"num_tokens": 11933375.0,
"step": 732
},
{
"entropy": 0.5323622822761536,
"epoch": 2.7357609710550888,
"grad_norm": 0.041390158236026764,
"learning_rate": 0.0002,
"loss": 0.5382110476493835,
"mean_token_accuracy": 0.7813025563955307,
"num_tokens": 11949641.0,
"step": 733
},
{
"entropy": 0.5282771736383438,
"epoch": 2.7394957983193278,
"grad_norm": 0.03821795806288719,
"learning_rate": 0.0002,
"loss": 0.5237923860549927,
"mean_token_accuracy": 0.7858958840370178,
"num_tokens": 11965904.0,
"step": 734
},
{
"entropy": 0.5336133688688278,
"epoch": 2.743230625583567,
"grad_norm": 0.040790773928165436,
"learning_rate": 0.0002,
"loss": 0.5322080850601196,
"mean_token_accuracy": 0.7814221978187561,
"num_tokens": 11982242.0,
"step": 735
},
{
"entropy": 0.5447276085615158,
"epoch": 2.746965452847806,
"grad_norm": 0.03733038902282715,
"learning_rate": 0.0002,
"loss": 0.5435236096382141,
"mean_token_accuracy": 0.7783806473016739,
"num_tokens": 11998525.0,
"step": 736
},
{
"entropy": 0.5370974391698837,
"epoch": 2.750700280112045,
"grad_norm": 0.035691265016794205,
"learning_rate": 0.0002,
"loss": 0.5391957759857178,
"mean_token_accuracy": 0.7787430435419083,
"num_tokens": 12014726.0,
"step": 737
},
{
"entropy": 0.5190877616405487,
"epoch": 2.754435107376284,
"grad_norm": 0.037242453545331955,
"learning_rate": 0.0002,
"loss": 0.5239222645759583,
"mean_token_accuracy": 0.7867171913385391,
"num_tokens": 12030648.0,
"step": 738
},
{
"entropy": 0.5201060324907303,
"epoch": 2.758169934640523,
"grad_norm": 0.03840528428554535,
"learning_rate": 0.0002,
"loss": 0.5264686942100525,
"mean_token_accuracy": 0.7854082137346268,
"num_tokens": 12046824.0,
"step": 739
},
{
"entropy": 0.5208890736103058,
"epoch": 2.761904761904762,
"grad_norm": 0.038443028926849365,
"learning_rate": 0.0002,
"loss": 0.5207111239433289,
"mean_token_accuracy": 0.7860049307346344,
"num_tokens": 12063182.0,
"step": 740
},
{
"entropy": 0.5337280184030533,
"epoch": 2.765639589169001,
"grad_norm": 0.0408535934984684,
"learning_rate": 0.0002,
"loss": 0.5295891165733337,
"mean_token_accuracy": 0.7857932895421982,
"num_tokens": 12079411.0,
"step": 741
},
{
"entropy": 0.5374506562948227,
"epoch": 2.76937441643324,
"grad_norm": 0.04354558512568474,
"learning_rate": 0.0002,
"loss": 0.5414345860481262,
"mean_token_accuracy": 0.7807870209217072,
"num_tokens": 12095874.0,
"step": 742
},
{
"entropy": 0.5360343009233475,
"epoch": 2.773109243697479,
"grad_norm": 0.03928976133465767,
"learning_rate": 0.0002,
"loss": 0.5380703210830688,
"mean_token_accuracy": 0.7798075079917908,
"num_tokens": 12112215.0,
"step": 743
},
{
"entropy": 0.5264292061328888,
"epoch": 2.776844070961718,
"grad_norm": 0.03775021806359291,
"learning_rate": 0.0002,
"loss": 0.5281617045402527,
"mean_token_accuracy": 0.7842919081449509,
"num_tokens": 12128361.0,
"step": 744
},
{
"entropy": 0.5419831871986389,
"epoch": 2.780578898225957,
"grad_norm": 0.032331038266420364,
"learning_rate": 0.0002,
"loss": 0.5362944602966309,
"mean_token_accuracy": 0.7816326916217804,
"num_tokens": 12144755.0,
"step": 745
},
{
"entropy": 0.5174460113048553,
"epoch": 2.784313725490196,
"grad_norm": 0.03798742592334747,
"learning_rate": 0.0002,
"loss": 0.515007495880127,
"mean_token_accuracy": 0.7882062345743179,
"num_tokens": 12161034.0,
"step": 746
},
{
"entropy": 0.5355328992009163,
"epoch": 2.788048552754435,
"grad_norm": 0.036557331681251526,
"learning_rate": 0.0002,
"loss": 0.5344611406326294,
"mean_token_accuracy": 0.7847500294446945,
"num_tokens": 12177479.0,
"step": 747
},
{
"entropy": 0.538584902882576,
"epoch": 2.791783380018674,
"grad_norm": 0.039520300924777985,
"learning_rate": 0.0002,
"loss": 0.5427792072296143,
"mean_token_accuracy": 0.7786386609077454,
"num_tokens": 12193830.0,
"step": 748
},
{
"entropy": 0.51973095536232,
"epoch": 2.795518207282913,
"grad_norm": 0.04126165434718132,
"learning_rate": 0.0002,
"loss": 0.5279180407524109,
"mean_token_accuracy": 0.784518226981163,
"num_tokens": 12210022.0,
"step": 749
},
{
"entropy": 0.5385647118091583,
"epoch": 2.799253034547152,
"grad_norm": 0.03742329403758049,
"learning_rate": 0.0002,
"loss": 0.5358390808105469,
"mean_token_accuracy": 0.7814119607210159,
"num_tokens": 12226184.0,
"step": 750
},
{
"entropy": 0.5483904033899307,
"epoch": 2.802987861811391,
"grad_norm": 0.03444087877869606,
"learning_rate": 0.0002,
"loss": 0.5442800521850586,
"mean_token_accuracy": 0.7782953381538391,
"num_tokens": 12242564.0,
"step": 751
},
{
"entropy": 0.5447859466075897,
"epoch": 2.80672268907563,
"grad_norm": 0.037425972521305084,
"learning_rate": 0.0002,
"loss": 0.5376838445663452,
"mean_token_accuracy": 0.7805659919977188,
"num_tokens": 12259077.0,
"step": 752
},
{
"entropy": 0.526421070098877,
"epoch": 2.810457516339869,
"grad_norm": 0.039544545114040375,
"learning_rate": 0.0002,
"loss": 0.5272819399833679,
"mean_token_accuracy": 0.7836880385875702,
"num_tokens": 12275297.0,
"step": 753
},
{
"entropy": 0.538783460855484,
"epoch": 2.814192343604108,
"grad_norm": 0.035788971930742264,
"learning_rate": 0.0002,
"loss": 0.5417999625205994,
"mean_token_accuracy": 0.7819748818874359,
"num_tokens": 12291643.0,
"step": 754
},
{
"entropy": 0.5367716252803802,
"epoch": 2.817927170868347,
"grad_norm": 0.040753189474344254,
"learning_rate": 0.0002,
"loss": 0.5376288294792175,
"mean_token_accuracy": 0.7829637825489044,
"num_tokens": 12307987.0,
"step": 755
},
{
"entropy": 0.5418078452348709,
"epoch": 2.821661998132586,
"grad_norm": 0.036726806312799454,
"learning_rate": 0.0002,
"loss": 0.5469898581504822,
"mean_token_accuracy": 0.7801835685968399,
"num_tokens": 12324503.0,
"step": 756
},
{
"entropy": 0.525896355509758,
"epoch": 2.825396825396825,
"grad_norm": 0.034559980034828186,
"learning_rate": 0.0002,
"loss": 0.5265108942985535,
"mean_token_accuracy": 0.7867930829524994,
"num_tokens": 12340881.0,
"step": 757
},
{
"entropy": 0.5369487851858139,
"epoch": 2.8291316526610646,
"grad_norm": 0.03595944494009018,
"learning_rate": 0.0002,
"loss": 0.5396771430969238,
"mean_token_accuracy": 0.7813677042722702,
"num_tokens": 12357352.0,
"step": 758
},
{
"entropy": 0.5467210859060287,
"epoch": 2.8328664799253036,
"grad_norm": 0.03524104505777359,
"learning_rate": 0.0002,
"loss": 0.544916570186615,
"mean_token_accuracy": 0.7771721184253693,
"num_tokens": 12373526.0,
"step": 759
},
{
"entropy": 0.5246351063251495,
"epoch": 2.8366013071895426,
"grad_norm": 0.036806508898735046,
"learning_rate": 0.0002,
"loss": 0.5221924781799316,
"mean_token_accuracy": 0.7871624380350113,
"num_tokens": 12389771.0,
"step": 760
},
{
"entropy": 0.530710369348526,
"epoch": 2.8403361344537816,
"grad_norm": 0.04332499951124191,
"learning_rate": 0.0002,
"loss": 0.5322965383529663,
"mean_token_accuracy": 0.7832685261964798,
"num_tokens": 12406028.0,
"step": 761
},
{
"entropy": 0.5254833996295929,
"epoch": 2.8440709617180207,
"grad_norm": 0.038304176181554794,
"learning_rate": 0.0002,
"loss": 0.5253804922103882,
"mean_token_accuracy": 0.7873952239751816,
"num_tokens": 12422639.0,
"step": 762
},
{
"entropy": 0.5236704498529434,
"epoch": 2.8478057889822597,
"grad_norm": 0.03660830482840538,
"learning_rate": 0.0002,
"loss": 0.5286169052124023,
"mean_token_accuracy": 0.7816056311130524,
"num_tokens": 12438922.0,
"step": 763
},
{
"entropy": 0.5321139246225357,
"epoch": 2.8515406162464987,
"grad_norm": 0.04276243969798088,
"learning_rate": 0.0002,
"loss": 0.5400298237800598,
"mean_token_accuracy": 0.7802720963954926,
"num_tokens": 12455234.0,
"step": 764
},
{
"entropy": 0.5383250862360001,
"epoch": 2.8552754435107377,
"grad_norm": 0.04291578382253647,
"learning_rate": 0.0002,
"loss": 0.5375620722770691,
"mean_token_accuracy": 0.7810464948415756,
"num_tokens": 12471352.0,
"step": 765
},
{
"entropy": 0.5423205345869064,
"epoch": 2.8590102707749767,
"grad_norm": 0.04575496166944504,
"learning_rate": 0.0002,
"loss": 0.5404216647148132,
"mean_token_accuracy": 0.7788951247930527,
"num_tokens": 12487810.0,
"step": 766
},
{
"entropy": 0.5412723869085312,
"epoch": 2.8627450980392157,
"grad_norm": 0.03895537182688713,
"learning_rate": 0.0002,
"loss": 0.5416159629821777,
"mean_token_accuracy": 0.7791194468736649,
"num_tokens": 12504261.0,
"step": 767
},
{
"entropy": 0.551712304353714,
"epoch": 2.8664799253034547,
"grad_norm": 0.04248276725411415,
"learning_rate": 0.0002,
"loss": 0.5512599945068359,
"mean_token_accuracy": 0.7787346094846725,
"num_tokens": 12520594.0,
"step": 768
},
{
"entropy": 0.5365375429391861,
"epoch": 2.8702147525676938,
"grad_norm": 0.0429382361471653,
"learning_rate": 0.0002,
"loss": 0.5369971990585327,
"mean_token_accuracy": 0.7795698195695877,
"num_tokens": 12537097.0,
"step": 769
},
{
"entropy": 0.5311344265937805,
"epoch": 2.8739495798319328,
"grad_norm": 0.03710220381617546,
"learning_rate": 0.0002,
"loss": 0.5327049493789673,
"mean_token_accuracy": 0.784042477607727,
"num_tokens": 12553319.0,
"step": 770
},
{
"entropy": 0.5425883233547211,
"epoch": 2.877684407096172,
"grad_norm": 0.04352175444364548,
"learning_rate": 0.0002,
"loss": 0.5457234382629395,
"mean_token_accuracy": 0.7795119434595108,
"num_tokens": 12569370.0,
"step": 771
},
{
"entropy": 0.5384223312139511,
"epoch": 2.881419234360411,
"grad_norm": 0.046248357743024826,
"learning_rate": 0.0002,
"loss": 0.5449962615966797,
"mean_token_accuracy": 0.7777050882577896,
"num_tokens": 12585550.0,
"step": 772
},
{
"entropy": 0.5304270684719086,
"epoch": 2.88515406162465,
"grad_norm": 0.03803584724664688,
"learning_rate": 0.0002,
"loss": 0.5308764576911926,
"mean_token_accuracy": 0.7852406352758408,
"num_tokens": 12601869.0,
"step": 773
},
{
"entropy": 0.5238187685608864,
"epoch": 2.888888888888889,
"grad_norm": 0.04374956712126732,
"learning_rate": 0.0002,
"loss": 0.5296017527580261,
"mean_token_accuracy": 0.7867107540369034,
"num_tokens": 12618133.0,
"step": 774
},
{
"entropy": 0.545166626572609,
"epoch": 2.892623716153128,
"grad_norm": 0.04235200583934784,
"learning_rate": 0.0002,
"loss": 0.5444045066833496,
"mean_token_accuracy": 0.7811264097690582,
"num_tokens": 12634590.0,
"step": 775
},
{
"entropy": 0.552961677312851,
"epoch": 2.896358543417367,
"grad_norm": 0.04033121094107628,
"learning_rate": 0.0002,
"loss": 0.5423647165298462,
"mean_token_accuracy": 0.7789802700281143,
"num_tokens": 12650990.0,
"step": 776
},
{
"entropy": 0.5362664610147476,
"epoch": 2.900093370681606,
"grad_norm": 0.039799049496650696,
"learning_rate": 0.0002,
"loss": 0.5340068340301514,
"mean_token_accuracy": 0.7801271975040436,
"num_tokens": 12667374.0,
"step": 777
},
{
"entropy": 0.540292888879776,
"epoch": 2.903828197945845,
"grad_norm": 0.04687785729765892,
"learning_rate": 0.0002,
"loss": 0.5417227149009705,
"mean_token_accuracy": 0.7800564914941788,
"num_tokens": 12683778.0,
"step": 778
},
{
"entropy": 0.5580530762672424,
"epoch": 2.907563025210084,
"grad_norm": 0.04104934632778168,
"learning_rate": 0.0002,
"loss": 0.553903341293335,
"mean_token_accuracy": 0.7754019796848297,
"num_tokens": 12700259.0,
"step": 779
},
{
"entropy": 0.5188224613666534,
"epoch": 2.911297852474323,
"grad_norm": 0.04876643791794777,
"learning_rate": 0.0002,
"loss": 0.525776207447052,
"mean_token_accuracy": 0.7853571325540543,
"num_tokens": 12716566.0,
"step": 780
},
{
"entropy": 0.5420665293931961,
"epoch": 2.915032679738562,
"grad_norm": 0.04760121926665306,
"learning_rate": 0.0002,
"loss": 0.5495279431343079,
"mean_token_accuracy": 0.7769062519073486,
"num_tokens": 12732949.0,
"step": 781
},
{
"entropy": 0.5393791049718857,
"epoch": 2.918767507002801,
"grad_norm": 0.0337008535861969,
"learning_rate": 0.0002,
"loss": 0.5375462174415588,
"mean_token_accuracy": 0.7824095785617828,
"num_tokens": 12749208.0,
"step": 782
},
{
"entropy": 0.5315912365913391,
"epoch": 2.9225023342670404,
"grad_norm": 0.04428756982088089,
"learning_rate": 0.0002,
"loss": 0.5206541419029236,
"mean_token_accuracy": 0.7908456176519394,
"num_tokens": 12765331.0,
"step": 783
},
{
"entropy": 0.5318206250667572,
"epoch": 2.9262371615312794,
"grad_norm": 0.04391348361968994,
"learning_rate": 0.0002,
"loss": 0.5263054370880127,
"mean_token_accuracy": 0.7842861711978912,
"num_tokens": 12781575.0,
"step": 784
},
{
"entropy": 0.5414671450853348,
"epoch": 2.9299719887955185,
"grad_norm": 0.03392143175005913,
"learning_rate": 0.0002,
"loss": 0.5417372584342957,
"mean_token_accuracy": 0.779655933380127,
"num_tokens": 12797804.0,
"step": 785
},
{
"entropy": 0.5150401219725609,
"epoch": 2.9337068160597575,
"grad_norm": 0.04989241063594818,
"learning_rate": 0.0002,
"loss": 0.5268764495849609,
"mean_token_accuracy": 0.7849253863096237,
"num_tokens": 12814387.0,
"step": 786
},
{
"entropy": 0.5104701817035675,
"epoch": 2.9374416433239965,
"grad_norm": 0.04267291724681854,
"learning_rate": 0.0002,
"loss": 0.5144373178482056,
"mean_token_accuracy": 0.7921061366796494,
"num_tokens": 12830547.0,
"step": 787
},
{
"entropy": 0.5301306545734406,
"epoch": 2.9411764705882355,
"grad_norm": 0.041861243546009064,
"learning_rate": 0.0002,
"loss": 0.5351182818412781,
"mean_token_accuracy": 0.7849584370851517,
"num_tokens": 12846796.0,
"step": 788
},
{
"entropy": 0.5566616058349609,
"epoch": 2.9449112978524745,
"grad_norm": 0.04726849123835564,
"learning_rate": 0.0002,
"loss": 0.5562955737113953,
"mean_token_accuracy": 0.7750595211982727,
"num_tokens": 12863231.0,
"step": 789
},
{
"entropy": 0.5550259649753571,
"epoch": 2.9486461251167135,
"grad_norm": 0.04144451022148132,
"learning_rate": 0.0002,
"loss": 0.5501708388328552,
"mean_token_accuracy": 0.7760492265224457,
"num_tokens": 12879599.0,
"step": 790
},
{
"entropy": 0.5439048856496811,
"epoch": 2.9523809523809526,
"grad_norm": 0.038411688059568405,
"learning_rate": 0.0002,
"loss": 0.5328619480133057,
"mean_token_accuracy": 0.7869621217250824,
"num_tokens": 12895954.0,
"step": 791
},
{
"entropy": 0.5426651537418365,
"epoch": 2.9561157796451916,
"grad_norm": 0.035909172147512436,
"learning_rate": 0.0002,
"loss": 0.5376070141792297,
"mean_token_accuracy": 0.7810229063034058,
"num_tokens": 12912468.0,
"step": 792
},
{
"entropy": 0.5385068506002426,
"epoch": 2.9598506069094306,
"grad_norm": 0.04422811418771744,
"learning_rate": 0.0002,
"loss": 0.5405643582344055,
"mean_token_accuracy": 0.7827010452747345,
"num_tokens": 12929047.0,
"step": 793
},
{
"entropy": 0.5246873497962952,
"epoch": 2.9635854341736696,
"grad_norm": 0.042685672640800476,
"learning_rate": 0.0002,
"loss": 0.537744402885437,
"mean_token_accuracy": 0.7845292538404465,
"num_tokens": 12945498.0,
"step": 794
},
{
"entropy": 0.534453883767128,
"epoch": 2.9673202614379086,
"grad_norm": 0.04630210995674133,
"learning_rate": 0.0002,
"loss": 0.5448824763298035,
"mean_token_accuracy": 0.7790633589029312,
"num_tokens": 12961911.0,
"step": 795
},
{
"entropy": 0.551120862364769,
"epoch": 2.9710550887021476,
"grad_norm": 0.038833893835544586,
"learning_rate": 0.0002,
"loss": 0.5517142415046692,
"mean_token_accuracy": 0.7771248668432236,
"num_tokens": 12978275.0,
"step": 796
},
{
"entropy": 0.540284737944603,
"epoch": 2.9747899159663866,
"grad_norm": 0.034402430057525635,
"learning_rate": 0.0002,
"loss": 0.5354663133621216,
"mean_token_accuracy": 0.7817137837409973,
"num_tokens": 12994610.0,
"step": 797
},
{
"entropy": 0.5466310381889343,
"epoch": 2.9785247432306257,
"grad_norm": 0.07181618362665176,
"learning_rate": 0.0002,
"loss": 0.5540565848350525,
"mean_token_accuracy": 0.7755098789930344,
"num_tokens": 13011180.0,
"step": 798
},
{
"entropy": 0.5366263538599014,
"epoch": 2.9822595704948647,
"grad_norm": 0.038452569395303726,
"learning_rate": 0.0002,
"loss": 0.5375447869300842,
"mean_token_accuracy": 0.7817091047763824,
"num_tokens": 13027553.0,
"step": 799
},
{
"entropy": 0.5117043852806091,
"epoch": 2.9859943977591037,
"grad_norm": 0.040419358760118484,
"learning_rate": 0.0002,
"loss": 0.5115300416946411,
"mean_token_accuracy": 0.7910782992839813,
"num_tokens": 13043466.0,
"step": 800
},
{
"entropy": 0.5549824833869934,
"epoch": 2.9897292250233427,
"grad_norm": 0.04015415534377098,
"learning_rate": 0.0002,
"loss": 0.5516586303710938,
"mean_token_accuracy": 0.7774178683757782,
"num_tokens": 13059980.0,
"step": 801
},
{
"entropy": 0.5470731258392334,
"epoch": 2.9934640522875817,
"grad_norm": 0.03732411563396454,
"learning_rate": 0.0002,
"loss": 0.5440268516540527,
"mean_token_accuracy": 0.7784831672906876,
"num_tokens": 13076305.0,
"step": 802
},
{
"entropy": 0.5496807992458344,
"epoch": 2.9971988795518207,
"grad_norm": 0.042060188949108124,
"learning_rate": 0.0002,
"loss": 0.5516492128372192,
"mean_token_accuracy": 0.7782593071460724,
"num_tokens": 13092596.0,
"step": 803
},
{
"entropy": 0.5623628298441569,
"epoch": 3.0,
"grad_norm": 0.04183833301067352,
"learning_rate": 0.0002,
"loss": 0.5470706820487976,
"mean_token_accuracy": 0.7766743898391724,
"num_tokens": 13094419.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2222761209723617e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}