sublim-phase4-combo-07 / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
8cb8c8f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1245547831058502,
"epoch": 0.0037313432835820895,
"grad_norm": 1.6273682117462158,
"learning_rate": 0.0002,
"loss": 2.482689619064331,
"mean_token_accuracy": 0.5370704382658005,
"num_tokens": 16322.0,
"step": 1
},
{
"entropy": 1.2366806268692017,
"epoch": 0.007462686567164179,
"grad_norm": 1.4647141695022583,
"learning_rate": 0.0002,
"loss": 2.1726250648498535,
"mean_token_accuracy": 0.5635550767183304,
"num_tokens": 32624.0,
"step": 2
},
{
"entropy": 1.3885400295257568,
"epoch": 0.011194029850746268,
"grad_norm": 1.1605029106140137,
"learning_rate": 0.0002,
"loss": 1.7200348377227783,
"mean_token_accuracy": 0.596715897321701,
"num_tokens": 48781.0,
"step": 3
},
{
"entropy": 1.3746764063835144,
"epoch": 0.014925373134328358,
"grad_norm": 0.932724118232727,
"learning_rate": 0.0002,
"loss": 1.4033262729644775,
"mean_token_accuracy": 0.6351611912250519,
"num_tokens": 65119.0,
"step": 4
},
{
"entropy": 1.3346630930900574,
"epoch": 0.018656716417910446,
"grad_norm": 1.0168325901031494,
"learning_rate": 0.0002,
"loss": 1.2731056213378906,
"mean_token_accuracy": 0.6540397107601166,
"num_tokens": 81735.0,
"step": 5
},
{
"entropy": 1.2580328285694122,
"epoch": 0.022388059701492536,
"grad_norm": 0.5265628695487976,
"learning_rate": 0.0002,
"loss": 1.1689575910568237,
"mean_token_accuracy": 0.6603054255247116,
"num_tokens": 98081.0,
"step": 6
},
{
"entropy": 1.1583980917930603,
"epoch": 0.026119402985074626,
"grad_norm": 0.4118923842906952,
"learning_rate": 0.0002,
"loss": 1.078832983970642,
"mean_token_accuracy": 0.6707835346460342,
"num_tokens": 114185.0,
"step": 7
},
{
"entropy": 1.0589762330055237,
"epoch": 0.029850746268656716,
"grad_norm": 0.41156867146492004,
"learning_rate": 0.0002,
"loss": 1.0044282674789429,
"mean_token_accuracy": 0.6823764145374298,
"num_tokens": 130498.0,
"step": 8
},
{
"entropy": 0.9924780577421188,
"epoch": 0.033582089552238806,
"grad_norm": 0.5590541362762451,
"learning_rate": 0.0002,
"loss": 0.9619787931442261,
"mean_token_accuracy": 0.6892934292554855,
"num_tokens": 146820.0,
"step": 9
},
{
"entropy": 0.9725948423147202,
"epoch": 0.03731343283582089,
"grad_norm": 0.4368315637111664,
"learning_rate": 0.0002,
"loss": 0.8887773752212524,
"mean_token_accuracy": 0.7022321075201035,
"num_tokens": 163228.0,
"step": 10
},
{
"entropy": 0.9371236711740494,
"epoch": 0.041044776119402986,
"grad_norm": 0.43285107612609863,
"learning_rate": 0.0002,
"loss": 0.8475317358970642,
"mean_token_accuracy": 0.706597164273262,
"num_tokens": 179681.0,
"step": 11
},
{
"entropy": 0.8875125199556351,
"epoch": 0.04477611940298507,
"grad_norm": 6.3542633056640625,
"learning_rate": 0.0002,
"loss": 0.8327640295028687,
"mean_token_accuracy": 0.7034512162208557,
"num_tokens": 196348.0,
"step": 12
},
{
"entropy": 0.8179645836353302,
"epoch": 0.048507462686567165,
"grad_norm": 0.44303053617477417,
"learning_rate": 0.0002,
"loss": 0.7809244394302368,
"mean_token_accuracy": 0.7242531627416611,
"num_tokens": 213052.0,
"step": 13
},
{
"entropy": 0.7955248355865479,
"epoch": 0.05223880597014925,
"grad_norm": 0.8472722172737122,
"learning_rate": 0.0002,
"loss": 0.7439039945602417,
"mean_token_accuracy": 0.7328712791204453,
"num_tokens": 229644.0,
"step": 14
},
{
"entropy": 0.7496374696493149,
"epoch": 0.055970149253731345,
"grad_norm": 2.1060233116149902,
"learning_rate": 0.0002,
"loss": 0.7229201793670654,
"mean_token_accuracy": 0.7347650229930878,
"num_tokens": 246138.0,
"step": 15
},
{
"entropy": 0.6943426132202148,
"epoch": 0.05970149253731343,
"grad_norm": 0.4210701882839203,
"learning_rate": 0.0002,
"loss": 0.6997749209403992,
"mean_token_accuracy": 0.7390953898429871,
"num_tokens": 262489.0,
"step": 16
},
{
"entropy": 0.689127504825592,
"epoch": 0.06343283582089553,
"grad_norm": 0.3434777855873108,
"learning_rate": 0.0002,
"loss": 0.6818345189094543,
"mean_token_accuracy": 0.7421105057001114,
"num_tokens": 278800.0,
"step": 17
},
{
"entropy": 0.6688047796487808,
"epoch": 0.06716417910447761,
"grad_norm": 0.43096405267715454,
"learning_rate": 0.0002,
"loss": 0.65822833776474,
"mean_token_accuracy": 0.7513366043567657,
"num_tokens": 295153.0,
"step": 18
},
{
"entropy": 0.6683900207281113,
"epoch": 0.0708955223880597,
"grad_norm": 0.2875062823295593,
"learning_rate": 0.0002,
"loss": 0.6513902544975281,
"mean_token_accuracy": 0.7488225400447845,
"num_tokens": 311631.0,
"step": 19
},
{
"entropy": 0.6681984066963196,
"epoch": 0.07462686567164178,
"grad_norm": 0.34322109818458557,
"learning_rate": 0.0002,
"loss": 0.6516908407211304,
"mean_token_accuracy": 0.7477276474237442,
"num_tokens": 327810.0,
"step": 20
},
{
"entropy": 0.657578319311142,
"epoch": 0.07835820895522388,
"grad_norm": 0.3035106360912323,
"learning_rate": 0.0002,
"loss": 0.6391871571540833,
"mean_token_accuracy": 0.7518605440855026,
"num_tokens": 344148.0,
"step": 21
},
{
"entropy": 0.6416258066892624,
"epoch": 0.08208955223880597,
"grad_norm": 0.2896852493286133,
"learning_rate": 0.0002,
"loss": 0.6108838319778442,
"mean_token_accuracy": 0.7639093101024628,
"num_tokens": 360467.0,
"step": 22
},
{
"entropy": 0.6126270890235901,
"epoch": 0.08582089552238806,
"grad_norm": 0.28889304399490356,
"learning_rate": 0.0002,
"loss": 0.5967156887054443,
"mean_token_accuracy": 0.7673086673021317,
"num_tokens": 376740.0,
"step": 23
},
{
"entropy": 0.607315257191658,
"epoch": 0.08955223880597014,
"grad_norm": 0.26258257031440735,
"learning_rate": 0.0002,
"loss": 0.5931278467178345,
"mean_token_accuracy": 0.7683079540729523,
"num_tokens": 393035.0,
"step": 24
},
{
"entropy": 0.6071023941040039,
"epoch": 0.09328358208955224,
"grad_norm": 0.2627218961715698,
"learning_rate": 0.0002,
"loss": 0.5975178480148315,
"mean_token_accuracy": 0.7655056416988373,
"num_tokens": 409513.0,
"step": 25
},
{
"entropy": 0.6166605055332184,
"epoch": 0.09701492537313433,
"grad_norm": 0.2591419517993927,
"learning_rate": 0.0002,
"loss": 0.6048401594161987,
"mean_token_accuracy": 0.7606765776872635,
"num_tokens": 425838.0,
"step": 26
},
{
"entropy": 0.5888677388429642,
"epoch": 0.10074626865671642,
"grad_norm": 0.23267361521720886,
"learning_rate": 0.0002,
"loss": 0.5792773365974426,
"mean_token_accuracy": 0.7714710682630539,
"num_tokens": 442275.0,
"step": 27
},
{
"entropy": 0.6097696423530579,
"epoch": 0.1044776119402985,
"grad_norm": 0.25834810733795166,
"learning_rate": 0.0002,
"loss": 0.6025165915489197,
"mean_token_accuracy": 0.7594742327928543,
"num_tokens": 458633.0,
"step": 28
},
{
"entropy": 0.5876014679670334,
"epoch": 0.10820895522388059,
"grad_norm": 0.24802696704864502,
"learning_rate": 0.0002,
"loss": 0.577584445476532,
"mean_token_accuracy": 0.7709765136241913,
"num_tokens": 475114.0,
"step": 29
},
{
"entropy": 0.577396959066391,
"epoch": 0.11194029850746269,
"grad_norm": 0.24076423048973083,
"learning_rate": 0.0002,
"loss": 0.5727118849754333,
"mean_token_accuracy": 0.7744314223527908,
"num_tokens": 491389.0,
"step": 30
},
{
"entropy": 0.5895106196403503,
"epoch": 0.11567164179104478,
"grad_norm": 0.21412523090839386,
"learning_rate": 0.0002,
"loss": 0.5863120555877686,
"mean_token_accuracy": 0.7693659514188766,
"num_tokens": 507969.0,
"step": 31
},
{
"entropy": 0.5717187374830246,
"epoch": 0.11940298507462686,
"grad_norm": 0.1944267749786377,
"learning_rate": 0.0002,
"loss": 0.568047046661377,
"mean_token_accuracy": 0.7752875536680222,
"num_tokens": 524169.0,
"step": 32
},
{
"entropy": 0.5736564546823502,
"epoch": 0.12313432835820895,
"grad_norm": 0.23050418496131897,
"learning_rate": 0.0002,
"loss": 0.5761005282402039,
"mean_token_accuracy": 0.7727629542350769,
"num_tokens": 540463.0,
"step": 33
},
{
"entropy": 0.589300200343132,
"epoch": 0.12686567164179105,
"grad_norm": 0.21381224691867828,
"learning_rate": 0.0002,
"loss": 0.5865699052810669,
"mean_token_accuracy": 0.7672912329435349,
"num_tokens": 557025.0,
"step": 34
},
{
"entropy": 0.5663471221923828,
"epoch": 0.13059701492537312,
"grad_norm": 0.21070359647274017,
"learning_rate": 0.0002,
"loss": 0.5665886998176575,
"mean_token_accuracy": 0.7742704451084137,
"num_tokens": 573346.0,
"step": 35
},
{
"entropy": 0.5744731575250626,
"epoch": 0.13432835820895522,
"grad_norm": 0.2001814991235733,
"learning_rate": 0.0002,
"loss": 0.5742104649543762,
"mean_token_accuracy": 0.7708545625209808,
"num_tokens": 589678.0,
"step": 36
},
{
"entropy": 0.5785809606313705,
"epoch": 0.13805970149253732,
"grad_norm": 0.1615011990070343,
"learning_rate": 0.0002,
"loss": 0.5697225332260132,
"mean_token_accuracy": 0.7719135135412216,
"num_tokens": 606081.0,
"step": 37
},
{
"entropy": 0.5571976453065872,
"epoch": 0.1417910447761194,
"grad_norm": 0.1849016547203064,
"learning_rate": 0.0002,
"loss": 0.5493215322494507,
"mean_token_accuracy": 0.7809059321880341,
"num_tokens": 622168.0,
"step": 38
},
{
"entropy": 0.5916045606136322,
"epoch": 0.1455223880597015,
"grad_norm": 0.19314663112163544,
"learning_rate": 0.0002,
"loss": 0.5800106525421143,
"mean_token_accuracy": 0.7677847892045975,
"num_tokens": 638480.0,
"step": 39
},
{
"entropy": 0.5791963338851929,
"epoch": 0.14925373134328357,
"grad_norm": 0.18138627707958221,
"learning_rate": 0.0002,
"loss": 0.5779139399528503,
"mean_token_accuracy": 0.767883911728859,
"num_tokens": 654651.0,
"step": 40
},
{
"entropy": 0.5743307769298553,
"epoch": 0.15298507462686567,
"grad_norm": 0.17246870696544647,
"learning_rate": 0.0002,
"loss": 0.5706084370613098,
"mean_token_accuracy": 0.7700994461774826,
"num_tokens": 670948.0,
"step": 41
},
{
"entropy": 0.5432448089122772,
"epoch": 0.15671641791044777,
"grad_norm": 0.19110122323036194,
"learning_rate": 0.0002,
"loss": 0.5484994649887085,
"mean_token_accuracy": 0.7811570912599564,
"num_tokens": 687540.0,
"step": 42
},
{
"entropy": 0.5750848650932312,
"epoch": 0.16044776119402984,
"grad_norm": 0.1716981679201126,
"learning_rate": 0.0002,
"loss": 0.579657793045044,
"mean_token_accuracy": 0.7663937658071518,
"num_tokens": 704015.0,
"step": 43
},
{
"entropy": 0.561103492975235,
"epoch": 0.16417910447761194,
"grad_norm": 0.1821409910917282,
"learning_rate": 0.0002,
"loss": 0.5600441098213196,
"mean_token_accuracy": 0.774185299873352,
"num_tokens": 720451.0,
"step": 44
},
{
"entropy": 0.5737239718437195,
"epoch": 0.16791044776119404,
"grad_norm": 0.174806609749794,
"learning_rate": 0.0002,
"loss": 0.5676751732826233,
"mean_token_accuracy": 0.770918071269989,
"num_tokens": 736682.0,
"step": 45
},
{
"entropy": 0.5712144523859024,
"epoch": 0.17164179104477612,
"grad_norm": 0.18145714700222015,
"learning_rate": 0.0002,
"loss": 0.5659744143486023,
"mean_token_accuracy": 0.7729035317897797,
"num_tokens": 753217.0,
"step": 46
},
{
"entropy": 0.5745559930801392,
"epoch": 0.17537313432835822,
"grad_norm": 0.1639634072780609,
"learning_rate": 0.0002,
"loss": 0.5735749006271362,
"mean_token_accuracy": 0.770696684718132,
"num_tokens": 769822.0,
"step": 47
},
{
"entropy": 0.5605441480875015,
"epoch": 0.1791044776119403,
"grad_norm": 0.18234604597091675,
"learning_rate": 0.0002,
"loss": 0.5633875131607056,
"mean_token_accuracy": 0.7749416828155518,
"num_tokens": 786359.0,
"step": 48
},
{
"entropy": 0.5490550547838211,
"epoch": 0.1828358208955224,
"grad_norm": 0.18433044850826263,
"learning_rate": 0.0002,
"loss": 0.5567543506622314,
"mean_token_accuracy": 0.7788835614919662,
"num_tokens": 802963.0,
"step": 49
},
{
"entropy": 0.5616811364889145,
"epoch": 0.1865671641791045,
"grad_norm": 0.15450991690158844,
"learning_rate": 0.0002,
"loss": 0.5657309889793396,
"mean_token_accuracy": 0.774708479642868,
"num_tokens": 819668.0,
"step": 50
},
{
"entropy": 0.5582916140556335,
"epoch": 0.19029850746268656,
"grad_norm": 0.14035002887248993,
"learning_rate": 0.0002,
"loss": 0.551848828792572,
"mean_token_accuracy": 0.7806462794542313,
"num_tokens": 835858.0,
"step": 51
},
{
"entropy": 0.5508538037538528,
"epoch": 0.19402985074626866,
"grad_norm": 0.17560449242591858,
"learning_rate": 0.0002,
"loss": 0.5406010150909424,
"mean_token_accuracy": 0.7840944528579712,
"num_tokens": 852146.0,
"step": 52
},
{
"entropy": 0.5527998208999634,
"epoch": 0.19776119402985073,
"grad_norm": 0.15798722207546234,
"learning_rate": 0.0002,
"loss": 0.5423352718353271,
"mean_token_accuracy": 0.782536968588829,
"num_tokens": 868660.0,
"step": 53
},
{
"entropy": 0.5586383640766144,
"epoch": 0.20149253731343283,
"grad_norm": 0.15477648377418518,
"learning_rate": 0.0002,
"loss": 0.5521284937858582,
"mean_token_accuracy": 0.7778433710336685,
"num_tokens": 885133.0,
"step": 54
},
{
"entropy": 0.5694690942764282,
"epoch": 0.20522388059701493,
"grad_norm": 0.16944538056850433,
"learning_rate": 0.0002,
"loss": 0.5759178400039673,
"mean_token_accuracy": 0.7684573978185654,
"num_tokens": 901816.0,
"step": 55
},
{
"entropy": 0.5426557958126068,
"epoch": 0.208955223880597,
"grad_norm": 0.16989077627658844,
"learning_rate": 0.0002,
"loss": 0.5477243661880493,
"mean_token_accuracy": 0.7811359614133835,
"num_tokens": 918275.0,
"step": 56
},
{
"entropy": 0.5754421502351761,
"epoch": 0.2126865671641791,
"grad_norm": 0.15350034832954407,
"learning_rate": 0.0002,
"loss": 0.5865313410758972,
"mean_token_accuracy": 0.7631517648696899,
"num_tokens": 934630.0,
"step": 57
},
{
"entropy": 0.5742448717355728,
"epoch": 0.21641791044776118,
"grad_norm": 0.18639785051345825,
"learning_rate": 0.0002,
"loss": 0.575249433517456,
"mean_token_accuracy": 0.7669856697320938,
"num_tokens": 950844.0,
"step": 58
},
{
"entropy": 0.5708972364664078,
"epoch": 0.22014925373134328,
"grad_norm": 0.15229687094688416,
"learning_rate": 0.0002,
"loss": 0.5669128894805908,
"mean_token_accuracy": 0.7711773067712784,
"num_tokens": 966973.0,
"step": 59
},
{
"entropy": 0.5682551562786102,
"epoch": 0.22388059701492538,
"grad_norm": 0.1677161157131195,
"learning_rate": 0.0002,
"loss": 0.5593635439872742,
"mean_token_accuracy": 0.7725416421890259,
"num_tokens": 983221.0,
"step": 60
},
{
"entropy": 0.5679890364408493,
"epoch": 0.22761194029850745,
"grad_norm": 0.18057392537593842,
"learning_rate": 0.0002,
"loss": 0.5580260753631592,
"mean_token_accuracy": 0.7754660546779633,
"num_tokens": 999424.0,
"step": 61
},
{
"entropy": 0.5804609507322311,
"epoch": 0.23134328358208955,
"grad_norm": 0.143987238407135,
"learning_rate": 0.0002,
"loss": 0.570034384727478,
"mean_token_accuracy": 0.7708772122859955,
"num_tokens": 1015903.0,
"step": 62
},
{
"entropy": 0.5699467211961746,
"epoch": 0.23507462686567165,
"grad_norm": 0.15400487184524536,
"learning_rate": 0.0002,
"loss": 0.5733590126037598,
"mean_token_accuracy": 0.7680967003107071,
"num_tokens": 1032549.0,
"step": 63
},
{
"entropy": 0.5582360923290253,
"epoch": 0.23880597014925373,
"grad_norm": 0.17451652884483337,
"learning_rate": 0.0002,
"loss": 0.5732641220092773,
"mean_token_accuracy": 0.7692582160234451,
"num_tokens": 1048935.0,
"step": 64
},
{
"entropy": 0.5475955605506897,
"epoch": 0.24253731343283583,
"grad_norm": 0.1549489051103592,
"learning_rate": 0.0002,
"loss": 0.5526400804519653,
"mean_token_accuracy": 0.7788676619529724,
"num_tokens": 1065104.0,
"step": 65
},
{
"entropy": 0.5664391964673996,
"epoch": 0.2462686567164179,
"grad_norm": 0.14476634562015533,
"learning_rate": 0.0002,
"loss": 0.5617241263389587,
"mean_token_accuracy": 0.7786661833524704,
"num_tokens": 1081393.0,
"step": 66
},
{
"entropy": 0.5560042560100555,
"epoch": 0.25,
"grad_norm": 0.16752755641937256,
"learning_rate": 0.0002,
"loss": 0.5503427982330322,
"mean_token_accuracy": 0.7781690061092377,
"num_tokens": 1097575.0,
"step": 67
},
{
"entropy": 0.5609089732170105,
"epoch": 0.2537313432835821,
"grad_norm": 0.17903153598308563,
"learning_rate": 0.0002,
"loss": 0.5497362017631531,
"mean_token_accuracy": 0.7771856188774109,
"num_tokens": 1113937.0,
"step": 68
},
{
"entropy": 0.5642896294593811,
"epoch": 0.2574626865671642,
"grad_norm": 0.16974171996116638,
"learning_rate": 0.0002,
"loss": 0.563960611820221,
"mean_token_accuracy": 0.7738614976406097,
"num_tokens": 1130103.0,
"step": 69
},
{
"entropy": 0.5726548284292221,
"epoch": 0.26119402985074625,
"grad_norm": 0.14435403048992157,
"learning_rate": 0.0002,
"loss": 0.5712643265724182,
"mean_token_accuracy": 0.7692683339118958,
"num_tokens": 1146423.0,
"step": 70
},
{
"entropy": 0.5441250950098038,
"epoch": 0.26492537313432835,
"grad_norm": 0.14253664016723633,
"learning_rate": 0.0002,
"loss": 0.544674813747406,
"mean_token_accuracy": 0.7780104726552963,
"num_tokens": 1162733.0,
"step": 71
},
{
"entropy": 0.5444895774126053,
"epoch": 0.26865671641791045,
"grad_norm": 0.14379332959651947,
"learning_rate": 0.0002,
"loss": 0.5479044318199158,
"mean_token_accuracy": 0.7788853794336319,
"num_tokens": 1178848.0,
"step": 72
},
{
"entropy": 0.5541743487119675,
"epoch": 0.27238805970149255,
"grad_norm": 0.1346455216407776,
"learning_rate": 0.0002,
"loss": 0.5573484897613525,
"mean_token_accuracy": 0.7779737412929535,
"num_tokens": 1195357.0,
"step": 73
},
{
"entropy": 0.5649544596672058,
"epoch": 0.27611940298507465,
"grad_norm": 0.136294886469841,
"learning_rate": 0.0002,
"loss": 0.5603638291358948,
"mean_token_accuracy": 0.7719381302595139,
"num_tokens": 1211921.0,
"step": 74
},
{
"entropy": 0.5381972342729568,
"epoch": 0.2798507462686567,
"grad_norm": 0.12611278891563416,
"learning_rate": 0.0002,
"loss": 0.533305287361145,
"mean_token_accuracy": 0.7839507907629013,
"num_tokens": 1228381.0,
"step": 75
},
{
"entropy": 0.5607545524835587,
"epoch": 0.2835820895522388,
"grad_norm": 0.1318938434123993,
"learning_rate": 0.0002,
"loss": 0.5617884397506714,
"mean_token_accuracy": 0.7753878086805344,
"num_tokens": 1244769.0,
"step": 76
},
{
"entropy": 0.5631186813116074,
"epoch": 0.2873134328358209,
"grad_norm": 0.1374509632587433,
"learning_rate": 0.0002,
"loss": 0.5608174204826355,
"mean_token_accuracy": 0.7753797173500061,
"num_tokens": 1261197.0,
"step": 77
},
{
"entropy": 0.5789693742990494,
"epoch": 0.291044776119403,
"grad_norm": 0.1388232558965683,
"learning_rate": 0.0002,
"loss": 0.5779432058334351,
"mean_token_accuracy": 0.7658645212650299,
"num_tokens": 1277998.0,
"step": 78
},
{
"entropy": 0.5439933687448502,
"epoch": 0.2947761194029851,
"grad_norm": 0.15839162468910217,
"learning_rate": 0.0002,
"loss": 0.5506725311279297,
"mean_token_accuracy": 0.7786760181188583,
"num_tokens": 1294293.0,
"step": 79
},
{
"entropy": 0.5581207424402237,
"epoch": 0.29850746268656714,
"grad_norm": 0.16782821714878082,
"learning_rate": 0.0002,
"loss": 0.56475830078125,
"mean_token_accuracy": 0.7746179699897766,
"num_tokens": 1310588.0,
"step": 80
},
{
"entropy": 0.588770255446434,
"epoch": 0.30223880597014924,
"grad_norm": 0.17123626172542572,
"learning_rate": 0.0002,
"loss": 0.5832362174987793,
"mean_token_accuracy": 0.7644577324390411,
"num_tokens": 1327129.0,
"step": 81
},
{
"entropy": 0.5512869954109192,
"epoch": 0.30597014925373134,
"grad_norm": 0.12713028490543365,
"learning_rate": 0.0002,
"loss": 0.538611888885498,
"mean_token_accuracy": 0.7855131775140762,
"num_tokens": 1343481.0,
"step": 82
},
{
"entropy": 0.5826849788427353,
"epoch": 0.30970149253731344,
"grad_norm": 0.15148760378360748,
"learning_rate": 0.0002,
"loss": 0.580060601234436,
"mean_token_accuracy": 0.7675654888153076,
"num_tokens": 1359709.0,
"step": 83
},
{
"entropy": 0.581380233168602,
"epoch": 0.31343283582089554,
"grad_norm": 0.1486639529466629,
"learning_rate": 0.0002,
"loss": 0.5737113952636719,
"mean_token_accuracy": 0.7694955766201019,
"num_tokens": 1376209.0,
"step": 84
},
{
"entropy": 0.5577070415019989,
"epoch": 0.31716417910447764,
"grad_norm": 0.14268359541893005,
"learning_rate": 0.0002,
"loss": 0.5592327117919922,
"mean_token_accuracy": 0.7741715162992477,
"num_tokens": 1392271.0,
"step": 85
},
{
"entropy": 0.5519531518220901,
"epoch": 0.3208955223880597,
"grad_norm": 0.19115421175956726,
"learning_rate": 0.0002,
"loss": 0.5649857521057129,
"mean_token_accuracy": 0.7735026627779007,
"num_tokens": 1408680.0,
"step": 86
},
{
"entropy": 0.5389833152294159,
"epoch": 0.3246268656716418,
"grad_norm": 0.1511470526456833,
"learning_rate": 0.0002,
"loss": 0.5499240159988403,
"mean_token_accuracy": 0.7795019447803497,
"num_tokens": 1425241.0,
"step": 87
},
{
"entropy": 0.5535243153572083,
"epoch": 0.3283582089552239,
"grad_norm": 0.13003994524478912,
"learning_rate": 0.0002,
"loss": 0.5464329123497009,
"mean_token_accuracy": 0.7804087400436401,
"num_tokens": 1441530.0,
"step": 88
},
{
"entropy": 0.5626068562269211,
"epoch": 0.332089552238806,
"grad_norm": 0.1472884714603424,
"learning_rate": 0.0002,
"loss": 0.5579521656036377,
"mean_token_accuracy": 0.7757730484008789,
"num_tokens": 1457843.0,
"step": 89
},
{
"entropy": 0.5722664147615433,
"epoch": 0.3358208955223881,
"grad_norm": 0.14036864042282104,
"learning_rate": 0.0002,
"loss": 0.5636782050132751,
"mean_token_accuracy": 0.7743526548147202,
"num_tokens": 1474209.0,
"step": 90
},
{
"entropy": 0.5577493757009506,
"epoch": 0.33955223880597013,
"grad_norm": 0.12171963602304459,
"learning_rate": 0.0002,
"loss": 0.5502208471298218,
"mean_token_accuracy": 0.7802051454782486,
"num_tokens": 1490390.0,
"step": 91
},
{
"entropy": 0.547787681221962,
"epoch": 0.34328358208955223,
"grad_norm": 0.1525270640850067,
"learning_rate": 0.0002,
"loss": 0.5497896075248718,
"mean_token_accuracy": 0.7809301018714905,
"num_tokens": 1506675.0,
"step": 92
},
{
"entropy": 0.5554802119731903,
"epoch": 0.34701492537313433,
"grad_norm": 0.1502194106578827,
"learning_rate": 0.0002,
"loss": 0.5645507574081421,
"mean_token_accuracy": 0.7722718119621277,
"num_tokens": 1523263.0,
"step": 93
},
{
"entropy": 0.5594951659440994,
"epoch": 0.35074626865671643,
"grad_norm": 0.13331742584705353,
"learning_rate": 0.0002,
"loss": 0.5637622475624084,
"mean_token_accuracy": 0.7736085057258606,
"num_tokens": 1540004.0,
"step": 94
},
{
"entropy": 0.5551023185253143,
"epoch": 0.35447761194029853,
"grad_norm": 0.1213943138718605,
"learning_rate": 0.0002,
"loss": 0.5518482327461243,
"mean_token_accuracy": 0.7777320593595505,
"num_tokens": 1556547.0,
"step": 95
},
{
"entropy": 0.557207852602005,
"epoch": 0.3582089552238806,
"grad_norm": 0.1314304620027542,
"learning_rate": 0.0002,
"loss": 0.5546322464942932,
"mean_token_accuracy": 0.7763337790966034,
"num_tokens": 1572997.0,
"step": 96
},
{
"entropy": 0.556539997458458,
"epoch": 0.3619402985074627,
"grad_norm": 0.14363965392112732,
"learning_rate": 0.0002,
"loss": 0.5549654364585876,
"mean_token_accuracy": 0.7731640189886093,
"num_tokens": 1589289.0,
"step": 97
},
{
"entropy": 0.568042978644371,
"epoch": 0.3656716417910448,
"grad_norm": 0.11934816092252731,
"learning_rate": 0.0002,
"loss": 0.5679082274436951,
"mean_token_accuracy": 0.768884465098381,
"num_tokens": 1605516.0,
"step": 98
},
{
"entropy": 0.5484860688447952,
"epoch": 0.3694029850746269,
"grad_norm": 0.16246412694454193,
"learning_rate": 0.0002,
"loss": 0.5522934794425964,
"mean_token_accuracy": 0.776402086019516,
"num_tokens": 1622108.0,
"step": 99
},
{
"entropy": 0.5548600405454636,
"epoch": 0.373134328358209,
"grad_norm": 0.12589918076992035,
"learning_rate": 0.0002,
"loss": 0.5544294714927673,
"mean_token_accuracy": 0.7768803536891937,
"num_tokens": 1638659.0,
"step": 100
},
{
"entropy": 0.5692953765392303,
"epoch": 0.376865671641791,
"grad_norm": 0.12726213037967682,
"learning_rate": 0.0002,
"loss": 0.5662153363227844,
"mean_token_accuracy": 0.7698657661676407,
"num_tokens": 1654877.0,
"step": 101
},
{
"entropy": 0.560271605849266,
"epoch": 0.3805970149253731,
"grad_norm": 0.13260267674922943,
"learning_rate": 0.0002,
"loss": 0.5487651824951172,
"mean_token_accuracy": 0.7778149247169495,
"num_tokens": 1671436.0,
"step": 102
},
{
"entropy": 0.5644612163305283,
"epoch": 0.3843283582089552,
"grad_norm": 0.13504348695278168,
"learning_rate": 0.0002,
"loss": 0.5573433041572571,
"mean_token_accuracy": 0.7781724482774734,
"num_tokens": 1687817.0,
"step": 103
},
{
"entropy": 0.55845807492733,
"epoch": 0.3880597014925373,
"grad_norm": 0.1202038824558258,
"learning_rate": 0.0002,
"loss": 0.5552661418914795,
"mean_token_accuracy": 0.7772795557975769,
"num_tokens": 1704568.0,
"step": 104
},
{
"entropy": 0.5440086871385574,
"epoch": 0.3917910447761194,
"grad_norm": 0.12728044390678406,
"learning_rate": 0.0002,
"loss": 0.5538181662559509,
"mean_token_accuracy": 0.7744371294975281,
"num_tokens": 1720774.0,
"step": 105
},
{
"entropy": 0.5394178926944733,
"epoch": 0.39552238805970147,
"grad_norm": 0.14098908007144928,
"learning_rate": 0.0002,
"loss": 0.552955150604248,
"mean_token_accuracy": 0.776681050658226,
"num_tokens": 1737050.0,
"step": 106
},
{
"entropy": 0.5602739453315735,
"epoch": 0.39925373134328357,
"grad_norm": 0.1373777687549591,
"learning_rate": 0.0002,
"loss": 0.5666458010673523,
"mean_token_accuracy": 0.7684379816055298,
"num_tokens": 1753616.0,
"step": 107
},
{
"entropy": 0.5688735842704773,
"epoch": 0.40298507462686567,
"grad_norm": 0.12947675585746765,
"learning_rate": 0.0002,
"loss": 0.5618643760681152,
"mean_token_accuracy": 0.7724806815385818,
"num_tokens": 1770077.0,
"step": 108
},
{
"entropy": 0.569103866815567,
"epoch": 0.40671641791044777,
"grad_norm": 0.1482311338186264,
"learning_rate": 0.0002,
"loss": 0.5661442875862122,
"mean_token_accuracy": 0.7717588543891907,
"num_tokens": 1786557.0,
"step": 109
},
{
"entropy": 0.5550140291452408,
"epoch": 0.41044776119402987,
"grad_norm": 0.13066281378269196,
"learning_rate": 0.0002,
"loss": 0.5546547770500183,
"mean_token_accuracy": 0.7755738943815231,
"num_tokens": 1803029.0,
"step": 110
},
{
"entropy": 0.5526944696903229,
"epoch": 0.4141791044776119,
"grad_norm": 0.11755255609750748,
"learning_rate": 0.0002,
"loss": 0.5436115860939026,
"mean_token_accuracy": 0.779561460018158,
"num_tokens": 1819561.0,
"step": 111
},
{
"entropy": 0.5528556704521179,
"epoch": 0.417910447761194,
"grad_norm": 0.14607787132263184,
"learning_rate": 0.0002,
"loss": 0.5589385032653809,
"mean_token_accuracy": 0.7751224488019943,
"num_tokens": 1835992.0,
"step": 112
},
{
"entropy": 0.5393927693367004,
"epoch": 0.4216417910447761,
"grad_norm": 0.12512564659118652,
"learning_rate": 0.0002,
"loss": 0.5430585741996765,
"mean_token_accuracy": 0.7801438719034195,
"num_tokens": 1852545.0,
"step": 113
},
{
"entropy": 0.5346394777297974,
"epoch": 0.4253731343283582,
"grad_norm": 0.13879786431789398,
"learning_rate": 0.0002,
"loss": 0.5470178723335266,
"mean_token_accuracy": 0.7800125926733017,
"num_tokens": 1868767.0,
"step": 114
},
{
"entropy": 0.552959531545639,
"epoch": 0.4291044776119403,
"grad_norm": 0.13570789992809296,
"learning_rate": 0.0002,
"loss": 0.5606270432472229,
"mean_token_accuracy": 0.7728203237056732,
"num_tokens": 1885207.0,
"step": 115
},
{
"entropy": 0.5681584924459457,
"epoch": 0.43283582089552236,
"grad_norm": 0.13311345875263214,
"learning_rate": 0.0002,
"loss": 0.561408519744873,
"mean_token_accuracy": 0.7729704976081848,
"num_tokens": 1901670.0,
"step": 116
},
{
"entropy": 0.580392524600029,
"epoch": 0.43656716417910446,
"grad_norm": 0.15006045997142792,
"learning_rate": 0.0002,
"loss": 0.5710599422454834,
"mean_token_accuracy": 0.7692873626947403,
"num_tokens": 1918297.0,
"step": 117
},
{
"entropy": 0.5402243435382843,
"epoch": 0.44029850746268656,
"grad_norm": 0.13022655248641968,
"learning_rate": 0.0002,
"loss": 0.5290783047676086,
"mean_token_accuracy": 0.7855078428983688,
"num_tokens": 1934811.0,
"step": 118
},
{
"entropy": 0.5673187673091888,
"epoch": 0.44402985074626866,
"grad_norm": 0.1210206151008606,
"learning_rate": 0.0002,
"loss": 0.5625845193862915,
"mean_token_accuracy": 0.771060049533844,
"num_tokens": 1951276.0,
"step": 119
},
{
"entropy": 0.5444270074367523,
"epoch": 0.44776119402985076,
"grad_norm": 0.14453133940696716,
"learning_rate": 0.0002,
"loss": 0.5478600263595581,
"mean_token_accuracy": 0.7782215029001236,
"num_tokens": 1967851.0,
"step": 120
},
{
"entropy": 0.5516166985034943,
"epoch": 0.45149253731343286,
"grad_norm": 0.15330393612384796,
"learning_rate": 0.0002,
"loss": 0.5627217292785645,
"mean_token_accuracy": 0.7735389173030853,
"num_tokens": 1984175.0,
"step": 121
},
{
"entropy": 0.5447670072317123,
"epoch": 0.4552238805970149,
"grad_norm": 0.11896508932113647,
"learning_rate": 0.0002,
"loss": 0.5453386306762695,
"mean_token_accuracy": 0.7792693227529526,
"num_tokens": 2000419.0,
"step": 122
},
{
"entropy": 0.5593693852424622,
"epoch": 0.458955223880597,
"grad_norm": 0.14641404151916504,
"learning_rate": 0.0002,
"loss": 0.5527093410491943,
"mean_token_accuracy": 0.7784133702516556,
"num_tokens": 2016812.0,
"step": 123
},
{
"entropy": 0.5516424775123596,
"epoch": 0.4626865671641791,
"grad_norm": 0.13001076877117157,
"learning_rate": 0.0002,
"loss": 0.5495356917381287,
"mean_token_accuracy": 0.7777290046215057,
"num_tokens": 2032898.0,
"step": 124
},
{
"entropy": 0.5469458252191544,
"epoch": 0.4664179104477612,
"grad_norm": 0.12713271379470825,
"learning_rate": 0.0002,
"loss": 0.5466877222061157,
"mean_token_accuracy": 0.7783260345458984,
"num_tokens": 2049023.0,
"step": 125
},
{
"entropy": 0.5528912246227264,
"epoch": 0.4701492537313433,
"grad_norm": 0.13111256062984467,
"learning_rate": 0.0002,
"loss": 0.5582880973815918,
"mean_token_accuracy": 0.7739576250314713,
"num_tokens": 2065421.0,
"step": 126
},
{
"entropy": 0.536289632320404,
"epoch": 0.47388059701492535,
"grad_norm": 0.1449650228023529,
"learning_rate": 0.0002,
"loss": 0.5477018356323242,
"mean_token_accuracy": 0.7764868587255478,
"num_tokens": 2081738.0,
"step": 127
},
{
"entropy": 0.5412490218877792,
"epoch": 0.47761194029850745,
"grad_norm": 0.12087342143058777,
"learning_rate": 0.0002,
"loss": 0.5445610880851746,
"mean_token_accuracy": 0.7799812257289886,
"num_tokens": 2098128.0,
"step": 128
},
{
"entropy": 0.5749060362577438,
"epoch": 0.48134328358208955,
"grad_norm": 0.13593946397304535,
"learning_rate": 0.0002,
"loss": 0.5713242292404175,
"mean_token_accuracy": 0.7683141082525253,
"num_tokens": 2114660.0,
"step": 129
},
{
"entropy": 0.5624695718288422,
"epoch": 0.48507462686567165,
"grad_norm": 0.13926997780799866,
"learning_rate": 0.0002,
"loss": 0.5603138208389282,
"mean_token_accuracy": 0.7724832147359848,
"num_tokens": 2130850.0,
"step": 130
},
{
"entropy": 0.564590647816658,
"epoch": 0.48880597014925375,
"grad_norm": 0.1541988104581833,
"learning_rate": 0.0002,
"loss": 0.5548843145370483,
"mean_token_accuracy": 0.7774635404348373,
"num_tokens": 2147198.0,
"step": 131
},
{
"entropy": 0.5638516694307327,
"epoch": 0.4925373134328358,
"grad_norm": 0.14475074410438538,
"learning_rate": 0.0002,
"loss": 0.559626579284668,
"mean_token_accuracy": 0.7742670625448227,
"num_tokens": 2163592.0,
"step": 132
},
{
"entropy": 0.546675980091095,
"epoch": 0.4962686567164179,
"grad_norm": 0.14459353685379028,
"learning_rate": 0.0002,
"loss": 0.5525697469711304,
"mean_token_accuracy": 0.7782329767942429,
"num_tokens": 2179735.0,
"step": 133
},
{
"entropy": 0.5720339864492416,
"epoch": 0.5,
"grad_norm": 0.16138529777526855,
"learning_rate": 0.0002,
"loss": 0.5745345950126648,
"mean_token_accuracy": 0.7678724527359009,
"num_tokens": 2196300.0,
"step": 134
},
{
"entropy": 0.5302732288837433,
"epoch": 0.503731343283582,
"grad_norm": 0.13007810711860657,
"learning_rate": 0.0002,
"loss": 0.5221583843231201,
"mean_token_accuracy": 0.786575123667717,
"num_tokens": 2212703.0,
"step": 135
},
{
"entropy": 0.5611361563205719,
"epoch": 0.5074626865671642,
"grad_norm": 0.16084182262420654,
"learning_rate": 0.0002,
"loss": 0.557313084602356,
"mean_token_accuracy": 0.7753567546606064,
"num_tokens": 2229364.0,
"step": 136
},
{
"entropy": 0.5539422780275345,
"epoch": 0.5111940298507462,
"grad_norm": 0.1412162035703659,
"learning_rate": 0.0002,
"loss": 0.559614896774292,
"mean_token_accuracy": 0.7726200222969055,
"num_tokens": 2245576.0,
"step": 137
},
{
"entropy": 0.562326043844223,
"epoch": 0.5149253731343284,
"grad_norm": 0.12138223648071289,
"learning_rate": 0.0002,
"loss": 0.5638246536254883,
"mean_token_accuracy": 0.7736532688140869,
"num_tokens": 2261877.0,
"step": 138
},
{
"entropy": 0.5490357279777527,
"epoch": 0.5186567164179104,
"grad_norm": 0.13067315518856049,
"learning_rate": 0.0002,
"loss": 0.5565229654312134,
"mean_token_accuracy": 0.7710774689912796,
"num_tokens": 2278167.0,
"step": 139
},
{
"entropy": 0.5594187080860138,
"epoch": 0.5223880597014925,
"grad_norm": 0.15731613337993622,
"learning_rate": 0.0002,
"loss": 0.5585336089134216,
"mean_token_accuracy": 0.7744586318731308,
"num_tokens": 2294498.0,
"step": 140
},
{
"entropy": 0.5464736074209213,
"epoch": 0.5261194029850746,
"grad_norm": 0.11038337647914886,
"learning_rate": 0.0002,
"loss": 0.538608968257904,
"mean_token_accuracy": 0.7829599231481552,
"num_tokens": 2311130.0,
"step": 141
},
{
"entropy": 0.5605999529361725,
"epoch": 0.5298507462686567,
"grad_norm": 0.14088644087314606,
"learning_rate": 0.0002,
"loss": 0.552900493144989,
"mean_token_accuracy": 0.7778186202049255,
"num_tokens": 2327728.0,
"step": 142
},
{
"entropy": 0.5528270900249481,
"epoch": 0.5335820895522388,
"grad_norm": 0.1425020396709442,
"learning_rate": 0.0002,
"loss": 0.5515353083610535,
"mean_token_accuracy": 0.7752819806337357,
"num_tokens": 2343709.0,
"step": 143
},
{
"entropy": 0.548284262418747,
"epoch": 0.5373134328358209,
"grad_norm": 0.11753518134355545,
"learning_rate": 0.0002,
"loss": 0.5451334118843079,
"mean_token_accuracy": 0.778195932507515,
"num_tokens": 2360064.0,
"step": 144
},
{
"entropy": 0.5573805719614029,
"epoch": 0.5410447761194029,
"grad_norm": 0.16544298827648163,
"learning_rate": 0.0002,
"loss": 0.5645371675491333,
"mean_token_accuracy": 0.774710014462471,
"num_tokens": 2376625.0,
"step": 145
},
{
"entropy": 0.5539259165525436,
"epoch": 0.5447761194029851,
"grad_norm": 0.13032706081867218,
"learning_rate": 0.0002,
"loss": 0.5533608198165894,
"mean_token_accuracy": 0.7761502712965012,
"num_tokens": 2393124.0,
"step": 146
},
{
"entropy": 0.5611738562583923,
"epoch": 0.5485074626865671,
"grad_norm": 0.11081252992153168,
"learning_rate": 0.0002,
"loss": 0.5593815445899963,
"mean_token_accuracy": 0.7766542136669159,
"num_tokens": 2409745.0,
"step": 147
},
{
"entropy": 0.5696390718221664,
"epoch": 0.5522388059701493,
"grad_norm": 0.15060319006443024,
"learning_rate": 0.0002,
"loss": 0.5638480186462402,
"mean_token_accuracy": 0.7716973423957825,
"num_tokens": 2426282.0,
"step": 148
},
{
"entropy": 0.5485384464263916,
"epoch": 0.5559701492537313,
"grad_norm": 0.1222362369298935,
"learning_rate": 0.0002,
"loss": 0.5475510954856873,
"mean_token_accuracy": 0.7770865708589554,
"num_tokens": 2442853.0,
"step": 149
},
{
"entropy": 0.5401834696531296,
"epoch": 0.5597014925373134,
"grad_norm": 0.1280064433813095,
"learning_rate": 0.0002,
"loss": 0.546281099319458,
"mean_token_accuracy": 0.777226597070694,
"num_tokens": 2459134.0,
"step": 150
},
{
"entropy": 0.5523836761713028,
"epoch": 0.5634328358208955,
"grad_norm": 0.13370104134082794,
"learning_rate": 0.0002,
"loss": 0.5567190647125244,
"mean_token_accuracy": 0.7742304503917694,
"num_tokens": 2475612.0,
"step": 151
},
{
"entropy": 0.5323238670825958,
"epoch": 0.5671641791044776,
"grad_norm": 0.13501204550266266,
"learning_rate": 0.0002,
"loss": 0.5404109358787537,
"mean_token_accuracy": 0.7807471007108688,
"num_tokens": 2492038.0,
"step": 152
},
{
"entropy": 0.5367552191019058,
"epoch": 0.5708955223880597,
"grad_norm": 0.11861642450094223,
"learning_rate": 0.0002,
"loss": 0.5417584180831909,
"mean_token_accuracy": 0.7794559895992279,
"num_tokens": 2508568.0,
"step": 153
},
{
"entropy": 0.5438606441020966,
"epoch": 0.5746268656716418,
"grad_norm": 0.14000006020069122,
"learning_rate": 0.0002,
"loss": 0.5418928861618042,
"mean_token_accuracy": 0.7817023396492004,
"num_tokens": 2524812.0,
"step": 154
},
{
"entropy": 0.5425677746534348,
"epoch": 0.5783582089552238,
"grad_norm": 0.12695865333080292,
"learning_rate": 0.0002,
"loss": 0.5364310145378113,
"mean_token_accuracy": 0.7822788208723068,
"num_tokens": 2540971.0,
"step": 155
},
{
"entropy": 0.5774415135383606,
"epoch": 0.582089552238806,
"grad_norm": 0.13525983691215515,
"learning_rate": 0.0002,
"loss": 0.5755460858345032,
"mean_token_accuracy": 0.7673929333686829,
"num_tokens": 2557582.0,
"step": 156
},
{
"entropy": 0.5472007393836975,
"epoch": 0.585820895522388,
"grad_norm": 0.14802482724189758,
"learning_rate": 0.0002,
"loss": 0.5489597320556641,
"mean_token_accuracy": 0.777190089225769,
"num_tokens": 2573624.0,
"step": 157
},
{
"entropy": 0.5569610297679901,
"epoch": 0.5895522388059702,
"grad_norm": 0.12167536467313766,
"learning_rate": 0.0002,
"loss": 0.5526796579360962,
"mean_token_accuracy": 0.7753524631261826,
"num_tokens": 2590085.0,
"step": 158
},
{
"entropy": 0.5524294823408127,
"epoch": 0.5932835820895522,
"grad_norm": 0.11966220289468765,
"learning_rate": 0.0002,
"loss": 0.5499304533004761,
"mean_token_accuracy": 0.7759323716163635,
"num_tokens": 2606611.0,
"step": 159
},
{
"entropy": 0.5380967259407043,
"epoch": 0.5970149253731343,
"grad_norm": 0.12815536558628082,
"learning_rate": 0.0002,
"loss": 0.5423661470413208,
"mean_token_accuracy": 0.7792660146951675,
"num_tokens": 2623057.0,
"step": 160
},
{
"entropy": 0.5472327321767807,
"epoch": 0.6007462686567164,
"grad_norm": 0.1232324093580246,
"learning_rate": 0.0002,
"loss": 0.5512628555297852,
"mean_token_accuracy": 0.7756103277206421,
"num_tokens": 2639412.0,
"step": 161
},
{
"entropy": 0.53459233045578,
"epoch": 0.6044776119402985,
"grad_norm": 0.1279020607471466,
"learning_rate": 0.0002,
"loss": 0.530642569065094,
"mean_token_accuracy": 0.784668356180191,
"num_tokens": 2655725.0,
"step": 162
},
{
"entropy": 0.5487090200185776,
"epoch": 0.6082089552238806,
"grad_norm": 0.11489348113536835,
"learning_rate": 0.0002,
"loss": 0.5467615127563477,
"mean_token_accuracy": 0.7774748206138611,
"num_tokens": 2671780.0,
"step": 163
},
{
"entropy": 0.5611004680395126,
"epoch": 0.6119402985074627,
"grad_norm": 0.12106446921825409,
"learning_rate": 0.0002,
"loss": 0.5621192455291748,
"mean_token_accuracy": 0.7757818549871445,
"num_tokens": 2688187.0,
"step": 164
},
{
"entropy": 0.5655875951051712,
"epoch": 0.6156716417910447,
"grad_norm": 0.11722180247306824,
"learning_rate": 0.0002,
"loss": 0.5597223043441772,
"mean_token_accuracy": 0.7729662656784058,
"num_tokens": 2704679.0,
"step": 165
},
{
"entropy": 0.5630869567394257,
"epoch": 0.6194029850746269,
"grad_norm": 0.1220882460474968,
"learning_rate": 0.0002,
"loss": 0.5666179060935974,
"mean_token_accuracy": 0.7716799974441528,
"num_tokens": 2721384.0,
"step": 166
},
{
"entropy": 0.5498328506946564,
"epoch": 0.6231343283582089,
"grad_norm": 0.12011860311031342,
"learning_rate": 0.0002,
"loss": 0.5489162802696228,
"mean_token_accuracy": 0.7789698839187622,
"num_tokens": 2737648.0,
"step": 167
},
{
"entropy": 0.5477638095617294,
"epoch": 0.6268656716417911,
"grad_norm": 0.11750344932079315,
"learning_rate": 0.0002,
"loss": 0.5432245135307312,
"mean_token_accuracy": 0.7796685546636581,
"num_tokens": 2753735.0,
"step": 168
},
{
"entropy": 0.5453169494867325,
"epoch": 0.6305970149253731,
"grad_norm": 0.11574184149503708,
"learning_rate": 0.0002,
"loss": 0.5411070585250854,
"mean_token_accuracy": 0.779533714056015,
"num_tokens": 2770229.0,
"step": 169
},
{
"entropy": 0.545142874121666,
"epoch": 0.6343283582089553,
"grad_norm": 0.13359719514846802,
"learning_rate": 0.0002,
"loss": 0.5482118129730225,
"mean_token_accuracy": 0.7763011008501053,
"num_tokens": 2786644.0,
"step": 170
},
{
"entropy": 0.5370890945196152,
"epoch": 0.6380597014925373,
"grad_norm": 0.14816807210445404,
"learning_rate": 0.0002,
"loss": 0.5420677661895752,
"mean_token_accuracy": 0.7803799211978912,
"num_tokens": 2802914.0,
"step": 171
},
{
"entropy": 0.5518854707479477,
"epoch": 0.6417910447761194,
"grad_norm": 0.1388852596282959,
"learning_rate": 0.0002,
"loss": 0.5512416958808899,
"mean_token_accuracy": 0.7771147638559341,
"num_tokens": 2819398.0,
"step": 172
},
{
"entropy": 0.5400035530328751,
"epoch": 0.6455223880597015,
"grad_norm": 0.1363624781370163,
"learning_rate": 0.0002,
"loss": 0.5326176881790161,
"mean_token_accuracy": 0.7852664589881897,
"num_tokens": 2835742.0,
"step": 173
},
{
"entropy": 0.5528566986322403,
"epoch": 0.6492537313432836,
"grad_norm": 0.13000693917274475,
"learning_rate": 0.0002,
"loss": 0.5492731928825378,
"mean_token_accuracy": 0.7760010659694672,
"num_tokens": 2852099.0,
"step": 174
},
{
"entropy": 0.5556752383708954,
"epoch": 0.6529850746268657,
"grad_norm": 0.11847010999917984,
"learning_rate": 0.0002,
"loss": 0.5595160722732544,
"mean_token_accuracy": 0.7731318473815918,
"num_tokens": 2868521.0,
"step": 175
},
{
"entropy": 0.5382126122713089,
"epoch": 0.6567164179104478,
"grad_norm": 0.13996672630310059,
"learning_rate": 0.0002,
"loss": 0.5406076312065125,
"mean_token_accuracy": 0.7809479385614395,
"num_tokens": 2884940.0,
"step": 176
},
{
"entropy": 0.5601803660392761,
"epoch": 0.6604477611940298,
"grad_norm": 0.17110760509967804,
"learning_rate": 0.0002,
"loss": 0.5693113207817078,
"mean_token_accuracy": 0.7711411267518997,
"num_tokens": 2901255.0,
"step": 177
},
{
"entropy": 0.5570882558822632,
"epoch": 0.664179104477612,
"grad_norm": 0.13338999450206757,
"learning_rate": 0.0002,
"loss": 0.5597653388977051,
"mean_token_accuracy": 0.7734159678220749,
"num_tokens": 2917815.0,
"step": 178
},
{
"entropy": 0.5541604459285736,
"epoch": 0.667910447761194,
"grad_norm": 0.15003007650375366,
"learning_rate": 0.0002,
"loss": 0.550830066204071,
"mean_token_accuracy": 0.773952454328537,
"num_tokens": 2934029.0,
"step": 179
},
{
"entropy": 0.5483301132917404,
"epoch": 0.6716417910447762,
"grad_norm": 0.13809660077095032,
"learning_rate": 0.0002,
"loss": 0.544836163520813,
"mean_token_accuracy": 0.7802225351333618,
"num_tokens": 2950186.0,
"step": 180
},
{
"entropy": 0.563317745923996,
"epoch": 0.6753731343283582,
"grad_norm": 0.11954832822084427,
"learning_rate": 0.0002,
"loss": 0.5579479932785034,
"mean_token_accuracy": 0.7754767686128616,
"num_tokens": 2966696.0,
"step": 181
},
{
"entropy": 0.5388910472393036,
"epoch": 0.6791044776119403,
"grad_norm": 0.1495479792356491,
"learning_rate": 0.0002,
"loss": 0.5441924929618835,
"mean_token_accuracy": 0.7800770252943039,
"num_tokens": 2982704.0,
"step": 182
},
{
"entropy": 0.5419297218322754,
"epoch": 0.6828358208955224,
"grad_norm": 0.13201352953910828,
"learning_rate": 0.0002,
"loss": 0.5452746152877808,
"mean_token_accuracy": 0.7787511199712753,
"num_tokens": 2998931.0,
"step": 183
},
{
"entropy": 0.5475537180900574,
"epoch": 0.6865671641791045,
"grad_norm": 0.11876624077558517,
"learning_rate": 0.0002,
"loss": 0.5537864565849304,
"mean_token_accuracy": 0.77639339864254,
"num_tokens": 3015465.0,
"step": 184
},
{
"entropy": 0.5443734228610992,
"epoch": 0.6902985074626866,
"grad_norm": 0.142917662858963,
"learning_rate": 0.0002,
"loss": 0.5402485728263855,
"mean_token_accuracy": 0.7805273532867432,
"num_tokens": 3031848.0,
"step": 185
},
{
"entropy": 0.5626855194568634,
"epoch": 0.6940298507462687,
"grad_norm": 0.12896916270256042,
"learning_rate": 0.0002,
"loss": 0.5567379593849182,
"mean_token_accuracy": 0.7732013463973999,
"num_tokens": 3048160.0,
"step": 186
},
{
"entropy": 0.5523503571748734,
"epoch": 0.6977611940298507,
"grad_norm": 0.13464562594890594,
"learning_rate": 0.0002,
"loss": 0.5460264086723328,
"mean_token_accuracy": 0.7796957343816757,
"num_tokens": 3064378.0,
"step": 187
},
{
"entropy": 0.5515571534633636,
"epoch": 0.7014925373134329,
"grad_norm": 0.1277887523174286,
"learning_rate": 0.0002,
"loss": 0.5548107028007507,
"mean_token_accuracy": 0.773384153842926,
"num_tokens": 3080909.0,
"step": 188
},
{
"entropy": 0.5496191382408142,
"epoch": 0.7052238805970149,
"grad_norm": 0.1543433964252472,
"learning_rate": 0.0002,
"loss": 0.5634362101554871,
"mean_token_accuracy": 0.7713208198547363,
"num_tokens": 3097164.0,
"step": 189
},
{
"entropy": 0.533801332116127,
"epoch": 0.7089552238805971,
"grad_norm": 0.1185467317700386,
"learning_rate": 0.0002,
"loss": 0.5395026206970215,
"mean_token_accuracy": 0.7796055674552917,
"num_tokens": 3113434.0,
"step": 190
},
{
"entropy": 0.5635387450456619,
"epoch": 0.7126865671641791,
"grad_norm": 0.12236445397138596,
"learning_rate": 0.0002,
"loss": 0.5628854632377625,
"mean_token_accuracy": 0.7733010798692703,
"num_tokens": 3129906.0,
"step": 191
},
{
"entropy": 0.5444195717573166,
"epoch": 0.7164179104477612,
"grad_norm": 0.1353861391544342,
"learning_rate": 0.0002,
"loss": 0.5396167039871216,
"mean_token_accuracy": 0.7793399095535278,
"num_tokens": 3145901.0,
"step": 192
},
{
"entropy": 0.5682615637779236,
"epoch": 0.7201492537313433,
"grad_norm": 0.11948243528604507,
"learning_rate": 0.0002,
"loss": 0.5587157011032104,
"mean_token_accuracy": 0.774067297577858,
"num_tokens": 3162257.0,
"step": 193
},
{
"entropy": 0.5397479832172394,
"epoch": 0.7238805970149254,
"grad_norm": 0.14794877171516418,
"learning_rate": 0.0002,
"loss": 0.5473200678825378,
"mean_token_accuracy": 0.7760735005140305,
"num_tokens": 3178362.0,
"step": 194
},
{
"entropy": 0.5612514019012451,
"epoch": 0.7276119402985075,
"grad_norm": 0.12478621304035187,
"learning_rate": 0.0002,
"loss": 0.5709495544433594,
"mean_token_accuracy": 0.771531730890274,
"num_tokens": 3195003.0,
"step": 195
},
{
"entropy": 0.5640581250190735,
"epoch": 0.7313432835820896,
"grad_norm": 0.13103285431861877,
"learning_rate": 0.0002,
"loss": 0.5633752942085266,
"mean_token_accuracy": 0.7763072997331619,
"num_tokens": 3211488.0,
"step": 196
},
{
"entropy": 0.5409631133079529,
"epoch": 0.7350746268656716,
"grad_norm": 0.11954586207866669,
"learning_rate": 0.0002,
"loss": 0.5412945747375488,
"mean_token_accuracy": 0.7807609885931015,
"num_tokens": 3227872.0,
"step": 197
},
{
"entropy": 0.5516713857650757,
"epoch": 0.7388059701492538,
"grad_norm": 0.1291007399559021,
"learning_rate": 0.0002,
"loss": 0.5551599264144897,
"mean_token_accuracy": 0.776901364326477,
"num_tokens": 3244275.0,
"step": 198
},
{
"entropy": 0.5520838648080826,
"epoch": 0.7425373134328358,
"grad_norm": 0.1325356811285019,
"learning_rate": 0.0002,
"loss": 0.5542269945144653,
"mean_token_accuracy": 0.7749388813972473,
"num_tokens": 3260730.0,
"step": 199
},
{
"entropy": 0.5531659871339798,
"epoch": 0.746268656716418,
"grad_norm": 0.11382137984037399,
"learning_rate": 0.0002,
"loss": 0.5500154495239258,
"mean_token_accuracy": 0.7769201993942261,
"num_tokens": 3277054.0,
"step": 200
},
{
"entropy": 0.5739943087100983,
"epoch": 0.75,
"grad_norm": 0.116433285176754,
"learning_rate": 0.0002,
"loss": 0.5693427920341492,
"mean_token_accuracy": 0.7700029015541077,
"num_tokens": 3293536.0,
"step": 201
},
{
"entropy": 0.5410773009061813,
"epoch": 0.753731343283582,
"grad_norm": 0.12128517776727676,
"learning_rate": 0.0002,
"loss": 0.5383925437927246,
"mean_token_accuracy": 0.7806861847639084,
"num_tokens": 3310044.0,
"step": 202
},
{
"entropy": 0.5345109105110168,
"epoch": 0.7574626865671642,
"grad_norm": 0.11475860327482224,
"learning_rate": 0.0002,
"loss": 0.5396114587783813,
"mean_token_accuracy": 0.7786486446857452,
"num_tokens": 3326424.0,
"step": 203
},
{
"entropy": 0.5596074312925339,
"epoch": 0.7611940298507462,
"grad_norm": 0.1144401878118515,
"learning_rate": 0.0002,
"loss": 0.559008777141571,
"mean_token_accuracy": 0.7744818329811096,
"num_tokens": 3342803.0,
"step": 204
},
{
"entropy": 0.5440013706684113,
"epoch": 0.7649253731343284,
"grad_norm": 0.117170050740242,
"learning_rate": 0.0002,
"loss": 0.5520018935203552,
"mean_token_accuracy": 0.7764452546834946,
"num_tokens": 3359289.0,
"step": 205
},
{
"entropy": 0.5440059304237366,
"epoch": 0.7686567164179104,
"grad_norm": 0.12146680057048798,
"learning_rate": 0.0002,
"loss": 0.543918251991272,
"mean_token_accuracy": 0.7812443971633911,
"num_tokens": 3375680.0,
"step": 206
},
{
"entropy": 0.559204563498497,
"epoch": 0.7723880597014925,
"grad_norm": 0.11677462607622147,
"learning_rate": 0.0002,
"loss": 0.5479013323783875,
"mean_token_accuracy": 0.7783834487199783,
"num_tokens": 3392230.0,
"step": 207
},
{
"entropy": 0.5695496201515198,
"epoch": 0.7761194029850746,
"grad_norm": 0.12663210928440094,
"learning_rate": 0.0002,
"loss": 0.5560157895088196,
"mean_token_accuracy": 0.7768621742725372,
"num_tokens": 3408667.0,
"step": 208
},
{
"entropy": 0.5218568593263626,
"epoch": 0.7798507462686567,
"grad_norm": 0.13396473228931427,
"learning_rate": 0.0002,
"loss": 0.5200244784355164,
"mean_token_accuracy": 0.7892128974199295,
"num_tokens": 3424766.0,
"step": 209
},
{
"entropy": 0.5524403154850006,
"epoch": 0.7835820895522388,
"grad_norm": 0.11780054867267609,
"learning_rate": 0.0002,
"loss": 0.5549524426460266,
"mean_token_accuracy": 0.7762513756752014,
"num_tokens": 3441010.0,
"step": 210
},
{
"entropy": 0.5339344441890717,
"epoch": 0.7873134328358209,
"grad_norm": 0.13986989855766296,
"learning_rate": 0.0002,
"loss": 0.5432649254798889,
"mean_token_accuracy": 0.7810570001602173,
"num_tokens": 3457051.0,
"step": 211
},
{
"entropy": 0.5393660813570023,
"epoch": 0.7910447761194029,
"grad_norm": 0.14846238493919373,
"learning_rate": 0.0002,
"loss": 0.5462239980697632,
"mean_token_accuracy": 0.7770469635725021,
"num_tokens": 3473237.0,
"step": 212
},
{
"entropy": 0.5482676774263382,
"epoch": 0.7947761194029851,
"grad_norm": 0.1279968023300171,
"learning_rate": 0.0002,
"loss": 0.5470429062843323,
"mean_token_accuracy": 0.7772368937730789,
"num_tokens": 3489557.0,
"step": 213
},
{
"entropy": 0.5750377625226974,
"epoch": 0.7985074626865671,
"grad_norm": 0.1574614942073822,
"learning_rate": 0.0002,
"loss": 0.5681816339492798,
"mean_token_accuracy": 0.7696330845355988,
"num_tokens": 3506111.0,
"step": 214
},
{
"entropy": 0.5552468150854111,
"epoch": 0.8022388059701493,
"grad_norm": 0.11573337018489838,
"learning_rate": 0.0002,
"loss": 0.5513306260108948,
"mean_token_accuracy": 0.7750436067581177,
"num_tokens": 3522546.0,
"step": 215
},
{
"entropy": 0.5544361621141434,
"epoch": 0.8059701492537313,
"grad_norm": 0.11837700754404068,
"learning_rate": 0.0002,
"loss": 0.553516685962677,
"mean_token_accuracy": 0.7765354365110397,
"num_tokens": 3539207.0,
"step": 216
},
{
"entropy": 0.5567323267459869,
"epoch": 0.8097014925373134,
"grad_norm": 0.15473680198192596,
"learning_rate": 0.0002,
"loss": 0.5699406862258911,
"mean_token_accuracy": 0.769306480884552,
"num_tokens": 3555606.0,
"step": 217
},
{
"entropy": 0.55356065928936,
"epoch": 0.8134328358208955,
"grad_norm": 0.10959180444478989,
"learning_rate": 0.0002,
"loss": 0.5509120225906372,
"mean_token_accuracy": 0.7775351405143738,
"num_tokens": 3571937.0,
"step": 218
},
{
"entropy": 0.5506166815757751,
"epoch": 0.8171641791044776,
"grad_norm": 0.1107836365699768,
"learning_rate": 0.0002,
"loss": 0.5498772859573364,
"mean_token_accuracy": 0.7781967967748642,
"num_tokens": 3588147.0,
"step": 219
},
{
"entropy": 0.5483623296022415,
"epoch": 0.8208955223880597,
"grad_norm": 0.12760840356349945,
"learning_rate": 0.0002,
"loss": 0.5440163016319275,
"mean_token_accuracy": 0.7794655859470367,
"num_tokens": 3604413.0,
"step": 220
},
{
"entropy": 0.5516934990882874,
"epoch": 0.8246268656716418,
"grad_norm": 0.13432522118091583,
"learning_rate": 0.0002,
"loss": 0.5498266220092773,
"mean_token_accuracy": 0.7779892683029175,
"num_tokens": 3620667.0,
"step": 221
},
{
"entropy": 0.5583075881004333,
"epoch": 0.8283582089552238,
"grad_norm": 0.1205005794763565,
"learning_rate": 0.0002,
"loss": 0.5606446266174316,
"mean_token_accuracy": 0.7730143070220947,
"num_tokens": 3637160.0,
"step": 222
},
{
"entropy": 0.5281430184841156,
"epoch": 0.832089552238806,
"grad_norm": 0.11834297329187393,
"learning_rate": 0.0002,
"loss": 0.5331573486328125,
"mean_token_accuracy": 0.7839753329753876,
"num_tokens": 3653562.0,
"step": 223
},
{
"entropy": 0.5474057644605637,
"epoch": 0.835820895522388,
"grad_norm": 0.12258574366569519,
"learning_rate": 0.0002,
"loss": 0.5449813604354858,
"mean_token_accuracy": 0.780377060174942,
"num_tokens": 3669951.0,
"step": 224
},
{
"entropy": 0.5545710325241089,
"epoch": 0.8395522388059702,
"grad_norm": 0.1338793784379959,
"learning_rate": 0.0002,
"loss": 0.5493278503417969,
"mean_token_accuracy": 0.7759524881839752,
"num_tokens": 3686193.0,
"step": 225
},
{
"entropy": 0.5437184125185013,
"epoch": 0.8432835820895522,
"grad_norm": 0.11655160784721375,
"learning_rate": 0.0002,
"loss": 0.5418398380279541,
"mean_token_accuracy": 0.7775491774082184,
"num_tokens": 3702353.0,
"step": 226
},
{
"entropy": 0.5532678067684174,
"epoch": 0.8470149253731343,
"grad_norm": 0.1549050509929657,
"learning_rate": 0.0002,
"loss": 0.5550553798675537,
"mean_token_accuracy": 0.7763772308826447,
"num_tokens": 3719232.0,
"step": 227
},
{
"entropy": 0.5559423863887787,
"epoch": 0.8507462686567164,
"grad_norm": 0.14761976897716522,
"learning_rate": 0.0002,
"loss": 0.5570894479751587,
"mean_token_accuracy": 0.772933155298233,
"num_tokens": 3735537.0,
"step": 228
},
{
"entropy": 0.5467868000268936,
"epoch": 0.8544776119402985,
"grad_norm": 0.1289997398853302,
"learning_rate": 0.0002,
"loss": 0.5503818988800049,
"mean_token_accuracy": 0.7735268622636795,
"num_tokens": 3751761.0,
"step": 229
},
{
"entropy": 0.5500779002904892,
"epoch": 0.8582089552238806,
"grad_norm": 0.1492077112197876,
"learning_rate": 0.0002,
"loss": 0.5505205392837524,
"mean_token_accuracy": 0.777638703584671,
"num_tokens": 3768182.0,
"step": 230
},
{
"entropy": 0.539194718003273,
"epoch": 0.8619402985074627,
"grad_norm": 0.11280067265033722,
"learning_rate": 0.0002,
"loss": 0.5417665243148804,
"mean_token_accuracy": 0.7794284075498581,
"num_tokens": 3784647.0,
"step": 231
},
{
"entropy": 0.5511510968208313,
"epoch": 0.8656716417910447,
"grad_norm": 0.13110041618347168,
"learning_rate": 0.0002,
"loss": 0.5588247776031494,
"mean_token_accuracy": 0.7747578173875809,
"num_tokens": 3801072.0,
"step": 232
},
{
"entropy": 0.5328868925571442,
"epoch": 0.8694029850746269,
"grad_norm": 0.11132191121578217,
"learning_rate": 0.0002,
"loss": 0.5321682095527649,
"mean_token_accuracy": 0.785084918141365,
"num_tokens": 3817270.0,
"step": 233
},
{
"entropy": 0.5497525930404663,
"epoch": 0.8731343283582089,
"grad_norm": 0.12497328221797943,
"learning_rate": 0.0002,
"loss": 0.5490625500679016,
"mean_token_accuracy": 0.7780804187059402,
"num_tokens": 3833650.0,
"step": 234
},
{
"entropy": 0.5649874210357666,
"epoch": 0.8768656716417911,
"grad_norm": 0.10820397734642029,
"learning_rate": 0.0002,
"loss": 0.5612732172012329,
"mean_token_accuracy": 0.7699918150901794,
"num_tokens": 3849965.0,
"step": 235
},
{
"entropy": 0.5564968436956406,
"epoch": 0.8805970149253731,
"grad_norm": 0.11200150102376938,
"learning_rate": 0.0002,
"loss": 0.5574247241020203,
"mean_token_accuracy": 0.7737843245267868,
"num_tokens": 3866325.0,
"step": 236
},
{
"entropy": 0.5345783978700638,
"epoch": 0.8843283582089553,
"grad_norm": 0.11046700924634933,
"learning_rate": 0.0002,
"loss": 0.5353702902793884,
"mean_token_accuracy": 0.7825029641389847,
"num_tokens": 3882836.0,
"step": 237
},
{
"entropy": 0.5462570339441299,
"epoch": 0.8880597014925373,
"grad_norm": 0.13713142275810242,
"learning_rate": 0.0002,
"loss": 0.5531303286552429,
"mean_token_accuracy": 0.775889053940773,
"num_tokens": 3899019.0,
"step": 238
},
{
"entropy": 0.5346651673316956,
"epoch": 0.8917910447761194,
"grad_norm": 0.11298073828220367,
"learning_rate": 0.0002,
"loss": 0.5383750796318054,
"mean_token_accuracy": 0.780723512172699,
"num_tokens": 3915451.0,
"step": 239
},
{
"entropy": 0.5661043077707291,
"epoch": 0.8955223880597015,
"grad_norm": 0.12630173563957214,
"learning_rate": 0.0002,
"loss": 0.5633317232131958,
"mean_token_accuracy": 0.7725178003311157,
"num_tokens": 3931857.0,
"step": 240
},
{
"entropy": 0.5499769002199173,
"epoch": 0.8992537313432836,
"grad_norm": 0.10539573431015015,
"learning_rate": 0.0002,
"loss": 0.5443609356880188,
"mean_token_accuracy": 0.7807674556970596,
"num_tokens": 3948251.0,
"step": 241
},
{
"entropy": 0.5542334765195847,
"epoch": 0.9029850746268657,
"grad_norm": 0.10860421508550644,
"learning_rate": 0.0002,
"loss": 0.5467254519462585,
"mean_token_accuracy": 0.7777283936738968,
"num_tokens": 3964506.0,
"step": 242
},
{
"entropy": 0.5593715906143188,
"epoch": 0.9067164179104478,
"grad_norm": 0.11269830167293549,
"learning_rate": 0.0002,
"loss": 0.5568402409553528,
"mean_token_accuracy": 0.7743813842535019,
"num_tokens": 3980991.0,
"step": 243
},
{
"entropy": 0.5386274456977844,
"epoch": 0.9104477611940298,
"grad_norm": 0.12022864073514938,
"learning_rate": 0.0002,
"loss": 0.538654088973999,
"mean_token_accuracy": 0.7814032137393951,
"num_tokens": 3997541.0,
"step": 244
},
{
"entropy": 0.5274675115942955,
"epoch": 0.914179104477612,
"grad_norm": 0.14818064868450165,
"learning_rate": 0.0002,
"loss": 0.5381026268005371,
"mean_token_accuracy": 0.7816068381071091,
"num_tokens": 4013664.0,
"step": 245
},
{
"entropy": 0.5379235744476318,
"epoch": 0.917910447761194,
"grad_norm": 0.1228220984339714,
"learning_rate": 0.0002,
"loss": 0.5409340858459473,
"mean_token_accuracy": 0.7790304571390152,
"num_tokens": 4029963.0,
"step": 246
},
{
"entropy": 0.5446107536554337,
"epoch": 0.9216417910447762,
"grad_norm": 0.12891873717308044,
"learning_rate": 0.0002,
"loss": 0.5515777468681335,
"mean_token_accuracy": 0.7764184921979904,
"num_tokens": 4046258.0,
"step": 247
},
{
"entropy": 0.5525491833686829,
"epoch": 0.9253731343283582,
"grad_norm": 0.1355786919593811,
"learning_rate": 0.0002,
"loss": 0.5416724681854248,
"mean_token_accuracy": 0.7802292257547379,
"num_tokens": 4062506.0,
"step": 248
},
{
"entropy": 0.536956250667572,
"epoch": 0.9291044776119403,
"grad_norm": 0.12736709415912628,
"learning_rate": 0.0002,
"loss": 0.5312113761901855,
"mean_token_accuracy": 0.783654510974884,
"num_tokens": 4078661.0,
"step": 249
},
{
"entropy": 0.5549832433462143,
"epoch": 0.9328358208955224,
"grad_norm": 0.12017148733139038,
"learning_rate": 0.0002,
"loss": 0.5565866827964783,
"mean_token_accuracy": 0.773817777633667,
"num_tokens": 4095022.0,
"step": 250
},
{
"entropy": 0.5422243773937225,
"epoch": 0.9365671641791045,
"grad_norm": 0.13573786616325378,
"learning_rate": 0.0002,
"loss": 0.5521195530891418,
"mean_token_accuracy": 0.7785970866680145,
"num_tokens": 4111402.0,
"step": 251
},
{
"entropy": 0.5538443177938461,
"epoch": 0.9402985074626866,
"grad_norm": 0.11428782343864441,
"learning_rate": 0.0002,
"loss": 0.5559377670288086,
"mean_token_accuracy": 0.7728682309389114,
"num_tokens": 4127625.0,
"step": 252
},
{
"entropy": 0.5606874525547028,
"epoch": 0.9440298507462687,
"grad_norm": 0.11228293180465698,
"learning_rate": 0.0002,
"loss": 0.5537079572677612,
"mean_token_accuracy": 0.7777886986732483,
"num_tokens": 4144209.0,
"step": 253
},
{
"entropy": 0.5587089955806732,
"epoch": 0.9477611940298507,
"grad_norm": 0.11430441588163376,
"learning_rate": 0.0002,
"loss": 0.5511766672134399,
"mean_token_accuracy": 0.7764836251735687,
"num_tokens": 4160587.0,
"step": 254
},
{
"entropy": 0.5543984770774841,
"epoch": 0.9514925373134329,
"grad_norm": 0.11914564669132233,
"learning_rate": 0.0002,
"loss": 0.5457825064659119,
"mean_token_accuracy": 0.7772367298603058,
"num_tokens": 4177078.0,
"step": 255
},
{
"entropy": 0.5496934354305267,
"epoch": 0.9552238805970149,
"grad_norm": 0.11808159202337265,
"learning_rate": 0.0002,
"loss": 0.5523373484611511,
"mean_token_accuracy": 0.7758414000272751,
"num_tokens": 4193671.0,
"step": 256
},
{
"entropy": 0.5323416441679001,
"epoch": 0.9589552238805971,
"grad_norm": 0.12709033489227295,
"learning_rate": 0.0002,
"loss": 0.5384759902954102,
"mean_token_accuracy": 0.7808651477098465,
"num_tokens": 4210085.0,
"step": 257
},
{
"entropy": 0.5338983610272408,
"epoch": 0.9626865671641791,
"grad_norm": 0.13908886909484863,
"learning_rate": 0.0002,
"loss": 0.5462735891342163,
"mean_token_accuracy": 0.7780435681343079,
"num_tokens": 4226494.0,
"step": 258
},
{
"entropy": 0.5453044772148132,
"epoch": 0.9664179104477612,
"grad_norm": 0.12644866108894348,
"learning_rate": 0.0002,
"loss": 0.551929235458374,
"mean_token_accuracy": 0.775839775800705,
"num_tokens": 4242785.0,
"step": 259
},
{
"entropy": 0.5603075176477432,
"epoch": 0.9701492537313433,
"grad_norm": 0.12755440175533295,
"learning_rate": 0.0002,
"loss": 0.5524581670761108,
"mean_token_accuracy": 0.7771914452314377,
"num_tokens": 4259299.0,
"step": 260
},
{
"entropy": 0.5615698993206024,
"epoch": 0.9738805970149254,
"grad_norm": 0.12908904254436493,
"learning_rate": 0.0002,
"loss": 0.5537154078483582,
"mean_token_accuracy": 0.7739745527505875,
"num_tokens": 4275749.0,
"step": 261
},
{
"entropy": 0.5526564866304398,
"epoch": 0.9776119402985075,
"grad_norm": 0.10715582221746445,
"learning_rate": 0.0002,
"loss": 0.5478145480155945,
"mean_token_accuracy": 0.7770287841558456,
"num_tokens": 4291706.0,
"step": 262
},
{
"entropy": 0.5461979508399963,
"epoch": 0.9813432835820896,
"grad_norm": 0.14307166635990143,
"learning_rate": 0.0002,
"loss": 0.5454379916191101,
"mean_token_accuracy": 0.7798766791820526,
"num_tokens": 4308137.0,
"step": 263
},
{
"entropy": 0.5203245729207993,
"epoch": 0.9850746268656716,
"grad_norm": 0.15710005164146423,
"learning_rate": 0.0002,
"loss": 0.5299646258354187,
"mean_token_accuracy": 0.7843145579099655,
"num_tokens": 4324411.0,
"step": 264
},
{
"entropy": 0.5302061140537262,
"epoch": 0.9888059701492538,
"grad_norm": 0.1519300937652588,
"learning_rate": 0.0002,
"loss": 0.5403961539268494,
"mean_token_accuracy": 0.7806786000728607,
"num_tokens": 4340384.0,
"step": 265
},
{
"entropy": 0.5364599078893661,
"epoch": 0.9925373134328358,
"grad_norm": 0.13450899720191956,
"learning_rate": 0.0002,
"loss": 0.5356532335281372,
"mean_token_accuracy": 0.7834792584180832,
"num_tokens": 4356954.0,
"step": 266
},
{
"entropy": 0.5519508272409439,
"epoch": 0.996268656716418,
"grad_norm": 0.13190409541130066,
"learning_rate": 0.0002,
"loss": 0.5425809621810913,
"mean_token_accuracy": 0.7814677059650421,
"num_tokens": 4373557.0,
"step": 267
},
{
"entropy": 0.5717380940914154,
"epoch": 1.0,
"grad_norm": 0.13511350750923157,
"learning_rate": 0.0002,
"loss": 0.5594110488891602,
"mean_token_accuracy": 0.7763755470514297,
"num_tokens": 4390028.0,
"step": 268
},
{
"entropy": 0.5333094298839569,
"epoch": 1.0037313432835822,
"grad_norm": 0.11232882738113403,
"learning_rate": 0.0002,
"loss": 0.5279825925827026,
"mean_token_accuracy": 0.7831753939390182,
"num_tokens": 4406075.0,
"step": 269
},
{
"entropy": 0.5085988268256187,
"epoch": 1.007462686567164,
"grad_norm": 0.1554645448923111,
"learning_rate": 0.0002,
"loss": 0.516677737236023,
"mean_token_accuracy": 0.7916137427091599,
"num_tokens": 4422444.0,
"step": 270
},
{
"entropy": 0.5372590869665146,
"epoch": 1.0111940298507462,
"grad_norm": 0.14206163585186005,
"learning_rate": 0.0002,
"loss": 0.542325496673584,
"mean_token_accuracy": 0.7813751995563507,
"num_tokens": 4438619.0,
"step": 271
},
{
"entropy": 0.5327645987272263,
"epoch": 1.0149253731343284,
"grad_norm": 0.12639598548412323,
"learning_rate": 0.0002,
"loss": 0.5381733775138855,
"mean_token_accuracy": 0.7798869907855988,
"num_tokens": 4455013.0,
"step": 272
},
{
"entropy": 0.5318270623683929,
"epoch": 1.0186567164179103,
"grad_norm": 0.14597581326961517,
"learning_rate": 0.0002,
"loss": 0.5323677659034729,
"mean_token_accuracy": 0.7859037518501282,
"num_tokens": 4471596.0,
"step": 273
},
{
"entropy": 0.549939751625061,
"epoch": 1.0223880597014925,
"grad_norm": 0.14265935122966766,
"learning_rate": 0.0002,
"loss": 0.5377833247184753,
"mean_token_accuracy": 0.7833307683467865,
"num_tokens": 4487885.0,
"step": 274
},
{
"entropy": 0.549922838807106,
"epoch": 1.0261194029850746,
"grad_norm": 0.1281050145626068,
"learning_rate": 0.0002,
"loss": 0.5483719706535339,
"mean_token_accuracy": 0.7763915956020355,
"num_tokens": 4504279.0,
"step": 275
},
{
"entropy": 0.5519027858972549,
"epoch": 1.0298507462686568,
"grad_norm": 0.13199536502361298,
"learning_rate": 0.0002,
"loss": 0.5520401000976562,
"mean_token_accuracy": 0.7754272371530533,
"num_tokens": 4520877.0,
"step": 276
},
{
"entropy": 0.5326957255601883,
"epoch": 1.0335820895522387,
"grad_norm": 0.13716775178909302,
"learning_rate": 0.0002,
"loss": 0.5377839207649231,
"mean_token_accuracy": 0.77959144115448,
"num_tokens": 4537306.0,
"step": 277
},
{
"entropy": 0.5343386679887772,
"epoch": 1.037313432835821,
"grad_norm": 0.12250324338674545,
"learning_rate": 0.0002,
"loss": 0.5346370935440063,
"mean_token_accuracy": 0.7819696217775345,
"num_tokens": 4553694.0,
"step": 278
},
{
"entropy": 0.5221862643957138,
"epoch": 1.041044776119403,
"grad_norm": 0.14083418250083923,
"learning_rate": 0.0002,
"loss": 0.5204699039459229,
"mean_token_accuracy": 0.7915231883525848,
"num_tokens": 4569929.0,
"step": 279
},
{
"entropy": 0.5506787896156311,
"epoch": 1.044776119402985,
"grad_norm": 0.11459501832723618,
"learning_rate": 0.0002,
"loss": 0.5497503280639648,
"mean_token_accuracy": 0.7762598097324371,
"num_tokens": 4586327.0,
"step": 280
},
{
"entropy": 0.5387643724679947,
"epoch": 1.0485074626865671,
"grad_norm": 0.1149069145321846,
"learning_rate": 0.0002,
"loss": 0.536687970161438,
"mean_token_accuracy": 0.7849635928869247,
"num_tokens": 4602577.0,
"step": 281
},
{
"entropy": 0.5402974784374237,
"epoch": 1.0522388059701493,
"grad_norm": 0.13960953056812286,
"learning_rate": 0.0002,
"loss": 0.5357297658920288,
"mean_token_accuracy": 0.782235711812973,
"num_tokens": 4618829.0,
"step": 282
},
{
"entropy": 0.5379159897565842,
"epoch": 1.0559701492537314,
"grad_norm": 0.12440282106399536,
"learning_rate": 0.0002,
"loss": 0.5391443967819214,
"mean_token_accuracy": 0.7829291224479675,
"num_tokens": 4635167.0,
"step": 283
},
{
"entropy": 0.5129481852054596,
"epoch": 1.0597014925373134,
"grad_norm": 0.13519050180912018,
"learning_rate": 0.0002,
"loss": 0.5105025768280029,
"mean_token_accuracy": 0.7926614433526993,
"num_tokens": 4651165.0,
"step": 284
},
{
"entropy": 0.5542086809873581,
"epoch": 1.0634328358208955,
"grad_norm": 0.14323101937770844,
"learning_rate": 0.0002,
"loss": 0.5622052550315857,
"mean_token_accuracy": 0.7727599292993546,
"num_tokens": 4667347.0,
"step": 285
},
{
"entropy": 0.5243228375911713,
"epoch": 1.0671641791044777,
"grad_norm": 0.1330215483903885,
"learning_rate": 0.0002,
"loss": 0.5247523188591003,
"mean_token_accuracy": 0.7867335379123688,
"num_tokens": 4684015.0,
"step": 286
},
{
"entropy": 0.5412201136350632,
"epoch": 1.0708955223880596,
"grad_norm": 0.13448479771614075,
"learning_rate": 0.0002,
"loss": 0.54647296667099,
"mean_token_accuracy": 0.7774277031421661,
"num_tokens": 4700242.0,
"step": 287
},
{
"entropy": 0.5454149097204208,
"epoch": 1.0746268656716418,
"grad_norm": 0.13259278237819672,
"learning_rate": 0.0002,
"loss": 0.5461288690567017,
"mean_token_accuracy": 0.7782861590385437,
"num_tokens": 4716442.0,
"step": 288
},
{
"entropy": 0.526309534907341,
"epoch": 1.078358208955224,
"grad_norm": 0.12522561848163605,
"learning_rate": 0.0002,
"loss": 0.5221973061561584,
"mean_token_accuracy": 0.789994552731514,
"num_tokens": 4732742.0,
"step": 289
},
{
"entropy": 0.5411332100629807,
"epoch": 1.0820895522388059,
"grad_norm": 0.12081784009933472,
"learning_rate": 0.0002,
"loss": 0.5372704863548279,
"mean_token_accuracy": 0.7822500914335251,
"num_tokens": 4749084.0,
"step": 290
},
{
"entropy": 0.5575008988380432,
"epoch": 1.085820895522388,
"grad_norm": 0.11303576827049255,
"learning_rate": 0.0002,
"loss": 0.5508702397346497,
"mean_token_accuracy": 0.7754259258508682,
"num_tokens": 4765562.0,
"step": 291
},
{
"entropy": 0.5357666164636612,
"epoch": 1.0895522388059702,
"grad_norm": 0.12666599452495575,
"learning_rate": 0.0002,
"loss": 0.5432624220848083,
"mean_token_accuracy": 0.7804068475961685,
"num_tokens": 4781995.0,
"step": 292
},
{
"entropy": 0.5331733524799347,
"epoch": 1.0932835820895523,
"grad_norm": 0.12246809899806976,
"learning_rate": 0.0002,
"loss": 0.5331196784973145,
"mean_token_accuracy": 0.7823672741651535,
"num_tokens": 4798355.0,
"step": 293
},
{
"entropy": 0.531685009598732,
"epoch": 1.0970149253731343,
"grad_norm": 0.12172231823205948,
"learning_rate": 0.0002,
"loss": 0.5293748378753662,
"mean_token_accuracy": 0.7843722105026245,
"num_tokens": 4814357.0,
"step": 294
},
{
"entropy": 0.554166242480278,
"epoch": 1.1007462686567164,
"grad_norm": 0.14191463589668274,
"learning_rate": 0.0002,
"loss": 0.5532712936401367,
"mean_token_accuracy": 0.7733844220638275,
"num_tokens": 4830954.0,
"step": 295
},
{
"entropy": 0.5282094776630402,
"epoch": 1.1044776119402986,
"grad_norm": 0.14205436408519745,
"learning_rate": 0.0002,
"loss": 0.530907392501831,
"mean_token_accuracy": 0.7830108106136322,
"num_tokens": 4847654.0,
"step": 296
},
{
"entropy": 0.5379532426595688,
"epoch": 1.1082089552238805,
"grad_norm": 0.12750715017318726,
"learning_rate": 0.0002,
"loss": 0.5367629528045654,
"mean_token_accuracy": 0.7796261459589005,
"num_tokens": 4864209.0,
"step": 297
},
{
"entropy": 0.5312085449695587,
"epoch": 1.1119402985074627,
"grad_norm": 0.11801420152187347,
"learning_rate": 0.0002,
"loss": 0.5278028845787048,
"mean_token_accuracy": 0.7856296449899673,
"num_tokens": 4880489.0,
"step": 298
},
{
"entropy": 0.5340657457709312,
"epoch": 1.1156716417910448,
"grad_norm": 0.1341157853603363,
"learning_rate": 0.0002,
"loss": 0.5332481265068054,
"mean_token_accuracy": 0.7815297544002533,
"num_tokens": 4897040.0,
"step": 299
},
{
"entropy": 0.5495938658714294,
"epoch": 1.1194029850746268,
"grad_norm": 0.15130798518657684,
"learning_rate": 0.0002,
"loss": 0.5522593855857849,
"mean_token_accuracy": 0.7767154276371002,
"num_tokens": 4913499.0,
"step": 300
},
{
"entropy": 0.5539788007736206,
"epoch": 1.123134328358209,
"grad_norm": 0.16235828399658203,
"learning_rate": 0.0002,
"loss": 0.556696891784668,
"mean_token_accuracy": 0.7743791192770004,
"num_tokens": 4930129.0,
"step": 301
},
{
"entropy": 0.5188294276595116,
"epoch": 1.126865671641791,
"grad_norm": 0.15251989662647247,
"learning_rate": 0.0002,
"loss": 0.5240339040756226,
"mean_token_accuracy": 0.7848995476961136,
"num_tokens": 4946505.0,
"step": 302
},
{
"entropy": 0.5330336540937424,
"epoch": 1.1305970149253732,
"grad_norm": 0.12010055035352707,
"learning_rate": 0.0002,
"loss": 0.530551552772522,
"mean_token_accuracy": 0.7852707505226135,
"num_tokens": 4963130.0,
"step": 303
},
{
"entropy": 0.5485537797212601,
"epoch": 1.1343283582089552,
"grad_norm": 0.12690100073814392,
"learning_rate": 0.0002,
"loss": 0.5355115532875061,
"mean_token_accuracy": 0.7832664847373962,
"num_tokens": 4979396.0,
"step": 304
},
{
"entropy": 0.5363626033067703,
"epoch": 1.1380597014925373,
"grad_norm": 0.12670499086380005,
"learning_rate": 0.0002,
"loss": 0.5318777561187744,
"mean_token_accuracy": 0.7821652144193649,
"num_tokens": 4995808.0,
"step": 305
},
{
"entropy": 0.556913822889328,
"epoch": 1.1417910447761195,
"grad_norm": 0.1417754739522934,
"learning_rate": 0.0002,
"loss": 0.5632070899009705,
"mean_token_accuracy": 0.7711838483810425,
"num_tokens": 5012247.0,
"step": 306
},
{
"entropy": 0.531732589006424,
"epoch": 1.1455223880597014,
"grad_norm": 0.12725508213043213,
"learning_rate": 0.0002,
"loss": 0.5370599627494812,
"mean_token_accuracy": 0.7827656418085098,
"num_tokens": 5028592.0,
"step": 307
},
{
"entropy": 0.5216507539153099,
"epoch": 1.1492537313432836,
"grad_norm": 0.14518076181411743,
"learning_rate": 0.0002,
"loss": 0.5285972952842712,
"mean_token_accuracy": 0.7866590619087219,
"num_tokens": 5044691.0,
"step": 308
},
{
"entropy": 0.5357843339443207,
"epoch": 1.1529850746268657,
"grad_norm": 0.14331640303134918,
"learning_rate": 0.0002,
"loss": 0.5414748191833496,
"mean_token_accuracy": 0.7796436995267868,
"num_tokens": 5060981.0,
"step": 309
},
{
"entropy": 0.550069585442543,
"epoch": 1.1567164179104479,
"grad_norm": 0.1419994831085205,
"learning_rate": 0.0002,
"loss": 0.5494908690452576,
"mean_token_accuracy": 0.774166613817215,
"num_tokens": 5077445.0,
"step": 310
},
{
"entropy": 0.5334684997797012,
"epoch": 1.1604477611940298,
"grad_norm": 0.13464997708797455,
"learning_rate": 0.0002,
"loss": 0.5329424738883972,
"mean_token_accuracy": 0.7852184623479843,
"num_tokens": 5093959.0,
"step": 311
},
{
"entropy": 0.5384779423475266,
"epoch": 1.164179104477612,
"grad_norm": 0.12344568222761154,
"learning_rate": 0.0002,
"loss": 0.5393214821815491,
"mean_token_accuracy": 0.783161386847496,
"num_tokens": 5110114.0,
"step": 312
},
{
"entropy": 0.566596269607544,
"epoch": 1.1679104477611941,
"grad_norm": 0.13426469266414642,
"learning_rate": 0.0002,
"loss": 0.5611933469772339,
"mean_token_accuracy": 0.7707538902759552,
"num_tokens": 5126500.0,
"step": 313
},
{
"entropy": 0.5522208511829376,
"epoch": 1.171641791044776,
"grad_norm": 0.11628863960504532,
"learning_rate": 0.0002,
"loss": 0.544135332107544,
"mean_token_accuracy": 0.7789785116910934,
"num_tokens": 5143003.0,
"step": 314
},
{
"entropy": 0.5286403447389603,
"epoch": 1.1753731343283582,
"grad_norm": 0.1331920623779297,
"learning_rate": 0.0002,
"loss": 0.5280863046646118,
"mean_token_accuracy": 0.7847232520580292,
"num_tokens": 5159209.0,
"step": 315
},
{
"entropy": 0.5208230093121529,
"epoch": 1.1791044776119404,
"grad_norm": 0.16730330884456635,
"learning_rate": 0.0002,
"loss": 0.5261422395706177,
"mean_token_accuracy": 0.7885824292898178,
"num_tokens": 5175336.0,
"step": 316
},
{
"entropy": 0.5139501839876175,
"epoch": 1.1828358208955223,
"grad_norm": 0.17113769054412842,
"learning_rate": 0.0002,
"loss": 0.5231570601463318,
"mean_token_accuracy": 0.7852117121219635,
"num_tokens": 5191589.0,
"step": 317
},
{
"entropy": 0.5446046590805054,
"epoch": 1.1865671641791045,
"grad_norm": 0.13907761871814728,
"learning_rate": 0.0002,
"loss": 0.5399054288864136,
"mean_token_accuracy": 0.7820506691932678,
"num_tokens": 5207939.0,
"step": 318
},
{
"entropy": 0.5267596393823624,
"epoch": 1.1902985074626866,
"grad_norm": 0.1434536576271057,
"learning_rate": 0.0002,
"loss": 0.5265440344810486,
"mean_token_accuracy": 0.7849590480327606,
"num_tokens": 5224274.0,
"step": 319
},
{
"entropy": 0.5274358987808228,
"epoch": 1.1940298507462686,
"grad_norm": 0.1331617832183838,
"learning_rate": 0.0002,
"loss": 0.5201226472854614,
"mean_token_accuracy": 0.7877639383077621,
"num_tokens": 5240488.0,
"step": 320
},
{
"entropy": 0.5438350588083267,
"epoch": 1.1977611940298507,
"grad_norm": 0.13051791489124298,
"learning_rate": 0.0002,
"loss": 0.5417760610580444,
"mean_token_accuracy": 0.7801128923892975,
"num_tokens": 5256913.0,
"step": 321
},
{
"entropy": 0.5419559478759766,
"epoch": 1.2014925373134329,
"grad_norm": 0.1651846319437027,
"learning_rate": 0.0002,
"loss": 0.5418766140937805,
"mean_token_accuracy": 0.78228460252285,
"num_tokens": 5273335.0,
"step": 322
},
{
"entropy": 0.5415368527173996,
"epoch": 1.205223880597015,
"grad_norm": 0.16951487958431244,
"learning_rate": 0.0002,
"loss": 0.5506861209869385,
"mean_token_accuracy": 0.7753586024045944,
"num_tokens": 5289759.0,
"step": 323
},
{
"entropy": 0.5358785539865494,
"epoch": 1.208955223880597,
"grad_norm": 0.1276499480009079,
"learning_rate": 0.0002,
"loss": 0.536015510559082,
"mean_token_accuracy": 0.7820306271314621,
"num_tokens": 5305982.0,
"step": 324
},
{
"entropy": 0.5399276316165924,
"epoch": 1.212686567164179,
"grad_norm": 0.13910017907619476,
"learning_rate": 0.0002,
"loss": 0.5390846133232117,
"mean_token_accuracy": 0.7822140157222748,
"num_tokens": 5322089.0,
"step": 325
},
{
"entropy": 0.54273721575737,
"epoch": 1.2164179104477613,
"grad_norm": 0.14252571761608124,
"learning_rate": 0.0002,
"loss": 0.544661283493042,
"mean_token_accuracy": 0.7795404642820358,
"num_tokens": 5338453.0,
"step": 326
},
{
"entropy": 0.5249434560537338,
"epoch": 1.2201492537313432,
"grad_norm": 0.1477581411600113,
"learning_rate": 0.0002,
"loss": 0.5217203497886658,
"mean_token_accuracy": 0.7876597344875336,
"num_tokens": 5354700.0,
"step": 327
},
{
"entropy": 0.5396385788917542,
"epoch": 1.2238805970149254,
"grad_norm": 0.14778634905815125,
"learning_rate": 0.0002,
"loss": 0.5354180335998535,
"mean_token_accuracy": 0.7824464589357376,
"num_tokens": 5371063.0,
"step": 328
},
{
"entropy": 0.5529858469963074,
"epoch": 1.2276119402985075,
"grad_norm": 0.13042840361595154,
"learning_rate": 0.0002,
"loss": 0.5544819831848145,
"mean_token_accuracy": 0.7761342972517014,
"num_tokens": 5387332.0,
"step": 329
},
{
"entropy": 0.5454379618167877,
"epoch": 1.2313432835820897,
"grad_norm": 0.15361081063747406,
"learning_rate": 0.0002,
"loss": 0.5482691526412964,
"mean_token_accuracy": 0.7785263955593109,
"num_tokens": 5403888.0,
"step": 330
},
{
"entropy": 0.5411872565746307,
"epoch": 1.2350746268656716,
"grad_norm": 0.1457548439502716,
"learning_rate": 0.0002,
"loss": 0.5460063219070435,
"mean_token_accuracy": 0.7781393676996231,
"num_tokens": 5420504.0,
"step": 331
},
{
"entropy": 0.5440556704998016,
"epoch": 1.2388059701492538,
"grad_norm": 0.17071455717086792,
"learning_rate": 0.0002,
"loss": 0.5447981357574463,
"mean_token_accuracy": 0.7792220860719681,
"num_tokens": 5436983.0,
"step": 332
},
{
"entropy": 0.5312773138284683,
"epoch": 1.242537313432836,
"grad_norm": 0.15535041689872742,
"learning_rate": 0.0002,
"loss": 0.5284558534622192,
"mean_token_accuracy": 0.7843498289585114,
"num_tokens": 5453439.0,
"step": 333
},
{
"entropy": 0.5413801819086075,
"epoch": 1.2462686567164178,
"grad_norm": 0.12389594316482544,
"learning_rate": 0.0002,
"loss": 0.5376867651939392,
"mean_token_accuracy": 0.7829112410545349,
"num_tokens": 5470171.0,
"step": 334
},
{
"entropy": 0.5580787807703018,
"epoch": 1.25,
"grad_norm": 0.15255525708198547,
"learning_rate": 0.0002,
"loss": 0.5539383292198181,
"mean_token_accuracy": 0.7776496410369873,
"num_tokens": 5486721.0,
"step": 335
},
{
"entropy": 0.551739051938057,
"epoch": 1.2537313432835822,
"grad_norm": 0.14014676213264465,
"learning_rate": 0.0002,
"loss": 0.5544667840003967,
"mean_token_accuracy": 0.7750911116600037,
"num_tokens": 5502822.0,
"step": 336
},
{
"entropy": 0.5480811297893524,
"epoch": 1.2574626865671643,
"grad_norm": 0.1353754997253418,
"learning_rate": 0.0002,
"loss": 0.5507966876029968,
"mean_token_accuracy": 0.7761414647102356,
"num_tokens": 5519323.0,
"step": 337
},
{
"entropy": 0.5414211302995682,
"epoch": 1.2611940298507462,
"grad_norm": 0.1243680939078331,
"learning_rate": 0.0002,
"loss": 0.5453186631202698,
"mean_token_accuracy": 0.7782161980867386,
"num_tokens": 5535863.0,
"step": 338
},
{
"entropy": 0.527251847088337,
"epoch": 1.2649253731343284,
"grad_norm": 0.1459769904613495,
"learning_rate": 0.0002,
"loss": 0.5396205186843872,
"mean_token_accuracy": 0.7795730829238892,
"num_tokens": 5552171.0,
"step": 339
},
{
"entropy": 0.5239678472280502,
"epoch": 1.2686567164179103,
"grad_norm": 0.12427864223718643,
"learning_rate": 0.0002,
"loss": 0.5271449089050293,
"mean_token_accuracy": 0.7882652282714844,
"num_tokens": 5568175.0,
"step": 340
},
{
"entropy": 0.543644979596138,
"epoch": 1.2723880597014925,
"grad_norm": 0.11923787742853165,
"learning_rate": 0.0002,
"loss": 0.5382894277572632,
"mean_token_accuracy": 0.7825156450271606,
"num_tokens": 5584465.0,
"step": 341
},
{
"entropy": 0.5515155345201492,
"epoch": 1.2761194029850746,
"grad_norm": 0.11743160337209702,
"learning_rate": 0.0002,
"loss": 0.5425710082054138,
"mean_token_accuracy": 0.7795869261026382,
"num_tokens": 5601282.0,
"step": 342
},
{
"entropy": 0.556594654917717,
"epoch": 1.2798507462686568,
"grad_norm": 0.13206258416175842,
"learning_rate": 0.0002,
"loss": 0.553520679473877,
"mean_token_accuracy": 0.7744052857160568,
"num_tokens": 5617511.0,
"step": 343
},
{
"entropy": 0.5562093108892441,
"epoch": 1.2835820895522387,
"grad_norm": 0.1419561356306076,
"learning_rate": 0.0002,
"loss": 0.5573539733886719,
"mean_token_accuracy": 0.7758442610502243,
"num_tokens": 5634008.0,
"step": 344
},
{
"entropy": 0.5295949876308441,
"epoch": 1.287313432835821,
"grad_norm": 0.136697456240654,
"learning_rate": 0.0002,
"loss": 0.536439836025238,
"mean_token_accuracy": 0.7857220619916916,
"num_tokens": 5650510.0,
"step": 345
},
{
"entropy": 0.5379302501678467,
"epoch": 1.291044776119403,
"grad_norm": 0.12953169643878937,
"learning_rate": 0.0002,
"loss": 0.5420789122581482,
"mean_token_accuracy": 0.7796627283096313,
"num_tokens": 5667049.0,
"step": 346
},
{
"entropy": 0.5327381789684296,
"epoch": 1.294776119402985,
"grad_norm": 0.12574538588523865,
"learning_rate": 0.0002,
"loss": 0.5231812000274658,
"mean_token_accuracy": 0.7879898250102997,
"num_tokens": 5683103.0,
"step": 347
},
{
"entropy": 0.5485990345478058,
"epoch": 1.2985074626865671,
"grad_norm": 0.12788420915603638,
"learning_rate": 0.0002,
"loss": 0.5398032665252686,
"mean_token_accuracy": 0.782793402671814,
"num_tokens": 5699531.0,
"step": 348
},
{
"entropy": 0.533822700381279,
"epoch": 1.3022388059701493,
"grad_norm": 0.12131965160369873,
"learning_rate": 0.0002,
"loss": 0.5313589572906494,
"mean_token_accuracy": 0.7867582440376282,
"num_tokens": 5715578.0,
"step": 349
},
{
"entropy": 0.5322218984365463,
"epoch": 1.3059701492537314,
"grad_norm": 0.13636337220668793,
"learning_rate": 0.0002,
"loss": 0.5401290655136108,
"mean_token_accuracy": 0.781011775135994,
"num_tokens": 5731885.0,
"step": 350
},
{
"entropy": 0.5119979977607727,
"epoch": 1.3097014925373134,
"grad_norm": 0.1538715660572052,
"learning_rate": 0.0002,
"loss": 0.5197798013687134,
"mean_token_accuracy": 0.787521705031395,
"num_tokens": 5748165.0,
"step": 351
},
{
"entropy": 0.522780068218708,
"epoch": 1.3134328358208955,
"grad_norm": 0.16598650813102722,
"learning_rate": 0.0002,
"loss": 0.5323340892791748,
"mean_token_accuracy": 0.7844688296318054,
"num_tokens": 5764530.0,
"step": 352
},
{
"entropy": 0.5400198400020599,
"epoch": 1.3171641791044777,
"grad_norm": 0.13400353491306305,
"learning_rate": 0.0002,
"loss": 0.5443472266197205,
"mean_token_accuracy": 0.7780963182449341,
"num_tokens": 5780899.0,
"step": 353
},
{
"entropy": 0.556030884385109,
"epoch": 1.3208955223880596,
"grad_norm": 0.13756664097309113,
"learning_rate": 0.0002,
"loss": 0.5470365285873413,
"mean_token_accuracy": 0.7808873951435089,
"num_tokens": 5796973.0,
"step": 354
},
{
"entropy": 0.5455010533332825,
"epoch": 1.3246268656716418,
"grad_norm": 0.17140203714370728,
"learning_rate": 0.0002,
"loss": 0.534233808517456,
"mean_token_accuracy": 0.7828006148338318,
"num_tokens": 5813201.0,
"step": 355
},
{
"entropy": 0.5456499308347702,
"epoch": 1.328358208955224,
"grad_norm": 0.13772569596767426,
"learning_rate": 0.0002,
"loss": 0.5461813807487488,
"mean_token_accuracy": 0.7786128669977188,
"num_tokens": 5829457.0,
"step": 356
},
{
"entropy": 0.5223972797393799,
"epoch": 1.332089552238806,
"grad_norm": 0.22252066433429718,
"learning_rate": 0.0002,
"loss": 0.5330066084861755,
"mean_token_accuracy": 0.7818692922592163,
"num_tokens": 5845786.0,
"step": 357
},
{
"entropy": 0.5292713642120361,
"epoch": 1.335820895522388,
"grad_norm": 0.14202645421028137,
"learning_rate": 0.0002,
"loss": 0.5392715930938721,
"mean_token_accuracy": 0.7805515229701996,
"num_tokens": 5862226.0,
"step": 358
},
{
"entropy": 0.5300968736410141,
"epoch": 1.3395522388059702,
"grad_norm": 0.18332785367965698,
"learning_rate": 0.0002,
"loss": 0.5347115993499756,
"mean_token_accuracy": 0.7835317403078079,
"num_tokens": 5878683.0,
"step": 359
},
{
"entropy": 0.5431934744119644,
"epoch": 1.3432835820895521,
"grad_norm": 0.14532189071178436,
"learning_rate": 0.0002,
"loss": 0.5330429077148438,
"mean_token_accuracy": 0.7804477661848068,
"num_tokens": 5895049.0,
"step": 360
},
{
"entropy": 0.5435428023338318,
"epoch": 1.3470149253731343,
"grad_norm": 0.1675368696451187,
"learning_rate": 0.0002,
"loss": 0.5300995707511902,
"mean_token_accuracy": 0.785721018910408,
"num_tokens": 5911501.0,
"step": 361
},
{
"entropy": 0.5362260937690735,
"epoch": 1.3507462686567164,
"grad_norm": 0.12240255624055862,
"learning_rate": 0.0002,
"loss": 0.5256680846214294,
"mean_token_accuracy": 0.7851513922214508,
"num_tokens": 5927731.0,
"step": 362
},
{
"entropy": 0.5452938824892044,
"epoch": 1.3544776119402986,
"grad_norm": 0.15949903428554535,
"learning_rate": 0.0002,
"loss": 0.5495162010192871,
"mean_token_accuracy": 0.7768245339393616,
"num_tokens": 5944077.0,
"step": 363
},
{
"entropy": 0.5237463638186455,
"epoch": 1.3582089552238805,
"grad_norm": 0.2120627760887146,
"learning_rate": 0.0002,
"loss": 0.5346443057060242,
"mean_token_accuracy": 0.7835520654916763,
"num_tokens": 5960532.0,
"step": 364
},
{
"entropy": 0.5450356751680374,
"epoch": 1.3619402985074627,
"grad_norm": 0.12423616647720337,
"learning_rate": 0.0002,
"loss": 0.5510310530662537,
"mean_token_accuracy": 0.7749469876289368,
"num_tokens": 5976893.0,
"step": 365
},
{
"entropy": 0.5489538311958313,
"epoch": 1.3656716417910448,
"grad_norm": 0.17930445075035095,
"learning_rate": 0.0002,
"loss": 0.5512227416038513,
"mean_token_accuracy": 0.7759018093347549,
"num_tokens": 5993262.0,
"step": 366
},
{
"entropy": 0.5524207949638367,
"epoch": 1.3694029850746268,
"grad_norm": 0.12074736505746841,
"learning_rate": 0.0002,
"loss": 0.5450834631919861,
"mean_token_accuracy": 0.7803297787904739,
"num_tokens": 6009831.0,
"step": 367
},
{
"entropy": 0.5440987944602966,
"epoch": 1.373134328358209,
"grad_norm": 0.13452184200286865,
"learning_rate": 0.0002,
"loss": 0.5378953814506531,
"mean_token_accuracy": 0.7820150256156921,
"num_tokens": 6026331.0,
"step": 368
},
{
"entropy": 0.5413002520799637,
"epoch": 1.376865671641791,
"grad_norm": 0.1278562843799591,
"learning_rate": 0.0002,
"loss": 0.5359137654304504,
"mean_token_accuracy": 0.783556342124939,
"num_tokens": 6042945.0,
"step": 369
},
{
"entropy": 0.5525120049715042,
"epoch": 1.3805970149253732,
"grad_norm": 0.1208810955286026,
"learning_rate": 0.0002,
"loss": 0.5459328889846802,
"mean_token_accuracy": 0.7781365811824799,
"num_tokens": 6059427.0,
"step": 370
},
{
"entropy": 0.5276467949151993,
"epoch": 1.3843283582089552,
"grad_norm": 0.21167868375778198,
"learning_rate": 0.0002,
"loss": 0.5329975485801697,
"mean_token_accuracy": 0.7855836153030396,
"num_tokens": 6075868.0,
"step": 371
},
{
"entropy": 0.523284301161766,
"epoch": 1.3880597014925373,
"grad_norm": 0.13116827607154846,
"learning_rate": 0.0002,
"loss": 0.5309988260269165,
"mean_token_accuracy": 0.7828356921672821,
"num_tokens": 6092149.0,
"step": 372
},
{
"entropy": 0.5434711575508118,
"epoch": 1.3917910447761195,
"grad_norm": 0.3316550850868225,
"learning_rate": 0.0002,
"loss": 0.553439199924469,
"mean_token_accuracy": 0.7766979038715363,
"num_tokens": 6108567.0,
"step": 373
},
{
"entropy": 0.5287135094404221,
"epoch": 1.3955223880597014,
"grad_norm": 0.15037605166435242,
"learning_rate": 0.0002,
"loss": 0.5357441306114197,
"mean_token_accuracy": 0.7817093282938004,
"num_tokens": 6124527.0,
"step": 374
},
{
"entropy": 0.5508522838354111,
"epoch": 1.3992537313432836,
"grad_norm": 0.19524440169334412,
"learning_rate": 0.0002,
"loss": 0.5512291789054871,
"mean_token_accuracy": 0.7776720374822617,
"num_tokens": 6141075.0,
"step": 375
},
{
"entropy": 0.5336653590202332,
"epoch": 1.4029850746268657,
"grad_norm": 0.15542961657047272,
"learning_rate": 0.0002,
"loss": 0.5334641933441162,
"mean_token_accuracy": 0.7813901156187057,
"num_tokens": 6157438.0,
"step": 376
},
{
"entropy": 0.5536468476057053,
"epoch": 1.4067164179104479,
"grad_norm": 0.11985230445861816,
"learning_rate": 0.0002,
"loss": 0.5497922301292419,
"mean_token_accuracy": 0.7766197621822357,
"num_tokens": 6174052.0,
"step": 377
},
{
"entropy": 0.5455610156059265,
"epoch": 1.4104477611940298,
"grad_norm": 0.1377374231815338,
"learning_rate": 0.0002,
"loss": 0.5400494337081909,
"mean_token_accuracy": 0.7812647223472595,
"num_tokens": 6190741.0,
"step": 378
},
{
"entropy": 0.5355032831430435,
"epoch": 1.414179104477612,
"grad_norm": 0.12337534874677658,
"learning_rate": 0.0002,
"loss": 0.5313869118690491,
"mean_token_accuracy": 0.7843705862760544,
"num_tokens": 6207346.0,
"step": 379
},
{
"entropy": 0.5320865362882614,
"epoch": 1.417910447761194,
"grad_norm": 0.1453101485967636,
"learning_rate": 0.0002,
"loss": 0.5400369167327881,
"mean_token_accuracy": 0.7805843502283096,
"num_tokens": 6223644.0,
"step": 380
},
{
"entropy": 0.5373547524213791,
"epoch": 1.421641791044776,
"grad_norm": 0.19084329903125763,
"learning_rate": 0.0002,
"loss": 0.5499929785728455,
"mean_token_accuracy": 0.7757923603057861,
"num_tokens": 6239901.0,
"step": 381
},
{
"entropy": 0.5443465709686279,
"epoch": 1.4253731343283582,
"grad_norm": 0.11772217601537704,
"learning_rate": 0.0002,
"loss": 0.5418881773948669,
"mean_token_accuracy": 0.7812986522912979,
"num_tokens": 6256285.0,
"step": 382
},
{
"entropy": 0.5499950498342514,
"epoch": 1.4291044776119404,
"grad_norm": 0.1847136914730072,
"learning_rate": 0.0002,
"loss": 0.5488113760948181,
"mean_token_accuracy": 0.7776869833469391,
"num_tokens": 6272664.0,
"step": 383
},
{
"entropy": 0.5412472188472748,
"epoch": 1.4328358208955223,
"grad_norm": 0.1461949199438095,
"learning_rate": 0.0002,
"loss": 0.5365965366363525,
"mean_token_accuracy": 0.7832726240158081,
"num_tokens": 6289098.0,
"step": 384
},
{
"entropy": 0.5493346899747849,
"epoch": 1.4365671641791045,
"grad_norm": 0.17751483619213104,
"learning_rate": 0.0002,
"loss": 0.5465101003646851,
"mean_token_accuracy": 0.7778099924325943,
"num_tokens": 6305547.0,
"step": 385
},
{
"entropy": 0.5415252298116684,
"epoch": 1.4402985074626866,
"grad_norm": 0.13513009250164032,
"learning_rate": 0.0002,
"loss": 0.538934588432312,
"mean_token_accuracy": 0.7832966297864914,
"num_tokens": 6321844.0,
"step": 386
},
{
"entropy": 0.5470823347568512,
"epoch": 1.4440298507462686,
"grad_norm": 0.15616844594478607,
"learning_rate": 0.0002,
"loss": 0.5563836097717285,
"mean_token_accuracy": 0.7730062156915665,
"num_tokens": 6338401.0,
"step": 387
},
{
"entropy": 0.5151138752698898,
"epoch": 1.4477611940298507,
"grad_norm": 0.13514217734336853,
"learning_rate": 0.0002,
"loss": 0.5200275182723999,
"mean_token_accuracy": 0.7898600101470947,
"num_tokens": 6354762.0,
"step": 388
},
{
"entropy": 0.5174058377742767,
"epoch": 1.4514925373134329,
"grad_norm": 0.13703469932079315,
"learning_rate": 0.0002,
"loss": 0.5161208510398865,
"mean_token_accuracy": 0.7918747067451477,
"num_tokens": 6370840.0,
"step": 389
},
{
"entropy": 0.5557476729154587,
"epoch": 1.455223880597015,
"grad_norm": 0.11840767413377762,
"learning_rate": 0.0002,
"loss": 0.5515946745872498,
"mean_token_accuracy": 0.7783915251493454,
"num_tokens": 6387355.0,
"step": 390
},
{
"entropy": 0.5518558323383331,
"epoch": 1.458955223880597,
"grad_norm": 0.13202938437461853,
"learning_rate": 0.0002,
"loss": 0.5526413321495056,
"mean_token_accuracy": 0.776582270860672,
"num_tokens": 6403938.0,
"step": 391
},
{
"entropy": 0.5571378320455551,
"epoch": 1.462686567164179,
"grad_norm": 0.13269183039665222,
"learning_rate": 0.0002,
"loss": 0.5643842220306396,
"mean_token_accuracy": 0.7722982317209244,
"num_tokens": 6420250.0,
"step": 392
},
{
"entropy": 0.5537096560001373,
"epoch": 1.4664179104477613,
"grad_norm": 0.14151525497436523,
"learning_rate": 0.0002,
"loss": 0.553024411201477,
"mean_token_accuracy": 0.7778746634721756,
"num_tokens": 6436546.0,
"step": 393
},
{
"entropy": 0.5346309244632721,
"epoch": 1.4701492537313432,
"grad_norm": 0.13563434779644012,
"learning_rate": 0.0002,
"loss": 0.5249274969100952,
"mean_token_accuracy": 0.7853583991527557,
"num_tokens": 6453243.0,
"step": 394
},
{
"entropy": 0.5460333377122879,
"epoch": 1.4738805970149254,
"grad_norm": 0.14244568347930908,
"learning_rate": 0.0002,
"loss": 0.5472844243049622,
"mean_token_accuracy": 0.7797000557184219,
"num_tokens": 6469565.0,
"step": 395
},
{
"entropy": 0.5330733209848404,
"epoch": 1.4776119402985075,
"grad_norm": 0.15417160093784332,
"learning_rate": 0.0002,
"loss": 0.538681149482727,
"mean_token_accuracy": 0.7821140140295029,
"num_tokens": 6486038.0,
"step": 396
},
{
"entropy": 0.5275893434882164,
"epoch": 1.4813432835820897,
"grad_norm": 0.1634518802165985,
"learning_rate": 0.0002,
"loss": 0.5361412167549133,
"mean_token_accuracy": 0.7828765362501144,
"num_tokens": 6502376.0,
"step": 397
},
{
"entropy": 0.5401307940483093,
"epoch": 1.4850746268656716,
"grad_norm": 0.14567126333713531,
"learning_rate": 0.0002,
"loss": 0.5489403605461121,
"mean_token_accuracy": 0.7781455963850021,
"num_tokens": 6518668.0,
"step": 398
},
{
"entropy": 0.5669757276773453,
"epoch": 1.4888059701492538,
"grad_norm": 0.1354297697544098,
"learning_rate": 0.0002,
"loss": 0.5657601356506348,
"mean_token_accuracy": 0.7712653428316116,
"num_tokens": 6535182.0,
"step": 399
},
{
"entropy": 0.5363806635141373,
"epoch": 1.4925373134328357,
"grad_norm": 0.12377993017435074,
"learning_rate": 0.0002,
"loss": 0.529585599899292,
"mean_token_accuracy": 0.7840481698513031,
"num_tokens": 6551666.0,
"step": 400
},
{
"entropy": 0.5551501959562302,
"epoch": 1.4962686567164178,
"grad_norm": 0.14788372814655304,
"learning_rate": 0.0002,
"loss": 0.553497314453125,
"mean_token_accuracy": 0.7757378667593002,
"num_tokens": 6568256.0,
"step": 401
},
{
"entropy": 0.5353442132472992,
"epoch": 1.5,
"grad_norm": 0.12778371572494507,
"learning_rate": 0.0002,
"loss": 0.5333885550498962,
"mean_token_accuracy": 0.7825479656457901,
"num_tokens": 6584443.0,
"step": 402
},
{
"entropy": 0.5460584759712219,
"epoch": 1.5037313432835822,
"grad_norm": 0.1357504278421402,
"learning_rate": 0.0002,
"loss": 0.5496041774749756,
"mean_token_accuracy": 0.7750886082649231,
"num_tokens": 6600907.0,
"step": 403
},
{
"entropy": 0.5397640466690063,
"epoch": 1.5074626865671643,
"grad_norm": 0.13449276983737946,
"learning_rate": 0.0002,
"loss": 0.5374521017074585,
"mean_token_accuracy": 0.783362939953804,
"num_tokens": 6617309.0,
"step": 404
},
{
"entropy": 0.545674204826355,
"epoch": 1.5111940298507462,
"grad_norm": 0.12818823754787445,
"learning_rate": 0.0002,
"loss": 0.5414538383483887,
"mean_token_accuracy": 0.7811758369207382,
"num_tokens": 6633409.0,
"step": 405
},
{
"entropy": 0.5237551480531693,
"epoch": 1.5149253731343284,
"grad_norm": 0.1332634538412094,
"learning_rate": 0.0002,
"loss": 0.5288904905319214,
"mean_token_accuracy": 0.7863495498895645,
"num_tokens": 6649677.0,
"step": 406
},
{
"entropy": 0.5475018620491028,
"epoch": 1.5186567164179103,
"grad_norm": 0.1226048395037651,
"learning_rate": 0.0002,
"loss": 0.5457717180252075,
"mean_token_accuracy": 0.7798316031694412,
"num_tokens": 6665941.0,
"step": 407
},
{
"entropy": 0.5388360321521759,
"epoch": 1.5223880597014925,
"grad_norm": 0.11307930946350098,
"learning_rate": 0.0002,
"loss": 0.5332959294319153,
"mean_token_accuracy": 0.7827007919549942,
"num_tokens": 6682727.0,
"step": 408
},
{
"entropy": 0.5245520323514938,
"epoch": 1.5261194029850746,
"grad_norm": 0.13594341278076172,
"learning_rate": 0.0002,
"loss": 0.527988851070404,
"mean_token_accuracy": 0.7841480374336243,
"num_tokens": 6699061.0,
"step": 409
},
{
"entropy": 0.5443517565727234,
"epoch": 1.5298507462686568,
"grad_norm": 0.12875105440616608,
"learning_rate": 0.0002,
"loss": 0.5445384979248047,
"mean_token_accuracy": 0.7800036072731018,
"num_tokens": 6715276.0,
"step": 410
},
{
"entropy": 0.5312410593032837,
"epoch": 1.533582089552239,
"grad_norm": 0.14251653850078583,
"learning_rate": 0.0002,
"loss": 0.5363666415214539,
"mean_token_accuracy": 0.7820229083299637,
"num_tokens": 6731754.0,
"step": 411
},
{
"entropy": 0.5279273837804794,
"epoch": 1.537313432835821,
"grad_norm": 0.14002381265163422,
"learning_rate": 0.0002,
"loss": 0.533150851726532,
"mean_token_accuracy": 0.7839628010988235,
"num_tokens": 6748198.0,
"step": 412
},
{
"entropy": 0.5359641313552856,
"epoch": 1.5410447761194028,
"grad_norm": 0.12248595803976059,
"learning_rate": 0.0002,
"loss": 0.5377635359764099,
"mean_token_accuracy": 0.7816402763128281,
"num_tokens": 6764658.0,
"step": 413
},
{
"entropy": 0.5304668098688126,
"epoch": 1.544776119402985,
"grad_norm": 0.1455898880958557,
"learning_rate": 0.0002,
"loss": 0.527800440788269,
"mean_token_accuracy": 0.7847253680229187,
"num_tokens": 6780948.0,
"step": 414
},
{
"entropy": 0.5399336069822311,
"epoch": 1.5485074626865671,
"grad_norm": 0.1414983719587326,
"learning_rate": 0.0002,
"loss": 0.5367389917373657,
"mean_token_accuracy": 0.7821487188339233,
"num_tokens": 6797350.0,
"step": 415
},
{
"entropy": 0.5576040744781494,
"epoch": 1.5522388059701493,
"grad_norm": 0.12719132006168365,
"learning_rate": 0.0002,
"loss": 0.5524293780326843,
"mean_token_accuracy": 0.7746585160493851,
"num_tokens": 6813754.0,
"step": 416
},
{
"entropy": 0.5370134860277176,
"epoch": 1.5559701492537314,
"grad_norm": 0.1307905912399292,
"learning_rate": 0.0002,
"loss": 0.5359637141227722,
"mean_token_accuracy": 0.7802634984254837,
"num_tokens": 6829931.0,
"step": 417
},
{
"entropy": 0.5672536343336105,
"epoch": 1.5597014925373134,
"grad_norm": 0.14925286173820496,
"learning_rate": 0.0002,
"loss": 0.5706211924552917,
"mean_token_accuracy": 0.7692793905735016,
"num_tokens": 6846619.0,
"step": 418
},
{
"entropy": 0.5455258339643478,
"epoch": 1.5634328358208955,
"grad_norm": 0.13767075538635254,
"learning_rate": 0.0002,
"loss": 0.5497614145278931,
"mean_token_accuracy": 0.7742694765329361,
"num_tokens": 6862943.0,
"step": 419
},
{
"entropy": 0.5383682698011398,
"epoch": 1.5671641791044775,
"grad_norm": 0.14676761627197266,
"learning_rate": 0.0002,
"loss": 0.5352654457092285,
"mean_token_accuracy": 0.7820954322814941,
"num_tokens": 6879478.0,
"step": 420
},
{
"entropy": 0.5393406301736832,
"epoch": 1.5708955223880596,
"grad_norm": 0.14782963693141937,
"learning_rate": 0.0002,
"loss": 0.539406418800354,
"mean_token_accuracy": 0.7811137288808823,
"num_tokens": 6895819.0,
"step": 421
},
{
"entropy": 0.5472134947776794,
"epoch": 1.5746268656716418,
"grad_norm": 0.1328146755695343,
"learning_rate": 0.0002,
"loss": 0.5461377501487732,
"mean_token_accuracy": 0.7797697186470032,
"num_tokens": 6912305.0,
"step": 422
},
{
"entropy": 0.5397001504898071,
"epoch": 1.578358208955224,
"grad_norm": 0.12005209177732468,
"learning_rate": 0.0002,
"loss": 0.5396695137023926,
"mean_token_accuracy": 0.7789896428585052,
"num_tokens": 6928851.0,
"step": 423
},
{
"entropy": 0.5323083251714706,
"epoch": 1.582089552238806,
"grad_norm": 0.14206735789775848,
"learning_rate": 0.0002,
"loss": 0.5357058048248291,
"mean_token_accuracy": 0.7814851403236389,
"num_tokens": 6945117.0,
"step": 424
},
{
"entropy": 0.5220139473676682,
"epoch": 1.585820895522388,
"grad_norm": 0.13408760726451874,
"learning_rate": 0.0002,
"loss": 0.5282811522483826,
"mean_token_accuracy": 0.7859802693128586,
"num_tokens": 6961475.0,
"step": 425
},
{
"entropy": 0.5279606133699417,
"epoch": 1.5895522388059702,
"grad_norm": 0.1342962682247162,
"learning_rate": 0.0002,
"loss": 0.5310772061347961,
"mean_token_accuracy": 0.7856840938329697,
"num_tokens": 6977917.0,
"step": 426
},
{
"entropy": 0.5404426008462906,
"epoch": 1.5932835820895521,
"grad_norm": 0.11640056222677231,
"learning_rate": 0.0002,
"loss": 0.5350806713104248,
"mean_token_accuracy": 0.7831773906946182,
"num_tokens": 6994309.0,
"step": 427
},
{
"entropy": 0.546152800321579,
"epoch": 1.5970149253731343,
"grad_norm": 0.11648745834827423,
"learning_rate": 0.0002,
"loss": 0.5432876348495483,
"mean_token_accuracy": 0.7806773632764816,
"num_tokens": 7010651.0,
"step": 428
},
{
"entropy": 0.5330662578344345,
"epoch": 1.6007462686567164,
"grad_norm": 0.1201220154762268,
"learning_rate": 0.0002,
"loss": 0.5310200452804565,
"mean_token_accuracy": 0.7844978868961334,
"num_tokens": 7027129.0,
"step": 429
},
{
"entropy": 0.5318699181079865,
"epoch": 1.6044776119402986,
"grad_norm": 0.12328798323869705,
"learning_rate": 0.0002,
"loss": 0.5332854986190796,
"mean_token_accuracy": 0.7820296734571457,
"num_tokens": 7043492.0,
"step": 430
},
{
"entropy": 0.5330018848180771,
"epoch": 1.6082089552238807,
"grad_norm": 0.1538732498884201,
"learning_rate": 0.0002,
"loss": 0.5346086621284485,
"mean_token_accuracy": 0.7841860204935074,
"num_tokens": 7059825.0,
"step": 431
},
{
"entropy": 0.5369807183742523,
"epoch": 1.6119402985074627,
"grad_norm": 0.13523033261299133,
"learning_rate": 0.0002,
"loss": 0.543128490447998,
"mean_token_accuracy": 0.779476061463356,
"num_tokens": 7076083.0,
"step": 432
},
{
"entropy": 0.5597919672727585,
"epoch": 1.6156716417910446,
"grad_norm": 0.13593490421772003,
"learning_rate": 0.0002,
"loss": 0.56092369556427,
"mean_token_accuracy": 0.7705628126859665,
"num_tokens": 7092494.0,
"step": 433
},
{
"entropy": 0.5592869371175766,
"epoch": 1.6194029850746268,
"grad_norm": 0.13970784842967987,
"learning_rate": 0.0002,
"loss": 0.5588337182998657,
"mean_token_accuracy": 0.7716414630413055,
"num_tokens": 7108787.0,
"step": 434
},
{
"entropy": 0.5510755926370621,
"epoch": 1.623134328358209,
"grad_norm": 0.14515163004398346,
"learning_rate": 0.0002,
"loss": 0.5508431792259216,
"mean_token_accuracy": 0.7757678478956223,
"num_tokens": 7125326.0,
"step": 435
},
{
"entropy": 0.5493544340133667,
"epoch": 1.626865671641791,
"grad_norm": 0.13484683632850647,
"learning_rate": 0.0002,
"loss": 0.5357339382171631,
"mean_token_accuracy": 0.7844331711530685,
"num_tokens": 7141623.0,
"step": 436
},
{
"entropy": 0.5371888130903244,
"epoch": 1.6305970149253732,
"grad_norm": 0.12795639038085938,
"learning_rate": 0.0002,
"loss": 0.5337157249450684,
"mean_token_accuracy": 0.7853695005178452,
"num_tokens": 7158003.0,
"step": 437
},
{
"entropy": 0.5294598788022995,
"epoch": 1.6343283582089554,
"grad_norm": 0.13173329830169678,
"learning_rate": 0.0002,
"loss": 0.5329991579055786,
"mean_token_accuracy": 0.7873143553733826,
"num_tokens": 7174417.0,
"step": 438
},
{
"entropy": 0.5183067172765732,
"epoch": 1.6380597014925373,
"grad_norm": 0.14890097081661224,
"learning_rate": 0.0002,
"loss": 0.5276235938072205,
"mean_token_accuracy": 0.7841698378324509,
"num_tokens": 7190789.0,
"step": 439
},
{
"entropy": 0.5212598145008087,
"epoch": 1.6417910447761193,
"grad_norm": 0.1251063346862793,
"learning_rate": 0.0002,
"loss": 0.5228430032730103,
"mean_token_accuracy": 0.7859450131654739,
"num_tokens": 7207139.0,
"step": 440
},
{
"entropy": 0.5322405844926834,
"epoch": 1.6455223880597014,
"grad_norm": 0.13600069284439087,
"learning_rate": 0.0002,
"loss": 0.5263532996177673,
"mean_token_accuracy": 0.7853893488645554,
"num_tokens": 7223453.0,
"step": 441
},
{
"entropy": 0.5205891877412796,
"epoch": 1.6492537313432836,
"grad_norm": 0.13653913140296936,
"learning_rate": 0.0002,
"loss": 0.5208824872970581,
"mean_token_accuracy": 0.7881260365247726,
"num_tokens": 7240006.0,
"step": 442
},
{
"entropy": 0.5441347062587738,
"epoch": 1.6529850746268657,
"grad_norm": 0.14450038969516754,
"learning_rate": 0.0002,
"loss": 0.5436342358589172,
"mean_token_accuracy": 0.7799146473407745,
"num_tokens": 7256390.0,
"step": 443
},
{
"entropy": 0.5312005802989006,
"epoch": 1.6567164179104479,
"grad_norm": 0.12901286780834198,
"learning_rate": 0.0002,
"loss": 0.5335438847541809,
"mean_token_accuracy": 0.78382308781147,
"num_tokens": 7272830.0,
"step": 444
},
{
"entropy": 0.5523424595594406,
"epoch": 1.6604477611940298,
"grad_norm": 0.13704852759838104,
"learning_rate": 0.0002,
"loss": 0.5541114807128906,
"mean_token_accuracy": 0.7756187319755554,
"num_tokens": 7289085.0,
"step": 445
},
{
"entropy": 0.5462750494480133,
"epoch": 1.664179104477612,
"grad_norm": 0.1385122686624527,
"learning_rate": 0.0002,
"loss": 0.5408669114112854,
"mean_token_accuracy": 0.7794688045978546,
"num_tokens": 7305251.0,
"step": 446
},
{
"entropy": 0.5703910887241364,
"epoch": 1.667910447761194,
"grad_norm": 0.12344513088464737,
"learning_rate": 0.0002,
"loss": 0.5666346549987793,
"mean_token_accuracy": 0.7705821841955185,
"num_tokens": 7321796.0,
"step": 447
},
{
"entropy": 0.5504626631736755,
"epoch": 1.671641791044776,
"grad_norm": 0.12487871944904327,
"learning_rate": 0.0002,
"loss": 0.5492321848869324,
"mean_token_accuracy": 0.7753137797117233,
"num_tokens": 7338182.0,
"step": 448
},
{
"entropy": 0.5314936190843582,
"epoch": 1.6753731343283582,
"grad_norm": 0.1390916407108307,
"learning_rate": 0.0002,
"loss": 0.5342849493026733,
"mean_token_accuracy": 0.7855862826108932,
"num_tokens": 7354707.0,
"step": 449
},
{
"entropy": 0.5125585347414017,
"epoch": 1.6791044776119404,
"grad_norm": 0.13132618367671967,
"learning_rate": 0.0002,
"loss": 0.5202143788337708,
"mean_token_accuracy": 0.7874000519514084,
"num_tokens": 7370797.0,
"step": 450
},
{
"entropy": 0.5190107151865959,
"epoch": 1.6828358208955225,
"grad_norm": 0.15053601562976837,
"learning_rate": 0.0002,
"loss": 0.5218467116355896,
"mean_token_accuracy": 0.7879750281572342,
"num_tokens": 7387448.0,
"step": 451
},
{
"entropy": 0.5473128408193588,
"epoch": 1.6865671641791045,
"grad_norm": 0.14291800558567047,
"learning_rate": 0.0002,
"loss": 0.5459562540054321,
"mean_token_accuracy": 0.7800840735435486,
"num_tokens": 7403768.0,
"step": 452
},
{
"entropy": 0.5372306257486343,
"epoch": 1.6902985074626866,
"grad_norm": 0.14737331867218018,
"learning_rate": 0.0002,
"loss": 0.5391932725906372,
"mean_token_accuracy": 0.7811848223209381,
"num_tokens": 7420197.0,
"step": 453
},
{
"entropy": 0.5366326868534088,
"epoch": 1.6940298507462686,
"grad_norm": 0.13737186789512634,
"learning_rate": 0.0002,
"loss": 0.5392562747001648,
"mean_token_accuracy": 0.7824465036392212,
"num_tokens": 7436532.0,
"step": 454
},
{
"entropy": 0.5506515055894852,
"epoch": 1.6977611940298507,
"grad_norm": 0.15034589171409607,
"learning_rate": 0.0002,
"loss": 0.5501772165298462,
"mean_token_accuracy": 0.7773263603448868,
"num_tokens": 7452842.0,
"step": 455
},
{
"entropy": 0.5643105208873749,
"epoch": 1.7014925373134329,
"grad_norm": 0.14214570820331573,
"learning_rate": 0.0002,
"loss": 0.5492639541625977,
"mean_token_accuracy": 0.7783908396959305,
"num_tokens": 7469451.0,
"step": 456
},
{
"entropy": 0.5516497120261192,
"epoch": 1.705223880597015,
"grad_norm": 0.14590683579444885,
"learning_rate": 0.0002,
"loss": 0.5515267252922058,
"mean_token_accuracy": 0.774686187505722,
"num_tokens": 7485822.0,
"step": 457
},
{
"entropy": 0.5483950823545456,
"epoch": 1.7089552238805972,
"grad_norm": 0.15629805624485016,
"learning_rate": 0.0002,
"loss": 0.5422750115394592,
"mean_token_accuracy": 0.7802471369504929,
"num_tokens": 7502363.0,
"step": 458
},
{
"entropy": 0.5315360128879547,
"epoch": 1.712686567164179,
"grad_norm": 0.15466850996017456,
"learning_rate": 0.0002,
"loss": 0.5331098437309265,
"mean_token_accuracy": 0.7842396944761276,
"num_tokens": 7518672.0,
"step": 459
},
{
"entropy": 0.5366538316011429,
"epoch": 1.716417910447761,
"grad_norm": 0.15616163611412048,
"learning_rate": 0.0002,
"loss": 0.5455700755119324,
"mean_token_accuracy": 0.7823781222105026,
"num_tokens": 7534957.0,
"step": 460
},
{
"entropy": 0.5233009159564972,
"epoch": 1.7201492537313432,
"grad_norm": 0.1496264487504959,
"learning_rate": 0.0002,
"loss": 0.5298243761062622,
"mean_token_accuracy": 0.7823347896337509,
"num_tokens": 7551350.0,
"step": 461
},
{
"entropy": 0.5345755070447922,
"epoch": 1.7238805970149254,
"grad_norm": 0.15188711881637573,
"learning_rate": 0.0002,
"loss": 0.5339583158493042,
"mean_token_accuracy": 0.7852912098169327,
"num_tokens": 7567796.0,
"step": 462
},
{
"entropy": 0.525611899793148,
"epoch": 1.7276119402985075,
"grad_norm": 0.12338917702436447,
"learning_rate": 0.0002,
"loss": 0.5274109840393066,
"mean_token_accuracy": 0.7858613133430481,
"num_tokens": 7583895.0,
"step": 463
},
{
"entropy": 0.5306848883628845,
"epoch": 1.7313432835820897,
"grad_norm": 0.16974470019340515,
"learning_rate": 0.0002,
"loss": 0.5279258489608765,
"mean_token_accuracy": 0.7865510731935501,
"num_tokens": 7600124.0,
"step": 464
},
{
"entropy": 0.5408849269151688,
"epoch": 1.7350746268656716,
"grad_norm": 0.12648795545101166,
"learning_rate": 0.0002,
"loss": 0.5382460951805115,
"mean_token_accuracy": 0.7846677452325821,
"num_tokens": 7616438.0,
"step": 465
},
{
"entropy": 0.5429423898458481,
"epoch": 1.7388059701492538,
"grad_norm": 0.1650669425725937,
"learning_rate": 0.0002,
"loss": 0.549877941608429,
"mean_token_accuracy": 0.7792258560657501,
"num_tokens": 7632788.0,
"step": 466
},
{
"entropy": 0.5318955481052399,
"epoch": 1.7425373134328357,
"grad_norm": 0.12288089841604233,
"learning_rate": 0.0002,
"loss": 0.5323612093925476,
"mean_token_accuracy": 0.7859359383583069,
"num_tokens": 7649308.0,
"step": 467
},
{
"entropy": 0.548863410949707,
"epoch": 1.7462686567164178,
"grad_norm": 0.1326245218515396,
"learning_rate": 0.0002,
"loss": 0.5457996129989624,
"mean_token_accuracy": 0.7799065709114075,
"num_tokens": 7665793.0,
"step": 468
},
{
"entropy": 0.5389255881309509,
"epoch": 1.75,
"grad_norm": 0.12419410794973373,
"learning_rate": 0.0002,
"loss": 0.5312763452529907,
"mean_token_accuracy": 0.7822507619857788,
"num_tokens": 7682000.0,
"step": 469
},
{
"entropy": 0.5358720868825912,
"epoch": 1.7537313432835822,
"grad_norm": 0.13035476207733154,
"learning_rate": 0.0002,
"loss": 0.5321502685546875,
"mean_token_accuracy": 0.7836209833621979,
"num_tokens": 7698643.0,
"step": 470
},
{
"entropy": 0.5370121747255325,
"epoch": 1.7574626865671643,
"grad_norm": 0.1549667865037918,
"learning_rate": 0.0002,
"loss": 0.5385861396789551,
"mean_token_accuracy": 0.7808156907558441,
"num_tokens": 7714815.0,
"step": 471
},
{
"entropy": 0.5387648344039917,
"epoch": 1.7611940298507462,
"grad_norm": 0.14527052640914917,
"learning_rate": 0.0002,
"loss": 0.5470720529556274,
"mean_token_accuracy": 0.7775331288576126,
"num_tokens": 7731250.0,
"step": 472
},
{
"entropy": 0.5520026981830597,
"epoch": 1.7649253731343284,
"grad_norm": 0.19052588939666748,
"learning_rate": 0.0002,
"loss": 0.5578737854957581,
"mean_token_accuracy": 0.7744869738817215,
"num_tokens": 7747721.0,
"step": 473
},
{
"entropy": 0.5377953052520752,
"epoch": 1.7686567164179103,
"grad_norm": 0.13061052560806274,
"learning_rate": 0.0002,
"loss": 0.5413972735404968,
"mean_token_accuracy": 0.7811722010374069,
"num_tokens": 7763904.0,
"step": 474
},
{
"entropy": 0.5519908219575882,
"epoch": 1.7723880597014925,
"grad_norm": 0.1454058736562729,
"learning_rate": 0.0002,
"loss": 0.5414596796035767,
"mean_token_accuracy": 0.7813711762428284,
"num_tokens": 7780581.0,
"step": 475
},
{
"entropy": 0.5267625749111176,
"epoch": 1.7761194029850746,
"grad_norm": 0.1326485425233841,
"learning_rate": 0.0002,
"loss": 0.5213202834129333,
"mean_token_accuracy": 0.7871652394533157,
"num_tokens": 7796973.0,
"step": 476
},
{
"entropy": 0.553408294916153,
"epoch": 1.7798507462686568,
"grad_norm": 0.13312950730323792,
"learning_rate": 0.0002,
"loss": 0.5529948472976685,
"mean_token_accuracy": 0.7743393182754517,
"num_tokens": 7813279.0,
"step": 477
},
{
"entropy": 0.553880587220192,
"epoch": 1.783582089552239,
"grad_norm": 0.16114220023155212,
"learning_rate": 0.0002,
"loss": 0.5641807317733765,
"mean_token_accuracy": 0.7722779810428619,
"num_tokens": 7829823.0,
"step": 478
},
{
"entropy": 0.5241200774908066,
"epoch": 1.787313432835821,
"grad_norm": 0.15040791034698486,
"learning_rate": 0.0002,
"loss": 0.5346534252166748,
"mean_token_accuracy": 0.7823406606912613,
"num_tokens": 7845983.0,
"step": 479
},
{
"entropy": 0.5474425554275513,
"epoch": 1.7910447761194028,
"grad_norm": 0.13473069667816162,
"learning_rate": 0.0002,
"loss": 0.5514643788337708,
"mean_token_accuracy": 0.775032564997673,
"num_tokens": 7862179.0,
"step": 480
},
{
"entropy": 0.5494029968976974,
"epoch": 1.794776119402985,
"grad_norm": 0.14377883076667786,
"learning_rate": 0.0002,
"loss": 0.5433907508850098,
"mean_token_accuracy": 0.7781640440225601,
"num_tokens": 7878779.0,
"step": 481
},
{
"entropy": 0.5409138202667236,
"epoch": 1.7985074626865671,
"grad_norm": 0.14134465157985687,
"learning_rate": 0.0002,
"loss": 0.5372306704521179,
"mean_token_accuracy": 0.7832998037338257,
"num_tokens": 7895136.0,
"step": 482
},
{
"entropy": 0.5516301095485687,
"epoch": 1.8022388059701493,
"grad_norm": 0.13915129005908966,
"learning_rate": 0.0002,
"loss": 0.5529888272285461,
"mean_token_accuracy": 0.7746001183986664,
"num_tokens": 7911482.0,
"step": 483
},
{
"entropy": 0.5409607142210007,
"epoch": 1.8059701492537314,
"grad_norm": 0.1552349179983139,
"learning_rate": 0.0002,
"loss": 0.5396745204925537,
"mean_token_accuracy": 0.7830557972192764,
"num_tokens": 7927769.0,
"step": 484
},
{
"entropy": 0.5268412679433823,
"epoch": 1.8097014925373134,
"grad_norm": 0.16648107767105103,
"learning_rate": 0.0002,
"loss": 0.5397533178329468,
"mean_token_accuracy": 0.782973125576973,
"num_tokens": 7944237.0,
"step": 485
},
{
"entropy": 0.5383498221635818,
"epoch": 1.8134328358208955,
"grad_norm": 0.1299259066581726,
"learning_rate": 0.0002,
"loss": 0.5412971377372742,
"mean_token_accuracy": 0.7789154797792435,
"num_tokens": 7960404.0,
"step": 486
},
{
"entropy": 0.5497616678476334,
"epoch": 1.8171641791044775,
"grad_norm": 0.1571415513753891,
"learning_rate": 0.0002,
"loss": 0.5444965362548828,
"mean_token_accuracy": 0.7790942490100861,
"num_tokens": 7976843.0,
"step": 487
},
{
"entropy": 0.5411071628332138,
"epoch": 1.8208955223880596,
"grad_norm": 0.12472257018089294,
"learning_rate": 0.0002,
"loss": 0.5377678275108337,
"mean_token_accuracy": 0.7812906056642532,
"num_tokens": 7993308.0,
"step": 488
},
{
"entropy": 0.5332149565219879,
"epoch": 1.8246268656716418,
"grad_norm": 0.14515501260757446,
"learning_rate": 0.0002,
"loss": 0.532054603099823,
"mean_token_accuracy": 0.7860440015792847,
"num_tokens": 8009749.0,
"step": 489
},
{
"entropy": 0.5376683920621872,
"epoch": 1.828358208955224,
"grad_norm": 0.1362919807434082,
"learning_rate": 0.0002,
"loss": 0.5361682772636414,
"mean_token_accuracy": 0.7828832864761353,
"num_tokens": 8026107.0,
"step": 490
},
{
"entropy": 0.541684627532959,
"epoch": 1.832089552238806,
"grad_norm": 0.1390708088874817,
"learning_rate": 0.0002,
"loss": 0.5428534746170044,
"mean_token_accuracy": 0.7796362638473511,
"num_tokens": 8042519.0,
"step": 491
},
{
"entropy": 0.5491971075534821,
"epoch": 1.835820895522388,
"grad_norm": 0.18899311125278473,
"learning_rate": 0.0002,
"loss": 0.5468783378601074,
"mean_token_accuracy": 0.7760737091302872,
"num_tokens": 8058733.0,
"step": 492
},
{
"entropy": 0.5467192232608795,
"epoch": 1.8395522388059702,
"grad_norm": 0.12224384397268295,
"learning_rate": 0.0002,
"loss": 0.5412194728851318,
"mean_token_accuracy": 0.7836457341909409,
"num_tokens": 8075111.0,
"step": 493
},
{
"entropy": 0.5190225690603256,
"epoch": 1.8432835820895521,
"grad_norm": 0.17859016358852386,
"learning_rate": 0.0002,
"loss": 0.5287451148033142,
"mean_token_accuracy": 0.7872583419084549,
"num_tokens": 8091539.0,
"step": 494
},
{
"entropy": 0.5457055866718292,
"epoch": 1.8470149253731343,
"grad_norm": 0.14652208983898163,
"learning_rate": 0.0002,
"loss": 0.5511422157287598,
"mean_token_accuracy": 0.7764985859394073,
"num_tokens": 8107924.0,
"step": 495
},
{
"entropy": 0.5412308424711227,
"epoch": 1.8507462686567164,
"grad_norm": 0.14928752183914185,
"learning_rate": 0.0002,
"loss": 0.5386866331100464,
"mean_token_accuracy": 0.7840718477964401,
"num_tokens": 8124327.0,
"step": 496
},
{
"entropy": 0.5487564355134964,
"epoch": 1.8544776119402986,
"grad_norm": 0.14009299874305725,
"learning_rate": 0.0002,
"loss": 0.5402563810348511,
"mean_token_accuracy": 0.781055673956871,
"num_tokens": 8140629.0,
"step": 497
},
{
"entropy": 0.5530242621898651,
"epoch": 1.8582089552238807,
"grad_norm": 0.13880518078804016,
"learning_rate": 0.0002,
"loss": 0.5397564172744751,
"mean_token_accuracy": 0.7810083031654358,
"num_tokens": 8157176.0,
"step": 498
},
{
"entropy": 0.5339633226394653,
"epoch": 1.8619402985074627,
"grad_norm": 0.16541644930839539,
"learning_rate": 0.0002,
"loss": 0.5336776971817017,
"mean_token_accuracy": 0.7829927057027817,
"num_tokens": 8173382.0,
"step": 499
},
{
"entropy": 0.5558539777994156,
"epoch": 1.8656716417910446,
"grad_norm": 0.15278875827789307,
"learning_rate": 0.0002,
"loss": 0.5627698302268982,
"mean_token_accuracy": 0.7725099176168442,
"num_tokens": 8189820.0,
"step": 500
},
{
"entropy": 0.5367425978183746,
"epoch": 1.8694029850746268,
"grad_norm": 0.15401561558246613,
"learning_rate": 0.0002,
"loss": 0.546620786190033,
"mean_token_accuracy": 0.7765664905309677,
"num_tokens": 8205989.0,
"step": 501
},
{
"entropy": 0.5408999174833298,
"epoch": 1.873134328358209,
"grad_norm": 0.13051092624664307,
"learning_rate": 0.0002,
"loss": 0.5466805696487427,
"mean_token_accuracy": 0.7781471610069275,
"num_tokens": 8222509.0,
"step": 502
},
{
"entropy": 0.5321147739887238,
"epoch": 1.876865671641791,
"grad_norm": 0.13755947351455688,
"learning_rate": 0.0002,
"loss": 0.527456521987915,
"mean_token_accuracy": 0.7872339636087418,
"num_tokens": 8238911.0,
"step": 503
},
{
"entropy": 0.5611546188592911,
"epoch": 1.8805970149253732,
"grad_norm": 0.13492627441883087,
"learning_rate": 0.0002,
"loss": 0.548973798751831,
"mean_token_accuracy": 0.7786827385425568,
"num_tokens": 8255331.0,
"step": 504
},
{
"entropy": 0.5648814886808395,
"epoch": 1.8843283582089554,
"grad_norm": 0.13315370678901672,
"learning_rate": 0.0002,
"loss": 0.5626882314682007,
"mean_token_accuracy": 0.7693315893411636,
"num_tokens": 8271717.0,
"step": 505
},
{
"entropy": 0.528036579489708,
"epoch": 1.8880597014925373,
"grad_norm": 0.13826221227645874,
"learning_rate": 0.0002,
"loss": 0.5317479372024536,
"mean_token_accuracy": 0.7865342795848846,
"num_tokens": 8287916.0,
"step": 506
},
{
"entropy": 0.5300939381122589,
"epoch": 1.8917910447761193,
"grad_norm": 0.14022263884544373,
"learning_rate": 0.0002,
"loss": 0.5405997633934021,
"mean_token_accuracy": 0.7812036871910095,
"num_tokens": 8304453.0,
"step": 507
},
{
"entropy": 0.52273790538311,
"epoch": 1.8955223880597014,
"grad_norm": 0.1394582986831665,
"learning_rate": 0.0002,
"loss": 0.526207685470581,
"mean_token_accuracy": 0.7882105112075806,
"num_tokens": 8320635.0,
"step": 508
},
{
"entropy": 0.5376584082841873,
"epoch": 1.8992537313432836,
"grad_norm": 0.16204339265823364,
"learning_rate": 0.0002,
"loss": 0.5367757678031921,
"mean_token_accuracy": 0.7841790616512299,
"num_tokens": 8337016.0,
"step": 509
},
{
"entropy": 0.5457427948713303,
"epoch": 1.9029850746268657,
"grad_norm": 0.13758644461631775,
"learning_rate": 0.0002,
"loss": 0.5404728651046753,
"mean_token_accuracy": 0.7789884358644485,
"num_tokens": 8353374.0,
"step": 510
},
{
"entropy": 0.5548366904258728,
"epoch": 1.9067164179104479,
"grad_norm": 0.15079155564308167,
"learning_rate": 0.0002,
"loss": 0.5460405349731445,
"mean_token_accuracy": 0.7766790390014648,
"num_tokens": 8369864.0,
"step": 511
},
{
"entropy": 0.5432726740837097,
"epoch": 1.9104477611940298,
"grad_norm": 0.14672084152698517,
"learning_rate": 0.0002,
"loss": 0.5391443371772766,
"mean_token_accuracy": 0.7813593149185181,
"num_tokens": 8386310.0,
"step": 512
},
{
"entropy": 0.5469253212213516,
"epoch": 1.914179104477612,
"grad_norm": 0.12065178155899048,
"learning_rate": 0.0002,
"loss": 0.5509493350982666,
"mean_token_accuracy": 0.7752728313207626,
"num_tokens": 8402902.0,
"step": 513
},
{
"entropy": 0.5332511216402054,
"epoch": 1.917910447761194,
"grad_norm": 0.13797524571418762,
"learning_rate": 0.0002,
"loss": 0.5396395325660706,
"mean_token_accuracy": 0.783454567193985,
"num_tokens": 8418969.0,
"step": 514
},
{
"entropy": 0.5430255383253098,
"epoch": 1.921641791044776,
"grad_norm": 0.15779103338718414,
"learning_rate": 0.0002,
"loss": 0.5497632026672363,
"mean_token_accuracy": 0.776575118303299,
"num_tokens": 8435342.0,
"step": 515
},
{
"entropy": 0.541492372751236,
"epoch": 1.9253731343283582,
"grad_norm": 0.14993441104888916,
"learning_rate": 0.0002,
"loss": 0.5440635085105896,
"mean_token_accuracy": 0.779094398021698,
"num_tokens": 8451438.0,
"step": 516
},
{
"entropy": 0.5484725534915924,
"epoch": 1.9291044776119404,
"grad_norm": 0.12014457583427429,
"learning_rate": 0.0002,
"loss": 0.5494801998138428,
"mean_token_accuracy": 0.7743937969207764,
"num_tokens": 8467793.0,
"step": 517
},
{
"entropy": 0.5424629300832748,
"epoch": 1.9328358208955225,
"grad_norm": 0.1372799575328827,
"learning_rate": 0.0002,
"loss": 0.5402990579605103,
"mean_token_accuracy": 0.7788502424955368,
"num_tokens": 8484069.0,
"step": 518
},
{
"entropy": 0.544426254928112,
"epoch": 1.9365671641791045,
"grad_norm": 0.12580935657024384,
"learning_rate": 0.0002,
"loss": 0.5430607199668884,
"mean_token_accuracy": 0.7801959961652756,
"num_tokens": 8500603.0,
"step": 519
},
{
"entropy": 0.5405134111642838,
"epoch": 1.9402985074626866,
"grad_norm": 0.13943250477313995,
"learning_rate": 0.0002,
"loss": 0.5387794971466064,
"mean_token_accuracy": 0.7797143012285233,
"num_tokens": 8516792.0,
"step": 520
},
{
"entropy": 0.5363973081111908,
"epoch": 1.9440298507462686,
"grad_norm": 0.15255886316299438,
"learning_rate": 0.0002,
"loss": 0.5392638444900513,
"mean_token_accuracy": 0.778968557715416,
"num_tokens": 8533178.0,
"step": 521
},
{
"entropy": 0.5569429993629456,
"epoch": 1.9477611940298507,
"grad_norm": 0.14009712636470795,
"learning_rate": 0.0002,
"loss": 0.5554465055465698,
"mean_token_accuracy": 0.7732362002134323,
"num_tokens": 8549795.0,
"step": 522
},
{
"entropy": 0.560676708817482,
"epoch": 1.9514925373134329,
"grad_norm": 0.1429370492696762,
"learning_rate": 0.0002,
"loss": 0.5586832761764526,
"mean_token_accuracy": 0.7744071185588837,
"num_tokens": 8566708.0,
"step": 523
},
{
"entropy": 0.5566927641630173,
"epoch": 1.955223880597015,
"grad_norm": 0.1273992359638214,
"learning_rate": 0.0002,
"loss": 0.5483277440071106,
"mean_token_accuracy": 0.7761266380548477,
"num_tokens": 8582993.0,
"step": 524
},
{
"entropy": 0.5535138845443726,
"epoch": 1.9589552238805972,
"grad_norm": 0.15844318270683289,
"learning_rate": 0.0002,
"loss": 0.5520558953285217,
"mean_token_accuracy": 0.7790683060884476,
"num_tokens": 8599225.0,
"step": 525
},
{
"entropy": 0.5255821049213409,
"epoch": 1.962686567164179,
"grad_norm": 0.1505620777606964,
"learning_rate": 0.0002,
"loss": 0.5302370190620422,
"mean_token_accuracy": 0.7846137434244156,
"num_tokens": 8615790.0,
"step": 526
},
{
"entropy": 0.5364990532398224,
"epoch": 1.966417910447761,
"grad_norm": 0.18815594911575317,
"learning_rate": 0.0002,
"loss": 0.5442203283309937,
"mean_token_accuracy": 0.7792959064245224,
"num_tokens": 8632007.0,
"step": 527
},
{
"entropy": 0.5499100834131241,
"epoch": 1.9701492537313432,
"grad_norm": 0.12838681042194366,
"learning_rate": 0.0002,
"loss": 0.5423155426979065,
"mean_token_accuracy": 0.77956822514534,
"num_tokens": 8648517.0,
"step": 528
},
{
"entropy": 0.5600726753473282,
"epoch": 1.9738805970149254,
"grad_norm": 0.13670910894870758,
"learning_rate": 0.0002,
"loss": 0.5591787695884705,
"mean_token_accuracy": 0.7713638991117477,
"num_tokens": 8665136.0,
"step": 529
},
{
"entropy": 0.5376773029565811,
"epoch": 1.9776119402985075,
"grad_norm": 0.12114886194467545,
"learning_rate": 0.0002,
"loss": 0.5407379865646362,
"mean_token_accuracy": 0.7814544290304184,
"num_tokens": 8681529.0,
"step": 530
},
{
"entropy": 0.5403832793235779,
"epoch": 1.9813432835820897,
"grad_norm": 0.13908495008945465,
"learning_rate": 0.0002,
"loss": 0.5482066869735718,
"mean_token_accuracy": 0.777704581618309,
"num_tokens": 8697730.0,
"step": 531
},
{
"entropy": 0.5356862396001816,
"epoch": 1.9850746268656716,
"grad_norm": 0.13925939798355103,
"learning_rate": 0.0002,
"loss": 0.5371193289756775,
"mean_token_accuracy": 0.783266693353653,
"num_tokens": 8714219.0,
"step": 532
},
{
"entropy": 0.5331960469484329,
"epoch": 1.9888059701492538,
"grad_norm": 0.15995416045188904,
"learning_rate": 0.0002,
"loss": 0.5319101810455322,
"mean_token_accuracy": 0.7843216061592102,
"num_tokens": 8730525.0,
"step": 533
},
{
"entropy": 0.5409343987703323,
"epoch": 1.9925373134328357,
"grad_norm": 0.1330004185438156,
"learning_rate": 0.0002,
"loss": 0.5445230603218079,
"mean_token_accuracy": 0.7773614227771759,
"num_tokens": 8746950.0,
"step": 534
},
{
"entropy": 0.5394200682640076,
"epoch": 1.9962686567164178,
"grad_norm": 0.14103004336357117,
"learning_rate": 0.0002,
"loss": 0.5359162092208862,
"mean_token_accuracy": 0.785576581954956,
"num_tokens": 8763337.0,
"step": 535
},
{
"entropy": 0.5349156558513641,
"epoch": 2.0,
"grad_norm": 0.12837927043437958,
"learning_rate": 0.0002,
"loss": 0.5329214334487915,
"mean_token_accuracy": 0.785938173532486,
"num_tokens": 8779938.0,
"step": 536
},
{
"entropy": 0.5407280772924423,
"epoch": 2.003731343283582,
"grad_norm": 0.14622488617897034,
"learning_rate": 0.0002,
"loss": 0.5321956872940063,
"mean_token_accuracy": 0.7852865755558014,
"num_tokens": 8796464.0,
"step": 537
},
{
"entropy": 0.5337665975093842,
"epoch": 2.0074626865671643,
"grad_norm": 0.16594251990318298,
"learning_rate": 0.0002,
"loss": 0.5266042351722717,
"mean_token_accuracy": 0.7868293672800064,
"num_tokens": 8812777.0,
"step": 538
},
{
"entropy": 0.5268868803977966,
"epoch": 2.0111940298507465,
"grad_norm": 0.15608331561088562,
"learning_rate": 0.0002,
"loss": 0.5311114192008972,
"mean_token_accuracy": 0.7839187681674957,
"num_tokens": 8829112.0,
"step": 539
},
{
"entropy": 0.527610257267952,
"epoch": 2.014925373134328,
"grad_norm": 0.13121342658996582,
"learning_rate": 0.0002,
"loss": 0.5283110737800598,
"mean_token_accuracy": 0.7851767688989639,
"num_tokens": 8845686.0,
"step": 540
},
{
"entropy": 0.5114267989993095,
"epoch": 2.0186567164179103,
"grad_norm": 0.15982377529144287,
"learning_rate": 0.0002,
"loss": 0.5138009190559387,
"mean_token_accuracy": 0.7923145592212677,
"num_tokens": 8862042.0,
"step": 541
},
{
"entropy": 0.5179557651281357,
"epoch": 2.0223880597014925,
"grad_norm": 0.15685375034809113,
"learning_rate": 0.0002,
"loss": 0.5175086855888367,
"mean_token_accuracy": 0.790000781416893,
"num_tokens": 8878269.0,
"step": 542
},
{
"entropy": 0.5284497290849686,
"epoch": 2.0261194029850746,
"grad_norm": 0.155994713306427,
"learning_rate": 0.0002,
"loss": 0.5248953104019165,
"mean_token_accuracy": 0.7887215316295624,
"num_tokens": 8894744.0,
"step": 543
},
{
"entropy": 0.5114204958081245,
"epoch": 2.029850746268657,
"grad_norm": 0.1587519645690918,
"learning_rate": 0.0002,
"loss": 0.5146663784980774,
"mean_token_accuracy": 0.7908709943294525,
"num_tokens": 8911019.0,
"step": 544
},
{
"entropy": 0.5258788168430328,
"epoch": 2.033582089552239,
"grad_norm": 0.17405946552753448,
"learning_rate": 0.0002,
"loss": 0.5257717967033386,
"mean_token_accuracy": 0.7857701331377029,
"num_tokens": 8927423.0,
"step": 545
},
{
"entropy": 0.5308232307434082,
"epoch": 2.0373134328358207,
"grad_norm": 0.16010001301765442,
"learning_rate": 0.0002,
"loss": 0.5299814343452454,
"mean_token_accuracy": 0.7874948382377625,
"num_tokens": 8943802.0,
"step": 546
},
{
"entropy": 0.516572117805481,
"epoch": 2.041044776119403,
"grad_norm": 0.16816852986812592,
"learning_rate": 0.0002,
"loss": 0.5154708623886108,
"mean_token_accuracy": 0.7876496762037277,
"num_tokens": 8959993.0,
"step": 547
},
{
"entropy": 0.5281299874186516,
"epoch": 2.044776119402985,
"grad_norm": 0.14758102595806122,
"learning_rate": 0.0002,
"loss": 0.524406373500824,
"mean_token_accuracy": 0.7861409038305283,
"num_tokens": 8976245.0,
"step": 548
},
{
"entropy": 0.5246195495128632,
"epoch": 2.048507462686567,
"grad_norm": 0.16330084204673767,
"learning_rate": 0.0002,
"loss": 0.5244280099868774,
"mean_token_accuracy": 0.7878082692623138,
"num_tokens": 8992638.0,
"step": 549
},
{
"entropy": 0.514888346195221,
"epoch": 2.0522388059701493,
"grad_norm": 0.1649155467748642,
"learning_rate": 0.0002,
"loss": 0.5206322073936462,
"mean_token_accuracy": 0.7888449877500534,
"num_tokens": 9008736.0,
"step": 550
},
{
"entropy": 0.5066314935684204,
"epoch": 2.0559701492537314,
"grad_norm": 0.1575276404619217,
"learning_rate": 0.0002,
"loss": 0.5027191042900085,
"mean_token_accuracy": 0.7947296053171158,
"num_tokens": 9025125.0,
"step": 551
},
{
"entropy": 0.5268809348344803,
"epoch": 2.0597014925373136,
"grad_norm": 0.1932123601436615,
"learning_rate": 0.0002,
"loss": 0.526970386505127,
"mean_token_accuracy": 0.7861645221710205,
"num_tokens": 9041360.0,
"step": 552
},
{
"entropy": 0.5089156553149223,
"epoch": 2.0634328358208953,
"grad_norm": 0.17611229419708252,
"learning_rate": 0.0002,
"loss": 0.5170955061912537,
"mean_token_accuracy": 0.7898762077093124,
"num_tokens": 9057425.0,
"step": 553
},
{
"entropy": 0.5314554125070572,
"epoch": 2.0671641791044775,
"grad_norm": 0.16261620819568634,
"learning_rate": 0.0002,
"loss": 0.5317267775535583,
"mean_token_accuracy": 0.7857931405305862,
"num_tokens": 9073634.0,
"step": 554
},
{
"entropy": 0.5275600850582123,
"epoch": 2.0708955223880596,
"grad_norm": 0.1528756469488144,
"learning_rate": 0.0002,
"loss": 0.5216519832611084,
"mean_token_accuracy": 0.784853920340538,
"num_tokens": 9090072.0,
"step": 555
},
{
"entropy": 0.533121645450592,
"epoch": 2.074626865671642,
"grad_norm": 0.15978476405143738,
"learning_rate": 0.0002,
"loss": 0.5330748558044434,
"mean_token_accuracy": 0.7852211892604828,
"num_tokens": 9106310.0,
"step": 556
},
{
"entropy": 0.5289422124624252,
"epoch": 2.078358208955224,
"grad_norm": 0.18613378703594208,
"learning_rate": 0.0002,
"loss": 0.5246477127075195,
"mean_token_accuracy": 0.7871279567480087,
"num_tokens": 9122599.0,
"step": 557
},
{
"entropy": 0.5288784801959991,
"epoch": 2.082089552238806,
"grad_norm": 0.19494648277759552,
"learning_rate": 0.0002,
"loss": 0.5310162305831909,
"mean_token_accuracy": 0.783275917172432,
"num_tokens": 9138955.0,
"step": 558
},
{
"entropy": 0.5063241422176361,
"epoch": 2.0858208955223883,
"grad_norm": 0.17457328736782074,
"learning_rate": 0.0002,
"loss": 0.5103744268417358,
"mean_token_accuracy": 0.7956038117408752,
"num_tokens": 9155471.0,
"step": 559
},
{
"entropy": 0.5165305808186531,
"epoch": 2.08955223880597,
"grad_norm": 0.16135407984256744,
"learning_rate": 0.0002,
"loss": 0.5219785571098328,
"mean_token_accuracy": 0.7876863032579422,
"num_tokens": 9171894.0,
"step": 560
},
{
"entropy": 0.5188902914524078,
"epoch": 2.093283582089552,
"grad_norm": 0.16337014734745026,
"learning_rate": 0.0002,
"loss": 0.516549825668335,
"mean_token_accuracy": 0.7918221950531006,
"num_tokens": 9188463.0,
"step": 561
},
{
"entropy": 0.513557106256485,
"epoch": 2.0970149253731343,
"grad_norm": 0.1818535476922989,
"learning_rate": 0.0002,
"loss": 0.506076991558075,
"mean_token_accuracy": 0.7936830073595047,
"num_tokens": 9204870.0,
"step": 562
},
{
"entropy": 0.5341264307498932,
"epoch": 2.1007462686567164,
"grad_norm": 0.1677771359682083,
"learning_rate": 0.0002,
"loss": 0.530627965927124,
"mean_token_accuracy": 0.7831838876008987,
"num_tokens": 9221094.0,
"step": 563
},
{
"entropy": 0.5140577107667923,
"epoch": 2.1044776119402986,
"grad_norm": 0.17054656147956848,
"learning_rate": 0.0002,
"loss": 0.5144332647323608,
"mean_token_accuracy": 0.7923785746097565,
"num_tokens": 9237391.0,
"step": 564
},
{
"entropy": 0.497653529047966,
"epoch": 2.1082089552238807,
"grad_norm": 0.18110354244709015,
"learning_rate": 0.0002,
"loss": 0.5102217197418213,
"mean_token_accuracy": 0.7931897193193436,
"num_tokens": 9253611.0,
"step": 565
},
{
"entropy": 0.524284727871418,
"epoch": 2.111940298507463,
"grad_norm": 0.2005971521139145,
"learning_rate": 0.0002,
"loss": 0.5303030014038086,
"mean_token_accuracy": 0.7885997593402863,
"num_tokens": 9269952.0,
"step": 566
},
{
"entropy": 0.5399997532367706,
"epoch": 2.1156716417910446,
"grad_norm": 0.1460496038198471,
"learning_rate": 0.0002,
"loss": 0.5352809429168701,
"mean_token_accuracy": 0.7851102352142334,
"num_tokens": 9286381.0,
"step": 567
},
{
"entropy": 0.5403535813093185,
"epoch": 2.1194029850746268,
"grad_norm": 0.2164795845746994,
"learning_rate": 0.0002,
"loss": 0.5310791730880737,
"mean_token_accuracy": 0.7864344716072083,
"num_tokens": 9302619.0,
"step": 568
},
{
"entropy": 0.5281778201460838,
"epoch": 2.123134328358209,
"grad_norm": 0.14520607888698578,
"learning_rate": 0.0002,
"loss": 0.5214827060699463,
"mean_token_accuracy": 0.7891172915697098,
"num_tokens": 9319199.0,
"step": 569
},
{
"entropy": 0.5376487374305725,
"epoch": 2.126865671641791,
"grad_norm": 0.20075996220111847,
"learning_rate": 0.0002,
"loss": 0.5414179563522339,
"mean_token_accuracy": 0.7825666964054108,
"num_tokens": 9335645.0,
"step": 570
},
{
"entropy": 0.544133722782135,
"epoch": 2.1305970149253732,
"grad_norm": 0.17108148336410522,
"learning_rate": 0.0002,
"loss": 0.5474769473075867,
"mean_token_accuracy": 0.778696671128273,
"num_tokens": 9352250.0,
"step": 571
},
{
"entropy": 0.5139511153101921,
"epoch": 2.1343283582089554,
"grad_norm": 0.20305298268795013,
"learning_rate": 0.0002,
"loss": 0.5138852000236511,
"mean_token_accuracy": 0.7916316092014313,
"num_tokens": 9368581.0,
"step": 572
},
{
"entropy": 0.5336194783449173,
"epoch": 2.138059701492537,
"grad_norm": 0.17313581705093384,
"learning_rate": 0.0002,
"loss": 0.5371931195259094,
"mean_token_accuracy": 0.7810296416282654,
"num_tokens": 9385005.0,
"step": 573
},
{
"entropy": 0.5428188145160675,
"epoch": 2.1417910447761193,
"grad_norm": 0.18904267251491547,
"learning_rate": 0.0002,
"loss": 0.5414341688156128,
"mean_token_accuracy": 0.7817030698060989,
"num_tokens": 9401264.0,
"step": 574
},
{
"entropy": 0.5036500468850136,
"epoch": 2.1455223880597014,
"grad_norm": 0.16260603070259094,
"learning_rate": 0.0002,
"loss": 0.5049091577529907,
"mean_token_accuracy": 0.7955426573753357,
"num_tokens": 9417452.0,
"step": 575
},
{
"entropy": 0.5125822275876999,
"epoch": 2.1492537313432836,
"grad_norm": 0.18752527236938477,
"learning_rate": 0.0002,
"loss": 0.520676851272583,
"mean_token_accuracy": 0.787801519036293,
"num_tokens": 9433830.0,
"step": 576
},
{
"entropy": 0.5220265239477158,
"epoch": 2.1529850746268657,
"grad_norm": 0.17956171929836273,
"learning_rate": 0.0002,
"loss": 0.5259777903556824,
"mean_token_accuracy": 0.7890594154596329,
"num_tokens": 9449942.0,
"step": 577
},
{
"entropy": 0.5411542505025864,
"epoch": 2.156716417910448,
"grad_norm": 0.16276296973228455,
"learning_rate": 0.0002,
"loss": 0.5392127633094788,
"mean_token_accuracy": 0.7827239036560059,
"num_tokens": 9466361.0,
"step": 578
},
{
"entropy": 0.5376486927270889,
"epoch": 2.16044776119403,
"grad_norm": 0.18284423649311066,
"learning_rate": 0.0002,
"loss": 0.5354690551757812,
"mean_token_accuracy": 0.7847119867801666,
"num_tokens": 9482738.0,
"step": 579
},
{
"entropy": 0.527974009513855,
"epoch": 2.1641791044776117,
"grad_norm": 0.15606842935085297,
"learning_rate": 0.0002,
"loss": 0.5216515064239502,
"mean_token_accuracy": 0.7893972098827362,
"num_tokens": 9499285.0,
"step": 580
},
{
"entropy": 0.5080907642841339,
"epoch": 2.167910447761194,
"grad_norm": 0.19228458404541016,
"learning_rate": 0.0002,
"loss": 0.5062891840934753,
"mean_token_accuracy": 0.7950604856014252,
"num_tokens": 9515408.0,
"step": 581
},
{
"entropy": 0.5310265123844147,
"epoch": 2.171641791044776,
"grad_norm": 0.1585681140422821,
"learning_rate": 0.0002,
"loss": 0.5329898595809937,
"mean_token_accuracy": 0.7825100123882294,
"num_tokens": 9531802.0,
"step": 582
},
{
"entropy": 0.5163623988628387,
"epoch": 2.175373134328358,
"grad_norm": 0.16819821298122406,
"learning_rate": 0.0002,
"loss": 0.5175923109054565,
"mean_token_accuracy": 0.7890376448631287,
"num_tokens": 9548285.0,
"step": 583
},
{
"entropy": 0.5143009200692177,
"epoch": 2.1791044776119404,
"grad_norm": 0.16217826306819916,
"learning_rate": 0.0002,
"loss": 0.5155395865440369,
"mean_token_accuracy": 0.7922197580337524,
"num_tokens": 9564428.0,
"step": 584
},
{
"entropy": 0.5416625738143921,
"epoch": 2.1828358208955225,
"grad_norm": 0.15060050785541534,
"learning_rate": 0.0002,
"loss": 0.5370927453041077,
"mean_token_accuracy": 0.7829685211181641,
"num_tokens": 9580974.0,
"step": 585
},
{
"entropy": 0.5395999997854233,
"epoch": 2.1865671641791047,
"grad_norm": 0.17097517848014832,
"learning_rate": 0.0002,
"loss": 0.5385570526123047,
"mean_token_accuracy": 0.7842200845479965,
"num_tokens": 9597372.0,
"step": 586
},
{
"entropy": 0.5397211164236069,
"epoch": 2.1902985074626864,
"grad_norm": 0.1612662672996521,
"learning_rate": 0.0002,
"loss": 0.5392184257507324,
"mean_token_accuracy": 0.7815093398094177,
"num_tokens": 9613832.0,
"step": 587
},
{
"entropy": 0.5179775580763817,
"epoch": 2.1940298507462686,
"grad_norm": 0.17580583691596985,
"learning_rate": 0.0002,
"loss": 0.5214508771896362,
"mean_token_accuracy": 0.7890152186155319,
"num_tokens": 9630021.0,
"step": 588
},
{
"entropy": 0.5112824365496635,
"epoch": 2.1977611940298507,
"grad_norm": 0.2011307030916214,
"learning_rate": 0.0002,
"loss": 0.5203381180763245,
"mean_token_accuracy": 0.7900092750787735,
"num_tokens": 9646188.0,
"step": 589
},
{
"entropy": 0.5356829464435577,
"epoch": 2.201492537313433,
"grad_norm": 0.16764222085475922,
"learning_rate": 0.0002,
"loss": 0.5318949818611145,
"mean_token_accuracy": 0.7853176593780518,
"num_tokens": 9662704.0,
"step": 590
},
{
"entropy": 0.532988578081131,
"epoch": 2.205223880597015,
"grad_norm": 0.1625567525625229,
"learning_rate": 0.0002,
"loss": 0.5286852121353149,
"mean_token_accuracy": 0.7845050990581512,
"num_tokens": 9679126.0,
"step": 591
},
{
"entropy": 0.5083666741847992,
"epoch": 2.208955223880597,
"grad_norm": 0.17014159262180328,
"learning_rate": 0.0002,
"loss": 0.5085889101028442,
"mean_token_accuracy": 0.7938840687274933,
"num_tokens": 9695574.0,
"step": 592
},
{
"entropy": 0.5348383486270905,
"epoch": 2.2126865671641793,
"grad_norm": 0.15370626747608185,
"learning_rate": 0.0002,
"loss": 0.5363180041313171,
"mean_token_accuracy": 0.7823249995708466,
"num_tokens": 9711759.0,
"step": 593
},
{
"entropy": 0.521574854850769,
"epoch": 2.216417910447761,
"grad_norm": 0.1618925929069519,
"learning_rate": 0.0002,
"loss": 0.5165284872055054,
"mean_token_accuracy": 0.7902027070522308,
"num_tokens": 9728297.0,
"step": 594
},
{
"entropy": 0.5246837437152863,
"epoch": 2.220149253731343,
"grad_norm": 0.16403713822364807,
"learning_rate": 0.0002,
"loss": 0.5284984111785889,
"mean_token_accuracy": 0.785593718290329,
"num_tokens": 9745025.0,
"step": 595
},
{
"entropy": 0.5146933272480965,
"epoch": 2.2238805970149254,
"grad_norm": 0.16364289820194244,
"learning_rate": 0.0002,
"loss": 0.5155675411224365,
"mean_token_accuracy": 0.7914301306009293,
"num_tokens": 9761573.0,
"step": 596
},
{
"entropy": 0.5164592936635017,
"epoch": 2.2276119402985075,
"grad_norm": 0.16107001900672913,
"learning_rate": 0.0002,
"loss": 0.520284116268158,
"mean_token_accuracy": 0.790960431098938,
"num_tokens": 9777994.0,
"step": 597
},
{
"entropy": 0.5009781569242477,
"epoch": 2.2313432835820897,
"grad_norm": 0.17092035710811615,
"learning_rate": 0.0002,
"loss": 0.5013527870178223,
"mean_token_accuracy": 0.7965078949928284,
"num_tokens": 9794247.0,
"step": 598
},
{
"entropy": 0.5145166665315628,
"epoch": 2.235074626865672,
"grad_norm": 0.17742900550365448,
"learning_rate": 0.0002,
"loss": 0.5136178731918335,
"mean_token_accuracy": 0.7902016937732697,
"num_tokens": 9810623.0,
"step": 599
},
{
"entropy": 0.521144449710846,
"epoch": 2.2388059701492535,
"grad_norm": 0.1866447478532791,
"learning_rate": 0.0002,
"loss": 0.5256049633026123,
"mean_token_accuracy": 0.7880899459123611,
"num_tokens": 9827216.0,
"step": 600
},
{
"entropy": 0.5078264698386192,
"epoch": 2.2425373134328357,
"grad_norm": 0.18190419673919678,
"learning_rate": 0.0002,
"loss": 0.5107334852218628,
"mean_token_accuracy": 0.7921731919050217,
"num_tokens": 9843424.0,
"step": 601
},
{
"entropy": 0.5391242802143097,
"epoch": 2.246268656716418,
"grad_norm": 0.1664401739835739,
"learning_rate": 0.0002,
"loss": 0.5404478907585144,
"mean_token_accuracy": 0.779574453830719,
"num_tokens": 9859528.0,
"step": 602
},
{
"entropy": 0.5163552165031433,
"epoch": 2.25,
"grad_norm": 0.19338326156139374,
"learning_rate": 0.0002,
"loss": 0.5106169581413269,
"mean_token_accuracy": 0.7929095774888992,
"num_tokens": 9875496.0,
"step": 603
},
{
"entropy": 0.538531944155693,
"epoch": 2.253731343283582,
"grad_norm": 0.16355083882808685,
"learning_rate": 0.0002,
"loss": 0.5421521067619324,
"mean_token_accuracy": 0.7775969356298447,
"num_tokens": 9891706.0,
"step": 604
},
{
"entropy": 0.5201183184981346,
"epoch": 2.2574626865671643,
"grad_norm": 0.2061741203069687,
"learning_rate": 0.0002,
"loss": 0.5298879742622375,
"mean_token_accuracy": 0.7839659005403519,
"num_tokens": 9907901.0,
"step": 605
},
{
"entropy": 0.5299466401338577,
"epoch": 2.2611940298507465,
"grad_norm": 0.1585988998413086,
"learning_rate": 0.0002,
"loss": 0.5266643762588501,
"mean_token_accuracy": 0.7857095748186111,
"num_tokens": 9924584.0,
"step": 606
},
{
"entropy": 0.5331060588359833,
"epoch": 2.264925373134328,
"grad_norm": 0.22515474259853363,
"learning_rate": 0.0002,
"loss": 0.5281371474266052,
"mean_token_accuracy": 0.7846943885087967,
"num_tokens": 9940921.0,
"step": 607
},
{
"entropy": 0.5365794003009796,
"epoch": 2.2686567164179103,
"grad_norm": 0.14158517122268677,
"learning_rate": 0.0002,
"loss": 0.5241664052009583,
"mean_token_accuracy": 0.7902594655752182,
"num_tokens": 9957418.0,
"step": 608
},
{
"entropy": 0.5098173916339874,
"epoch": 2.2723880597014925,
"grad_norm": 0.19847925007343292,
"learning_rate": 0.0002,
"loss": 0.5109040141105652,
"mean_token_accuracy": 0.7907959967851639,
"num_tokens": 9973783.0,
"step": 609
},
{
"entropy": 0.507322758436203,
"epoch": 2.2761194029850746,
"grad_norm": 0.1904480904340744,
"learning_rate": 0.0002,
"loss": 0.5145297050476074,
"mean_token_accuracy": 0.791220560669899,
"num_tokens": 9990362.0,
"step": 610
},
{
"entropy": 0.5185896158218384,
"epoch": 2.279850746268657,
"grad_norm": 0.23211340606212616,
"learning_rate": 0.0002,
"loss": 0.524868905544281,
"mean_token_accuracy": 0.7855911701917648,
"num_tokens": 10006762.0,
"step": 611
},
{
"entropy": 0.5282359346747398,
"epoch": 2.283582089552239,
"grad_norm": 0.1768886297941208,
"learning_rate": 0.0002,
"loss": 0.5229817628860474,
"mean_token_accuracy": 0.7895976901054382,
"num_tokens": 10023191.0,
"step": 612
},
{
"entropy": 0.5275277346372604,
"epoch": 2.2873134328358207,
"grad_norm": 0.19380177557468414,
"learning_rate": 0.0002,
"loss": 0.5169612765312195,
"mean_token_accuracy": 0.7907349169254303,
"num_tokens": 10039350.0,
"step": 613
},
{
"entropy": 0.5204345509409904,
"epoch": 2.291044776119403,
"grad_norm": 0.15632414817810059,
"learning_rate": 0.0002,
"loss": 0.513292670249939,
"mean_token_accuracy": 0.7925348877906799,
"num_tokens": 10055872.0,
"step": 614
},
{
"entropy": 0.5112610086798668,
"epoch": 2.294776119402985,
"grad_norm": 0.18102124333381653,
"learning_rate": 0.0002,
"loss": 0.520767092704773,
"mean_token_accuracy": 0.7886828035116196,
"num_tokens": 10072419.0,
"step": 615
},
{
"entropy": 0.5232729762792587,
"epoch": 2.298507462686567,
"grad_norm": 0.25390854477882385,
"learning_rate": 0.0002,
"loss": 0.5408729314804077,
"mean_token_accuracy": 0.7815985828638077,
"num_tokens": 10088715.0,
"step": 616
},
{
"entropy": 0.529785230755806,
"epoch": 2.3022388059701493,
"grad_norm": 0.15947353839874268,
"learning_rate": 0.0002,
"loss": 0.5309044718742371,
"mean_token_accuracy": 0.784679189324379,
"num_tokens": 10105206.0,
"step": 617
},
{
"entropy": 0.5409619510173798,
"epoch": 2.3059701492537314,
"grad_norm": 0.21774348616600037,
"learning_rate": 0.0002,
"loss": 0.5331413745880127,
"mean_token_accuracy": 0.7848716974258423,
"num_tokens": 10121951.0,
"step": 618
},
{
"entropy": 0.5404030680656433,
"epoch": 2.3097014925373136,
"grad_norm": 0.17135120928287506,
"learning_rate": 0.0002,
"loss": 0.5320269465446472,
"mean_token_accuracy": 0.7863317579030991,
"num_tokens": 10138520.0,
"step": 619
},
{
"entropy": 0.543184906244278,
"epoch": 2.3134328358208958,
"grad_norm": 0.18270884454250336,
"learning_rate": 0.0002,
"loss": 0.5362977981567383,
"mean_token_accuracy": 0.7825828939676285,
"num_tokens": 10155242.0,
"step": 620
},
{
"entropy": 0.5144708007574081,
"epoch": 2.3171641791044775,
"grad_norm": 0.19776520133018494,
"learning_rate": 0.0002,
"loss": 0.5190030336380005,
"mean_token_accuracy": 0.7893546521663666,
"num_tokens": 10171493.0,
"step": 621
},
{
"entropy": 0.5012815147638321,
"epoch": 2.3208955223880596,
"grad_norm": 0.18417391180992126,
"learning_rate": 0.0002,
"loss": 0.5140509009361267,
"mean_token_accuracy": 0.7917021214962006,
"num_tokens": 10187924.0,
"step": 622
},
{
"entropy": 0.5291815996170044,
"epoch": 2.324626865671642,
"grad_norm": 0.18122002482414246,
"learning_rate": 0.0002,
"loss": 0.5308645367622375,
"mean_token_accuracy": 0.7827988862991333,
"num_tokens": 10204223.0,
"step": 623
},
{
"entropy": 0.5316928327083588,
"epoch": 2.328358208955224,
"grad_norm": 0.17393858730793,
"learning_rate": 0.0002,
"loss": 0.5351020097732544,
"mean_token_accuracy": 0.7837810218334198,
"num_tokens": 10220678.0,
"step": 624
},
{
"entropy": 0.5380063354969025,
"epoch": 2.332089552238806,
"grad_norm": 0.16641174256801605,
"learning_rate": 0.0002,
"loss": 0.5311377644538879,
"mean_token_accuracy": 0.78605717420578,
"num_tokens": 10236761.0,
"step": 625
},
{
"entropy": 0.5296464115381241,
"epoch": 2.3358208955223883,
"grad_norm": 0.16847732663154602,
"learning_rate": 0.0002,
"loss": 0.5290564894676208,
"mean_token_accuracy": 0.7866681218147278,
"num_tokens": 10253110.0,
"step": 626
},
{
"entropy": 0.5196742564439774,
"epoch": 2.33955223880597,
"grad_norm": 0.16526693105697632,
"learning_rate": 0.0002,
"loss": 0.516907811164856,
"mean_token_accuracy": 0.7920583933591843,
"num_tokens": 10269492.0,
"step": 627
},
{
"entropy": 0.541998103260994,
"epoch": 2.343283582089552,
"grad_norm": 0.18568557500839233,
"learning_rate": 0.0002,
"loss": 0.5372257828712463,
"mean_token_accuracy": 0.7823797762393951,
"num_tokens": 10285927.0,
"step": 628
},
{
"entropy": 0.5108761489391327,
"epoch": 2.3470149253731343,
"grad_norm": 0.1934242844581604,
"learning_rate": 0.0002,
"loss": 0.5139164924621582,
"mean_token_accuracy": 0.7933155596256256,
"num_tokens": 10302023.0,
"step": 629
},
{
"entropy": 0.5217199325561523,
"epoch": 2.3507462686567164,
"grad_norm": 0.17553211748600006,
"learning_rate": 0.0002,
"loss": 0.5230180025100708,
"mean_token_accuracy": 0.7875964045524597,
"num_tokens": 10318268.0,
"step": 630
},
{
"entropy": 0.5330761075019836,
"epoch": 2.3544776119402986,
"grad_norm": 0.15872074663639069,
"learning_rate": 0.0002,
"loss": 0.5290681719779968,
"mean_token_accuracy": 0.7844167649745941,
"num_tokens": 10334766.0,
"step": 631
},
{
"entropy": 0.5369035452604294,
"epoch": 2.3582089552238807,
"grad_norm": 0.1846853792667389,
"learning_rate": 0.0002,
"loss": 0.5329739451408386,
"mean_token_accuracy": 0.7838435918092728,
"num_tokens": 10351349.0,
"step": 632
},
{
"entropy": 0.5287653654813766,
"epoch": 2.361940298507463,
"grad_norm": 0.1996822953224182,
"learning_rate": 0.0002,
"loss": 0.5347191095352173,
"mean_token_accuracy": 0.7811494767665863,
"num_tokens": 10367871.0,
"step": 633
},
{
"entropy": 0.5239842683076859,
"epoch": 2.3656716417910446,
"grad_norm": 0.19435462355613708,
"learning_rate": 0.0002,
"loss": 0.530573308467865,
"mean_token_accuracy": 0.7837476581335068,
"num_tokens": 10384315.0,
"step": 634
},
{
"entropy": 0.5206383317708969,
"epoch": 2.3694029850746268,
"grad_norm": 0.19717657566070557,
"learning_rate": 0.0002,
"loss": 0.5275444388389587,
"mean_token_accuracy": 0.7842705696821213,
"num_tokens": 10400769.0,
"step": 635
},
{
"entropy": 0.5064749270677567,
"epoch": 2.373134328358209,
"grad_norm": 0.19260841608047485,
"learning_rate": 0.0002,
"loss": 0.51506507396698,
"mean_token_accuracy": 0.789744108915329,
"num_tokens": 10417006.0,
"step": 636
},
{
"entropy": 0.5361980348825455,
"epoch": 2.376865671641791,
"grad_norm": 0.17480432987213135,
"learning_rate": 0.0002,
"loss": 0.5336955189704895,
"mean_token_accuracy": 0.7836211174726486,
"num_tokens": 10433294.0,
"step": 637
},
{
"entropy": 0.5383089035749435,
"epoch": 2.3805970149253732,
"grad_norm": 0.18294544517993927,
"learning_rate": 0.0002,
"loss": 0.5289636254310608,
"mean_token_accuracy": 0.7852412611246109,
"num_tokens": 10449674.0,
"step": 638
},
{
"entropy": 0.5097021907567978,
"epoch": 2.3843283582089554,
"grad_norm": 0.16242100298404694,
"learning_rate": 0.0002,
"loss": 0.5021054148674011,
"mean_token_accuracy": 0.7972816228866577,
"num_tokens": 10465855.0,
"step": 639
},
{
"entropy": 0.5423515290021896,
"epoch": 2.388059701492537,
"grad_norm": 0.22227367758750916,
"learning_rate": 0.0002,
"loss": 0.548687756061554,
"mean_token_accuracy": 0.776146799325943,
"num_tokens": 10482179.0,
"step": 640
},
{
"entropy": 0.5074172541499138,
"epoch": 2.3917910447761193,
"grad_norm": 0.1631743311882019,
"learning_rate": 0.0002,
"loss": 0.5108535289764404,
"mean_token_accuracy": 0.7928425967693329,
"num_tokens": 10498617.0,
"step": 641
},
{
"entropy": 0.5141904726624489,
"epoch": 2.3955223880597014,
"grad_norm": 0.22901000082492828,
"learning_rate": 0.0002,
"loss": 0.5239617228507996,
"mean_token_accuracy": 0.7894341051578522,
"num_tokens": 10514855.0,
"step": 642
},
{
"entropy": 0.548003762960434,
"epoch": 2.3992537313432836,
"grad_norm": 0.1889556348323822,
"learning_rate": 0.0002,
"loss": 0.5518738627433777,
"mean_token_accuracy": 0.7756821662187576,
"num_tokens": 10531113.0,
"step": 643
},
{
"entropy": 0.5271116495132446,
"epoch": 2.4029850746268657,
"grad_norm": 0.15567590296268463,
"learning_rate": 0.0002,
"loss": 0.516383171081543,
"mean_token_accuracy": 0.7933164685964584,
"num_tokens": 10547691.0,
"step": 644
},
{
"entropy": 0.5330717116594315,
"epoch": 2.406716417910448,
"grad_norm": 0.17213337123394012,
"learning_rate": 0.0002,
"loss": 0.5231931209564209,
"mean_token_accuracy": 0.7853028923273087,
"num_tokens": 10563993.0,
"step": 645
},
{
"entropy": 0.542450025677681,
"epoch": 2.41044776119403,
"grad_norm": 0.16203731298446655,
"learning_rate": 0.0002,
"loss": 0.5375291109085083,
"mean_token_accuracy": 0.7830152362585068,
"num_tokens": 10580464.0,
"step": 646
},
{
"entropy": 0.5074228942394257,
"epoch": 2.4141791044776117,
"grad_norm": 0.16541871428489685,
"learning_rate": 0.0002,
"loss": 0.5123732089996338,
"mean_token_accuracy": 0.7941079437732697,
"num_tokens": 10596747.0,
"step": 647
},
{
"entropy": 0.5105165019631386,
"epoch": 2.417910447761194,
"grad_norm": 0.182412788271904,
"learning_rate": 0.0002,
"loss": 0.5217914581298828,
"mean_token_accuracy": 0.7893105298280716,
"num_tokens": 10612951.0,
"step": 648
},
{
"entropy": 0.5206151753664017,
"epoch": 2.421641791044776,
"grad_norm": 0.20678837597370148,
"learning_rate": 0.0002,
"loss": 0.5335655212402344,
"mean_token_accuracy": 0.7840552628040314,
"num_tokens": 10629467.0,
"step": 649
},
{
"entropy": 0.5416827350854874,
"epoch": 2.425373134328358,
"grad_norm": 0.16378135979175568,
"learning_rate": 0.0002,
"loss": 0.5401762127876282,
"mean_token_accuracy": 0.782837986946106,
"num_tokens": 10645981.0,
"step": 650
},
{
"entropy": 0.5352658033370972,
"epoch": 2.4291044776119404,
"grad_norm": 0.17120513319969177,
"learning_rate": 0.0002,
"loss": 0.5229877233505249,
"mean_token_accuracy": 0.7894999831914902,
"num_tokens": 10662599.0,
"step": 651
},
{
"entropy": 0.5378601551055908,
"epoch": 2.4328358208955225,
"grad_norm": 0.18634538352489471,
"learning_rate": 0.0002,
"loss": 0.5370844602584839,
"mean_token_accuracy": 0.7834650576114655,
"num_tokens": 10678905.0,
"step": 652
},
{
"entropy": 0.5139342248439789,
"epoch": 2.4365671641791042,
"grad_norm": 0.1823841780424118,
"learning_rate": 0.0002,
"loss": 0.5105010271072388,
"mean_token_accuracy": 0.7942702323198318,
"num_tokens": 10695354.0,
"step": 653
},
{
"entropy": 0.5001704916357994,
"epoch": 2.4402985074626864,
"grad_norm": 0.18246224522590637,
"learning_rate": 0.0002,
"loss": 0.5092322826385498,
"mean_token_accuracy": 0.7953812628984451,
"num_tokens": 10711419.0,
"step": 654
},
{
"entropy": 0.5088636800646782,
"epoch": 2.4440298507462686,
"grad_norm": 0.16581419110298157,
"learning_rate": 0.0002,
"loss": 0.5136841535568237,
"mean_token_accuracy": 0.7919897437095642,
"num_tokens": 10727853.0,
"step": 655
},
{
"entropy": 0.5198448672890663,
"epoch": 2.4477611940298507,
"grad_norm": 0.16655242443084717,
"learning_rate": 0.0002,
"loss": 0.5188886523246765,
"mean_token_accuracy": 0.7890329360961914,
"num_tokens": 10744204.0,
"step": 656
},
{
"entropy": 0.5168529972434044,
"epoch": 2.451492537313433,
"grad_norm": 0.18366754055023193,
"learning_rate": 0.0002,
"loss": 0.5171942114830017,
"mean_token_accuracy": 0.7899800539016724,
"num_tokens": 10760669.0,
"step": 657
},
{
"entropy": 0.5348050147294998,
"epoch": 2.455223880597015,
"grad_norm": 0.18297524750232697,
"learning_rate": 0.0002,
"loss": 0.5392665266990662,
"mean_token_accuracy": 0.779433473944664,
"num_tokens": 10777093.0,
"step": 658
},
{
"entropy": 0.5245852321386337,
"epoch": 2.458955223880597,
"grad_norm": 0.19149278104305267,
"learning_rate": 0.0002,
"loss": 0.5260974764823914,
"mean_token_accuracy": 0.7873388528823853,
"num_tokens": 10793455.0,
"step": 659
},
{
"entropy": 0.5311989635229111,
"epoch": 2.4626865671641793,
"grad_norm": 0.1547309309244156,
"learning_rate": 0.0002,
"loss": 0.5266692042350769,
"mean_token_accuracy": 0.7839333266019821,
"num_tokens": 10809788.0,
"step": 660
},
{
"entropy": 0.5379379391670227,
"epoch": 2.466417910447761,
"grad_norm": 0.15859338641166687,
"learning_rate": 0.0002,
"loss": 0.5321581363677979,
"mean_token_accuracy": 0.7827870547771454,
"num_tokens": 10825837.0,
"step": 661
},
{
"entropy": 0.5471830368041992,
"epoch": 2.470149253731343,
"grad_norm": 0.16068732738494873,
"learning_rate": 0.0002,
"loss": 0.5360886454582214,
"mean_token_accuracy": 0.7848220616579056,
"num_tokens": 10842037.0,
"step": 662
},
{
"entropy": 0.5252791494131088,
"epoch": 2.4738805970149254,
"grad_norm": 0.1590043157339096,
"learning_rate": 0.0002,
"loss": 0.5276464819908142,
"mean_token_accuracy": 0.786907747387886,
"num_tokens": 10858320.0,
"step": 663
},
{
"entropy": 0.525018036365509,
"epoch": 2.4776119402985075,
"grad_norm": 0.17438893020153046,
"learning_rate": 0.0002,
"loss": 0.5300197005271912,
"mean_token_accuracy": 0.7852317094802856,
"num_tokens": 10874855.0,
"step": 664
},
{
"entropy": 0.5394986271858215,
"epoch": 2.4813432835820897,
"grad_norm": 0.17128010094165802,
"learning_rate": 0.0002,
"loss": 0.5422081351280212,
"mean_token_accuracy": 0.7800386846065521,
"num_tokens": 10891526.0,
"step": 665
},
{
"entropy": 0.5076115503907204,
"epoch": 2.485074626865672,
"grad_norm": 0.1781933754682541,
"learning_rate": 0.0002,
"loss": 0.507164716720581,
"mean_token_accuracy": 0.7957528084516525,
"num_tokens": 10907862.0,
"step": 666
},
{
"entropy": 0.5271291732788086,
"epoch": 2.4888059701492535,
"grad_norm": 0.17105896770954132,
"learning_rate": 0.0002,
"loss": 0.5228562355041504,
"mean_token_accuracy": 0.7889808863401413,
"num_tokens": 10924235.0,
"step": 667
},
{
"entropy": 0.5363548994064331,
"epoch": 2.4925373134328357,
"grad_norm": 0.1583063155412674,
"learning_rate": 0.0002,
"loss": 0.5336060523986816,
"mean_token_accuracy": 0.7860426157712936,
"num_tokens": 10940599.0,
"step": 668
},
{
"entropy": 0.503924198448658,
"epoch": 2.496268656716418,
"grad_norm": 0.17252567410469055,
"learning_rate": 0.0002,
"loss": 0.5028519034385681,
"mean_token_accuracy": 0.7955358028411865,
"num_tokens": 10956649.0,
"step": 669
},
{
"entropy": 0.5256816297769547,
"epoch": 2.5,
"grad_norm": 0.1619226038455963,
"learning_rate": 0.0002,
"loss": 0.5266148447990417,
"mean_token_accuracy": 0.787626251578331,
"num_tokens": 10972977.0,
"step": 670
},
{
"entropy": 0.5120773613452911,
"epoch": 2.503731343283582,
"grad_norm": 0.16918344795703888,
"learning_rate": 0.0002,
"loss": 0.5207507610321045,
"mean_token_accuracy": 0.7914620935916901,
"num_tokens": 10989327.0,
"step": 671
},
{
"entropy": 0.5181663334369659,
"epoch": 2.5074626865671643,
"grad_norm": 0.19783611595630646,
"learning_rate": 0.0002,
"loss": 0.5268117189407349,
"mean_token_accuracy": 0.7864458560943604,
"num_tokens": 11005449.0,
"step": 672
},
{
"entropy": 0.5229259878396988,
"epoch": 2.5111940298507465,
"grad_norm": 0.1657666116952896,
"learning_rate": 0.0002,
"loss": 0.5208563208580017,
"mean_token_accuracy": 0.7903305888175964,
"num_tokens": 11021576.0,
"step": 673
},
{
"entropy": 0.5335699021816254,
"epoch": 2.5149253731343286,
"grad_norm": 0.1847028136253357,
"learning_rate": 0.0002,
"loss": 0.5323396921157837,
"mean_token_accuracy": 0.7818653434514999,
"num_tokens": 11038174.0,
"step": 674
},
{
"entropy": 0.5297135561704636,
"epoch": 2.5186567164179103,
"grad_norm": 0.17212164402008057,
"learning_rate": 0.0002,
"loss": 0.5294620990753174,
"mean_token_accuracy": 0.7868784368038177,
"num_tokens": 11054527.0,
"step": 675
},
{
"entropy": 0.5551169812679291,
"epoch": 2.5223880597014925,
"grad_norm": 0.19568513333797455,
"learning_rate": 0.0002,
"loss": 0.5539876222610474,
"mean_token_accuracy": 0.775226280093193,
"num_tokens": 11070805.0,
"step": 676
},
{
"entropy": 0.5319524109363556,
"epoch": 2.5261194029850746,
"grad_norm": 0.14972956478595734,
"learning_rate": 0.0002,
"loss": 0.5295209288597107,
"mean_token_accuracy": 0.7860101461410522,
"num_tokens": 11087510.0,
"step": 677
},
{
"entropy": 0.5265523195266724,
"epoch": 2.529850746268657,
"grad_norm": 0.16056260466575623,
"learning_rate": 0.0002,
"loss": 0.5248823761940002,
"mean_token_accuracy": 0.7860508859157562,
"num_tokens": 11103933.0,
"step": 678
},
{
"entropy": 0.5225390195846558,
"epoch": 2.533582089552239,
"grad_norm": 0.22218124568462372,
"learning_rate": 0.0002,
"loss": 0.5301728248596191,
"mean_token_accuracy": 0.7851128876209259,
"num_tokens": 11120292.0,
"step": 679
},
{
"entropy": 0.5265638679265976,
"epoch": 2.5373134328358207,
"grad_norm": 0.15814287960529327,
"learning_rate": 0.0002,
"loss": 0.5240415930747986,
"mean_token_accuracy": 0.788665235042572,
"num_tokens": 11136784.0,
"step": 680
},
{
"entropy": 0.5306698828935623,
"epoch": 2.541044776119403,
"grad_norm": 0.1664581149816513,
"learning_rate": 0.0002,
"loss": 0.5277557373046875,
"mean_token_accuracy": 0.7860920429229736,
"num_tokens": 11153320.0,
"step": 681
},
{
"entropy": 0.5291799604892731,
"epoch": 2.544776119402985,
"grad_norm": 0.1872314065694809,
"learning_rate": 0.0002,
"loss": 0.5320236086845398,
"mean_token_accuracy": 0.7843979746103287,
"num_tokens": 11169723.0,
"step": 682
},
{
"entropy": 0.53035868704319,
"epoch": 2.548507462686567,
"grad_norm": 0.20792965590953827,
"learning_rate": 0.0002,
"loss": 0.5358518362045288,
"mean_token_accuracy": 0.7849173247814178,
"num_tokens": 11186035.0,
"step": 683
},
{
"entropy": 0.5152866542339325,
"epoch": 2.5522388059701493,
"grad_norm": 0.20304447412490845,
"learning_rate": 0.0002,
"loss": 0.512556791305542,
"mean_token_accuracy": 0.7908182591199875,
"num_tokens": 11201972.0,
"step": 684
},
{
"entropy": 0.520212933421135,
"epoch": 2.5559701492537314,
"grad_norm": 0.19615566730499268,
"learning_rate": 0.0002,
"loss": 0.5241949558258057,
"mean_token_accuracy": 0.7870226055383682,
"num_tokens": 11218085.0,
"step": 685
},
{
"entropy": 0.523841142654419,
"epoch": 2.5597014925373136,
"grad_norm": 0.18903784453868866,
"learning_rate": 0.0002,
"loss": 0.5217975974082947,
"mean_token_accuracy": 0.7914077341556549,
"num_tokens": 11234466.0,
"step": 686
},
{
"entropy": 0.5006226599216461,
"epoch": 2.5634328358208958,
"grad_norm": 0.2238045483827591,
"learning_rate": 0.0002,
"loss": 0.503075122833252,
"mean_token_accuracy": 0.7985939383506775,
"num_tokens": 11250619.0,
"step": 687
},
{
"entropy": 0.522046685218811,
"epoch": 2.5671641791044775,
"grad_norm": 0.1861460655927658,
"learning_rate": 0.0002,
"loss": 0.5256574749946594,
"mean_token_accuracy": 0.7879543006420135,
"num_tokens": 11267052.0,
"step": 688
},
{
"entropy": 0.5404367446899414,
"epoch": 2.5708955223880596,
"grad_norm": 0.18886177241802216,
"learning_rate": 0.0002,
"loss": 0.5377542972564697,
"mean_token_accuracy": 0.781608834862709,
"num_tokens": 11283385.0,
"step": 689
},
{
"entropy": 0.526772603392601,
"epoch": 2.574626865671642,
"grad_norm": 0.16710662841796875,
"learning_rate": 0.0002,
"loss": 0.5189668536186218,
"mean_token_accuracy": 0.7905929088592529,
"num_tokens": 11299758.0,
"step": 690
},
{
"entropy": 0.528350904583931,
"epoch": 2.578358208955224,
"grad_norm": 0.17797508835792542,
"learning_rate": 0.0002,
"loss": 0.5194413661956787,
"mean_token_accuracy": 0.7911931574344635,
"num_tokens": 11316130.0,
"step": 691
},
{
"entropy": 0.52931809425354,
"epoch": 2.582089552238806,
"grad_norm": 0.21212708950042725,
"learning_rate": 0.0002,
"loss": 0.5379958152770996,
"mean_token_accuracy": 0.7827763855457306,
"num_tokens": 11332658.0,
"step": 692
},
{
"entropy": 0.5531658977270126,
"epoch": 2.585820895522388,
"grad_norm": 0.17241588234901428,
"learning_rate": 0.0002,
"loss": 0.5588712692260742,
"mean_token_accuracy": 0.7756764441728592,
"num_tokens": 11349446.0,
"step": 693
},
{
"entropy": 0.5219079852104187,
"epoch": 2.58955223880597,
"grad_norm": 0.15809156000614166,
"learning_rate": 0.0002,
"loss": 0.5210216045379639,
"mean_token_accuracy": 0.7904610931873322,
"num_tokens": 11366050.0,
"step": 694
},
{
"entropy": 0.5322331935167313,
"epoch": 2.593283582089552,
"grad_norm": 0.18396085500717163,
"learning_rate": 0.0002,
"loss": 0.5301384925842285,
"mean_token_accuracy": 0.7841024845838547,
"num_tokens": 11382491.0,
"step": 695
},
{
"entropy": 0.5307652056217194,
"epoch": 2.5970149253731343,
"grad_norm": 0.16308656334877014,
"learning_rate": 0.0002,
"loss": 0.5239346623420715,
"mean_token_accuracy": 0.7880617082118988,
"num_tokens": 11398802.0,
"step": 696
},
{
"entropy": 0.5340842455625534,
"epoch": 2.6007462686567164,
"grad_norm": 0.19761645793914795,
"learning_rate": 0.0002,
"loss": 0.5363891124725342,
"mean_token_accuracy": 0.7838073074817657,
"num_tokens": 11415128.0,
"step": 697
},
{
"entropy": 0.5340555012226105,
"epoch": 2.6044776119402986,
"grad_norm": 0.1661156415939331,
"learning_rate": 0.0002,
"loss": 0.5325526595115662,
"mean_token_accuracy": 0.7847229689359665,
"num_tokens": 11431318.0,
"step": 698
},
{
"entropy": 0.5427940785884857,
"epoch": 2.6082089552238807,
"grad_norm": 0.16063573956489563,
"learning_rate": 0.0002,
"loss": 0.5501501560211182,
"mean_token_accuracy": 0.7748306840658188,
"num_tokens": 11447713.0,
"step": 699
},
{
"entropy": 0.5213874280452728,
"epoch": 2.611940298507463,
"grad_norm": 0.1618213802576065,
"learning_rate": 0.0002,
"loss": 0.5210378170013428,
"mean_token_accuracy": 0.787492960691452,
"num_tokens": 11464142.0,
"step": 700
},
{
"entropy": 0.5329896062612534,
"epoch": 2.6156716417910446,
"grad_norm": 0.18406495451927185,
"learning_rate": 0.0002,
"loss": 0.5365204215049744,
"mean_token_accuracy": 0.7818106710910797,
"num_tokens": 11480468.0,
"step": 701
},
{
"entropy": 0.5018042698502541,
"epoch": 2.6194029850746268,
"grad_norm": 0.1559264361858368,
"learning_rate": 0.0002,
"loss": 0.507462203502655,
"mean_token_accuracy": 0.7951454520225525,
"num_tokens": 11496824.0,
"step": 702
},
{
"entropy": 0.5304955393075943,
"epoch": 2.623134328358209,
"grad_norm": 0.16140370070934296,
"learning_rate": 0.0002,
"loss": 0.5346159338951111,
"mean_token_accuracy": 0.7851942926645279,
"num_tokens": 11513567.0,
"step": 703
},
{
"entropy": 0.5185345709323883,
"epoch": 2.626865671641791,
"grad_norm": 0.16598905622959137,
"learning_rate": 0.0002,
"loss": 0.5121718645095825,
"mean_token_accuracy": 0.7958889752626419,
"num_tokens": 11530042.0,
"step": 704
},
{
"entropy": 0.5373921394348145,
"epoch": 2.6305970149253732,
"grad_norm": 0.18821974098682404,
"learning_rate": 0.0002,
"loss": 0.5302144289016724,
"mean_token_accuracy": 0.7860950380563736,
"num_tokens": 11546594.0,
"step": 705
},
{
"entropy": 0.5182069316506386,
"epoch": 2.6343283582089554,
"grad_norm": 0.17032590508460999,
"learning_rate": 0.0002,
"loss": 0.5235993266105652,
"mean_token_accuracy": 0.7881369441747665,
"num_tokens": 11562996.0,
"step": 706
},
{
"entropy": 0.5120366662740707,
"epoch": 2.638059701492537,
"grad_norm": 0.20226538181304932,
"learning_rate": 0.0002,
"loss": 0.5154089331626892,
"mean_token_accuracy": 0.7893324643373489,
"num_tokens": 11579247.0,
"step": 707
},
{
"entropy": 0.5271363854408264,
"epoch": 2.6417910447761193,
"grad_norm": 0.2367754727602005,
"learning_rate": 0.0002,
"loss": 0.529344916343689,
"mean_token_accuracy": 0.7863059490919113,
"num_tokens": 11595557.0,
"step": 708
},
{
"entropy": 0.5211906433105469,
"epoch": 2.6455223880597014,
"grad_norm": 0.17606736719608307,
"learning_rate": 0.0002,
"loss": 0.5162103176116943,
"mean_token_accuracy": 0.7936627119779587,
"num_tokens": 11612153.0,
"step": 709
},
{
"entropy": 0.5413748621940613,
"epoch": 2.6492537313432836,
"grad_norm": 0.16839931905269623,
"learning_rate": 0.0002,
"loss": 0.5375933051109314,
"mean_token_accuracy": 0.7837605625391006,
"num_tokens": 11628672.0,
"step": 710
},
{
"entropy": 0.5492138266563416,
"epoch": 2.6529850746268657,
"grad_norm": 0.1578325480222702,
"learning_rate": 0.0002,
"loss": 0.5387027263641357,
"mean_token_accuracy": 0.7828567028045654,
"num_tokens": 11645327.0,
"step": 711
},
{
"entropy": 0.5294462591409683,
"epoch": 2.656716417910448,
"grad_norm": 0.18846334517002106,
"learning_rate": 0.0002,
"loss": 0.5310033559799194,
"mean_token_accuracy": 0.7850282490253448,
"num_tokens": 11661886.0,
"step": 712
},
{
"entropy": 0.5195821523666382,
"epoch": 2.66044776119403,
"grad_norm": 0.1722957044839859,
"learning_rate": 0.0002,
"loss": 0.5247335433959961,
"mean_token_accuracy": 0.7882849276065826,
"num_tokens": 11678052.0,
"step": 713
},
{
"entropy": 0.5254689157009125,
"epoch": 2.664179104477612,
"grad_norm": 0.175649493932724,
"learning_rate": 0.0002,
"loss": 0.5303612947463989,
"mean_token_accuracy": 0.7877318859100342,
"num_tokens": 11694539.0,
"step": 714
},
{
"entropy": 0.5156526416540146,
"epoch": 2.667910447761194,
"grad_norm": 0.21296396851539612,
"learning_rate": 0.0002,
"loss": 0.5188760161399841,
"mean_token_accuracy": 0.7886723130941391,
"num_tokens": 11710806.0,
"step": 715
},
{
"entropy": 0.5304235517978668,
"epoch": 2.671641791044776,
"grad_norm": 0.1557040810585022,
"learning_rate": 0.0002,
"loss": 0.532120943069458,
"mean_token_accuracy": 0.7845920920372009,
"num_tokens": 11727178.0,
"step": 716
},
{
"entropy": 0.5396947711706161,
"epoch": 2.675373134328358,
"grad_norm": 0.23430386185646057,
"learning_rate": 0.0002,
"loss": 0.5410381555557251,
"mean_token_accuracy": 0.7820145785808563,
"num_tokens": 11743592.0,
"step": 717
},
{
"entropy": 0.5290116220712662,
"epoch": 2.6791044776119404,
"grad_norm": 0.18491677939891815,
"learning_rate": 0.0002,
"loss": 0.5220689177513123,
"mean_token_accuracy": 0.7880972176790237,
"num_tokens": 11759881.0,
"step": 718
},
{
"entropy": 0.5365530252456665,
"epoch": 2.6828358208955225,
"grad_norm": 0.20658747851848602,
"learning_rate": 0.0002,
"loss": 0.5274034738540649,
"mean_token_accuracy": 0.7877165377140045,
"num_tokens": 11776103.0,
"step": 719
},
{
"entropy": 0.5193691104650497,
"epoch": 2.6865671641791042,
"grad_norm": 0.15166765451431274,
"learning_rate": 0.0002,
"loss": 0.5179476737976074,
"mean_token_accuracy": 0.7924929708242416,
"num_tokens": 11792614.0,
"step": 720
},
{
"entropy": 0.5238720774650574,
"epoch": 2.6902985074626864,
"grad_norm": 0.2068144679069519,
"learning_rate": 0.0002,
"loss": 0.5365906953811646,
"mean_token_accuracy": 0.7825643718242645,
"num_tokens": 11808884.0,
"step": 721
},
{
"entropy": 0.5160530805587769,
"epoch": 2.6940298507462686,
"grad_norm": 0.1884981393814087,
"learning_rate": 0.0002,
"loss": 0.5255499482154846,
"mean_token_accuracy": 0.785829171538353,
"num_tokens": 11825190.0,
"step": 722
},
{
"entropy": 0.5381662398576736,
"epoch": 2.6977611940298507,
"grad_norm": 0.22528207302093506,
"learning_rate": 0.0002,
"loss": 0.5401077270507812,
"mean_token_accuracy": 0.780912771821022,
"num_tokens": 11841581.0,
"step": 723
},
{
"entropy": 0.5353066176176071,
"epoch": 2.701492537313433,
"grad_norm": 0.16518141329288483,
"learning_rate": 0.0002,
"loss": 0.5283069014549255,
"mean_token_accuracy": 0.7859592884778976,
"num_tokens": 11857924.0,
"step": 724
},
{
"entropy": 0.5316939651966095,
"epoch": 2.705223880597015,
"grad_norm": 0.1674748808145523,
"learning_rate": 0.0002,
"loss": 0.5228734016418457,
"mean_token_accuracy": 0.7879570424556732,
"num_tokens": 11874385.0,
"step": 725
},
{
"entropy": 0.5669917911291122,
"epoch": 2.708955223880597,
"grad_norm": 0.18983666598796844,
"learning_rate": 0.0002,
"loss": 0.5586099624633789,
"mean_token_accuracy": 0.7734153866767883,
"num_tokens": 11890893.0,
"step": 726
},
{
"entropy": 0.5250157564878464,
"epoch": 2.7126865671641793,
"grad_norm": 0.16966547071933746,
"learning_rate": 0.0002,
"loss": 0.5228544473648071,
"mean_token_accuracy": 0.7863233536481857,
"num_tokens": 11907436.0,
"step": 727
},
{
"entropy": 0.5265001058578491,
"epoch": 2.716417910447761,
"grad_norm": 0.21439625322818756,
"learning_rate": 0.0002,
"loss": 0.5315214991569519,
"mean_token_accuracy": 0.7847255766391754,
"num_tokens": 11923778.0,
"step": 728
},
{
"entropy": 0.5284342169761658,
"epoch": 2.720149253731343,
"grad_norm": 0.1824498325586319,
"learning_rate": 0.0002,
"loss": 0.5404508709907532,
"mean_token_accuracy": 0.7798212766647339,
"num_tokens": 11940075.0,
"step": 729
},
{
"entropy": 0.501299723982811,
"epoch": 2.7238805970149254,
"grad_norm": 0.2304428666830063,
"learning_rate": 0.0002,
"loss": 0.5122545957565308,
"mean_token_accuracy": 0.791194960474968,
"num_tokens": 11956336.0,
"step": 730
},
{
"entropy": 0.5443384349346161,
"epoch": 2.7276119402985075,
"grad_norm": 0.1537434458732605,
"learning_rate": 0.0002,
"loss": 0.5363157987594604,
"mean_token_accuracy": 0.7845837771892548,
"num_tokens": 11972840.0,
"step": 731
},
{
"entropy": 0.5315753519535065,
"epoch": 2.7313432835820897,
"grad_norm": 0.17106328904628754,
"learning_rate": 0.0002,
"loss": 0.5220600366592407,
"mean_token_accuracy": 0.7875728458166122,
"num_tokens": 11989350.0,
"step": 732
},
{
"entropy": 0.5302078127861023,
"epoch": 2.7350746268656714,
"grad_norm": 0.17003247141838074,
"learning_rate": 0.0002,
"loss": 0.5270202159881592,
"mean_token_accuracy": 0.787715807557106,
"num_tokens": 12005809.0,
"step": 733
},
{
"entropy": 0.527949333190918,
"epoch": 2.7388059701492535,
"grad_norm": 0.21327127516269684,
"learning_rate": 0.0002,
"loss": 0.5354670882225037,
"mean_token_accuracy": 0.7835386097431183,
"num_tokens": 12022336.0,
"step": 734
},
{
"entropy": 0.5089609026908875,
"epoch": 2.7425373134328357,
"grad_norm": 0.16088151931762695,
"learning_rate": 0.0002,
"loss": 0.5117763876914978,
"mean_token_accuracy": 0.7938453704118729,
"num_tokens": 12038779.0,
"step": 735
},
{
"entropy": 0.5126267448067665,
"epoch": 2.746268656716418,
"grad_norm": 0.1757761836051941,
"learning_rate": 0.0002,
"loss": 0.5135779976844788,
"mean_token_accuracy": 0.7931608110666275,
"num_tokens": 12054869.0,
"step": 736
},
{
"entropy": 0.5239577889442444,
"epoch": 2.75,
"grad_norm": 0.1817576140165329,
"learning_rate": 0.0002,
"loss": 0.5234410762786865,
"mean_token_accuracy": 0.7875021547079086,
"num_tokens": 12071361.0,
"step": 737
},
{
"entropy": 0.5307980924844742,
"epoch": 2.753731343283582,
"grad_norm": 0.1653635948896408,
"learning_rate": 0.0002,
"loss": 0.5298102498054504,
"mean_token_accuracy": 0.7864446491003036,
"num_tokens": 12087634.0,
"step": 738
},
{
"entropy": 0.5222239643335342,
"epoch": 2.7574626865671643,
"grad_norm": 0.18040236830711365,
"learning_rate": 0.0002,
"loss": 0.5258353352546692,
"mean_token_accuracy": 0.7891390025615692,
"num_tokens": 12103943.0,
"step": 739
},
{
"entropy": 0.5332596972584724,
"epoch": 2.7611940298507465,
"grad_norm": 0.15495066344738007,
"learning_rate": 0.0002,
"loss": 0.5282677412033081,
"mean_token_accuracy": 0.785639688372612,
"num_tokens": 12120325.0,
"step": 740
},
{
"entropy": 0.5371799468994141,
"epoch": 2.7649253731343286,
"grad_norm": 0.17130646109580994,
"learning_rate": 0.0002,
"loss": 0.5295438170433044,
"mean_token_accuracy": 0.7828952521085739,
"num_tokens": 12136761.0,
"step": 741
},
{
"entropy": 0.5405760109424591,
"epoch": 2.7686567164179103,
"grad_norm": 0.16763344407081604,
"learning_rate": 0.0002,
"loss": 0.5373218655586243,
"mean_token_accuracy": 0.7816964089870453,
"num_tokens": 12153043.0,
"step": 742
},
{
"entropy": 0.5118273198604584,
"epoch": 2.7723880597014925,
"grad_norm": 0.17398576438426971,
"learning_rate": 0.0002,
"loss": 0.5121888518333435,
"mean_token_accuracy": 0.7949073165655136,
"num_tokens": 12169387.0,
"step": 743
},
{
"entropy": 0.5252756625413895,
"epoch": 2.7761194029850746,
"grad_norm": 0.20275278389453888,
"learning_rate": 0.0002,
"loss": 0.5319023132324219,
"mean_token_accuracy": 0.7827770113945007,
"num_tokens": 12185773.0,
"step": 744
},
{
"entropy": 0.5281336456537247,
"epoch": 2.779850746268657,
"grad_norm": 0.16486869752407074,
"learning_rate": 0.0002,
"loss": 0.5282880663871765,
"mean_token_accuracy": 0.7841639369726181,
"num_tokens": 12202185.0,
"step": 745
},
{
"entropy": 0.5157778561115265,
"epoch": 2.783582089552239,
"grad_norm": 0.1883569210767746,
"learning_rate": 0.0002,
"loss": 0.5159796476364136,
"mean_token_accuracy": 0.791821077466011,
"num_tokens": 12218279.0,
"step": 746
},
{
"entropy": 0.5459621995687485,
"epoch": 2.7873134328358207,
"grad_norm": 0.15937039256095886,
"learning_rate": 0.0002,
"loss": 0.5399669408798218,
"mean_token_accuracy": 0.7847357988357544,
"num_tokens": 12234867.0,
"step": 747
},
{
"entropy": 0.52740877866745,
"epoch": 2.791044776119403,
"grad_norm": 0.14844611287117004,
"learning_rate": 0.0002,
"loss": 0.5260165929794312,
"mean_token_accuracy": 0.7880454957485199,
"num_tokens": 12251419.0,
"step": 748
},
{
"entropy": 0.5150434598326683,
"epoch": 2.794776119402985,
"grad_norm": 0.16429124772548676,
"learning_rate": 0.0002,
"loss": 0.5152871012687683,
"mean_token_accuracy": 0.7888982892036438,
"num_tokens": 12267583.0,
"step": 749
},
{
"entropy": 0.5261992961168289,
"epoch": 2.798507462686567,
"grad_norm": 0.18603260815143585,
"learning_rate": 0.0002,
"loss": 0.5299534201622009,
"mean_token_accuracy": 0.7854207009077072,
"num_tokens": 12284129.0,
"step": 750
},
{
"entropy": 0.529946893453598,
"epoch": 2.8022388059701493,
"grad_norm": 0.18355652689933777,
"learning_rate": 0.0002,
"loss": 0.5360465049743652,
"mean_token_accuracy": 0.7842213064432144,
"num_tokens": 12300683.0,
"step": 751
},
{
"entropy": 0.5377232730388641,
"epoch": 2.8059701492537314,
"grad_norm": 0.17548733949661255,
"learning_rate": 0.0002,
"loss": 0.5429165363311768,
"mean_token_accuracy": 0.7822890281677246,
"num_tokens": 12316833.0,
"step": 752
},
{
"entropy": 0.5407239943742752,
"epoch": 2.8097014925373136,
"grad_norm": 0.17476212978363037,
"learning_rate": 0.0002,
"loss": 0.5398030281066895,
"mean_token_accuracy": 0.7804454267024994,
"num_tokens": 12333283.0,
"step": 753
},
{
"entropy": 0.520610861480236,
"epoch": 2.8134328358208958,
"grad_norm": 0.15137535333633423,
"learning_rate": 0.0002,
"loss": 0.5157968401908875,
"mean_token_accuracy": 0.7898696959018707,
"num_tokens": 12349570.0,
"step": 754
},
{
"entropy": 0.5343620032072067,
"epoch": 2.8171641791044775,
"grad_norm": 0.16463439166545868,
"learning_rate": 0.0002,
"loss": 0.5255429148674011,
"mean_token_accuracy": 0.7910490483045578,
"num_tokens": 12366111.0,
"step": 755
},
{
"entropy": 0.5226383879780769,
"epoch": 2.8208955223880596,
"grad_norm": 0.17591623961925507,
"learning_rate": 0.0002,
"loss": 0.5295028686523438,
"mean_token_accuracy": 0.7862412929534912,
"num_tokens": 12382176.0,
"step": 756
},
{
"entropy": 0.5329883769154549,
"epoch": 2.824626865671642,
"grad_norm": 0.17046134173870087,
"learning_rate": 0.0002,
"loss": 0.5395819544792175,
"mean_token_accuracy": 0.7815450727939606,
"num_tokens": 12398954.0,
"step": 757
},
{
"entropy": 0.5189251601696014,
"epoch": 2.828358208955224,
"grad_norm": 0.17623355984687805,
"learning_rate": 0.0002,
"loss": 0.5211597681045532,
"mean_token_accuracy": 0.7862699329853058,
"num_tokens": 12415518.0,
"step": 758
},
{
"entropy": 0.5435206592082977,
"epoch": 2.832089552238806,
"grad_norm": 0.16461242735385895,
"learning_rate": 0.0002,
"loss": 0.5449641346931458,
"mean_token_accuracy": 0.7772939503192902,
"num_tokens": 12431840.0,
"step": 759
},
{
"entropy": 0.5242071002721786,
"epoch": 2.835820895522388,
"grad_norm": 0.16906797885894775,
"learning_rate": 0.0002,
"loss": 0.5236470103263855,
"mean_token_accuracy": 0.7878623157739639,
"num_tokens": 12447985.0,
"step": 760
},
{
"entropy": 0.5331535488367081,
"epoch": 2.83955223880597,
"grad_norm": 0.1613229662179947,
"learning_rate": 0.0002,
"loss": 0.5270719528198242,
"mean_token_accuracy": 0.7869479656219482,
"num_tokens": 12464369.0,
"step": 761
},
{
"entropy": 0.5153749734163284,
"epoch": 2.843283582089552,
"grad_norm": 0.1861318051815033,
"learning_rate": 0.0002,
"loss": 0.5134626626968384,
"mean_token_accuracy": 0.7917421609163284,
"num_tokens": 12480705.0,
"step": 762
},
{
"entropy": 0.5185382887721062,
"epoch": 2.8470149253731343,
"grad_norm": 0.15517400205135345,
"learning_rate": 0.0002,
"loss": 0.520057201385498,
"mean_token_accuracy": 0.7887658178806305,
"num_tokens": 12496768.0,
"step": 763
},
{
"entropy": 0.525531992316246,
"epoch": 2.8507462686567164,
"grad_norm": 0.2088494747877121,
"learning_rate": 0.0002,
"loss": 0.5236872434616089,
"mean_token_accuracy": 0.7884621620178223,
"num_tokens": 12513264.0,
"step": 764
},
{
"entropy": 0.516917809844017,
"epoch": 2.8544776119402986,
"grad_norm": 0.1747450977563858,
"learning_rate": 0.0002,
"loss": 0.5234484076499939,
"mean_token_accuracy": 0.7843039780855179,
"num_tokens": 12529856.0,
"step": 765
},
{
"entropy": 0.5171080678701401,
"epoch": 2.8582089552238807,
"grad_norm": 0.17318587005138397,
"learning_rate": 0.0002,
"loss": 0.520793080329895,
"mean_token_accuracy": 0.7862659096717834,
"num_tokens": 12546530.0,
"step": 766
},
{
"entropy": 0.540691614151001,
"epoch": 2.861940298507463,
"grad_norm": 0.15875069797039032,
"learning_rate": 0.0002,
"loss": 0.5400336384773254,
"mean_token_accuracy": 0.7827646285295486,
"num_tokens": 12563086.0,
"step": 767
},
{
"entropy": 0.5084429755806923,
"epoch": 2.8656716417910446,
"grad_norm": 0.14828889071941376,
"learning_rate": 0.0002,
"loss": 0.5024577379226685,
"mean_token_accuracy": 0.7963315397500992,
"num_tokens": 12579183.0,
"step": 768
},
{
"entropy": 0.5370931923389435,
"epoch": 2.8694029850746268,
"grad_norm": 0.14752823114395142,
"learning_rate": 0.0002,
"loss": 0.5261865854263306,
"mean_token_accuracy": 0.7877734899520874,
"num_tokens": 12596077.0,
"step": 769
},
{
"entropy": 0.5546486079692841,
"epoch": 2.873134328358209,
"grad_norm": 0.1517077535390854,
"learning_rate": 0.0002,
"loss": 0.5500649809837341,
"mean_token_accuracy": 0.7785899043083191,
"num_tokens": 12612620.0,
"step": 770
},
{
"entropy": 0.5144929736852646,
"epoch": 2.876865671641791,
"grad_norm": 0.18645553290843964,
"learning_rate": 0.0002,
"loss": 0.5184378623962402,
"mean_token_accuracy": 0.7887341529130936,
"num_tokens": 12628974.0,
"step": 771
},
{
"entropy": 0.5363174676895142,
"epoch": 2.8805970149253732,
"grad_norm": 0.173641175031662,
"learning_rate": 0.0002,
"loss": 0.5404868125915527,
"mean_token_accuracy": 0.7838273793458939,
"num_tokens": 12645473.0,
"step": 772
},
{
"entropy": 0.5220237821340561,
"epoch": 2.8843283582089554,
"grad_norm": 0.1810951977968216,
"learning_rate": 0.0002,
"loss": 0.5300620794296265,
"mean_token_accuracy": 0.7870841026306152,
"num_tokens": 12661871.0,
"step": 773
},
{
"entropy": 0.5215499252080917,
"epoch": 2.888059701492537,
"grad_norm": 0.17195403575897217,
"learning_rate": 0.0002,
"loss": 0.5228441953659058,
"mean_token_accuracy": 0.7888252288103104,
"num_tokens": 12678403.0,
"step": 774
},
{
"entropy": 0.5262960642576218,
"epoch": 2.8917910447761193,
"grad_norm": 0.16115020215511322,
"learning_rate": 0.0002,
"loss": 0.5279878973960876,
"mean_token_accuracy": 0.7827633023262024,
"num_tokens": 12694636.0,
"step": 775
},
{
"entropy": 0.5458672344684601,
"epoch": 2.8955223880597014,
"grad_norm": 0.18671803176403046,
"learning_rate": 0.0002,
"loss": 0.5379894971847534,
"mean_token_accuracy": 0.7803581058979034,
"num_tokens": 12711335.0,
"step": 776
},
{
"entropy": 0.5334444046020508,
"epoch": 2.8992537313432836,
"grad_norm": 0.16968129575252533,
"learning_rate": 0.0002,
"loss": 0.5301728248596191,
"mean_token_accuracy": 0.7843312919139862,
"num_tokens": 12727428.0,
"step": 777
},
{
"entropy": 0.5264092683792114,
"epoch": 2.9029850746268657,
"grad_norm": 0.17358112335205078,
"learning_rate": 0.0002,
"loss": 0.5304536819458008,
"mean_token_accuracy": 0.7818145751953125,
"num_tokens": 12743928.0,
"step": 778
},
{
"entropy": 0.521320641040802,
"epoch": 2.906716417910448,
"grad_norm": 0.19404703378677368,
"learning_rate": 0.0002,
"loss": 0.5308122038841248,
"mean_token_accuracy": 0.7851481735706329,
"num_tokens": 12760425.0,
"step": 779
},
{
"entropy": 0.5253891497850418,
"epoch": 2.91044776119403,
"grad_norm": 0.23603156208992004,
"learning_rate": 0.0002,
"loss": 0.537718653678894,
"mean_token_accuracy": 0.7832214832305908,
"num_tokens": 12776783.0,
"step": 780
},
{
"entropy": 0.5522697567939758,
"epoch": 2.914179104477612,
"grad_norm": 0.16655920445919037,
"learning_rate": 0.0002,
"loss": 0.5428380966186523,
"mean_token_accuracy": 0.7817497551441193,
"num_tokens": 12793260.0,
"step": 781
},
{
"entropy": 0.5386251360177994,
"epoch": 2.917910447761194,
"grad_norm": 0.17462746798992157,
"learning_rate": 0.0002,
"loss": 0.5273305773735046,
"mean_token_accuracy": 0.7866194099187851,
"num_tokens": 12809754.0,
"step": 782
},
{
"entropy": 0.5417182147502899,
"epoch": 2.921641791044776,
"grad_norm": 0.16420036554336548,
"learning_rate": 0.0002,
"loss": 0.5311017632484436,
"mean_token_accuracy": 0.7847865968942642,
"num_tokens": 12826135.0,
"step": 783
},
{
"entropy": 0.5094658881425858,
"epoch": 2.925373134328358,
"grad_norm": 0.209514319896698,
"learning_rate": 0.0002,
"loss": 0.5230738520622253,
"mean_token_accuracy": 0.7901812642812729,
"num_tokens": 12842378.0,
"step": 784
},
{
"entropy": 0.5122962892055511,
"epoch": 2.9291044776119404,
"grad_norm": 0.17986896634101868,
"learning_rate": 0.0002,
"loss": 0.5213406682014465,
"mean_token_accuracy": 0.7899868190288544,
"num_tokens": 12858715.0,
"step": 785
},
{
"entropy": 0.5239143073558807,
"epoch": 2.9328358208955225,
"grad_norm": 0.17349380254745483,
"learning_rate": 0.0002,
"loss": 0.5260440707206726,
"mean_token_accuracy": 0.7880281209945679,
"num_tokens": 12875134.0,
"step": 786
},
{
"entropy": 0.5183478370308876,
"epoch": 2.9365671641791042,
"grad_norm": 0.15738630294799805,
"learning_rate": 0.0002,
"loss": 0.5146017074584961,
"mean_token_accuracy": 0.7944561541080475,
"num_tokens": 12891435.0,
"step": 787
},
{
"entropy": 0.5321111530065536,
"epoch": 2.9402985074626864,
"grad_norm": 0.169599249958992,
"learning_rate": 0.0002,
"loss": 0.5332249402999878,
"mean_token_accuracy": 0.7841628640890121,
"num_tokens": 12907955.0,
"step": 788
},
{
"entropy": 0.5348423272371292,
"epoch": 2.9440298507462686,
"grad_norm": 0.1703958362340927,
"learning_rate": 0.0002,
"loss": 0.5319628715515137,
"mean_token_accuracy": 0.7853727787733078,
"num_tokens": 12924187.0,
"step": 789
},
{
"entropy": 0.5348647981882095,
"epoch": 2.9477611940298507,
"grad_norm": 0.16257572174072266,
"learning_rate": 0.0002,
"loss": 0.5274540185928345,
"mean_token_accuracy": 0.7864417731761932,
"num_tokens": 12940471.0,
"step": 790
},
{
"entropy": 0.5246876776218414,
"epoch": 2.951492537313433,
"grad_norm": 0.21989069879055023,
"learning_rate": 0.0002,
"loss": 0.532191276550293,
"mean_token_accuracy": 0.7841058969497681,
"num_tokens": 12956753.0,
"step": 791
},
{
"entropy": 0.5206954181194305,
"epoch": 2.955223880597015,
"grad_norm": 0.18530453741550446,
"learning_rate": 0.0002,
"loss": 0.5260450839996338,
"mean_token_accuracy": 0.7853500992059708,
"num_tokens": 12972983.0,
"step": 792
},
{
"entropy": 0.5218585133552551,
"epoch": 2.958955223880597,
"grad_norm": 0.19632470607757568,
"learning_rate": 0.0002,
"loss": 0.524539589881897,
"mean_token_accuracy": 0.7870173752307892,
"num_tokens": 12989538.0,
"step": 793
},
{
"entropy": 0.5301937758922577,
"epoch": 2.9626865671641793,
"grad_norm": 0.1759789139032364,
"learning_rate": 0.0002,
"loss": 0.5322460532188416,
"mean_token_accuracy": 0.7846620082855225,
"num_tokens": 13005865.0,
"step": 794
},
{
"entropy": 0.5316169708967209,
"epoch": 2.966417910447761,
"grad_norm": 0.18013249337673187,
"learning_rate": 0.0002,
"loss": 0.5267240405082703,
"mean_token_accuracy": 0.7860967516899109,
"num_tokens": 13022162.0,
"step": 795
},
{
"entropy": 0.5342477560043335,
"epoch": 2.970149253731343,
"grad_norm": 0.15967167913913727,
"learning_rate": 0.0002,
"loss": 0.531574010848999,
"mean_token_accuracy": 0.7845140397548676,
"num_tokens": 13038634.0,
"step": 796
},
{
"entropy": 0.5358534008264542,
"epoch": 2.9738805970149254,
"grad_norm": 0.18192364275455475,
"learning_rate": 0.0002,
"loss": 0.531234085559845,
"mean_token_accuracy": 0.7822518199682236,
"num_tokens": 13054913.0,
"step": 797
},
{
"entropy": 0.5332595482468605,
"epoch": 2.9776119402985075,
"grad_norm": 0.16098462045192719,
"learning_rate": 0.0002,
"loss": 0.5331971645355225,
"mean_token_accuracy": 0.7841719388961792,
"num_tokens": 13071687.0,
"step": 798
},
{
"entropy": 0.5196807980537415,
"epoch": 2.9813432835820897,
"grad_norm": 0.16396892070770264,
"learning_rate": 0.0002,
"loss": 0.5180687308311462,
"mean_token_accuracy": 0.79112908244133,
"num_tokens": 13088263.0,
"step": 799
},
{
"entropy": 0.5160314440727234,
"epoch": 2.9850746268656714,
"grad_norm": 0.18938018381595612,
"learning_rate": 0.0002,
"loss": 0.5278008580207825,
"mean_token_accuracy": 0.7868732959032059,
"num_tokens": 13104420.0,
"step": 800
},
{
"entropy": 0.5099834352731705,
"epoch": 2.9888059701492535,
"grad_norm": 0.18755869567394257,
"learning_rate": 0.0002,
"loss": 0.5147690176963806,
"mean_token_accuracy": 0.790816992521286,
"num_tokens": 13120862.0,
"step": 801
},
{
"entropy": 0.5440191924571991,
"epoch": 2.9925373134328357,
"grad_norm": 0.16148996353149414,
"learning_rate": 0.0002,
"loss": 0.5402988195419312,
"mean_token_accuracy": 0.7817222625017166,
"num_tokens": 13137523.0,
"step": 802
},
{
"entropy": 0.5369501113891602,
"epoch": 2.996268656716418,
"grad_norm": 0.17043927311897278,
"learning_rate": 0.0002,
"loss": 0.5288562178611755,
"mean_token_accuracy": 0.7866682559251785,
"num_tokens": 13153684.0,
"step": 803
},
{
"entropy": 0.5347233563661575,
"epoch": 3.0,
"grad_norm": 0.17972980439662933,
"learning_rate": 0.0002,
"loss": 0.5365173816680908,
"mean_token_accuracy": 0.782272219657898,
"num_tokens": 13170027.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2276685185818296e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}